airbyte-internal-ops 0.1.2.post2.dev20080805740__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {airbyte_internal_ops-0.1.2.post2.dev20080805740.dist-info → airbyte_internal_ops-0.1.4.dist-info}/METADATA +8 -5
  2. {airbyte_internal_ops-0.1.2.post2.dev20080805740.dist-info → airbyte_internal_ops-0.1.4.dist-info}/RECORD +31 -11
  3. airbyte_ops_mcp/_legacy/airbyte_ci/connector_pipelines/airbyte_ci/connectors/test/steps/common.py +1 -1
  4. airbyte_ops_mcp/cli/cloud.py +309 -38
  5. airbyte_ops_mcp/cloud_admin/connection_config.py +131 -0
  6. airbyte_ops_mcp/live_tests/__init__.py +16 -0
  7. airbyte_ops_mcp/live_tests/_connection_retriever/__init__.py +35 -0
  8. airbyte_ops_mcp/live_tests/_connection_retriever/audit_logging.py +88 -0
  9. airbyte_ops_mcp/live_tests/_connection_retriever/consts.py +33 -0
  10. airbyte_ops_mcp/live_tests/_connection_retriever/db_access.py +82 -0
  11. airbyte_ops_mcp/live_tests/_connection_retriever/retrieval.py +391 -0
  12. airbyte_ops_mcp/live_tests/_connection_retriever/secrets_resolution.py +130 -0
  13. airbyte_ops_mcp/live_tests/config.py +190 -0
  14. airbyte_ops_mcp/live_tests/connection_fetcher.py +159 -2
  15. airbyte_ops_mcp/live_tests/connection_secret_retriever.py +173 -0
  16. airbyte_ops_mcp/live_tests/evaluation_modes.py +45 -0
  17. airbyte_ops_mcp/live_tests/http_metrics.py +81 -0
  18. airbyte_ops_mcp/live_tests/message_cache/__init__.py +15 -0
  19. airbyte_ops_mcp/live_tests/message_cache/duckdb_cache.py +415 -0
  20. airbyte_ops_mcp/live_tests/obfuscation.py +126 -0
  21. airbyte_ops_mcp/live_tests/regression/__init__.py +29 -0
  22. airbyte_ops_mcp/live_tests/regression/comparators.py +466 -0
  23. airbyte_ops_mcp/live_tests/schema_generation.py +154 -0
  24. airbyte_ops_mcp/live_tests/validation/__init__.py +43 -0
  25. airbyte_ops_mcp/live_tests/validation/catalog_validators.py +389 -0
  26. airbyte_ops_mcp/live_tests/validation/record_validators.py +227 -0
  27. airbyte_ops_mcp/mcp/_mcp_utils.py +3 -0
  28. airbyte_ops_mcp/mcp/live_tests.py +500 -0
  29. airbyte_ops_mcp/mcp/server.py +3 -0
  30. {airbyte_internal_ops-0.1.2.post2.dev20080805740.dist-info → airbyte_internal_ops-0.1.4.dist-info}/WHEEL +0 -0
  31. {airbyte_internal_ops-0.1.2.post2.dev20080805740.dist-info → airbyte_internal_ops-0.1.4.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,415 @@
1
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
2
+ """DuckDB-based message cache for storing Airbyte messages.
3
+
4
+ This module provides a DuckDB-based message cache that persists Airbyte messages
5
+ to JSONL files and loads them into DuckDB for efficient querying.
6
+
7
+ Based on airbyte-ci implementation:
8
+ https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/live-tests/src/live_tests/commons/backends/duckdb_backend.py
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ import logging
15
+ import re
16
+ from collections.abc import Iterable
17
+ from pathlib import Path
18
+ from typing import TextIO
19
+
20
+ import duckdb
21
+ from airbyte_protocol.models import AirbyteMessage
22
+ from airbyte_protocol.models import Type as AirbyteMessageType
23
+ from cachetools import LRUCache, cached
24
+ from uuid_extensions import uuid7str
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
def sanitize_stream_name(stream_name: str) -> str:
    """Make *stream_name* safe for use as a file name.

    Every character outside ``[a-zA-Z0-9_]`` becomes an underscore; ASCII
    letters, digits, and underscores pass through unchanged.
    """
    return "".join(
        ch if (ch.isascii() and ch.isalnum()) or ch == "_" else "_"
        for ch in stream_name
    )
35
+
36
+
37
def sanitize_table_name(table_name: str) -> str:
    """Sanitize a table name for use as a DuckDB identifier.

    Replaces every whitespace character with an underscore (the previous
    implementation only handled the plain space character, so tabs and
    newlines survived the ``[^\\w\\s]`` strip and ended up inside the
    identifier), removes any remaining non-word characters, and prepends
    an underscore when the result starts with a digit so it is a valid
    bare identifier.
    """
    # Normalize ALL whitespace first so "a\tb" and "a b" both become "a_b".
    sanitized = re.sub(r"\s", "_", str(table_name))
    # Drop everything that is not a word character ([A-Za-z0-9_] plus
    # unicode word chars) — equivalent to the old [^\w\s] strip now that
    # no whitespace remains.
    sanitized = re.sub(r"[^\w]", "", sanitized)
    if sanitized and sanitized[0].isdigit():
        sanitized = "_" + sanitized
    return sanitized
48
+
49
+
50
def _add_message_id(json_str: str) -> str:
    """Return *json_str* re-serialized with a UUIDv7 ``_message_id`` field.

    The _message_id is a time-ordered identifier that provides:
    - Portable ordering in JSONL files (sortable as strings)
    - Traceability for debugging
    - Consistency with PyAirbyte's ab_raw_id pattern

    The _message_id is cache metadata and should be excluded from
    regression comparisons (like emitted_at).
    """
    enriched = {**json.loads(json_str), "_message_id": uuid7str()}
    # sort_keys keeps the serialized form deterministic across runs.
    return json.dumps(enriched, sort_keys=True)
64
+
65
+
66
class _FileDescriptorLRUCache(LRUCache):
    """LRU cache of open file objects that closes handles on eviction."""

    def popitem(self) -> tuple:
        # Evict per the base LRU policy, then release the underlying
        # OS file descriptor before handing the pair back.
        path, handle = super().popitem()
        handle.close()
        return path, handle
73
+
74
+
75
class DuckDbMessageCache:
    """DuckDB-based message cache for Airbyte messages.

    This cache writes messages to JSONL files (for debugging and portability)
    and loads them into DuckDB for efficient querying. Messages are stored with
    an explicit `message_index` column to preserve ordering.

    Based on airbyte-ci implementation:
    https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/live-tests/src/live_tests/commons/backends/duckdb_backend.py
    """

    # Relative (to the output directory) JSONL file names, one per
    # Airbyte message type.
    RELATIVE_CATALOGS_PATH = "catalog.jsonl"
    RELATIVE_CONNECTION_STATUS_PATH = "connection_status.jsonl"
    RELATIVE_RECORDS_PATH = "records.jsonl"
    RELATIVE_SPECS_PATH = "spec.jsonl"
    RELATIVE_STATES_PATH = "states.jsonl"
    RELATIVE_TRACES_PATH = "traces.jsonl"
    RELATIVE_LOGS_PATH = "logs.jsonl"
    RELATIVE_CONTROLS_PATH = "controls.jsonl"

    SAMPLE_SIZE = -1  # Read all rows for schema inference

    def __init__(
        self,
        output_directory: Path,
        duckdb_path: Path | None = None,
        schema: Iterable[str] | None = None,
    ) -> None:
        """Initialize the message cache.

        Args:
            output_directory: Directory where JSONL files will be written.
            duckdb_path: Path to the DuckDB database file. If None, uses
                output_directory / "messages.duckdb".
            schema: Optional schema name parts (e.g., ["connector", "version"]).
        """
        self._output_directory = output_directory
        self._output_directory.mkdir(parents=True, exist_ok=True)

        self.duckdb_path = duckdb_path or (output_directory / "messages.duckdb")
        self.schema = list(schema) if schema else None

        # Per-stream record files live in a subdirectory; the two dicts
        # below are populated lazily as RECORD messages are written.
        self.record_per_stream_directory = self._output_directory / "records_per_stream"
        self.record_per_stream_directory.mkdir(exist_ok=True, parents=True)
        self.record_per_stream_paths: dict[str, Path] = {}
        self.record_per_stream_paths_data_only: dict[str, Path] = {}

        # Bounds the number of concurrently open file descriptors; evicted
        # entries are closed by _FileDescriptorLRUCache.popitem.
        self._file_cache: _FileDescriptorLRUCache = _FileDescriptorLRUCache(maxsize=250)
        self._db_connection: duckdb.DuckDBPyConnection | None = None

    @property
    def jsonl_specs_path(self) -> Path:
        """Absolute path of the SPEC messages JSONL file."""
        return (self._output_directory / self.RELATIVE_SPECS_PATH).resolve()

    @property
    def jsonl_catalogs_path(self) -> Path:
        """Absolute path of the CATALOG messages JSONL file."""
        return (self._output_directory / self.RELATIVE_CATALOGS_PATH).resolve()

    @property
    def jsonl_connection_status_path(self) -> Path:
        """Absolute path of the CONNECTION_STATUS messages JSONL file."""
        return (self._output_directory / self.RELATIVE_CONNECTION_STATUS_PATH).resolve()

    @property
    def jsonl_records_path(self) -> Path:
        """Absolute path of the RECORD messages JSONL file."""
        return (self._output_directory / self.RELATIVE_RECORDS_PATH).resolve()

    @property
    def jsonl_states_path(self) -> Path:
        """Absolute path of the STATE messages JSONL file."""
        return (self._output_directory / self.RELATIVE_STATES_PATH).resolve()

    @property
    def jsonl_traces_path(self) -> Path:
        """Absolute path of the TRACE messages JSONL file."""
        return (self._output_directory / self.RELATIVE_TRACES_PATH).resolve()

    @property
    def jsonl_logs_path(self) -> Path:
        """Absolute path of the LOG messages JSONL file."""
        return (self._output_directory / self.RELATIVE_LOGS_PATH).resolve()

    @property
    def jsonl_controls_path(self) -> Path:
        """Absolute path of the CONTROL messages JSONL file."""
        return (self._output_directory / self.RELATIVE_CONTROLS_PATH).resolve()

    @property
    def jsonl_files(self) -> list[Path]:
        """All top-level JSONL files, one per message type."""
        return [
            self.jsonl_catalogs_path,
            self.jsonl_connection_status_path,
            self.jsonl_records_path,
            self.jsonl_specs_path,
            self.jsonl_states_path,
            self.jsonl_traces_path,
            self.jsonl_logs_path,
            self.jsonl_controls_path,
        ]

    @property
    def jsonl_files_to_insert(self) -> list[Path]:
        """JSONL files that should be inserted into DuckDB."""
        return self.jsonl_files

    def write(
        self,
        airbyte_messages: Iterable[AirbyteMessage],
    ) -> None:
        """Write Airbyte messages to JSONL files and load into DuckDB.

        Messages are written to JSONL files first (preserving order), then
        loaded into DuckDB with an explicit message_index column.
        """
        self._write_to_jsonl(airbyte_messages)
        self._load_into_duckdb()

    def _write_to_jsonl(
        self,
        airbyte_messages: Iterable[AirbyteMessage],
    ) -> None:
        """Write messages to JSONL files.

        Uses an LRU cache to manage open file objects, limiting the number of
        concurrently open file descriptors.
        """

        # Memoized by path via the LRU cache: re-opening the same file
        # reuses the open handle; evicted handles are closed by popitem.
        @cached(cache=self._file_cache)
        def _open_file(path: Path) -> TextIO:
            return open(path, "a")

        try:
            logger.info("Writing airbyte messages to disk")
            for message in airbyte_messages:
                # Non-AirbyteMessage items are silently skipped.
                if not isinstance(message, AirbyteMessage):
                    continue
                filepaths, messages = self._get_filepaths_and_messages(message)
                # strict=False: RECORD messages yield 3 path/message pairs,
                # other types yield 1 — the tuples are always equal-length,
                # but strict=False keeps this permissive.
                for filepath, msg_json in zip(filepaths, messages, strict=False):
                    # NOTE: per-stream paths are absolute; pathlib's `/`
                    # ignores the left operand when the right is absolute,
                    # so those land in record_per_stream_directory as-is.
                    _open_file(self._output_directory / filepath).write(f"{msg_json}\n")
            logger.info("Finished writing airbyte messages to disk")
        finally:
            # Flush and close everything still cached, then reset the cache.
            for f in self._file_cache.values():
                f.close()
            self._file_cache.clear()

    def _get_filepaths_and_messages(
        self,
        message: AirbyteMessage,
    ) -> tuple[tuple[str, ...], tuple[str, ...]]:
        """Get file paths and JSON strings for a message.

        Each message is serialized with a UUIDv7 _message_id for:
        - Portable ordering in JSONL files (sortable as strings)
        - Traceability for debugging
        - Consistency with PyAirbyte's ab_raw_id pattern

        Note: data-only files don't get _message_id since they only contain
        the record.data payload, not the full message envelope.

        Raises:
            NotImplementedError: If the message type has no handler below.
        """
        if message.type == AirbyteMessageType.CATALOG:
            return (self.RELATIVE_CATALOGS_PATH,), (
                _add_message_id(message.catalog.model_dump_json()),
            )

        if message.type == AirbyteMessageType.CONNECTION_STATUS:
            return (self.RELATIVE_CONNECTION_STATUS_PATH,), (
                _add_message_id(message.connectionStatus.model_dump_json()),
            )

        if message.type == AirbyteMessageType.RECORD:
            # RECORD messages fan out to three files: the combined records
            # file, a per-stream file, and a per-stream data-only file.
            stream_name = message.record.stream
            stream_file_path = (
                self.record_per_stream_directory
                / f"{sanitize_stream_name(stream_name)}.jsonl"
            )
            stream_file_path_data_only = (
                self.record_per_stream_directory
                / f"{sanitize_stream_name(stream_name)}_data_only.jsonl"
            )
            self.record_per_stream_paths[stream_name] = stream_file_path
            self.record_per_stream_paths_data_only[stream_name] = (
                stream_file_path_data_only
            )
            # Full message gets _message_id, data-only does not
            message_with_id = _add_message_id(message.model_dump_json())
            return (
                self.RELATIVE_RECORDS_PATH,
                str(stream_file_path),
                str(stream_file_path_data_only),
            ), (
                message_with_id,
                message_with_id,
                json.dumps(message.record.data, sort_keys=True),
            )

        if message.type == AirbyteMessageType.SPEC:
            return (self.RELATIVE_SPECS_PATH,), (
                _add_message_id(message.spec.model_dump_json()),
            )

        if message.type == AirbyteMessageType.STATE:
            return (self.RELATIVE_STATES_PATH,), (
                _add_message_id(message.state.model_dump_json()),
            )

        if message.type == AirbyteMessageType.TRACE:
            return (self.RELATIVE_TRACES_PATH,), (
                _add_message_id(message.trace.model_dump_json()),
            )

        if message.type == AirbyteMessageType.LOG:
            return (self.RELATIVE_LOGS_PATH,), (
                _add_message_id(message.log.model_dump_json()),
            )

        if message.type == AirbyteMessageType.CONTROL:
            return (self.RELATIVE_CONTROLS_PATH,), (
                _add_message_id(message.control.model_dump_json()),
            )

        raise NotImplementedError(
            f"No handling for AirbyteMessage type {message.type} has been implemented."
        )

    def _load_into_duckdb(self) -> None:
        """Load JSONL files into DuckDB with explicit message ordering."""
        # Fresh short-lived connection for the load; the cached query
        # connection is managed separately by get_connection().
        conn = duckdb.connect(str(self.duckdb_path))

        try:
            schema_name = None
            if self.schema:
                # Table/schema names pass through sanitize_table_name, so
                # interpolating them into SQL below is safe for identifiers.
                schema_name = "_".join([sanitize_table_name(s) for s in self.schema])
                conn.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_name}")
                conn.sql(f"USE {schema_name}")
                logger.info(f"Using schema {schema_name}")

            # Load main JSONL files with message_index for ordering
            for json_file in self.jsonl_files_to_insert:
                if json_file.exists():
                    table_name = sanitize_table_name(json_file.stem)
                    logger.info(f"Creating table {table_name} from {json_file}")
                    # Add message_index column for explicit ordering
                    conn.sql(f"""
                        CREATE TABLE {table_name} AS
                        SELECT
                            row_number() OVER () AS message_index,
                            *
                        FROM read_json_auto(
                            '{json_file}',
                            sample_size = {self.SAMPLE_SIZE},
                            format = 'newline_delimited'
                        )
                    """)
                    logger.info(f"Table {table_name} created")

            # Load per-stream record files
            for json_file in self.record_per_stream_paths_data_only.values():
                if json_file.exists():
                    table_name = sanitize_table_name(f"records_{json_file.stem}")
                    logger.info(f"Creating table {table_name} from {json_file}")
                    conn.sql(f"""
                        CREATE TABLE {table_name} AS
                        SELECT
                            row_number() OVER () AS message_index,
                            *
                        FROM read_json_auto(
                            '{json_file}',
                            sample_size = {self.SAMPLE_SIZE},
                            format = 'newline_delimited'
                        )
                    """)
                    logger.info(f"Table {table_name} created")
        finally:
            conn.close()

    def get_connection(self) -> duckdb.DuckDBPyConnection:
        """Get a connection to the DuckDB database.

        Returns a cached connection to avoid connection lifecycle issues
        when returning DuckDB relations from query methods.
        """
        if self._db_connection is None:
            self._db_connection = duckdb.connect(str(self.duckdb_path))
        return self._db_connection

    def query(
        self,
        sql: str,
    ) -> duckdb.DuckDBPyRelation:
        """Execute a SQL query against the message cache.

        Args:
            sql: SQL query to execute.

        Returns:
            DuckDB relation with query results.
        """
        conn = self.get_connection()
        if self.schema:
            # Re-derive the schema name the same way _load_into_duckdb did.
            schema_name = "_".join([sanitize_table_name(s) for s in self.schema])
            conn.sql(f"USE {schema_name}")
        return conn.sql(sql)

    def get_records_ordered(
        self,
        stream_name: str | None = None,
    ) -> duckdb.DuckDBPyRelation:
        """Get records in their original order.

        Args:
            stream_name: Optional stream name to filter by.

        Returns:
            DuckDB relation with records ordered by message_index.
        """
        if stream_name:
            # Per-stream tables are named records_<stream>_data_only by
            # _load_into_duckdb.
            table_name = sanitize_table_name(
                f"records_{sanitize_stream_name(stream_name)}_data_only"
            )
            return self.query(f"SELECT * FROM {table_name} ORDER BY message_index")
        return self.query("SELECT * FROM records ORDER BY message_index")

    def get_states_ordered(self) -> duckdb.DuckDBPyRelation:
        """Get state messages in their original order."""
        return self.query("SELECT * FROM states ORDER BY message_index")

    def get_record_count(
        self,
        stream_name: str | None = None,
    ) -> int:
        """Get the count of records.

        Args:
            stream_name: Optional stream name to filter by.

        Returns:
            Number of records.
        """
        if stream_name:
            table_name = sanitize_table_name(
                f"records_{sanitize_stream_name(stream_name)}_data_only"
            )
            result = self.query(f"SELECT COUNT(*) FROM {table_name}")
        else:
            result = self.query("SELECT COUNT(*) FROM records")
        return result.fetchone()[0]
@@ -0,0 +1,126 @@
1
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
2
+ """Record obfuscation utilities for live tests.
3
+
4
+ This module provides functions for obfuscating sensitive data in Airbyte
5
+ records while preserving type and length information for schema inference.
6
+
7
+ Based on airbyte-ci implementation:
8
+ https://github.com/airbytehq/airbyte/blob/master/tools/bin/record_obfuscator.py
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import hashlib
14
+ import json
15
+ from typing import Any
16
+
17
+ from airbyte_protocol.models import AirbyteMessage
18
+ from airbyte_protocol.models import Type as AirbyteMessageType
19
+
20
+
21
def _generate_hash(value: Any) -> str:
    """Return the first 16 hex chars of the SHA-256 digest of ``str(value)``."""
    digest = hashlib.sha256(str(value).encode())
    return digest.hexdigest()[:16]
24
+
25
+
26
def obfuscate_value(value: Any) -> str:
    """Obfuscate a value while retaining type and length information.

    The obfuscated value encodes:
    - The original type (string, integer, number, boolean, null, array, object)
    - The length/size of the original value
    - A hash for uniqueness (truncated for readability)

    This allows schema inference to work correctly on obfuscated data.
    """
    if isinstance(value, str):
        return f"string_len-{len(value)}_{_generate_hash(value)}"
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(value, bool):
        return f"boolean_{_generate_hash(value)}"
    if isinstance(value, (int, float)):
        kind = "integer" if isinstance(value, int) else "number"
        return f"{kind}_len-{len(str(value))}_{_generate_hash(value)}"
    if value is None:
        return f"null_{_generate_hash(value)}"
    if isinstance(value, (list, dict)):
        # Hash a canonical JSON form so key ordering doesn't matter.
        canonical = json.dumps(value, sort_keys=True)
        if isinstance(value, list):
            return f"array_len-{len(value)}_{_generate_hash(canonical)}"
        return f"object_len-{len(value.keys())}_{_generate_hash(canonical)}"
    # Fallback for unknown types
    return f"unknown_{_generate_hash(value)}"
52
+
53
+
54
def obfuscate_record_data(data: dict[str, Any]) -> dict[str, str]:
    """Return *data* with every value obfuscated.

    Keys are preserved verbatim; each value is replaced by its obfuscated
    placeholder.
    """
    obfuscated: dict[str, str] = {}
    for field_name, field_value in data.items():
        obfuscated[field_name] = obfuscate_value(field_value)
    return obfuscated
60
+
61
+
62
def obfuscate_message(message: AirbyteMessage) -> AirbyteMessage:
    """Obfuscate an Airbyte message if it's a RECORD type.

    Non-RECORD messages are returned unchanged.
    RECORD messages have their data field obfuscated.

    Args:
        message: The Airbyte message to (potentially) obfuscate.

    Returns:
        The original message, or a new RECORD message whose ``record.data``
        values are replaced by obfuscated placeholders.
    """
    if message.type != AirbyteMessageType.RECORD:
        return message

    if not message.record or not message.record.data:
        return message

    # Create a copy with obfuscated data
    obfuscated_data = obfuscate_record_data(message.record.data)

    # Rebuild the message via the pydantic v2 API (model_dump /
    # model_validate) for consistency with the rest of this package
    # (the message cache already serializes with model_dump_json);
    # .dict() / .parse_obj() are deprecated v1 shims.
    message_dict = message.model_dump()
    message_dict["record"]["data"] = obfuscated_data
    return AirbyteMessage.model_validate(message_dict)
81
+
82
+
83
def obfuscate_messages(
    messages: list[AirbyteMessage],
) -> list[AirbyteMessage]:
    """Obfuscate every message in *messages*.

    RECORD messages have their data obfuscated; all other message types
    pass through unchanged.
    """
    return list(map(obfuscate_message, messages))
91
+
92
+
93
def get_type_from_obfuscated_value(obfuscated: str) -> Any:
    """Convert an obfuscated value back to a representative value of the original type.

    This is useful for schema inference on obfuscated data.

    Based on airbyte-ci implementation:
    https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/live-tests/src/live_tests/commons/models.py#L369-L390
    """
    # Prefix → representative sample of the original JSON type. The
    # prefixes are mutually exclusive, so iteration order is irrelevant.
    samples = (
        ("string_", "a"),
        ("integer_", 0),
        ("number_", 0.1),
        ("boolean_", True),
        ("null_", None),
        ("array_", []),
        ("object_", {}),
    )
    for prefix, sample in samples:
        if obfuscated.startswith(prefix):
            return sample
    # Unknown type, return as string
    return "unknown"
117
+
118
+
119
def convert_obfuscated_record_to_typed(
    obfuscated_data: dict[str, str],
) -> dict[str, Any]:
    """Map obfuscated record data to representative typed values.

    Each obfuscated placeholder is replaced by a sample value of its
    original type so schema inference can run on the result.
    """
    typed: dict[str, Any] = {}
    for field_name, placeholder in obfuscated_data.items():
        typed[field_name] = get_type_from_obfuscated_value(placeholder)
    return typed
@@ -0,0 +1,29 @@
1
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
2
+ """Regression test utilities for comparing connector outputs.
3
+
4
+ This module provides functions for comparing control and target connector
5
+ outputs to detect regressions in data integrity.
6
+
7
+ Based on airbyte-ci implementation:
8
+ https://github.com/airbytehq/airbyte/blob/master/airbyte-ci/connectors/live-tests/src/live_tests/regression_tests/test_read.py
9
+ """
10
+
11
+ from airbyte_ops_mcp.live_tests.regression.comparators import (
12
+ ComparisonResult,
13
+ RecordDiff,
14
+ StreamComparisonResult,
15
+ compare_all_records,
16
+ compare_primary_keys,
17
+ compare_record_counts,
18
+ compare_record_schemas,
19
+ )
20
+
21
+ __all__ = [
22
+ "ComparisonResult",
23
+ "RecordDiff",
24
+ "StreamComparisonResult",
25
+ "compare_all_records",
26
+ "compare_primary_keys",
27
+ "compare_record_counts",
28
+ "compare_record_schemas",
29
+ ]