airbyte-cdk 0.67.0__py3-none-any.whl → 0.67.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/abstract_source.py +30 -69
- airbyte_cdk/sources/connector_state_manager.py +12 -26
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +552 -524
- airbyte_cdk/sources/file_based/config/csv_format.py +2 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
- airbyte_cdk/sources/streams/__init__.py +2 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
- airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
- airbyte_cdk/sources/streams/core.py +36 -34
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +3 -2
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +31 -31
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
- unit_tests/sources/file_based/config/test_csv_format.py +6 -1
- unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
- unit_tests/sources/file_based/test_scenarios.py +2 -2
- unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
- unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
- unit_tests/sources/streams/test_stream_read.py +221 -11
- unit_tests/sources/test_abstract_source.py +142 -130
- unit_tests/sources/test_connector_state_manager.py +3 -124
- unit_tests/sources/test_source.py +18 -14
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,34 @@
|
|
1
1
|
#
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
|
+
|
4
5
|
import logging
|
5
|
-
from typing import Any, Iterable, List, Mapping, Optional, Union
|
6
|
+
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
|
6
7
|
from unittest.mock import Mock
|
7
8
|
|
8
9
|
import pytest
|
9
|
-
from airbyte_cdk.models import
|
10
|
+
from airbyte_cdk.models import (
|
11
|
+
AirbyteLogMessage,
|
12
|
+
AirbyteMessage,
|
13
|
+
AirbyteStateBlob,
|
14
|
+
AirbyteStateMessage,
|
15
|
+
AirbyteStateType,
|
16
|
+
AirbyteStream,
|
17
|
+
AirbyteStreamState,
|
18
|
+
ConfiguredAirbyteStream,
|
19
|
+
DestinationSyncMode,
|
20
|
+
Level,
|
21
|
+
StreamDescriptor,
|
22
|
+
SyncMode,
|
23
|
+
)
|
10
24
|
from airbyte_cdk.models import Type as MessageType
|
11
|
-
from airbyte_cdk.sources.
|
25
|
+
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
26
|
+
from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
|
12
27
|
from airbyte_cdk.sources.streams import Stream
|
13
28
|
from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
|
14
|
-
from airbyte_cdk.sources.streams.concurrent.cursor import NoopCursor
|
29
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, NoopCursor
|
30
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
31
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
15
32
|
from airbyte_cdk.sources.streams.core import StreamData
|
16
33
|
from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
|
17
34
|
from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
|
@@ -49,20 +66,66 @@ class _MockStream(Stream):
|
|
49
66
|
return {}
|
50
67
|
|
51
68
|
|
69
|
+
class MockConcurrentCursor(Cursor):
|
70
|
+
_state: MutableMapping[str, Any]
|
71
|
+
_message_repository: MessageRepository
|
72
|
+
|
73
|
+
def __init__(self, message_repository: MessageRepository):
|
74
|
+
self._message_repository = message_repository
|
75
|
+
self._state = {}
|
76
|
+
|
77
|
+
@property
|
78
|
+
def state(self) -> MutableMapping[str, Any]:
|
79
|
+
return self._state
|
80
|
+
|
81
|
+
def observe(self, record: Record) -> None:
|
82
|
+
partition = str(record.data.get("partition"))
|
83
|
+
timestamp = record.data.get("created_at")
|
84
|
+
self._state[partition] = {"created_at": timestamp}
|
85
|
+
|
86
|
+
def close_partition(self, partition: Partition) -> None:
|
87
|
+
self._message_repository.emit_message(
|
88
|
+
AirbyteMessage(
|
89
|
+
type=MessageType.STATE,
|
90
|
+
state=AirbyteStateMessage(
|
91
|
+
type=AirbyteStateType.STREAM,
|
92
|
+
stream=AirbyteStreamState(
|
93
|
+
stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
|
94
|
+
stream_state=AirbyteStateBlob(**self._state),
|
95
|
+
)
|
96
|
+
),
|
97
|
+
)
|
98
|
+
)
|
99
|
+
|
100
|
+
def ensure_at_least_one_state_emitted(self) -> None:
|
101
|
+
pass
|
102
|
+
|
103
|
+
|
52
104
|
def _stream(slice_to_partition_mapping, slice_logger, logger, message_repository):
|
53
105
|
return _MockStream(slice_to_partition_mapping)
|
54
106
|
|
55
107
|
|
56
|
-
def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository):
|
108
|
+
def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor: Cursor = NoopCursor()):
|
57
109
|
stream = _stream(slice_to_partition_mapping, slice_logger, logger, message_repository)
|
58
110
|
source = Mock()
|
59
111
|
source._slice_logger = slice_logger
|
60
112
|
source.message_repository = message_repository
|
61
|
-
stream = StreamFacade.create_from_stream(stream, source, logger, _NO_STATE,
|
113
|
+
stream = StreamFacade.create_from_stream(stream, source, logger, _NO_STATE, cursor)
|
62
114
|
stream.logger.setLevel(logger.level)
|
63
115
|
return stream
|
64
116
|
|
65
117
|
|
118
|
+
def _incremental_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, timestamp):
|
119
|
+
stream = _stream(slice_to_partition_mapping, slice_logger, logger, message_repository)
|
120
|
+
stream.state = {"created_at": timestamp}
|
121
|
+
return stream
|
122
|
+
|
123
|
+
|
124
|
+
def _incremental_concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor):
|
125
|
+
stream = _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor)
|
126
|
+
return stream
|
127
|
+
|
128
|
+
|
66
129
|
@pytest.mark.parametrize(
|
67
130
|
"constructor",
|
68
131
|
[
|
@@ -73,6 +136,8 @@ def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message
|
|
73
136
|
def test_full_refresh_read_a_single_slice_with_debug(constructor):
|
74
137
|
# This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object.
|
75
138
|
# It is done by running the same test cases on both streams
|
139
|
+
configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), sync_mode=SyncMode.full_refresh,destination_sync_mode=DestinationSyncMode.overwrite)
|
140
|
+
internal_config = InternalConfig()
|
76
141
|
records = [
|
77
142
|
{"id": 1, "partition": 1},
|
78
143
|
{"id": 2, "partition": 1},
|
@@ -82,6 +147,7 @@ def test_full_refresh_read_a_single_slice_with_debug(constructor):
|
|
82
147
|
logger = _mock_logger(True)
|
83
148
|
message_repository = InMemoryMessageRepository(Level.DEBUG)
|
84
149
|
stream = constructor(slice_to_partition, slice_logger, logger, message_repository)
|
150
|
+
state_manager = ConnectorStateManager(stream_instance_map={})
|
85
151
|
|
86
152
|
expected_records = [
|
87
153
|
AirbyteMessage(
|
@@ -94,7 +160,22 @@ def test_full_refresh_read_a_single_slice_with_debug(constructor):
|
|
94
160
|
*records,
|
95
161
|
]
|
96
162
|
|
97
|
-
|
163
|
+
# Temporary check to only validate the final state message for synchronous sources since it has not been implemented for concurrent yet
|
164
|
+
if constructor == _stream:
|
165
|
+
expected_records.append(
|
166
|
+
AirbyteMessage(
|
167
|
+
type=MessageType.STATE,
|
168
|
+
state=AirbyteStateMessage(
|
169
|
+
type=AirbyteStateType.STREAM,
|
170
|
+
stream=AirbyteStreamState(
|
171
|
+
stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
|
172
|
+
stream_state=AirbyteStateBlob(__ab_full_refresh_state_message=True),
|
173
|
+
)
|
174
|
+
),
|
175
|
+
),
|
176
|
+
)
|
177
|
+
|
178
|
+
actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
|
98
179
|
|
99
180
|
assert expected_records == actual_records
|
100
181
|
|
@@ -109,9 +190,12 @@ def test_full_refresh_read_a_single_slice_with_debug(constructor):
|
|
109
190
|
def test_full_refresh_read_a_single_slice(constructor):
|
110
191
|
# This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object.
|
111
192
|
# It is done by running the same test cases on both streams
|
193
|
+
configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), sync_mode=SyncMode.full_refresh,destination_sync_mode=DestinationSyncMode.overwrite)
|
194
|
+
internal_config = InternalConfig()
|
112
195
|
logger = _mock_logger()
|
113
196
|
slice_logger = DebugSliceLogger()
|
114
197
|
message_repository = InMemoryMessageRepository(Level.INFO)
|
198
|
+
state_manager = ConnectorStateManager(stream_instance_map={})
|
115
199
|
|
116
200
|
records = [
|
117
201
|
{"id": 1, "partition": 1},
|
@@ -122,7 +206,22 @@ def test_full_refresh_read_a_single_slice(constructor):
|
|
122
206
|
|
123
207
|
expected_records = [*records]
|
124
208
|
|
125
|
-
|
209
|
+
# Temporary check to only validate the final state message for synchronous sources since it has not been implemented for concurrent yet
|
210
|
+
if constructor == _stream:
|
211
|
+
expected_records.append(
|
212
|
+
AirbyteMessage(
|
213
|
+
type=MessageType.STATE,
|
214
|
+
state=AirbyteStateMessage(
|
215
|
+
type=AirbyteStateType.STREAM,
|
216
|
+
stream=AirbyteStreamState(
|
217
|
+
stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
|
218
|
+
stream_state=AirbyteStateBlob(__ab_full_refresh_state_message=True),
|
219
|
+
)
|
220
|
+
),
|
221
|
+
),
|
222
|
+
)
|
223
|
+
|
224
|
+
actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
|
126
225
|
|
127
226
|
assert expected_records == actual_records
|
128
227
|
|
@@ -137,9 +236,12 @@ def test_full_refresh_read_a_single_slice(constructor):
|
|
137
236
|
def test_full_refresh_read_a_two_slices(constructor):
|
138
237
|
# This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object
|
139
238
|
# It is done by running the same test cases on both streams
|
239
|
+
configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), sync_mode=SyncMode.full_refresh,destination_sync_mode=DestinationSyncMode.overwrite)
|
240
|
+
internal_config = InternalConfig()
|
140
241
|
logger = _mock_logger()
|
141
242
|
slice_logger = DebugSliceLogger()
|
142
243
|
message_repository = InMemoryMessageRepository(Level.INFO)
|
244
|
+
state_manager = ConnectorStateManager(stream_instance_map={})
|
143
245
|
|
144
246
|
records_partition_1 = [
|
145
247
|
{"id": 1, "partition": 1},
|
@@ -157,16 +259,111 @@ def test_full_refresh_read_a_two_slices(constructor):
|
|
157
259
|
*records_partition_2,
|
158
260
|
]
|
159
261
|
|
160
|
-
|
262
|
+
# Temporary check to only validate the final state message for synchronous sources since it has not been implemented for concurrent yet
|
263
|
+
if constructor == _stream:
|
264
|
+
expected_records.append(
|
265
|
+
AirbyteMessage(
|
266
|
+
type=MessageType.STATE,
|
267
|
+
state=AirbyteStateMessage(
|
268
|
+
type=AirbyteStateType.STREAM,
|
269
|
+
stream=AirbyteStreamState(
|
270
|
+
stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
|
271
|
+
stream_state=AirbyteStateBlob(__ab_full_refresh_state_message=True),
|
272
|
+
)
|
273
|
+
),
|
274
|
+
),
|
275
|
+
)
|
276
|
+
|
277
|
+
actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
|
161
278
|
|
162
279
|
for record in expected_records:
|
163
280
|
assert record in actual_records
|
164
281
|
assert len(expected_records) == len(actual_records)
|
165
282
|
|
166
283
|
|
167
|
-
def
|
284
|
+
def test_incremental_read_two_slices():
|
285
|
+
# This test verifies that a stream running in incremental mode emits state messages correctly
|
286
|
+
configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], json_schema={}), sync_mode=SyncMode.incremental,destination_sync_mode=DestinationSyncMode.overwrite)
|
287
|
+
internal_config = InternalConfig()
|
288
|
+
logger = _mock_logger()
|
289
|
+
slice_logger = DebugSliceLogger()
|
290
|
+
message_repository = InMemoryMessageRepository(Level.INFO)
|
291
|
+
state_manager = ConnectorStateManager(stream_instance_map={})
|
292
|
+
timestamp = "1708899427"
|
293
|
+
|
294
|
+
records_partition_1 = [
|
295
|
+
{"id": 1, "partition": 1},
|
296
|
+
{"id": 2, "partition": 1},
|
297
|
+
]
|
298
|
+
records_partition_2 = [
|
299
|
+
{"id": 3, "partition": 2},
|
300
|
+
{"id": 4, "partition": 2},
|
301
|
+
]
|
302
|
+
slice_to_partition = {1: records_partition_1, 2: records_partition_2}
|
303
|
+
stream = _incremental_stream(slice_to_partition, slice_logger, logger, message_repository, timestamp)
|
304
|
+
|
305
|
+
expected_records = [
|
306
|
+
*records_partition_1,
|
307
|
+
_create_state_message("__mock_stream", {"created_at": timestamp}),
|
308
|
+
*records_partition_2,
|
309
|
+
_create_state_message("__mock_stream", {"created_at": timestamp})
|
310
|
+
]
|
311
|
+
|
312
|
+
actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
|
313
|
+
|
314
|
+
for record in expected_records:
|
315
|
+
assert record in actual_records
|
316
|
+
assert len(expected_records) == len(actual_records)
|
317
|
+
|
318
|
+
|
319
|
+
def test_concurrent_incremental_read_two_slices():
|
320
|
+
# This test verifies that an incremental concurrent stream manages state correctly for multiple slices syncing concurrently
|
321
|
+
configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], json_schema={}), sync_mode=SyncMode.incremental,destination_sync_mode=DestinationSyncMode.overwrite)
|
322
|
+
internal_config = InternalConfig()
|
323
|
+
logger = _mock_logger()
|
324
|
+
slice_logger = DebugSliceLogger()
|
325
|
+
message_repository = InMemoryMessageRepository(Level.INFO)
|
326
|
+
state_manager = ConnectorStateManager(stream_instance_map={})
|
327
|
+
slice_timestamp_1 = "1708850000"
|
328
|
+
slice_timestamp_2 = "1708950000"
|
329
|
+
cursor = MockConcurrentCursor(message_repository)
|
330
|
+
|
331
|
+
records_partition_1 = [
|
332
|
+
{"id": 1, "partition": 1, "created_at": "1708800000"},
|
333
|
+
{"id": 2, "partition": 1, "created_at": slice_timestamp_1},
|
334
|
+
]
|
335
|
+
records_partition_2 = [
|
336
|
+
{"id": 3, "partition": 2, "created_at": "1708900000"},
|
337
|
+
{"id": 4, "partition": 2, "created_at": slice_timestamp_2},
|
338
|
+
]
|
339
|
+
slice_to_partition = {1: records_partition_1, 2: records_partition_2}
|
340
|
+
stream = _incremental_concurrent_stream(slice_to_partition, slice_logger, logger, message_repository, cursor)
|
341
|
+
|
342
|
+
expected_records = [
|
343
|
+
*records_partition_1,
|
344
|
+
*records_partition_2,
|
345
|
+
]
|
346
|
+
|
347
|
+
expected_state = _create_state_message("__mock_stream", {"1": {"created_at": slice_timestamp_1}, "2": {"created_at": slice_timestamp_2}})
|
348
|
+
|
349
|
+
actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
|
350
|
+
|
351
|
+
for record in expected_records:
|
352
|
+
assert record in actual_records
|
353
|
+
assert len(expected_records) == len(actual_records)
|
354
|
+
|
355
|
+
# We don't have a real source that reads from the message_repository for state, so we read from the queue directly to verify
|
356
|
+
# the cursor observed records correctly and updated partition states
|
357
|
+
mock_partition = Mock()
|
358
|
+
cursor.close_partition(mock_partition)
|
359
|
+
actual_state = [state for state in message_repository.consume_queue()]
|
360
|
+
assert len(actual_state) == 1
|
361
|
+
assert actual_state[0] == expected_state
|
362
|
+
|
363
|
+
|
364
|
+
def _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config):
|
168
365
|
records = []
|
169
|
-
for record in stream.
|
366
|
+
for record in stream.read(configured_stream, logger, slice_logger, {}, state_manager, internal_config):
|
170
367
|
for message in message_repository.consume_queue():
|
171
368
|
records.append(message)
|
172
369
|
records.append(record)
|
@@ -192,3 +389,16 @@ def _mock_logger(enabled_for_debug=False):
|
|
192
389
|
logger.isEnabledFor.return_value = enabled_for_debug
|
193
390
|
logger.level = logging.DEBUG if enabled_for_debug else logging.INFO
|
194
391
|
return logger
|
392
|
+
|
393
|
+
|
394
|
+
def _create_state_message(stream: str, state: Mapping[str, Any]) -> AirbyteMessage:
|
395
|
+
return AirbyteMessage(
|
396
|
+
type=MessageType.STATE,
|
397
|
+
state=AirbyteStateMessage(
|
398
|
+
type=AirbyteStateType.STREAM,
|
399
|
+
stream=AirbyteStreamState(
|
400
|
+
stream_descriptor=StreamDescriptor(name=stream, namespace=None),
|
401
|
+
stream_state=AirbyteStateBlob(**state),
|
402
|
+
)
|
403
|
+
),
|
404
|
+
)
|