airbyte-cdk 0.67.0__py3-none-any.whl → 0.67.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. airbyte_cdk/sources/abstract_source.py +30 -69
  2. airbyte_cdk/sources/connector_state_manager.py +12 -26
  3. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +552 -524
  4. airbyte_cdk/sources/file_based/config/csv_format.py +2 -0
  5. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
  6. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
  7. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
  8. airbyte_cdk/sources/streams/__init__.py +2 -2
  9. airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
  10. airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
  11. airbyte_cdk/sources/streams/core.py +36 -34
  12. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +3 -2
  13. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +31 -31
  14. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
  15. unit_tests/sources/file_based/config/test_csv_format.py +6 -1
  16. unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
  17. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
  18. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
  19. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
  20. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
  21. unit_tests/sources/file_based/test_scenarios.py +2 -2
  22. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
  23. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
  24. unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
  25. unit_tests/sources/streams/test_stream_read.py +221 -11
  26. unit_tests/sources/test_abstract_source.py +142 -130
  27. unit_tests/sources/test_connector_state_manager.py +3 -124
  28. unit_tests/sources/test_source.py +18 -14
  29. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
  30. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
  31. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,34 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
+
4
5
  import logging
5
- from typing import Any, Iterable, List, Mapping, Optional, Union
6
+ from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
6
7
  from unittest.mock import Mock
7
8
 
8
9
  import pytest
9
- from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode
10
+ from airbyte_cdk.models import (
11
+ AirbyteLogMessage,
12
+ AirbyteMessage,
13
+ AirbyteStateBlob,
14
+ AirbyteStateMessage,
15
+ AirbyteStateType,
16
+ AirbyteStream,
17
+ AirbyteStreamState,
18
+ ConfiguredAirbyteStream,
19
+ DestinationSyncMode,
20
+ Level,
21
+ StreamDescriptor,
22
+ SyncMode,
23
+ )
10
24
  from airbyte_cdk.models import Type as MessageType
11
- from airbyte_cdk.sources.message import InMemoryMessageRepository
25
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
26
+ from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
12
27
  from airbyte_cdk.sources.streams import Stream
13
28
  from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
14
- from airbyte_cdk.sources.streams.concurrent.cursor import NoopCursor
29
+ from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, NoopCursor
30
+ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
31
+ from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
15
32
  from airbyte_cdk.sources.streams.core import StreamData
16
33
  from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
17
34
  from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
@@ -49,20 +66,66 @@ class _MockStream(Stream):
49
66
  return {}
50
67
 
51
68
 
69
+ class MockConcurrentCursor(Cursor):
70
+ _state: MutableMapping[str, Any]
71
+ _message_repository: MessageRepository
72
+
73
+ def __init__(self, message_repository: MessageRepository):
74
+ self._message_repository = message_repository
75
+ self._state = {}
76
+
77
+ @property
78
+ def state(self) -> MutableMapping[str, Any]:
79
+ return self._state
80
+
81
+ def observe(self, record: Record) -> None:
82
+ partition = str(record.data.get("partition"))
83
+ timestamp = record.data.get("created_at")
84
+ self._state[partition] = {"created_at": timestamp}
85
+
86
+ def close_partition(self, partition: Partition) -> None:
87
+ self._message_repository.emit_message(
88
+ AirbyteMessage(
89
+ type=MessageType.STATE,
90
+ state=AirbyteStateMessage(
91
+ type=AirbyteStateType.STREAM,
92
+ stream=AirbyteStreamState(
93
+ stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
94
+ stream_state=AirbyteStateBlob(**self._state),
95
+ )
96
+ ),
97
+ )
98
+ )
99
+
100
+ def ensure_at_least_one_state_emitted(self) -> None:
101
+ pass
102
+
103
+
52
104
  def _stream(slice_to_partition_mapping, slice_logger, logger, message_repository):
53
105
  return _MockStream(slice_to_partition_mapping)
54
106
 
55
107
 
56
- def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository):
108
+ def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor: Cursor = NoopCursor()):
57
109
  stream = _stream(slice_to_partition_mapping, slice_logger, logger, message_repository)
58
110
  source = Mock()
59
111
  source._slice_logger = slice_logger
60
112
  source.message_repository = message_repository
61
- stream = StreamFacade.create_from_stream(stream, source, logger, _NO_STATE, NoopCursor())
113
+ stream = StreamFacade.create_from_stream(stream, source, logger, _NO_STATE, cursor)
62
114
  stream.logger.setLevel(logger.level)
63
115
  return stream
64
116
 
65
117
 
118
+ def _incremental_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, timestamp):
119
+ stream = _stream(slice_to_partition_mapping, slice_logger, logger, message_repository)
120
+ stream.state = {"created_at": timestamp}
121
+ return stream
122
+
123
+
124
+ def _incremental_concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor):
125
+ stream = _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor)
126
+ return stream
127
+
128
+
66
129
  @pytest.mark.parametrize(
67
130
  "constructor",
68
131
  [
@@ -73,6 +136,8 @@ def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message
73
136
  def test_full_refresh_read_a_single_slice_with_debug(constructor):
74
137
  # This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object.
75
138
  # It is done by running the same test cases on both streams
139
+ configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), sync_mode=SyncMode.full_refresh,destination_sync_mode=DestinationSyncMode.overwrite)
140
+ internal_config = InternalConfig()
76
141
  records = [
77
142
  {"id": 1, "partition": 1},
78
143
  {"id": 2, "partition": 1},
@@ -82,6 +147,7 @@ def test_full_refresh_read_a_single_slice_with_debug(constructor):
82
147
  logger = _mock_logger(True)
83
148
  message_repository = InMemoryMessageRepository(Level.DEBUG)
84
149
  stream = constructor(slice_to_partition, slice_logger, logger, message_repository)
150
+ state_manager = ConnectorStateManager(stream_instance_map={})
85
151
 
86
152
  expected_records = [
87
153
  AirbyteMessage(
@@ -94,7 +160,22 @@ def test_full_refresh_read_a_single_slice_with_debug(constructor):
94
160
  *records,
95
161
  ]
96
162
 
97
- actual_records = _read(stream, logger, slice_logger, message_repository)
163
+ # Temporary check to only validate the final state message for synchronous sources since it has not been implemented for concurrent yet
164
+ if constructor == _stream:
165
+ expected_records.append(
166
+ AirbyteMessage(
167
+ type=MessageType.STATE,
168
+ state=AirbyteStateMessage(
169
+ type=AirbyteStateType.STREAM,
170
+ stream=AirbyteStreamState(
171
+ stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
172
+ stream_state=AirbyteStateBlob(__ab_full_refresh_state_message=True),
173
+ )
174
+ ),
175
+ ),
176
+ )
177
+
178
+ actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
98
179
 
99
180
  assert expected_records == actual_records
100
181
 
@@ -109,9 +190,12 @@ def test_full_refresh_read_a_single_slice_with_debug(constructor):
109
190
  def test_full_refresh_read_a_single_slice(constructor):
110
191
  # This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object.
111
192
  # It is done by running the same test cases on both streams
193
+ configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), sync_mode=SyncMode.full_refresh,destination_sync_mode=DestinationSyncMode.overwrite)
194
+ internal_config = InternalConfig()
112
195
  logger = _mock_logger()
113
196
  slice_logger = DebugSliceLogger()
114
197
  message_repository = InMemoryMessageRepository(Level.INFO)
198
+ state_manager = ConnectorStateManager(stream_instance_map={})
115
199
 
116
200
  records = [
117
201
  {"id": 1, "partition": 1},
@@ -122,7 +206,22 @@ def test_full_refresh_read_a_single_slice(constructor):
122
206
 
123
207
  expected_records = [*records]
124
208
 
125
- actual_records = _read(stream, logger, slice_logger, message_repository)
209
+ # Temporary check to only validate the final state message for synchronous sources since it has not been implemented for concurrent yet
210
+ if constructor == _stream:
211
+ expected_records.append(
212
+ AirbyteMessage(
213
+ type=MessageType.STATE,
214
+ state=AirbyteStateMessage(
215
+ type=AirbyteStateType.STREAM,
216
+ stream=AirbyteStreamState(
217
+ stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
218
+ stream_state=AirbyteStateBlob(__ab_full_refresh_state_message=True),
219
+ )
220
+ ),
221
+ ),
222
+ )
223
+
224
+ actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
126
225
 
127
226
  assert expected_records == actual_records
128
227
 
@@ -137,9 +236,12 @@ def test_full_refresh_read_a_single_slice(constructor):
137
236
  def test_full_refresh_read_a_two_slices(constructor):
138
237
  # This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object
139
238
  # It is done by running the same test cases on both streams
239
+ configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), sync_mode=SyncMode.full_refresh,destination_sync_mode=DestinationSyncMode.overwrite)
240
+ internal_config = InternalConfig()
140
241
  logger = _mock_logger()
141
242
  slice_logger = DebugSliceLogger()
142
243
  message_repository = InMemoryMessageRepository(Level.INFO)
244
+ state_manager = ConnectorStateManager(stream_instance_map={})
143
245
 
144
246
  records_partition_1 = [
145
247
  {"id": 1, "partition": 1},
@@ -157,16 +259,111 @@ def test_full_refresh_read_a_two_slices(constructor):
157
259
  *records_partition_2,
158
260
  ]
159
261
 
160
- actual_records = _read(stream, logger, slice_logger, message_repository)
262
+ # Temporary check to only validate the final state message for synchronous sources since it has not been implemented for concurrent yet
263
+ if constructor == _stream:
264
+ expected_records.append(
265
+ AirbyteMessage(
266
+ type=MessageType.STATE,
267
+ state=AirbyteStateMessage(
268
+ type=AirbyteStateType.STREAM,
269
+ stream=AirbyteStreamState(
270
+ stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
271
+ stream_state=AirbyteStateBlob(__ab_full_refresh_state_message=True),
272
+ )
273
+ ),
274
+ ),
275
+ )
276
+
277
+ actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
161
278
 
162
279
  for record in expected_records:
163
280
  assert record in actual_records
164
281
  assert len(expected_records) == len(actual_records)
165
282
 
166
283
 
167
- def _read(stream, logger, slice_logger, message_repository):
284
+ def test_incremental_read_two_slices():
285
+ # This test verifies that a stream running in incremental mode emits state messages correctly
286
+ configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], json_schema={}), sync_mode=SyncMode.incremental,destination_sync_mode=DestinationSyncMode.overwrite)
287
+ internal_config = InternalConfig()
288
+ logger = _mock_logger()
289
+ slice_logger = DebugSliceLogger()
290
+ message_repository = InMemoryMessageRepository(Level.INFO)
291
+ state_manager = ConnectorStateManager(stream_instance_map={})
292
+ timestamp = "1708899427"
293
+
294
+ records_partition_1 = [
295
+ {"id": 1, "partition": 1},
296
+ {"id": 2, "partition": 1},
297
+ ]
298
+ records_partition_2 = [
299
+ {"id": 3, "partition": 2},
300
+ {"id": 4, "partition": 2},
301
+ ]
302
+ slice_to_partition = {1: records_partition_1, 2: records_partition_2}
303
+ stream = _incremental_stream(slice_to_partition, slice_logger, logger, message_repository, timestamp)
304
+
305
+ expected_records = [
306
+ *records_partition_1,
307
+ _create_state_message("__mock_stream", {"created_at": timestamp}),
308
+ *records_partition_2,
309
+ _create_state_message("__mock_stream", {"created_at": timestamp})
310
+ ]
311
+
312
+ actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
313
+
314
+ for record in expected_records:
315
+ assert record in actual_records
316
+ assert len(expected_records) == len(actual_records)
317
+
318
+
319
+ def test_concurrent_incremental_read_two_slices():
320
+ # This test verifies that an incremental concurrent stream manages state correctly for multiple slices syncing concurrently
321
+ configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], json_schema={}), sync_mode=SyncMode.incremental,destination_sync_mode=DestinationSyncMode.overwrite)
322
+ internal_config = InternalConfig()
323
+ logger = _mock_logger()
324
+ slice_logger = DebugSliceLogger()
325
+ message_repository = InMemoryMessageRepository(Level.INFO)
326
+ state_manager = ConnectorStateManager(stream_instance_map={})
327
+ slice_timestamp_1 = "1708850000"
328
+ slice_timestamp_2 = "1708950000"
329
+ cursor = MockConcurrentCursor(message_repository)
330
+
331
+ records_partition_1 = [
332
+ {"id": 1, "partition": 1, "created_at": "1708800000"},
333
+ {"id": 2, "partition": 1, "created_at": slice_timestamp_1},
334
+ ]
335
+ records_partition_2 = [
336
+ {"id": 3, "partition": 2, "created_at": "1708900000"},
337
+ {"id": 4, "partition": 2, "created_at": slice_timestamp_2},
338
+ ]
339
+ slice_to_partition = {1: records_partition_1, 2: records_partition_2}
340
+ stream = _incremental_concurrent_stream(slice_to_partition, slice_logger, logger, message_repository, cursor)
341
+
342
+ expected_records = [
343
+ *records_partition_1,
344
+ *records_partition_2,
345
+ ]
346
+
347
+ expected_state = _create_state_message("__mock_stream", {"1": {"created_at": slice_timestamp_1}, "2": {"created_at": slice_timestamp_2}})
348
+
349
+ actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
350
+
351
+ for record in expected_records:
352
+ assert record in actual_records
353
+ assert len(expected_records) == len(actual_records)
354
+
355
+ # We don't have a real source that reads from the message_repository for state, so we read from the queue directly to verify
356
+ # the cursor observed records correctly and updated partition states
357
+ mock_partition = Mock()
358
+ cursor.close_partition(mock_partition)
359
+ actual_state = [state for state in message_repository.consume_queue()]
360
+ assert len(actual_state) == 1
361
+ assert actual_state[0] == expected_state
362
+
363
+
364
+ def _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config):
168
365
  records = []
169
- for record in stream.read_full_refresh(_A_CURSOR_FIELD, logger, slice_logger):
366
+ for record in stream.read(configured_stream, logger, slice_logger, {}, state_manager, internal_config):
170
367
  for message in message_repository.consume_queue():
171
368
  records.append(message)
172
369
  records.append(record)
@@ -192,3 +389,16 @@ def _mock_logger(enabled_for_debug=False):
192
389
  logger.isEnabledFor.return_value = enabled_for_debug
193
390
  logger.level = logging.DEBUG if enabled_for_debug else logging.INFO
194
391
  return logger
392
+
393
+
394
+ def _create_state_message(stream: str, state: Mapping[str, Any]) -> AirbyteMessage:
395
+ return AirbyteMessage(
396
+ type=MessageType.STATE,
397
+ state=AirbyteStateMessage(
398
+ type=AirbyteStateType.STREAM,
399
+ stream=AirbyteStreamState(
400
+ stream_descriptor=StreamDescriptor(name=stream, namespace=None),
401
+ stream_state=AirbyteStateBlob(**state),
402
+ )
403
+ ),
404
+ )