airbyte-cdk 0.67.1__py3-none-any.whl → 0.67.2__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (28)
  1. airbyte_cdk/sources/abstract_source.py +30 -69
  2. airbyte_cdk/sources/connector_state_manager.py +12 -26
  3. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
  4. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
  5. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
  6. airbyte_cdk/sources/streams/__init__.py +2 -2
  7. airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
  8. airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
  9. airbyte_cdk/sources/streams/core.py +36 -34
  10. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +1 -1
  11. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +28 -28
  12. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
  13. unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
  14. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
  15. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
  16. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
  17. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
  18. unit_tests/sources/file_based/test_scenarios.py +2 -2
  19. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
  20. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
  21. unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
  22. unit_tests/sources/streams/test_stream_read.py +221 -11
  23. unit_tests/sources/test_abstract_source.py +142 -130
  24. unit_tests/sources/test_connector_state_manager.py +3 -124
  25. unit_tests/sources/test_source.py +18 -14
  26. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
  27. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
  28. {airbyte_cdk-0.67.1.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,34 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
+
4
5
  import logging
5
- from typing import Any, Iterable, List, Mapping, Optional, Union
6
+ from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
6
7
  from unittest.mock import Mock
7
8
 
8
9
  import pytest
9
- from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode
10
+ from airbyte_cdk.models import (
11
+ AirbyteLogMessage,
12
+ AirbyteMessage,
13
+ AirbyteStateBlob,
14
+ AirbyteStateMessage,
15
+ AirbyteStateType,
16
+ AirbyteStream,
17
+ AirbyteStreamState,
18
+ ConfiguredAirbyteStream,
19
+ DestinationSyncMode,
20
+ Level,
21
+ StreamDescriptor,
22
+ SyncMode,
23
+ )
10
24
  from airbyte_cdk.models import Type as MessageType
11
- from airbyte_cdk.sources.message import InMemoryMessageRepository
25
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
26
+ from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
12
27
  from airbyte_cdk.sources.streams import Stream
13
28
  from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
14
- from airbyte_cdk.sources.streams.concurrent.cursor import NoopCursor
29
+ from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, NoopCursor
30
+ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
31
+ from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
15
32
  from airbyte_cdk.sources.streams.core import StreamData
16
33
  from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
17
34
  from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger
@@ -49,20 +66,66 @@ class _MockStream(Stream):
49
66
  return {}
50
67
 
51
68
 
69
+ class MockConcurrentCursor(Cursor):
70
+ _state: MutableMapping[str, Any]
71
+ _message_repository: MessageRepository
72
+
73
+ def __init__(self, message_repository: MessageRepository):
74
+ self._message_repository = message_repository
75
+ self._state = {}
76
+
77
+ @property
78
+ def state(self) -> MutableMapping[str, Any]:
79
+ return self._state
80
+
81
+ def observe(self, record: Record) -> None:
82
+ partition = str(record.data.get("partition"))
83
+ timestamp = record.data.get("created_at")
84
+ self._state[partition] = {"created_at": timestamp}
85
+
86
+ def close_partition(self, partition: Partition) -> None:
87
+ self._message_repository.emit_message(
88
+ AirbyteMessage(
89
+ type=MessageType.STATE,
90
+ state=AirbyteStateMessage(
91
+ type=AirbyteStateType.STREAM,
92
+ stream=AirbyteStreamState(
93
+ stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
94
+ stream_state=AirbyteStateBlob(**self._state),
95
+ )
96
+ ),
97
+ )
98
+ )
99
+
100
+ def ensure_at_least_one_state_emitted(self) -> None:
101
+ pass
102
+
103
+
52
104
  def _stream(slice_to_partition_mapping, slice_logger, logger, message_repository):
53
105
  return _MockStream(slice_to_partition_mapping)
54
106
 
55
107
 
56
- def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository):
108
+ def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor: Cursor = NoopCursor()):
57
109
  stream = _stream(slice_to_partition_mapping, slice_logger, logger, message_repository)
58
110
  source = Mock()
59
111
  source._slice_logger = slice_logger
60
112
  source.message_repository = message_repository
61
- stream = StreamFacade.create_from_stream(stream, source, logger, _NO_STATE, NoopCursor())
113
+ stream = StreamFacade.create_from_stream(stream, source, logger, _NO_STATE, cursor)
62
114
  stream.logger.setLevel(logger.level)
63
115
  return stream
64
116
 
65
117
 
118
+ def _incremental_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, timestamp):
119
+ stream = _stream(slice_to_partition_mapping, slice_logger, logger, message_repository)
120
+ stream.state = {"created_at": timestamp}
121
+ return stream
122
+
123
+
124
+ def _incremental_concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor):
125
+ stream = _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message_repository, cursor)
126
+ return stream
127
+
128
+
66
129
  @pytest.mark.parametrize(
67
130
  "constructor",
68
131
  [
@@ -73,6 +136,8 @@ def _concurrent_stream(slice_to_partition_mapping, slice_logger, logger, message
73
136
  def test_full_refresh_read_a_single_slice_with_debug(constructor):
74
137
  # This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object.
75
138
  # It is done by running the same test cases on both streams
139
+ configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), sync_mode=SyncMode.full_refresh,destination_sync_mode=DestinationSyncMode.overwrite)
140
+ internal_config = InternalConfig()
76
141
  records = [
77
142
  {"id": 1, "partition": 1},
78
143
  {"id": 2, "partition": 1},
@@ -82,6 +147,7 @@ def test_full_refresh_read_a_single_slice_with_debug(constructor):
82
147
  logger = _mock_logger(True)
83
148
  message_repository = InMemoryMessageRepository(Level.DEBUG)
84
149
  stream = constructor(slice_to_partition, slice_logger, logger, message_repository)
150
+ state_manager = ConnectorStateManager(stream_instance_map={})
85
151
 
86
152
  expected_records = [
87
153
  AirbyteMessage(
@@ -94,7 +160,22 @@ def test_full_refresh_read_a_single_slice_with_debug(constructor):
94
160
  *records,
95
161
  ]
96
162
 
97
- actual_records = _read(stream, logger, slice_logger, message_repository)
163
+ # Temporary check to only validate the final state message for synchronous sources since it has not been implemented for concurrent yet
164
+ if constructor == _stream:
165
+ expected_records.append(
166
+ AirbyteMessage(
167
+ type=MessageType.STATE,
168
+ state=AirbyteStateMessage(
169
+ type=AirbyteStateType.STREAM,
170
+ stream=AirbyteStreamState(
171
+ stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
172
+ stream_state=AirbyteStateBlob(__ab_full_refresh_state_message=True),
173
+ )
174
+ ),
175
+ ),
176
+ )
177
+
178
+ actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
98
179
 
99
180
  assert expected_records == actual_records
100
181
 
@@ -109,9 +190,12 @@ def test_full_refresh_read_a_single_slice_with_debug(constructor):
109
190
  def test_full_refresh_read_a_single_slice(constructor):
110
191
  # This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object.
111
192
  # It is done by running the same test cases on both streams
193
+ configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), sync_mode=SyncMode.full_refresh,destination_sync_mode=DestinationSyncMode.overwrite)
194
+ internal_config = InternalConfig()
112
195
  logger = _mock_logger()
113
196
  slice_logger = DebugSliceLogger()
114
197
  message_repository = InMemoryMessageRepository(Level.INFO)
198
+ state_manager = ConnectorStateManager(stream_instance_map={})
115
199
 
116
200
  records = [
117
201
  {"id": 1, "partition": 1},
@@ -122,7 +206,22 @@ def test_full_refresh_read_a_single_slice(constructor):
122
206
 
123
207
  expected_records = [*records]
124
208
 
125
- actual_records = _read(stream, logger, slice_logger, message_repository)
209
+ # Temporary check to only validate the final state message for synchronous sources since it has not been implemented for concurrent yet
210
+ if constructor == _stream:
211
+ expected_records.append(
212
+ AirbyteMessage(
213
+ type=MessageType.STATE,
214
+ state=AirbyteStateMessage(
215
+ type=AirbyteStateType.STREAM,
216
+ stream=AirbyteStreamState(
217
+ stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
218
+ stream_state=AirbyteStateBlob(__ab_full_refresh_state_message=True),
219
+ )
220
+ ),
221
+ ),
222
+ )
223
+
224
+ actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
126
225
 
127
226
  assert expected_records == actual_records
128
227
 
@@ -137,9 +236,12 @@ def test_full_refresh_read_a_single_slice(constructor):
137
236
  def test_full_refresh_read_a_two_slices(constructor):
138
237
  # This test verifies that a concurrent stream adapted from a Stream behaves the same as the Stream object
139
238
  # It is done by running the same test cases on both streams
239
+ configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh], json_schema={}), sync_mode=SyncMode.full_refresh,destination_sync_mode=DestinationSyncMode.overwrite)
240
+ internal_config = InternalConfig()
140
241
  logger = _mock_logger()
141
242
  slice_logger = DebugSliceLogger()
142
243
  message_repository = InMemoryMessageRepository(Level.INFO)
244
+ state_manager = ConnectorStateManager(stream_instance_map={})
143
245
 
144
246
  records_partition_1 = [
145
247
  {"id": 1, "partition": 1},
@@ -157,16 +259,111 @@ def test_full_refresh_read_a_two_slices(constructor):
157
259
  *records_partition_2,
158
260
  ]
159
261
 
160
- actual_records = _read(stream, logger, slice_logger, message_repository)
262
+ # Temporary check to only validate the final state message for synchronous sources since it has not been implemented for concurrent yet
263
+ if constructor == _stream:
264
+ expected_records.append(
265
+ AirbyteMessage(
266
+ type=MessageType.STATE,
267
+ state=AirbyteStateMessage(
268
+ type=AirbyteStateType.STREAM,
269
+ stream=AirbyteStreamState(
270
+ stream_descriptor=StreamDescriptor(name='__mock_stream', namespace=None),
271
+ stream_state=AirbyteStateBlob(__ab_full_refresh_state_message=True),
272
+ )
273
+ ),
274
+ ),
275
+ )
276
+
277
+ actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
161
278
 
162
279
  for record in expected_records:
163
280
  assert record in actual_records
164
281
  assert len(expected_records) == len(actual_records)
165
282
 
166
283
 
167
- def _read(stream, logger, slice_logger, message_repository):
284
+ def test_incremental_read_two_slices():
285
+ # This test verifies that a stream running in incremental mode emits state messages correctly
286
+ configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], json_schema={}), sync_mode=SyncMode.incremental,destination_sync_mode=DestinationSyncMode.overwrite)
287
+ internal_config = InternalConfig()
288
+ logger = _mock_logger()
289
+ slice_logger = DebugSliceLogger()
290
+ message_repository = InMemoryMessageRepository(Level.INFO)
291
+ state_manager = ConnectorStateManager(stream_instance_map={})
292
+ timestamp = "1708899427"
293
+
294
+ records_partition_1 = [
295
+ {"id": 1, "partition": 1},
296
+ {"id": 2, "partition": 1},
297
+ ]
298
+ records_partition_2 = [
299
+ {"id": 3, "partition": 2},
300
+ {"id": 4, "partition": 2},
301
+ ]
302
+ slice_to_partition = {1: records_partition_1, 2: records_partition_2}
303
+ stream = _incremental_stream(slice_to_partition, slice_logger, logger, message_repository, timestamp)
304
+
305
+ expected_records = [
306
+ *records_partition_1,
307
+ _create_state_message("__mock_stream", {"created_at": timestamp}),
308
+ *records_partition_2,
309
+ _create_state_message("__mock_stream", {"created_at": timestamp})
310
+ ]
311
+
312
+ actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
313
+
314
+ for record in expected_records:
315
+ assert record in actual_records
316
+ assert len(expected_records) == len(actual_records)
317
+
318
+
319
+ def test_concurrent_incremental_read_two_slices():
320
+ # This test verifies that an incremental concurrent stream manages state correctly for multiple slices syncing concurrently
321
+ configured_stream = ConfiguredAirbyteStream(stream=AirbyteStream(name="mock_stream", supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental], json_schema={}), sync_mode=SyncMode.incremental,destination_sync_mode=DestinationSyncMode.overwrite)
322
+ internal_config = InternalConfig()
323
+ logger = _mock_logger()
324
+ slice_logger = DebugSliceLogger()
325
+ message_repository = InMemoryMessageRepository(Level.INFO)
326
+ state_manager = ConnectorStateManager(stream_instance_map={})
327
+ slice_timestamp_1 = "1708850000"
328
+ slice_timestamp_2 = "1708950000"
329
+ cursor = MockConcurrentCursor(message_repository)
330
+
331
+ records_partition_1 = [
332
+ {"id": 1, "partition": 1, "created_at": "1708800000"},
333
+ {"id": 2, "partition": 1, "created_at": slice_timestamp_1},
334
+ ]
335
+ records_partition_2 = [
336
+ {"id": 3, "partition": 2, "created_at": "1708900000"},
337
+ {"id": 4, "partition": 2, "created_at": slice_timestamp_2},
338
+ ]
339
+ slice_to_partition = {1: records_partition_1, 2: records_partition_2}
340
+ stream = _incremental_concurrent_stream(slice_to_partition, slice_logger, logger, message_repository, cursor)
341
+
342
+ expected_records = [
343
+ *records_partition_1,
344
+ *records_partition_2,
345
+ ]
346
+
347
+ expected_state = _create_state_message("__mock_stream", {"1": {"created_at": slice_timestamp_1}, "2": {"created_at": slice_timestamp_2}})
348
+
349
+ actual_records = _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config)
350
+
351
+ for record in expected_records:
352
+ assert record in actual_records
353
+ assert len(expected_records) == len(actual_records)
354
+
355
+ # We don't have a real source that reads from the message_repository for state, so we read from the queue directly to verify
356
+ # the cursor observed records correctly and updated partition states
357
+ mock_partition = Mock()
358
+ cursor.close_partition(mock_partition)
359
+ actual_state = [state for state in message_repository.consume_queue()]
360
+ assert len(actual_state) == 1
361
+ assert actual_state[0] == expected_state
362
+
363
+
364
+ def _read(stream, configured_stream, logger, slice_logger, message_repository, state_manager, internal_config):
168
365
  records = []
169
- for record in stream.read_full_refresh(_A_CURSOR_FIELD, logger, slice_logger):
366
+ for record in stream.read(configured_stream, logger, slice_logger, {}, state_manager, internal_config):
170
367
  for message in message_repository.consume_queue():
171
368
  records.append(message)
172
369
  records.append(record)
@@ -192,3 +389,16 @@ def _mock_logger(enabled_for_debug=False):
192
389
  logger.isEnabledFor.return_value = enabled_for_debug
193
390
  logger.level = logging.DEBUG if enabled_for_debug else logging.INFO
194
391
  return logger
392
+
393
+
394
+ def _create_state_message(stream: str, state: Mapping[str, Any]) -> AirbyteMessage:
395
+ return AirbyteMessage(
396
+ type=MessageType.STATE,
397
+ state=AirbyteStateMessage(
398
+ type=AirbyteStateType.STREAM,
399
+ stream=AirbyteStreamState(
400
+ stream_descriptor=StreamDescriptor(name=stream, namespace=None),
401
+ stream_state=AirbyteStateBlob(**state),
402
+ )
403
+ ),
404
+ )