airbyte-cdk 0.67.0__py3-none-any.whl → 0.67.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. airbyte_cdk/sources/abstract_source.py +30 -69
  2. airbyte_cdk/sources/connector_state_manager.py +12 -26
  3. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +552 -524
  4. airbyte_cdk/sources/file_based/config/csv_format.py +2 -0
  5. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
  6. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
  7. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
  8. airbyte_cdk/sources/streams/__init__.py +2 -2
  9. airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
  10. airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
  11. airbyte_cdk/sources/streams/core.py +36 -34
  12. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +3 -2
  13. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +31 -31
  14. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
  15. unit_tests/sources/file_based/config/test_csv_format.py +6 -1
  16. unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
  17. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
  18. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
  19. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
  20. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
  21. unit_tests/sources/file_based/test_scenarios.py +2 -2
  22. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
  23. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
  24. unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
  25. unit_tests/sources/streams/test_stream_read.py +221 -11
  26. unit_tests/sources/test_abstract_source.py +142 -130
  27. unit_tests/sources/test_connector_state_manager.py +3 -124
  28. unit_tests/sources/test_source.py +18 -14
  29. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
  30. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
  31. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0
@@ -23,7 +23,7 @@ from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
 from airbyte_cdk.sources.source import Source
-from airbyte_cdk.sources.streams import Stream
+from airbyte_cdk.sources.streams import FULL_REFRESH_SENTINEL_STATE_KEY, Stream
 from airbyte_cdk.sources.streams.core import StreamData
 from airbyte_cdk.sources.streams.http.http import HttpStream
 from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
@@ -181,10 +181,6 @@ class AbstractSource(Source, ABC):
     def raise_exception_on_missing_stream(self) -> bool:
         return True
 
-    @property
-    def per_stream_state_enabled(self) -> bool:
-        return True
-
     def _read_stream(
         self,
         logger: logging.Logger,
@@ -206,22 +202,32 @@ class AbstractSource(Source, ABC):
         )
         stream_instance.log_stream_sync_configuration()
 
-        use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
-        if use_incremental:
-            record_iterator = self._read_incremental(
-                logger,
-                stream_instance,
-                configured_stream,
-                state_manager,
-                internal_config,
-            )
-        else:
-            record_iterator = self._read_full_refresh(logger, stream_instance, configured_stream, internal_config)
+        stream_name = configured_stream.stream.name
+        # The platform always passes stream state regardless of sync mode. We shouldn't need to consider this case within the
+        # connector, but right now we need to prevent accidental usage of the previous stream state
+        stream_state = (
+            state_manager.get_stream_state(stream_name, stream_instance.namespace)
+            if configured_stream.sync_mode == SyncMode.incremental
+            else {}
+        )
+
+        if stream_state and "state" in dir(stream_instance) and not self._stream_state_is_full_refresh(stream_state):
+            stream_instance.state = stream_state  # type: ignore # we check that state in the dir(stream_instance)
+            logger.info(f"Setting state of {self.name} stream to {stream_state}")
+
+        record_iterator = stream_instance.read(
+            configured_stream,
+            logger,
+            self._slice_logger,
+            stream_state,
+            state_manager,
+            internal_config,
+        )
 
         record_counter = 0
-        stream_name = configured_stream.stream.name
         logger.info(f"Syncing stream: {stream_name} ")
-        for record in record_iterator:
+        for record_data_or_message in record_iterator:
+            record = self._get_message(record_data_or_message, stream_instance)
             if record.type == MessageType.RECORD:
                 record_counter += 1
                 if record_counter == 1:
@@ -233,62 +239,11 @@ class AbstractSource(Source, ABC):
 
         logger.info(f"Read {record_counter} records from {stream_name} stream")
 
-    def _read_incremental(
-        self,
-        logger: logging.Logger,
-        stream_instance: Stream,
-        configured_stream: ConfiguredAirbyteStream,
-        state_manager: ConnectorStateManager,
-        internal_config: InternalConfig,
-    ) -> Iterator[AirbyteMessage]:
-        """Read stream using incremental algorithm
-
-        :param logger:
-        :param stream_instance:
-        :param configured_stream:
-        :param state_manager:
-        :param internal_config:
-        :return:
-        """
-        stream_name = configured_stream.stream.name
-        stream_state = state_manager.get_stream_state(stream_name, stream_instance.namespace)
-
-        if stream_state and "state" in dir(stream_instance):
-            stream_instance.state = stream_state  # type: ignore # we check that state in the dir(stream_instance)
-            logger.info(f"Setting state of {self.name} stream to {stream_state}")
-
-        for record_data_or_message in stream_instance.read_incremental(
-            configured_stream.cursor_field,
-            logger,
-            self._slice_logger,
-            stream_state,
-            state_manager,
-            self.per_stream_state_enabled,
-            internal_config,
-        ):
-            yield self._get_message(record_data_or_message, stream_instance)
-
     def _emit_queued_messages(self) -> Iterable[AirbyteMessage]:
         if self.message_repository:
             yield from self.message_repository.consume_queue()
         return
 
-    def _read_full_refresh(
-        self,
-        logger: logging.Logger,
-        stream_instance: Stream,
-        configured_stream: ConfiguredAirbyteStream,
-        internal_config: InternalConfig,
-    ) -> Iterator[AirbyteMessage]:
-        total_records_counter = 0
-        for record_data_or_message in stream_instance.read_full_refresh(configured_stream.cursor_field, logger, self._slice_logger):
-            message = self._get_message(record_data_or_message, stream_instance)
-            yield message
-            if message.type == MessageType.RECORD:
-                total_records_counter += 1
-                if internal_config.is_limit_reached(total_records_counter):
-                    return
-
     def _get_message(self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream) -> AirbyteMessage:
         """
         Converts the input to an AirbyteMessage if it is a StreamData. Returns the input as is if it is already an AirbyteMessage
@@ -317,3 +272,9 @@ class AbstractSource(Source, ABC):
     def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
         failures = ", ".join([f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exception in stream_failures.items()])
         return f"During the sync, the following streams did not sync successfully: {failures}"
+
+    @staticmethod
+    def _stream_state_is_full_refresh(stream_state: Mapping[str, Any]) -> bool:
+        # For full refresh syncs that don't have a suitable cursor value, we emit a state that contains a sentinel key.
+        # This key is never used by a connector and is needed during a read to skip assigning the incoming state.
+        return FULL_REFRESH_SENTINEL_STATE_KEY in stream_state
@@ -77,7 +77,7 @@ class ConnectorStateManager:
         stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
         self.per_stream_states[stream_descriptor] = AirbyteStateBlob.parse_obj(value)
 
-    def create_state_message(self, stream_name: str, namespace: Optional[str], send_per_stream_state: bool) -> AirbyteMessage:
+    def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage:
         """
         Generates an AirbyteMessage using the current per-stream state of a specified stream in either the per-stream or legacy format
         :param stream_name: The name of the stream for the message that is being created
@@ -85,25 +85,18 @@ class ConnectorStateManager:
         :param send_per_stream_state: Decides which state format the message should be generated as
         :return: The Airbyte state message to be emitted by the connector during a sync
         """
-        if send_per_stream_state:
-            hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
-            stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
-
-            # According to the Airbyte protocol, the StreamDescriptor namespace field is not required. However, the platform will throw
-            # a validation error if it receives namespace=null. That is why if namespace is None, the field should be omitted instead.
-            stream_descriptor = (
-                StreamDescriptor(name=stream_name) if namespace is None else StreamDescriptor(name=stream_name, namespace=namespace)
-            )
-
-            return AirbyteMessage(
-                type=MessageType.STATE,
-                state=AirbyteStateMessage(
-                    type=AirbyteStateType.STREAM,
-                    stream=AirbyteStreamState(stream_descriptor=stream_descriptor, stream_state=stream_state),
-                    data=dict(self._get_legacy_state()),
+        hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
+        stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
+
+        return AirbyteMessage(
+            type=MessageType.STATE,
+            state=AirbyteStateMessage(
+                type=AirbyteStateType.STREAM,
+                stream=AirbyteStreamState(
+                    stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace), stream_state=stream_state
                 ),
-            )
-        return AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=dict(self._get_legacy_state())))
+            ),
+        )
 
     @classmethod
     def _extract_from_state_message(
@@ -176,13 +169,6 @@ class ConnectorStateManager:
             streams[stream_descriptor] = AirbyteStateBlob.parse_obj(state_value or {})
         return streams
 
-    def _get_legacy_state(self) -> Mapping[str, Any]:
-        """
-        Using the current per-stream state, creates a mapping of all the stream states for the connector being synced
-        :return: A deep copy of the mapping of stream name to stream state value
-        """
-        return {descriptor.name: state.dict() if state else {} for descriptor, state in self.per_stream_states.items()}
-
     @staticmethod
     def _is_legacy_dict_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool:
         return isinstance(state, dict)