airbyte_cdk-0.67.0-py3-none-any.whl → airbyte_cdk-0.67.2-py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (31)
  1. airbyte_cdk/sources/abstract_source.py +30 -69
  2. airbyte_cdk/sources/connector_state_manager.py +12 -26
  3. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +552 -524
  4. airbyte_cdk/sources/file_based/config/csv_format.py +2 -0
  5. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +32 -14
  6. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +3 -19
  7. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +1 -3
  8. airbyte_cdk/sources/streams/__init__.py +2 -2
  9. airbyte_cdk/sources/streams/concurrent/adapters.py +3 -19
  10. airbyte_cdk/sources/streams/concurrent/cursor.py +1 -3
  11. airbyte_cdk/sources/streams/core.py +36 -34
  12. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/METADATA +3 -2
  13. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/RECORD +31 -31
  14. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +2 -1
  15. unit_tests/sources/file_based/config/test_csv_format.py +6 -1
  16. unit_tests/sources/file_based/file_types/test_parquet_parser.py +51 -6
  17. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +139 -199
  18. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +91 -133
  19. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +2 -13
  20. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +2 -2
  21. unit_tests/sources/file_based/test_scenarios.py +2 -2
  22. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +9 -9
  23. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +5 -5
  24. unit_tests/sources/streams/concurrent/test_adapters.py +2 -13
  25. unit_tests/sources/streams/test_stream_read.py +221 -11
  26. unit_tests/sources/test_abstract_source.py +142 -130
  27. unit_tests/sources/test_connector_state_manager.py +3 -124
  28. unit_tests/sources/test_source.py +18 -14
  29. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/LICENSE.txt +0 -0
  30. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/WHEEL +0 -0
  31. {airbyte_cdk-0.67.0.dist-info → airbyte_cdk-0.67.2.dist-info}/top_level.txt +0 -0
@@ -23,7 +23,7 @@ from airbyte_cdk.models import Type as MessageType
23
23
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
24
24
  from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
25
25
  from airbyte_cdk.sources.source import Source
26
- from airbyte_cdk.sources.streams import Stream
26
+ from airbyte_cdk.sources.streams import FULL_REFRESH_SENTINEL_STATE_KEY, Stream
27
27
  from airbyte_cdk.sources.streams.core import StreamData
28
28
  from airbyte_cdk.sources.streams.http.http import HttpStream
29
29
  from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
@@ -181,10 +181,6 @@ class AbstractSource(Source, ABC):
181
181
  def raise_exception_on_missing_stream(self) -> bool:
182
182
  return True
183
183
 
184
- @property
185
- def per_stream_state_enabled(self) -> bool:
186
- return True
187
-
188
184
  def _read_stream(
189
185
  self,
190
186
  logger: logging.Logger,
@@ -206,22 +202,32 @@ class AbstractSource(Source, ABC):
206
202
  )
207
203
  stream_instance.log_stream_sync_configuration()
208
204
 
209
- use_incremental = configured_stream.sync_mode == SyncMode.incremental and stream_instance.supports_incremental
210
- if use_incremental:
211
- record_iterator = self._read_incremental(
212
- logger,
213
- stream_instance,
214
- configured_stream,
215
- state_manager,
216
- internal_config,
217
- )
218
- else:
219
- record_iterator = self._read_full_refresh(logger, stream_instance, configured_stream, internal_config)
205
+ stream_name = configured_stream.stream.name
206
+ # The platform always passes stream state regardless of sync mode. We shouldn't need to consider this case within the
207
+ # connector, but right now we need to prevent accidental usage of the previous stream state
208
+ stream_state = (
209
+ state_manager.get_stream_state(stream_name, stream_instance.namespace)
210
+ if configured_stream.sync_mode == SyncMode.incremental
211
+ else {}
212
+ )
213
+
214
+ if stream_state and "state" in dir(stream_instance) and not self._stream_state_is_full_refresh(stream_state):
215
+ stream_instance.state = stream_state # type: ignore # we check that state in the dir(stream_instance)
216
+ logger.info(f"Setting state of {self.name} stream to {stream_state}")
217
+
218
+ record_iterator = stream_instance.read(
219
+ configured_stream,
220
+ logger,
221
+ self._slice_logger,
222
+ stream_state,
223
+ state_manager,
224
+ internal_config,
225
+ )
220
226
 
221
227
  record_counter = 0
222
- stream_name = configured_stream.stream.name
223
228
  logger.info(f"Syncing stream: {stream_name} ")
224
- for record in record_iterator:
229
+ for record_data_or_message in record_iterator:
230
+ record = self._get_message(record_data_or_message, stream_instance)
225
231
  if record.type == MessageType.RECORD:
226
232
  record_counter += 1
227
233
  if record_counter == 1:
@@ -233,62 +239,11 @@ class AbstractSource(Source, ABC):
233
239
 
234
240
  logger.info(f"Read {record_counter} records from {stream_name} stream")
235
241
 
236
- def _read_incremental(
237
- self,
238
- logger: logging.Logger,
239
- stream_instance: Stream,
240
- configured_stream: ConfiguredAirbyteStream,
241
- state_manager: ConnectorStateManager,
242
- internal_config: InternalConfig,
243
- ) -> Iterator[AirbyteMessage]:
244
- """Read stream using incremental algorithm
245
-
246
- :param logger:
247
- :param stream_instance:
248
- :param configured_stream:
249
- :param state_manager:
250
- :param internal_config:
251
- :return:
252
- """
253
- stream_name = configured_stream.stream.name
254
- stream_state = state_manager.get_stream_state(stream_name, stream_instance.namespace)
255
-
256
- if stream_state and "state" in dir(stream_instance):
257
- stream_instance.state = stream_state # type: ignore # we check that state in the dir(stream_instance)
258
- logger.info(f"Setting state of {self.name} stream to {stream_state}")
259
-
260
- for record_data_or_message in stream_instance.read_incremental(
261
- configured_stream.cursor_field,
262
- logger,
263
- self._slice_logger,
264
- stream_state,
265
- state_manager,
266
- self.per_stream_state_enabled,
267
- internal_config,
268
- ):
269
- yield self._get_message(record_data_or_message, stream_instance)
270
-
271
242
  def _emit_queued_messages(self) -> Iterable[AirbyteMessage]:
272
243
  if self.message_repository:
273
244
  yield from self.message_repository.consume_queue()
274
245
  return
275
246
 
276
- def _read_full_refresh(
277
- self,
278
- logger: logging.Logger,
279
- stream_instance: Stream,
280
- configured_stream: ConfiguredAirbyteStream,
281
- internal_config: InternalConfig,
282
- ) -> Iterator[AirbyteMessage]:
283
- total_records_counter = 0
284
- for record_data_or_message in stream_instance.read_full_refresh(configured_stream.cursor_field, logger, self._slice_logger):
285
- message = self._get_message(record_data_or_message, stream_instance)
286
- yield message
287
- if message.type == MessageType.RECORD:
288
- total_records_counter += 1
289
- if internal_config.is_limit_reached(total_records_counter):
290
- return
291
-
292
247
  def _get_message(self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream) -> AirbyteMessage:
293
248
  """
294
249
  Converts the input to an AirbyteMessage if it is a StreamData. Returns the input as is if it is already an AirbyteMessage
@@ -317,3 +272,9 @@ class AbstractSource(Source, ABC):
317
272
  def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
318
273
  failures = ", ".join([f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exception in stream_failures.items()])
319
274
  return f"During the sync, the following streams did not sync successfully: {failures}"
275
+
276
+ @staticmethod
277
+ def _stream_state_is_full_refresh(stream_state: Mapping[str, Any]) -> bool:
278
+ # For full refresh syncs that don't have a suitable cursor value, we emit a state that contains a sentinel key.
279
+ # This key is never used by a connector and is needed during a read to skip assigning the incoming state.
280
+ return FULL_REFRESH_SENTINEL_STATE_KEY in stream_state
@@ -77,7 +77,7 @@ class ConnectorStateManager:
77
77
  stream_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
78
78
  self.per_stream_states[stream_descriptor] = AirbyteStateBlob.parse_obj(value)
79
79
 
80
- def create_state_message(self, stream_name: str, namespace: Optional[str], send_per_stream_state: bool) -> AirbyteMessage:
80
+ def create_state_message(self, stream_name: str, namespace: Optional[str]) -> AirbyteMessage:
81
81
  """
82
82
  Generates an AirbyteMessage using the current per-stream state of a specified stream in either the per-stream or legacy format
83
83
  :param stream_name: The name of the stream for the message that is being created
@@ -85,25 +85,18 @@ class ConnectorStateManager:
85
85
  :param send_per_stream_state: Decides which state format the message should be generated as
86
86
  :return: The Airbyte state message to be emitted by the connector during a sync
87
87
  """
88
- if send_per_stream_state:
89
- hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
90
- stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
91
-
92
- # According to the Airbyte protocol, the StreamDescriptor namespace field is not required. However, the platform will throw
93
- # a validation error if it receives namespace=null. That is why if namespace is None, the field should be omitted instead.
94
- stream_descriptor = (
95
- StreamDescriptor(name=stream_name) if namespace is None else StreamDescriptor(name=stream_name, namespace=namespace)
96
- )
97
-
98
- return AirbyteMessage(
99
- type=MessageType.STATE,
100
- state=AirbyteStateMessage(
101
- type=AirbyteStateType.STREAM,
102
- stream=AirbyteStreamState(stream_descriptor=stream_descriptor, stream_state=stream_state),
103
- data=dict(self._get_legacy_state()),
88
+ hashable_descriptor = HashableStreamDescriptor(name=stream_name, namespace=namespace)
89
+ stream_state = self.per_stream_states.get(hashable_descriptor) or AirbyteStateBlob()
90
+
91
+ return AirbyteMessage(
92
+ type=MessageType.STATE,
93
+ state=AirbyteStateMessage(
94
+ type=AirbyteStateType.STREAM,
95
+ stream=AirbyteStreamState(
96
+ stream_descriptor=StreamDescriptor(name=stream_name, namespace=namespace), stream_state=stream_state
104
97
  ),
105
- )
106
- return AirbyteMessage(type=MessageType.STATE, state=AirbyteStateMessage(data=dict(self._get_legacy_state())))
98
+ ),
99
+ )
107
100
 
108
101
  @classmethod
109
102
  def _extract_from_state_message(
@@ -176,13 +169,6 @@ class ConnectorStateManager:
176
169
  streams[stream_descriptor] = AirbyteStateBlob.parse_obj(state_value or {})
177
170
  return streams
178
171
 
179
- def _get_legacy_state(self) -> Mapping[str, Any]:
180
- """
181
- Using the current per-stream state, creates a mapping of all the stream states for the connector being synced
182
- :return: A deep copy of the mapping of stream name to stream state value
183
- """
184
- return {descriptor.name: state.dict() if state else {} for descriptor, state in self.per_stream_states.items()}
185
-
186
172
  @staticmethod
187
173
  def _is_legacy_dict_state(state: Union[List[AirbyteStateMessage], MutableMapping[str, Any]]) -> bool:
188
174
  return isinstance(state, dict)