airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.5.5__py3-none-any.whl
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +17 -2
- airbyte_cdk/config_observation.py +10 -3
- airbyte_cdk/connector.py +19 -9
- airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
- airbyte_cdk/connector_builder/main.py +26 -6
- airbyte_cdk/connector_builder/message_grouper.py +95 -25
- airbyte_cdk/destinations/destination.py +47 -14
- airbyte_cdk/destinations/vector_db_based/config.py +36 -14
- airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
- airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
- airbyte_cdk/entrypoint.py +82 -26
- airbyte_cdk/exception_handler.py +13 -3
- airbyte_cdk/logger.py +10 -2
- airbyte_cdk/models/airbyte_protocol.py +11 -5
- airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/abstract_source.py +63 -17
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
- airbyte_cdk/sources/connector_state_manager.py +32 -10
- airbyte_cdk/sources/declarative/async_job/job.py +3 -1
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
- airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
- airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/token.py +25 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
- airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
- airbyte_cdk/sources/declarative/declarative_source.py +3 -1
- airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
- airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
- airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
- airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
- airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +656 -678
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +782 -232
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
- airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
- airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
- airbyte_cdk/sources/declarative/spec/spec.py +8 -2
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
- airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
- airbyte_cdk/sources/declarative/types.py +8 -1
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
- airbyte_cdk/sources/embedded/base_integration.py +14 -4
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +3 -1
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
- airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
- airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +13 -15
- airbyte_cdk/sources/file_based/file_based_source.py +82 -24
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
- airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
- airbyte_cdk/sources/http_logger.py +5 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +17 -7
- airbyte_cdk/sources/streams/availability_strategy.py +9 -3
- airbyte_cdk/sources/streams/call_rate.py +63 -19
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
- airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
- airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
- airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
- airbyte_cdk/sources/streams/core.py +77 -22
- airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
- airbyte_cdk/sources/streams/http/exceptions.py +2 -2
- airbyte_cdk/sources/streams/http/http.py +133 -33
- airbyte_cdk/sources/streams/http/http_client.py +91 -29
- airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +5 -1
- airbyte_cdk/sources/utils/record_helper.py +12 -3
- airbyte_cdk/sources/utils/schema_helpers.py +9 -3
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +24 -9
- airbyte_cdk/sql/exceptions.py +19 -6
- airbyte_cdk/sql/secrets.py +3 -1
- airbyte_cdk/sql/shared/catalog_providers.py +13 -4
- airbyte_cdk/sql/shared/sql_processor.py +44 -14
- airbyte_cdk/test/catalog_builder.py +19 -8
- airbyte_cdk/test/entrypoint_wrapper.py +27 -8
- airbyte_cdk/test/mock_http/mocker.py +41 -11
- airbyte_cdk/test/mock_http/request.py +9 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +29 -7
- airbyte_cdk/test/state_builder.py +10 -2
- airbyte_cdk/test/utils/data.py +6 -2
- airbyte_cdk/test/utils/http_mocking.py +3 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +11 -4
- airbyte_cdk/utils/print_buffer.py +6 -1
- airbyte_cdk/utils/schema_inferrer.py +30 -9
- airbyte_cdk/utils/spec_schema_transformations.py +3 -1
- airbyte_cdk/utils/traced_exception.py +35 -9
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/METADATA +7 -6
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/RECORD +198 -198
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/WHEEL +0 -0
@@ -11,7 +11,9 @@ from airbyte_cdk.sources.message import MessageRepository
|
|
11
11
|
from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY
|
12
12
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
13
13
|
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
14
|
-
from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import
|
14
|
+
from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
|
15
|
+
AbstractStreamStateConverter,
|
16
|
+
)
|
15
17
|
|
16
18
|
|
17
19
|
def _extract_value(mapping: Mapping[str, Any], path: List[str]) -> Any:
|
@@ -62,8 +64,7 @@ class CursorField:
|
|
62
64
|
class Cursor(ABC):
|
63
65
|
@property
|
64
66
|
@abstractmethod
|
65
|
-
def state(self) -> MutableMapping[str, Any]:
|
66
|
-
...
|
67
|
+
def state(self) -> MutableMapping[str, Any]: ...
|
67
68
|
|
68
69
|
@abstractmethod
|
69
70
|
def observe(self, record: Record) -> None:
|
@@ -128,8 +129,12 @@ class FinalStateCursor(Cursor):
|
|
128
129
|
Used primarily for full refresh syncs that do not have a valid cursor value to emit at the end of a sync
|
129
130
|
"""
|
130
131
|
|
131
|
-
self._connector_state_manager.update_state_for_stream(
|
132
|
-
|
132
|
+
self._connector_state_manager.update_state_for_stream(
|
133
|
+
self._stream_name, self._stream_namespace, self.state
|
134
|
+
)
|
135
|
+
state_message = self._connector_state_manager.create_state_message(
|
136
|
+
self._stream_name, self._stream_namespace
|
137
|
+
)
|
133
138
|
self._message_repository.emit_message(state_message)
|
134
139
|
|
135
140
|
|
@@ -182,13 +187,22 @@ class ConcurrentCursor(Cursor):
|
|
182
187
|
def slice_boundary_fields(self) -> Optional[Tuple[str, str]]:
|
183
188
|
return self._slice_boundary_fields
|
184
189
|
|
185
|
-
def _get_concurrent_state(
|
190
|
+
def _get_concurrent_state(
|
191
|
+
self, state: MutableMapping[str, Any]
|
192
|
+
) -> Tuple[CursorValueType, MutableMapping[str, Any]]:
|
186
193
|
if self._connector_state_converter.is_state_message_compatible(state):
|
187
|
-
return
|
188
|
-
|
194
|
+
return (
|
195
|
+
self._start or self._connector_state_converter.zero_value,
|
196
|
+
self._connector_state_converter.deserialize(state),
|
197
|
+
)
|
198
|
+
return self._connector_state_converter.convert_from_sequential_state(
|
199
|
+
self._cursor_field, state, self._start
|
200
|
+
)
|
189
201
|
|
190
202
|
def observe(self, record: Record) -> None:
|
191
|
-
most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(
|
203
|
+
most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(
|
204
|
+
record.partition
|
205
|
+
)
|
192
206
|
cursor_value = self._extract_cursor_value(record)
|
193
207
|
|
194
208
|
if most_recent_cursor_value is None or most_recent_cursor_value < cursor_value:
|
@@ -200,7 +214,9 @@ class ConcurrentCursor(Cursor):
|
|
200
214
|
def close_partition(self, partition: Partition) -> None:
|
201
215
|
slice_count_before = len(self.state.get("slices", []))
|
202
216
|
self._add_slice_to_state(partition)
|
203
|
-
if slice_count_before < len(
|
217
|
+
if slice_count_before < len(
|
218
|
+
self.state["slices"]
|
219
|
+
): # only emit if at least one slice has been processed
|
204
220
|
self._merge_partitions()
|
205
221
|
self._emit_state_message()
|
206
222
|
self._has_closed_at_least_one_slice = True
|
@@ -253,9 +269,13 @@ class ConcurrentCursor(Cursor):
|
|
253
269
|
self._connector_state_manager.update_state_for_stream(
|
254
270
|
self._stream_name,
|
255
271
|
self._stream_namespace,
|
256
|
-
self._connector_state_converter.convert_to_state_message(
|
272
|
+
self._connector_state_converter.convert_to_state_message(
|
273
|
+
self._cursor_field, self.state
|
274
|
+
),
|
275
|
+
)
|
276
|
+
state_message = self._connector_state_manager.create_state_message(
|
277
|
+
self._stream_name, self._stream_namespace
|
257
278
|
)
|
258
|
-
state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace)
|
259
279
|
self._message_repository.emit_message(state_message)
|
260
280
|
|
261
281
|
def _merge_partitions(self) -> None:
|
@@ -268,7 +288,9 @@ class ConcurrentCursor(Cursor):
|
|
268
288
|
raise KeyError(f"Could not find key `{key}` in empty slice")
|
269
289
|
return self._connector_state_converter.parse_value(_slice[key]) # type: ignore # we expect the devs to specify a key that would return a CursorValueType
|
270
290
|
except KeyError as exception:
|
271
|
-
raise KeyError(
|
291
|
+
raise KeyError(
|
292
|
+
f"Partition is expected to have key `{key}` but could not be found"
|
293
|
+
) from exception
|
272
294
|
|
273
295
|
def ensure_at_least_one_state_emitted(self) -> None:
|
274
296
|
"""
|
@@ -300,7 +322,9 @@ class ConcurrentCursor(Cursor):
|
|
300
322
|
|
301
323
|
if len(self.state["slices"]) == 1:
|
302
324
|
yield from self._split_per_slice_range(
|
303
|
-
self._calculate_lower_boundary_of_last_slice(
|
325
|
+
self._calculate_lower_boundary_of_last_slice(
|
326
|
+
self.state["slices"][0][self._connector_state_converter.END_KEY]
|
327
|
+
),
|
304
328
|
self._end_provider(),
|
305
329
|
True,
|
306
330
|
)
|
@@ -308,7 +332,8 @@ class ConcurrentCursor(Cursor):
|
|
308
332
|
for i in range(len(self.state["slices"]) - 1):
|
309
333
|
if self._cursor_granularity:
|
310
334
|
yield from self._split_per_slice_range(
|
311
|
-
self.state["slices"][i][self._connector_state_converter.END_KEY]
|
335
|
+
self.state["slices"][i][self._connector_state_converter.END_KEY]
|
336
|
+
+ self._cursor_granularity,
|
312
337
|
self.state["slices"][i + 1][self._connector_state_converter.START_KEY],
|
313
338
|
False,
|
314
339
|
)
|
@@ -319,7 +344,9 @@ class ConcurrentCursor(Cursor):
|
|
319
344
|
False,
|
320
345
|
)
|
321
346
|
yield from self._split_per_slice_range(
|
322
|
-
self._calculate_lower_boundary_of_last_slice(
|
347
|
+
self._calculate_lower_boundary_of_last_slice(
|
348
|
+
self.state["slices"][-1][self._connector_state_converter.END_KEY]
|
349
|
+
),
|
323
350
|
self._end_provider(),
|
324
351
|
True,
|
325
352
|
)
|
@@ -327,9 +354,14 @@ class ConcurrentCursor(Cursor):
|
|
327
354
|
raise ValueError("Expected at least one slice")
|
328
355
|
|
329
356
|
def _is_start_before_first_slice(self) -> bool:
|
330
|
-
return
|
357
|
+
return (
|
358
|
+
self._start is not None
|
359
|
+
and self._start < self.state["slices"][0][self._connector_state_converter.START_KEY]
|
360
|
+
)
|
331
361
|
|
332
|
-
def _calculate_lower_boundary_of_last_slice(
|
362
|
+
def _calculate_lower_boundary_of_last_slice(
|
363
|
+
self, lower_boundary: CursorValueType
|
364
|
+
) -> CursorValueType:
|
333
365
|
if self._lookback_window:
|
334
366
|
return lower_boundary - self._lookback_window
|
335
367
|
return lower_boundary
|
@@ -353,9 +385,13 @@ class ConcurrentCursor(Cursor):
|
|
353
385
|
stop_processing = False
|
354
386
|
current_lower_boundary = lower
|
355
387
|
while not stop_processing:
|
356
|
-
current_upper_boundary = min(
|
388
|
+
current_upper_boundary = min(
|
389
|
+
self._evaluate_upper_safely(current_lower_boundary, self._slice_range), upper
|
390
|
+
)
|
357
391
|
has_reached_upper_boundary = current_upper_boundary >= upper
|
358
|
-
if self._cursor_granularity and (
|
392
|
+
if self._cursor_granularity and (
|
393
|
+
not upper_is_end or not has_reached_upper_boundary
|
394
|
+
):
|
359
395
|
yield current_lower_boundary, current_upper_boundary - self._cursor_granularity
|
360
396
|
else:
|
361
397
|
yield current_lower_boundary, current_upper_boundary
|
@@ -8,7 +8,10 @@ from typing import Any, Iterable, List, Mapping, Optional
|
|
8
8
|
|
9
9
|
from airbyte_cdk.models import AirbyteStream, SyncMode
|
10
10
|
from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
|
11
|
-
from airbyte_cdk.sources.streams.concurrent.availability_strategy import
|
11
|
+
from airbyte_cdk.sources.streams.concurrent.availability_strategy import (
|
12
|
+
AbstractAvailabilityStrategy,
|
13
|
+
StreamAvailability,
|
14
|
+
)
|
12
15
|
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
|
13
16
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
14
17
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
|
@@ -60,7 +63,11 @@ class DefaultStream(AbstractStream):
|
|
60
63
|
return self._json_schema
|
61
64
|
|
62
65
|
def as_airbyte_stream(self) -> AirbyteStream:
|
63
|
-
stream = AirbyteStream(
|
66
|
+
stream = AirbyteStream(
|
67
|
+
name=self.name,
|
68
|
+
json_schema=dict(self._json_schema),
|
69
|
+
supported_sync_modes=[SyncMode.full_refresh],
|
70
|
+
)
|
64
71
|
|
65
72
|
if self._namespace:
|
66
73
|
stream.namespace = self._namespace
|
@@ -5,7 +5,9 @@ from typing import List, Optional, Union
|
|
5
5
|
from airbyte_cdk.sources.streams import Stream
|
6
6
|
|
7
7
|
|
8
|
-
def get_primary_key_from_stream(
|
8
|
+
def get_primary_key_from_stream(
|
9
|
+
stream_primary_key: Optional[Union[str, List[str], List[List[str]]]],
|
10
|
+
) -> List[str]:
|
9
11
|
if stream_primary_key is None:
|
10
12
|
return []
|
11
13
|
elif isinstance(stream_primary_key, str):
|
@@ -22,7 +24,9 @@ def get_primary_key_from_stream(stream_primary_key: Optional[Union[str, List[str
|
|
22
24
|
def get_cursor_field_from_stream(stream: Stream) -> Optional[str]:
|
23
25
|
if isinstance(stream.cursor_field, list):
|
24
26
|
if len(stream.cursor_field) > 1:
|
25
|
-
raise ValueError(
|
27
|
+
raise ValueError(
|
28
|
+
f"Nested cursor fields are not supported. Got {stream.cursor_field} for {stream.name}"
|
29
|
+
)
|
26
30
|
elif len(stream.cursor_field) == 0:
|
27
31
|
return None
|
28
32
|
else:
|
@@ -4,7 +4,9 @@
|
|
4
4
|
import time
|
5
5
|
from queue import Queue
|
6
6
|
|
7
|
-
from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import
|
7
|
+
from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import (
|
8
|
+
PartitionGenerationCompletedSentinel,
|
9
|
+
)
|
8
10
|
from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException
|
9
11
|
from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
|
10
12
|
from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
|
@@ -16,7 +18,12 @@ class PartitionEnqueuer:
|
|
16
18
|
Generates partitions from a partition generator and puts them in a queue.
|
17
19
|
"""
|
18
20
|
|
19
|
-
def __init__(
|
21
|
+
def __init__(
|
22
|
+
self,
|
23
|
+
queue: Queue[QueueItem],
|
24
|
+
thread_pool_manager: ThreadPoolManager,
|
25
|
+
sleep_time_in_seconds: float = 0.1,
|
26
|
+
) -> None:
|
20
27
|
"""
|
21
28
|
:param queue: The queue to put the partitions in.
|
22
29
|
:param throttler: The throttler to use to throttle the partition generation.
|
@@ -5,7 +5,10 @@ from queue import Queue
|
|
5
5
|
|
6
6
|
from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException
|
7
7
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
8
|
-
from airbyte_cdk.sources.streams.concurrent.partitions.types import
|
8
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.types import (
|
9
|
+
PartitionCompleteSentinel,
|
10
|
+
QueueItem,
|
11
|
+
)
|
9
12
|
|
10
13
|
|
11
14
|
class PartitionReader:
|
@@ -13,7 +13,12 @@ class Record:
|
|
13
13
|
Represents a record read from a stream.
|
14
14
|
"""
|
15
15
|
|
16
|
-
def __init__(
|
16
|
+
def __init__(
|
17
|
+
self,
|
18
|
+
data: Mapping[str, Any],
|
19
|
+
partition: "Partition",
|
20
|
+
is_file_transfer_message: bool = False,
|
21
|
+
):
|
17
22
|
self.data = data
|
18
23
|
self.partition = partition
|
19
24
|
self.is_file_transfer_message = is_file_transfer_message
|
@@ -21,7 +26,10 @@ class Record:
|
|
21
26
|
def __eq__(self, other: Any) -> bool:
|
22
27
|
if not isinstance(other, Record):
|
23
28
|
return False
|
24
|
-
return
|
29
|
+
return (
|
30
|
+
self.data == other.data
|
31
|
+
and self.partition.stream_name() == other.partition.stream_name()
|
32
|
+
)
|
25
33
|
|
26
34
|
def __repr__(self) -> str:
|
27
35
|
return f"Record(data={self.data}, stream_name={self.partition.stream_name()})"
|
@@ -4,7 +4,9 @@
|
|
4
4
|
|
5
5
|
from typing import Any, Union
|
6
6
|
|
7
|
-
from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import
|
7
|
+
from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import (
|
8
|
+
PartitionGenerationCompletedSentinel,
|
9
|
+
)
|
8
10
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
9
11
|
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
10
12
|
|
@@ -31,4 +33,6 @@ class PartitionCompleteSentinel:
|
|
31
33
|
"""
|
32
34
|
Typedef representing the items that can be added to the ThreadBasedConcurrentStream
|
33
35
|
"""
|
34
|
-
QueueItem = Union[
|
36
|
+
QueueItem = Union[
|
37
|
+
Record, Partition, PartitionCompleteSentinel, PartitionGenerationCompletedSentinel, Exception
|
38
|
+
]
|
@@ -30,7 +30,9 @@ class AbstractStreamStateConverter(ABC):
|
|
30
30
|
def __init__(self, is_sequential_state: bool = True):
|
31
31
|
self._is_sequential_state = is_sequential_state
|
32
32
|
|
33
|
-
def convert_to_state_message(
|
33
|
+
def convert_to_state_message(
|
34
|
+
self, cursor_field: "CursorField", stream_state: MutableMapping[str, Any]
|
35
|
+
) -> MutableMapping[str, Any]:
|
34
36
|
"""
|
35
37
|
Convert the state message from the concurrency-compatible format to the stream's original format.
|
36
38
|
|
@@ -41,7 +43,9 @@ class AbstractStreamStateConverter(ABC):
|
|
41
43
|
legacy_state = stream_state.get("legacy", {})
|
42
44
|
latest_complete_time = self._get_latest_complete_time(stream_state.get("slices", []))
|
43
45
|
if latest_complete_time is not None:
|
44
|
-
legacy_state.update(
|
46
|
+
legacy_state.update(
|
47
|
+
{cursor_field.cursor_field_key: self._to_state_message(latest_complete_time)}
|
48
|
+
)
|
45
49
|
return legacy_state or {}
|
46
50
|
else:
|
47
51
|
return self.serialize(stream_state, ConcurrencyCompatibleStateType.date_range)
|
@@ -51,7 +55,9 @@ class AbstractStreamStateConverter(ABC):
|
|
51
55
|
Get the latest time before which all records have been processed.
|
52
56
|
"""
|
53
57
|
if not slices:
|
54
|
-
raise RuntimeError(
|
58
|
+
raise RuntimeError(
|
59
|
+
"Expected at least one slice but there were none. This is unexpected; please contact Support."
|
60
|
+
)
|
55
61
|
merged_intervals = self.merge_intervals(slices)
|
56
62
|
first_interval = merged_intervals[0]
|
57
63
|
|
@@ -66,7 +72,9 @@ class AbstractStreamStateConverter(ABC):
|
|
66
72
|
stream_slice[self.END_KEY] = self._from_state_message(stream_slice[self.END_KEY])
|
67
73
|
return state
|
68
74
|
|
69
|
-
def serialize(
|
75
|
+
def serialize(
|
76
|
+
self, state: MutableMapping[str, Any], state_type: ConcurrencyCompatibleStateType
|
77
|
+
) -> MutableMapping[str, Any]:
|
70
78
|
"""
|
71
79
|
Perform any transformations needed for compatibility with the converter.
|
72
80
|
"""
|
@@ -77,13 +85,17 @@ class AbstractStreamStateConverter(ABC):
|
|
77
85
|
self.END_KEY: self._to_state_message(stream_slice[self.END_KEY]),
|
78
86
|
}
|
79
87
|
if stream_slice.get(self.MOST_RECENT_RECORD_KEY):
|
80
|
-
serialized_slice[self.MOST_RECENT_RECORD_KEY] = self._to_state_message(
|
88
|
+
serialized_slice[self.MOST_RECENT_RECORD_KEY] = self._to_state_message(
|
89
|
+
stream_slice[self.MOST_RECENT_RECORD_KEY]
|
90
|
+
)
|
81
91
|
serialized_slices.append(serialized_slice)
|
82
92
|
return {"slices": serialized_slices, "state_type": state_type.value}
|
83
93
|
|
84
94
|
@staticmethod
|
85
95
|
def is_state_message_compatible(state: MutableMapping[str, Any]) -> bool:
|
86
|
-
return bool(state) and state.get("state_type") in [
|
96
|
+
return bool(state) and state.get("state_type") in [
|
97
|
+
t.value for t in ConcurrencyCompatibleStateType
|
98
|
+
]
|
87
99
|
|
88
100
|
@abstractmethod
|
89
101
|
def convert_from_sequential_state(
|
@@ -112,7 +124,9 @@ class AbstractStreamStateConverter(ABC):
|
|
112
124
|
"""
|
113
125
|
...
|
114
126
|
|
115
|
-
def merge_intervals(
|
127
|
+
def merge_intervals(
|
128
|
+
self, intervals: List[MutableMapping[str, Any]]
|
129
|
+
) -> List[MutableMapping[str, Any]]:
|
116
130
|
"""
|
117
131
|
Compute and return a list of merged intervals.
|
118
132
|
|
@@ -122,7 +136,9 @@ class AbstractStreamStateConverter(ABC):
|
|
122
136
|
if not intervals:
|
123
137
|
return []
|
124
138
|
|
125
|
-
sorted_intervals = sorted(
|
139
|
+
sorted_intervals = sorted(
|
140
|
+
intervals, key=lambda interval: (interval[self.START_KEY], interval[self.END_KEY])
|
141
|
+
)
|
126
142
|
merged_intervals = [sorted_intervals[0]]
|
127
143
|
|
128
144
|
for current_interval in sorted_intervals[1:]:
|
@@ -155,5 +171,4 @@ class AbstractStreamStateConverter(ABC):
|
|
155
171
|
|
156
172
|
@property
|
157
173
|
@abstractmethod
|
158
|
-
def zero_value(self) -> Any:
|
159
|
-
...
|
174
|
+
def zero_value(self) -> Any: ...
|
@@ -28,8 +28,7 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|
|
28
28
|
|
29
29
|
@property
|
30
30
|
@abstractmethod
|
31
|
-
def _zero_value(self) -> Any:
|
32
|
-
...
|
31
|
+
def _zero_value(self) -> Any: ...
|
33
32
|
|
34
33
|
@property
|
35
34
|
def zero_value(self) -> datetime:
|
@@ -40,16 +39,13 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|
|
40
39
|
return lambda: datetime.now(timezone.utc)
|
41
40
|
|
42
41
|
@abstractmethod
|
43
|
-
def increment(self, timestamp: datetime) -> datetime:
|
44
|
-
...
|
42
|
+
def increment(self, timestamp: datetime) -> datetime: ...
|
45
43
|
|
46
44
|
@abstractmethod
|
47
|
-
def parse_timestamp(self, timestamp: Any) -> datetime:
|
48
|
-
...
|
45
|
+
def parse_timestamp(self, timestamp: Any) -> datetime: ...
|
49
46
|
|
50
47
|
@abstractmethod
|
51
|
-
def output_format(self, timestamp: datetime) -> Any:
|
52
|
-
...
|
48
|
+
def output_format(self, timestamp: datetime) -> Any: ...
|
53
49
|
|
54
50
|
def parse_value(self, value: Any) -> Any:
|
55
51
|
"""
|
@@ -61,7 +57,10 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|
|
61
57
|
return bool(self.increment(end_time) >= start_time)
|
62
58
|
|
63
59
|
def convert_from_sequential_state(
|
64
|
-
self,
|
60
|
+
self,
|
61
|
+
cursor_field: CursorField,
|
62
|
+
stream_state: MutableMapping[str, Any],
|
63
|
+
start: Optional[datetime],
|
65
64
|
) -> Tuple[datetime, MutableMapping[str, Any]]:
|
66
65
|
"""
|
67
66
|
Convert the state message to the format required by the ConcurrentCursor.
|
@@ -82,7 +81,9 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|
|
82
81
|
# Create a slice to represent the records synced during prior syncs.
|
83
82
|
# The start and end are the same to avoid confusion as to whether the records for this slice
|
84
83
|
# were actually synced
|
85
|
-
slices = [
|
84
|
+
slices = [
|
85
|
+
{self.START_KEY: start if start is not None else sync_start, self.END_KEY: sync_start}
|
86
|
+
]
|
86
87
|
|
87
88
|
return sync_start, {
|
88
89
|
"state_type": ConcurrencyCompatibleStateType.date_range.value,
|
@@ -90,10 +91,17 @@ class DateTimeStreamStateConverter(AbstractStreamStateConverter):
|
|
90
91
|
"legacy": stream_state,
|
91
92
|
}
|
92
93
|
|
93
|
-
def _get_sync_start(
|
94
|
+
def _get_sync_start(
|
95
|
+
self,
|
96
|
+
cursor_field: CursorField,
|
97
|
+
stream_state: MutableMapping[str, Any],
|
98
|
+
start: Optional[datetime],
|
99
|
+
) -> datetime:
|
94
100
|
sync_start = start if start is not None else self.zero_value
|
95
101
|
prev_sync_low_water_mark = (
|
96
|
-
self.parse_timestamp(stream_state[cursor_field.cursor_field_key])
|
102
|
+
self.parse_timestamp(stream_state[cursor_field.cursor_field_key])
|
103
|
+
if cursor_field.cursor_field_key in stream_state
|
104
|
+
else None
|
97
105
|
)
|
98
106
|
if prev_sync_low_water_mark and prev_sync_low_water_mark >= sync_start:
|
99
107
|
return prev_sync_low_water_mark
|
@@ -126,7 +134,9 @@ class EpochValueConcurrentStreamStateConverter(DateTimeStreamStateConverter):
|
|
126
134
|
def parse_timestamp(self, timestamp: int) -> datetime:
|
127
135
|
dt_object = pendulum.from_timestamp(timestamp)
|
128
136
|
if not isinstance(dt_object, DateTime):
|
129
|
-
raise ValueError(
|
137
|
+
raise ValueError(
|
138
|
+
f"DateTime object was expected but got {type(dt_object)} from pendulum.parse({timestamp})"
|
139
|
+
)
|
130
140
|
return dt_object # type: ignore # we are manually type checking because pendulum.parse may return different types
|
131
141
|
|
132
142
|
|
@@ -146,7 +156,9 @@ class IsoMillisConcurrentStreamStateConverter(DateTimeStreamStateConverter):
|
|
146
156
|
|
147
157
|
_zero_value = "0001-01-01T00:00:00.000Z"
|
148
158
|
|
149
|
-
def __init__(
|
159
|
+
def __init__(
|
160
|
+
self, is_sequential_state: bool = True, cursor_granularity: Optional[timedelta] = None
|
161
|
+
):
|
150
162
|
super().__init__(is_sequential_state=is_sequential_state)
|
151
163
|
self._cursor_granularity = cursor_granularity or timedelta(milliseconds=1)
|
152
164
|
|
@@ -159,7 +171,9 @@ class IsoMillisConcurrentStreamStateConverter(DateTimeStreamStateConverter):
|
|
159
171
|
def parse_timestamp(self, timestamp: str) -> datetime:
|
160
172
|
dt_object = pendulum.parse(timestamp)
|
161
173
|
if not isinstance(dt_object, DateTime):
|
162
|
-
raise ValueError(
|
174
|
+
raise ValueError(
|
175
|
+
f"DateTime object was expected but got {type(dt_object)} from pendulum.parse({timestamp})"
|
176
|
+
)
|
163
177
|
return dt_object # type: ignore # we are manually type checking because pendulum.parse may return different types
|
164
178
|
|
165
179
|
|
@@ -176,7 +190,9 @@ class CustomFormatConcurrentStreamStateConverter(IsoMillisConcurrentStreamStateC
|
|
176
190
|
is_sequential_state: bool = True,
|
177
191
|
cursor_granularity: Optional[timedelta] = None,
|
178
192
|
):
|
179
|
-
super().__init__(
|
193
|
+
super().__init__(
|
194
|
+
is_sequential_state=is_sequential_state, cursor_granularity=cursor_granularity
|
195
|
+
)
|
180
196
|
self._datetime_format = datetime_format
|
181
197
|
self._input_datetime_formats = input_datetime_formats if input_datetime_formats else []
|
182
198
|
self._input_datetime_formats += [self._datetime_format]
|