airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +17 -2
- airbyte_cdk/config_observation.py +10 -3
- airbyte_cdk/connector.py +19 -9
- airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
- airbyte_cdk/connector_builder/main.py +26 -6
- airbyte_cdk/connector_builder/message_grouper.py +95 -25
- airbyte_cdk/destinations/destination.py +47 -14
- airbyte_cdk/destinations/vector_db_based/config.py +36 -14
- airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
- airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
- airbyte_cdk/entrypoint.py +82 -26
- airbyte_cdk/exception_handler.py +13 -3
- airbyte_cdk/logger.py +10 -2
- airbyte_cdk/models/airbyte_protocol.py +11 -5
- airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/abstract_source.py +63 -17
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
- airbyte_cdk/sources/connector_state_manager.py +32 -10
- airbyte_cdk/sources/declarative/async_job/job.py +3 -1
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
- airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
- airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/token.py +25 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
- airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +43 -0
- airbyte_cdk/sources/declarative/declarative_source.py +3 -1
- airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
- airbyte_cdk/sources/declarative/decoders/__init__.py +2 -2
- airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +48 -13
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
- airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
- airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
- airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +14 -5
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +697 -678
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +802 -232
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
- airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
- airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
- airbyte_cdk/sources/declarative/spec/spec.py +8 -2
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
- airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
- airbyte_cdk/sources/declarative/types.py +8 -1
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
- airbyte_cdk/sources/embedded/base_integration.py +14 -4
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +3 -1
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
- airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
- airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +13 -15
- airbyte_cdk/sources/file_based/file_based_source.py +82 -24
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
- airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
- airbyte_cdk/sources/http_logger.py +5 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +17 -7
- airbyte_cdk/sources/streams/availability_strategy.py +9 -3
- airbyte_cdk/sources/streams/call_rate.py +63 -19
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
- airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
- airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
- airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
- airbyte_cdk/sources/streams/core.py +77 -22
- airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
- airbyte_cdk/sources/streams/http/exceptions.py +2 -2
- airbyte_cdk/sources/streams/http/http.py +133 -33
- airbyte_cdk/sources/streams/http/http_client.py +91 -29
- airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +5 -1
- airbyte_cdk/sources/utils/record_helper.py +12 -3
- airbyte_cdk/sources/utils/schema_helpers.py +9 -3
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +24 -9
- airbyte_cdk/sql/exceptions.py +19 -6
- airbyte_cdk/sql/secrets.py +3 -1
- airbyte_cdk/sql/shared/catalog_providers.py +13 -4
- airbyte_cdk/sql/shared/sql_processor.py +44 -14
- airbyte_cdk/test/catalog_builder.py +19 -8
- airbyte_cdk/test/entrypoint_wrapper.py +27 -8
- airbyte_cdk/test/mock_http/mocker.py +41 -11
- airbyte_cdk/test/mock_http/request.py +9 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +29 -7
- airbyte_cdk/test/state_builder.py +10 -2
- airbyte_cdk/test/utils/data.py +6 -2
- airbyte_cdk/test/utils/http_mocking.py +3 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +11 -4
- airbyte_cdk/utils/print_buffer.py +6 -1
- airbyte_cdk/utils/schema_inferrer.py +30 -9
- airbyte_cdk/utils/spec_schema_transformations.py +3 -1
- airbyte_cdk/utils/traced_exception.py +35 -9
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/METADATA +8 -7
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/RECORD +200 -200
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/source.py
CHANGED
@@ -27,15 +27,19 @@ class ExperimentalClassWarning(DeprecationWarning):
|
|
27
27
|
|
28
28
|
class BaseSource(BaseConnector[TConfig], ABC, Generic[TConfig, TState, TCatalog]):
|
29
29
|
@abstractmethod
|
30
|
-
def read_state(self, state_path: str) -> TState:
|
31
|
-
...
|
30
|
+
def read_state(self, state_path: str) -> TState: ...
|
32
31
|
|
33
32
|
@abstractmethod
|
34
|
-
def read_catalog(self, catalog_path: str) -> TCatalog:
|
35
|
-
...
|
33
|
+
def read_catalog(self, catalog_path: str) -> TCatalog: ...
|
36
34
|
|
37
35
|
@abstractmethod
|
38
|
-
def read(
|
36
|
+
def read(
|
37
|
+
self,
|
38
|
+
logger: logging.Logger,
|
39
|
+
config: TConfig,
|
40
|
+
catalog: TCatalog,
|
41
|
+
state: Optional[TState] = None,
|
42
|
+
) -> Iterable[AirbyteMessage]:
|
39
43
|
"""
|
40
44
|
Returns a generator of the AirbyteMessages generated by reading the source with the given configuration, catalog, and state.
|
41
45
|
"""
|
@@ -69,8 +73,14 @@ class Source(
|
|
69
73
|
if state_obj:
|
70
74
|
for state in state_obj: # type: ignore # `isinstance(state_obj, List)` ensures that this is a list
|
71
75
|
parsed_message = AirbyteStateMessageSerializer.load(state)
|
72
|
-
if
|
73
|
-
|
76
|
+
if (
|
77
|
+
not parsed_message.stream
|
78
|
+
and not parsed_message.data
|
79
|
+
and not parsed_message.global_
|
80
|
+
):
|
81
|
+
raise ValueError(
|
82
|
+
"AirbyteStateMessage should contain either a stream, global, or state field"
|
83
|
+
)
|
74
84
|
parsed_state_messages.append(parsed_message)
|
75
85
|
return parsed_state_messages
|
76
86
|
|
@@ -20,7 +20,9 @@ class AvailabilityStrategy(ABC):
|
|
20
20
|
"""
|
21
21
|
|
22
22
|
@abstractmethod
|
23
|
-
def check_availability(
|
23
|
+
def check_availability(
|
24
|
+
self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None
|
25
|
+
) -> Tuple[bool, Optional[str]]:
|
24
26
|
"""
|
25
27
|
Checks stream availability.
|
26
28
|
|
@@ -52,7 +54,9 @@ class AvailabilityStrategy(ABC):
|
|
52
54
|
return next(slices)
|
53
55
|
|
54
56
|
@staticmethod
|
55
|
-
def get_first_record_for_slice(
|
57
|
+
def get_first_record_for_slice(
|
58
|
+
stream: Stream, stream_slice: Optional[Mapping[str, Any]]
|
59
|
+
) -> StreamData:
|
56
60
|
"""
|
57
61
|
Gets the first record for a stream_slice of a stream.
|
58
62
|
|
@@ -70,7 +74,9 @@ class AvailabilityStrategy(ABC):
|
|
70
74
|
|
71
75
|
# We wrap the return output of read_records() because some implementations return types that are iterable,
|
72
76
|
# but not iterators such as lists or tuples
|
73
|
-
records_for_slice = iter(
|
77
|
+
records_for_slice = iter(
|
78
|
+
stream.read_records(sync_mode=SyncMode.full_refresh, stream_slice=stream_slice)
|
79
|
+
)
|
74
80
|
|
75
81
|
return next(records_for_slice)
|
76
82
|
finally:
|
@@ -76,7 +76,9 @@ class AbstractCallRatePolicy(abc.ABC):
|
|
76
76
|
"""
|
77
77
|
|
78
78
|
@abc.abstractmethod
|
79
|
-
def update(
|
79
|
+
def update(
|
80
|
+
self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]
|
81
|
+
) -> None:
|
80
82
|
"""Update call rate counting with current values
|
81
83
|
|
82
84
|
:param available_calls:
|
@@ -202,12 +204,20 @@ class UnlimitedCallRatePolicy(BaseCallRatePolicy):
|
|
202
204
|
def try_acquire(self, request: Any, weight: int) -> None:
|
203
205
|
"""Do nothing"""
|
204
206
|
|
205
|
-
def update(
|
207
|
+
def update(
|
208
|
+
self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]
|
209
|
+
) -> None:
|
206
210
|
"""Do nothing"""
|
207
211
|
|
208
212
|
|
209
213
|
class FixedWindowCallRatePolicy(BaseCallRatePolicy):
|
210
|
-
def __init__(
|
214
|
+
def __init__(
|
215
|
+
self,
|
216
|
+
next_reset_ts: datetime.datetime,
|
217
|
+
period: timedelta,
|
218
|
+
call_limit: int,
|
219
|
+
matchers: list[RequestMatcher],
|
220
|
+
):
|
211
221
|
"""A policy that allows {call_limit} calls within a {period} time interval
|
212
222
|
|
213
223
|
:param next_reset_ts: next call rate reset time point
|
@@ -235,7 +245,8 @@ class FixedWindowCallRatePolicy(BaseCallRatePolicy):
|
|
235
245
|
if self._calls_num + weight > self._call_limit:
|
236
246
|
reset_in = self._next_reset_ts - datetime.datetime.now()
|
237
247
|
error_message = (
|
238
|
-
f"reached maximum number of allowed calls {self._call_limit} "
|
248
|
+
f"reached maximum number of allowed calls {self._call_limit} "
|
249
|
+
f"per {self._offset} interval, next reset in {reset_in}."
|
239
250
|
)
|
240
251
|
raise CallRateLimitHit(
|
241
252
|
error=error_message,
|
@@ -247,7 +258,9 @@ class FixedWindowCallRatePolicy(BaseCallRatePolicy):
|
|
247
258
|
|
248
259
|
self._calls_num += weight
|
249
260
|
|
250
|
-
def update(
|
261
|
+
def update(
|
262
|
+
self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]
|
263
|
+
) -> None:
|
251
264
|
"""Update call rate counters, by default, only reacts to decreasing updates of available_calls and changes to call_reset_ts.
|
252
265
|
We ignore updates with available_calls > current_available_calls to support call rate limits that are lower than API limits.
|
253
266
|
|
@@ -260,12 +273,18 @@ class FixedWindowCallRatePolicy(BaseCallRatePolicy):
|
|
260
273
|
|
261
274
|
if available_calls is not None and current_available_calls > available_calls:
|
262
275
|
logger.debug(
|
263
|
-
"got rate limit update from api, adjusting available calls from %s to %s",
|
276
|
+
"got rate limit update from api, adjusting available calls from %s to %s",
|
277
|
+
current_available_calls,
|
278
|
+
available_calls,
|
264
279
|
)
|
265
280
|
self._calls_num = self._call_limit - available_calls
|
266
281
|
|
267
282
|
if call_reset_ts is not None and call_reset_ts != self._next_reset_ts:
|
268
|
-
logger.debug(
|
283
|
+
logger.debug(
|
284
|
+
"got rate limit update from api, adjusting reset time from %s to %s",
|
285
|
+
self._next_reset_ts,
|
286
|
+
call_reset_ts,
|
287
|
+
)
|
269
288
|
self._next_reset_ts = call_reset_ts
|
270
289
|
|
271
290
|
def _update_current_window(self) -> None:
|
@@ -292,7 +311,10 @@ class MovingWindowCallRatePolicy(BaseCallRatePolicy):
|
|
292
311
|
"""
|
293
312
|
if not rates:
|
294
313
|
raise ValueError("The list of rates can not be empty")
|
295
|
-
pyrate_rates = [
|
314
|
+
pyrate_rates = [
|
315
|
+
PyRateRate(limit=rate.limit, interval=int(rate.interval.total_seconds() * 1000))
|
316
|
+
for rate in rates
|
317
|
+
]
|
296
318
|
self._bucket = InMemoryBucket(pyrate_rates)
|
297
319
|
# Limiter will create the background task that clears old requests in the bucket
|
298
320
|
self._limiter = Limiter(self._bucket)
|
@@ -320,14 +342,18 @@ class MovingWindowCallRatePolicy(BaseCallRatePolicy):
|
|
320
342
|
time_to_wait=timedelta(milliseconds=time_to_wait),
|
321
343
|
)
|
322
344
|
|
323
|
-
def update(
|
345
|
+
def update(
|
346
|
+
self, available_calls: Optional[int], call_reset_ts: Optional[datetime.datetime]
|
347
|
+
) -> None:
|
324
348
|
"""Adjust call bucket to reflect the state of the API server
|
325
349
|
|
326
350
|
:param available_calls:
|
327
351
|
:param call_reset_ts:
|
328
352
|
:return:
|
329
353
|
"""
|
330
|
-
if
|
354
|
+
if (
|
355
|
+
available_calls is not None and call_reset_ts is None
|
356
|
+
): # we do our best to sync buckets with API
|
331
357
|
if available_calls == 0:
|
332
358
|
with self._limiter.lock:
|
333
359
|
items_to_add = self._bucket.count() < self._bucket.rates[0].limit
|
@@ -350,7 +376,9 @@ class AbstractAPIBudget(abc.ABC):
|
|
350
376
|
"""
|
351
377
|
|
352
378
|
@abc.abstractmethod
|
353
|
-
def acquire_call(
|
379
|
+
def acquire_call(
|
380
|
+
self, request: Any, block: bool = True, timeout: Optional[float] = None
|
381
|
+
) -> None:
|
354
382
|
"""Try to get a call from budget, will block by default
|
355
383
|
|
356
384
|
:param request:
|
@@ -375,7 +403,9 @@ class AbstractAPIBudget(abc.ABC):
|
|
375
403
|
class APIBudget(AbstractAPIBudget):
|
376
404
|
"""Default APIBudget implementation"""
|
377
405
|
|
378
|
-
def __init__(
|
406
|
+
def __init__(
|
407
|
+
self, policies: list[AbstractCallRatePolicy], maximum_attempts_to_acquire: int = 100000
|
408
|
+
) -> None:
|
379
409
|
"""Constructor
|
380
410
|
|
381
411
|
:param policies: list of policies in this budget
|
@@ -392,7 +422,9 @@ class APIBudget(AbstractAPIBudget):
|
|
392
422
|
return policy
|
393
423
|
return None
|
394
424
|
|
395
|
-
def acquire_call(
|
425
|
+
def acquire_call(
|
426
|
+
self, request: Any, block: bool = True, timeout: Optional[float] = None
|
427
|
+
) -> None:
|
396
428
|
"""Try to get a call from budget, will block by default.
|
397
429
|
Matchers will be called sequentially in the same order they were added.
|
398
430
|
The first matcher that returns True will
|
@@ -417,7 +449,9 @@ class APIBudget(AbstractAPIBudget):
|
|
417
449
|
"""
|
418
450
|
pass
|
419
451
|
|
420
|
-
def _do_acquire(
|
452
|
+
def _do_acquire(
|
453
|
+
self, request: Any, policy: AbstractCallRatePolicy, block: bool, timeout: Optional[float]
|
454
|
+
) -> None:
|
421
455
|
"""Internal method to try to acquire a call credit
|
422
456
|
|
423
457
|
:param request:
|
@@ -439,14 +473,20 @@ class APIBudget(AbstractAPIBudget):
|
|
439
473
|
else:
|
440
474
|
time_to_wait = exc.time_to_wait
|
441
475
|
|
442
|
-
time_to_wait = max(
|
443
|
-
|
476
|
+
time_to_wait = max(
|
477
|
+
timedelta(0), time_to_wait
|
478
|
+
) # sometimes we get negative duration
|
479
|
+
logger.info(
|
480
|
+
"reached call limit %s. going to sleep for %s", exc.rate, time_to_wait
|
481
|
+
)
|
444
482
|
time.sleep(time_to_wait.total_seconds())
|
445
483
|
else:
|
446
484
|
raise
|
447
485
|
|
448
486
|
if last_exception:
|
449
|
-
logger.info(
|
487
|
+
logger.info(
|
488
|
+
"we used all %s attempts to acquire and failed", self._maximum_attempts_to_acquire
|
489
|
+
)
|
450
490
|
raise last_exception
|
451
491
|
|
452
492
|
|
@@ -481,9 +521,13 @@ class HttpAPIBudget(APIBudget):
|
|
481
521
|
reset_ts = self.get_reset_ts_from_response(response)
|
482
522
|
policy.update(available_calls=available_calls, call_reset_ts=reset_ts)
|
483
523
|
|
484
|
-
def get_reset_ts_from_response(
|
524
|
+
def get_reset_ts_from_response(
|
525
|
+
self, response: requests.Response
|
526
|
+
) -> Optional[datetime.datetime]:
|
485
527
|
if response.headers.get(self._ratelimit_reset_header):
|
486
|
-
return datetime.datetime.fromtimestamp(
|
528
|
+
return datetime.datetime.fromtimestamp(
|
529
|
+
int(response.headers[self._ratelimit_reset_header])
|
530
|
+
)
|
487
531
|
return None
|
488
532
|
|
489
533
|
def get_calls_left_from_response(self, response: requests.Response) -> Optional[int]:
|
@@ -53,7 +53,9 @@ class IncrementalCheckpointReader(CheckpointReader):
|
|
53
53
|
before syncing data.
|
54
54
|
"""
|
55
55
|
|
56
|
-
def __init__(
|
56
|
+
def __init__(
|
57
|
+
self, stream_state: Mapping[str, Any], stream_slices: Iterable[Optional[Mapping[str, Any]]]
|
58
|
+
):
|
57
59
|
self._state: Optional[Mapping[str, Any]] = stream_state
|
58
60
|
self._stream_slices = iter(stream_slices)
|
59
61
|
self._has_slices = False
|
@@ -87,7 +89,12 @@ class CursorBasedCheckpointReader(CheckpointReader):
|
|
87
89
|
that belongs to the Concurrent CDK.
|
88
90
|
"""
|
89
91
|
|
90
|
-
def __init__(
|
92
|
+
def __init__(
|
93
|
+
self,
|
94
|
+
cursor: Cursor,
|
95
|
+
stream_slices: Iterable[Optional[Mapping[str, Any]]],
|
96
|
+
read_state_from_cursor: bool = False,
|
97
|
+
):
|
91
98
|
self._cursor = cursor
|
92
99
|
self._stream_slices = iter(stream_slices)
|
93
100
|
# read_state_from_cursor is used to delineate that partitions should determine when to stop syncing dynamically according
|
@@ -153,7 +160,11 @@ class CursorBasedCheckpointReader(CheckpointReader):
|
|
153
160
|
next_slice = self.read_and_convert_slice()
|
154
161
|
state_for_slice = self._cursor.select_state(next_slice)
|
155
162
|
has_more = state_for_slice == FULL_REFRESH_COMPLETE_STATE
|
156
|
-
return StreamSlice(
|
163
|
+
return StreamSlice(
|
164
|
+
cursor_slice=state_for_slice or {},
|
165
|
+
partition=next_slice.partition,
|
166
|
+
extra_fields=next_slice.extra_fields,
|
167
|
+
)
|
157
168
|
else:
|
158
169
|
state_for_slice = self._cursor.select_state(self.current_slice)
|
159
170
|
if state_for_slice == FULL_REFRESH_COMPLETE_STATE:
|
@@ -173,7 +184,9 @@ class CursorBasedCheckpointReader(CheckpointReader):
|
|
173
184
|
)
|
174
185
|
# The reader continues to process the current partition if it's state is still in progress
|
175
186
|
return StreamSlice(
|
176
|
-
cursor_slice=state_for_slice or {},
|
187
|
+
cursor_slice=state_for_slice or {},
|
188
|
+
partition=self.current_slice.partition,
|
189
|
+
extra_fields=self.current_slice.extra_fields,
|
177
190
|
)
|
178
191
|
else:
|
179
192
|
# Unlike RFR cursors that iterate dynamically according to how stream state is updated, most cursors operate
|
@@ -218,8 +231,17 @@ class LegacyCursorBasedCheckpointReader(CursorBasedCheckpointReader):
|
|
218
231
|
}
|
219
232
|
"""
|
220
233
|
|
221
|
-
def __init__(
|
222
|
-
|
234
|
+
def __init__(
|
235
|
+
self,
|
236
|
+
cursor: Cursor,
|
237
|
+
stream_slices: Iterable[Optional[Mapping[str, Any]]],
|
238
|
+
read_state_from_cursor: bool = False,
|
239
|
+
):
|
240
|
+
super().__init__(
|
241
|
+
cursor=cursor,
|
242
|
+
stream_slices=stream_slices,
|
243
|
+
read_state_from_cursor=read_state_from_cursor,
|
244
|
+
)
|
223
245
|
|
224
246
|
def next(self) -> Optional[Mapping[str, Any]]:
|
225
247
|
try:
|
@@ -228,7 +250,9 @@ class LegacyCursorBasedCheckpointReader(CursorBasedCheckpointReader):
|
|
228
250
|
if "partition" in dict(self.current_slice):
|
229
251
|
raise ValueError("Stream is configured to use invalid stream slice key 'partition'")
|
230
252
|
elif "cursor_slice" in dict(self.current_slice):
|
231
|
-
raise ValueError(
|
253
|
+
raise ValueError(
|
254
|
+
"Stream is configured to use invalid stream slice key 'cursor_slice'"
|
255
|
+
)
|
232
256
|
|
233
257
|
# We convert StreamSlice to a regular mapping because legacy connectors operate on the basic Mapping object. We
|
234
258
|
# also duplicate all fields at the top level for backwards compatibility for existing Python sources
|
@@ -5,7 +5,9 @@ from typing import Any, Mapping, MutableMapping, Optional
|
|
5
5
|
|
6
6
|
from airbyte_cdk.models import FailureType
|
7
7
|
from airbyte_cdk.sources.streams.checkpoint import Cursor
|
8
|
-
from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import
|
8
|
+
from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
|
9
|
+
PerPartitionKeySerializer,
|
10
|
+
)
|
9
11
|
from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
|
10
12
|
from airbyte_cdk.utils import AirbyteTracedException
|
11
13
|
|
@@ -97,7 +99,9 @@ class SubstreamResumableFullRefreshCursor(Cursor):
|
|
97
99
|
if not stream_slice:
|
98
100
|
raise ValueError("A partition needs to be provided in order to extract a state")
|
99
101
|
|
100
|
-
return self._per_partition_state.get(
|
102
|
+
return self._per_partition_state.get(
|
103
|
+
self._to_partition_key(stream_slice.partition), {}
|
104
|
+
).get("cursor")
|
101
105
|
|
102
106
|
def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
|
103
107
|
return self._partition_serializer.to_partition_key(partition)
|
@@ -6,9 +6,17 @@ import copy
|
|
6
6
|
import json
|
7
7
|
import logging
|
8
8
|
from functools import lru_cache
|
9
|
-
from typing import Any,
|
10
|
-
|
11
|
-
from airbyte_cdk.models import
|
9
|
+
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
|
10
|
+
|
11
|
+
from airbyte_cdk.models import (
|
12
|
+
AirbyteLogMessage,
|
13
|
+
AirbyteMessage,
|
14
|
+
AirbyteStream,
|
15
|
+
ConfiguredAirbyteStream,
|
16
|
+
Level,
|
17
|
+
SyncMode,
|
18
|
+
Type,
|
19
|
+
)
|
12
20
|
from airbyte_cdk.sources import AbstractSource, Source
|
13
21
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
14
22
|
from airbyte_cdk.sources.message import MessageRepository
|
@@ -16,15 +24,23 @@ from airbyte_cdk.sources.source import ExperimentalClassWarning
|
|
16
24
|
from airbyte_cdk.sources.streams import Stream
|
17
25
|
from airbyte_cdk.sources.streams.availability_strategy import AvailabilityStrategy
|
18
26
|
from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
|
19
|
-
from airbyte_cdk.sources.streams.concurrent.availability_strategy import
|
27
|
+
from airbyte_cdk.sources.streams.concurrent.availability_strategy import (
|
28
|
+
AbstractAvailabilityStrategy,
|
29
|
+
AlwaysAvailableAvailabilityStrategy,
|
30
|
+
)
|
20
31
|
from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, FinalStateCursor
|
21
32
|
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
|
22
33
|
from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage
|
23
|
-
from airbyte_cdk.sources.streams.concurrent.helpers import
|
34
|
+
from airbyte_cdk.sources.streams.concurrent.helpers import (
|
35
|
+
get_cursor_field_from_stream,
|
36
|
+
get_primary_key_from_stream,
|
37
|
+
)
|
24
38
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
25
39
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
|
26
40
|
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
27
|
-
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import
|
41
|
+
from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
|
42
|
+
DateTimeStreamStateConverter,
|
43
|
+
)
|
28
44
|
from airbyte_cdk.sources.streams.core import StreamData
|
29
45
|
from airbyte_cdk.sources.types import StreamSlice
|
30
46
|
from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
|
@@ -75,7 +91,9 @@ class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
|
|
75
91
|
partition_generator=StreamPartitionGenerator(
|
76
92
|
stream,
|
77
93
|
message_repository,
|
78
|
-
SyncMode.full_refresh
|
94
|
+
SyncMode.full_refresh
|
95
|
+
if isinstance(cursor, FinalStateCursor)
|
96
|
+
else SyncMode.incremental,
|
79
97
|
[cursor_field] if cursor_field is not None else None,
|
80
98
|
state,
|
81
99
|
cursor,
|
@@ -97,14 +115,23 @@ class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
|
|
97
115
|
|
98
116
|
@property
|
99
117
|
def state(self) -> MutableMapping[str, Any]:
|
100
|
-
raise NotImplementedError(
|
118
|
+
raise NotImplementedError(
|
119
|
+
"This should not be called as part of the Concurrent CDK code. Please report the problem to Airbyte"
|
120
|
+
)
|
101
121
|
|
102
122
|
@state.setter
|
103
123
|
def state(self, value: Mapping[str, Any]) -> None:
|
104
124
|
if "state" in dir(self._legacy_stream):
|
105
125
|
self._legacy_stream.state = value # type: ignore # validating `state` is attribute of stream using `if` above
|
106
126
|
|
107
|
-
def __init__(
|
127
|
+
def __init__(
|
128
|
+
self,
|
129
|
+
stream: DefaultStream,
|
130
|
+
legacy_stream: Stream,
|
131
|
+
cursor: Cursor,
|
132
|
+
slice_logger: SliceLogger,
|
133
|
+
logger: logging.Logger,
|
134
|
+
):
|
108
135
|
"""
|
109
136
|
:param stream: The underlying AbstractStream
|
110
137
|
"""
|
@@ -141,7 +168,10 @@ class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
|
|
141
168
|
# This shouldn't happen if the ConcurrentCursor was used
|
142
169
|
state = "unknown; no state attribute was available on the cursor"
|
143
170
|
yield AirbyteMessage(
|
144
|
-
type=Type.LOG,
|
171
|
+
type=Type.LOG,
|
172
|
+
log=AirbyteLogMessage(
|
173
|
+
level=Level.ERROR, message=f"Cursor State at time of exception: {state}"
|
174
|
+
),
|
145
175
|
)
|
146
176
|
raise exc
|
147
177
|
|
@@ -180,7 +210,9 @@ class StreamFacade(AbstractStreamFacade[DefaultStream], Stream):
|
|
180
210
|
def supports_incremental(self) -> bool:
|
181
211
|
return self._legacy_stream.supports_incremental
|
182
212
|
|
183
|
-
def check_availability(
|
213
|
+
def check_availability(
|
214
|
+
self, logger: logging.Logger, source: Optional["Source"] = None
|
215
|
+
) -> Tuple[bool, Optional[str]]:
|
184
216
|
"""
|
185
217
|
Verifies the stream is available. Delegates to the underlying AbstractStream and ignores the parameters
|
186
218
|
:param logger: (ignored)
|
@@ -264,7 +296,9 @@ class StreamPartition(Partition):
|
|
264
296
|
):
|
265
297
|
if isinstance(record_data, Mapping):
|
266
298
|
data_to_return = dict(record_data)
|
267
|
-
self._stream.transformer.transform(
|
299
|
+
self._stream.transformer.transform(
|
300
|
+
data_to_return, self._stream.get_json_schema()
|
301
|
+
)
|
268
302
|
yield Record(data_to_return, self)
|
269
303
|
else:
|
270
304
|
self._message_repository.emit_message(record_data)
|
@@ -329,9 +363,17 @@ class StreamPartitionGenerator(PartitionGenerator):
|
|
329
363
|
self._cursor = cursor
|
330
364
|
|
331
365
|
def generate(self) -> Iterable[Partition]:
|
332
|
-
for s in self._stream.stream_slices(
|
366
|
+
for s in self._stream.stream_slices(
|
367
|
+
sync_mode=self._sync_mode, cursor_field=self._cursor_field, stream_state=self._state
|
368
|
+
):
|
333
369
|
yield StreamPartition(
|
334
|
-
self._stream,
|
370
|
+
self._stream,
|
371
|
+
copy.deepcopy(s),
|
372
|
+
self.message_repository,
|
373
|
+
self._sync_mode,
|
374
|
+
self._cursor_field,
|
375
|
+
self._state,
|
376
|
+
self._cursor,
|
335
377
|
)
|
336
378
|
|
337
379
|
|
@@ -348,7 +390,7 @@ class CursorPartitionGenerator(PartitionGenerator):
|
|
348
390
|
|
349
391
|
def __init__(
|
350
392
|
self,
|
351
|
-
|
393
|
+
stream: Stream,
|
352
394
|
message_repository: MessageRepository,
|
353
395
|
cursor: Cursor,
|
354
396
|
connector_state_converter: DateTimeStreamStateConverter,
|
@@ -358,12 +400,12 @@ class CursorPartitionGenerator(PartitionGenerator):
|
|
358
400
|
"""
|
359
401
|
Initialize the CursorPartitionGenerator with a stream, sync mode, and cursor.
|
360
402
|
|
361
|
-
:param
|
403
|
+
:param stream: The stream to delegate to for partition generation.
|
362
404
|
:param message_repository: The message repository to use to emit non-record messages.
|
363
405
|
:param sync_mode: The synchronization mode.
|
364
406
|
:param cursor: A Cursor object that maintains the state and the cursor field.
|
365
407
|
"""
|
366
|
-
self.
|
408
|
+
self._stream = stream
|
367
409
|
self.message_repository = message_repository
|
368
410
|
self._sync_mode = SyncMode.full_refresh
|
369
411
|
self._cursor = cursor
|
@@ -382,8 +424,16 @@ class CursorPartitionGenerator(PartitionGenerator):
|
|
382
424
|
:return: An iterable of StreamPartition objects.
|
383
425
|
"""
|
384
426
|
|
385
|
-
start_boundary =
|
386
|
-
|
427
|
+
start_boundary = (
|
428
|
+
self._slice_boundary_fields[self._START_BOUNDARY]
|
429
|
+
if self._slice_boundary_fields
|
430
|
+
else "start"
|
431
|
+
)
|
432
|
+
end_boundary = (
|
433
|
+
self._slice_boundary_fields[self._END_BOUNDARY]
|
434
|
+
if self._slice_boundary_fields
|
435
|
+
else "end"
|
436
|
+
)
|
387
437
|
|
388
438
|
for slice_start, slice_end in self._cursor.generate_slices():
|
389
439
|
stream_slice = StreamSlice(
|
@@ -395,7 +445,7 @@ class CursorPartitionGenerator(PartitionGenerator):
|
|
395
445
|
)
|
396
446
|
|
397
447
|
yield StreamPartition(
|
398
|
-
self.
|
448
|
+
self._stream,
|
399
449
|
copy.deepcopy(stream_slice),
|
400
450
|
self.message_repository,
|
401
451
|
self._sync_mode,
|
@@ -405,12 +455,17 @@ class CursorPartitionGenerator(PartitionGenerator):
|
|
405
455
|
)
|
406
456
|
|
407
457
|
|
408
|
-
@deprecated(
|
458
|
+
@deprecated(
|
459
|
+
"Availability strategy has been soft deprecated. Do not use. Class is subject to removal",
|
460
|
+
category=ExperimentalClassWarning,
|
461
|
+
)
|
409
462
|
class AvailabilityStrategyFacade(AvailabilityStrategy):
|
410
463
|
def __init__(self, abstract_availability_strategy: AbstractAvailabilityStrategy):
|
411
464
|
self._abstract_availability_strategy = abstract_availability_strategy
|
412
465
|
|
413
|
-
def check_availability(
|
466
|
+
def check_availability(
|
467
|
+
self, stream: Stream, logger: logging.Logger, source: Optional["Source"] = None
|
468
|
+
) -> Tuple[bool, Optional[str]]:
|
414
469
|
"""
|
415
470
|
Checks stream availability.
|
416
471
|
|