airbyte-cdk 6.5.3rc2-py3-none-any.whl → 6.5.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +17 -2
- airbyte_cdk/config_observation.py +10 -3
- airbyte_cdk/connector.py +19 -9
- airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
- airbyte_cdk/connector_builder/main.py +26 -6
- airbyte_cdk/connector_builder/message_grouper.py +95 -25
- airbyte_cdk/destinations/destination.py +47 -14
- airbyte_cdk/destinations/vector_db_based/config.py +36 -14
- airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
- airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
- airbyte_cdk/entrypoint.py +82 -26
- airbyte_cdk/exception_handler.py +13 -3
- airbyte_cdk/logger.py +10 -2
- airbyte_cdk/models/airbyte_protocol.py +11 -5
- airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/abstract_source.py +63 -17
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
- airbyte_cdk/sources/connector_state_manager.py +32 -10
- airbyte_cdk/sources/declarative/async_job/job.py +3 -1
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
- airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
- airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/token.py +25 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
- airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
- airbyte_cdk/sources/declarative/declarative_source.py +3 -1
- airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
- airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
- airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
- airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
- airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +656 -678
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +782 -232
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
- airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
- airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
- airbyte_cdk/sources/declarative/spec/spec.py +8 -2
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
- airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
- airbyte_cdk/sources/declarative/types.py +8 -1
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
- airbyte_cdk/sources/embedded/base_integration.py +14 -4
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +3 -1
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
- airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
- airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +13 -15
- airbyte_cdk/sources/file_based/file_based_source.py +82 -24
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
- airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
- airbyte_cdk/sources/http_logger.py +5 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +17 -7
- airbyte_cdk/sources/streams/availability_strategy.py +9 -3
- airbyte_cdk/sources/streams/call_rate.py +63 -19
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
- airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
- airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
- airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
- airbyte_cdk/sources/streams/core.py +77 -22
- airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
- airbyte_cdk/sources/streams/http/exceptions.py +2 -2
- airbyte_cdk/sources/streams/http/http.py +133 -33
- airbyte_cdk/sources/streams/http/http_client.py +91 -29
- airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +5 -1
- airbyte_cdk/sources/utils/record_helper.py +12 -3
- airbyte_cdk/sources/utils/schema_helpers.py +9 -3
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +24 -9
- airbyte_cdk/sql/exceptions.py +19 -6
- airbyte_cdk/sql/secrets.py +3 -1
- airbyte_cdk/sql/shared/catalog_providers.py +13 -4
- airbyte_cdk/sql/shared/sql_processor.py +44 -14
- airbyte_cdk/test/catalog_builder.py +19 -8
- airbyte_cdk/test/entrypoint_wrapper.py +27 -8
- airbyte_cdk/test/mock_http/mocker.py +41 -11
- airbyte_cdk/test/mock_http/request.py +9 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +29 -7
- airbyte_cdk/test/state_builder.py +10 -2
- airbyte_cdk/test/utils/data.py +6 -2
- airbyte_cdk/test/utils/http_mocking.py +3 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +11 -4
- airbyte_cdk/utils/print_buffer.py +6 -1
- airbyte_cdk/utils/schema_inferrer.py +30 -9
- airbyte_cdk/utils/spec_schema_transformations.py +3 -1
- airbyte_cdk/utils/traced_exception.py +35 -9
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/METADATA +7 -6
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/RECORD +198 -198
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/declarative/extractors/record_selector.py

```diff
@@ -61,7 +61,9 @@ class RecordSelector(HttpSelector):
         :return: List of Records selected from the response
         """
         all_data: Iterable[Mapping[str, Any]] = self.extractor.extract_records(response)
-        yield from self.filter_and_transform(all_data, stream_state, records_schema, stream_slice, next_page_token)
+        yield from self.filter_and_transform(
+            all_data, stream_state, records_schema, stream_slice, next_page_token
+        )
 
     def filter_and_transform(
         self,
@@ -106,7 +108,10 @@ class RecordSelector(HttpSelector):
     ) -> Iterable[Mapping[str, Any]]:
         if self.record_filter:
             yield from self.record_filter.filter_records(
-                records, stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token
+                records,
+                stream_state=stream_state,
+                stream_slice=stream_slice,
+                next_page_token=next_page_token,
             )
         else:
             yield from records
@@ -119,5 +124,7 @@ class RecordSelector(HttpSelector):
     ) -> Iterable[Mapping[str, Any]]:
         for record in records:
             for transformation in self.transformations:
-                transformation.transform(record, config=self.config, stream_state=stream_state, stream_slice=stream_slice)  # type: ignore # record has type Mapping[str, Any], but Dict[str, Any] expected
+                transformation.transform(
+                    record, config=self.config, stream_state=stream_state, stream_slice=stream_slice
+                )  # type: ignore # record has type Mapping[str, Any], but Dict[str, Any] expected
             yield record
```
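Every change in this file is a line-length re-wrap; behavior is unchanged. For orientation, the selector's flow is: extract records from the response, filter them, then apply each transformation in place. Below is a minimal standalone sketch of that pipeline; the names and signatures are illustrative, not the CDK API.

```python
# Illustrative sketch of the extract -> filter -> transform flow; not the CDK API.
from typing import Any, Callable, Iterable, Mapping

Record = Mapping[str, Any]


def select_records(
    extract: Callable[[], Iterable[Record]],
    record_filter: Callable[[Record], bool],
    transformations: list[Callable[[dict[str, Any]], None]],
) -> Iterable[Record]:
    """Yield extracted records that pass the filter, after applying in-place transforms."""
    for record in extract():
        if not record_filter(record):
            continue
        mutable = dict(record)
        for transform in transformations:
            transform(mutable)  # transformations mutate the record in place, as in the CDK
        yield mutable


if __name__ == "__main__":
    rows = [{"id": 1, "status": "ok"}, {"id": 2, "status": "skip"}]
    out = list(
        select_records(
            extract=lambda: iter(rows),
            record_filter=lambda r: r["status"] == "ok",
            transformations=[lambda r: r.update(source="demo")],
        )
    )
    print(out)  # [{'id': 1, 'status': 'ok', 'source': 'demo'}]
```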
airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py

```diff
@@ -68,7 +68,9 @@ class ResponseToFileExtractor(RecordExtractor):
 
         res = b.replace(b"\x00", b"")
         if len(res) < len(b):
-            self.logger.warning("Filter 'null' bytes from string, size reduced %d -> %d chars", len(b), len(res))
+            self.logger.warning(
+                "Filter 'null' bytes from string, size reduced %d -> %d chars", len(b), len(res)
+            )
         return res
 
     def _save_to_file(self, response: requests.Response) -> Tuple[str, str]:
@@ -106,9 +108,13 @@ class ResponseToFileExtractor(RecordExtractor):
         if os.path.isfile(tmp_file):
             return tmp_file, response_encoding
         else:
-            raise ValueError(f"The IO/Error occured while verifying binary data. Tmp file {tmp_file} doesn't exist.")
+            raise ValueError(
+                f"The IO/Error occured while verifying binary data. Tmp file {tmp_file} doesn't exist."
+            )
 
-    def _read_with_chunks(self, path: str, file_encoding: str, chunk_size: int = 100) -> Iterable[Mapping[str, Any]]:
+    def _read_with_chunks(
+        self, path: str, file_encoding: str, chunk_size: int = 100
+    ) -> Iterable[Mapping[str, Any]]:
         """
         Reads data from a file in chunks and yields each row as a dictionary.
 
@@ -126,7 +132,9 @@ class ResponseToFileExtractor(RecordExtractor):
 
         try:
             with open(path, "r", encoding=file_encoding) as data:
-                chunks = pd.read_csv(data, chunksize=chunk_size, iterator=True, dialect="unix", dtype=object)
+                chunks = pd.read_csv(
+                    data, chunksize=chunk_size, iterator=True, dialect="unix", dtype=object
+                )
                 for chunk in chunks:
                     chunk = chunk.replace({nan: None}).to_dict(orient="records")
                     for row in chunk:
@@ -140,7 +148,9 @@ class ResponseToFileExtractor(RecordExtractor):
         # remove binary tmp file, after data is read
         os.remove(path)
 
-    def extract_records(self, response: Optional[requests.Response] = None) -> Iterable[Mapping[str, Any]]:
+    def extract_records(
+        self, response: Optional[requests.Response] = None
+    ) -> Iterable[Mapping[str, Any]]:
         """
         Extracts records from the given response by:
         1) Saving the result to a tmp file
```
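The `_read_with_chunks` hunk shows the memory-bounding pattern this extractor uses: pandas reads the temporary file in fixed-size chunks and each row is yielded as a dict, with NaN normalized to None. A runnable sketch of the same pattern, assuming a small CSV written to a temp file:

```python
# Sketch of the chunked-CSV read pattern from the hunk above; file path and
# chunk size are illustrative.
import tempfile
from typing import Any, Iterable, Mapping

import pandas as pd
from numpy import nan


def read_with_chunks(path: str, encoding: str, chunk_size: int = 100) -> Iterable[Mapping[str, Any]]:
    with open(path, "r", encoding=encoding) as data:
        # dtype=object keeps every cell as a raw string; NaN marks missing cells
        chunks = pd.read_csv(data, chunksize=chunk_size, iterator=True, dialect="unix", dtype=object)
        for chunk in chunks:
            for row in chunk.replace({nan: None}).to_dict(orient="records"):
                yield row


if __name__ == "__main__":
    with tempfile.NamedTemporaryFile("w", suffix=".csv", delete=False) as f:
        f.write("id,name\n1,foo\n2,\n")
        path = f.name
    print(list(read_with_chunks(path, "utf-8", chunk_size=1)))
    # [{'id': '1', 'name': 'foo'}, {'id': '2', 'name': None}]
```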
airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py

```diff
@@ -13,7 +13,10 @@ from airbyte_cdk.sources.declarative.datetime.min_max_datetime import MinMaxDate
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
 from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString
 from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation
-from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType
+from airbyte_cdk.sources.declarative.requesters.request_option import (
+    RequestOption,
+    RequestOptionType,
+)
 from airbyte_cdk.sources.message import MessageRepository
 from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState
 from isodate import Duration, duration_isoformat, parse_duration
@@ -72,27 +75,41 @@ class DatetimeBasedCursor(DeclarativeCursor):
     cursor_datetime_formats: List[str] = field(default_factory=lambda: [])
 
     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
-        if (self.step and not self.cursor_granularity) or (not self.step and self.cursor_granularity):
+        if (self.step and not self.cursor_granularity) or (
+            not self.step and self.cursor_granularity
+        ):
             raise ValueError(
                 f"If step is defined, cursor_granularity should be as well and vice-versa. "
                 f"Right now, step is `{self.step}` and cursor_granularity is `{self.cursor_granularity}`"
             )
         self._start_datetime = MinMaxDatetime.create(self.start_datetime, parameters)
-        self._end_datetime = None if not self.end_datetime else MinMaxDatetime.create(self.end_datetime, parameters)
+        self._end_datetime = (
+            None if not self.end_datetime else MinMaxDatetime.create(self.end_datetime, parameters)
+        )
 
         self._timezone = datetime.timezone.utc
         self._interpolation = JinjaInterpolation()
 
         self._step = (
-            self._parse_timedelta(InterpolatedString.create(self.step, parameters=parameters).eval(self.config))
+            self._parse_timedelta(
+                InterpolatedString.create(self.step, parameters=parameters).eval(self.config)
+            )
             if self.step
             else datetime.timedelta.max
         )
         self._cursor_granularity = self._parse_timedelta(self.cursor_granularity)
         self.cursor_field = InterpolatedString.create(self.cursor_field, parameters=parameters)
-        self._lookback_window = InterpolatedString.create(self.lookback_window, parameters=parameters) if self.lookback_window else None
-        self._partition_field_start = InterpolatedString.create(self.partition_field_start or "start_time", parameters=parameters)
-        self._partition_field_end = InterpolatedString.create(self.partition_field_end or "end_time", parameters=parameters)
+        self._lookback_window = (
+            InterpolatedString.create(self.lookback_window, parameters=parameters)
+            if self.lookback_window
+            else None
+        )
+        self._partition_field_start = InterpolatedString.create(
+            self.partition_field_start or "start_time", parameters=parameters
+        )
+        self._partition_field_end = InterpolatedString.create(
+            self.partition_field_end or "end_time", parameters=parameters
+        )
         self._parser = DatetimeParser()
 
         # If datetime format is not specified then start/end datetime should inherit it from the stream slicer
@@ -114,7 +131,9 @@ class DatetimeBasedCursor(DeclarativeCursor):
 
         :param stream_state: The state of the stream as returned by get_stream_state
         """
-        self._cursor = stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
+        self._cursor = (
+            stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None
+        )  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
 
     def observe(self, stream_slice: StreamSlice, record: Record) -> None:
         """
@@ -131,28 +150,38 @@ class DatetimeBasedCursor(DeclarativeCursor):
 
         start_field = self._partition_field_start.eval(self.config)
         end_field = self._partition_field_end.eval(self.config)
-        is_highest_observed_cursor_value = not self._highest_observed_cursor_field_value or self.parse_date(
-            record_cursor_value
-        ) > self.parse_date(self._highest_observed_cursor_field_value)
+        is_highest_observed_cursor_value = (
+            not self._highest_observed_cursor_field_value
+            or self.parse_date(record_cursor_value)
+            > self.parse_date(self._highest_observed_cursor_field_value)
+        )
         if (
-            self._is_within_daterange_boundaries(record, stream_slice.get(start_field), stream_slice.get(end_field))  # type: ignore # we know that stream_slices for these cursors will use a string representing an unparsed date
+            self._is_within_daterange_boundaries(
+                record, stream_slice.get(start_field), stream_slice.get(end_field)
+            )  # type: ignore # we know that stream_slices for these cursors will use a string representing an unparsed date
             and is_highest_observed_cursor_value
         ):
             self._highest_observed_cursor_field_value = record_cursor_value
 
     def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
         if stream_slice.partition:
-            raise ValueError(f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}.")
+            raise ValueError(
+                f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}."
+            )
         cursor_value_str_by_cursor_value_datetime = dict(
             map(
                 # we need to ensure the cursor value is preserved as is in the state else the CATs might complain of something like
                 # 2023-01-04T17:30:19.000Z' <= '2023-01-04T17:30:19.000000Z'
                 lambda datetime_str: (self.parse_date(datetime_str), datetime_str),  # type: ignore # because of the filter on the next line, this will only be called with a str
-                filter(lambda item: item, [self._cursor, self._highest_observed_cursor_field_value]),
+                filter(
+                    lambda item: item, [self._cursor, self._highest_observed_cursor_field_value]
+                ),
             )
         )
         self._cursor = (
-            cursor_value_str_by_cursor_value_datetime[max(cursor_value_str_by_cursor_value_datetime.keys())]
+            cursor_value_str_by_cursor_value_datetime[
+                max(cursor_value_str_by_cursor_value_datetime.keys())
+            ]
             if cursor_value_str_by_cursor_value_datetime
             else None
         )
@@ -175,11 +204,19 @@ class DatetimeBasedCursor(DeclarativeCursor):
         # through each slice and does not belong to a specific slice. We just return stream state as it is.
         return self.get_stream_state()
 
-    def _calculate_earliest_possible_value(self, end_datetime: datetime.datetime) -> datetime.datetime:
-        lookback_delta = self._parse_timedelta(self._lookback_window.eval(self.config) if self._lookback_window else "P0D")
-        earliest_possible_start_datetime = min(self._start_datetime.get_datetime(self.config), end_datetime)
+    def _calculate_earliest_possible_value(
+        self, end_datetime: datetime.datetime
+    ) -> datetime.datetime:
+        lookback_delta = self._parse_timedelta(
+            self._lookback_window.eval(self.config) if self._lookback_window else "P0D"
+        )
+        earliest_possible_start_datetime = min(
+            self._start_datetime.get_datetime(self.config), end_datetime
+        )
         try:
-            cursor_datetime = self._calculate_cursor_datetime_from_state(self.get_stream_state()) - lookback_delta
+            cursor_datetime = (
+                self._calculate_cursor_datetime_from_state(self.get_stream_state()) - lookback_delta
+            )
         except OverflowError:
             # cursor_datetime defers to the minimum date if it does not exist in the state. Trying to subtract
             # a timedelta from the minimum datetime results in an OverflowError
@@ -200,7 +237,9 @@ class DatetimeBasedCursor(DeclarativeCursor):
             return now
         return min(self._end_datetime.get_datetime(self.config), now)
 
-    def _calculate_cursor_datetime_from_state(self, stream_state: Mapping[str, Any]) -> datetime.datetime:
+    def _calculate_cursor_datetime_from_state(
+        self, stream_state: Mapping[str, Any]
+    ) -> datetime.datetime:
         if self.cursor_field.eval(self.config, stream_state=stream_state) in stream_state:  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
             return self.parse_date(stream_state[self.cursor_field.eval(self.config)])  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
         return datetime.datetime.min.replace(tzinfo=datetime.timezone.utc)
@@ -209,7 +248,10 @@ class DatetimeBasedCursor(DeclarativeCursor):
         return self._parser.format(dt, self.datetime_format)
 
     def _partition_daterange(
-        self, start: datetime.datetime, end: datetime.datetime, step: Union[datetime.timedelta, Duration]
+        self,
+        start: datetime.datetime,
+        end: datetime.datetime,
+        step: Union[datetime.timedelta, Duration],
     ) -> List[StreamSlice]:
         start_field = self._partition_field_start.eval(self.config)
         end_field = self._partition_field_end.eval(self.config)
@@ -220,7 +262,11 @@ class DatetimeBasedCursor(DeclarativeCursor):
             end_date = self._get_date(next_start - self._cursor_granularity, end, min)
             dates.append(
                 StreamSlice(
-                    partition={}, cursor_slice={start_field: self._format_datetime(start), end_field: self._format_datetime(end_date)},
+                    partition={},
+                    cursor_slice={
+                        start_field: self._format_datetime(start),
+                        end_field: self._format_datetime(end_date),
+                    },
                 )
             )
             start = next_start
@@ -231,7 +277,9 @@ class DatetimeBasedCursor(DeclarativeCursor):
             return start < end
         return start <= end
 
-    def _evaluate_next_start_date_safely(self, start: datetime.datetime, step: datetime.timedelta) -> datetime.datetime:
+    def _evaluate_next_start_date_safely(
+        self, start: datetime.datetime, step: datetime.timedelta
+    ) -> datetime.datetime:
         """
         Given that we set the default step at datetime.timedelta.max, we will generate an OverflowError when evaluating the next start_date
         This method assumes that users would never enter a step that would generate an overflow. Given that would be the case, the code
@@ -308,7 +356,9 @@ class DatetimeBasedCursor(DeclarativeCursor):
         # Never update kwargs
         return {}
 
-    def _get_request_options(self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice]) -> Mapping[str, Any]:
+    def _get_request_options(
+        self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice]
+    ) -> Mapping[str, Any]:
         options: MutableMapping[str, Any] = {}
         if not stream_slice:
             return options
@@ -317,7 +367,9 @@ class DatetimeBasedCursor(DeclarativeCursor):
                 self._partition_field_start.eval(self.config)
             )
         if self.end_time_option and self.end_time_option.inject_into == option_type:
-            options[self.end_time_option.field_name.eval(config=self.config)] = stream_slice.get(self._partition_field_end.eval(self.config))  # type: ignore # field_name is always casted to an interpolated string
+            options[self.end_time_option.field_name.eval(config=self.config)] = stream_slice.get(
+                self._partition_field_end.eval(self.config)
+            )  # type: ignore # field_name is always casted to an interpolated string
         return options
 
     def should_be_synced(self, record: Record) -> bool:
@@ -330,11 +382,18 @@ class DatetimeBasedCursor(DeclarativeCursor):
             )
             return True
         latest_possible_cursor_value = self.select_best_end_datetime()
-        earliest_possible_cursor_value = self._calculate_earliest_possible_value(latest_possible_cursor_value)
-        return self._is_within_daterange_boundaries(record, earliest_possible_cursor_value, latest_possible_cursor_value)
+        earliest_possible_cursor_value = self._calculate_earliest_possible_value(
+            latest_possible_cursor_value
+        )
+        return self._is_within_daterange_boundaries(
+            record, earliest_possible_cursor_value, latest_possible_cursor_value
+        )
 
     def _is_within_daterange_boundaries(
-        self, record: Record, start_datetime_boundary: Union[datetime.datetime, str], end_datetime_boundary: Union[datetime.datetime, str]
+        self,
+        record: Record,
+        start_datetime_boundary: Union[datetime.datetime, str],
+        end_datetime_boundary: Union[datetime.datetime, str],
     ) -> bool:
         cursor_field = self.cursor_field.eval(self.config)  # type: ignore # cursor_field is converted to an InterpolatedString in __post_init__
         record_cursor_value = record.get(cursor_field)
@@ -348,7 +407,9 @@ class DatetimeBasedCursor(DeclarativeCursor):
             start_datetime_boundary = self.parse_date(start_datetime_boundary)
         if isinstance(end_datetime_boundary, str):
             end_datetime_boundary = self.parse_date(end_datetime_boundary)
-        return start_datetime_boundary <= self.parse_date(record_cursor_value) <= end_datetime_boundary
+        return (
+            start_datetime_boundary <= self.parse_date(record_cursor_value) <= end_datetime_boundary
+        )
 
     def _send_log(self, level: Level, message: str) -> None:
         if self.message_repository:
@@ -378,8 +439,12 @@ class DatetimeBasedCursor(DeclarativeCursor):
         :param lookback_window_in_seconds: The lookback duration in seconds to potentially update to.
         """
         runtime_lookback_window = duration_isoformat(timedelta(seconds=lookback_window_in_seconds))
-        config_lookback = parse_duration(self._lookback_window.eval(self.config) if self._lookback_window else "P0D")
+        config_lookback = parse_duration(
+            self._lookback_window.eval(self.config) if self._lookback_window else "P0D"
+        )
 
         # Check if the new runtime lookback window is greater than the current config lookback
         if parse_duration(runtime_lookback_window) > config_lookback:
-            self._lookback_window = InterpolatedString.create(runtime_lookback_window, parameters={})
+            self._lookback_window = InterpolatedString.create(
+                runtime_lookback_window, parameters={}
+            )
```
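Most of these hunks re-wrap long statements; `_partition_daterange` is the algorithmic core: walk from the start datetime to the end in `step` increments, ending each slice one cursor granularity before the next one starts so windows do not overlap. A standalone sketch of that slicing, reduced to plain datetimes (the real cursor additionally formats values and evaluates interpolated fields):

```python
# Sketch of date-window slicing as in _partition_daterange; simplified, not the CDK API.
import datetime
from typing import Iterator, Tuple


def partition_daterange(
    start: datetime.datetime,
    end: datetime.datetime,
    step: datetime.timedelta,
    granularity: datetime.timedelta,
) -> Iterator[Tuple[datetime.datetime, datetime.datetime]]:
    while start <= end:
        next_start = start + step
        # end the slice one granularity before the next start, capped at the range end
        slice_end = min(next_start - granularity, end)
        yield start, slice_end
        start = next_start


if __name__ == "__main__":
    tz = datetime.timezone.utc
    for s, e in partition_daterange(
        datetime.datetime(2024, 1, 1, tzinfo=tz),
        datetime.datetime(2024, 1, 10, tzinfo=tz),
        step=datetime.timedelta(days=3),
        granularity=datetime.timedelta(days=1),
    ):
        print(s.date(), "->", e.date())
    # 2024-01-01 -> 2024-01-03
    # 2024-01-04 -> 2024-01-06
    # 2024-01-07 -> 2024-01-09
    # 2024-01-10 -> 2024-01-10
```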
airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py

```diff
@@ -84,7 +84,9 @@ class GlobalSubstreamCursor(DeclarativeCursor):
         self._partition_router = partition_router
         self._timer = Timer()
         self._lock = threading.Lock()
-        self._slice_semaphore = threading.Semaphore(0)  # Start with 0, indicating no slices being tracked
+        self._slice_semaphore = threading.Semaphore(
+            0
+        )  # Start with 0, indicating no slices being tracked
         self._all_slices_yielded = False
         self._lookback_window: Optional[int] = None
         self._current_partition: Optional[Mapping[str, Any]] = None
@@ -116,7 +118,9 @@ class GlobalSubstreamCursor(DeclarativeCursor):
         )
 
         self.start_slices_generation()
-        for slice, last, state in iterate_with_last_flag_and_state(slice_generator, self._partition_router.get_stream_state):
+        for slice, last, state in iterate_with_last_flag_and_state(
+            slice_generator, self._partition_router.get_stream_state
+        ):
             self._parent_state = state
             self.register_slice(last)
             yield slice
@@ -124,7 +128,8 @@ class GlobalSubstreamCursor(DeclarativeCursor):
 
     def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
         slice_generator = (
-            StreamSlice(partition=partition, cursor_slice=cursor_slice) for cursor_slice in self._stream_cursor.stream_slices()
+            StreamSlice(partition=partition, cursor_slice=cursor_slice)
+            for cursor_slice in self._stream_cursor.stream_slices()
         )
 
         yield from slice_generator
@@ -199,10 +204,14 @@ class GlobalSubstreamCursor(DeclarativeCursor):
         if hasattr(self._stream_cursor, "set_runtime_lookback_window"):
             self._stream_cursor.set_runtime_lookback_window(lookback_window)
         else:
-            raise ValueError("The cursor class for Global Substream Cursor does not have a set_runtime_lookback_window method")
+            raise ValueError(
+                "The cursor class for Global Substream Cursor does not have a set_runtime_lookback_window method"
+            )
 
     def observe(self, stream_slice: StreamSlice, record: Record) -> None:
-        self._stream_cursor.observe(StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record)
+        self._stream_cursor.observe(
+            StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), record
+        )
 
     def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
         """
@@ -220,7 +229,9 @@ class GlobalSubstreamCursor(DeclarativeCursor):
             self._slice_semaphore.acquire()
             if self._all_slices_yielded and self._slice_semaphore._value == 0:
                 self._lookback_window = self._timer.finish()
-                self._stream_cursor.close_slice(StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args)
+                self._stream_cursor.close_slice(
+                    StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args
+                )
 
     def get_stream_state(self) -> StreamState:
         state: dict[str, Any] = {"state": self._stream_cursor.get_stream_state()}
@@ -322,12 +333,15 @@ class GlobalSubstreamCursor(DeclarativeCursor):
 
     def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
         return self._stream_cursor.is_greater_than_or_equal(
-            self._convert_record_to_cursor_record(first), self._convert_record_to_cursor_record(second)
+            self._convert_record_to_cursor_record(first),
+            self._convert_record_to_cursor_record(second),
         )
 
     @staticmethod
     def _convert_record_to_cursor_record(record: Record) -> Record:
         return Record(
             record.data,
-            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice) if record.associated_slice else None,
+            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
+            if record.associated_slice
+            else None,
         )
```
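The semaphore bookkeeping visible in these hunks is the heart of `GlobalSubstreamCursor`: the producer releases one permit per emitted slice, the consumer acquires one per closed slice, and global state is finalized only when the last permit is consumed after all slices were yielded. A minimal sketch of that protocol; class and method names here are illustrative:

```python
# Sketch of the slice-counting semaphore pattern; SliceTracker is hypothetical.
import threading


class SliceTracker:
    def __init__(self) -> None:
        self._slice_semaphore = threading.Semaphore(0)  # 0 slices tracked so far
        self._all_slices_yielded = False

    def register_slice(self, is_last: bool) -> None:
        self._slice_semaphore.release()  # one permit per slice handed out
        if is_last:
            self._all_slices_yielded = True

    def close_slice(self) -> bool:
        self._slice_semaphore.acquire()
        # True only once the last permit is consumed, mirroring the CDK's
        # private-attribute check `self._slice_semaphore._value == 0` above
        return self._all_slices_yielded and self._slice_semaphore._value == 0


tracker = SliceTracker()
for i in range(3):
    tracker.register_slice(is_last=(i == 2))
print([tracker.close_slice() for _ in range(3)])  # [False, False, True]
```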
airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py

```diff
@@ -8,7 +8,9 @@ from typing import Any, Callable, Iterable, Mapping, Optional, Union
 
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
 from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
-from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import PerPartitionKeySerializer
+from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
+    PerPartitionKeySerializer,
+)
 from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
 
 logger = logging.getLogger("airbyte")
@@ -67,12 +69,18 @@ class PerPartitionCursor(DeclarativeCursor):
 
         cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
         if not cursor:
-            partition_state = self._state_to_migrate_from if self._state_to_migrate_from else self._NO_CURSOR_STATE
+            partition_state = (
+                self._state_to_migrate_from
+                if self._state_to_migrate_from
+                else self._NO_CURSOR_STATE
+            )
             cursor = self._create_cursor(partition_state)
             self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
 
         for cursor_slice in cursor.stream_slices():
-            yield StreamSlice(partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields)
+            yield StreamSlice(
+                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
+            )
 
     def _ensure_partition_limit(self) -> None:
         """
@@ -80,7 +88,9 @@ class PerPartitionCursor(DeclarativeCursor):
         """
         while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
             self._over_limit += 1
-            oldest_partition = self._cursor_per_partition.popitem(last=False)[0]  # Remove the oldest partition
+            oldest_partition = self._cursor_per_partition.popitem(last=False)[
+                0
+            ]  # Remove the oldest partition
             logger.warning(
                 f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
             )
@@ -128,7 +138,9 @@ class PerPartitionCursor(DeclarativeCursor):
 
         else:
             for state in stream_state["states"]:
-                self._cursor_per_partition[self._to_partition_key(state["partition"])] = self._create_cursor(state["cursor"])
+                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
+                    self._create_cursor(state["cursor"])
+                )
 
         # set default state for missing partitions if it is per partition with fallback to global
         if "state" in stream_state:
@@ -214,7 +226,9 @@ class PerPartitionCursor(DeclarativeCursor):
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                 next_page_token=next_page_token,
-            ) | self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].get_request_params(
+            ) | self._cursor_per_partition[
+                self._to_partition_key(stream_slice.partition)
+            ].get_request_params(
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                 next_page_token=next_page_token,
@@ -234,7 +248,9 @@ class PerPartitionCursor(DeclarativeCursor):
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                 next_page_token=next_page_token,
-            ) | self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].get_request_headers(
+            ) | self._cursor_per_partition[
+                self._to_partition_key(stream_slice.partition)
+            ].get_request_headers(
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                 next_page_token=next_page_token,
@@ -254,7 +270,9 @@ class PerPartitionCursor(DeclarativeCursor):
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                 next_page_token=next_page_token,
-            ) | self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].get_request_body_data(
+            ) | self._cursor_per_partition[
+                self._to_partition_key(stream_slice.partition)
+            ].get_request_body_data(
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                 next_page_token=next_page_token,
@@ -274,7 +292,9 @@ class PerPartitionCursor(DeclarativeCursor):
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
                 next_page_token=next_page_token,
-            ) | self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].get_request_body_json(
+            ) | self._cursor_per_partition[
+                self._to_partition_key(stream_slice.partition)
+            ].get_request_body_json(
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
                 next_page_token=next_page_token,
@@ -283,32 +303,43 @@ class PerPartitionCursor(DeclarativeCursor):
             raise ValueError("A partition needs to be provided in order to get request body json")
 
     def should_be_synced(self, record: Record) -> bool:
-        return self._get_cursor(record).should_be_synced(self._convert_record_to_cursor_record(record))
+        return self._get_cursor(record).should_be_synced(
+            self._convert_record_to_cursor_record(record)
+        )
 
     def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
         if not first.associated_slice or not second.associated_slice:
-            raise ValueError(f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}")
+            raise ValueError(
+                f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
+            )
         if first.associated_slice.partition != second.associated_slice.partition:
             raise ValueError(
                 f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
             )
 
         return self._get_cursor(first).is_greater_than_or_equal(
-            self._convert_record_to_cursor_record(first), self._convert_record_to_cursor_record(second)
+            self._convert_record_to_cursor_record(first),
+            self._convert_record_to_cursor_record(second),
         )
 
     @staticmethod
     def _convert_record_to_cursor_record(record: Record) -> Record:
         return Record(
             record.data,
-            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice) if record.associated_slice else None,
+            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
+            if record.associated_slice
+            else None,
         )
 
     def _get_cursor(self, record: Record) -> DeclarativeCursor:
         if not record.associated_slice:
-            raise ValueError("Invalid state as stream slices that are emitted should refer to an existing cursor")
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
         partition_key = self._to_partition_key(record.associated_slice.partition)
         if partition_key not in self._cursor_per_partition:
-            raise ValueError("Invalid state as stream slices that are emitted should refer to an existing cursor")
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
         cursor = self._cursor_per_partition[partition_key]
         return cursor
```
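`PerPartitionCursor` keys an ordered mapping of cursors by a serialized partition and, per the `_ensure_partition_limit` hunk, evicts the oldest entry with `popitem(last=False)` once a maximum is reached. A standalone sketch of that bookkeeping, with a JSON serializer standing in for `PerPartitionKeySerializer` and an illustrative limit:

```python
# Sketch of per-partition cursor bookkeeping; the serializer, limit, and state
# values here are illustrative stand-ins for the CDK's internals.
import json
from collections import OrderedDict
from typing import Any, Mapping

MAX_PARTITIONS = 3  # hypothetical; the CDK uses DEFAULT_MAX_PARTITIONS_NUMBER


def to_partition_key(partition: Mapping[str, Any]) -> str:
    # sort_keys yields the same key regardless of dict insertion order
    return json.dumps(partition, sort_keys=True)


cursor_per_partition: "OrderedDict[str, Any]" = OrderedDict()

for pid in range(5):
    key = to_partition_key({"parent_id": pid})
    cursor_per_partition[key] = {"updated_at": f"2024-01-0{pid + 1}"}
    while len(cursor_per_partition) > MAX_PARTITIONS:
        oldest_partition = cursor_per_partition.popitem(last=False)[0]  # drop oldest
        print("evicted:", oldest_partition)

print(list(cursor_per_partition))  # the three most recent partition keys
```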
airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py

```diff
@@ -5,8 +5,14 @@ from typing import Any, Iterable, Mapping, MutableMapping, Optional, Union
 
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
-from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import GlobalSubstreamCursor, iterate_with_last_flag_and_state
-from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import CursorFactory, PerPartitionCursor
+from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
+    GlobalSubstreamCursor,
+    iterate_with_last_flag_and_state,
+)
+from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import (
+    CursorFactory,
+    PerPartitionCursor,
+)
 from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
 from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
 
@@ -60,7 +66,12 @@ class PerPartitionWithGlobalCursor(DeclarativeCursor):
     Suitable for streams where the number of partitions may vary significantly, requiring dynamic switching between per-partition and global state management to ensure data consistency and efficient synchronization.
     """
 
-    def __init__(self, cursor_factory: CursorFactory, partition_router: PartitionRouter, stream_cursor: DatetimeBasedCursor):
+    def __init__(
+        self,
+        cursor_factory: CursorFactory,
+        partition_router: PartitionRouter,
+        stream_cursor: DatetimeBasedCursor,
+    ):
         self._partition_router = partition_router
         self._per_partition_cursor = PerPartitionCursor(cursor_factory, partition_router)
         self._global_cursor = GlobalSubstreamCursor(stream_cursor, partition_router)
@@ -82,7 +93,8 @@ class PerPartitionWithGlobalCursor(DeclarativeCursor):
             # Generate slices for the current cursor and handle the last slice using the flag
             self._parent_state = parent_state
             for slice, is_last_slice, _ in iterate_with_last_flag_and_state(
-                self._get_active_cursor().generate_slices_from_partition(partition=partition), lambda: None
+                self._get_active_cursor().generate_slices_from_partition(partition=partition),
+                lambda: None,
             ):
                 self._global_cursor.register_slice(is_last_slice and is_last_partition)
                 yield slice
@@ -182,7 +194,9 @@ class PerPartitionWithGlobalCursor(DeclarativeCursor):
         )
 
     def should_be_synced(self, record: Record) -> bool:
-        return self._global_cursor.should_be_synced(record) or self._per_partition_cursor.should_be_synced(record)
+        return self._global_cursor.should_be_synced(
+            record
+        ) or self._per_partition_cursor.should_be_synced(record)
 
     def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
         return self._global_cursor.is_greater_than_or_equal(first, second)
```
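Per its docstring above, `PerPartitionWithGlobalCursor` switches dynamically between per-partition and global state management. A minimal sketch of that switching idea, with a hypothetical partition limit and flag (the real class delegates to `PerPartitionCursor` and `GlobalSubstreamCursor`):

```python
# Hypothetical sketch of switching from per-partition to global state once the
# partition count grows too large; limit and class name are illustrative.
class DualStateCursor:
    def __init__(self, max_partitions: int = 2) -> None:
        self._max_partitions = max_partitions
        self._partitions: set = set()
        self._use_global_cursor = False

    def observe_partition(self, partition_key: str) -> None:
        self._partitions.add(partition_key)
        if len(self._partitions) > self._max_partitions:
            self._use_global_cursor = True  # too many partitions: fall back to global

    def active_cursor(self) -> str:
        return "global" if self._use_global_cursor else "per-partition"


cursor = DualStateCursor()
for key in ("a", "b", "c"):
    cursor.observe_partition(key)
print(cursor.active_cursor())  # global
```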
airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py

```diff
@@ -30,7 +30,9 @@ class ResumableFullRefreshCursor(DeclarativeCursor):
     def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
         # The ResumableFullRefreshCursor doesn't support nested streams yet so receiving a partition is unexpected
         if stream_slice.partition:
-            raise ValueError(f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}.")
+            raise ValueError(
+                f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}."
+            )
         self._cursor = stream_slice.cursor_slice
 
     def should_be_synced(self, record: Record) -> bool:
```
airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py

```diff
@@ -8,7 +8,21 @@ from typing import Any, Final, List, Mapping
 from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation
 from airbyte_cdk.sources.types import Config
 
-FALSE_VALUES: Final[List[Any]] = ["False", "false", "{}", "[]", "()", "", "0", "0.0", {}, False, [], (), set()]
+FALSE_VALUES: Final[List[Any]] = [
+    "False",
+    "false",
+    "{}",
+    "[]",
+    "()",
+    "",
+    "0",
+    "0.0",
+    {},
+    False,
+    [],
+    (),
+    set(),
+]
 
 
 @dataclass
@@ -40,7 +54,11 @@ class InterpolatedBoolean:
             return self.condition
         else:
             evaluated = self._interpolation.eval(
-                self.condition, config, self._default, parameters=self._parameters, **additional_parameters
+                self.condition,
+                config,
+                self._default,
+                parameters=self._parameters,
+                **additional_parameters,
             )
             if evaluated in FALSE_VALUES:
                 return False
```
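For context on the `FALSE_VALUES` hunk: the interpolated condition is evaluated first, and the result is false only when it matches one of these falsy sentinels, so the strings "0" and "False" count as false even though they are truthy in plain Python. A standalone sketch of that check, reusing the list from the diff; the handling of any other value is this sketch's assumption:

```python
# Sketch of the FALSE_VALUES check; the list is copied from the hunk above,
# while treating every non-matching value as truthy is an assumption.
from typing import Any, Final, List

FALSE_VALUES: Final[List[Any]] = [
    "False", "false", "{}", "[]", "()", "", "0", "0.0",
    {}, False, [], (), set(),
]


def evaluate_boolean(evaluated: Any) -> bool:
    # list membership uses equality, so 0 also matches False here
    if evaluated in FALSE_VALUES:
        return False
    return True  # anything else is treated as truthy in this sketch


print(evaluate_boolean("false"))  # False
print(evaluate_boolean("yes"))    # True
```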