airbyte-cdk 6.8.1rc9__py3-none-any.whl → 6.8.2.dev1__py3-none-any.whl
This diff shows the contents of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- airbyte_cdk/cli/source_declarative_manifest/_run.py +11 -5
- airbyte_cdk/config_observation.py +1 -1
- airbyte_cdk/connector_builder/main.py +1 -1
- airbyte_cdk/connector_builder/message_grouper.py +10 -10
- airbyte_cdk/destinations/destination.py +1 -1
- airbyte_cdk/destinations/vector_db_based/embedder.py +2 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +12 -4
- airbyte_cdk/entrypoint.py +7 -6
- airbyte_cdk/logger.py +2 -2
- airbyte_cdk/sources/abstract_source.py +1 -1
- airbyte_cdk/sources/config.py +1 -1
- airbyte_cdk/sources/connector_state_manager.py +9 -4
- airbyte_cdk/sources/declarative/auth/oauth.py +1 -1
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +6 -1
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +76 -28
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +10 -4
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +16 -17
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +4 -1
- airbyte_cdk/sources/declarative/extractors/record_filter.py +3 -5
- airbyte_cdk/sources/declarative/incremental/__init__.py +3 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +270 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +8 -6
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +35 -36
- airbyte_cdk/sources/declarative/interpolation/macros.py +1 -1
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +71 -17
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +13 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +1 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +8 -6
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +1 -1
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +2 -2
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +1 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +5 -2
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -1
- airbyte_cdk/sources/declarative/spec/spec.py +1 -1
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +0 -1
- airbyte_cdk/sources/embedded/base_integration.py +3 -2
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +18 -7
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +14 -11
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +3 -3
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +11 -5
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +1 -1
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +2 -2
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +6 -3
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +1 -1
- airbyte_cdk/sources/http_logger.py +3 -3
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +5 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +6 -3
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +9 -3
- airbyte_cdk/sources/streams/concurrent/cursor.py +10 -1
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +2 -2
- airbyte_cdk/sources/streams/core.py +17 -14
- airbyte_cdk/sources/streams/http/http.py +19 -19
- airbyte_cdk/sources/streams/http/http_client.py +4 -48
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +62 -33
- airbyte_cdk/sources/utils/record_helper.py +1 -1
- airbyte_cdk/sources/utils/schema_helpers.py +1 -1
- airbyte_cdk/sources/utils/transform.py +34 -15
- airbyte_cdk/test/entrypoint_wrapper.py +11 -6
- airbyte_cdk/test/mock_http/response_builder.py +1 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +1 -1
- airbyte_cdk/utils/event_timing.py +10 -10
- airbyte_cdk/utils/message_utils.py +4 -3
- airbyte_cdk/utils/spec_schema_transformations.py +3 -2
- airbyte_cdk/utils/traced_exception.py +14 -12
- airbyte_cdk-6.8.2.dev1.dist-info/METADATA +111 -0
- {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/RECORD +72 -71
- airbyte_cdk-6.8.1rc9.dist-info/METADATA +0 -307
- {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.8.1rc9.dist-info → airbyte_cdk-6.8.2.dev1.dist-info}/entry_points.txt +0 -0
```diff
--- a/airbyte_cdk/sources/declarative/extractors/record_filter.py
+++ b/airbyte_cdk/sources/declarative/extractors/record_filter.py
@@ -59,13 +59,11 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
 
     def __init__(
         self,
-        date_time_based_cursor: DatetimeBasedCursor,
-        substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
+        cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor],
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
-        self._date_time_based_cursor = date_time_based_cursor
-        self._substream_cursor = substream_cursor
+        self._cursor = cursor
 
     def filter_records(
         self,
@@ -77,7 +75,7 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
         records = (
             record
             for record in records
-            if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
+            if self._cursor.should_be_synced(
                 # Record is created on the fly to align with cursors interface; stream name is ignored as we don't need it here
                 # Record stream name is empty cause it is not used durig the filtering
                 Record(data=record, associated_slice=stream_slice, stream_name="")
```
```diff
--- a/airbyte_cdk/sources/declarative/incremental/__init__.py
+++ b/airbyte_cdk/sources/declarative/incremental/__init__.py
@@ -2,6 +2,7 @@
 # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 #
 
+from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import ConcurrentCursorFactory, ConcurrentPerPartitionCursor
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
 from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import GlobalSubstreamCursor
@@ -14,6 +15,8 @@ from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor import (
 
 __all__ = [
     "CursorFactory",
+    "ConcurrentCursorFactory"
+    "ConcurrentPerPartitionCursor",
     "DatetimeBasedCursor",
     "DeclarativeCursor",
     "GlobalSubstreamCursor",
```
```diff
--- /dev/null
+++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py
@@ -0,0 +1,270 @@
+import copy
+
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+import logging
+from collections import OrderedDict
+from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
+
+from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
+from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
+from airbyte_cdk.sources.message import MessageRepository
+from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
+    PerPartitionKeySerializer,
+)
+from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField
+from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
+from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
+
+logger = logging.getLogger("airbyte")
+
+
+class ConcurrentCursorFactory:
+    def __init__(self, create_function: Callable[..., Cursor]):
+        self._create_function = create_function
+
+    def create(self, stream_state: Mapping[str, Any]) -> Cursor:
+        return self._create_function(stream_state=stream_state)
+
+
+class ConcurrentPerPartitionCursor(Cursor):
+    """
+    Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
+
+    **Partition Limitation and Limit Reached Logic**
+
+    - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
+    - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
+    - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.
+
+    The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.
+
+    - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
+    - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.
+
+    This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
+    """
+
+    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
+    _NO_STATE: Mapping[str, Any] = {}
+    _NO_CURSOR_STATE: Mapping[str, Any] = {}
+    _KEY = 0
+    _VALUE = 1
+    _state_to_migrate_from: Mapping[str, Any] = {}
+
+    def __init__(
+        self,
+        cursor_factory: ConcurrentCursorFactory,
+        partition_router: PartitionRouter,
+        stream_name: str,
+        stream_namespace: Optional[str],
+        stream_state: Any,
+        message_repository: MessageRepository,
+        connector_state_manager: ConnectorStateManager,
+        cursor_field: CursorField,
+    ) -> None:
+        self._stream_name = stream_name
+        self._stream_namespace = stream_namespace
+        self._message_repository = message_repository
+        self._connector_state_manager = connector_state_manager
+        self._cursor_field = cursor_field
+
+        self._cursor_factory = cursor_factory
+        self._partition_router = partition_router
+
+        # The dict is ordered to ensure that once the maximum number of partitions is reached,
+        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
+        self._cursor_per_partition: OrderedDict[str, Cursor] = OrderedDict()
+        self._over_limit = 0
+        self._partition_serializer = PerPartitionKeySerializer()
+
+        self._set_initial_state(stream_state)
+
+    @property
+    def cursor_field(self) -> CursorField:
+        return self._cursor_field
+
+    @property
+    def state(self) -> MutableMapping[str, Any]:
+        states = []
+        for partition_tuple, cursor in self._cursor_per_partition.items():
+            cursor_state = cursor._connector_state_converter.convert_to_state_message(
+                cursor._cursor_field, cursor.state
+            )
+            if cursor_state:
+                states.append(
+                    {
+                        "partition": self._to_dict(partition_tuple),
+                        "cursor": copy.deepcopy(cursor_state),
+                    }
+                )
+        state: dict[str, Any] = {"states": states}
+        return state
+
+    def close_partition(self, partition: Partition) -> None:
+        self._cursor_per_partition[self._to_partition_key(partition._stream_slice.partition)].close_partition_without_emit(partition=partition)
+
+    def ensure_at_least_one_state_emitted(self) -> None:
+        """
+        The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
+        called.
+        """
+        self._emit_state_message()
+
+    def _emit_state_message(self) -> None:
+        self._connector_state_manager.update_state_for_stream(
+            self._stream_name,
+            self._stream_namespace,
+            self.state,
+        )
+        state_message = self._connector_state_manager.create_state_message(
+            self._stream_name, self._stream_namespace
+        )
+        self._message_repository.emit_message(state_message)
+
+
+    def stream_slices(self) -> Iterable[StreamSlice]:
+        slices = self._partition_router.stream_slices()
+        for partition in slices:
+            yield from self.generate_slices_from_partition(partition)
+
+    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
+        # Ensure the maximum number of partitions is not exceeded
+        self._ensure_partition_limit()
+
+        cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
+        if not cursor:
+            partition_state = (
+                self._state_to_migrate_from
+                if self._state_to_migrate_from
+                else self._NO_CURSOR_STATE
+            )
+            cursor = self._create_cursor(partition_state)
+            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
+
+        for cursor_slice in cursor.stream_slices():
+            yield StreamSlice(
+                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
+            )
+
+    def _ensure_partition_limit(self) -> None:
+        """
+        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
+        """
+        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
+            self._over_limit += 1
+            oldest_partition = self._cursor_per_partition.popitem(last=False)[
+                0
+            ]  # Remove the oldest partition
+            logger.warning(
+                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
+            )
+
+    def limit_reached(self) -> bool:
+        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
+
+    def _set_initial_state(self, stream_state: StreamState) -> None:
+        """
+        Set the initial state for the cursors.
+
+        This method initializes the state for each partition cursor using the provided stream state.
+        If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
+
+        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
+        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
+
+        Args:
+            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
+                {
+                    "states": [
+                        {
+                            "partition": {
+                                "partition_key": "value"
+                            },
+                            "cursor": {
+                                "last_updated": "2023-05-27T00:00:00Z"
+                            }
+                        }
+                    ],
+                    "parent_state": {
+                        "parent_stream_name": {
+                            "last_updated": "2023-05-27T00:00:00Z"
+                        }
+                    }
+                }
+        """
+        if not stream_state:
+            return
+
+        if "states" not in stream_state:
+            # We assume that `stream_state` is in a global format that can be applied to all partitions.
+            # Example: {"global_state_format_key": "global_state_format_value"}
+            self._state_to_migrate_from = stream_state
+
+        else:
+            for state in stream_state["states"]:
+                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
+                    self._create_cursor(state["cursor"])
+                )
+
+            # set default state for missing partitions if it is per partition with fallback to global
+            if "state" in stream_state:
+                self._state_to_migrate_from = stream_state["state"]
+
+        # Set parent state for partition routers based on parent streams
+        self._partition_router.set_initial_state(stream_state)
+
+    def observe(self, record: Record) -> None:
+        self._cursor_per_partition[self._to_partition_key(record.associated_slice.partition)].observe(record)
+
+    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
+        return self._partition_serializer.to_partition_key(partition)
+
+    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
+        return self._partition_serializer.to_partition(partition_key)
+
+    def _create_cursor(self, cursor_state: Any) -> DeclarativeCursor:
+        cursor = self._cursor_factory.create(stream_state=cursor_state)
+        return cursor
+
+    def should_be_synced(self, record: Record) -> bool:
+        return self._get_cursor(record).should_be_synced(record)
+
+    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
+        if not first.associated_slice or not second.associated_slice:
+            raise ValueError(
+                f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
+            )
+        if first.associated_slice.partition != second.associated_slice.partition:
+            raise ValueError(
+                f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
+            )
+
+        return self._get_cursor(first).is_greater_than_or_equal(
+            self._convert_record_to_cursor_record(first),
+            self._convert_record_to_cursor_record(second),
+        )
+
+    @staticmethod
+    def _convert_record_to_cursor_record(record: Record) -> Record:
+        return Record(
+            record.data,
+            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
+            if record.associated_slice
+            else None,
+        )
+
+    def _get_cursor(self, record: Record) -> Cursor:
+        if not record.associated_slice:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        partition_key = self._to_partition_key(record.associated_slice.partition)
+        if partition_key not in self._cursor_per_partition:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        cursor = self._cursor_per_partition[partition_key]
+        return cursor
```
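
The partition-limit behaviour described in the class docstring is easy to verify in isolation. A self-contained sketch of the same `OrderedDict` eviction, with a tiny cap standing in for `DEFAULT_MAX_PARTITIONS_NUMBER` (10,000); everything else here is simplified stand-in code, not the CDK's classes:

```python
# Standalone sketch of the eviction documented above: an OrderedDict keeps
# insertion order, so popitem(last=False) drops the oldest partition once
# the cap is reached. The cap is tiny here purely for demonstration.
from collections import OrderedDict

MAX_PARTITIONS = 3  # stands in for DEFAULT_MAX_PARTITIONS_NUMBER (10,000)

cursors: OrderedDict[str, dict] = OrderedDict()
over_limit = 0

for key in ["p1", "p2", "p3", "p4", "p5"]:
    # Mirrors _ensure_partition_limit(): evict before inserting a new key.
    while len(cursors) > MAX_PARTITIONS - 1:
        over_limit += 1
        oldest, _ = cursors.popitem(last=False)
        print(f"dropping oldest partition {oldest} (over limit: {over_limit})")
    cursors[key] = {"state": None}

print(list(cursors))  # ['p3', 'p4', 'p5'] -- only the most recent survive
# limit_reached() would report True only once over_limit exceeds the cap,
# i.e. after sustained (not momentary) overflow.
```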
```diff
--- a/airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py
+++ b/airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py
@@ -133,8 +133,8 @@ class DatetimeBasedCursor(DeclarativeCursor):
         :param stream_state: The state of the stream as returned by get_stream_state
         """
         self._cursor = (
-            stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None
-        )
+            stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None  # type: ignore [union-attr]
+        )
 
     def observe(self, stream_slice: StreamSlice, record: Record) -> None:
         """
@@ -158,8 +158,10 @@ class DatetimeBasedCursor(DeclarativeCursor):
         )
         if (
             self._is_within_daterange_boundaries(
-                record,
-                stream_slice.get(start_field), stream_slice.get(end_field))
+                record,
+                stream_slice.get(start_field),  # type: ignore [arg-type]
+                stream_slice.get(end_field),  # type: ignore [arg-type]
+            )
             and is_highest_observed_cursor_value
         ):
             self._highest_observed_cursor_field_value = record_cursor_value
@@ -368,9 +370,9 @@ class DatetimeBasedCursor(DeclarativeCursor):
             self._partition_field_start.eval(self.config)
         )
         if self.end_time_option and self.end_time_option.inject_into == option_type:
-            options[self.end_time_option.field_name.eval(config=self.config)] = stream_slice.get(
+            options[self.end_time_option.field_name.eval(config=self.config)] = stream_slice.get(  # type: ignore [union-attr]
                 self._partition_field_end.eval(self.config)
-            )
+            )
         return options
 
     def should_be_synced(self, record: Record) -> bool:
```
```diff
--- a/airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py
+++ b/airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py
@@ -303,6 +303,15 @@ class PerPartitionCursor(DeclarativeCursor):
         raise ValueError("A partition needs to be provided in order to get request body json")
 
     def should_be_synced(self, record: Record) -> bool:
+        if self._to_partition_key(record.associated_slice.partition) not in self._cursor_per_partition:
+            partition_state = (
+                self._state_to_migrate_from
+                if self._state_to_migrate_from
+                else self._NO_CURSOR_STATE
+            )
+            cursor = self._create_cursor(partition_state)
+
+            self._cursor_per_partition[self._to_partition_key(record.associated_slice.partition)] = cursor
         return self._get_cursor(record).should_be_synced(
             self._convert_record_to_cursor_record(record)
         )
```
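
This hunk makes `should_be_synced` tolerant of records from partitions that have not been observed yet: instead of failing in `_get_cursor`, it lazily creates a cursor seeded from the migrated global state (or an empty one). A condensed, hypothetical sketch of that lazy-creation pattern, with `create_cursor` standing in for `PerPartitionCursor._create_cursor`:

```python
from typing import Any, Callable, Dict, Mapping, Optional

_NO_CURSOR_STATE: Mapping[str, Any] = {}


class LazyCursorRegistry:
    """Stand-in for the per-partition bookkeeping inside PerPartitionCursor."""

    def __init__(
        self,
        create_cursor: Callable[[Mapping[str, Any]], Any],
        state_to_migrate_from: Optional[Mapping[str, Any]] = None,
    ) -> None:
        self._create_cursor = create_cursor
        self._state_to_migrate_from = state_to_migrate_from
        self._cursor_per_partition: Dict[str, Any] = {}

    def cursor_for(self, partition_key: str) -> Any:
        if partition_key not in self._cursor_per_partition:
            # Seed the new cursor from migrated global state when available,
            # otherwise start it empty -- mirroring the added branch above.
            partition_state = self._state_to_migrate_from or _NO_CURSOR_STATE
            self._cursor_per_partition[partition_key] = self._create_cursor(partition_state)
        return self._cursor_per_partition[partition_key]


registry = LazyCursorRegistry(create_cursor=lambda state: {"state": dict(state)})
print(registry.cursor_for("account-42"))  # {'state': {}} -- created on first sight
```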
```diff
--- a/airbyte_cdk/sources/declarative/interpolation/jinja.py
+++ b/airbyte_cdk/sources/declarative/interpolation/jinja.py
@@ -4,7 +4,7 @@
 
 import ast
 from functools import cache
-from typing import Any, Mapping, Optional, Set, Tuple, Type
+from typing import Any, Mapping, Optional, Tuple, Type
 
 from jinja2 import meta
 from jinja2.environment import Template
@@ -27,35 +27,7 @@ class StreamPartitionAccessEnvironment(SandboxedEnvironment):
     def is_safe_attribute(self, obj: Any, attr: str, value: Any) -> bool:
         if attr in ["_partition"]:
             return True
-        return super().is_safe_attribute(obj, attr, value)
-
-
-# These aliases are used to deprecate existing keywords without breaking all existing connectors.
-_ALIASES = {
-    "stream_interval": "stream_slice",  # Use stream_interval to access incremental_sync values
-    "stream_partition": "stream_slice",  # Use stream_partition to access partition router's values
-}
-
-# These extensions are not installed so they're not currently a problem,
-# but we're still explicitely removing them from the jinja context.
-# At worst, this is documentation that we do NOT want to include these extensions because of the potential security risks
-_RESTRICTED_EXTENSIONS = ["jinja2.ext.loopcontrols"]  # Adds support for break continue in loops
-
-# By default, these Python builtin functions are available in the Jinja context.
-# We explicitely remove them because of the potential security risk.
-# Please add a unit test to test_jinja.py when adding a restriction.
-_RESTRICTED_BUILTIN_FUNCTIONS = [
-    "range"
-]  # The range function can cause very expensive computations
-
-_ENVIRONMENT = StreamPartitionAccessEnvironment()
-_ENVIRONMENT.filters.update(**filters)
-_ENVIRONMENT.globals.update(**macros)
-
-for extension in _RESTRICTED_EXTENSIONS:
-    _ENVIRONMENT.extensions.pop(extension, None)
-for builtin in _RESTRICTED_BUILTIN_FUNCTIONS:
-    _ENVIRONMENT.globals.pop(builtin, None)
+        return super().is_safe_attribute(obj, attr, value)
 
 
 class JinjaInterpolation(Interpolation):
@@ -76,6 +48,34 @@ class JinjaInterpolation(Interpolation):
     Additional information on jinja templating can be found at https://jinja.palletsprojects.com/en/3.1.x/templates/#
     """
 
+    # These aliases are used to deprecate existing keywords without breaking all existing connectors.
+    ALIASES = {
+        "stream_interval": "stream_slice",  # Use stream_interval to access incremental_sync values
+        "stream_partition": "stream_slice",  # Use stream_partition to access partition router's values
+    }
+
+    # These extensions are not installed so they're not currently a problem,
+    # but we're still explicitely removing them from the jinja context.
+    # At worst, this is documentation that we do NOT want to include these extensions because of the potential security risks
+    RESTRICTED_EXTENSIONS = ["jinja2.ext.loopcontrols"]  # Adds support for break continue in loops
+
+    # By default, these Python builtin functions are available in the Jinja context.
+    # We explicitely remove them because of the potential security risk.
+    # Please add a unit test to test_jinja.py when adding a restriction.
+    RESTRICTED_BUILTIN_FUNCTIONS = [
+        "range"
+    ]  # The range function can cause very expensive computations
+
+    def __init__(self) -> None:
+        self._environment = StreamPartitionAccessEnvironment()
+        self._environment.filters.update(**filters)
+        self._environment.globals.update(**macros)
+
+        for extension in self.RESTRICTED_EXTENSIONS:
+            self._environment.extensions.pop(extension, None)
+        for builtin in self.RESTRICTED_BUILTIN_FUNCTIONS:
+            self._environment.globals.pop(builtin, None)
+
     def eval(
         self,
         input_str: str,
@@ -86,7 +86,7 @@ class JinjaInterpolation(Interpolation):
     ) -> Any:
         context = {"config": config, **additional_parameters}
 
-        for alias, equivalent in _ALIASES.items():
+        for alias, equivalent in self.ALIASES.items():
             if alias in context:
                 # This is unexpected. We could ignore or log a warning, but failing loudly should result in fewer surprises
                 raise ValueError(
@@ -105,7 +105,6 @@ class JinjaInterpolation(Interpolation):
             raise Exception(f"Expected a string, got {input_str}")
         except UndefinedError:
             pass
-
         # If result is empty or resulted in an undefined error, evaluate and return the default string
         return self._literal_eval(self._eval(default, context), valid_types)
 
@@ -133,16 +132,16 @@ class JinjaInterpolation(Interpolation):
         return s
 
     @cache
-    def _find_undeclared_variables(self, s: Optional[str]) -> Set[str]:
+    def _find_undeclared_variables(self, s: Optional[str]) -> set[str]:
         """
         Find undeclared variables and cache them
         """
-        ast = _ENVIRONMENT.parse(s)  # type: ignore # parse is able to handle None
+        ast = self._environment.parse(s)  # type: ignore # parse is able to handle None
         return meta.find_undeclared_variables(ast)
 
     @cache
-    def _compile(self, s: str) -> Template:
+    def _compile(self, s: Optional[str]) -> Template:
         """
         We must cache the Jinja Template ourselves because we're using `from_string` instead of a template loader
         """
-        return _ENVIRONMENT.from_string(s)
+        return self._environment.from_string(s)  # type: ignore [arg-type]  # Expected `str | Template` but passed `str | None`
```
```diff
--- a/airbyte_cdk/sources/declarative/interpolation/macros.py
+++ b/airbyte_cdk/sources/declarative/interpolation/macros.py
@@ -116,7 +116,7 @@ def duration(datestring: str) -> Union[datetime.timedelta, isodate.Duration]:
     Usage:
     `"{{ now_utc() - duration('P1D') }}"`
     """
-    return parse_duration(datestring)
+    return parse_duration(datestring)
 
 
 def format_datetime(
```
```diff
--- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
+++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -81,6 +81,8 @@ from airbyte_cdk.sources.declarative.extractors.record_selector import (
 )
 from airbyte_cdk.sources.declarative.incremental import (
     ChildPartitionResumableFullRefreshCursor,
+    ConcurrentCursorFactory,
+    ConcurrentPerPartitionCursor,
     CursorFactory,
     DatetimeBasedCursor,
     DeclarativeCursor,
@@ -396,7 +398,7 @@ class ModelToComponentFactory:
         self._disable_retries = disable_retries
         self._disable_cache = disable_cache
         self._disable_resumable_full_refresh = disable_resumable_full_refresh
-        self._message_repository = message_repository or InMemoryMessageRepository(
+        self._message_repository = message_repository or InMemoryMessageRepository(
             self._evaluate_log_level(emit_connector_builder_messages)
         )
 
@@ -644,7 +646,7 @@ class ModelToComponentFactory:
             declarative_stream.incremental_sync,  # type: ignore # was already checked. Migration can be applied only to incremental streams.
             config,
             declarative_stream.parameters,  # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
-        )
+        )
 
     def create_session_token_authenticator(
         self, model: SessionTokenAuthenticatorModel, config: Config, name: str, **kwargs: Any
@@ -674,7 +676,7 @@ class ModelToComponentFactory:
             return ModelToComponentFactory.create_bearer_authenticator(
                 BearerAuthenticatorModel(type="BearerAuthenticator", api_token=""),  # type: ignore # $parameters has a default value
                 config,
-                token_provider=token_provider,
+                token_provider=token_provider,
             )
         else:
             return ModelToComponentFactory.create_api_key_authenticator(
@@ -821,7 +823,6 @@ class ModelToComponentFactory:
             input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
             is_sequential_state=True,
             cursor_granularity=cursor_granularity,
-            # type: ignore # Having issues w/ inspection for GapType and CursorValueType as shown in existing tests. Confirmed functionality is working in practice
         )
 
         start_date_runtime_value: Union[InterpolatedString, str, MinMaxDatetime]
@@ -894,7 +895,7 @@ class ModelToComponentFactory:
             stream_name=stream_name,
             stream_namespace=stream_namespace,
             stream_state=stream_state,
-            message_repository=self._message_repository,
+            message_repository=self._message_repository,
             connector_state_manager=state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
@@ -906,6 +907,62 @@ class ModelToComponentFactory:
             cursor_granularity=cursor_granularity,
         )
 
+    def create_concurrent_cursor_from_perpartition_cursor(
+        self,
+        state_manager: ConnectorStateManager,
+        model_type: Type[BaseModel],
+        component_definition: ComponentDefinition,
+        stream_name: str,
+        stream_namespace: Optional[str],
+        config: Config,
+        stream_state: MutableMapping[str, Any],
+        partition_router,
+        **kwargs: Any,
+    ) -> ConcurrentPerPartitionCursor:
+        component_type = component_definition.get("type")
+        if component_definition.get("type") != model_type.__name__:
+            raise ValueError(
+                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
+            )
+
+        datetime_based_cursor_model = model_type.parse_obj(component_definition)
+
+        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
+            raise ValueError(
+                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
+            )
+
+        interpolated_cursor_field = InterpolatedString.create(
+            datetime_based_cursor_model.cursor_field,
+            parameters=datetime_based_cursor_model.parameters or {},
+        )
+        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
+
+        # Create the cursor factory
+        cursor_factory = ConcurrentCursorFactory(
+            partial(
+                self.create_concurrent_cursor_from_datetime_based_cursor,
+                state_manager=state_manager,
+                model_type=model_type,
+                component_definition=component_definition,
+                stream_name=stream_name,
+                stream_namespace=stream_namespace,
+                config=config,
+            )
+        )
+
+        # Return the concurrent cursor and state converter
+        return ConcurrentPerPartitionCursor(
+            cursor_factory=cursor_factory,
+            partition_router=partition_router,
+            stream_name=stream_name,
+            stream_namespace=stream_namespace,
+            stream_state=stream_state,
+            message_repository=self._message_repository,  # type: ignore
+            connector_state_manager=state_manager,
+            cursor_field=cursor_field,
+        )
+
     @staticmethod
     def create_constant_backoff_strategy(
         model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
```
```diff
--- a/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
+++ b/airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py
@@ -1188,17 +1245,14 @@
             raise ValueError(
                 "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
             )
+        cursor = combined_slicers if isinstance(
+            combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
+        ) else self._create_component_from_model(
+            model=model.incremental_sync, config=config
+        )
+
         client_side_incremental_sync = {
-            "date_time_based_cursor": self._create_component_from_model(
-                model=model.incremental_sync, config=config
-            ),
-            "substream_cursor": (
-                combined_slicers
-                if isinstance(
-                    combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
-                )
-                else None
-            ),
+            "cursor": cursor
         }
 
         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
@@ -1705,7 +1759,7 @@ class ModelToComponentFactory:
             refresh_token=model.refresh_token,
             scopes=model.scopes,
             token_expiry_date=model.token_expiry_date,
-            token_expiry_date_format=model.token_expiry_date_format,
+            token_expiry_date_format=model.token_expiry_date_format,
             token_expiry_is_time_of_expiration=bool(model.token_expiry_date_format),
             token_refresh_endpoint=model.token_refresh_endpoint,
             config=config,
@@ -1912,7 +1966,7 @@ class ModelToComponentFactory:
         if (
             not isinstance(stream_slicer, DatetimeBasedCursor)
             or type(stream_slicer) is not DatetimeBasedCursor
-        ):
+        ) and not isinstance(stream_slicer, PerPartitionWithGlobalCursor):
             # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
             # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
             # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
```