airbyte-cdk 6.21.1.dev0__py3-none-any.whl → 6.26.0.dev4103__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/cli/source_declarative_manifest/_run.py +6 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +1 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +68 -11
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +81 -16
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +58 -2
- airbyte_cdk/sources/declarative/decoders/__init__.py +9 -1
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +59 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +3 -5
- airbyte_cdk/sources/declarative/incremental/__init__.py +6 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +334 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +3 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +35 -3
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +15 -4
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +50 -14
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +143 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +220 -22
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +6 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +22 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -1
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +15 -0
- airbyte_cdk/sources/file_based/config/identities_based_stream_config.py +8 -0
- airbyte_cdk/sources/file_based/config/permissions.py +34 -0
- airbyte_cdk/sources/file_based/file_based_source.py +65 -1
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +33 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +25 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +2 -1
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +29 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +99 -0
- airbyte_cdk/sources/http_logger.py +1 -1
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +51 -57
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +20 -20
- airbyte_cdk/test/utils/manifest_only_fixtures.py +1 -2
- {airbyte_cdk-6.21.1.dev0.dist-info → airbyte_cdk-6.26.0.dev4103.dist-info}/METADATA +3 -3
- {airbyte_cdk-6.21.1.dev0.dist-info → airbyte_cdk-6.26.0.dev4103.dist-info}/RECORD +39 -31
- {airbyte_cdk-6.21.1.dev0.dist-info → airbyte_cdk-6.26.0.dev4103.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.21.1.dev0.dist-info → airbyte_cdk-6.26.0.dev4103.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.21.1.dev0.dist-info → airbyte_cdk-6.26.0.dev4103.dist-info}/entry_points.txt +0 -0
--- /dev/null
+++ b/airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py
@@ -0,0 +1,334 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+import copy
+import logging
+import threading
+from collections import OrderedDict
+from copy import deepcopy
+from datetime import timedelta
+from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
+
+from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
+from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
+    Timer,
+    iterate_with_last_flag_and_state,
+)
+from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
+from airbyte_cdk.sources.message import MessageRepository
+from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
+    PerPartitionKeySerializer,
+)
+from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, Cursor, CursorField
+from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
+from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
+
+logger = logging.getLogger("airbyte")
+
+
+class ConcurrentCursorFactory:
+    def __init__(self, create_function: Callable[..., ConcurrentCursor]):
+        self._create_function = create_function
+
+    def create(
+        self, stream_state: Mapping[str, Any], runtime_lookback_window: Optional[timedelta]
+    ) -> ConcurrentCursor:
+        return self._create_function(
+            stream_state=stream_state, runtime_lookback_window=runtime_lookback_window
+        )
+
+
+class ConcurrentPerPartitionCursor(Cursor):
+    """
+    Manages state per partition when a stream has many partitions, preventing data loss or duplication.
+
+    Attributes:
+        DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000).
+
+    - **Partition Limitation Logic**
+      Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. Oldest partitions are removed when the limit is reached.
+
+    - **Global Cursor Fallback**
+      New partitions use global state as the initial state to progress the state for deleted or new partitions. The history data added after the initial sync will be missing.
+
+    CurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}.
+    """
+
+    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
+    _NO_STATE: Mapping[str, Any] = {}
+    _NO_CURSOR_STATE: Mapping[str, Any] = {}
+    _GLOBAL_STATE_KEY = "state"
+    _PERPARTITION_STATE_KEY = "states"
+    _KEY = 0
+    _VALUE = 1
+
+    def __init__(
+        self,
+        cursor_factory: ConcurrentCursorFactory,
+        partition_router: PartitionRouter,
+        stream_name: str,
+        stream_namespace: Optional[str],
+        stream_state: Any,
+        message_repository: MessageRepository,
+        connector_state_manager: ConnectorStateManager,
+        cursor_field: CursorField,
+    ) -> None:
+        self._global_cursor: Optional[StreamState] = {}
+        self._stream_name = stream_name
+        self._stream_namespace = stream_namespace
+        self._message_repository = message_repository
+        self._connector_state_manager = connector_state_manager
+        self._cursor_field = cursor_field
+
+        self._cursor_factory = cursor_factory
+        self._partition_router = partition_router
+
+        # The dict is ordered to ensure that once the maximum number of partitions is reached,
+        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
+        self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict()
+        self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()
+        self._finished_partitions: set[str] = set()
+        self._lock = threading.Lock()
+        self._timer = Timer()
+        self._new_global_cursor: Optional[StreamState] = None
+        self._lookback_window: int = 0
+        self._parent_state: Optional[StreamState] = None
+        self._over_limit: int = 0
+        self._partition_serializer = PerPartitionKeySerializer()
+
+        self._set_initial_state(stream_state)
+
+    @property
+    def cursor_field(self) -> CursorField:
+        return self._cursor_field
+
+    @property
+    def state(self) -> MutableMapping[str, Any]:
+        states = []
+        for partition_tuple, cursor in self._cursor_per_partition.items():
+            if cursor.state:
+                states.append(
+                    {
+                        "partition": self._to_dict(partition_tuple),
+                        "cursor": copy.deepcopy(cursor.state),
+                    }
+                )
+        state: dict[str, Any] = {self._PERPARTITION_STATE_KEY: states}
+
+        if self._global_cursor:
+            state[self._GLOBAL_STATE_KEY] = self._global_cursor
+        if self._lookback_window is not None:
+            state["lookback_window"] = self._lookback_window
+        if self._parent_state is not None:
+            state["parent_state"] = self._parent_state
+        return state
+
+    def close_partition(self, partition: Partition) -> None:
+        # Attempt to retrieve the stream slice
+        stream_slice: Optional[StreamSlice] = partition.to_slice()  # type: ignore[assignment]
+
+        # Ensure stream_slice is not None
+        if stream_slice is None:
+            raise ValueError("stream_slice cannot be None")
+
+        partition_key = self._to_partition_key(stream_slice.partition)
+        self._cursor_per_partition[partition_key].close_partition(partition=partition)
+        with self._lock:
+            self._semaphore_per_partition[partition_key].acquire()
+            cursor = self._cursor_per_partition[partition_key]
+            if (
+                partition_key in self._finished_partitions
+                and self._semaphore_per_partition[partition_key]._value == 0
+            ):
+                if (
+                    self._new_global_cursor is None
+                    or self._new_global_cursor[self.cursor_field.cursor_field_key]
+                    < cursor.state[self.cursor_field.cursor_field_key]
+                ):
+                    self._new_global_cursor = copy.deepcopy(cursor.state)
+        self._emit_state_message()
+
+    def ensure_at_least_one_state_emitted(self) -> None:
+        """
+        The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
+        called.
+        """
+        if not any(
+            semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
+        ):
+            self._global_cursor = self._new_global_cursor
+            self._lookback_window = self._timer.finish()
+            self._parent_state = self._partition_router.get_stream_state()
+        self._emit_state_message()
+
+    def _emit_state_message(self) -> None:
+        self._connector_state_manager.update_state_for_stream(
+            self._stream_name,
+            self._stream_namespace,
+            self.state,
+        )
+        state_message = self._connector_state_manager.create_state_message(
+            self._stream_name, self._stream_namespace
+        )
+        self._message_repository.emit_message(state_message)
+
+    def stream_slices(self) -> Iterable[StreamSlice]:
+        if self._timer.is_running():
+            raise RuntimeError("stream_slices has been executed more than once.")
+
+        slices = self._partition_router.stream_slices()
+        self._timer.start()
+        for partition in slices:
+            yield from self._generate_slices_from_partition(partition)
+
+    def _generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
+        # Ensure the maximum number of partitions is not exceeded
+        self._ensure_partition_limit()
+
+        cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
+        if not cursor:
+            cursor = self._create_cursor(
+                self._global_cursor,
+                self._lookback_window if self._global_cursor else 0,
+            )
+            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
+            self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
+                threading.Semaphore(0)
+            )
+
+        for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
+            cursor.stream_slices(),
+            lambda: None,
+        ):
+            self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
+            if is_last_slice:
+                self._finished_partitions.add(self._to_partition_key(partition.partition))
+            yield StreamSlice(
+                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
+            )
+
+    def _ensure_partition_limit(self) -> None:
+        """
+        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
+        """
+        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
+            self._over_limit += 1
+            oldest_partition = self._cursor_per_partition.popitem(last=False)[
+                0
+            ]  # Remove the oldest partition
+            logger.warning(
+                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
+            )
+
+    def _set_initial_state(self, stream_state: StreamState) -> None:
+        """
+        Initialize the cursor's state using the provided `stream_state`.
+
+        This method supports global and per-partition state initialization.
+
+        - **Global State**: If `states` is missing, the `state` is treated as global and applied to all partitions.
+          The `global state` holds a single cursor position representing the latest processed record across all partitions.
+
+        - **Lookback Window**: Configured via `lookback_window`, it defines the period (in seconds) for reprocessing records.
+          This ensures robustness in case of upstream data delays or reordering. If not specified, it defaults to 0.
+
+        - **Per-Partition State**: If `states` is present, each partition's cursor state is initialized separately.
+
+        - **Parent State**: (if available) Used to initialize partition routers based on parent streams.
+
+        Args:
+            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
+                {
+                    "states": [
+                        {
+                            "partition": {
+                                "partition_key": "value"
+                            },
+                            "cursor": {
+                                "last_updated": "2023-05-27T00:00:00Z"
+                            }
+                        }
+                    ],
+                    "state": {
+                        "last_updated": "2023-05-27T00:00:00Z"
+                    },
+                    lookback_window: 10,
+                    "parent_state": {
+                        "parent_stream_name": {
+                            "last_updated": "2023-05-27T00:00:00Z"
+                        }
+                    }
+                }
+        """
+        if not stream_state:
+            return
+
+        if self._PERPARTITION_STATE_KEY not in stream_state:
+            # We assume that `stream_state` is in a global format that can be applied to all partitions.
+            # Example: {"global_state_format_key": "global_state_format_value"}
+            self._global_cursor = deepcopy(stream_state)
+            self._new_global_cursor = deepcopy(stream_state)
+
+        else:
+            self._lookback_window = int(stream_state.get("lookback_window", 0))
+
+            for state in stream_state[self._PERPARTITION_STATE_KEY]:
+                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
+                    self._create_cursor(state["cursor"])
+                )
+                self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
+                    threading.Semaphore(0)
+                )
+
+            # set default state for missing partitions if it is per partition with fallback to global
+            if self._GLOBAL_STATE_KEY in stream_state:
+                self._global_cursor = deepcopy(stream_state[self._GLOBAL_STATE_KEY])
+                self._new_global_cursor = deepcopy(stream_state[self._GLOBAL_STATE_KEY])
+
+        # Set initial parent state
+        if stream_state.get("parent_state"):
+            self._parent_state = stream_state["parent_state"]
+
+        # Set parent state for partition routers based on parent streams
+        self._partition_router.set_initial_state(stream_state)
+
+    def observe(self, record: Record) -> None:
+        if not record.associated_slice:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        self._cursor_per_partition[
+            self._to_partition_key(record.associated_slice.partition)
+        ].observe(record)
+
+    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
+        return self._partition_serializer.to_partition_key(partition)
+
+    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
+        return self._partition_serializer.to_partition(partition_key)
+
+    def _create_cursor(
+        self, cursor_state: Any, runtime_lookback_window: int = 0
+    ) -> ConcurrentCursor:
+        cursor = self._cursor_factory.create(
+            stream_state=deepcopy(cursor_state),
+            runtime_lookback_window=timedelta(seconds=runtime_lookback_window),
+        )
+        return cursor
+
+    def should_be_synced(self, record: Record) -> bool:
+        return self._get_cursor(record).should_be_synced(record)
+
+    def _get_cursor(self, record: Record) -> ConcurrentCursor:
+        if not record.associated_slice:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        partition_key = self._to_partition_key(record.associated_slice.partition)
+        if partition_key not in self._cursor_per_partition:
+            raise ValueError(
+                "Invalid state as stream slices that are emitted should refer to an existing cursor"
+            )
+        cursor = self._cursor_per_partition[partition_key]
+        return cursor
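The eviction in `_ensure_partition_limit` is what bounds memory: cursors are kept in insertion order and the oldest is dropped once the cap is hit. Below is a minimal, runnable sketch of that technique in isolation; the names and the tiny cap are illustrative, not the CDK's API (the CDK default is 10,000).

```python
# Standalone sketch of the OrderedDict-based eviction used by _ensure_partition_limit.
import logging
from collections import OrderedDict
from typing import Any

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("airbyte")

MAX_PARTITIONS = 3  # illustrative; the CDK uses DEFAULT_MAX_PARTITIONS_NUMBER = 10000
cursor_per_partition: "OrderedDict[str, dict[str, Any]]" = OrderedDict()
over_limit = 0

def track_partition(key: str, state: dict[str, Any]) -> None:
    """Insert a partition's state, evicting the oldest entries once the cap is hit."""
    global over_limit
    while len(cursor_per_partition) > MAX_PARTITIONS - 1:
        over_limit += 1
        # popitem(last=False) pops the least-recently inserted partition
        oldest, _ = cursor_per_partition.popitem(last=False)
        logger.warning(f"Dropping the oldest partition: {oldest}. Over limit: {over_limit}.")
    cursor_per_partition[key] = state

for i in range(5):
    track_partition(f"partition-{i}", {"updated_at": f"2024-01-0{i + 1}"})

print(list(cursor_per_partition))  # ['partition-2', 'partition-3', 'partition-4']
```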
--- a/airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py
+++ b/airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py
@@ -222,6 +222,8 @@ class PerPartitionCursor(DeclarativeCursor):
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Mapping[str, Any]:
         if stream_slice:
+            if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
+                self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
             return self._partition_router.get_request_params(  # type: ignore # this always returns a mapping
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
@@ -244,6 +246,8 @@ class PerPartitionCursor(DeclarativeCursor):
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Mapping[str, Any]:
         if stream_slice:
+            if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
+                self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
             return self._partition_router.get_request_headers(  # type: ignore # this always returns a mapping
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
@@ -266,6 +270,8 @@ class PerPartitionCursor(DeclarativeCursor):
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Union[Mapping[str, Any], str]:
         if stream_slice:
+            if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
+                self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
             return self._partition_router.get_request_body_data(  # type: ignore # this always returns a mapping
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
@@ -288,6 +294,8 @@ class PerPartitionCursor(DeclarativeCursor):
         next_page_token: Optional[Mapping[str, Any]] = None,
     ) -> Mapping[str, Any]:
         if stream_slice:
+            if self._to_partition_key(stream_slice.partition) not in self._cursor_per_partition:
+                self._create_cursor_for_partition(self._to_partition_key(stream_slice.partition))
             return self._partition_router.get_request_body_json(  # type: ignore # this always returns a mapping
                 stream_state=stream_state,
                 stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
@@ -341,8 +349,32 @@ class PerPartitionCursor(DeclarativeCursor):
             )
         partition_key = self._to_partition_key(record.associated_slice.partition)
         if partition_key not in self._cursor_per_partition:
-            raise ValueError(
-                "Invalid state as stream slices that are emitted should refer to an existing cursor"
-            )
+            self._create_cursor_for_partition(partition_key)
         cursor = self._cursor_per_partition[partition_key]
         return cursor
+
+    def _create_cursor_for_partition(self, partition_key: str) -> None:
+        """
+        Dynamically creates and initializes a cursor for the specified partition.
+
+        This method is required for `ConcurrentPerPartitionCursor`. For concurrent cursors,
+        stream_slices is executed only for the concurrent cursor, so cursors per partition
+        are not created for the declarative cursor. This method ensures that a cursor is available
+        to create requests for the specified partition. The cursor is initialized
+        with the per-partition state if present in the initial state, or with the global state
+        adjusted by the lookback window, or with the state to migrate from.
+
+        Note:
+            This is a temporary workaround and should be removed once the declarative cursor
+            is decoupled from the concurrent cursor implementation.
+
+        Args:
+            partition_key (str): The unique identifier for the partition for which the cursor
+                needs to be created.
+        """
+        partition_state = (
+            self._state_to_migrate_from if self._state_to_migrate_from else self._NO_CURSOR_STATE
+        )
+        cursor = self._create_cursor(partition_state)
+
+        self._cursor_per_partition[partition_key] = cursor
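The hunks above all apply one pattern: before building request inputs for a slice, make sure a cursor exists for that slice's partition, creating it lazily from fallback state instead of raising. A simplified, self-contained sketch of that lookup-or-create pattern follows; the names are illustrative, not the CDK API (the real per-partition values are ConcurrentCursor instances).

```python
# Simplified sketch of the lookup-or-create pattern added to PerPartitionCursor.
import json
from typing import Any, Mapping, Optional

_NO_CURSOR_STATE: Mapping[str, Any] = {}

class LazyPerPartitionCursors:
    def __init__(self, state_to_migrate_from: Optional[Mapping[str, Any]] = None):
        self._cursor_per_partition: dict[str, dict[str, Any]] = {}
        self._state_to_migrate_from = state_to_migrate_from

    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
        # The CDK serializes the partition mapping into a stable string key
        return json.dumps(partition, sort_keys=True)

    def _create_cursor_for_partition(self, partition_key: str) -> None:
        # Seed from migrated/global state when no per-partition state exists yet
        seed = self._state_to_migrate_from or _NO_CURSOR_STATE
        self._cursor_per_partition[partition_key] = dict(seed)

    def get_request_params(self, partition: Mapping[str, Any]) -> dict[str, Any]:
        key = self._to_partition_key(partition)
        if key not in self._cursor_per_partition:  # create lazily instead of raising
            self._create_cursor_for_partition(key)
        return {"cursor": self._cursor_per_partition[key]}

cursors = LazyPerPartitionCursors(state_to_migrate_from={"updated_at": "2024-01-01"})
print(cursors.get_request_params({"account_id": 42}))
```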
--- a/airbyte_cdk/sources/declarative/manifest_declarative_source.py
+++ b/airbyte_cdk/sources/declarative/manifest_declarative_source.py
@@ -7,6 +7,7 @@ import logging
 import pkgutil
 from copy import deepcopy
 from importlib import metadata
+from types import ModuleType
 from typing import Any, Dict, Iterator, List, Mapping, Optional, Set
 
 import yaml
@@ -32,6 +33,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
     DeclarativeStream as DeclarativeStreamModel,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import Spec as SpecModel
+from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
+    get_registered_components_module,
+)
 from airbyte_cdk.sources.declarative.parsers.manifest_component_transformer import (
     ManifestComponentTransformer,
 )
@@ -59,22 +63,29 @@ class ManifestDeclarativeSource(DeclarativeSource):
     def __init__(
         self,
         source_config: ConnectionDefinition,
+        *,
+        config: Mapping[str, Any] | None = None,
         debug: bool = False,
         emit_connector_builder_messages: bool = False,
         component_factory: Optional[ModelToComponentFactory] = None,
     ):
         """
-        :
-
-
+        Args:
+            config: The provided config dict.
+            source_config: The manifest of low-code components that describe the source connector.
+            debug: True if debug mode is enabled.
+            emit_connector_builder_messages: True if messages should be emitted to the connector builder.
+            component_factory: optional factory if ModelToComponentFactory's default behavior needs to be tweaked.
         """
         self.logger = logging.getLogger(f"airbyte.{self.name}")
-
         # For ease of use we don't require the type to be specified at the top level manifest, but it should be included during processing
         manifest = dict(source_config)
         if "type" not in manifest:
             manifest["type"] = "DeclarativeSource"
 
+        # If custom components are needed, locate and/or register them.
+        self.components_module: ModuleType | None = get_registered_components_module(config=config)
+
         resolved_source_config = ManifestReferenceResolver().preprocess_manifest(manifest)
         propagated_source_config = ManifestComponentTransformer().propagate_types_and_parameters(
             "", resolved_source_config, {}
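The constructor change threads an optional, keyword-only user `config` into `get_registered_components_module`, which resolves custom Python components at init time. A rough sketch of that resolution pattern is below; the `__injected_components_py` config key and the exec-into-module technique are assumptions about what the custom code compiler does, not the CDK's exact implementation (which lives in `airbyte_cdk.sources.declarative.parsers.custom_code_compiler`).

```python
# Sketch of init-time custom-component resolution, under stated assumptions.
import types
from typing import Any, Mapping, Optional

INJECTED_COMPONENTS_PY = "__injected_components_py"  # assumed config key

def resolve_components_module(config: Optional[Mapping[str, Any]]) -> Optional[types.ModuleType]:
    if not config or INJECTED_COMPONENTS_PY not in config:
        return None  # no custom code supplied; the manifest uses built-in components only
    module = types.ModuleType("components")
    # Execute the user-supplied source inside the fresh module namespace
    exec(config[INJECTED_COMPONENTS_PY], module.__dict__)
    return module

module = resolve_components_module({INJECTED_COMPONENTS_PY: "MULTIPLIER = 2"})
print(module.MULTIPLIER if module else None)  # 2
```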
--- a/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
+++ b/airbyte_cdk/sources/declarative/models/declarative_component_schema.py
@@ -328,6 +328,16 @@ class LegacyToPerPartitionStateMigration(BaseModel):
     type: Optional[Literal["LegacyToPerPartitionStateMigration"]] = None
 
 
+class Clamping(BaseModel):
+    target: str = Field(
+        ...,
+        description="The period of time that datetime windows will be clamped by",
+        examples=["DAY", "WEEK", "MONTH", "{{ config['target'] }}"],
+        title="Target",
+    )
+    target_details: Optional[Dict[str, Any]] = None
+
+
 class Algorithm(Enum):
     HS256 = "HS256"
     HS384 = "HS384"
@@ -496,8 +506,8 @@ class OAuthAuthenticator(BaseModel):
         examples=["custom_app_id"],
         title="Client ID Property Name",
     )
-    client_id: str = Field(
-        ...,
+    client_id: Optional[str] = Field(
+        None,
         description="The OAuth client ID. Fill it in the user inputs.",
         examples=["{{ config['client_id }}", "{{ config['credentials']['client_id }}"],
         title="Client ID",
@@ -508,8 +518,8 @@ class OAuthAuthenticator(BaseModel):
         examples=["custom_app_secret"],
         title="Client Secret Property Name",
     )
-    client_secret: str = Field(
-        ...,
+    client_secret: Optional[str] = Field(
+        None,
         description="The OAuth client secret. Fill it in the user inputs.",
         examples=[
             "{{ config['client_secret }}",
@@ -614,6 +624,16 @@ class OAuthAuthenticator(BaseModel):
         description="When the token updater is defined, new refresh tokens, access tokens and the access token expiry date are written back from the authentication response to the config object. This is important if the refresh token can only used once.",
         title="Token Updater",
     )
+    profile_assertion: Optional[JwtAuthenticator] = Field(
+        None,
+        description="The authenticator being used to authenticate the client authenticator.",
+        title="Profile Assertion",
+    )
+    use_profile_assertion: Optional[bool] = Field(
+        False,
+        description="Enable using profile assertion as a flow for OAuth authorization.",
+        title="Use Profile Assertion",
+    )
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
@@ -719,7 +739,7 @@ class HttpResponseFilter(BaseModel):
 class TypesMap(BaseModel):
     target_type: Union[str, List[str]]
     current_type: Union[str, List[str]]
-    condition: Optional[str]
+    condition: Optional[str] = None
 
 
 class SchemaTypeIdentifier(BaseModel):
@@ -797,14 +817,11 @@ class DpathFlattenFields(BaseModel):
     field_path: List[str] = Field(
         ...,
         description="A path to field that needs to be flattened.",
-        examples=[
-            ["data"],
-            ["data", "*", "field"],
-        ],
+        examples=[["data"], ["data", "*", "field"]],
         title="Field Path",
     )
     delete_origin_value: Optional[bool] = Field(
-
+        None,
         description="Whether to delete the origin value or keep it. Default is False.",
         title="Delete Origin Value",
     )
@@ -1242,9 +1259,6 @@ class LegacySessionTokenAuthenticator(BaseModel):
 
 
 class JsonParser(BaseModel):
-    class Config:
-        extra = Extra.allow
-
     type: Literal["JsonParser"]
     encoding: Optional[str] = "utf-8"
 
@@ -1457,6 +1471,11 @@ class AuthFlow(BaseModel):
 
 class DatetimeBasedCursor(BaseModel):
     type: Literal["DatetimeBasedCursor"]
+    clamping: Optional[Clamping] = Field(
+        None,
+        description="This option is used to adjust the upper and lower boundaries of each datetime window to beginning and end of the provided target period (day, week, month)",
+        title="Date Range Clamping",
+    )
     cursor_field: str = Field(
         ...,
         description="The location of the value on a record that will be used as a bookmark during sync. To ensure no data loss, the API must return records in ascending order based on the cursor field. Nested fields are not supported, so the field must be at the top level of the record. You can use a combination of Add Field and Remove Field transformations to move the nested field to the top.",
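The new `clamping` field on `DatetimeBasedCursor` references the `Clamping` model added earlier in this file. A minimal sketch of constructing it directly against the upgraded package; the field values are illustrative, and `target_details` is free-form per the model definition.

```python
# Minimal sketch of the new clamping option on the generated Pydantic models.
from airbyte_cdk.sources.declarative.models.declarative_component_schema import Clamping

weekly = Clamping(target="WEEK", target_details={"weekday": "MONDAY"})  # illustrative values
print(weekly.target)  # WEEK
```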
@@ -1680,6 +1699,18 @@ class CompositeErrorHandler(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
+class ZipfileDecoder(BaseModel):
+    class Config:
+        extra = Extra.allow
+
+    type: Literal["ZipfileDecoder"]
+    parser: Union[GzipParser, JsonParser, JsonLineParser, CsvParser] = Field(
+        ...,
+        description="Parser to parse the decompressed data from the zipfile(s).",
+        title="Parser",
+    )
+
+
 class CompositeRawDecoder(BaseModel):
     type: Literal["CompositeRawDecoder"]
     parser: Union[GzipParser, JsonParser, JsonLineParser, CsvParser]
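Given the model above, a `ZipfileDecoder` can be declared with any of the listed parsers. A small sketch constructing one with a `JsonParser` from the upgraded package; the values are illustrative.

```python
# Sketch: a ZipfileDecoder wired to a JsonParser, per the union above.
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    JsonParser,
    ZipfileDecoder,
)

decoder = ZipfileDecoder(
    type="ZipfileDecoder",
    parser=JsonParser(type="JsonParser", encoding="utf-8"),
)
print(decoder.parser.encoding)  # utf-8
```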
@@ -1886,7 +1917,7 @@ class SessionTokenAuthenticator(BaseModel):
         description="Authentication method to use for requests sent to the API, specifying how to inject the session token.",
         title="Data Request Authentication",
     )
-    decoder: Optional[Union[JsonDecoder, XmlDecoder]] = Field(
+    decoder: Optional[Union[JsonDecoder, XmlDecoder, CompositeRawDecoder]] = Field(
         None, description="Component used to decode the response.", title="Decoder"
     )
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
@@ -2092,6 +2123,7 @@ class SimpleRetriever(BaseModel):
             XmlDecoder,
             GzipJsonDecoder,
             CompositeRawDecoder,
+            ZipfileDecoder,
         ]
     ] = Field(
         None,
@@ -2168,6 +2200,8 @@ class AsyncRetriever(BaseModel):
             IterableDecoder,
             XmlDecoder,
             GzipJsonDecoder,
+            CompositeRawDecoder,
+            ZipfileDecoder,
         ]
     ] = Field(
         None,
@@ -2182,6 +2216,8 @@ class AsyncRetriever(BaseModel):
             IterableDecoder,
             XmlDecoder,
             GzipJsonDecoder,
+            CompositeRawDecoder,
+            ZipfileDecoder,
         ]
     ] = Field(
         None,