airbyte-cdk 6.26.0.dev4106__py3-none-any.whl → 6.26.0.dev4108__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/cli/source_declarative_manifest/_run.py +3 -3
- airbyte_cdk/connector_builder/connector_builder_handler.py +2 -2
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +7 -7
- airbyte_cdk/sources/declarative/auth/jwt.py +17 -11
- airbyte_cdk/sources/declarative/auth/oauth.py +22 -13
- airbyte_cdk/sources/declarative/auth/token.py +3 -8
- airbyte_cdk/sources/declarative/auth/token_provider.py +4 -5
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +19 -9
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +71 -34
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +33 -4
- airbyte_cdk/sources/declarative/declarative_stream.py +3 -1
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +93 -27
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +7 -6
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +5 -3
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +22 -5
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +138 -38
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +5 -5
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +4 -2
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +49 -25
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +4 -4
- airbyte_cdk/sources/declarative/requesters/http_requester.py +5 -1
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +6 -5
- airbyte_cdk/sources/declarative/requesters/request_option.py +83 -4
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +7 -6
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +6 -12
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +4 -1
- airbyte_cdk/sources/declarative/schema/__init__.py +2 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +44 -5
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +18 -11
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +51 -0
- airbyte_cdk/sources/file_based/file_based_source.py +16 -55
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +19 -31
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +7 -7
- airbyte_cdk/sources/file_based/stream/identities_stream.py +5 -2
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +22 -13
- airbyte_cdk/sources/streams/core.py +6 -6
- airbyte_cdk/sources/streams/http/http.py +1 -2
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +231 -62
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +166 -83
- airbyte_cdk/sources/types.py +4 -2
- airbyte_cdk/sources/utils/transform.py +23 -2
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/mapping_helpers.py +86 -27
- airbyte_cdk/utils/slice_hasher.py +8 -1
- airbyte_cdk-6.26.0.dev4108.dist-info/LICENSE_SHORT +1 -0
- {airbyte_cdk-6.26.0.dev4106.dist-info → airbyte_cdk-6.26.0.dev4108.dist-info}/METADATA +5 -5
- {airbyte_cdk-6.26.0.dev4106.dist-info → airbyte_cdk-6.26.0.dev4108.dist-info}/RECORD +50 -48
- {airbyte_cdk-6.26.0.dev4106.dist-info → airbyte_cdk-6.26.0.dev4108.dist-info}/WHEEL +1 -1
- airbyte_cdk/sources/file_based/config/permissions.py +0 -34
- {airbyte_cdk-6.26.0.dev4106.dist-info → airbyte_cdk-6.26.0.dev4108.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.26.0.dev4106.dist-info → airbyte_cdk-6.26.0.dev4108.dist-info}/entry_points.txt +0 -0
@@ -138,7 +138,9 @@ class DeclarativeStream(Stream):
|
|
138
138
|
"""
|
139
139
|
:param: stream_state We knowingly avoid using stream_state as we want cursors to manage their own state.
|
140
140
|
"""
|
141
|
-
if stream_slice is None or
|
141
|
+
if stream_slice is None or (
|
142
|
+
not isinstance(stream_slice, StreamSlice) and stream_slice == {}
|
143
|
+
):
|
142
144
|
# As the parameter is Optional, many would just call `read_records(sync_mode)` during testing without specifying the field
|
143
145
|
# As part of the declarative model without custom components, this should never happen as the CDK would wire up a
|
144
146
|
# SinglePartitionRouter that would create this StreamSlice properly
|
@@ -22,6 +22,9 @@ from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import
|
|
22
22
|
)
|
23
23
|
from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, Cursor, CursorField
|
24
24
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
25
|
+
from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
|
26
|
+
AbstractStreamStateConverter,
|
27
|
+
)
|
25
28
|
from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
|
26
29
|
|
27
30
|
logger = logging.getLogger("airbyte")
|
@@ -72,6 +75,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
72
75
|
stream_state: Any,
|
73
76
|
message_repository: MessageRepository,
|
74
77
|
connector_state_manager: ConnectorStateManager,
|
78
|
+
connector_state_converter: AbstractStreamStateConverter,
|
75
79
|
cursor_field: CursorField,
|
76
80
|
) -> None:
|
77
81
|
self._global_cursor: Optional[StreamState] = {}
|
@@ -79,6 +83,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
79
83
|
self._stream_namespace = stream_namespace
|
80
84
|
self._message_repository = message_repository
|
81
85
|
self._connector_state_manager = connector_state_manager
|
86
|
+
self._connector_state_converter = connector_state_converter
|
82
87
|
self._cursor_field = cursor_field
|
83
88
|
|
84
89
|
self._cursor_factory = cursor_factory
|
@@ -95,6 +100,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
95
100
|
self._lookback_window: int = 0
|
96
101
|
self._parent_state: Optional[StreamState] = None
|
97
102
|
self._over_limit: int = 0
|
103
|
+
self._use_global_cursor: bool = False
|
98
104
|
self._partition_serializer = PerPartitionKeySerializer()
|
99
105
|
|
100
106
|
self._set_initial_state(stream_state)
|
@@ -105,16 +111,18 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
105
111
|
|
106
112
|
@property
|
107
113
|
def state(self) -> MutableMapping[str, Any]:
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
114
|
+
state: dict[str, Any] = {"use_global_cursor": self._use_global_cursor}
|
115
|
+
if not self._use_global_cursor:
|
116
|
+
states = []
|
117
|
+
for partition_tuple, cursor in self._cursor_per_partition.items():
|
118
|
+
if cursor.state:
|
119
|
+
states.append(
|
120
|
+
{
|
121
|
+
"partition": self._to_dict(partition_tuple),
|
122
|
+
"cursor": copy.deepcopy(cursor.state),
|
123
|
+
}
|
124
|
+
)
|
125
|
+
state[self._PERPARTITION_STATE_KEY] = states
|
118
126
|
|
119
127
|
if self._global_cursor:
|
120
128
|
state[self._GLOBAL_STATE_KEY] = self._global_cursor
|
@@ -147,7 +155,8 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
147
155
|
< cursor.state[self.cursor_field.cursor_field_key]
|
148
156
|
):
|
149
157
|
self._new_global_cursor = copy.deepcopy(cursor.state)
|
150
|
-
|
158
|
+
if not self._use_global_cursor:
|
159
|
+
self._emit_state_message()
|
151
160
|
|
152
161
|
def ensure_at_least_one_state_emitted(self) -> None:
|
153
162
|
"""
|
@@ -192,7 +201,8 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
192
201
|
self._global_cursor,
|
193
202
|
self._lookback_window if self._global_cursor else 0,
|
194
203
|
)
|
195
|
-
self.
|
204
|
+
with self._lock:
|
205
|
+
self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
|
196
206
|
self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
|
197
207
|
threading.Semaphore(0)
|
198
208
|
)
|
@@ -210,16 +220,42 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
210
220
|
|
211
221
|
def _ensure_partition_limit(self) -> None:
|
212
222
|
"""
|
213
|
-
Ensure the maximum number of partitions
|
223
|
+
Ensure the maximum number of partitions does not exceed the predefined limit.
|
224
|
+
|
225
|
+
Steps:
|
226
|
+
1. Attempt to remove partitions that are marked as finished in `_finished_partitions`.
|
227
|
+
These partitions are considered processed and safe to delete.
|
228
|
+
2. If the limit is still exceeded and no finished partitions are available for removal,
|
229
|
+
remove the oldest partition unconditionally. We expect failed partitions to be removed.
|
230
|
+
|
231
|
+
Logging:
|
232
|
+
- Logs a warning each time a partition is removed, indicating whether it was finished
|
233
|
+
or removed due to being the oldest.
|
214
234
|
"""
|
215
|
-
|
216
|
-
self.
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
235
|
+
with self._lock:
|
236
|
+
while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
|
237
|
+
self._over_limit += 1
|
238
|
+
# Try removing finished partitions first
|
239
|
+
for partition_key in list(self._cursor_per_partition.keys()):
|
240
|
+
if (
|
241
|
+
partition_key in self._finished_partitions
|
242
|
+
and self._semaphore_per_partition[partition_key]._value == 0
|
243
|
+
):
|
244
|
+
oldest_partition = self._cursor_per_partition.pop(
|
245
|
+
partition_key
|
246
|
+
) # Remove the oldest finished partition
|
247
|
+
logger.warning(
|
248
|
+
f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._over_limit}."
|
249
|
+
)
|
250
|
+
break
|
251
|
+
else:
|
252
|
+
# If no finished partitions can be removed, fall back to removing the oldest partition
|
253
|
+
oldest_partition = self._cursor_per_partition.popitem(last=False)[
|
254
|
+
1
|
255
|
+
] # Remove the oldest partition
|
256
|
+
logger.warning(
|
257
|
+
f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
|
258
|
+
)
|
223
259
|
|
224
260
|
def _set_initial_state(self, stream_state: StreamState) -> None:
|
225
261
|
"""
|
@@ -264,16 +300,20 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
264
300
|
if not stream_state:
|
265
301
|
return
|
266
302
|
|
267
|
-
if
|
303
|
+
if (
|
304
|
+
self._PERPARTITION_STATE_KEY not in stream_state
|
305
|
+
and self._GLOBAL_STATE_KEY not in stream_state
|
306
|
+
):
|
268
307
|
# We assume that `stream_state` is in a global format that can be applied to all partitions.
|
269
308
|
# Example: {"global_state_format_key": "global_state_format_value"}
|
270
|
-
self.
|
271
|
-
self._new_global_cursor = deepcopy(stream_state)
|
309
|
+
self._set_global_state(stream_state)
|
272
310
|
|
273
311
|
else:
|
312
|
+
self._use_global_cursor = stream_state.get("use_global_cursor", False)
|
313
|
+
|
274
314
|
self._lookback_window = int(stream_state.get("lookback_window", 0))
|
275
315
|
|
276
|
-
for state in stream_state
|
316
|
+
for state in stream_state.get(self._PERPARTITION_STATE_KEY, []):
|
277
317
|
self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
|
278
318
|
self._create_cursor(state["cursor"])
|
279
319
|
)
|
@@ -283,8 +323,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
283
323
|
|
284
324
|
# set default state for missing partitions if it is per partition with fallback to global
|
285
325
|
if self._GLOBAL_STATE_KEY in stream_state:
|
286
|
-
self.
|
287
|
-
self._new_global_cursor = deepcopy(stream_state[self._GLOBAL_STATE_KEY])
|
326
|
+
self._set_global_state(stream_state[self._GLOBAL_STATE_KEY])
|
288
327
|
|
289
328
|
# Set initial parent state
|
290
329
|
if stream_state.get("parent_state"):
|
@@ -293,7 +332,31 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
293
332
|
# Set parent state for partition routers based on parent streams
|
294
333
|
self._partition_router.set_initial_state(stream_state)
|
295
334
|
|
335
|
+
def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
|
336
|
+
"""
|
337
|
+
Initializes the global cursor state from the provided stream state.
|
338
|
+
|
339
|
+
If the cursor field key is present in the stream state, its value is parsed,
|
340
|
+
formatted, and stored as the global cursor. This ensures consistency in state
|
341
|
+
representation across partitions.
|
342
|
+
"""
|
343
|
+
if self.cursor_field.cursor_field_key in stream_state:
|
344
|
+
global_state_value = stream_state[self.cursor_field.cursor_field_key]
|
345
|
+
final_format_global_state_value = self._connector_state_converter.output_format(
|
346
|
+
self._connector_state_converter.parse_value(global_state_value)
|
347
|
+
)
|
348
|
+
|
349
|
+
fixed_global_state = {
|
350
|
+
self.cursor_field.cursor_field_key: final_format_global_state_value
|
351
|
+
}
|
352
|
+
|
353
|
+
self._global_cursor = deepcopy(fixed_global_state)
|
354
|
+
self._new_global_cursor = deepcopy(fixed_global_state)
|
355
|
+
|
296
356
|
def observe(self, record: Record) -> None:
|
357
|
+
if not self._use_global_cursor and self.limit_reached():
|
358
|
+
self._use_global_cursor = True
|
359
|
+
|
297
360
|
if not record.associated_slice:
|
298
361
|
raise ValueError(
|
299
362
|
"Invalid state as stream slices that are emitted should refer to an existing cursor"
|
@@ -332,3 +395,6 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
332
395
|
)
|
333
396
|
cursor = self._cursor_per_partition[partition_key]
|
334
397
|
return cursor
|
398
|
+
|
399
|
+
def limit_reached(self) -> bool:
|
400
|
+
return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
|
@@ -365,14 +365,15 @@ class DatetimeBasedCursor(DeclarativeCursor):
|
|
365
365
|
options: MutableMapping[str, Any] = {}
|
366
366
|
if not stream_slice:
|
367
367
|
return options
|
368
|
+
|
368
369
|
if self.start_time_option and self.start_time_option.inject_into == option_type:
|
369
|
-
|
370
|
-
|
371
|
-
|
370
|
+
start_time_value = stream_slice.get(self._partition_field_start.eval(self.config))
|
371
|
+
self.start_time_option.inject_into_request(options, start_time_value, self.config)
|
372
|
+
|
372
373
|
if self.end_time_option and self.end_time_option.inject_into == option_type:
|
373
|
-
|
374
|
-
|
375
|
-
|
374
|
+
end_time_value = stream_slice.get(self._partition_field_end.eval(self.config))
|
375
|
+
self.end_time_option.inject_into_request(options, end_time_value, self.config)
|
376
|
+
|
376
377
|
return options
|
377
378
|
|
378
379
|
def should_be_synced(self, record: Record) -> bool:
|
@@ -26,9 +26,6 @@ from airbyte_cdk.models import (
|
|
26
26
|
from airbyte_cdk.sources.declarative.checks import COMPONENTS_CHECKER_TYPE_MAPPING
|
27
27
|
from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker
|
28
28
|
from airbyte_cdk.sources.declarative.declarative_source import DeclarativeSource
|
29
|
-
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
30
|
-
CheckStream as CheckStreamModel,
|
31
|
-
)
|
32
29
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
33
30
|
DeclarativeStream as DeclarativeStreamModel,
|
34
31
|
)
|
@@ -368,6 +365,11 @@ class ManifestDeclarativeSource(DeclarativeSource):
|
|
368
365
|
# Ensure that each stream is created with a unique name
|
369
366
|
name = dynamic_stream.get("name")
|
370
367
|
|
368
|
+
if not isinstance(name, str):
|
369
|
+
raise ValueError(
|
370
|
+
f"Expected stream name {name} to be a string, got {type(name)}."
|
371
|
+
)
|
372
|
+
|
371
373
|
if name in seen_dynamic_streams:
|
372
374
|
error_message = f"Dynamic streams list contains a duplicate name: {name}. Please contact Airbyte Support."
|
373
375
|
failure_type = FailureType.system_error
|
@@ -59,6 +59,11 @@ class CheckDynamicStream(BaseModel):
|
|
59
59
|
description="Numbers of the streams to try reading from when running a check operation.",
|
60
60
|
title="Stream Count",
|
61
61
|
)
|
62
|
+
use_check_availability: Optional[bool] = Field(
|
63
|
+
True,
|
64
|
+
description="Enables stream check availability. This field is automatically set by the CDK.",
|
65
|
+
title="Use Check Availability",
|
66
|
+
)
|
62
67
|
|
63
68
|
|
64
69
|
class ConcurrencyLevel(BaseModel):
|
@@ -736,8 +741,13 @@ class HttpResponseFilter(BaseModel):
|
|
736
741
|
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
737
742
|
|
738
743
|
|
744
|
+
class ComplexFieldType(BaseModel):
|
745
|
+
field_type: str
|
746
|
+
items: Optional[Union[str, ComplexFieldType]] = None
|
747
|
+
|
748
|
+
|
739
749
|
class TypesMap(BaseModel):
|
740
|
-
target_type: Union[str, List[str]]
|
750
|
+
target_type: Union[str, List[str], ComplexFieldType]
|
741
751
|
current_type: Union[str, List[str]]
|
742
752
|
condition: Optional[str] = None
|
743
753
|
|
@@ -1190,11 +1200,17 @@ class InjectInto(Enum):
|
|
1190
1200
|
|
1191
1201
|
class RequestOption(BaseModel):
|
1192
1202
|
type: Literal["RequestOption"]
|
1193
|
-
field_name: str = Field(
|
1194
|
-
|
1195
|
-
description="Configures which key should be used in the location that the descriptor is being injected into",
|
1203
|
+
field_name: Optional[str] = Field(
|
1204
|
+
None,
|
1205
|
+
description="Configures which key should be used in the location that the descriptor is being injected into. We hope to eventually deprecate this field in favor of `field_path` for all request_options, but must currently maintain it for backwards compatibility in the Builder.",
|
1196
1206
|
examples=["segment_id"],
|
1197
|
-
title="
|
1207
|
+
title="Field Name",
|
1208
|
+
)
|
1209
|
+
field_path: Optional[List[str]] = Field(
|
1210
|
+
None,
|
1211
|
+
description="Configures a path to be used for nested structures in JSON body requests (e.g. GraphQL queries)",
|
1212
|
+
examples=[["data", "viewer", "id"]],
|
1213
|
+
title="Field Path",
|
1198
1214
|
)
|
1199
1215
|
inject_into: InjectInto = Field(
|
1200
1216
|
...,
|
@@ -2260,6 +2276,7 @@ class DynamicDeclarativeStream(BaseModel):
|
|
2260
2276
|
)
|
2261
2277
|
|
2262
2278
|
|
2279
|
+
ComplexFieldType.update_forward_refs()
|
2263
2280
|
CompositeErrorHandler.update_forward_refs()
|
2264
2281
|
DeclarativeSource1.update_forward_refs()
|
2265
2282
|
DeclarativeSource2.update_forward_refs()
|
@@ -133,6 +133,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
|
|
133
133
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
134
134
|
CheckStream as CheckStreamModel,
|
135
135
|
)
|
136
|
+
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
137
|
+
ComplexFieldType as ComplexFieldTypeModel,
|
138
|
+
)
|
136
139
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
137
140
|
ComponentMappingDefinition as ComponentMappingDefinitionModel,
|
138
141
|
)
|
@@ -429,6 +432,7 @@ from airbyte_cdk.sources.declarative.retrievers import (
|
|
429
432
|
SimpleRetrieverTestReadDecorator,
|
430
433
|
)
|
431
434
|
from airbyte_cdk.sources.declarative.schema import (
|
435
|
+
ComplexFieldType,
|
432
436
|
DefaultSchemaLoader,
|
433
437
|
DynamicSchemaLoader,
|
434
438
|
InlineSchemaLoader,
|
@@ -503,6 +507,7 @@ class ModelToComponentFactory:
|
|
503
507
|
disable_cache: bool = False,
|
504
508
|
disable_resumable_full_refresh: bool = False,
|
505
509
|
message_repository: Optional[MessageRepository] = None,
|
510
|
+
connector_state_manager: Optional[ConnectorStateManager] = None,
|
506
511
|
):
|
507
512
|
self._init_mappings()
|
508
513
|
self._limit_pages_fetched_per_slice = limit_pages_fetched_per_slice
|
@@ -514,6 +519,7 @@ class ModelToComponentFactory:
|
|
514
519
|
self._message_repository = message_repository or InMemoryMessageRepository(
|
515
520
|
self._evaluate_log_level(emit_connector_builder_messages)
|
516
521
|
)
|
522
|
+
self._connector_state_manager = connector_state_manager or ConnectorStateManager()
|
517
523
|
|
518
524
|
def _init_mappings(self) -> None:
|
519
525
|
self.PYDANTIC_MODEL_TO_CONSTRUCTOR: Mapping[Type[BaseModel], Callable[..., Any]] = {
|
@@ -572,6 +578,7 @@ class ModelToComponentFactory:
|
|
572
578
|
DynamicSchemaLoaderModel: self.create_dynamic_schema_loader,
|
573
579
|
SchemaTypeIdentifierModel: self.create_schema_type_identifier,
|
574
580
|
TypesMapModel: self.create_types_map,
|
581
|
+
ComplexFieldTypeModel: self.create_complex_field_type,
|
575
582
|
JwtAuthenticatorModel: self.create_jwt_authenticator,
|
576
583
|
LegacyToPerPartitionStateMigrationModel: self.create_legacy_to_per_partition_state_migration,
|
577
584
|
ListPartitionRouterModel: self.create_list_partition_router,
|
@@ -726,8 +733,8 @@ class ModelToComponentFactory:
|
|
726
733
|
}
|
727
734
|
return names_to_types[value_type]
|
728
735
|
|
729
|
-
@staticmethod
|
730
736
|
def create_api_key_authenticator(
|
737
|
+
self,
|
731
738
|
model: ApiKeyAuthenticatorModel,
|
732
739
|
config: Config,
|
733
740
|
token_provider: Optional[TokenProvider] = None,
|
@@ -749,10 +756,8 @@ class ModelToComponentFactory:
|
|
749
756
|
)
|
750
757
|
|
751
758
|
request_option = (
|
752
|
-
|
753
|
-
inject_into=
|
754
|
-
field_name=model.inject_into.field_name,
|
755
|
-
parameters=model.parameters or {},
|
759
|
+
self._create_component_from_model(
|
760
|
+
model.inject_into, config, parameters=model.parameters or {}
|
756
761
|
)
|
757
762
|
if model.inject_into
|
758
763
|
else RequestOption(
|
@@ -761,6 +766,7 @@ class ModelToComponentFactory:
|
|
761
766
|
parameters=model.parameters or {},
|
762
767
|
)
|
763
768
|
)
|
769
|
+
|
764
770
|
return ApiKeyAuthenticator(
|
765
771
|
token_provider=(
|
766
772
|
token_provider
|
@@ -842,7 +848,7 @@ class ModelToComponentFactory:
|
|
842
848
|
token_provider=token_provider,
|
843
849
|
)
|
844
850
|
else:
|
845
|
-
return
|
851
|
+
return self.create_api_key_authenticator(
|
846
852
|
ApiKeyAuthenticatorModel(
|
847
853
|
type="ApiKeyAuthenticator",
|
848
854
|
api_token="",
|
@@ -896,7 +902,15 @@ class ModelToComponentFactory:
|
|
896
902
|
def create_check_dynamic_stream(
|
897
903
|
model: CheckDynamicStreamModel, config: Config, **kwargs: Any
|
898
904
|
) -> CheckDynamicStream:
|
899
|
-
|
905
|
+
assert model.use_check_availability is not None # for mypy
|
906
|
+
|
907
|
+
use_check_availability = model.use_check_availability
|
908
|
+
|
909
|
+
return CheckDynamicStream(
|
910
|
+
stream_count=model.stream_count,
|
911
|
+
use_check_availability=use_check_availability,
|
912
|
+
parameters={},
|
913
|
+
)
|
900
914
|
|
901
915
|
def create_composite_error_handler(
|
902
916
|
self, model: CompositeErrorHandlerModel, config: Config, **kwargs: Any
|
@@ -922,17 +936,24 @@ class ModelToComponentFactory:
|
|
922
936
|
|
923
937
|
def create_concurrent_cursor_from_datetime_based_cursor(
|
924
938
|
self,
|
925
|
-
state_manager: ConnectorStateManager,
|
926
939
|
model_type: Type[BaseModel],
|
927
940
|
component_definition: ComponentDefinition,
|
928
941
|
stream_name: str,
|
929
942
|
stream_namespace: Optional[str],
|
930
943
|
config: Config,
|
931
|
-
stream_state: MutableMapping[str, Any],
|
932
944
|
message_repository: Optional[MessageRepository] = None,
|
933
945
|
runtime_lookback_window: Optional[datetime.timedelta] = None,
|
934
946
|
**kwargs: Any,
|
935
947
|
) -> ConcurrentCursor:
|
948
|
+
# Per-partition incremental streams can dynamically create child cursors which will pass their current
|
949
|
+
# state via the stream_state keyword argument. Incremental syncs without parent streams use the
|
950
|
+
# incoming state and connector_state_manager that is initialized when the component factory is created
|
951
|
+
stream_state = (
|
952
|
+
self._connector_state_manager.get_stream_state(stream_name, stream_namespace)
|
953
|
+
if "stream_state" not in kwargs
|
954
|
+
else kwargs["stream_state"]
|
955
|
+
)
|
956
|
+
|
936
957
|
component_type = component_definition.get("type")
|
937
958
|
if component_definition.get("type") != model_type.__name__:
|
938
959
|
raise ValueError(
|
@@ -1126,7 +1147,7 @@ class ModelToComponentFactory:
|
|
1126
1147
|
stream_namespace=stream_namespace,
|
1127
1148
|
stream_state=stream_state,
|
1128
1149
|
message_repository=message_repository or self._message_repository,
|
1129
|
-
connector_state_manager=
|
1150
|
+
connector_state_manager=self._connector_state_manager,
|
1130
1151
|
connector_state_converter=connector_state_converter,
|
1131
1152
|
cursor_field=cursor_field,
|
1132
1153
|
slice_boundary_fields=slice_boundary_fields,
|
@@ -1188,6 +1209,22 @@ class ModelToComponentFactory:
|
|
1188
1209
|
)
|
1189
1210
|
cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
|
1190
1211
|
|
1212
|
+
datetime_format = datetime_based_cursor_model.datetime_format
|
1213
|
+
|
1214
|
+
cursor_granularity = (
|
1215
|
+
parse_duration(datetime_based_cursor_model.cursor_granularity)
|
1216
|
+
if datetime_based_cursor_model.cursor_granularity
|
1217
|
+
else None
|
1218
|
+
)
|
1219
|
+
|
1220
|
+
connector_state_converter: DateTimeStreamStateConverter
|
1221
|
+
connector_state_converter = CustomFormatConcurrentStreamStateConverter(
|
1222
|
+
datetime_format=datetime_format,
|
1223
|
+
input_datetime_formats=datetime_based_cursor_model.cursor_datetime_formats,
|
1224
|
+
is_sequential_state=True, # ConcurrentPerPartitionCursor only works with sequential state
|
1225
|
+
cursor_granularity=cursor_granularity,
|
1226
|
+
)
|
1227
|
+
|
1191
1228
|
# Create the cursor factory
|
1192
1229
|
cursor_factory = ConcurrentCursorFactory(
|
1193
1230
|
partial(
|
@@ -1211,6 +1248,7 @@ class ModelToComponentFactory:
|
|
1211
1248
|
stream_state=stream_state,
|
1212
1249
|
message_repository=self._message_repository, # type: ignore
|
1213
1250
|
connector_state_manager=state_manager,
|
1251
|
+
connector_state_converter=connector_state_converter,
|
1214
1252
|
cursor_field=cursor_field,
|
1215
1253
|
)
|
1216
1254
|
|
@@ -1450,19 +1488,15 @@ class ModelToComponentFactory:
|
|
1450
1488
|
)
|
1451
1489
|
|
1452
1490
|
end_time_option = (
|
1453
|
-
|
1454
|
-
|
1455
|
-
field_name=model.end_time_option.field_name,
|
1456
|
-
parameters=model.parameters or {},
|
1491
|
+
self._create_component_from_model(
|
1492
|
+
model.end_time_option, config, parameters=model.parameters or {}
|
1457
1493
|
)
|
1458
1494
|
if model.end_time_option
|
1459
1495
|
else None
|
1460
1496
|
)
|
1461
1497
|
start_time_option = (
|
1462
|
-
|
1463
|
-
|
1464
|
-
field_name=model.start_time_option.field_name,
|
1465
|
-
parameters=model.parameters or {},
|
1498
|
+
self._create_component_from_model(
|
1499
|
+
model.start_time_option, config, parameters=model.parameters or {}
|
1466
1500
|
)
|
1467
1501
|
if model.start_time_option
|
1468
1502
|
else None
|
@@ -1533,19 +1567,15 @@ class ModelToComponentFactory:
|
|
1533
1567
|
cursor_model = model.incremental_sync
|
1534
1568
|
|
1535
1569
|
end_time_option = (
|
1536
|
-
|
1537
|
-
|
1538
|
-
field_name=cursor_model.end_time_option.field_name,
|
1539
|
-
parameters=cursor_model.parameters or {},
|
1570
|
+
self._create_component_from_model(
|
1571
|
+
cursor_model.end_time_option, config, parameters=cursor_model.parameters or {}
|
1540
1572
|
)
|
1541
1573
|
if cursor_model.end_time_option
|
1542
1574
|
else None
|
1543
1575
|
)
|
1544
1576
|
start_time_option = (
|
1545
|
-
|
1546
|
-
|
1547
|
-
field_name=cursor_model.start_time_option.field_name,
|
1548
|
-
parameters=cursor_model.parameters or {},
|
1577
|
+
self._create_component_from_model(
|
1578
|
+
cursor_model.start_time_option, config, parameters=cursor_model.parameters or {}
|
1549
1579
|
)
|
1550
1580
|
if cursor_model.start_time_option
|
1551
1581
|
else None
|
@@ -1617,7 +1647,7 @@ class ModelToComponentFactory:
|
|
1617
1647
|
) -> Optional[PartitionRouter]:
|
1618
1648
|
if (
|
1619
1649
|
hasattr(model, "partition_router")
|
1620
|
-
and isinstance(model, SimpleRetrieverModel)
|
1650
|
+
and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel)
|
1621
1651
|
and model.partition_router
|
1622
1652
|
):
|
1623
1653
|
stream_slicer_model = model.partition_router
|
@@ -1651,6 +1681,31 @@ class ModelToComponentFactory:
|
|
1651
1681
|
stream_slicer = self._build_stream_slicer_from_partition_router(model.retriever, config)
|
1652
1682
|
|
1653
1683
|
if model.incremental_sync and stream_slicer:
|
1684
|
+
if model.retriever.type == "AsyncRetriever":
|
1685
|
+
if model.incremental_sync.type != "DatetimeBasedCursor":
|
1686
|
+
# We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the support of unordered slices (for example, when we trigger reports for January and February, the report in February can be completed first). Once we have support for custom concurrent cursor or have a new implementation available in the CDK, we can enable more cursors here.
|
1687
|
+
raise ValueError(
|
1688
|
+
"AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet"
|
1689
|
+
)
|
1690
|
+
if stream_slicer:
|
1691
|
+
return self.create_concurrent_cursor_from_perpartition_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
|
1692
|
+
state_manager=self._connector_state_manager,
|
1693
|
+
model_type=DatetimeBasedCursorModel,
|
1694
|
+
component_definition=model.incremental_sync.__dict__,
|
1695
|
+
stream_name=model.name or "",
|
1696
|
+
stream_namespace=None,
|
1697
|
+
config=config or {},
|
1698
|
+
stream_state={},
|
1699
|
+
partition_router=stream_slicer,
|
1700
|
+
)
|
1701
|
+
return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
|
1702
|
+
model_type=DatetimeBasedCursorModel,
|
1703
|
+
component_definition=model.incremental_sync.__dict__,
|
1704
|
+
stream_name=model.name or "",
|
1705
|
+
stream_namespace=None,
|
1706
|
+
config=config or {},
|
1707
|
+
)
|
1708
|
+
|
1654
1709
|
incremental_sync_model = model.incremental_sync
|
1655
1710
|
if (
|
1656
1711
|
hasattr(incremental_sync_model, "global_substream_cursor")
|
@@ -1676,6 +1731,22 @@ class ModelToComponentFactory:
|
|
1676
1731
|
stream_cursor=cursor_component,
|
1677
1732
|
)
|
1678
1733
|
elif model.incremental_sync:
|
1734
|
+
if model.retriever.type == "AsyncRetriever":
|
1735
|
+
if model.incremental_sync.type != "DatetimeBasedCursor":
|
1736
|
+
# We are currently in a transition to the Concurrent CDK and AsyncRetriever can only work with the support of unordered slices (for example, when we trigger reports for January and February, the report in February can be completed first). Once we have support for custom concurrent cursor or have a new implementation available in the CDK, we can enable more cursors here.
|
1737
|
+
raise ValueError(
|
1738
|
+
"AsyncRetriever with cursor other than DatetimeBasedCursor is not supported yet"
|
1739
|
+
)
|
1740
|
+
if model.retriever.partition_router:
|
1741
|
+
# Note that this development is also done in parallel to the per partition development which once merged we could support here by calling `create_concurrent_cursor_from_perpartition_cursor`
|
1742
|
+
raise ValueError("Per partition state is not supported yet for AsyncRetriever")
|
1743
|
+
return self.create_concurrent_cursor_from_datetime_based_cursor( # type: ignore # This is a known issue that we are creating and returning a ConcurrentCursor which does not technically implement the (low-code) StreamSlicer. However, (low-code) StreamSlicer and ConcurrentCursor both implement StreamSlicer.stream_slices() which is the primary method needed for checkpointing
|
1744
|
+
model_type=DatetimeBasedCursorModel,
|
1745
|
+
component_definition=model.incremental_sync.__dict__,
|
1746
|
+
stream_name=model.name or "",
|
1747
|
+
stream_namespace=None,
|
1748
|
+
config=config or {},
|
1749
|
+
)
|
1679
1750
|
return (
|
1680
1751
|
self._create_component_from_model(model=model.incremental_sync, config=config)
|
1681
1752
|
if model.incremental_sync
|
@@ -1894,10 +1965,26 @@ class ModelToComponentFactory:
|
|
1894
1965
|
) -> InlineSchemaLoader:
|
1895
1966
|
return InlineSchemaLoader(schema=model.schema_ or {}, parameters={})
|
1896
1967
|
|
1897
|
-
|
1898
|
-
|
1968
|
+
def create_complex_field_type(
|
1969
|
+
self, model: ComplexFieldTypeModel, config: Config, **kwargs: Any
|
1970
|
+
) -> ComplexFieldType:
|
1971
|
+
items = (
|
1972
|
+
self._create_component_from_model(model=model.items, config=config)
|
1973
|
+
if isinstance(model.items, ComplexFieldTypeModel)
|
1974
|
+
else model.items
|
1975
|
+
)
|
1976
|
+
|
1977
|
+
return ComplexFieldType(field_type=model.field_type, items=items)
|
1978
|
+
|
1979
|
+
def create_types_map(self, model: TypesMapModel, config: Config, **kwargs: Any) -> TypesMap:
|
1980
|
+
target_type = (
|
1981
|
+
self._create_component_from_model(model=model.target_type, config=config)
|
1982
|
+
if isinstance(model.target_type, ComplexFieldTypeModel)
|
1983
|
+
else model.target_type
|
1984
|
+
)
|
1985
|
+
|
1899
1986
|
return TypesMap(
|
1900
|
-
target_type=
|
1987
|
+
target_type=target_type,
|
1901
1988
|
current_type=model.current_type,
|
1902
1989
|
condition=model.condition if model.condition is not None else "True",
|
1903
1990
|
)
|
@@ -2054,16 +2141,11 @@ class ModelToComponentFactory:
|
|
2054
2141
|
additional_jwt_payload=model.additional_jwt_payload,
|
2055
2142
|
)
|
2056
2143
|
|
2057
|
-
@staticmethod
|
2058
2144
|
def create_list_partition_router(
|
2059
|
-
model: ListPartitionRouterModel, config: Config, **kwargs: Any
|
2145
|
+
self, model: ListPartitionRouterModel, config: Config, **kwargs: Any
|
2060
2146
|
) -> ListPartitionRouter:
|
2061
2147
|
request_option = (
|
2062
|
-
|
2063
|
-
inject_into=RequestOptionType(model.request_option.inject_into.value),
|
2064
|
-
field_name=model.request_option.field_name,
|
2065
|
-
parameters=model.parameters or {},
|
2066
|
-
)
|
2148
|
+
self._create_component_from_model(model.request_option, config)
|
2067
2149
|
if model.request_option
|
2068
2150
|
else None
|
2069
2151
|
)
|
@@ -2259,7 +2341,25 @@ class ModelToComponentFactory:
|
|
2259
2341
|
model: RequestOptionModel, config: Config, **kwargs: Any
|
2260
2342
|
) -> RequestOption:
|
2261
2343
|
inject_into = RequestOptionType(model.inject_into.value)
|
2262
|
-
|
2344
|
+
field_path: Optional[List[Union[InterpolatedString, str]]] = (
|
2345
|
+
[
|
2346
|
+
InterpolatedString.create(segment, parameters=kwargs.get("parameters", {}))
|
2347
|
+
for segment in model.field_path
|
2348
|
+
]
|
2349
|
+
if model.field_path
|
2350
|
+
else None
|
2351
|
+
)
|
2352
|
+
field_name = (
|
2353
|
+
InterpolatedString.create(model.field_name, parameters=kwargs.get("parameters", {}))
|
2354
|
+
if model.field_name
|
2355
|
+
else None
|
2356
|
+
)
|
2357
|
+
return RequestOption(
|
2358
|
+
field_name=field_name,
|
2359
|
+
field_path=field_path,
|
2360
|
+
inject_into=inject_into,
|
2361
|
+
parameters=kwargs.get("parameters", {}),
|
2362
|
+
)
|
2263
2363
|
|
2264
2364
|
def create_record_selector(
|
2265
2365
|
self,
|