airbyte-cdk 6.17.1.dev0__py3-none-any.whl → 6.18.0__py3-none-any.whl
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +0 -69
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +5 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +5 -3
- airbyte_cdk/sources/declarative/incremental/__init__.py +0 -6
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +0 -14
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +26 -12
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +26 -79
- airbyte_cdk/sources/declarative/requesters/README.md +57 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +33 -4
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -1
- airbyte_cdk/sources/types.py +3 -0
- {airbyte_cdk-6.17.1.dev0.dist-info → airbyte_cdk-6.18.0.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.17.1.dev0.dist-info → airbyte_cdk-6.18.0.dist-info}/RECORD +16 -16
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +0 -346
- {airbyte_cdk-6.17.1.dev0.dist-info → airbyte_cdk-6.18.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.17.1.dev0.dist-info → airbyte_cdk-6.18.0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.17.1.dev0.dist-info → airbyte_cdk-6.18.0.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/concurrent_declarative_source.py CHANGED
@@ -20,9 +20,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
     ClientSideIncrementalRecordFilterDecorator,
 )
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
-from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
-    PerPartitionWithGlobalCursor,
-)
 from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
@@ -307,72 +304,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                             cursor=final_state_cursor,
                         )
                     )
-                elif (
-                    incremental_sync_component_definition
-                    and incremental_sync_component_definition.get("type", "")
-                    == DatetimeBasedCursorModel.__name__
-                    and self._stream_supports_concurrent_partition_processing(
-                        declarative_stream=declarative_stream
-                    )
-                    and hasattr(declarative_stream.retriever, "stream_slicer")
-                    and isinstance(
-                        declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor
-                    )
-                ):
-                    stream_state = state_manager.get_stream_state(
-                        stream_name=declarative_stream.name, namespace=declarative_stream.namespace
-                    )
-                    partition_router = declarative_stream.retriever.stream_slicer._partition_router
-
-                    cursor = self._constructor.create_concurrent_cursor_from_perpartition_cursor(
-                        state_manager=state_manager,
-                        model_type=DatetimeBasedCursorModel,
-                        component_definition=incremental_sync_component_definition,
-                        stream_name=declarative_stream.name,
-                        stream_namespace=declarative_stream.namespace,
-                        config=config or {},
-                        stream_state=stream_state,
-                        partition_router=partition_router,
-                    )
-
-                    retriever = declarative_stream.retriever
-
-                    # This is an optimization so that we don't invoke any cursor or state management flows within the
-                    # low-code framework because state management is handled through the ConcurrentCursor.
-                    if declarative_stream and isinstance(retriever, SimpleRetriever):
-                        # Also a temporary hack. In the legacy Stream implementation, as part of the read,
-                        # set_initial_state() is called to instantiate incoming state on the cursor. Although we no
-                        # longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components
-                        # like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator
-                        # still rely on a DatetimeBasedCursor that is properly initialized with state.
-                        if retriever.cursor:
-                            retriever.cursor.set_initial_state(stream_state=stream_state)
-                        # We zero it out here, but since this is a cursor reference, the state is still properly
-                        # instantiated for the other components that reference it
-                        retriever.cursor = None
-
-                    partition_generator = StreamSlicerPartitionGenerator(
-                        DeclarativePartitionFactory(
-                            declarative_stream.name,
-                            declarative_stream.get_json_schema(),
-                            retriever,
-                            self.message_repository,
-                        ),
-                        cursor,
-                    )
-
-                    concurrent_streams.append(
-                        DefaultStream(
-                            partition_generator=partition_generator,
-                            name=declarative_stream.name,
-                            json_schema=declarative_stream.get_json_schema(),
-                            availability_strategy=AlwaysAvailableAvailabilityStrategy(),
-                            primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
-                            cursor_field=cursor.cursor_field.cursor_field_key,
-                            logger=self.logger,
-                            cursor=cursor,
-                        )
-                    )
                 else:
                     synchronous_streams.append(declarative_stream)
             else:
airbyte_cdk/sources/declarative/declarative_component_schema.yaml CHANGED
@@ -2977,6 +2977,11 @@ definitions:
         anyOf:
           - "$ref": "#/definitions/CustomRequester"
           - "$ref": "#/definitions/HttpRequester"
+      url_requester:
+        description: Requester component that describes how to prepare HTTP requests to send to the source API to extract the url from polling response by the completed async job.
+        anyOf:
+          - "$ref": "#/definitions/CustomRequester"
+          - "$ref": "#/definitions/HttpRequester"
       download_requester:
         description: Requester component that describes how to prepare HTTP requests to send to the source API to download the data provided by the completed async job.
         anyOf:
airbyte_cdk/sources/declarative/extractors/record_filter.py CHANGED
@@ -59,11 +59,13 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
 
     def __init__(
         self,
-        cursor: Union[DatetimeBasedCursor, PerPartitionWithGlobalCursor, GlobalSubstreamCursor],
+        date_time_based_cursor: DatetimeBasedCursor,
+        substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
-        self._cursor = cursor
+        self._date_time_based_cursor = date_time_based_cursor
+        self._substream_cursor = substream_cursor
 
     def filter_records(
         self,
@@ -75,7 +77,7 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
         records = (
             record
             for record in records
-            if self._cursor.should_be_synced(
+            if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
                 # Record is created on the fly to align with cursors interface; stream name is ignored as we don't need it here
                 # Record stream name is empty cause it is not used durig the filtering
                 Record(data=record, associated_slice=stream_slice, stream_name="")
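The restored filter keeps two cursors and picks one at call time: the substream cursor wins when the stream is partitioned, otherwise the plain date-time cursor decides. A minimal sketch of that selection rule, with hypothetical stub cursors standing in for the real ones (only `should_be_synced` matters here):

```python
from typing import Any, List, Optional


class StubCursor:
    """Hypothetical stand-in exposing only should_be_synced()."""

    def __init__(self, threshold: int) -> None:
        self._threshold = threshold

    def should_be_synced(self, record: Any) -> bool:
        # Keep records whose value is at or past the stored threshold.
        return record["value"] >= self._threshold


def filter_records(
    records: List[dict],
    date_time_based_cursor: StubCursor,
    substream_cursor: Optional[StubCursor],
) -> List[dict]:
    # Same rule as ClientSideIncrementalRecordFilterDecorator: prefer the
    # substream cursor when one exists, else fall back to the date-time cursor.
    cursor = substream_cursor or date_time_based_cursor
    return [record for record in records if cursor.should_be_synced(record)]


print(filter_records([{"value": 1}, {"value": 5}], StubCursor(3), None))  # [{'value': 5}]
```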
airbyte_cdk/sources/declarative/incremental/__init__.py CHANGED
@@ -2,10 +2,6 @@
 # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 #
 
-from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import (
-    ConcurrentCursorFactory,
-    ConcurrentPerPartitionCursor,
-)
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
 from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
@@ -25,8 +21,6 @@ from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor i
 
 __all__ = [
     "CursorFactory",
-    "ConcurrentCursorFactory",
-    "ConcurrentPerPartitionCursor",
     "DatetimeBasedCursor",
     "DeclarativeCursor",
     "GlobalSubstreamCursor",
airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py CHANGED
@@ -303,20 +303,6 @@ class PerPartitionCursor(DeclarativeCursor):
         raise ValueError("A partition needs to be provided in order to get request body json")
 
     def should_be_synced(self, record: Record) -> bool:
-        if (
-            self._to_partition_key(record.associated_slice.partition)
-            not in self._cursor_per_partition
-        ):
-            partition_state = (
-                self._state_to_migrate_from
-                if self._state_to_migrate_from
-                else self._NO_CURSOR_STATE
-            )
-            cursor = self._create_cursor(partition_state)
-
-            self._cursor_per_partition[
-                self._to_partition_key(record.associated_slice.partition)
-            ] = cursor
         return self._get_cursor(record).should_be_synced(
             self._convert_record_to_cursor_record(record)
         )
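With the lazy-creation branch gone, `should_be_synced` now assumes the record's partition already has a cursor and defers entirely to `_get_cursor`, which raises for unknown partitions. A small sketch of the resulting lookup contract (stub class with hypothetical names):

```python
from typing import Any, Dict


class PartitionCursorRegistry:
    """Stub mirroring the post-change contract: lookup only, no lazy creation."""

    def __init__(self) -> None:
        self._cursor_per_partition: Dict[str, Any] = {}

    def register(self, partition_key: str, cursor: Any) -> None:
        # Cursors are created while slicing, before any record is filtered.
        self._cursor_per_partition[partition_key] = cursor

    def get(self, partition_key: str) -> Any:
        if partition_key not in self._cursor_per_partition:
            # Same failure mode as PerPartitionCursor._get_cursor for unknown partitions.
            raise ValueError(
                "Invalid state as stream slices that are emitted should refer to an existing cursor"
            )
        return self._cursor_per_partition[partition_key]
```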
airbyte_cdk/sources/declarative/models/declarative_component_schema.py CHANGED
@@ -737,33 +737,43 @@ class KeysToSnakeCase(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
+class FlattenFields(BaseModel):
+    type: Literal["FlattenFields"]
+    flatten_lists: Optional[bool] = Field(
+        True,
+        description="Whether to flatten lists or leave it as is. Default is True.",
+        title="Flatten Lists",
+    )
+    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
+
+
 class KeysReplace(BaseModel):
     type: Literal["KeysReplace"]
     old: str = Field(
         ...,
         description="Old value to replace.",
-        examples=[" ", "{{ record.id }}", "{{ config['id'] }}", "{{ stream_slice['id'] }}"],
+        examples=[
+            " ",
+            "{{ record.id }}",
+            "{{ config['id'] }}",
+            "{{ stream_slice['id'] }}",
+        ],
         title="Old value",
     )
     new: str = Field(
         ...,
         description="New value to set.",
-        examples=["_", "{{ record.id }}", "{{ config['id'] }}", "{{ stream_slice['id'] }}"],
+        examples=[
+            "_",
+            "{{ record.id }}",
+            "{{ config['id'] }}",
+            "{{ stream_slice['id'] }}",
+        ],
         title="New value",
     )
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
-class FlattenFields(BaseModel):
-    type: Literal["FlattenFields"]
-    flatten_lists: Optional[bool] = Field(
-        True,
-        description="Whether to flatten lists or leave it as is. Default is True.",
-        title="Flatten Lists",
-    )
-    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
-
-
 class IterableDecoder(BaseModel):
     type: Literal["IterableDecoder"]
 
@@ -2040,6 +2050,10 @@ class AsyncRetriever(BaseModel):
         ...,
         description="Requester component that describes how to prepare HTTP requests to send to the source API to fetch the status of the running async job.",
     )
+    url_requester: Optional[Union[CustomRequester, HttpRequester]] = Field(
+        None,
+        description="Requester component that describes how to prepare HTTP requests to send to the source API to extract the url from polling response by the completed async job.",
+    )
     download_requester: Union[CustomRequester, HttpRequester] = Field(
         ...,
         description="Requester component that describes how to prepare HTTP requests to send to the source API to download the data provided by the completed async job.",
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py CHANGED
@@ -84,8 +84,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
 )
 from airbyte_cdk.sources.declarative.incremental import (
     ChildPartitionResumableFullRefreshCursor,
-    ConcurrentCursorFactory,
-    ConcurrentPerPartitionCursor,
     CursorFactory,
     DatetimeBasedCursor,
     DeclarativeCursor,
@@ -440,7 +438,6 @@ from airbyte_cdk.sources.message import (
     InMemoryMessageRepository,
     LogAppenderMessageRepositoryDecorator,
     MessageRepository,
-    NoopMessageRepository,
 )
 from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
 from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
@@ -874,8 +871,6 @@ class ModelToComponentFactory:
         stream_namespace: Optional[str],
         config: Config,
         stream_state: MutableMapping[str, Any],
-        message_repository: Optional[MessageRepository] = None,
-        runtime_lookback_window: Optional[int] = None,
         **kwargs: Any,
     ) -> ConcurrentCursor:
         component_type = component_definition.get("type")
@@ -933,11 +928,6 @@ class ModelToComponentFactory:
         if evaluated_lookback_window:
             lookback_window = parse_duration(evaluated_lookback_window)
 
-        if runtime_lookback_window and lookback_window:
-            lookback_window = max(lookback_window, runtime_lookback_window)
-        elif runtime_lookback_window:
-            lookback_window = runtime_lookback_window
-
         connector_state_converter: DateTimeStreamStateConverter
         connector_state_converter = CustomFormatConcurrentStreamStateConverter(
             datetime_format=datetime_format,
@@ -1016,7 +1006,7 @@ class ModelToComponentFactory:
             stream_name=stream_name,
             stream_namespace=stream_namespace,
             stream_state=stream_state,
-            message_repository=message_repository or self._message_repository,
+            message_repository=self._message_repository,
             connector_state_manager=state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
@@ -1028,63 +1018,6 @@ class ModelToComponentFactory:
             cursor_granularity=cursor_granularity,
         )
 
-    def create_concurrent_cursor_from_perpartition_cursor(
-        self,
-        state_manager: ConnectorStateManager,
-        model_type: Type[BaseModel],
-        component_definition: ComponentDefinition,
-        stream_name: str,
-        stream_namespace: Optional[str],
-        config: Config,
-        stream_state: MutableMapping[str, Any],
-        partition_router,
-        **kwargs: Any,
-    ) -> ConcurrentPerPartitionCursor:
-        component_type = component_definition.get("type")
-        if component_definition.get("type") != model_type.__name__:
-            raise ValueError(
-                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
-            )
-
-        datetime_based_cursor_model = model_type.parse_obj(component_definition)
-
-        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
-            raise ValueError(
-                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
-            )
-
-        interpolated_cursor_field = InterpolatedString.create(
-            datetime_based_cursor_model.cursor_field,
-            parameters=datetime_based_cursor_model.parameters or {},
-        )
-        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
-
-        # Create the cursor factory
-        cursor_factory = ConcurrentCursorFactory(
-            partial(
-                self.create_concurrent_cursor_from_datetime_based_cursor,
-                state_manager=state_manager,
-                model_type=model_type,
-                component_definition=component_definition,
-                stream_name=stream_name,
-                stream_namespace=stream_namespace,
-                config=config,
-                message_repository=NoopMessageRepository(),
-            )
-        )
-
-        # Return the concurrent cursor and state converter
-        return ConcurrentPerPartitionCursor(
-            cursor_factory=cursor_factory,
-            partition_router=partition_router,
-            stream_name=stream_name,
-            stream_namespace=stream_namespace,
-            stream_state=stream_state,
-            message_repository=self._message_repository,  # type: ignore
-            connector_state_manager=state_manager,
-            cursor_field=cursor_field,
-        )
-
     @staticmethod
     def create_constant_backoff_strategy(
         model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
@@ -1367,15 +1300,18 @@ class ModelToComponentFactory:
                 raise ValueError(
                     "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
                 )
-
-
-
-
-
-
-
-
-
+            client_side_incremental_sync = {
+                "date_time_based_cursor": self._create_component_from_model(
+                    model=model.incremental_sync, config=config
+                ),
+                "substream_cursor": (
+                    combined_slicers
+                    if isinstance(
+                        combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
+                    )
+                    else None
+                ),
+            }
 
         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
             cursor_model = model.incremental_sync
@@ -2191,7 +2127,7 @@ class ModelToComponentFactory:
         if (
             not isinstance(stream_slicer, DatetimeBasedCursor)
             or type(stream_slicer) is not DatetimeBasedCursor
-        )
+        ):
             # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
             # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
             # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
@@ -2351,7 +2287,7 @@ class ModelToComponentFactory:
             extractor=download_extractor,
             name=name,
             record_filter=None,
-            transformations=[],
+            transformations=transformations,
             schema_normalization=TypeTransformer(TransformConfig.NoTransform),
             config=config,
             parameters={},
@@ -2388,6 +2324,16 @@ class ModelToComponentFactory:
             if model.delete_requester
             else None
         )
+        url_requester = (
+            self._create_component_from_model(
+                model=model.url_requester,
+                decoder=decoder,
+                config=config,
+                name=f"job extract_url - {name}",
+            )
+            if model.url_requester
+            else None
+        )
         status_extractor = self._create_component_from_model(
             model=model.status_extractor, decoder=decoder, config=config, name=name
         )
@@ -2398,6 +2344,7 @@ class ModelToComponentFactory:
             creation_requester=creation_requester,
             polling_requester=polling_requester,
             download_retriever=download_retriever,
+            url_requester=url_requester,
             abort_requester=abort_requester,
             delete_requester=delete_requester,
             status_extractor=status_extractor,
airbyte_cdk/sources/declarative/requesters/README.md ADDED
@@ -0,0 +1,57 @@
+# AsyncHttpJobRepository sequence diagram
+
+- Components marked as optional are not required and can be ignored.
+- if `url_requester` is not provided, `urls_extractor` will get urls from the `polling_job_response`
+- interpolation_context, e.g. `create_job_response` or `polling_job_response` can be obtained from stream_slice
+
+
+```mermaid
+---
+title: AsyncHttpJobRepository Sequence Diagram
+---
+sequenceDiagram
+    participant AsyncHttpJobRepository as AsyncOrchestrator
+    participant CreationRequester as creation_requester
+    participant PollingRequester as polling_requester
+    participant UrlRequester as url_requester (Optional)
+    participant DownloadRetriever as download_retriever
+    participant AbortRequester as abort_requester (Optional)
+    participant DeleteRequester as delete_requester (Optional)
+    participant Reporting Server as Async Reporting Server
+
+    AsyncHttpJobRepository ->> CreationRequester: Initiate job creation
+    CreationRequester ->> Reporting Server: Create job request
+    Reporting Server -->> CreationRequester: Job ID response
+    CreationRequester -->> AsyncHttpJobRepository: Job ID
+
+    loop Poll for job status
+        AsyncHttpJobRepository ->> PollingRequester: Check job status
+        PollingRequester ->> Reporting Server: Status request (interpolation_context: `create_job_response`)
+        Reporting Server -->> PollingRequester: Status response
+        PollingRequester -->> AsyncHttpJobRepository: Job status
+    end
+
+    alt Status: Ready
+        AsyncHttpJobRepository ->> UrlRequester: Request download URLs (if applicable)
+        UrlRequester ->> Reporting Server: URL request (interpolation_context: `polling_job_response`)
+        Reporting Server -->> UrlRequester: Download URLs
+        UrlRequester -->> AsyncHttpJobRepository: Download URLs
+
+        AsyncHttpJobRepository ->> DownloadRetriever: Download reports
+        DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `url`)
+        Reporting Server -->> DownloadRetriever: Report data
+        DownloadRetriever -->> AsyncHttpJobRepository: Report data
+    else Status: Failed
+        AsyncHttpJobRepository ->> AbortRequester: Send abort request
+        AbortRequester ->> Reporting Server: Abort job
+        Reporting Server -->> AbortRequester: Abort confirmation
+        AbortRequester -->> AsyncHttpJobRepository: Confirmation
+    end
+
+    AsyncHttpJobRepository ->> DeleteRequester: Send delete job request
+    DeleteRequester ->> Reporting Server: Delete job
+    Reporting Server -->> DeleteRequester: Deletion confirmation
+    DeleteRequester -->> AsyncHttpJobRepository: Confirmation
+
+
+```
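To make the new hook concrete, here is a hedged sketch of an AsyncRetriever manifest fragment with a `url_requester`, written as the equivalent Python mapping. The host, paths, and interpolation expressions are invented for illustration and are not taken from any real connector:

```python
# Hypothetical manifest fragment (as a Python mapping). Only the keys mirror the
# schema change above; every URL, path, and interpolation expression is made up.
async_retriever = {
    "type": "AsyncRetriever",
    "creation_requester": {
        "type": "HttpRequester",
        "url_base": "https://api.example.com",
        "path": "/v1/reports",
        "http_method": "POST",
    },
    "polling_requester": {
        "type": "HttpRequester",
        "url_base": "https://api.example.com",
        # Per the diagram, the polling request can interpolate `create_job_response`.
        "path": "/v1/reports/{{ stream_slice['create_job_response'].json()['id'] }}",
        "http_method": "GET",
    },
    "url_requester": {
        "type": "HttpRequester",
        "url_base": "https://api.example.com",
        # Per the diagram, the URL request can interpolate `polling_job_response`.
        "path": "/v1/reports/{{ stream_slice['polling_job_response'].json()['id'] }}/files",
        "http_method": "GET",
    },
    "download_requester": {
        "type": "HttpRequester",
        # The extracted URL reaches the download step as `url` (carried in
        # extra_fields; see the http_job_repository diff below).
        "url_base": "{{ stream_slice.extra_fields['url'] }}",
        "http_method": "GET",
    },
}
```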
airbyte_cdk/sources/declarative/requesters/http_job_repository.py CHANGED
@@ -31,6 +31,10 @@ LOGGER = logging.getLogger("airbyte")
 
 @dataclass
 class AsyncHttpJobRepository(AsyncJobRepository):
+    """
+    See Readme file for more details about flow.
+    """
+
     creation_requester: Requester
     polling_requester: Requester
     download_retriever: SimpleRetriever
@@ -44,6 +48,9 @@ class AsyncHttpJobRepository(AsyncJobRepository):
     record_extractor: RecordExtractor = field(
         init=False, repr=False, default_factory=lambda: ResponseToFileExtractor({})
     )
+    url_requester: Optional[Requester] = (
+        None  # use it in case polling_requester provides some <id> and extra request is needed to obtain list of urls to download from
+    )
 
     def __post_init__(self) -> None:
         self._create_job_response_by_id: Dict[str, Response] = {}
@@ -186,10 +193,13 @@ class AsyncHttpJobRepository(AsyncJobRepository):
 
         """
 
-        for url in self.urls_extractor.extract_records(
-            self._polling_job_response_by_id[job.api_job_id()]
-        ):
-            stream_slice: StreamSlice = StreamSlice(partition={"url": url}, cursor_slice={})
+        for url in self._get_download_url(job):
+            job_slice = job.job_parameters()
+            stream_slice = StreamSlice(
+                partition=job_slice.partition,
+                cursor_slice=job_slice.cursor_slice,
+                extra_fields={**job_slice.extra_fields, "url": url},
+            )
             for message in self.download_retriever.read_records({}, stream_slice):
                 if isinstance(message, Record):
                     yield message.data
@@ -226,3 +236,22 @@ class AsyncHttpJobRepository(AsyncJobRepository):
             cursor_slice={},
         )
         return stream_slice
+
+    def _get_download_url(self, job: AsyncJob) -> Iterable[str]:
+        if not self.url_requester:
+            url_response = self._polling_job_response_by_id[job.api_job_id()]
+        else:
+            stream_slice: StreamSlice = StreamSlice(
+                partition={
+                    "polling_job_response": self._polling_job_response_by_id[job.api_job_id()]
+                },
+                cursor_slice={},
+            )
+            url_response = self.url_requester.send_request(stream_slice=stream_slice)  # type: ignore # we expect url_requester to always be presented, otherwise raise an exception as we cannot proceed with the report
+            if not url_response:
+                raise AirbyteTracedException(
+                    internal_message="Always expect a response or an exception from url_requester",
+                    failure_type=FailureType.system_error,
+                )
+
+        yield from self.urls_extractor.extract_records(url_response)  # type: ignore # we expect urls_extractor to always return list of strings
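Because the job's own partition and cursor values are preserved and the download URL now travels in `extra_fields`, downstream interpolation can address it as `url`. A tiny sketch of that slice construction with invented values, mirroring the `fetch_records` change above:

```python
from airbyte_cdk.sources.types import StreamSlice

# Stand-in for job.job_parameters(); the partition content is invented.
job_slice = StreamSlice(partition={"report_id": "123"}, cursor_slice={})
url = "https://downloads.example.com/report.csv"

stream_slice = StreamSlice(
    partition=job_slice.partition,
    cursor_slice=job_slice.cursor_slice,
    extra_fields={**job_slice.extra_fields, "url": url},
)
assert stream_slice.extra_fields["url"] == url
```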
airbyte_cdk/sources/declarative/retrievers/simple_retriever.py CHANGED
@@ -160,7 +160,7 @@ class SimpleRetriever(Retriever):
             stream_slice,
             next_page_token,
             self._paginator.get_request_headers,
-            self.request_option_provider.get_request_headers,
+            self.stream_slicer.get_request_headers,
         )
         if isinstance(headers, str):
             raise ValueError("Request headers cannot be a string")
airbyte_cdk/sources/types.py CHANGED
{airbyte_cdk-6.17.1.dev0.dist-info → airbyte_cdk-6.18.0.dist-info}/RECORD RENAMED
@@ -62,11 +62,11 @@ airbyte_cdk/sources/declarative/checks/check_stream.py,sha256=dAA-UhmMj0WLXCkRQr
 airbyte_cdk/sources/declarative/checks/connection_checker.py,sha256=MBRJo6WJlZQHpIfOGaNOkkHUmgUl_4wDM6VPo41z5Ss,1383
 airbyte_cdk/sources/declarative/concurrency_level/__init__.py,sha256=5XUqrmlstYlMM0j6crktlKQwALek0uiz2D3WdM46MyA,191
 airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py,sha256=YIwCTCpOr_QSNW4ltQK0yUGWInI8PKNY216HOOegYLk,2101
-airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=
+airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=tSTCSmyMCu1qoGsne1Ooz3c1da-8EDZk6Suiy2gIq9Q,22475
 airbyte_cdk/sources/declarative/datetime/__init__.py,sha256=l9LG7Qm6e5r_qgqfVKnx3mXYtg1I9MmMjomVIPfU4XA,177
 airbyte_cdk/sources/declarative/datetime/datetime_parser.py,sha256=SX9JjdesN1edN2WVUVMzU_ptqp2QB1OnsnjZ4mwcX7w,2579
 airbyte_cdk/sources/declarative/datetime/min_max_datetime.py,sha256=0BHBtDNQZfvwM45-tY5pNlTcKAFSGGNxemoi0Jic-0E,5785
-airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=
+airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=2t3_QVXWOImPcH-apR_Xd8qNl6K_URFwBbQ47YHcjXg,133490
 airbyte_cdk/sources/declarative/declarative_source.py,sha256=nF7wBqFd3AQmEKAm4CnIo29CJoQL562cJGSCeL8U8bA,1531
 airbyte_cdk/sources/declarative/declarative_stream.py,sha256=JRyNeOIpsFu4ztVZsN6sncqUEIqIE-bUkD2TPgbMgk0,10375
 airbyte_cdk/sources/declarative/decoders/__init__.py,sha256=edGj4fGxznBk4xzRQyCA1rGfbpqe7z-RE0K3kQQWbgA,858
@@ -81,16 +81,15 @@ airbyte_cdk/sources/declarative/extractors/__init__.py,sha256=RmV-IkO1YLj0PSOrrq
 airbyte_cdk/sources/declarative/extractors/dpath_extractor.py,sha256=wR4Ol4MG2lt5UlqXF5EU_k7qa5cN4_-luu3PJ1PlO3A,3131
 airbyte_cdk/sources/declarative/extractors/http_selector.py,sha256=2zWZ4ewTqQC8VwkjS0xD_u350Km3SiYP7hpOOgiLg5o,1169
 airbyte_cdk/sources/declarative/extractors/record_extractor.py,sha256=XJELMjahAsaomlvQgN2zrNO0DJX0G0fr9r682gUz7Pg,691
-airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=
+airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=OJ9xmhNWNwwzxYOeIrDy1GINb1zH9MBy6suC5tm2LSk,3545
 airbyte_cdk/sources/declarative/extractors/record_selector.py,sha256=tjNwcURmlyD-TGCScXvW95ThNKyPGcx2SiWbG1-H-sc,6552
 airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py,sha256=LhqGDfX06_dDYLKsIVnwQ_nAWCln-v8PV7Wgt_QVeTI,6533
 airbyte_cdk/sources/declarative/extractors/type_transformer.py,sha256=d6Y2Rfg8pMVEEnHllfVksWZdNVOU55yk34O03dP9muY,1626
-airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=
-airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py,sha256=1qloF1gfV5nsOqkOxDfviwyckPUq1ur6sglvhIt6AeQ,15344
+airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=huRz3KQJSUFmJCg5GPE9TckEBsB5TMsCa_THhJAhPVI,1037
 airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py,sha256=_UzUnSIUsDbRgbFTXgSyZEFb4ws-KdhdQPWO8mFbV7U,22028
 airbyte_cdk/sources/declarative/incremental/declarative_cursor.py,sha256=5Bhw9VRPyIuCaD0wmmq_L3DZsa-rJgtKSEUzSd8YYD0,536
 airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py,sha256=3_EEZop94bMitZaJd2PF5Q2Xt9v94tYg7p7YJz8tAFc,15869
-airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=
+airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=hElcYijbOHjdLKOMA7W7aizEbf22r7OSApXALP875uI,15749
 airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py,sha256=2YBOA2NnwAeIKlIhSwUB_W-FaGnPcmrG_liY7b4mV2Y,8365
 airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py,sha256=10LFv1QPM-agVKl6eaANmEBOfd7gZgBrkoTcMggsieQ,4809
 airbyte_cdk/sources/declarative/interpolation/__init__.py,sha256=tjUJkn3B-iZ-p7RP2c3dVZejrGiQeooGmS5ibWTuUL4,437
@@ -107,12 +106,12 @@ airbyte_cdk/sources/declarative/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW
 airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py,sha256=iemy3fKLczcU0-Aor7tx5jcT6DRedKMqyK7kCOp01hg,3924
 airbyte_cdk/sources/declarative/migrations/state_migration.py,sha256=KWPjealMLKSMtajXgkdGgKg7EmTLR-CqqD7UIh0-eDU,794
 airbyte_cdk/sources/declarative/models/__init__.py,sha256=nUFxNCiKeYRVXuZEKA7GD-lTHxsiKcQ8FitZjKhPIvE,100
-airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=
+airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=3xWpeDNDGOw_I2pQ1LDiUhNBEWEvNAtd-HCi_1aklSQ,93666
 airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=Rir9_z3Kcd5Es0-LChrzk-0qubAsiK_RSEnLmK2OXm8,553
 airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py,sha256=CXwTfD3wSQq3okcqwigpprbHhSURUokh4GK2OmOyKC8,9132
 airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py,sha256=IWUOdF03o-aQn0Occo1BJCxU0Pz-QILk5L67nzw2thw,6803
-airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=
+airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=dpRWxZaPghPcE5vGkI4swKDaXyLWLMAbvDoazuNSobU,109709
 airbyte_cdk/sources/declarative/partition_routers/__init__.py,sha256=HJ-Syp3p7RpyR_OK0X_a2kSyISfu3W-PKrRI16iY0a8,957
 airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py,sha256=n82J15S8bjeMZ5uROu--P3hnbQoxkY5v7RPHYx7g7ro,2929
 airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py,sha256=c5cuVFM6NFkuQqG8Z5IwkBuwDrvXZN1CunUOM_L0ezg,6892
@@ -120,6 +119,7 @@ airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py,sha25
 airbyte_cdk/sources/declarative/partition_routers/partition_router.py,sha256=YyEIzdmLd1FjbVP3QbQ2VFCLW_P-OGbVh6VpZShp54k,2218
 airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py,sha256=SKzKjSyfccq4dxGIh-J6ejrgkCHzaiTIazmbmeQiRD4,1942
 airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py,sha256=5bgXoJfBg_6i53krQMptAGb50XB5XoVfqQxKQhlLtBA,15383
+airbyte_cdk/sources/declarative/requesters/README.md,sha256=WabtHlwHg_J34aL1Kwm8vboYqBaSgsFjq10qR-P2sx8,2658
 airbyte_cdk/sources/declarative/requesters/__init__.py,sha256=d7a3OoHbqaJDyyPli3nqqJ2yAW_SLX6XDaBAKOwvpxw,364
 airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py,sha256=SkEDcJxlT1683rNx93K9whoS0OyUukkuOfToGtgpF58,776
 airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py,sha256=1WZdpFmWL6W_Dko0qjflTaKIWeqt8jHT-D6HcujIp3s,884
@@ -134,7 +134,7 @@ airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.
 airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py,sha256=q0YkeYUUWO6iErUy0vjqiOkhg8_9d5YcCmtlpXAJJ9E,1314
 airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py,sha256=Tan66odx8VHzfdyyXMQkXz2pJYksllGqvxmpoajgcK4,669
 airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py,sha256=vhWsEKNTYEzZ4gerhHqnDNKu4wGIP485NAzpSQ5DRZg,7941
-airbyte_cdk/sources/declarative/requesters/http_job_repository.py,sha256=
+airbyte_cdk/sources/declarative/requesters/http_job_repository.py,sha256=3GtOefPH08evlSUxaILkiKLTHbIspFY4qd5B3ZqNE60,10063
 airbyte_cdk/sources/declarative/requesters/http_requester.py,sha256=RqYPkgJFAWfcZBTc-JBcGHPm4JL1ZQOhs9GKU4MP2eE,14723
 airbyte_cdk/sources/declarative/requesters/paginators/__init__.py,sha256=uArbKs9JKNCt7t9tZoeWwjDpyI1HoPp29FNW0JzvaEM,644
 airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py,sha256=FnSl3qPvv5wD6ieAI2Ic5c4dqBk-3fRe4tCaWzq3YwM,11840
@@ -163,7 +163,7 @@ airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py,sha256=Aio
 airbyte_cdk/sources/declarative/retrievers/__init__.py,sha256=ix9m1dkR69DcXCXUKC5RK_ZZM7ojTLBQ4IkWQTfmfCk,456
 airbyte_cdk/sources/declarative/retrievers/async_retriever.py,sha256=kX9ltelK2xLIBWDJBK2ucrvVe5tc5xmhdbVbgsjvlxY,3696
 airbyte_cdk/sources/declarative/retrievers/retriever.py,sha256=XPLs593Xv8c5cKMc37XzUAYmzlXd1a7eSsspM-CMuWA,1696
-airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=
+airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=jxQ_9xcVD07r9PKhofitAqMkdX1k8ZNyy50qz5NwkFs,24540
 airbyte_cdk/sources/declarative/schema/__init__.py,sha256=HztgVVaZdil5UfgUZcv_Hyy84r89_EKRwyO2hoewNVg,749
 airbyte_cdk/sources/declarative/schema/default_schema_loader.py,sha256=KTACrIE23a83wsm3Rd9Eb4K6-20lrGqYxTHNp9yxsso,1820
 airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py,sha256=H6A3NQ6kPPM-cUNPmdvDPc9xNzR1rQNrK95GbgCW334,8822
@@ -293,7 +293,7 @@ airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py,sha256=Y
 airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py,sha256=ka-bBRWvIv09LmZNYl49p2lK9nd_Tvi2g0lIp3OkU40,14872
 airbyte_cdk/sources/streams/http/requests_native_auth/token.py,sha256=h5PTzcdH-RQLeCg7xZ45w_484OPUDSwNWl_iMJQmZoI,2526
 airbyte_cdk/sources/streams/utils/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
-airbyte_cdk/sources/types.py,sha256=
+airbyte_cdk/sources/types.py,sha256=nLPkTpyfGV4E6e99qcBWX4r8C3fE4I8Fvgx2EjvT9ic,5005
 airbyte_cdk/sources/utils/__init__.py,sha256=TTN6VUxVy6Is8BhYQZR5pxJGQh8yH4duXh4O1TiMiEY,118
 airbyte_cdk/sources/utils/casing.py,sha256=QC-gV1O4e8DR4-bhdXieUPKm_JamzslVyfABLYYRSXA,256
 airbyte_cdk/sources/utils/record_helper.py,sha256=jeB0mucudzna7Zvj-pCBbwFrbLJ36SlAWZTh5O4Fb9Y,2168
@@ -343,8 +343,8 @@ airbyte_cdk/utils/slice_hasher.py,sha256=-pHexlNYoWYPnXNH-M7HEbjmeJe9Zk7SJijdQ7d
 airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
 airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
 airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
-airbyte_cdk-6.17.1.dev0.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
-airbyte_cdk-6.17.1.dev0.dist-info/METADATA,sha256=
-airbyte_cdk-6.17.1.dev0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-airbyte_cdk-6.17.1.dev0.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
-airbyte_cdk-6.17.1.dev0.dist-info/RECORD,,
+airbyte_cdk-6.18.0.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-6.18.0.dist-info/METADATA,sha256=RvVkgbg-LBbS5eGTntO-mp34yRIDMuPYZ26VRmSkhCA,6000
+airbyte_cdk-6.18.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+airbyte_cdk-6.18.0.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
+airbyte_cdk-6.18.0.dist-info/RECORD,,
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py DELETED
@@ -1,346 +0,0 @@
-import copy
-import logging
-
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-import threading
-from collections import OrderedDict
-from copy import deepcopy
-from datetime import timedelta
-from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
-
-from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
-from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
-from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
-    Timer,
-    iterate_with_last_flag_and_state,
-)
-from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
-from airbyte_cdk.sources.message import MessageRepository
-from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
-    PerPartitionKeySerializer,
-)
-from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField
-from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
-from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
-
-logger = logging.getLogger("airbyte")
-
-
-class ConcurrentCursorFactory:
-    def __init__(self, create_function: Callable[..., Cursor]):
-        self._create_function = create_function
-
-    def create(self, stream_state: Mapping[str, Any], runtime_lookback_window: Any) -> Cursor:
-        return self._create_function(
-            stream_state=stream_state, runtime_lookback_window=runtime_lookback_window
-        )
-
-
-class ConcurrentPerPartitionCursor(Cursor):
-    """
-    Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
-
-    **Partition Limitation and Limit Reached Logic**
-
-    - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
-    - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
-    - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.
-
-    The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.
-
-    - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
-    - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.
-
-    This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
-    """
-
-    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
-    _NO_STATE: Mapping[str, Any] = {}
-    _NO_CURSOR_STATE: Mapping[str, Any] = {}
-    _KEY = 0
-    _VALUE = 1
-
-    def __init__(
-        self,
-        cursor_factory: ConcurrentCursorFactory,
-        partition_router: PartitionRouter,
-        stream_name: str,
-        stream_namespace: Optional[str],
-        stream_state: Any,
-        message_repository: MessageRepository,
-        connector_state_manager: ConnectorStateManager,
-        cursor_field: CursorField,
-    ) -> None:
-        self._global_cursor: Mapping[str, Any] = {}
-        self._stream_name = stream_name
-        self._stream_namespace = stream_namespace
-        self._message_repository = message_repository
-        self._connector_state_manager = connector_state_manager
-        self._cursor_field = cursor_field
-
-        self._cursor_factory = cursor_factory
-        self._partition_router = partition_router
-
-        # The dict is ordered to ensure that once the maximum number of partitions is reached,
-        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
-        self._cursor_per_partition: OrderedDict[str, Cursor] = OrderedDict()
-        self._state = {"states": []}
-        self._semaphore_per_partition = OrderedDict()
-        self._finished_partitions = set()
-        self._lock = threading.Lock()
-        self._timer = Timer()
-        self._new_global_cursor = None
-        self._lookback_window = 0
-        self._parent_state = None
-        self._over_limit = 0
-        self._partition_serializer = PerPartitionKeySerializer()
-
-        self._set_initial_state(stream_state)
-
-    @property
-    def cursor_field(self) -> CursorField:
-        return self._cursor_field
-
-    @property
-    def state(self) -> MutableMapping[str, Any]:
-        states = []
-        for partition_tuple, cursor in self._cursor_per_partition.items():
-            cursor_state = cursor._connector_state_converter.convert_to_state_message(
-                self.cursor_field, cursor.state
-            )
-            if cursor_state:
-                states.append(
-                    {
-                        "partition": self._to_dict(partition_tuple),
-                        "cursor": copy.deepcopy(cursor_state),
-                    }
-                )
-        state: dict[str, Any] = {"states": states}
-
-        if self._global_cursor:
-            state["state"] = self._global_cursor
-        if self._lookback_window is not None:
-            state["lookback_window"] = self._lookback_window
-        if self._parent_state is not None:
-            state["parent_state"] = self._parent_state
-        return state
-
-    def close_partition(self, partition: Partition) -> None:
-        self._cursor_per_partition[
-            self._to_partition_key(partition._stream_slice.partition)
-        ].close_partition(partition=partition)
-        with self._lock:
-            self._semaphore_per_partition[
-                self._to_partition_key(partition._stream_slice.partition)
-            ].acquire()
-            cursor = self._cursor_per_partition[
-                self._to_partition_key(partition._stream_slice.partition)
-            ]
-            cursor_state = cursor._connector_state_converter.convert_to_state_message(
-                cursor._cursor_field, cursor.state
-            )
-            if (
-                self._to_partition_key(partition._stream_slice.partition)
-                in self._finished_partitions
-                and self._semaphore_per_partition[
-                    self._to_partition_key(partition._stream_slice.partition)
-                ]._value
-                == 0
-            ):
-                if (
-                    self._new_global_cursor is None
-                    or self._new_global_cursor[self.cursor_field.cursor_field_key]
-                    < cursor_state[self.cursor_field.cursor_field_key]
-                ):
-                    self._new_global_cursor = copy.deepcopy(cursor_state)
-
-    def ensure_at_least_one_state_emitted(self) -> None:
-        """
-        The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
-        called.
-        """
-        if not any(
-            semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
-        ):
-            self._global_cursor = self._new_global_cursor
-            self._lookback_window = self._timer.finish()
-            self._parent_state = self._partition_router.get_stream_state()
-        self._emit_state_message()
-
-    def _emit_state_message(self) -> None:
-        self._connector_state_manager.update_state_for_stream(
-            self._stream_name,
-            self._stream_namespace,
-            self.state,
-        )
-        state_message = self._connector_state_manager.create_state_message(
-            self._stream_name, self._stream_namespace
-        )
-        self._message_repository.emit_message(state_message)
-
-    def stream_slices(self) -> Iterable[StreamSlice]:
-        slices = self._partition_router.stream_slices()
-        self._timer.start()
-        for partition in slices:
-            yield from self.generate_slices_from_partition(partition)
-
-    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
-        # Ensure the maximum number of partitions is not exceeded
-        self._ensure_partition_limit()
-
-        cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
-        if not cursor:
-            partition_state = self._global_cursor if self._global_cursor else self._NO_CURSOR_STATE
-            cursor = self._create_cursor(partition_state)
-            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
-                threading.Semaphore(0)
-            )
-
-        for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
-            cursor.stream_slices(),
-            lambda: None,
-        ):
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
-            if is_last_slice:
-                self._finished_partitions.add(self._to_partition_key(partition.partition))
-            yield StreamSlice(
-                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
-            )
-
-    def _ensure_partition_limit(self) -> None:
-        """
-        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
-        """
-        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
-            self._over_limit += 1
-            oldest_partition = self._cursor_per_partition.popitem(last=False)[
-                0
-            ]  # Remove the oldest partition
-            logger.warning(
-                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
-            )
-
-    def limit_reached(self) -> bool:
-        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
-
-    def _set_initial_state(self, stream_state: StreamState) -> None:
-        """
-        Set the initial state for the cursors.
-
-        This method initializes the state for each partition cursor using the provided stream state.
-        If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
-
-        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
-        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
-
-        Args:
-            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
-                {
-                    "states": [
-                        {
-                            "partition": {
-                                "partition_key": "value"
-                            },
-                            "cursor": {
-                                "last_updated": "2023-05-27T00:00:00Z"
-                            }
-                        }
-                    ],
-                    "parent_state": {
-                        "parent_stream_name": {
-                            "last_updated": "2023-05-27T00:00:00Z"
-                        }
-                    }
-                }
-        """
-        if not stream_state:
-            return
-
-        if "states" not in stream_state:
-            # We assume that `stream_state` is in a global format that can be applied to all partitions.
-            # Example: {"global_state_format_key": "global_state_format_value"}
-            self._global_cursor = deepcopy(stream_state)
-            self._new_global_cursor = deepcopy(stream_state)
-
-        else:
-            self._lookback_window = stream_state.get("lookback_window")
-
-            for state in stream_state["states"]:
-                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
-                    self._create_cursor(
-                        state["cursor"], runtime_lookback_window=self._lookback_window
-                    )
-                )
-                self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
-                    threading.Semaphore(0)
-                )
-
-            # set default state for missing partitions if it is per partition with fallback to global
-            if "state" in stream_state:
-                self._global_cursor = deepcopy(stream_state["state"])
-                self._new_global_cursor = deepcopy(stream_state["state"])
-
-        # Set parent state for partition routers based on parent streams
-        self._partition_router.set_initial_state(stream_state)
-
-    def observe(self, record: Record) -> None:
-        self._cursor_per_partition[
-            self._to_partition_key(record.associated_slice.partition)
-        ].observe(record)
-
-    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
-        return self._partition_serializer.to_partition_key(partition)
-
-    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
-        return self._partition_serializer.to_partition(partition_key)
-
-    def _create_cursor(self, cursor_state: Any, runtime_lookback_window: Any = None) -> Cursor:
-        if runtime_lookback_window:
-            runtime_lookback_window = timedelta(seconds=runtime_lookback_window)
-        cursor = self._cursor_factory.create(
-            stream_state=deepcopy(cursor_state), runtime_lookback_window=runtime_lookback_window
-        )
-        return cursor
-
-    def should_be_synced(self, record: Record) -> bool:
-        return self._get_cursor(record).should_be_synced(record)
-
-    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
-        if not first.associated_slice or not second.associated_slice:
-            raise ValueError(
-                f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
-            )
-        if first.associated_slice.partition != second.associated_slice.partition:
-            raise ValueError(
-                f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
-            )
-
-        return self._get_cursor(first).is_greater_than_or_equal(
-            self._convert_record_to_cursor_record(first),
-            self._convert_record_to_cursor_record(second),
-        )
-
-    @staticmethod
-    def _convert_record_to_cursor_record(record: Record) -> Record:
-        return Record(
-            record.data,
-            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
-            if record.associated_slice
-            else None,
-        )
-
-    def _get_cursor(self, record: Record) -> Cursor:
-        if not record.associated_slice:
-            raise ValueError(
-                "Invalid state as stream slices that are emitted should refer to an existing cursor"
-            )
-        partition_key = self._to_partition_key(record.associated_slice.partition)
-        if partition_key not in self._cursor_per_partition:
-            raise ValueError(
-                "Invalid state as stream slices that are emitted should refer to an existing cursor"
-            )
-        cursor = self._cursor_per_partition[partition_key]
-        return cursor
{airbyte_cdk-6.17.1.dev0.dist-info → airbyte_cdk-6.18.0.dist-info}/LICENSE.txt: file without changes
{airbyte_cdk-6.17.1.dev0.dist-info → airbyte_cdk-6.18.0.dist-info}/WHEEL: file without changes
{airbyte_cdk-6.17.1.dev0.dist-info → airbyte_cdk-6.18.0.dist-info}/entry_points.txt: file without changes