airbyte-cdk 6.17.1.dev1__py3-none-any.whl → 6.18.0.dev1__py3-none-any.whl
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +0 -69
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +15 -1
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +28 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +5 -3
- airbyte_cdk/sources/declarative/incremental/__init__.py +0 -6
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +0 -15
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +30 -12
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +24 -78
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -1
- airbyte_cdk/sources/streams/concurrent/cursor.py +21 -30
- {airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/RECORD +15 -16
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +0 -340
- {airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/entry_points.txt +0 -0
@@ -20,9 +20,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
     ClientSideIncrementalRecordFilterDecorator,
 )
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
-from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
-    PerPartitionWithGlobalCursor,
-)
 from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
@@ -307,72 +304,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         cursor=final_state_cursor,
                     )
                 )
-            elif (
-                incremental_sync_component_definition
-                and incremental_sync_component_definition.get("type", "")
-                == DatetimeBasedCursorModel.__name__
-                and self._stream_supports_concurrent_partition_processing(
-                    declarative_stream=declarative_stream
-                )
-                and hasattr(declarative_stream.retriever, "stream_slicer")
-                and isinstance(
-                    declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor
-                )
-            ):
-                stream_state = state_manager.get_stream_state(
-                    stream_name=declarative_stream.name, namespace=declarative_stream.namespace
-                )
-                partition_router = declarative_stream.retriever.stream_slicer._partition_router
-
-                cursor = self._constructor.create_concurrent_cursor_from_perpartition_cursor(
-                    state_manager=state_manager,
-                    model_type=DatetimeBasedCursorModel,
-                    component_definition=incremental_sync_component_definition,
-                    stream_name=declarative_stream.name,
-                    stream_namespace=declarative_stream.namespace,
-                    config=config or {},
-                    stream_state=stream_state,
-                    partition_router=partition_router,
-                )
-
-                retriever = declarative_stream.retriever
-
-                # This is an optimization so that we don't invoke any cursor or state management flows within the
-                # low-code framework because state management is handled through the ConcurrentCursor.
-                if declarative_stream and isinstance(retriever, SimpleRetriever):
-                    # Also a temporary hack. In the legacy Stream implementation, as part of the read,
-                    # set_initial_state() is called to instantiate incoming state on the cursor. Although we no
-                    # longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components
-                    # like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator
-                    # still rely on a DatetimeBasedCursor that is properly initialized with state.
-                    if retriever.cursor:
-                        retriever.cursor.set_initial_state(stream_state=stream_state)
-                    # We zero it out here, but since this is a cursor reference, the state is still properly
-                    # instantiated for the other components that reference it
-                    retriever.cursor = None
-
-                partition_generator = StreamSlicerPartitionGenerator(
-                    DeclarativePartitionFactory(
-                        declarative_stream.name,
-                        declarative_stream.get_json_schema(),
-                        retriever,
-                        self.message_repository,
-                    ),
-                    cursor,
-                )
-
-                concurrent_streams.append(
-                    DefaultStream(
-                        partition_generator=partition_generator,
-                        name=declarative_stream.name,
-                        json_schema=declarative_stream.get_json_schema(),
-                        availability_strategy=AlwaysAvailableAvailabilityStrategy(),
-                        primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
-                        cursor_field=cursor.cursor_field.cursor_field_key,
-                        logger=self.logger,
-                        cursor=cursor,
-                    )
-                )
             else:
                 synchronous_streams.append(declarative_stream)
         else:
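The removed block above leaned on Python reference semantics: after set_initial_state() ran, the retriever's cursor attribute could be set to None without wiping the state other components observe, because those components hold their own reference to the same cursor object. A toy illustration with stand-in classes (not the CDK types):

class ToyCursor:
    def __init__(self):
        self.state = {}

    def set_initial_state(self, stream_state):
        self.state = dict(stream_state)

class ToyRetriever:
    def __init__(self, cursor):
        self.cursor = cursor

cursor = ToyCursor()
retriever = ToyRetriever(cursor)
paginator_ref = retriever.cursor  # e.g. held by a pagination stop condition

retriever.cursor.set_initial_state({"updated_at": "2024-01-01"})
retriever.cursor = None  # the retriever stops driving state management

# The shared object still carries the initialized state.
assert paginator_ref.state == {"updated_at": "2024-01-01"}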
@@ -678,7 +678,7 @@ definitions:
     properties:
       type:
         type: string
-        enum: [
+        enum: [CustomSchemaNormalization]
       class_name:
         title: Class Name
         description: Fully-qualified name of the class that will be implementing the custom normalization. The format is `source_<name>.<package>.<class_name>`.
@@ -2014,6 +2014,20 @@ definitions:
       $parameters:
         type: object
         additionalProperties: true
+  JsonParser:
+    title: JsonParser
+    description: Parser used for parsing str, bytes, or bytearray data and returning data in a dictionary format.
+    type: object
+    additionalProperties: true
+    required:
+      - type
+    properties:
+      type:
+        type: string
+        enum: [JsonParser]
+      encoding:
+        type: string
+        default: utf-8
   ListPartitionRouter:
     title: List Partition Router
     description: A Partition router that specifies a list of attributes where each attribute describes a portion of the complete data set for a stream. During a sync, each value is iterated over and can be used as input to outbound API requests.
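For orientation, a component definition matching the new JsonParser schema entry could look like the dict below; the surrounding CompositeRawDecoder wrapper is an assumption for illustration, not taken from this diff.

# Hypothetical manifest fragment, expressed as a Python dict. Only the
# JsonParser keys come from the schema entry above; the decoder shape is assumed.
decoder_definition = {
    "type": "CompositeRawDecoder",
    "parser": {
        "type": "JsonParser",
        "encoding": "utf-8",  # optional; the schema defaults to utf-8
    },
}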
@@ -7,9 +7,12 @@ from dataclasses import dataclass
 from io import BufferedIOBase, TextIOWrapper
 from typing import Any, Generator, MutableMapping, Optional

+import orjson
 import requests

+from airbyte_cdk.models import FailureType
 from airbyte_cdk.sources.declarative.decoders.decoder import Decoder
+from airbyte_cdk.utils import AirbyteTracedException

 logger = logging.getLogger("airbyte")

@@ -42,6 +45,31 @@ class GzipParser(Parser):
         yield from self.inner_parser.parse(gzipobj)


+@dataclass
+class JsonParser(Parser):
+    encoding: str = "utf-8"
+
+    def parse(self, data: BufferedIOBase) -> Generator[MutableMapping[str, Any], None, None]:
+        raw_data = data.read()
+        try:
+            body_json = orjson.loads(raw_data.decode(self.encoding))
+        except Exception:
+            try:
+                body_json = json.loads(raw_data.decode(self.encoding))
+            except Exception as exc:
+                raise AirbyteTracedException(
+                    message="Response JSON data failed to be parsed. See logs for more information.",
+                    internal_message=f"Response JSON data failed to be parsed: {exc=}, {raw_data=}",
+                    failure_type=FailureType.system_error,
+                    exception=exc,
+                )
+
+        if isinstance(body_json, list):
+            yield from body_json
+        else:
+            yield from [body_json]
+
+
 @dataclass
 class JsonLineParser(Parser):
     encoding: Optional[str] = "utf-8"
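A minimal usage sketch of the parser added above, assuming JsonParser is importable from composite_raw_decoder: a JSON array yields one mapping per element, while a single object is yielded once.

import io

from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import JsonParser

# Feed the parser a buffered JSON payload, as CompositeRawDecoder would.
payload = io.BytesIO(b'[{"id": 1}, {"id": 2}]')
parser = JsonParser(encoding="utf-8")
assert list(parser.parse(payload)) == [{"id": 1}, {"id": 2}]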
@@ -59,11 +59,13 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):

     def __init__(
         self,
-
+        date_time_based_cursor: DatetimeBasedCursor,
+        substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
-        self.
+        self._date_time_based_cursor = date_time_based_cursor
+        self._substream_cursor = substream_cursor

     def filter_records(
         self,
@@ -75,7 +77,7 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
         records = (
             record
             for record in records
-            if self.
+            if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
                 # Record is created on the fly to align with cursors interface; stream name is ignored as we don't need it here
                 # Record stream name is empty because it is not used during the filtering
                 Record(data=record, associated_slice=stream_slice, stream_name="")
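The key behavioral change is the `(self._substream_cursor or self._date_time_based_cursor)` expression: the substream cursor wins whenever one was provided, and the datetime cursor is the fallback. A toy demonstration of that selection (stub objects, not CDK cursors):

class StubCursor:
    def __init__(self, name):
        self.name = name

    def should_be_synced(self, record):
        return True  # real cursors compare the record against cursor state

date_time_cursor = StubCursor("datetime")
substream_cursor = None  # e.g. the stream has no partition router

active = substream_cursor or date_time_cursor
assert active.name == "datetime"  # falls back when no substream cursor exists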
@@ -2,10 +2,6 @@
 # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 #

-from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import (
-    ConcurrentCursorFactory,
-    ConcurrentPerPartitionCursor,
-)
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
 from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
@@ -25,8 +21,6 @@ from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor i

 __all__ = [
     "CursorFactory",
-    "ConcurrentCursorFactory",
-    "ConcurrentPerPartitionCursor",
     "DatetimeBasedCursor",
     "DeclarativeCursor",
     "GlobalSubstreamCursor",
@@ -303,21 +303,6 @@ class PerPartitionCursor(DeclarativeCursor):
         raise ValueError("A partition needs to be provided in order to get request body json")

     def should_be_synced(self, record: Record) -> bool:
-        if (
-            record.associated_slice
-            and self._to_partition_key(record.associated_slice.partition)
-            not in self._cursor_per_partition
-        ):
-            partition_state = (
-                self._state_to_migrate_from
-                if self._state_to_migrate_from
-                else self._NO_CURSOR_STATE
-            )
-            cursor = self._create_cursor(partition_state)
-
-            self._cursor_per_partition[
-                self._to_partition_key(record.associated_slice.partition)
-            ] = cursor
         return self._get_cursor(record).should_be_synced(
             self._convert_record_to_cursor_record(record)
         )
@@ -737,33 +737,43 @@ class KeysToSnakeCase(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")


+class FlattenFields(BaseModel):
+    type: Literal["FlattenFields"]
+    flatten_lists: Optional[bool] = Field(
+        True,
+        description="Whether to flatten lists or leave it as is. Default is True.",
+        title="Flatten Lists",
+    )
+    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
+
+
 class KeysReplace(BaseModel):
     type: Literal["KeysReplace"]
     old: str = Field(
         ...,
         description="Old value to replace.",
-        examples=[
+        examples=[
+            " ",
+            "{{ record.id }}",
+            "{{ config['id'] }}",
+            "{{ stream_slice['id'] }}",
+        ],
         title="Old value",
     )
     new: str = Field(
         ...,
         description="New value to set.",
-        examples=[
+        examples=[
+            "_",
+            "{{ record.id }}",
+            "{{ config['id'] }}",
+            "{{ stream_slice['id'] }}",
+        ],
         title="New value",
     )
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")


-class FlattenFields(BaseModel):
-    type: Literal["FlattenFields"]
-    flatten_lists: Optional[bool] = Field(
-        True,
-        description="Whether to flatten lists or leave it as is. Default is True.",
-        title="Flatten Lists",
-    )
-    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
-
-
 class IterableDecoder(BaseModel):
     type: Literal["IterableDecoder"]

@@ -795,6 +805,14 @@ class GzipJsonDecoder(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")


+class JsonParser(BaseModel):
+    class Config:
+        extra = Extra.allow
+
+    type: Literal["JsonParser"]
+    encoding: Optional[str] = "utf-8"
+
+
 class MinMaxDatetime(BaseModel):
     type: Literal["MinMaxDatetime"]
     datetime: str = Field(
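The regenerated pydantic model above is what validates a manifest component before the factory converts it. A quick sketch (pydantic v1 style, matching the Extra.allow config shown in the hunk):

# The default kicks in when `encoding` is omitted from the manifest dict.
component = JsonParser.parse_obj({"type": "JsonParser"})
assert component.encoding == "utf-8"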
@@ -72,6 +72,7 @@ from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
     CsvParser,
     GzipParser,
     JsonLineParser,
+    JsonParser,
 )
 from airbyte_cdk.sources.declarative.extractors import (
     DpathExtractor,
@@ -84,8 +85,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
 )
 from airbyte_cdk.sources.declarative.incremental import (
     ChildPartitionResumableFullRefreshCursor,
-    ConcurrentCursorFactory,
-    ConcurrentPerPartitionCursor,
     CursorFactory,
     DatetimeBasedCursor,
     DeclarativeCursor,
@@ -249,6 +248,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JsonLineParser as JsonLineParserModel,
 )
+from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
+    JsonParser as JsonParserModel,
+)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JwtAuthenticator as JwtAuthenticatorModel,
 )
@@ -440,7 +442,6 @@ from airbyte_cdk.sources.message import (
     InMemoryMessageRepository,
     LogAppenderMessageRepositoryDecorator,
     MessageRepository,
-    NoopMessageRepository,
 )
 from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
 from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
@@ -525,6 +526,7 @@ class ModelToComponentFactory:
             JsonDecoderModel: self.create_json_decoder,
             JsonlDecoderModel: self.create_jsonl_decoder,
             JsonLineParserModel: self.create_json_line_parser,
+            JsonParserModel: self.create_json_parser,
             GzipJsonDecoderModel: self.create_gzipjson_decoder,
             GzipParserModel: self.create_gzip_parser,
             KeysToLowerModel: self.create_keys_to_lower_transformation,
@@ -874,8 +876,6 @@ class ModelToComponentFactory:
         stream_namespace: Optional[str],
         config: Config,
         stream_state: MutableMapping[str, Any],
-        message_repository: Optional[MessageRepository] = None,
-        runtime_lookback_window: Optional[int] = None,
         **kwargs: Any,
     ) -> ConcurrentCursor:
         component_type = component_definition.get("type")
@@ -933,11 +933,6 @@ class ModelToComponentFactory:
         if evaluated_lookback_window:
             lookback_window = parse_duration(evaluated_lookback_window)

-        if runtime_lookback_window and lookback_window:
-            lookback_window = max(lookback_window, runtime_lookback_window)
-        elif runtime_lookback_window:
-            lookback_window = runtime_lookback_window
-
         connector_state_converter: DateTimeStreamStateConverter
         connector_state_converter = CustomFormatConcurrentStreamStateConverter(
             datetime_format=datetime_format,
@@ -1016,7 +1011,7 @@ class ModelToComponentFactory:
             stream_name=stream_name,
             stream_namespace=stream_namespace,
             stream_state=stream_state,
-            message_repository=
+            message_repository=self._message_repository,
             connector_state_manager=state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
@@ -1028,63 +1023,6 @@ class ModelToComponentFactory:
             cursor_granularity=cursor_granularity,
         )

-    def create_concurrent_cursor_from_perpartition_cursor(
-        self,
-        state_manager: ConnectorStateManager,
-        model_type: Type[BaseModel],
-        component_definition: ComponentDefinition,
-        stream_name: str,
-        stream_namespace: Optional[str],
-        config: Config,
-        stream_state: MutableMapping[str, Any],
-        partition_router,
-        **kwargs: Any,
-    ) -> ConcurrentPerPartitionCursor:
-        component_type = component_definition.get("type")
-        if component_definition.get("type") != model_type.__name__:
-            raise ValueError(
-                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
-            )
-
-        datetime_based_cursor_model = model_type.parse_obj(component_definition)
-
-        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
-            raise ValueError(
-                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
-            )
-
-        interpolated_cursor_field = InterpolatedString.create(
-            datetime_based_cursor_model.cursor_field,
-            parameters=datetime_based_cursor_model.parameters or {},
-        )
-        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
-
-        # Create the cursor factory
-        cursor_factory = ConcurrentCursorFactory(
-            partial(
-                self.create_concurrent_cursor_from_datetime_based_cursor,
-                state_manager=state_manager,
-                model_type=model_type,
-                component_definition=component_definition,
-                stream_name=stream_name,
-                stream_namespace=stream_namespace,
-                config=config,
-                message_repository=NoopMessageRepository(),
-            )
-        )
-
-        # Return the concurrent cursor and state converter
-        return ConcurrentPerPartitionCursor(
-            cursor_factory=cursor_factory,
-            partition_router=partition_router,
-            stream_name=stream_name,
-            stream_namespace=stream_namespace,
-            stream_state=stream_state,
-            message_repository=self._message_repository,  # type: ignore
-            connector_state_manager=state_manager,
-            cursor_field=cursor_field,
-        )
-
     @staticmethod
     def create_constant_backoff_strategy(
         model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
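The removed factory leaned on functools.partial to pre-bind everything except the per-partition arguments, so each partition could mint its own cursor on demand. A simplified sketch of that pattern (toy create function, not the CDK signature):

from functools import partial

def create_cursor(stream_name, stream_state=None, runtime_lookback_window=None):
    # Toy stand-in for create_concurrent_cursor_from_datetime_based_cursor.
    return {"stream": stream_name, "state": stream_state, "lookback": runtime_lookback_window}

factory = partial(create_cursor, "my_stream")  # shared args bound once

cursor_a = factory(stream_state={"updated_at": "2024-01-01"})
cursor_b = factory(stream_state={"updated_at": "2024-02-01"})
assert cursor_a["stream"] == cursor_b["stream"] == "my_stream"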
@@ -1367,15 +1305,18 @@ class ModelToComponentFactory:
             raise ValueError(
                 "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
            )
-
-
-
-
-
-
-
-
-
+        client_side_incremental_sync = {
+            "date_time_based_cursor": self._create_component_from_model(
+                model=model.incremental_sync, config=config
+            ),
+            "substream_cursor": (
+                combined_slicers
+                if isinstance(
+                    combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
+                )
+                else None
+            ),
+        }

         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
             cursor_model = model.incremental_sync
@@ -1812,6 +1753,11 @@ class ModelToComponentFactory:
     def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> JsonDecoder:
         return JsonDecoder(parameters={})

+    @staticmethod
+    def create_json_parser(model: JsonParserModel, config: Config, **kwargs: Any) -> JsonParser:
+        encoding = model.encoding or "utf-8"
+        return JsonParser(encoding=encoding)
+
     @staticmethod
     def create_jsonl_decoder(
         model: JsonlDecoderModel, config: Config, **kwargs: Any
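A sketch of the new factory hook in isolation; constructing ModelToComponentFactory with no arguments is illustrative here (in practice the declarative source builds it):

factory = ModelToComponentFactory()
model = JsonParserModel(type="JsonParser", encoding=None)
parser = factory.create_json_parser(model, config={})
assert parser.encoding == "utf-8"  # `model.encoding or "utf-8"` fallback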
@@ -2191,7 +2137,7 @@ class ModelToComponentFactory:
         if (
             not isinstance(stream_slicer, DatetimeBasedCursor)
             or type(stream_slicer) is not DatetimeBasedCursor
-        )
+        ):
             # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
             # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
             # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
@@ -160,7 +160,7 @@ class SimpleRetriever(Retriever):
             stream_slice,
             next_page_token,
             self._paginator.get_request_headers,
-            self.
+            self.stream_slicer.get_request_headers,
         )
         if isinstance(headers, str):
             raise ValueError("Request headers cannot be a string")
@@ -196,9 +196,7 @@ class ConcurrentCursor(Cursor):

     @property
     def state(self) -> MutableMapping[str, Any]:
-        return self._connector_state_converter.convert_to_state_message(
-            self.cursor_field, self._concurrent_state
-        )
+        return self._concurrent_state

     @property
     def cursor_field(self) -> CursorField:
@@ -243,10 +241,10 @@ class ConcurrentCursor(Cursor):
         return self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))

     def close_partition(self, partition: Partition) -> None:
-        slice_count_before = len(self._concurrent_state.get("slices", []))
+        slice_count_before = len(self.state.get("slices", []))
         self._add_slice_to_state(partition)
         if slice_count_before < len(
-            self._concurrent_state["slices"]
+            self.state["slices"]
         ):  # only emit if at least one slice has been processed
             self._merge_partitions()
             self._emit_state_message()
@@ -258,11 +256,11 @@ class ConcurrentCursor(Cursor):
         )

         if self._slice_boundary_fields:
-            if "slices" not in self._concurrent_state:
+            if "slices" not in self.state:
                 raise RuntimeError(
                     f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support."
                 )
-            self._concurrent_state["slices"].append(
+            self.state["slices"].append(
                 {
                     self._connector_state_converter.START_KEY: self._extract_from_slice(
                         partition, self._slice_boundary_fields[self._START_BOUNDARY]
@@ -290,7 +288,7 @@ class ConcurrentCursor(Cursor):
                 "expected. Please contact the Airbyte team."
             )

-        self._concurrent_state["slices"].append(
+        self.state["slices"].append(
             {
                 self._connector_state_converter.START_KEY: self.start,
                 self._connector_state_converter.END_KEY: most_recent_cursor_value,
@@ -302,7 +300,9 @@ class ConcurrentCursor(Cursor):
         self._connector_state_manager.update_state_for_stream(
             self._stream_name,
             self._stream_namespace,
-            self.state,
+            self._connector_state_converter.convert_to_state_message(
+                self._cursor_field, self.state
+            ),
         )
         state_message = self._connector_state_manager.create_state_message(
             self._stream_name, self._stream_namespace
@@ -310,9 +310,7 @@ class ConcurrentCursor(Cursor):
         self._message_repository.emit_message(state_message)

     def _merge_partitions(self) -> None:
-        self._concurrent_state["slices"] = self._connector_state_converter.merge_intervals(
-            self._concurrent_state["slices"]
-        )
+        self.state["slices"] = self._connector_state_converter.merge_intervals(self.state["slices"])

     def _extract_from_slice(self, partition: Partition, key: str) -> CursorValueType:
         try:
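merge_intervals itself lives in the connector state converter; a simplified stand-in shows the behavior the cursor relies on here, namely that overlapping or touching [start, end] slices collapse into one:

def merge_intervals(slices):
    # Simplified stand-in, not the CDK implementation.
    merged = []
    for s in sorted(slices, key=lambda s: s["start"]):
        if merged and s["start"] <= merged[-1]["end"]:
            merged[-1]["end"] = max(merged[-1]["end"], s["end"])
        else:
            merged.append(dict(s))
    return merged

assert merge_intervals(
    [{"start": 0, "end": 2}, {"start": 1, "end": 5}, {"start": 7, "end": 8}]
) == [{"start": 0, "end": 5}, {"start": 7, "end": 8}]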
@@ -349,42 +347,36 @@ class ConcurrentCursor(Cursor):
         if self._start is not None and self._is_start_before_first_slice():
             yield from self._split_per_slice_range(
                 self._start,
-                self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY],
+                self.state["slices"][0][self._connector_state_converter.START_KEY],
                 False,
             )

-        if len(self._concurrent_state["slices"]) == 1:
+        if len(self.state["slices"]) == 1:
             yield from self._split_per_slice_range(
                 self._calculate_lower_boundary_of_last_slice(
-                    self._concurrent_state["slices"][0][self._connector_state_converter.END_KEY]
+                    self.state["slices"][0][self._connector_state_converter.END_KEY]
                 ),
                 self._end_provider(),
                 True,
             )
-        elif len(self._concurrent_state["slices"]) > 1:
-            for i in range(len(self._concurrent_state["slices"]) - 1):
+        elif len(self.state["slices"]) > 1:
+            for i in range(len(self.state["slices"]) - 1):
                 if self._cursor_granularity:
                     yield from self._split_per_slice_range(
-                        self._concurrent_state["slices"][i][self._connector_state_converter.END_KEY]
+                        self.state["slices"][i][self._connector_state_converter.END_KEY]
                         + self._cursor_granularity,
-                        self._concurrent_state["slices"][i + 1][
-                            self._connector_state_converter.START_KEY
-                        ],
+                        self.state["slices"][i + 1][self._connector_state_converter.START_KEY],
                         False,
                     )
                 else:
                     yield from self._split_per_slice_range(
-                        self._concurrent_state["slices"][i][
-                            self._connector_state_converter.END_KEY
-                        ],
-                        self._concurrent_state["slices"][i + 1][
-                            self._connector_state_converter.START_KEY
-                        ],
+                        self.state["slices"][i][self._connector_state_converter.END_KEY],
+                        self.state["slices"][i + 1][self._connector_state_converter.START_KEY],
                         False,
                     )
             yield from self._split_per_slice_range(
                 self._calculate_lower_boundary_of_last_slice(
-                    self._concurrent_state["slices"][-1][self._connector_state_converter.END_KEY]
+                    self.state["slices"][-1][self._connector_state_converter.END_KEY]
                 ),
                 self._end_provider(),
                 True,
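The slice-generation logic above only re-requests what is missing: the gap between consecutive recorded slices plus the tail after the last slice. A numeric walk-through (integer stand-ins for datetimes, granularity handling omitted):

slices = [{"start": 0, "end": 5}, {"start": 7, "end": 10}]
end_provider = 12  # stand-in for self._end_provider()

requests = []
for i in range(len(slices) - 1):
    requests.append((slices[i]["end"], slices[i + 1]["start"]))  # the gap
requests.append((slices[-1]["end"], end_provider))  # the tail

assert requests == [(5, 7), (10, 12)]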
@@ -395,8 +387,7 @@ class ConcurrentCursor(Cursor):
     def _is_start_before_first_slice(self) -> bool:
         return (
             self._start is not None
-            and self._start
-            < self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY]
+            and self._start < self.state["slices"][0][self._connector_state_converter.START_KEY]
         )

     def _calculate_lower_boundary_of_last_slice(
@@ -62,15 +62,15 @@ airbyte_cdk/sources/declarative/checks/check_stream.py,sha256=dAA-UhmMj0WLXCkRQr
 airbyte_cdk/sources/declarative/checks/connection_checker.py,sha256=MBRJo6WJlZQHpIfOGaNOkkHUmgUl_4wDM6VPo41z5Ss,1383
 airbyte_cdk/sources/declarative/concurrency_level/__init__.py,sha256=5XUqrmlstYlMM0j6crktlKQwALek0uiz2D3WdM46MyA,191
 airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py,sha256=YIwCTCpOr_QSNW4ltQK0yUGWInI8PKNY216HOOegYLk,2101
-airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=
+airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=tSTCSmyMCu1qoGsne1Ooz3c1da-8EDZk6Suiy2gIq9Q,22475
 airbyte_cdk/sources/declarative/datetime/__init__.py,sha256=l9LG7Qm6e5r_qgqfVKnx3mXYtg1I9MmMjomVIPfU4XA,177
 airbyte_cdk/sources/declarative/datetime/datetime_parser.py,sha256=SX9JjdesN1edN2WVUVMzU_ptqp2QB1OnsnjZ4mwcX7w,2579
 airbyte_cdk/sources/declarative/datetime/min_max_datetime.py,sha256=0BHBtDNQZfvwM45-tY5pNlTcKAFSGGNxemoi0Jic-0E,5785
-airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=
+airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=QDqDyKmkYDDW3fXA8ImE61p4v_sBNQnqnV-uX_qNHNM,133531
 airbyte_cdk/sources/declarative/declarative_source.py,sha256=nF7wBqFd3AQmEKAm4CnIo29CJoQL562cJGSCeL8U8bA,1531
 airbyte_cdk/sources/declarative/declarative_stream.py,sha256=JRyNeOIpsFu4ztVZsN6sncqUEIqIE-bUkD2TPgbMgk0,10375
 airbyte_cdk/sources/declarative/decoders/__init__.py,sha256=edGj4fGxznBk4xzRQyCA1rGfbpqe7z-RE0K3kQQWbgA,858
-airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py,sha256=
+airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py,sha256=rSvqdGsVgBT3ZfY_bthjZl_OmxY84iKz8g9GQIWyq8k,3766
 airbyte_cdk/sources/declarative/decoders/decoder.py,sha256=sl-Gt8lXi7yD2Q-sD8je5QS2PbgrgsYjxRLWsay7DMc,826
 airbyte_cdk/sources/declarative/decoders/json_decoder.py,sha256=qdbjeR6RffKaah_iWvMsOcDolYuxJY5DaI3b9AMTZXg,3327
 airbyte_cdk/sources/declarative/decoders/noop_decoder.py,sha256=iZh0yKY_JzgBnJWiubEusf5c0o6Khd-8EWFWT-8EgFo,542
@@ -81,16 +81,15 @@ airbyte_cdk/sources/declarative/extractors/__init__.py,sha256=RmV-IkO1YLj0PSOrrq
 airbyte_cdk/sources/declarative/extractors/dpath_extractor.py,sha256=wR4Ol4MG2lt5UlqXF5EU_k7qa5cN4_-luu3PJ1PlO3A,3131
 airbyte_cdk/sources/declarative/extractors/http_selector.py,sha256=2zWZ4ewTqQC8VwkjS0xD_u350Km3SiYP7hpOOgiLg5o,1169
 airbyte_cdk/sources/declarative/extractors/record_extractor.py,sha256=XJELMjahAsaomlvQgN2zrNO0DJX0G0fr9r682gUz7Pg,691
-airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=
+airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=OJ9xmhNWNwwzxYOeIrDy1GINb1zH9MBy6suC5tm2LSk,3545
 airbyte_cdk/sources/declarative/extractors/record_selector.py,sha256=tjNwcURmlyD-TGCScXvW95ThNKyPGcx2SiWbG1-H-sc,6552
 airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py,sha256=LhqGDfX06_dDYLKsIVnwQ_nAWCln-v8PV7Wgt_QVeTI,6533
 airbyte_cdk/sources/declarative/extractors/type_transformer.py,sha256=d6Y2Rfg8pMVEEnHllfVksWZdNVOU55yk34O03dP9muY,1626
-airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=
-airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py,sha256=vU6bcVgjDFou7szl5UKxv2-theKSsV78oSME84-C78A,15043
+airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=huRz3KQJSUFmJCg5GPE9TckEBsB5TMsCa_THhJAhPVI,1037
 airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py,sha256=_UzUnSIUsDbRgbFTXgSyZEFb4ws-KdhdQPWO8mFbV7U,22028
 airbyte_cdk/sources/declarative/incremental/declarative_cursor.py,sha256=5Bhw9VRPyIuCaD0wmmq_L3DZsa-rJgtKSEUzSd8YYD0,536
 airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py,sha256=3_EEZop94bMitZaJd2PF5Q2Xt9v94tYg7p7YJz8tAFc,15869
-airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=
+airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=hElcYijbOHjdLKOMA7W7aizEbf22r7OSApXALP875uI,15749
 airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py,sha256=2YBOA2NnwAeIKlIhSwUB_W-FaGnPcmrG_liY7b4mV2Y,8365
 airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py,sha256=10LFv1QPM-agVKl6eaANmEBOfd7gZgBrkoTcMggsieQ,4809
 airbyte_cdk/sources/declarative/interpolation/__init__.py,sha256=tjUJkn3B-iZ-p7RP2c3dVZejrGiQeooGmS5ibWTuUL4,437
@@ -107,12 +106,12 @@ airbyte_cdk/sources/declarative/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW
 airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py,sha256=iemy3fKLczcU0-Aor7tx5jcT6DRedKMqyK7kCOp01hg,3924
 airbyte_cdk/sources/declarative/migrations/state_migration.py,sha256=KWPjealMLKSMtajXgkdGgKg7EmTLR-CqqD7UIh0-eDU,794
 airbyte_cdk/sources/declarative/models/__init__.py,sha256=nUFxNCiKeYRVXuZEKA7GD-lTHxsiKcQ8FitZjKhPIvE,100
-airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=
+airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=SpMwCe-6NZTxICSFIXzwlAnAwNLlC8xS12ncEC1NcbA,93536
 airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=Rir9_z3Kcd5Es0-LChrzk-0qubAsiK_RSEnLmK2OXm8,553
 airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py,sha256=CXwTfD3wSQq3okcqwigpprbHhSURUokh4GK2OmOyKC8,9132
 airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py,sha256=IWUOdF03o-aQn0Occo1BJCxU0Pz-QILk5L67nzw2thw,6803
-airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=
+airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=lmSh2Yp-lgRTbbSw3m6UH8L2nTRjt0w3aiISWHRG6IM,109739
 airbyte_cdk/sources/declarative/partition_routers/__init__.py,sha256=HJ-Syp3p7RpyR_OK0X_a2kSyISfu3W-PKrRI16iY0a8,957
 airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py,sha256=n82J15S8bjeMZ5uROu--P3hnbQoxkY5v7RPHYx7g7ro,2929
 airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py,sha256=c5cuVFM6NFkuQqG8Z5IwkBuwDrvXZN1CunUOM_L0ezg,6892
@@ -163,7 +162,7 @@ airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py,sha256=Aio
 airbyte_cdk/sources/declarative/retrievers/__init__.py,sha256=ix9m1dkR69DcXCXUKC5RK_ZZM7ojTLBQ4IkWQTfmfCk,456
 airbyte_cdk/sources/declarative/retrievers/async_retriever.py,sha256=kX9ltelK2xLIBWDJBK2ucrvVe5tc5xmhdbVbgsjvlxY,3696
 airbyte_cdk/sources/declarative/retrievers/retriever.py,sha256=XPLs593Xv8c5cKMc37XzUAYmzlXd1a7eSsspM-CMuWA,1696
-airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=
+airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=jxQ_9xcVD07r9PKhofitAqMkdX1k8ZNyy50qz5NwkFs,24540
 airbyte_cdk/sources/declarative/schema/__init__.py,sha256=HztgVVaZdil5UfgUZcv_Hyy84r89_EKRwyO2hoewNVg,749
 airbyte_cdk/sources/declarative/schema/default_schema_loader.py,sha256=KTACrIE23a83wsm3Rd9Eb4K6-20lrGqYxTHNp9yxsso,1820
 airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py,sha256=H6A3NQ6kPPM-cUNPmdvDPc9xNzR1rQNrK95GbgCW334,8822
@@ -257,7 +256,7 @@ airbyte_cdk/sources/streams/concurrent/abstract_stream.py,sha256=3OB5VsvOkJmCxIM
 airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py,sha256=QTry1QCBUwJDw1QSCEvz23s7zIEx_7QMxkPq9j-oPIQ,1358
 airbyte_cdk/sources/streams/concurrent/adapters.py,sha256=QP_64kQo-b3sRNHZA5aqrgCJqAhIVegRM3vJ8jGyuSY,15213
 airbyte_cdk/sources/streams/concurrent/availability_strategy.py,sha256=4La5v2UffSjGnhmF4kwNIKt_g3RXk2ux1mSHA1ejgYM,2898
-airbyte_cdk/sources/streams/concurrent/cursor.py,sha256=
+airbyte_cdk/sources/streams/concurrent/cursor.py,sha256=Hke6CpD8Sq1FS4g1Xuht39UN7hKkGy1mvOxvQrm1lLM,20810
 airbyte_cdk/sources/streams/concurrent/default_stream.py,sha256=K3rLMpYhS7nnmvwQ52lqBy7DQdFMJpvvT7sgBg_ckA8,3207
 airbyte_cdk/sources/streams/concurrent/exceptions.py,sha256=JOZ446MCLpmF26r9KfS6OO_6rGjcjgJNZdcw6jccjEI,468
 airbyte_cdk/sources/streams/concurrent/helpers.py,sha256=S6AW8TgIASCZ2UuUcQLE8OzgYUHWt2-KPOvNPwnQf-Q,1596
@@ -343,8 +342,8 @@ airbyte_cdk/utils/slice_hasher.py,sha256=-pHexlNYoWYPnXNH-M7HEbjmeJe9Zk7SJijdQ7d
 airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
 airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
 airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
-airbyte_cdk-6.17.1.dev1.dist-info/LICENSE.txt,sha256=
-airbyte_cdk-6.17.1.dev1.dist-info/METADATA,sha256=
-airbyte_cdk-6.17.1.dev1.dist-info/WHEEL,sha256=
-airbyte_cdk-6.17.1.dev1.dist-info/entry_points.txt,sha256=
-airbyte_cdk-6.17.1.dev1.dist-info/RECORD,,
+airbyte_cdk-6.18.0.dev1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-6.18.0.dev1.dist-info/METADATA,sha256=ALXOgvI3pTcF2tNmvbQ9S8fG424n229th_tx1u2uSCo,6005
+airbyte_cdk-6.18.0.dev1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+airbyte_cdk-6.18.0.dev1.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
+airbyte_cdk-6.18.0.dev1.dist-info/RECORD,,
@@ -1,340 +0,0 @@
-import copy
-import logging
-
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-import threading
-from collections import OrderedDict
-from copy import deepcopy
-from datetime import timedelta
-from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
-
-from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
-from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
-from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
-    Timer,
-    iterate_with_last_flag_and_state,
-)
-from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
-from airbyte_cdk.sources.message import MessageRepository
-from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
-    PerPartitionKeySerializer,
-)
-from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField
-from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
-from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
-
-logger = logging.getLogger("airbyte")
-
-
-class ConcurrentCursorFactory:
-    def __init__(self, create_function: Callable[..., Cursor]):
-        self._create_function = create_function
-
-    def create(self, stream_state: Mapping[str, Any], runtime_lookback_window: Any) -> Cursor:
-        return self._create_function(
-            stream_state=stream_state, runtime_lookback_window=runtime_lookback_window
-        )
-
-
-class ConcurrentPerPartitionCursor(Cursor):
-    """
-    Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
-
-    **Partition Limitation and Limit Reached Logic**
-
-    - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
-    - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
-    - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.
-
-    The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.
-
-    - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
-    - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.
-
-    This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
-    """
-
-    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
-    _NO_STATE: Mapping[str, Any] = {}
-    _NO_CURSOR_STATE: Mapping[str, Any] = {}
-    _KEY = 0
-    _VALUE = 1
-
-    def __init__(
-        self,
-        cursor_factory: ConcurrentCursorFactory,
-        partition_router: PartitionRouter,
-        stream_name: str,
-        stream_namespace: Optional[str],
-        stream_state: Any,
-        message_repository: MessageRepository,
-        connector_state_manager: ConnectorStateManager,
-        cursor_field: CursorField,
-    ) -> None:
-        self._global_cursor: Mapping[str, Any] = {}
-        self._stream_name = stream_name
-        self._stream_namespace = stream_namespace
-        self._message_repository = message_repository
-        self._connector_state_manager = connector_state_manager
-        self._cursor_field = cursor_field
-
-        self._cursor_factory = cursor_factory
-        self._partition_router = partition_router
-
-        # The dict is ordered to ensure that once the maximum number of partitions is reached,
-        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
-        self._cursor_per_partition: OrderedDict[str, Cursor] = OrderedDict()
-        self._state = {"states": []}
-        self._semaphore_per_partition = OrderedDict()
-        self._finished_partitions = set()
-        self._lock = threading.Lock()
-        self._timer = Timer()
-        self._new_global_cursor = None
-        self._lookback_window = 0
-        self._parent_state = None
-        self._over_limit = 0
-        self._partition_serializer = PerPartitionKeySerializer()
-
-        self._set_initial_state(stream_state)
-
-    @property
-    def cursor_field(self) -> CursorField:
-        return self._cursor_field
-
-    @property
-    def state(self) -> MutableMapping[str, Any]:
-        states = []
-        for partition_tuple, cursor in self._cursor_per_partition.items():
-            if cursor.state:
-                states.append(
-                    {
-                        "partition": self._to_dict(partition_tuple),
-                        "cursor": copy.deepcopy(cursor.state),
-                    }
-                )
-        state: dict[str, Any] = {"states": states}
-
-        if self._global_cursor:
-            state["state"] = self._global_cursor
-        if self._lookback_window is not None:
-            state["lookback_window"] = self._lookback_window
-        if self._parent_state is not None:
-            state["parent_state"] = self._parent_state
-        return state
-
-    def close_partition(self, partition: Partition) -> None:
-        self._cursor_per_partition[
-            self._to_partition_key(partition._stream_slice.partition)
-        ].close_partition(partition=partition)
-        with self._lock:
-            self._semaphore_per_partition[
-                self._to_partition_key(partition._stream_slice.partition)
-            ].acquire()
-            cursor = self._cursor_per_partition[
-                self._to_partition_key(partition._stream_slice.partition)
-            ]
-            if (
-                self._to_partition_key(partition._stream_slice.partition)
-                in self._finished_partitions
-                and self._semaphore_per_partition[
-                    self._to_partition_key(partition._stream_slice.partition)
-                ]._value
-                == 0
-            ):
-                if (
-                    self._new_global_cursor is None
-                    or self._new_global_cursor[self.cursor_field.cursor_field_key]
-                    < cursor.state[self.cursor_field.cursor_field_key]
-                ):
-                    self._new_global_cursor = copy.deepcopy(cursor.state)
-
-    def ensure_at_least_one_state_emitted(self) -> None:
-        """
-        The platform expects to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
-        called.
-        """
-        if not any(
-            semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
-        ):
-            self._global_cursor = self._new_global_cursor
-            self._lookback_window = self._timer.finish()
-            self._parent_state = self._partition_router.get_stream_state()
-        self._emit_state_message()
-
-    def _emit_state_message(self) -> None:
-        self._connector_state_manager.update_state_for_stream(
-            self._stream_name,
-            self._stream_namespace,
-            self.state,
-        )
-        state_message = self._connector_state_manager.create_state_message(
-            self._stream_name, self._stream_namespace
-        )
-        self._message_repository.emit_message(state_message)
-
-    def stream_slices(self) -> Iterable[StreamSlice]:
-        slices = self._partition_router.stream_slices()
-        self._timer.start()
-        for partition in slices:
-            yield from self.generate_slices_from_partition(partition)
-
-    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
-        # Ensure the maximum number of partitions is not exceeded
-        self._ensure_partition_limit()
-
-        cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
-        if not cursor:
-            partition_state = self._global_cursor if self._global_cursor else self._NO_CURSOR_STATE
-            cursor = self._create_cursor(partition_state)
-            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
-                threading.Semaphore(0)
-            )
-
-        for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
-            cursor.stream_slices(),
-            lambda: None,
-        ):
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
-            if is_last_slice:
-                self._finished_partitions.add(self._to_partition_key(partition.partition))
-            yield StreamSlice(
-                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
-            )
-
-    def _ensure_partition_limit(self) -> None:
-        """
-        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
-        """
-        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
-            self._over_limit += 1
-            oldest_partition = self._cursor_per_partition.popitem(last=False)[
-                0
-            ]  # Remove the oldest partition
-            logger.warning(
-                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
-            )
-
-    def limit_reached(self) -> bool:
-        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
-
-    def _set_initial_state(self, stream_state: StreamState) -> None:
-        """
-        Set the initial state for the cursors.
-
-        This method initializes the state for each partition cursor using the provided stream state.
-        If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
-
-        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
-        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
-
-        Args:
-            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
-                {
-                    "states": [
-                        {
-                            "partition": {
-                                "partition_key": "value"
-                            },
-                            "cursor": {
-                                "last_updated": "2023-05-27T00:00:00Z"
-                            }
-                        }
-                    ],
-                    "parent_state": {
-                        "parent_stream_name": {
-                            "last_updated": "2023-05-27T00:00:00Z"
-                        }
-                    }
-                }
-        """
-        if not stream_state:
-            return
-
-        if "states" not in stream_state:
-            # We assume that `stream_state` is in a global format that can be applied to all partitions.
-            # Example: {"global_state_format_key": "global_state_format_value"}
-            self._global_cursor = deepcopy(stream_state)
-            self._new_global_cursor = deepcopy(stream_state)
-
-        else:
-            self._lookback_window = stream_state.get("lookback_window")
-
-            for state in stream_state["states"]:
-                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
-                    self._create_cursor(
-                        state["cursor"], runtime_lookback_window=self._lookback_window
-                    )
-                )
-                self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
-                    threading.Semaphore(0)
-                )
-
-            # set default state for missing partitions if it is per partition with fallback to global
-            if "state" in stream_state:
-                self._global_cursor = deepcopy(stream_state["state"])
-                self._new_global_cursor = deepcopy(stream_state["state"])
-
-        # Set parent state for partition routers based on parent streams
-        self._partition_router.set_initial_state(stream_state)
-
-    def observe(self, record: Record) -> None:
-        self._cursor_per_partition[
-            self._to_partition_key(record.associated_slice.partition)
-        ].observe(record)
-
-    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
-        return self._partition_serializer.to_partition_key(partition)
-
-    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
-        return self._partition_serializer.to_partition(partition_key)
-
-    def _create_cursor(self, cursor_state: Any, runtime_lookback_window: Any = None) -> Cursor:
-        if runtime_lookback_window:
-            runtime_lookback_window = timedelta(seconds=runtime_lookback_window)
-        cursor = self._cursor_factory.create(
-            stream_state=deepcopy(cursor_state), runtime_lookback_window=runtime_lookback_window
-        )
-        return cursor
-
-    def should_be_synced(self, record: Record) -> bool:
-        return self._get_cursor(record).should_be_synced(record)
-
-    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
-        if not first.associated_slice or not second.associated_slice:
-            raise ValueError(
-                f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
-            )
-        if first.associated_slice.partition != second.associated_slice.partition:
-            raise ValueError(
-                f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
-            )
-
-        return self._get_cursor(first).is_greater_than_or_equal(
-            self._convert_record_to_cursor_record(first),
-            self._convert_record_to_cursor_record(second),
-        )
-
-    @staticmethod
-    def _convert_record_to_cursor_record(record: Record) -> Record:
-        return Record(
-            record.data,
-            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
-            if record.associated_slice
-            else None,
-        )
-
-    def _get_cursor(self, record: Record) -> Cursor:
-        if not record.associated_slice:
-            raise ValueError(
-                "Invalid state as stream slices that are emitted should refer to an existing cursor"
-            )
-        partition_key = self._to_partition_key(record.associated_slice.partition)
-        if partition_key not in self._cursor_per_partition:
-            raise ValueError(
-                "Invalid state as stream slices that are emitted should refer to an existing cursor"
-            )
-        cursor = self._cursor_per_partition[partition_key]
-        return cursor
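The partition cap described in the removed docstring rested on OrderedDict's FIFO eviction via popitem(last=False); a compact stand-in of _ensure_partition_limit:

from collections import OrderedDict

MAX_PARTITIONS = 3  # stand-in for DEFAULT_MAX_PARTITIONS_NUMBER

cursors = OrderedDict()
for key in ["p1", "p2", "p3", "p4"]:
    while len(cursors) > MAX_PARTITIONS - 1:
        oldest, _ = cursors.popitem(last=False)  # evict the oldest partition
    cursors[key] = object()  # stand-in cursor

assert list(cursors) == ["p2", "p3", "p4"]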
{airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/LICENSE.txt: File without changes
{airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/WHEEL: File without changes
{airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/entry_points.txt: File without changes