airbyte-cdk 6.17.1.dev1__py3-none-any.whl → 6.18.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +0 -69
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +15 -1
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +28 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +5 -3
- airbyte_cdk/sources/declarative/incremental/__init__.py +0 -6
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +0 -15
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +30 -12
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +24 -78
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +1 -1
- airbyte_cdk/sources/streams/concurrent/cursor.py +21 -30
- {airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/RECORD +15 -16
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +0 -340
- {airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/concurrent_declarative_source.py

@@ -20,9 +20,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
     ClientSideIncrementalRecordFilterDecorator,
 )
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
-from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
-    PerPartitionWithGlobalCursor,
-)
 from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
@@ -307,72 +304,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                             cursor=final_state_cursor,
                         )
                     )
-                elif (
-                    incremental_sync_component_definition
-                    and incremental_sync_component_definition.get("type", "")
-                    == DatetimeBasedCursorModel.__name__
-                    and self._stream_supports_concurrent_partition_processing(
-                        declarative_stream=declarative_stream
-                    )
-                    and hasattr(declarative_stream.retriever, "stream_slicer")
-                    and isinstance(
-                        declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor
-                    )
-                ):
-                    stream_state = state_manager.get_stream_state(
-                        stream_name=declarative_stream.name, namespace=declarative_stream.namespace
-                    )
-                    partition_router = declarative_stream.retriever.stream_slicer._partition_router
-
-                    cursor = self._constructor.create_concurrent_cursor_from_perpartition_cursor(
-                        state_manager=state_manager,
-                        model_type=DatetimeBasedCursorModel,
-                        component_definition=incremental_sync_component_definition,
-                        stream_name=declarative_stream.name,
-                        stream_namespace=declarative_stream.namespace,
-                        config=config or {},
-                        stream_state=stream_state,
-                        partition_router=partition_router,
-                    )
-
-                    retriever = declarative_stream.retriever
-
-                    # This is an optimization so that we don't invoke any cursor or state management flows within the
-                    # low-code framework because state management is handled through the ConcurrentCursor.
-                    if declarative_stream and isinstance(retriever, SimpleRetriever):
-                        # Also a temporary hack. In the legacy Stream implementation, as part of the read,
-                        # set_initial_state() is called to instantiate incoming state on the cursor. Although we no
-                        # longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components
-                        # like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator
-                        # still rely on a DatetimeBasedCursor that is properly initialized with state.
-                        if retriever.cursor:
-                            retriever.cursor.set_initial_state(stream_state=stream_state)
-                        # We zero it out here, but since this is a cursor reference, the state is still properly
-                        # instantiated for the other components that reference it
-                        retriever.cursor = None
-
-                    partition_generator = StreamSlicerPartitionGenerator(
-                        DeclarativePartitionFactory(
-                            declarative_stream.name,
-                            declarative_stream.get_json_schema(),
-                            retriever,
-                            self.message_repository,
-                        ),
-                        cursor,
-                    )
-
-                    concurrent_streams.append(
-                        DefaultStream(
-                            partition_generator=partition_generator,
-                            name=declarative_stream.name,
-                            json_schema=declarative_stream.get_json_schema(),
-                            availability_strategy=AlwaysAvailableAvailabilityStrategy(),
-                            primary_key=get_primary_key_from_stream(declarative_stream.primary_key),
-                            cursor_field=cursor.cursor_field.cursor_field_key,
-                            logger=self.logger,
-                            cursor=cursor,
-                        )
-                    )
                 else:
                     synchronous_streams.append(declarative_stream)
             else:
airbyte_cdk/sources/declarative/declarative_component_schema.yaml

@@ -678,7 +678,7 @@ definitions:
     properties:
       type:
        type: string
-        enum: [
+        enum: [CustomSchemaNormalization]
      class_name:
        title: Class Name
        description: Fully-qualified name of the class that will be implementing the custom normalization. The format is `source_<name>.<package>.<class_name>`.
@@ -2014,6 +2014,20 @@ definitions:
       $parameters:
         type: object
         additionalProperties: true
+  JsonParser:
+    title: JsonParser
+    description: Parser used for parsing str, bytes, or bytearray data and returning data in a dictionary format.
+    type: object
+    additionalProperties: true
+    required:
+      - type
+    properties:
+      type:
+        type: string
+        enum: [JsonParser]
+      encoding:
+        type: string
+        default: utf-8
   ListPartitionRouter:
     title: List Partition Router
     description: A Partition router that specifies a list of attributes where each attribute describes a portion of the complete data set for a stream. During a sync, each value is iterated over and can be used as input to outbound API requests.
airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py

@@ -7,9 +7,12 @@ from dataclasses import dataclass
 from io import BufferedIOBase, TextIOWrapper
 from typing import Any, Generator, MutableMapping, Optional
 
+import orjson
 import requests
 
+from airbyte_cdk.models import FailureType
 from airbyte_cdk.sources.declarative.decoders.decoder import Decoder
+from airbyte_cdk.utils import AirbyteTracedException
 
 logger = logging.getLogger("airbyte")
 
@@ -42,6 +45,31 @@ class GzipParser(Parser):
         yield from self.inner_parser.parse(gzipobj)
 
 
+@dataclass
+class JsonParser(Parser):
+    encoding: str = "utf-8"
+
+    def parse(self, data: BufferedIOBase) -> Generator[MutableMapping[str, Any], None, None]:
+        raw_data = data.read()
+        try:
+            body_json = orjson.loads(raw_data.decode(self.encoding))
+        except Exception:
+            try:
+                body_json = json.loads(raw_data.decode(self.encoding))
+            except Exception as exc:
+                raise AirbyteTracedException(
+                    message="Response JSON data failed to be parsed. See logs for more information.",
+                    internal_message=f"Response JSON data failed to be parsed: {exc=}, {raw_data=}",
+                    failure_type=FailureType.system_error,
+                    exception=exc,
+                )
+
+        if isinstance(body_json, list):
+            yield from body_json
+        else:
+            yield from [body_json]
+
+
 @dataclass
 class JsonLineParser(Parser):
     encoding: Optional[str] = "utf-8"
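Taken on its own, the new parser tries orjson first and falls back to the standard library; a minimal usage sketch based on the code above (the BytesIO payload and variable names are illustrative):

```python
from io import BytesIO

from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import JsonParser

# A JSON array yields one record per element; a single JSON object yields one record.
parser = JsonParser(encoding="utf-8")
records = list(parser.parse(BytesIO(b'[{"id": 1}, {"id": 2}]')))
assert records == [{"id": 1}, {"id": 2}]
```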
airbyte_cdk/sources/declarative/extractors/record_filter.py

@@ -59,11 +59,13 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
 
     def __init__(
         self,
-
+        date_time_based_cursor: DatetimeBasedCursor,
+        substream_cursor: Optional[Union[PerPartitionWithGlobalCursor, GlobalSubstreamCursor]],
         **kwargs: Any,
     ):
         super().__init__(**kwargs)
-        self.
+        self._date_time_based_cursor = date_time_based_cursor
+        self._substream_cursor = substream_cursor
 
     def filter_records(
         self,

@@ -75,7 +77,7 @@ class ClientSideIncrementalRecordFilterDecorator(RecordFilter):
         records = (
             record
             for record in records
-            if self.
+            if (self._substream_cursor or self._date_time_based_cursor).should_be_synced(
                 # Record is created on the fly to align with cursors interface; stream name is ignored as we don't need it here
                 # Record stream name is empty because it is not used during the filtering
                 Record(data=record, associated_slice=stream_slice, stream_name="")
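The restored filter keeps both cursors and, at filter time, prefers the substream cursor when one was built; a minimal illustration of that `or` fallback (stand-in objects, not real cursors):

```python
# Stand-ins for the two cursor attributes restored above (illustrative only).
date_time_based_cursor = object()  # would be a DatetimeBasedCursor
substream_cursor = None            # no PerPartitionWithGlobalCursor/GlobalSubstreamCursor built

# Mirrors `(self._substream_cursor or self._date_time_based_cursor).should_be_synced(...)`:
chosen = substream_cursor or date_time_based_cursor
assert chosen is date_time_based_cursor
```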
airbyte_cdk/sources/declarative/incremental/__init__.py

@@ -2,10 +2,6 @@
 # Copyright (c) 2022 Airbyte, Inc., all rights reserved.
 #
 
-from airbyte_cdk.sources.declarative.incremental.concurrent_partition_cursor import (
-    ConcurrentCursorFactory,
-    ConcurrentPerPartitionCursor,
-)
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
 from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
@@ -25,8 +21,6 @@ from airbyte_cdk.sources.declarative.incremental.resumable_full_refresh_cursor i
 
 __all__ = [
     "CursorFactory",
-    "ConcurrentCursorFactory",
-    "ConcurrentPerPartitionCursor",
     "DatetimeBasedCursor",
     "DeclarativeCursor",
     "GlobalSubstreamCursor",
airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py

@@ -303,21 +303,6 @@ class PerPartitionCursor(DeclarativeCursor):
         raise ValueError("A partition needs to be provided in order to get request body json")
 
     def should_be_synced(self, record: Record) -> bool:
-        if (
-            record.associated_slice
-            and self._to_partition_key(record.associated_slice.partition)
-            not in self._cursor_per_partition
-        ):
-            partition_state = (
-                self._state_to_migrate_from
-                if self._state_to_migrate_from
-                else self._NO_CURSOR_STATE
-            )
-            cursor = self._create_cursor(partition_state)
-
-            self._cursor_per_partition[
-                self._to_partition_key(record.associated_slice.partition)
-            ] = cursor
         return self._get_cursor(record).should_be_synced(
             self._convert_record_to_cursor_record(record)
         )
airbyte_cdk/sources/declarative/models/declarative_component_schema.py

@@ -737,33 +737,43 @@ class KeysToSnakeCase(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
+class FlattenFields(BaseModel):
+    type: Literal["FlattenFields"]
+    flatten_lists: Optional[bool] = Field(
+        True,
+        description="Whether to flatten lists or leave it as is. Default is True.",
+        title="Flatten Lists",
+    )
+    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
+
+
 class KeysReplace(BaseModel):
     type: Literal["KeysReplace"]
     old: str = Field(
         ...,
         description="Old value to replace.",
-        examples=[
+        examples=[
+            " ",
+            "{{ record.id }}",
+            "{{ config['id'] }}",
+            "{{ stream_slice['id'] }}",
+        ],
         title="Old value",
     )
     new: str = Field(
         ...,
         description="New value to set.",
-        examples=[
+        examples=[
+            "_",
+            "{{ record.id }}",
+            "{{ config['id'] }}",
+            "{{ stream_slice['id'] }}",
+        ],
         title="New value",
     )
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
-class FlattenFields(BaseModel):
-    type: Literal["FlattenFields"]
-    flatten_lists: Optional[bool] = Field(
-        True,
-        description="Whether to flatten lists or leave it as is. Default is True.",
-        title="Flatten Lists",
-    )
-    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
-
-
 class IterableDecoder(BaseModel):
     type: Literal["IterableDecoder"]
 
@@ -795,6 +805,14 @@ class GzipJsonDecoder(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
+class JsonParser(BaseModel):
+    class Config:
+        extra = Extra.allow
+
+    type: Literal["JsonParser"]
+    encoding: Optional[str] = "utf-8"
+
+
 class MinMaxDatetime(BaseModel):
     type: Literal["MinMaxDatetime"]
     datetime: str = Field(
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

@@ -72,6 +72,7 @@ from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
     CsvParser,
     GzipParser,
     JsonLineParser,
+    JsonParser,
 )
 from airbyte_cdk.sources.declarative.extractors import (
     DpathExtractor,
|
@@ -84,8 +85,6 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
|
|
84
85
|
)
|
85
86
|
from airbyte_cdk.sources.declarative.incremental import (
|
86
87
|
ChildPartitionResumableFullRefreshCursor,
|
87
|
-
ConcurrentCursorFactory,
|
88
|
-
ConcurrentPerPartitionCursor,
|
89
88
|
CursorFactory,
|
90
89
|
DatetimeBasedCursor,
|
91
90
|
DeclarativeCursor,
|
@@ -249,6 +248,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JsonLineParser as JsonLineParserModel,
 )
+from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
+    JsonParser as JsonParserModel,
+)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     JwtAuthenticator as JwtAuthenticatorModel,
 )
@@ -440,7 +442,6 @@ from airbyte_cdk.sources.message import (
     InMemoryMessageRepository,
     LogAppenderMessageRepositoryDecorator,
     MessageRepository,
-    NoopMessageRepository,
 )
 from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, CursorField
 from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
@@ -525,6 +526,7 @@ class ModelToComponentFactory:
             JsonDecoderModel: self.create_json_decoder,
             JsonlDecoderModel: self.create_jsonl_decoder,
             JsonLineParserModel: self.create_json_line_parser,
+            JsonParserModel: self.create_json_parser,
             GzipJsonDecoderModel: self.create_gzipjson_decoder,
             GzipParserModel: self.create_gzip_parser,
             KeysToLowerModel: self.create_keys_to_lower_transformation,
@@ -874,8 +876,6 @@ class ModelToComponentFactory:
         stream_namespace: Optional[str],
         config: Config,
         stream_state: MutableMapping[str, Any],
-        message_repository: Optional[MessageRepository] = None,
-        runtime_lookback_window: Optional[int] = None,
         **kwargs: Any,
     ) -> ConcurrentCursor:
         component_type = component_definition.get("type")
@@ -933,11 +933,6 @@ class ModelToComponentFactory:
         if evaluated_lookback_window:
             lookback_window = parse_duration(evaluated_lookback_window)
 
-        if runtime_lookback_window and lookback_window:
-            lookback_window = max(lookback_window, runtime_lookback_window)
-        elif runtime_lookback_window:
-            lookback_window = runtime_lookback_window
-
         connector_state_converter: DateTimeStreamStateConverter
         connector_state_converter = CustomFormatConcurrentStreamStateConverter(
             datetime_format=datetime_format,
@@ -1016,7 +1011,7 @@ class ModelToComponentFactory:
             stream_name=stream_name,
             stream_namespace=stream_namespace,
             stream_state=stream_state,
-            message_repository=
+            message_repository=self._message_repository,
             connector_state_manager=state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
@@ -1028,63 +1023,6 @@ class ModelToComponentFactory:
             cursor_granularity=cursor_granularity,
         )
 
-    def create_concurrent_cursor_from_perpartition_cursor(
-        self,
-        state_manager: ConnectorStateManager,
-        model_type: Type[BaseModel],
-        component_definition: ComponentDefinition,
-        stream_name: str,
-        stream_namespace: Optional[str],
-        config: Config,
-        stream_state: MutableMapping[str, Any],
-        partition_router,
-        **kwargs: Any,
-    ) -> ConcurrentPerPartitionCursor:
-        component_type = component_definition.get("type")
-        if component_definition.get("type") != model_type.__name__:
-            raise ValueError(
-                f"Expected manifest component of type {model_type.__name__}, but received {component_type} instead"
-            )
-
-        datetime_based_cursor_model = model_type.parse_obj(component_definition)
-
-        if not isinstance(datetime_based_cursor_model, DatetimeBasedCursorModel):
-            raise ValueError(
-                f"Expected {model_type.__name__} component, but received {datetime_based_cursor_model.__class__.__name__}"
-            )
-
-        interpolated_cursor_field = InterpolatedString.create(
-            datetime_based_cursor_model.cursor_field,
-            parameters=datetime_based_cursor_model.parameters or {},
-        )
-        cursor_field = CursorField(interpolated_cursor_field.eval(config=config))
-
-        # Create the cursor factory
-        cursor_factory = ConcurrentCursorFactory(
-            partial(
-                self.create_concurrent_cursor_from_datetime_based_cursor,
-                state_manager=state_manager,
-                model_type=model_type,
-                component_definition=component_definition,
-                stream_name=stream_name,
-                stream_namespace=stream_namespace,
-                config=config,
-                message_repository=NoopMessageRepository(),
-            )
-        )
-
-        # Return the concurrent cursor and state converter
-        return ConcurrentPerPartitionCursor(
-            cursor_factory=cursor_factory,
-            partition_router=partition_router,
-            stream_name=stream_name,
-            stream_namespace=stream_namespace,
-            stream_state=stream_state,
-            message_repository=self._message_repository,  # type: ignore
-            connector_state_manager=state_manager,
-            cursor_field=cursor_field,
-        )
-
     @staticmethod
     def create_constant_backoff_strategy(
         model: ConstantBackoffStrategyModel, config: Config, **kwargs: Any
@@ -1367,15 +1305,18 @@ class ModelToComponentFactory:
             raise ValueError(
                 "Unsupported Slicer is used. PerPartitionWithGlobalCursor should be used here instead"
             )
-
-
-
-
-
-
-
-
-
+        client_side_incremental_sync = {
+            "date_time_based_cursor": self._create_component_from_model(
+                model=model.incremental_sync, config=config
+            ),
+            "substream_cursor": (
+                combined_slicers
+                if isinstance(
+                    combined_slicers, (PerPartitionWithGlobalCursor, GlobalSubstreamCursor)
+                )
+                else None
+            ),
+        }
 
         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
             cursor_model = model.incremental_sync
@@ -1812,6 +1753,11 @@ class ModelToComponentFactory:
     def create_json_decoder(model: JsonDecoderModel, config: Config, **kwargs: Any) -> JsonDecoder:
         return JsonDecoder(parameters={})
 
+    @staticmethod
+    def create_json_parser(model: JsonParserModel, config: Config, **kwargs: Any) -> JsonParser:
+        encoding = model.encoding or "utf-8"
+        return JsonParser(encoding=encoding)
+
     @staticmethod
     def create_jsonl_decoder(
         model: JsonlDecoderModel, config: Config, **kwargs: Any
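As a usage sketch, the factory output can compose with the other parsers in this module; note that `CompositeRawDecoder` accepting a `parser` argument is an assumption based on this module's naming and is not shown in this diff:

```python
from airbyte_cdk.sources.declarative.decoders.composite_raw_decoder import (
    CompositeRawDecoder,  # assumed to accept a `parser` argument
    GzipParser,
    JsonParser,
)

# Gunzip the response body, then hand the decompressed bytes to the new JSON parser.
decoder = CompositeRawDecoder(parser=GzipParser(inner_parser=JsonParser(encoding="utf-8")))
```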
@@ -2191,7 +2137,7 @@ class ModelToComponentFactory:
         if (
             not isinstance(stream_slicer, DatetimeBasedCursor)
             or type(stream_slicer) is not DatetimeBasedCursor
-        )
+        ):
             # Many of the custom component implementations of DatetimeBasedCursor override get_request_params() (or other methods).
             # Because we're decoupling RequestOptionsProvider from the Cursor, custom components will eventually need to reimplement
             # their own RequestOptionsProvider. However, right now the existing StreamSlicer/Cursor still can act as the SimpleRetriever's
airbyte_cdk/sources/declarative/retrievers/simple_retriever.py

@@ -160,7 +160,7 @@ class SimpleRetriever(Retriever):
             stream_slice,
             next_page_token,
             self._paginator.get_request_headers,
-            self.
+            self.stream_slicer.get_request_headers,
         )
         if isinstance(headers, str):
             raise ValueError("Request headers cannot be a string")
airbyte_cdk/sources/streams/concurrent/cursor.py

@@ -196,9 +196,7 @@ class ConcurrentCursor(Cursor):
 
     @property
     def state(self) -> MutableMapping[str, Any]:
-        return self._connector_state_converter.convert_to_state_message(
-            self.cursor_field, self._concurrent_state
-        )
+        return self._concurrent_state
 
     @property
     def cursor_field(self) -> CursorField:
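With this change, `state` exposes the internal interval structure directly, and conversion to the connector-facing format happens only at emit time (see the `_emit_state_message` hunk below). A rough sketch of the internal shape, with illustrative values:

```python
# Internal interval-based shape now returned by ConcurrentCursor.state
# (illustrative values; real entries hold parsed, comparable cursor values):
concurrent_state = {
    "slices": [
        {"start": "2024-01-01T00:00:00Z", "end": "2024-02-01T00:00:00Z"},
    ],
}

# The connector-facing message is produced only when state is emitted, via
# connector_state_converter.convert_to_state_message(cursor_field, concurrent_state),
# e.g. collapsing merged intervals into a single cursor value for the stream.
```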
@@ -243,10 +241,10 @@ class ConcurrentCursor(Cursor):
         return self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))
 
     def close_partition(self, partition: Partition) -> None:
-        slice_count_before = len(self._concurrent_state.get("slices", []))
+        slice_count_before = len(self.state.get("slices", []))
         self._add_slice_to_state(partition)
         if slice_count_before < len(
-            self._concurrent_state["slices"]
+            self.state["slices"]
         ):  # only emit if at least one slice has been processed
             self._merge_partitions()
             self._emit_state_message()
@@ -258,11 +256,11 @@ class ConcurrentCursor(Cursor):
         )
 
         if self._slice_boundary_fields:
-            if "slices" not in self._concurrent_state:
+            if "slices" not in self.state:
                 raise RuntimeError(
                     f"The state for stream {self._stream_name} should have at least one slice to delineate the sync start time, but no slices are present. This is unexpected. Please contact Support."
                 )
-            self._concurrent_state["slices"].append(
+            self.state["slices"].append(
                 {
                     self._connector_state_converter.START_KEY: self._extract_from_slice(
                         partition, self._slice_boundary_fields[self._START_BOUNDARY]
@@ -290,7 +288,7 @@ class ConcurrentCursor(Cursor):
                 "expected. Please contact the Airbyte team."
             )
 
-        self._concurrent_state["slices"].append(
+        self.state["slices"].append(
             {
                 self._connector_state_converter.START_KEY: self.start,
                 self._connector_state_converter.END_KEY: most_recent_cursor_value,
@@ -302,7 +300,9 @@ class ConcurrentCursor(Cursor):
         self._connector_state_manager.update_state_for_stream(
             self._stream_name,
             self._stream_namespace,
-            self.state,
+            self._connector_state_converter.convert_to_state_message(
+                self._cursor_field, self.state
+            ),
         )
         state_message = self._connector_state_manager.create_state_message(
             self._stream_name, self._stream_namespace
@@ -310,9 +310,7 @@ class ConcurrentCursor(Cursor):
         self._message_repository.emit_message(state_message)
 
     def _merge_partitions(self) -> None:
-        self._concurrent_state["slices"] = self._connector_state_converter.merge_intervals(
-            self._concurrent_state["slices"]
-        )
+        self.state["slices"] = self._connector_state_converter.merge_intervals(self.state["slices"])
 
     def _extract_from_slice(self, partition: Partition, key: str) -> CursorValueType:
         try:
@@ -349,42 +347,36 @@ class ConcurrentCursor(Cursor):
         if self._start is not None and self._is_start_before_first_slice():
             yield from self._split_per_slice_range(
                 self._start,
-                self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY],
+                self.state["slices"][0][self._connector_state_converter.START_KEY],
                 False,
             )
 
-        if len(self._concurrent_state["slices"]) == 1:
+        if len(self.state["slices"]) == 1:
             yield from self._split_per_slice_range(
                 self._calculate_lower_boundary_of_last_slice(
-                    self._concurrent_state["slices"][0][self._connector_state_converter.END_KEY]
+                    self.state["slices"][0][self._connector_state_converter.END_KEY]
                 ),
                 self._end_provider(),
                 True,
             )
-        elif len(self._concurrent_state["slices"]) > 1:
-            for i in range(len(self._concurrent_state["slices"]) - 1):
+        elif len(self.state["slices"]) > 1:
+            for i in range(len(self.state["slices"]) - 1):
                 if self._cursor_granularity:
                     yield from self._split_per_slice_range(
-                        self._concurrent_state["slices"][i][self._connector_state_converter.END_KEY]
+                        self.state["slices"][i][self._connector_state_converter.END_KEY]
                         + self._cursor_granularity,
-                        self._concurrent_state["slices"][i + 1][
-                            self._connector_state_converter.START_KEY
-                        ],
+                        self.state["slices"][i + 1][self._connector_state_converter.START_KEY],
                         False,
                     )
                 else:
                     yield from self._split_per_slice_range(
-                        self._concurrent_state["slices"][i][
-                            self._connector_state_converter.END_KEY
-                        ],
-                        self._concurrent_state["slices"][i + 1][
-                            self._connector_state_converter.START_KEY
-                        ],
+                        self.state["slices"][i][self._connector_state_converter.END_KEY],
+                        self.state["slices"][i + 1][self._connector_state_converter.START_KEY],
                         False,
                     )
             yield from self._split_per_slice_range(
                 self._calculate_lower_boundary_of_last_slice(
-                    self._concurrent_state["slices"][-1][self._connector_state_converter.END_KEY]
+                    self.state["slices"][-1][self._connector_state_converter.END_KEY]
                 ),
                 self._end_provider(),
                 True,
@@ -395,8 +387,7 @@ class ConcurrentCursor(Cursor):
     def _is_start_before_first_slice(self) -> bool:
         return (
             self._start is not None
-            and self._start
-            < self._concurrent_state["slices"][0][self._connector_state_converter.START_KEY]
+            and self._start < self.state["slices"][0][self._connector_state_converter.START_KEY]
         )
 
     def _calculate_lower_boundary_of_last_slice(
{airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/RECORD

@@ -62,15 +62,15 @@ airbyte_cdk/sources/declarative/checks/check_stream.py,sha256=dAA-UhmMj0WLXCkRQr
 airbyte_cdk/sources/declarative/checks/connection_checker.py,sha256=MBRJo6WJlZQHpIfOGaNOkkHUmgUl_4wDM6VPo41z5Ss,1383
 airbyte_cdk/sources/declarative/concurrency_level/__init__.py,sha256=5XUqrmlstYlMM0j6crktlKQwALek0uiz2D3WdM46MyA,191
 airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py,sha256=YIwCTCpOr_QSNW4ltQK0yUGWInI8PKNY216HOOegYLk,2101
-airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=
+airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=tSTCSmyMCu1qoGsne1Ooz3c1da-8EDZk6Suiy2gIq9Q,22475
 airbyte_cdk/sources/declarative/datetime/__init__.py,sha256=l9LG7Qm6e5r_qgqfVKnx3mXYtg1I9MmMjomVIPfU4XA,177
 airbyte_cdk/sources/declarative/datetime/datetime_parser.py,sha256=SX9JjdesN1edN2WVUVMzU_ptqp2QB1OnsnjZ4mwcX7w,2579
 airbyte_cdk/sources/declarative/datetime/min_max_datetime.py,sha256=0BHBtDNQZfvwM45-tY5pNlTcKAFSGGNxemoi0Jic-0E,5785
-airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=
+airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=QDqDyKmkYDDW3fXA8ImE61p4v_sBNQnqnV-uX_qNHNM,133531
 airbyte_cdk/sources/declarative/declarative_source.py,sha256=nF7wBqFd3AQmEKAm4CnIo29CJoQL562cJGSCeL8U8bA,1531
 airbyte_cdk/sources/declarative/declarative_stream.py,sha256=JRyNeOIpsFu4ztVZsN6sncqUEIqIE-bUkD2TPgbMgk0,10375
 airbyte_cdk/sources/declarative/decoders/__init__.py,sha256=edGj4fGxznBk4xzRQyCA1rGfbpqe7z-RE0K3kQQWbgA,858
-airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py,sha256
+airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py,sha256=rSvqdGsVgBT3ZfY_bthjZl_OmxY84iKz8g9GQIWyq8k,3766
 airbyte_cdk/sources/declarative/decoders/decoder.py,sha256=sl-Gt8lXi7yD2Q-sD8je5QS2PbgrgsYjxRLWsay7DMc,826
 airbyte_cdk/sources/declarative/decoders/json_decoder.py,sha256=qdbjeR6RffKaah_iWvMsOcDolYuxJY5DaI3b9AMTZXg,3327
 airbyte_cdk/sources/declarative/decoders/noop_decoder.py,sha256=iZh0yKY_JzgBnJWiubEusf5c0o6Khd-8EWFWT-8EgFo,542

@@ -81,16 +81,15 @@ airbyte_cdk/sources/declarative/extractors/__init__.py,sha256=RmV-IkO1YLj0PSOrrq
 airbyte_cdk/sources/declarative/extractors/dpath_extractor.py,sha256=wR4Ol4MG2lt5UlqXF5EU_k7qa5cN4_-luu3PJ1PlO3A,3131
 airbyte_cdk/sources/declarative/extractors/http_selector.py,sha256=2zWZ4ewTqQC8VwkjS0xD_u350Km3SiYP7hpOOgiLg5o,1169
 airbyte_cdk/sources/declarative/extractors/record_extractor.py,sha256=XJELMjahAsaomlvQgN2zrNO0DJX0G0fr9r682gUz7Pg,691
-airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=
+airbyte_cdk/sources/declarative/extractors/record_filter.py,sha256=OJ9xmhNWNwwzxYOeIrDy1GINb1zH9MBy6suC5tm2LSk,3545
 airbyte_cdk/sources/declarative/extractors/record_selector.py,sha256=tjNwcURmlyD-TGCScXvW95ThNKyPGcx2SiWbG1-H-sc,6552
 airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py,sha256=LhqGDfX06_dDYLKsIVnwQ_nAWCln-v8PV7Wgt_QVeTI,6533
 airbyte_cdk/sources/declarative/extractors/type_transformer.py,sha256=d6Y2Rfg8pMVEEnHllfVksWZdNVOU55yk34O03dP9muY,1626
-airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=
-airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py,sha256=vU6bcVgjDFou7szl5UKxv2-theKSsV78oSME84-C78A,15043
+airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=huRz3KQJSUFmJCg5GPE9TckEBsB5TMsCa_THhJAhPVI,1037
 airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py,sha256=_UzUnSIUsDbRgbFTXgSyZEFb4ws-KdhdQPWO8mFbV7U,22028
 airbyte_cdk/sources/declarative/incremental/declarative_cursor.py,sha256=5Bhw9VRPyIuCaD0wmmq_L3DZsa-rJgtKSEUzSd8YYD0,536
 airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py,sha256=3_EEZop94bMitZaJd2PF5Q2Xt9v94tYg7p7YJz8tAFc,15869
-airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=
+airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py,sha256=hElcYijbOHjdLKOMA7W7aizEbf22r7OSApXALP875uI,15749
 airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py,sha256=2YBOA2NnwAeIKlIhSwUB_W-FaGnPcmrG_liY7b4mV2Y,8365
 airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py,sha256=10LFv1QPM-agVKl6eaANmEBOfd7gZgBrkoTcMggsieQ,4809
 airbyte_cdk/sources/declarative/interpolation/__init__.py,sha256=tjUJkn3B-iZ-p7RP2c3dVZejrGiQeooGmS5ibWTuUL4,437

@@ -107,12 +106,12 @@ airbyte_cdk/sources/declarative/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW
 airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py,sha256=iemy3fKLczcU0-Aor7tx5jcT6DRedKMqyK7kCOp01hg,3924
 airbyte_cdk/sources/declarative/migrations/state_migration.py,sha256=KWPjealMLKSMtajXgkdGgKg7EmTLR-CqqD7UIh0-eDU,794
 airbyte_cdk/sources/declarative/models/__init__.py,sha256=nUFxNCiKeYRVXuZEKA7GD-lTHxsiKcQ8FitZjKhPIvE,100
-airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=
+airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=SpMwCe-6NZTxICSFIXzwlAnAwNLlC8xS12ncEC1NcbA,93536
 airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
 airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=Rir9_z3Kcd5Es0-LChrzk-0qubAsiK_RSEnLmK2OXm8,553
 airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py,sha256=CXwTfD3wSQq3okcqwigpprbHhSURUokh4GK2OmOyKC8,9132
 airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py,sha256=IWUOdF03o-aQn0Occo1BJCxU0Pz-QILk5L67nzw2thw,6803
-airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=
+airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=lmSh2Yp-lgRTbbSw3m6UH8L2nTRjt0w3aiISWHRG6IM,109739
 airbyte_cdk/sources/declarative/partition_routers/__init__.py,sha256=HJ-Syp3p7RpyR_OK0X_a2kSyISfu3W-PKrRI16iY0a8,957
 airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py,sha256=n82J15S8bjeMZ5uROu--P3hnbQoxkY5v7RPHYx7g7ro,2929
 airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py,sha256=c5cuVFM6NFkuQqG8Z5IwkBuwDrvXZN1CunUOM_L0ezg,6892

@@ -163,7 +162,7 @@ airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py,sha256=Aio
 airbyte_cdk/sources/declarative/retrievers/__init__.py,sha256=ix9m1dkR69DcXCXUKC5RK_ZZM7ojTLBQ4IkWQTfmfCk,456
 airbyte_cdk/sources/declarative/retrievers/async_retriever.py,sha256=kX9ltelK2xLIBWDJBK2ucrvVe5tc5xmhdbVbgsjvlxY,3696
 airbyte_cdk/sources/declarative/retrievers/retriever.py,sha256=XPLs593Xv8c5cKMc37XzUAYmzlXd1a7eSsspM-CMuWA,1696
-airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=
+airbyte_cdk/sources/declarative/retrievers/simple_retriever.py,sha256=jxQ_9xcVD07r9PKhofitAqMkdX1k8ZNyy50qz5NwkFs,24540
 airbyte_cdk/sources/declarative/schema/__init__.py,sha256=HztgVVaZdil5UfgUZcv_Hyy84r89_EKRwyO2hoewNVg,749
 airbyte_cdk/sources/declarative/schema/default_schema_loader.py,sha256=KTACrIE23a83wsm3Rd9Eb4K6-20lrGqYxTHNp9yxsso,1820
 airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py,sha256=H6A3NQ6kPPM-cUNPmdvDPc9xNzR1rQNrK95GbgCW334,8822

@@ -257,7 +256,7 @@ airbyte_cdk/sources/streams/concurrent/abstract_stream.py,sha256=3OB5VsvOkJmCxIM
 airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py,sha256=QTry1QCBUwJDw1QSCEvz23s7zIEx_7QMxkPq9j-oPIQ,1358
 airbyte_cdk/sources/streams/concurrent/adapters.py,sha256=QP_64kQo-b3sRNHZA5aqrgCJqAhIVegRM3vJ8jGyuSY,15213
 airbyte_cdk/sources/streams/concurrent/availability_strategy.py,sha256=4La5v2UffSjGnhmF4kwNIKt_g3RXk2ux1mSHA1ejgYM,2898
-airbyte_cdk/sources/streams/concurrent/cursor.py,sha256=
+airbyte_cdk/sources/streams/concurrent/cursor.py,sha256=Hke6CpD8Sq1FS4g1Xuht39UN7hKkGy1mvOxvQrm1lLM,20810
 airbyte_cdk/sources/streams/concurrent/default_stream.py,sha256=K3rLMpYhS7nnmvwQ52lqBy7DQdFMJpvvT7sgBg_ckA8,3207
 airbyte_cdk/sources/streams/concurrent/exceptions.py,sha256=JOZ446MCLpmF26r9KfS6OO_6rGjcjgJNZdcw6jccjEI,468
 airbyte_cdk/sources/streams/concurrent/helpers.py,sha256=S6AW8TgIASCZ2UuUcQLE8OzgYUHWt2-KPOvNPwnQf-Q,1596

@@ -343,8 +342,8 @@ airbyte_cdk/utils/slice_hasher.py,sha256=-pHexlNYoWYPnXNH-M7HEbjmeJe9Zk7SJijdQ7d
 airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
 airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
 airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
-airbyte_cdk-6.
-airbyte_cdk-6.
-airbyte_cdk-6.
-airbyte_cdk-6.
-airbyte_cdk-6.
+airbyte_cdk-6.18.0.dev1.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
+airbyte_cdk-6.18.0.dev1.dist-info/METADATA,sha256=ALXOgvI3pTcF2tNmvbQ9S8fG424n229th_tx1u2uSCo,6005
+airbyte_cdk-6.18.0.dev1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+airbyte_cdk-6.18.0.dev1.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
+airbyte_cdk-6.18.0.dev1.dist-info/RECORD,,
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py

@@ -1,340 +0,0 @@
-import copy
-import logging
-
-#
-# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
-#
-import threading
-from collections import OrderedDict
-from copy import deepcopy
-from datetime import timedelta
-from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
-
-from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
-from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
-from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
-    Timer,
-    iterate_with_last_flag_and_state,
-)
-from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
-from airbyte_cdk.sources.message import MessageRepository
-from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
-    PerPartitionKeySerializer,
-)
-from airbyte_cdk.sources.streams.concurrent.cursor import Cursor, CursorField
-from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
-from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
-
-logger = logging.getLogger("airbyte")
-
-
-class ConcurrentCursorFactory:
-    def __init__(self, create_function: Callable[..., Cursor]):
-        self._create_function = create_function
-
-    def create(self, stream_state: Mapping[str, Any], runtime_lookback_window: Any) -> Cursor:
-        return self._create_function(
-            stream_state=stream_state, runtime_lookback_window=runtime_lookback_window
-        )
-
-
-class ConcurrentPerPartitionCursor(Cursor):
-    """
-    Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
-
-    **Partition Limitation and Limit Reached Logic**
-
-    - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
-    - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
-    - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.
-
-    The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.
-
-    - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
-    - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.
-
-    This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
-    """
-
-    DEFAULT_MAX_PARTITIONS_NUMBER = 10000
-    _NO_STATE: Mapping[str, Any] = {}
-    _NO_CURSOR_STATE: Mapping[str, Any] = {}
-    _KEY = 0
-    _VALUE = 1
-
-    def __init__(
-        self,
-        cursor_factory: ConcurrentCursorFactory,
-        partition_router: PartitionRouter,
-        stream_name: str,
-        stream_namespace: Optional[str],
-        stream_state: Any,
-        message_repository: MessageRepository,
-        connector_state_manager: ConnectorStateManager,
-        cursor_field: CursorField,
-    ) -> None:
-        self._global_cursor: Mapping[str, Any] = {}
-        self._stream_name = stream_name
-        self._stream_namespace = stream_namespace
-        self._message_repository = message_repository
-        self._connector_state_manager = connector_state_manager
-        self._cursor_field = cursor_field
-
-        self._cursor_factory = cursor_factory
-        self._partition_router = partition_router
-
-        # The dict is ordered to ensure that once the maximum number of partitions is reached,
-        # the oldest partitions can be efficiently removed, maintaining the most recent partitions.
-        self._cursor_per_partition: OrderedDict[str, Cursor] = OrderedDict()
-        self._state = {"states": []}
-        self._semaphore_per_partition = OrderedDict()
-        self._finished_partitions = set()
-        self._lock = threading.Lock()
-        self._timer = Timer()
-        self._new_global_cursor = None
-        self._lookback_window = 0
-        self._parent_state = None
-        self._over_limit = 0
-        self._partition_serializer = PerPartitionKeySerializer()
-
-        self._set_initial_state(stream_state)
-
-    @property
-    def cursor_field(self) -> CursorField:
-        return self._cursor_field
-
-    @property
-    def state(self) -> MutableMapping[str, Any]:
-        states = []
-        for partition_tuple, cursor in self._cursor_per_partition.items():
-            if cursor.state:
-                states.append(
-                    {
-                        "partition": self._to_dict(partition_tuple),
-                        "cursor": copy.deepcopy(cursor.state),
-                    }
-                )
-        state: dict[str, Any] = {"states": states}
-
-        if self._global_cursor:
-            state["state"] = self._global_cursor
-        if self._lookback_window is not None:
-            state["lookback_window"] = self._lookback_window
-        if self._parent_state is not None:
-            state["parent_state"] = self._parent_state
-        return state
-
-    def close_partition(self, partition: Partition) -> None:
-        self._cursor_per_partition[
-            self._to_partition_key(partition._stream_slice.partition)
-        ].close_partition(partition=partition)
-        with self._lock:
-            self._semaphore_per_partition[
-                self._to_partition_key(partition._stream_slice.partition)
-            ].acquire()
-            cursor = self._cursor_per_partition[
-                self._to_partition_key(partition._stream_slice.partition)
-            ]
-            if (
-                self._to_partition_key(partition._stream_slice.partition)
-                in self._finished_partitions
-                and self._semaphore_per_partition[
-                    self._to_partition_key(partition._stream_slice.partition)
-                ]._value
-                == 0
-            ):
-                if (
-                    self._new_global_cursor is None
-                    or self._new_global_cursor[self.cursor_field.cursor_field_key]
-                    < cursor.state[self.cursor_field.cursor_field_key]
-                ):
-                    self._new_global_cursor = copy.deepcopy(cursor.state)
-
-    def ensure_at_least_one_state_emitted(self) -> None:
-        """
-        The platform expect to have at least one state message on successful syncs. Hence, whatever happens, we expect this method to be
-        called.
-        """
-        if not any(
-            semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
-        ):
-            self._global_cursor = self._new_global_cursor
-            self._lookback_window = self._timer.finish()
-            self._parent_state = self._partition_router.get_stream_state()
-        self._emit_state_message()
-
-    def _emit_state_message(self) -> None:
-        self._connector_state_manager.update_state_for_stream(
-            self._stream_name,
-            self._stream_namespace,
-            self.state,
-        )
-        state_message = self._connector_state_manager.create_state_message(
-            self._stream_name, self._stream_namespace
-        )
-        self._message_repository.emit_message(state_message)
-
-    def stream_slices(self) -> Iterable[StreamSlice]:
-        slices = self._partition_router.stream_slices()
-        self._timer.start()
-        for partition in slices:
-            yield from self.generate_slices_from_partition(partition)
-
-    def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
-        # Ensure the maximum number of partitions is not exceeded
-        self._ensure_partition_limit()
-
-        cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
-        if not cursor:
-            partition_state = self._global_cursor if self._global_cursor else self._NO_CURSOR_STATE
-            cursor = self._create_cursor(partition_state)
-            self._cursor_per_partition[self._to_partition_key(partition.partition)] = cursor
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)] = (
-                threading.Semaphore(0)
-            )
-
-        for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
-            cursor.stream_slices(),
-            lambda: None,
-        ):
-            self._semaphore_per_partition[self._to_partition_key(partition.partition)].release()
-            if is_last_slice:
-                self._finished_partitions.add(self._to_partition_key(partition.partition))
-            yield StreamSlice(
-                partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
-            )
-
-    def _ensure_partition_limit(self) -> None:
-        """
-        Ensure the maximum number of partitions is not exceeded. If so, the oldest added partition will be dropped.
-        """
-        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
-            self._over_limit += 1
-            oldest_partition = self._cursor_per_partition.popitem(last=False)[
-                0
-            ]  # Remove the oldest partition
-            logger.warning(
-                f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
-            )
-
-    def limit_reached(self) -> bool:
-        return self._over_limit > self.DEFAULT_MAX_PARTITIONS_NUMBER
-
-    def _set_initial_state(self, stream_state: StreamState) -> None:
-        """
-        Set the initial state for the cursors.
-
-        This method initializes the state for each partition cursor using the provided stream state.
-        If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
-
-        Additionally, it sets the parent state for partition routers that are based on parent streams. If a partition router
-        does not have parent streams, this step will be skipped due to the default PartitionRouter implementation.
-
-        Args:
-            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
-                {
-                    "states": [
-                        {
-                            "partition": {
-                                "partition_key": "value"
-                            },
-                            "cursor": {
-                                "last_updated": "2023-05-27T00:00:00Z"
-                            }
-                        }
-                    ],
-                    "parent_state": {
-                        "parent_stream_name": {
-                            "last_updated": "2023-05-27T00:00:00Z"
-                        }
-                    }
-                }
-        """
-        if not stream_state:
-            return
-
-        if "states" not in stream_state:
-            # We assume that `stream_state` is in a global format that can be applied to all partitions.
-            # Example: {"global_state_format_key": "global_state_format_value"}
-            self._global_cursor = deepcopy(stream_state)
-            self._new_global_cursor = deepcopy(stream_state)
-
-        else:
-            self._lookback_window = stream_state.get("lookback_window")
-
-            for state in stream_state["states"]:
-                self._cursor_per_partition[self._to_partition_key(state["partition"])] = (
-                    self._create_cursor(
-                        state["cursor"], runtime_lookback_window=self._lookback_window
-                    )
-                )
-                self._semaphore_per_partition[self._to_partition_key(state["partition"])] = (
-                    threading.Semaphore(0)
-                )
-
-            # set default state for missing partitions if it is per partition with fallback to global
-            if "state" in stream_state:
-                self._global_cursor = deepcopy(stream_state["state"])
-                self._new_global_cursor = deepcopy(stream_state["state"])
-
-        # Set parent state for partition routers based on parent streams
-        self._partition_router.set_initial_state(stream_state)
-
-    def observe(self, record: Record) -> None:
-        self._cursor_per_partition[
-            self._to_partition_key(record.associated_slice.partition)
-        ].observe(record)
-
-    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
-        return self._partition_serializer.to_partition_key(partition)
-
-    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
-        return self._partition_serializer.to_partition(partition_key)
-
-    def _create_cursor(self, cursor_state: Any, runtime_lookback_window: Any = None) -> Cursor:
-        if runtime_lookback_window:
-            runtime_lookback_window = timedelta(seconds=runtime_lookback_window)
-        cursor = self._cursor_factory.create(
-            stream_state=deepcopy(cursor_state), runtime_lookback_window=runtime_lookback_window
-        )
-        return cursor
-
-    def should_be_synced(self, record: Record) -> bool:
-        return self._get_cursor(record).should_be_synced(record)
-
-    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
-        if not first.associated_slice or not second.associated_slice:
-            raise ValueError(
-                f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
-            )
-        if first.associated_slice.partition != second.associated_slice.partition:
-            raise ValueError(
-                f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
-            )
-
-        return self._get_cursor(first).is_greater_than_or_equal(
-            self._convert_record_to_cursor_record(first),
-            self._convert_record_to_cursor_record(second),
-        )
-
-    @staticmethod
-    def _convert_record_to_cursor_record(record: Record) -> Record:
-        return Record(
-            record.data,
-            StreamSlice(partition={}, cursor_slice=record.associated_slice.cursor_slice)
-            if record.associated_slice
-            else None,
-        )
-
-    def _get_cursor(self, record: Record) -> Cursor:
-        if not record.associated_slice:
-            raise ValueError(
-                "Invalid state as stream slices that are emitted should refer to an existing cursor"
-            )
-        partition_key = self._to_partition_key(record.associated_slice.partition)
-        if partition_key not in self._cursor_per_partition:
-            raise ValueError(
-                "Invalid state as stream slices that are emitted should refer to an existing cursor"
-            )
-        cursor = self._cursor_per_partition[partition_key]
-        return cursor
{airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/LICENSE.txt: file without changes
{airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/WHEEL: file without changes
{airbyte_cdk-6.17.1.dev1.dist-info → airbyte_cdk-6.18.0.dev1.dist-info}/entry_points.txt: file without changes