airbyte-cdk 6.60.15__py3-none-any.whl → 6.61.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +32 -36
- airbyte_cdk/connector_builder/main.py +3 -3
- airbyte_cdk/connector_builder/test_reader/helpers.py +24 -2
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +1 -1
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +15 -22
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +30 -18
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +73 -3
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +9 -5
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +3 -5
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +72 -39
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +42 -4
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py +2 -2
- airbyte_cdk/sources/message/concurrent_repository.py +47 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +23 -7
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +46 -5
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +7 -1
- airbyte_cdk/sources/streams/http/http_client.py +4 -1
- airbyte_cdk/sources/utils/slice_logger.py +4 -0
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/RECORD +24 -23
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.61.0.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py:

```diff
@@ -94,16 +94,13 @@ from airbyte_cdk.sources.declarative.extractors.record_filter import (
     ClientSideIncrementalRecordFilterDecorator,
 )
 from airbyte_cdk.sources.declarative.incremental import (
-    ChildPartitionResumableFullRefreshCursor,
     ConcurrentCursorFactory,
     ConcurrentPerPartitionCursor,
     CursorFactory,
     DatetimeBasedCursor,
     DeclarativeCursor,
     GlobalSubstreamCursor,
-    PerPartitionCursor,
     PerPartitionWithGlobalCursor,
-    ResumableFullRefreshCursor,
 )
 from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping
@@ -446,10 +443,6 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     ZipfileDecoder as ZipfileDecoderModel,
 )
-from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
-    COMPONENTS_MODULE_NAME,
-    SDM_COMPONENTS_MODULE_NAME,
-)
 from airbyte_cdk.sources.declarative.partition_routers import (
     CartesianProductStreamSlicer,
     GroupingPartitionRouter,
@@ -508,7 +501,7 @@ from airbyte_cdk.sources.declarative.requesters.request_options import (
     RequestOptionsProvider,
 )
 from airbyte_cdk.sources.declarative.requesters.request_path import RequestPath
-from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod
+from airbyte_cdk.sources.declarative.requesters.requester import HttpMethod
 from airbyte_cdk.sources.declarative.resolvers import (
     ComponentMappingDefinition,
     ConfigComponentsResolver,
@@ -617,6 +610,9 @@ from airbyte_cdk.sources.streams.concurrent.cursor import (
 )
 from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
 from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
+from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import (
+    StreamSlicer as ConcurrentStreamSlicer,
+)
 from airbyte_cdk.sources.streams.concurrent.state_converters.datetime_stream_state_converter import (
     CustomFormatConcurrentStreamStateConverter,
     DateTimeStreamStateConverter,
@@ -635,6 +631,10 @@ SCHEMA_TRANSFORMER_TYPE_MAPPING = {
     SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
 }

+# Ideally this should use the value defined in ConcurrentDeclarativeSource, but
+# this would be a circular import
+MAX_SLICES = 5
+

 class ModelToComponentFactory:
     EPOCH_DATETIME_FORMAT = "%s"
@@ -1933,29 +1933,7 @@ class ModelToComponentFactory:
     def create_declarative_stream(
         self, model: DeclarativeStreamModel, config: Config, is_parent: bool = False, **kwargs: Any
     ) -> Union[DeclarativeStream, AbstractStream]:
-        # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field
-        # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the
-        # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in
-        # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one.
-        combined_slicers = self._merge_stream_slicers(model=model, config=config)
-
         primary_key = model.primary_key.__root__ if model.primary_key else None
-        stop_condition_on_cursor = (
-            model.incremental_sync
-            and hasattr(model.incremental_sync, "is_data_feed")
-            and model.incremental_sync.is_data_feed
-        )
-        client_side_filtering_enabled = (
-            model.incremental_sync
-            and hasattr(model.incremental_sync, "is_client_side_incremental")
-            and model.incremental_sync.is_client_side_incremental
-        )
-        concurrent_cursor = None
-        if stop_condition_on_cursor or client_side_filtering_enabled:
-            stream_slicer = self._build_stream_slicer_from_partition_router(
-                model.retriever, config, stream_name=model.name
-            )
-            concurrent_cursor = self._build_concurrent_cursor(model, stream_slicer, config)

         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
             cursor_model = model.incremental_sync
@@ -2023,6 +2001,15 @@ class ModelToComponentFactory:
             model=model.file_uploader, config=config
         )

+        # When constructing a declarative stream, we assemble the incremental_sync component and retriever's partition_router field
+        # components if they exist into a single CartesianProductStreamSlicer. This is then passed back as an argument when constructing the
+        # Retriever. This is done in the declarative stream not the retriever to support custom retrievers. The custom create methods in
+        # the factory only support passing arguments to the component constructors, whereas this performs a merge of all slicers into one.
+        combined_slicers = self._merge_stream_slicers(model=model, config=config)
+        partition_router = self._build_stream_slicer_from_partition_router(
+            model.retriever, config, stream_name=model.name
+        )
+        concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
         retriever = self._create_component_from_model(
             model=model.retriever,
             config=config,
@@ -2030,9 +2017,11 @@ class ModelToComponentFactory:
             primary_key=primary_key,
             stream_slicer=combined_slicers,
             request_options_provider=request_options_provider,
-            stop_condition_cursor=concurrent_cursor
+            stop_condition_cursor=concurrent_cursor
+            if self._is_stop_condition_on_cursor(model)
+            else None,
             client_side_incremental_sync={"cursor": concurrent_cursor}
-            if
+            if self._is_client_side_filtering_enabled(model)
             else None,
             transformations=transformations,
             file_uploader=file_uploader,
@@ -2066,18 +2055,47 @@ class ModelToComponentFactory:
         schema_loader = DefaultSchemaLoader(config=config, parameters=options)

         if (
-            isinstance(combined_slicers, PartitionRouter)
+            (
+                isinstance(combined_slicers, PartitionRouter)
+                or isinstance(concurrent_cursor, ConcurrentCursor)
+            )
             and not self._emit_connector_builder_messages
             and not is_parent
         ):
             # We are starting to migrate streams to instantiate directly the DefaultStream instead of instantiating the
             # DeclarativeStream and assembling the DefaultStream from that. The plan is the following:
             # * Streams without partition router nor cursors and streams with only partition router. This is the `isinstance(combined_slicers, PartitionRouter)` condition as the first kind with have a SinglePartitionRouter
-            # * Streams without partition router but with cursor
+            # * Streams without partition router but with cursor. This is the `isinstance(concurrent_cursor, ConcurrentCursor)` condition
             # * Streams with both partition router and cursor
             # We specifically exclude parent streams here because SubstreamPartitionRouter has not been updated yet
             # We specifically exclude Connector Builder stuff for now as Brian is working on this anyway
+
             stream_name = model.name or ""
+            stream_slicer: ConcurrentStreamSlicer = (
+                concurrent_cursor if concurrent_cursor else SinglePartitionRouter(parameters={})
+            )
+            cursor: Cursor = FinalStateCursor(stream_name, None, self._message_repository)
+            if isinstance(retriever, AsyncRetriever):
+                # The AsyncRetriever only ever worked with a cursor from the concurrent package. Hence, the method
+                # `_build_incremental_cursor` which we would usually think would return only declarative stuff has a
+                # special clause and return a concurrent cursor. This stream slicer is passed to AsyncRetriever when
+                # built because the async retriever has a specific partition router which relies on this stream slicer.
+                # We can't re-use `concurrent_cursor` because it is a different instance than the one passed in
+                # AsyncJobPartitionRouter.
+                stream_slicer = retriever.stream_slicer
+                if isinstance(combined_slicers, Cursor):
+                    cursor = combined_slicers
+            elif isinstance(combined_slicers, PartitionRouter):
+                stream_slicer = combined_slicers
+            elif concurrent_cursor:
+                cursor = concurrent_cursor
+
+            # FIXME to be removed once we migrate everything to DefaultStream
+            if isinstance(retriever, SimpleRetriever):
+                # We zero it out here, but since this is a cursor reference, the state is still properly
+                # instantiated for the other components that reference it
+                retriever.cursor = None
+
             partition_generator = StreamSlicerPartitionGenerator(
                 DeclarativePartitionFactory(
                     stream_name,
@@ -2085,18 +2103,19 @@ class ModelToComponentFactory:
                     retriever,
                     self._message_repository,
                 ),
-                stream_slicer=
+                stream_slicer=stream_slicer,
             )
             return DefaultStream(
                 partition_generator=partition_generator,
                 name=stream_name,
                 json_schema=schema_loader.get_json_schema,
                 primary_key=get_primary_key_from_stream(primary_key),
-                cursor_field=
-
+                cursor_field=cursor.cursor_field.cursor_field_key
+                if hasattr(cursor, "cursor_field")
+                else "",  # FIXME we should have the cursor field has part of the interface of cursor,
                 logger=logging.getLogger(f"airbyte.{stream_name}"),
-                # FIXME this is a breaking change compared to the old implementation
-                cursor=
+                # FIXME this is a breaking change compared to the old implementation which used the source name instead
+                cursor=cursor,
                 supports_file_transfer=hasattr(model, "file_uploader")
                 and bool(model.file_uploader),
             )
@@ -2120,6 +2139,20 @@ class ModelToComponentFactory:
             parameters=model.parameters or {},
         )

+    def _is_stop_condition_on_cursor(self, model: DeclarativeStreamModel) -> bool:
+        return bool(
+            model.incremental_sync
+            and hasattr(model.incremental_sync, "is_data_feed")
+            and model.incremental_sync.is_data_feed
+        )
+
+    def _is_client_side_filtering_enabled(self, model: DeclarativeStreamModel) -> bool:
+        return bool(
+            model.incremental_sync
+            and hasattr(model.incremental_sync, "is_client_side_incremental")
+            and model.incremental_sync.is_client_side_incremental
+        )
+
     def _build_stream_slicer_from_partition_router(
         self,
         model: Union[
```
airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py:

```diff
@@ -1,9 +1,12 @@
-# Copyright (c)
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.

-from typing import Any, Iterable, Mapping, Optional
+from typing import Any, Iterable, Mapping, Optional, cast

 from airbyte_cdk.sources.declarative.retrievers import Retriever
 from airbyte_cdk.sources.declarative.schema import SchemaLoader
+from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer_test_read_decorator import (
+    StreamSlicerTestReadDecorator,
+)
 from airbyte_cdk.sources.message import MessageRepository
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
@@ -11,6 +14,11 @@ from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import Stre
 from airbyte_cdk.sources.types import Record, StreamSlice
 from airbyte_cdk.utils.slice_hasher import SliceHasher

+# For Connector Builder test read operations, we track the total number of records
+# read for the stream at the global level so that we can stop reading early if we
+# exceed the record limit
+total_record_counter = 0
+

 class SchemaLoaderCachingDecorator(SchemaLoader):
     def __init__(self, schema_loader: SchemaLoader):
@@ -31,6 +39,7 @@ class DeclarativePartitionFactory:
         schema_loader: SchemaLoader,
         retriever: Retriever,
         message_repository: MessageRepository,
+        max_records_limit: Optional[int] = None,
     ) -> None:
         """
         The DeclarativePartitionFactory takes a retriever_factory and not a retriever directly. The reason is that our components are not
@@ -41,6 +50,7 @@ class DeclarativePartitionFactory:
         self._schema_loader = SchemaLoaderCachingDecorator(schema_loader)
         self._retriever = retriever
         self._message_repository = message_repository
+        self._max_records_limit = max_records_limit

     def create(self, stream_slice: StreamSlice) -> Partition:
         return DeclarativePartition(
@@ -48,6 +58,7 @@ class DeclarativePartitionFactory:
             schema_loader=self._schema_loader,
             retriever=self._retriever,
             message_repository=self._message_repository,
+            max_records_limit=self._max_records_limit,
             stream_slice=stream_slice,
         )

@@ -59,19 +70,29 @@ class DeclarativePartition(Partition):
         schema_loader: SchemaLoader,
         retriever: Retriever,
         message_repository: MessageRepository,
+        max_records_limit: Optional[int],
         stream_slice: StreamSlice,
     ):
         self._stream_name = stream_name
         self._schema_loader = schema_loader
         self._retriever = retriever
         self._message_repository = message_repository
+        self._max_records_limit = max_records_limit
         self._stream_slice = stream_slice
         self._hash = SliceHasher.hash(self._stream_name, self._stream_slice)

     def read(self) -> Iterable[Record]:
+        if self._max_records_limit is not None:
+            global total_record_counter
+            if total_record_counter >= self._max_records_limit:
+                return
         for stream_data in self._retriever.read_records(
             self._schema_loader.get_json_schema(), self._stream_slice
         ):
+            if self._max_records_limit is not None:
+                if total_record_counter >= self._max_records_limit:
+                    break
+
             if isinstance(stream_data, Mapping):
                 record = (
                     stream_data
@@ -86,6 +107,9 @@ class DeclarativePartition(Partition):
             else:
                 self._message_repository.emit_message(stream_data)

+            if self._max_records_limit is not None:
+                total_record_counter += 1
+
     def to_slice(self) -> Optional[Mapping[str, Any]]:
         return self._stream_slice

@@ -98,10 +122,24 @@ class DeclarativePartition(Partition):

 class StreamSlicerPartitionGenerator(PartitionGenerator):
     def __init__(
-        self,
+        self,
+        partition_factory: DeclarativePartitionFactory,
+        stream_slicer: StreamSlicer,
+        slice_limit: Optional[int] = None,
+        max_records_limit: Optional[int] = None,
     ) -> None:
         self._partition_factory = partition_factory
-
+
+        if slice_limit:
+            self._stream_slicer = cast(
+                StreamSlicer,
+                StreamSlicerTestReadDecorator(
+                    wrapped_slicer=stream_slicer,
+                    maximum_number_of_slices=slice_limit,
+                ),
+            )
+        else:
+            self._stream_slicer = stream_slicer

     def generate(self) -> Iterable[Partition]:
         for stream_slice in self._stream_slicer.stream_slices():
```
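The `read()` changes above give every partition of a stream a shared record budget during Connector Builder test reads: the module-level `total_record_counter` is checked before and during each partition read and incremented once per emitted record. A minimal runnable sketch of the same pattern, with `fake_read_records` and `read_partition` as illustrative stand-ins rather than CDK APIs:

```python
from typing import Iterable, Mapping, Optional

total_record_counter = 0  # shared by every partition of the stream


def fake_read_records() -> Iterable[Mapping[str, int]]:
    # Stand-in for Retriever.read_records().
    yield from ({"id": i} for i in range(100))


def read_partition(max_records_limit: Optional[int]) -> Iterable[Mapping[str, int]]:
    global total_record_counter
    if max_records_limit is not None and total_record_counter >= max_records_limit:
        return  # another partition already used up the budget
    for record in fake_read_records():
        if max_records_limit is not None:
            if total_record_counter >= max_records_limit:
                break
            total_record_counter += 1
        yield record


print(len(list(read_partition(5))))  # 5
print(len(list(read_partition(5))))  # 0 -- the budget is global, not per-partition
```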
airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py:

```diff
@@ -4,10 +4,10 @@

 from dataclasses import dataclass
 from itertools import islice
-from typing import Any, Iterable
+from typing import Any, Iterable

 from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
-from airbyte_cdk.sources.types import StreamSlice
+from airbyte_cdk.sources.types import StreamSlice


 @dataclass
```
airbyte_cdk/sources/message/concurrent_repository.py (new file):

```diff
@@ -0,0 +1,47 @@
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+import logging
+import os
+from queue import Queue
+from typing import Callable, Iterable
+
+from airbyte_cdk.models import AirbyteMessage, Level
+from airbyte_cdk.models import Type as MessageType
+from airbyte_cdk.sources.message.repository import LogMessage, MessageRepository
+from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
+
+logger = logging.getLogger("airbyte")
+
+
+class ConcurrentMessageRepository(MessageRepository):
+    """
+    Message repository that immediately loads messages onto the queue processed on the
+    main thread. This ensures that messages are processed in the correct order they are
+    received. The InMemoryMessageRepository implementation does not have guaranteed
+    ordering since whether to process the main thread vs. partitions is non-deterministic
+    and there can be a lag between reading the main-thread and consuming messages on the
+    MessageRepository.
+
+    This is particularly important for the connector builder which relies on grouping
+    of messages to organize request/response, pages, and partitions.
+    """
+
+    def __init__(self, queue: Queue[QueueItem], message_repository: MessageRepository):
+        self._queue = queue
+        self._decorated_message_repository = message_repository
+
+    def emit_message(self, message: AirbyteMessage) -> None:
+        self._decorated_message_repository.emit_message(message)
+        for message in self._decorated_message_repository.consume_queue():
+            self._queue.put(message)
+
+    def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None:
+        self._decorated_message_repository.log_message(level, message_provider)
+        for message in self._decorated_message_repository.consume_queue():
+            self._queue.put(message)
+
+    def consume_queue(self) -> Iterable[AirbyteMessage]:
+        """
+        This method shouldn't need to be called because as part of emit_message() we are already
+        loading messages onto the queue processed on the main thread.
+        """
+        yield from []
```
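The new `ConcurrentMessageRepository` wraps another repository and drains it onto the main-thread queue on every emit, so message order matches emission order. A simplified sketch of the idea, with stand-in types (`BufferedRepo`, `MainThreadQueueRepo`, and plain strings are illustrative, not CDK classes):

```python
from queue import Queue
from typing import Iterable, List


class BufferedRepo:
    # Stand-in for InMemoryMessageRepository: emit buffers, consume drains.
    def __init__(self) -> None:
        self._buffer: List[str] = []

    def emit_message(self, message: str) -> None:
        self._buffer.append(message)

    def consume_queue(self) -> Iterable[str]:
        while self._buffer:
            yield self._buffer.pop(0)


class MainThreadQueueRepo:
    def __init__(self, queue: "Queue[str]", decorated: BufferedRepo) -> None:
        self._queue = queue
        self._decorated = decorated

    def emit_message(self, message: str) -> None:
        # Forward to the wrapped repository, then drain it onto the queue the
        # main thread reads, so messages land in emission order instead of
        # whenever the main thread next polls the repository.
        self._decorated.emit_message(message)
        for msg in self._decorated.consume_queue():
            self._queue.put(msg)


q: "Queue[str]" = Queue()
repo = MainThreadQueueRepo(q, BufferedRepo())
repo.emit_message("request/response for page 1")
repo.emit_message("request/response for page 2")
assert q.get_nowait() == "request/response for page 1"
```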
airbyte_cdk/sources/streams/concurrent/cursor.py:

```diff
@@ -4,6 +4,7 @@

 import functools
 import logging
+import threading
 from abc import ABC, abstractmethod
 from typing import (
     Any,
@@ -174,6 +175,12 @@ class ConcurrentCursor(Cursor):
         self._should_be_synced_logger_triggered = False
         self._clamping_strategy = clamping_strategy

+        # A lock is required when closing a partition because updating the cursor's concurrent_state is
+        # not thread safe. When multiple partitions are being closed by the cursor at the same time, it is
+        # possible for one partition to update concurrent_state after a second partition has already read
+        # the previous state. This can lead to the second partition overwriting the previous one's state.
+        self._lock = threading.Lock()
+
     @property
     def state(self) -> MutableMapping[str, Any]:
         return self._connector_state_converter.convert_to_state_message(
@@ -222,6 +229,14 @@ class ConcurrentCursor(Cursor):
         )

     def observe(self, record: Record) -> None:
+        # Because observe writes to the most_recent_cursor_value_per_partition mapping,
+        # it is not thread-safe. However, this shouldn't lead to concurrency issues because
+        # observe() is only invoked by PartitionReader.process_partition(). Since the map is
+        # broken down according to partition, concurrent threads processing only read/write
+        # from different keys which avoids any conflicts.
+        #
+        # If we were to add thread safety, we should implement a lock per-partition
+        # which is instantiated during stream_slices()
         most_recent_cursor_value = self._most_recent_cursor_value_per_partition.get(
             record.associated_slice
         )
@@ -237,13 +252,14 @@ class ConcurrentCursor(Cursor):
         return self._connector_state_converter.parse_value(self._cursor_field.extract_value(record))

     def close_partition(self, partition: Partition) -> None:
-        slice_count_before = len(self._concurrent_state.get("slices", []))
-        self._add_slice_to_state(partition)
-        if slice_count_before < len(
-            self._concurrent_state["slices"]
-        ):  # only emit if at least one slice has been processed
-            self._merge_partitions()
-            self._emit_state_message()
+        with self._lock:
+            slice_count_before = len(self._concurrent_state.get("slices", []))
+            self._add_slice_to_state(partition)
+            if slice_count_before < len(
+                self._concurrent_state["slices"]
+            ):  # only emit if at least one slice has been processed
+                self._merge_partitions()
+                self._emit_state_message()
         self._has_closed_at_least_one_slice = True

     def _add_slice_to_state(self, partition: Partition) -> None:
```
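The lock added above guards a classic read-modify-write race: two partitions closing concurrently could both read the slice count, and the later writer could then clobber the merge the earlier one performed. A minimal demonstration of the guarded pattern, with a bare dict standing in for the cursor's `concurrent_state`:

```python
import threading
from typing import Any, Dict, List

state: Dict[str, List[Any]] = {"slices": []}
lock = threading.Lock()


def close_partition(slice_: Dict[str, int]) -> None:
    # Without the lock, two threads can interleave between reading
    # len(state["slices"]) and writing, so a later writer can overwrite
    # the merge an earlier writer just performed.
    with lock:
        slice_count_before = len(state["slices"])
        state["slices"].append(slice_)
        if slice_count_before < len(state["slices"]):
            pass  # merge adjacent slices and emit a state message here


threads = [
    threading.Thread(target=close_partition, args=({"start": i, "end": i + 1},))
    for i in range(8)
]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
assert len(state["slices"]) == 8
```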
airbyte_cdk/sources/streams/concurrent/partition_reader.py:

```diff
@@ -1,14 +1,45 @@
-#
-
-
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+
+import logging
 from queue import Queue
+from typing import Optional

 from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException
+from airbyte_cdk.sources.message.repository import MessageRepository
+from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.types import (
     PartitionCompleteSentinel,
     QueueItem,
 )
+from airbyte_cdk.sources.utils.slice_logger import SliceLogger
+
+
+# Since moving all the connector builder workflow to the concurrent CDK which required correct ordering
+# of grouping log messages onto the main write thread using the ConcurrentMessageRepository, this
+# separate flow and class that was used to log slices onto this partition's message_repository
+# should just be replaced by emitting messages directly onto the repository instead of an intermediary.
+class PartitionLogger:
+    """
+    Helper class that provides a mechanism for passing a log message onto the current
+    partitions message repository
+    """
+
+    def __init__(
+        self,
+        slice_logger: SliceLogger,
+        logger: logging.Logger,
+        message_repository: MessageRepository,
+    ):
+        self._slice_logger = slice_logger
+        self._logger = logger
+        self._message_repository = message_repository
+
+    def log(self, partition: Partition) -> None:
+        if self._slice_logger.should_log_slice_message(self._logger):
+            self._message_repository.emit_message(
+                self._slice_logger.create_slice_log_message(partition.to_slice())
+            )


 class PartitionReader:
@@ -18,13 +49,18 @@ class PartitionReader:

     _IS_SUCCESSFUL = True

-    def __init__(
+    def __init__(
+        self,
+        queue: Queue[QueueItem],
+        partition_logger: Optional[PartitionLogger] = None,
+    ) -> None:
         """
         :param queue: The queue to put the records in.
         """
         self._queue = queue
+        self._partition_logger = partition_logger

-    def process_partition(self, partition: Partition) -> None:
+    def process_partition(self, partition: Partition, cursor: Cursor) -> None:
         """
         Process a partition and put the records in the output queue.
         When all the partitions are added to the queue, a sentinel is added to the queue to indicate that all the partitions have been generated.
@@ -37,8 +73,13 @@ class PartitionReader:
         :return: None
         """
         try:
+            if self._partition_logger:
+                self._partition_logger.log(partition)
+
             for record in partition.read():
                 self._queue.put(record)
+                cursor.observe(record)
+            cursor.close_partition(partition)
             self._queue.put(PartitionCompleteSentinel(partition, self._IS_SUCCESSFUL))
         except Exception as e:
             self._queue.put(StreamThreadException(e, partition.stream_name()))
```
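With these changes, cursor bookkeeping moves onto the partition worker: each record is observed as it is queued, and the partition is closed on the cursor before the completion sentinel is emitted. A toy sketch of that control flow (all classes here are simplified stand-ins, not CDK types):

```python
from queue import Queue
from typing import Any, Dict, Iterable, List


class ToyPartition:
    def __init__(self, records: List[Dict[str, int]]) -> None:
        self._records = records

    def read(self) -> Iterable[Dict[str, int]]:
        yield from self._records


class ToyCursor:
    def __init__(self) -> None:
        self.observed: List[Dict[str, int]] = []
        self.closed = False

    def observe(self, record: Dict[str, int]) -> None:
        self.observed.append(record)

    def close_partition(self, partition: ToyPartition) -> None:
        # Only called after every record has been observed.
        self.closed = True


def process_partition(queue: "Queue[Any]", partition: ToyPartition, cursor: ToyCursor) -> None:
    try:
        for record in partition.read():
            queue.put(record)
            cursor.observe(record)  # cursor state advances on the worker thread
        cursor.close_partition(partition)  # slice committed before the sentinel
        queue.put("PARTITION_COMPLETE")
    except Exception as exception:
        queue.put(exception)


q: "Queue[Any]" = Queue()
cursor = ToyCursor()
process_partition(q, ToyPartition([{"id": 1}, {"id": 2}]), cursor)
assert cursor.closed and len(cursor.observed) == 2
```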
airbyte_cdk/sources/streams/concurrent/partitions/types.py:

```diff
@@ -4,6 +4,7 @@

 from typing import Any, Union

+from airbyte_cdk.models import AirbyteMessage
 from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import (
     PartitionGenerationCompletedSentinel,
 )
@@ -34,5 +35,10 @@ class PartitionCompleteSentinel:
 Typedef representing the items that can be added to the ThreadBasedConcurrentStream
 """
 QueueItem = Union[
-    Record,
+    Record,
+    Partition,
+    PartitionCompleteSentinel,
+    PartitionGenerationCompletedSentinel,
+    Exception,
+    AirbyteMessage,
 ]
```
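The widened `QueueItem` union reflects everything the main read loop may now pull off the queue, including pre-built `AirbyteMessage`s routed through `ConcurrentMessageRepository`. A sketch of how a consumer might dispatch on it; the stub classes shadow CDK names purely for illustration, and the handler strings are hypothetical:

```python
from typing import Union


class Record: ...
class Partition: ...
class PartitionCompleteSentinel: ...
class PartitionGenerationCompletedSentinel: ...
class AirbyteMessage: ...


QueueItem = Union[
    Record,
    Partition,
    PartitionCompleteSentinel,
    PartitionGenerationCompletedSentinel,
    Exception,
    AirbyteMessage,
]


def handle(item: QueueItem) -> str:
    # The main thread branches on the concrete type of each queue item.
    if isinstance(item, Record):
        return "emit the record"
    if isinstance(item, AirbyteMessage):
        return "forward the pre-built message (e.g. a slice log) as-is"
    if isinstance(item, PartitionCompleteSentinel):
        return "mark the partition as done"
    if isinstance(item, PartitionGenerationCompletedSentinel):
        return "stop waiting for new partitions on this stream"
    if isinstance(item, Exception):
        return "record the stream failure"
    return "schedule the partition for reading"
```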
airbyte_cdk/sources/streams/http/http_client.py:

```diff
@@ -153,7 +153,10 @@ class HttpClient:
             # * `If the application running SQLite crashes, the data will be safe, but the database [might become corrupted](https://www.sqlite.org/howtocorrupt.html#cfgerr) if the operating system crashes or the computer loses power before that data has been written to the disk surface.` in [this description](https://www.sqlite.org/pragma.html#pragma_synchronous).
             backend = requests_cache.SQLiteCache(sqlite_path, fast_save=True, wal=True)
             return CachedLimiterSession(
-                sqlite_path,
+                cache_name=sqlite_path,
+                backend=backend,
+                api_budget=self._api_budget,
+                match_headers=True,
             )
         else:
             return LimiterSession(api_budget=self._api_budget)
```
airbyte_cdk/sources/utils/slice_logger.py:

```diff
@@ -11,6 +11,10 @@ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
 from airbyte_cdk.models import Type as MessageType


+# Once everything runs on the concurrent CDK and we've cleaned up the legacy flows, we should try to remove
+# this class and write messages directly to the message_repository instead of through the logger because for
+# cases like the connector builder where ordering of messages is important, using the logger can cause
+# messages to be grouped out of order. Alas work for a different day.
 class SliceLogger(ABC):
     """
     SliceLogger is an interface that allows us to log slices of data in a uniform way.
```