airbyte-cdk 0.61.2__py3-none-any.whl → 0.62.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/abstract_source.py +14 -33
- airbyte_cdk/sources/connector_state_manager.py +16 -4
- airbyte_cdk/sources/file_based/file_based_source.py +87 -35
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +15 -13
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -0
- airbyte_cdk/sources/file_based/stream/concurrent/{cursor.py → cursor/abstract_concurrent_file_based_cursor.py} +22 -44
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +279 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_noop_cursor.py +56 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +11 -2
- airbyte_cdk/test/mock_http/mocker.py +3 -1
- airbyte_cdk/test/mock_http/response.py +9 -1
- airbyte_cdk/utils/traced_exception.py +1 -16
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/RECORD +33 -26
- unit_tests/sources/file_based/helpers.py +5 -0
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +2860 -0
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +11 -0
- unit_tests/sources/file_based/scenarios/scenario_builder.py +6 -2
- unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +365 -0
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +462 -0
- unit_tests/sources/file_based/test_file_based_scenarios.py +45 -0
- unit_tests/sources/file_based/test_scenarios.py +16 -8
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +13 -2
- unit_tests/sources/test_abstract_source.py +36 -170
- unit_tests/sources/test_connector_state_manager.py +20 -13
- unit_tests/sources/test_integration_source.py +8 -25
- unit_tests/sources/test_source_read.py +1 -1
- unit_tests/test/mock_http/test_mocker.py +3 -1
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/top_level.txt +0 -0
@@ -15,7 +15,6 @@ from airbyte_cdk.models import (
|
|
15
15
|
ConfiguredAirbyteCatalog,
|
16
16
|
ConfiguredAirbyteStream,
|
17
17
|
Status,
|
18
|
-
StreamDescriptor,
|
19
18
|
SyncMode,
|
20
19
|
)
|
21
20
|
from airbyte_cdk.models import Type as MessageType
|
@@ -28,7 +27,6 @@ from airbyte_cdk.sources.streams.http.http import HttpStream
|
|
28
27
|
from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
|
29
28
|
from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, split_config
|
30
29
|
from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
|
31
|
-
from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
|
32
30
|
from airbyte_cdk.utils.event_timing import create_timer
|
33
31
|
from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message
|
34
32
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
@@ -101,7 +99,7 @@ class AbstractSource(Source, ABC):
|
|
101
99
|
# TODO assert all streams exist in the connector
|
102
100
|
# get the streams once in case the connector needs to make any queries to generate them
|
103
101
|
stream_instances = {s.name: s for s in self.streams(config)}
|
104
|
-
state_manager = ConnectorStateManager(stream_instance_map=
|
102
|
+
state_manager = ConnectorStateManager(stream_instance_map={s.stream.name: s.stream for s in catalog.streams}, state=state)
|
105
103
|
self._stream_to_instance_map = stream_instances
|
106
104
|
|
107
105
|
stream_name_to_exception: MutableMapping[str, AirbyteTracedException] = {}
|
@@ -135,16 +133,11 @@ class AbstractSource(Source, ABC):
|
|
135
133
|
logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
|
136
134
|
yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.COMPLETE)
|
137
135
|
except AirbyteTracedException as e:
|
138
|
-
logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
|
139
|
-
logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
|
140
136
|
yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE)
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
f"Stopping sync on error from stream {configured_stream.stream.name} because {self.name} does not support continuing syncs on error."
|
146
|
-
)
|
147
|
-
break
|
137
|
+
if self.continue_sync_on_stream_failure:
|
138
|
+
stream_name_to_exception[stream_instance.name] = e
|
139
|
+
else:
|
140
|
+
raise e
|
148
141
|
except Exception as e:
|
149
142
|
yield from self._emit_queued_messages()
|
150
143
|
logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
|
@@ -152,27 +145,15 @@ class AbstractSource(Source, ABC):
|
|
152
145
|
yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE)
|
153
146
|
display_message = stream_instance.get_error_display_message(e)
|
154
147
|
if display_message:
|
155
|
-
|
156
|
-
|
157
|
-
traced_exception = AirbyteTracedException.from_exception(e)
|
158
|
-
yield traced_exception.as_sanitized_airbyte_message(
|
159
|
-
stream_descriptor=StreamDescriptor(name=configured_stream.stream.name)
|
160
|
-
)
|
161
|
-
stream_name_to_exception[stream_instance.name] = traced_exception
|
162
|
-
if self.stop_sync_on_stream_failure:
|
163
|
-
logger.info(f"{self.name} does not support continuing syncs on error from stream {configured_stream.stream.name}")
|
164
|
-
break
|
148
|
+
raise AirbyteTracedException.from_exception(e, message=display_message) from e
|
149
|
+
raise e
|
165
150
|
finally:
|
166
151
|
timer.finish_event()
|
167
152
|
logger.info(f"Finished syncing {configured_stream.stream.name}")
|
168
153
|
logger.info(timer.report())
|
169
154
|
|
170
|
-
if len(stream_name_to_exception) > 0:
|
171
|
-
|
172
|
-
logger.info(error_message)
|
173
|
-
# We still raise at least one exception when a stream raises an exception because the platform
|
174
|
-
# currently relies on a non-zero exit code to determine if a sync attempt has failed
|
175
|
-
raise AirbyteTracedException(message=error_message)
|
155
|
+
if self.continue_sync_on_stream_failure and len(stream_name_to_exception) > 0:
|
156
|
+
raise AirbyteTracedException(message=self._generate_failed_streams_error_message(stream_name_to_exception))
|
176
157
|
logger.info(f"Finished syncing {self.name}")
|
177
158
|
|
178
159
|
@property
|
@@ -301,17 +282,17 @@ class AbstractSource(Source, ABC):
|
|
301
282
|
return _default_message_repository
|
302
283
|
|
303
284
|
@property
|
304
|
-
def
|
285
|
+
def continue_sync_on_stream_failure(self) -> bool:
|
305
286
|
"""
|
306
287
|
WARNING: This function is in-development which means it is subject to change. Use at your own risk.
|
307
288
|
|
308
|
-
By default,
|
309
|
-
|
310
|
-
|
289
|
+
By default, a source should raise an exception and stop the sync when it encounters an error while syncing a stream. This
|
290
|
+
method can be overridden on a per-source basis so that a source will continue syncing other streams even if an
|
291
|
+
exception is raised for a stream.
|
311
292
|
"""
|
312
293
|
return False
|
313
294
|
|
314
295
|
@staticmethod
|
315
296
|
def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
|
316
|
-
failures = ", ".join([f"{stream}: {
|
297
|
+
failures = ", ".join([f"{stream}: {exception.__repr__()}" for stream, exception in stream_failures.items()])
|
317
298
|
return f"During the sync, the following streams did not sync successfully: {failures}"
|
@@ -5,7 +5,15 @@
|
|
5
5
|
import copy
|
6
6
|
from typing import Any, List, Mapping, MutableMapping, Optional, Tuple, Union
|
7
7
|
|
8
|
-
from airbyte_cdk.models import
|
8
|
+
from airbyte_cdk.models import (
|
9
|
+
AirbyteMessage,
|
10
|
+
AirbyteStateBlob,
|
11
|
+
AirbyteStateMessage,
|
12
|
+
AirbyteStateType,
|
13
|
+
AirbyteStream,
|
14
|
+
AirbyteStreamState,
|
15
|
+
StreamDescriptor,
|
16
|
+
)
|
9
17
|
from airbyte_cdk.models import Type as MessageType
|
10
18
|
from airbyte_cdk.sources.streams import Stream
|
11
19
|
from pydantic import Extra
|
@@ -29,7 +37,9 @@ class ConnectorStateManager:
|
|
29
37
|
"""
|
30
38
|
|
31
39
|
def __init__(
|
32
|
-
self,
|
40
|
+
self,
|
41
|
+
stream_instance_map: Mapping[str, Union[Stream, AirbyteStream]],
|
42
|
+
state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
|
33
43
|
):
|
34
44
|
shared_state, per_stream_states = self._extract_from_state_message(state, stream_instance_map)
|
35
45
|
|
@@ -97,7 +107,9 @@ class ConnectorStateManager:
|
|
97
107
|
|
98
108
|
@classmethod
|
99
109
|
def _extract_from_state_message(
|
100
|
-
cls,
|
110
|
+
cls,
|
111
|
+
state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]],
|
112
|
+
stream_instance_map: Mapping[str, Union[Stream, AirbyteStream]],
|
101
113
|
) -> Tuple[Optional[AirbyteStateBlob], MutableMapping[HashableStreamDescriptor, Optional[AirbyteStateBlob]]]:
|
102
114
|
"""
|
103
115
|
Takes an incoming list of state messages or the legacy state format and extracts state attributes according to type
|
@@ -149,7 +161,7 @@ class ConnectorStateManager:
|
|
149
161
|
|
150
162
|
@staticmethod
|
151
163
|
def _create_descriptor_to_stream_state_mapping(
|
152
|
-
state: MutableMapping[str, Any], stream_to_instance_map: Mapping[str, Stream]
|
164
|
+
state: MutableMapping[str, Any], stream_to_instance_map: Mapping[str, Union[Stream, AirbyteStream]]
|
153
165
|
) -> MutableMapping[HashableStreamDescriptor, Optional[AirbyteStateBlob]]:
|
154
166
|
"""
|
155
167
|
Takes incoming state received in the legacy format and transforms it into a mapping of StreamDescriptor to AirbyteStreamState
|
@@ -12,6 +12,7 @@ from airbyte_cdk.logger import AirbyteLogFormatter, init_logger
|
|
12
12
|
from airbyte_cdk.models import (
|
13
13
|
AirbyteMessage,
|
14
14
|
AirbyteStateMessage,
|
15
|
+
AirbyteStream,
|
15
16
|
ConfiguredAirbyteCatalog,
|
16
17
|
ConnectorSpecification,
|
17
18
|
FailureType,
|
@@ -20,6 +21,7 @@ from airbyte_cdk.models import (
|
|
20
21
|
)
|
21
22
|
from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
|
22
23
|
from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
|
24
|
+
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
23
25
|
from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy, DefaultFileBasedAvailabilityStrategy
|
24
26
|
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
|
25
27
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
|
@@ -31,12 +33,15 @@ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeP
|
|
31
33
|
from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
|
32
34
|
from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
|
33
35
|
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
|
34
|
-
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import
|
36
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
|
37
|
+
AbstractConcurrentFileBasedCursor,
|
38
|
+
FileBasedConcurrentCursor,
|
39
|
+
FileBasedNoopCursor,
|
40
|
+
)
|
35
41
|
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
36
|
-
from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
|
37
42
|
from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository
|
38
|
-
from airbyte_cdk.sources.source import TState
|
39
43
|
from airbyte_cdk.sources.streams import Stream
|
44
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
|
40
45
|
from airbyte_cdk.utils.analytics_message import create_analytics_message
|
41
46
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
42
47
|
from pydantic.error_wrappers import ValidationError
|
@@ -56,12 +61,12 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
56
61
|
spec_class: Type[AbstractFileBasedSpec],
|
57
62
|
catalog: Optional[ConfiguredAirbyteCatalog],
|
58
63
|
config: Optional[Mapping[str, Any]],
|
59
|
-
state: Optional[
|
64
|
+
state: Optional[MutableMapping[str, Any]],
|
60
65
|
availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None,
|
61
66
|
discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
|
62
67
|
parsers: Mapping[Type[Any], FileTypeParser] = default_parsers,
|
63
68
|
validation_policies: Mapping[ValidationPolicy, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
|
64
|
-
cursor_cls: Type[AbstractFileBasedCursor] =
|
69
|
+
cursor_cls: Type[Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]] = FileBasedConcurrentCursor,
|
65
70
|
):
|
66
71
|
self.stream_reader = stream_reader
|
67
72
|
self.spec_class = spec_class
|
@@ -137,52 +142,99 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
137
142
|
"""
|
138
143
|
Return a list of this source's streams.
|
139
144
|
"""
|
140
|
-
file_based_streams = self._get_file_based_streams(config)
|
141
|
-
|
142
|
-
configured_streams: List[Stream] = []
|
143
|
-
|
144
|
-
for stream in file_based_streams:
|
145
|
-
sync_mode = self._get_sync_mode_from_catalog(stream)
|
146
|
-
if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
|
147
|
-
configured_streams.append(
|
148
|
-
FileBasedStreamFacade.create_from_stream(stream, self, self.logger, None, FileBasedNoopCursor(stream.config))
|
149
|
-
)
|
150
|
-
else:
|
151
|
-
configured_streams.append(stream)
|
152
145
|
|
153
|
-
|
146
|
+
if self.catalog:
|
147
|
+
state_manager = ConnectorStateManager(
|
148
|
+
stream_instance_map={s.stream.name: s.stream for s in self.catalog.streams},
|
149
|
+
state=self.state,
|
150
|
+
)
|
151
|
+
else:
|
152
|
+
# During `check` operations we don't have a catalog so cannot create a state manager.
|
153
|
+
# Since the state manager is only required for incremental syncs, this is fine.
|
154
|
+
state_manager = None
|
154
155
|
|
155
|
-
def _get_file_based_streams(self, config: Mapping[str, Any]) -> List[AbstractFileBasedStream]:
|
156
156
|
try:
|
157
157
|
parsed_config = self._get_parsed_config(config)
|
158
158
|
self.stream_reader.config = parsed_config
|
159
|
-
streams: List[
|
159
|
+
streams: List[Stream] = []
|
160
160
|
for stream_config in parsed_config.streams:
|
161
|
+
# Like state_manager, `catalog_stream` may be None during `check`
|
162
|
+
catalog_stream = self._get_stream_from_catalog(stream_config)
|
163
|
+
stream_state = (
|
164
|
+
state_manager.get_stream_state(catalog_stream.name, catalog_stream.namespace)
|
165
|
+
if (state_manager and catalog_stream)
|
166
|
+
else None
|
167
|
+
)
|
161
168
|
self._validate_input_schema(stream_config)
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
parsers=self.parsers,
|
170
|
-
validation_policy=self._validate_and_get_validation_policy(stream_config),
|
171
|
-
cursor=self.cursor_cls(stream_config),
|
172
|
-
errors_collector=self.errors_collector,
|
169
|
+
|
170
|
+
sync_mode = self._get_sync_mode_from_catalog(stream_config.name)
|
171
|
+
|
172
|
+
if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
|
173
|
+
cursor = FileBasedNoopCursor(stream_config)
|
174
|
+
stream = FileBasedStreamFacade.create_from_stream(
|
175
|
+
self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
|
173
176
|
)
|
174
|
-
|
177
|
+
|
178
|
+
elif (
|
179
|
+
sync_mode == SyncMode.incremental
|
180
|
+
and issubclass(self.cursor_cls, AbstractConcurrentFileBasedCursor)
|
181
|
+
and hasattr(self, "_concurrency_level")
|
182
|
+
and self._concurrency_level is not None
|
183
|
+
):
|
184
|
+
assert (
|
185
|
+
state_manager is not None
|
186
|
+
), "No ConnectorStateManager was created, but it is required for incremental syncs. This is unexpected. Please contact Support."
|
187
|
+
|
188
|
+
cursor = self.cursor_cls(
|
189
|
+
stream_config,
|
190
|
+
stream_config.name,
|
191
|
+
None,
|
192
|
+
stream_state,
|
193
|
+
self.message_repository,
|
194
|
+
state_manager,
|
195
|
+
CursorField(DefaultFileBasedStream.ab_last_mod_col),
|
196
|
+
)
|
197
|
+
stream = FileBasedStreamFacade.create_from_stream(
|
198
|
+
self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
|
199
|
+
)
|
200
|
+
else:
|
201
|
+
cursor = self.cursor_cls(stream_config)
|
202
|
+
stream = self._make_default_stream(stream_config, cursor)
|
203
|
+
|
204
|
+
streams.append(stream)
|
175
205
|
return streams
|
176
206
|
|
177
207
|
except ValidationError as exc:
|
178
208
|
raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc
|
179
209
|
|
180
|
-
def
|
210
|
+
def _make_default_stream(
|
211
|
+
self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor]
|
212
|
+
) -> AbstractFileBasedStream:
|
213
|
+
return DefaultFileBasedStream(
|
214
|
+
config=stream_config,
|
215
|
+
catalog_schema=self.stream_schemas.get(stream_config.name),
|
216
|
+
stream_reader=self.stream_reader,
|
217
|
+
availability_strategy=self.availability_strategy,
|
218
|
+
discovery_policy=self.discovery_policy,
|
219
|
+
parsers=self.parsers,
|
220
|
+
validation_policy=self._validate_and_get_validation_policy(stream_config),
|
221
|
+
errors_collector=self.errors_collector,
|
222
|
+
cursor=cursor,
|
223
|
+
)
|
224
|
+
|
225
|
+
def _get_stream_from_catalog(self, stream_config: FileBasedStreamConfig) -> Optional[AirbyteStream]:
|
226
|
+
if self.catalog:
|
227
|
+
for stream in self.catalog.streams or []:
|
228
|
+
if stream.stream.name == stream_config.name:
|
229
|
+
return stream.stream
|
230
|
+
return None
|
231
|
+
|
232
|
+
def _get_sync_mode_from_catalog(self, stream_name: str) -> Optional[SyncMode]:
|
181
233
|
if self.catalog:
|
182
234
|
for catalog_stream in self.catalog.streams:
|
183
|
-
if
|
235
|
+
if stream_name == catalog_stream.stream.name:
|
184
236
|
return catalog_stream.sync_mode
|
185
|
-
self.logger.warning(f"No sync mode was found for {
|
237
|
+
self.logger.warning(f"No sync mode was found for {stream_name}.")
|
186
238
|
return None
|
187
239
|
|
188
240
|
def read(
|
@@ -15,6 +15,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
|
|
15
15
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
16
16
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
17
17
|
from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
|
18
|
+
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
18
19
|
from airbyte_cdk.sources.file_based.types import StreamSlice
|
19
20
|
from airbyte_cdk.sources.streams import Stream
|
20
21
|
|
@@ -45,6 +46,7 @@ class AbstractFileBasedStream(Stream):
|
|
45
46
|
parsers: Dict[Type[Any], FileTypeParser],
|
46
47
|
validation_policy: AbstractSchemaValidationPolicy,
|
47
48
|
errors_collector: FileBasedErrorsCollector,
|
49
|
+
cursor: AbstractFileBasedCursor,
|
48
50
|
):
|
49
51
|
super().__init__()
|
50
52
|
self.config = config
|
@@ -55,6 +57,7 @@ class AbstractFileBasedStream(Stream):
|
|
55
57
|
self._availability_strategy = availability_strategy
|
56
58
|
self._parsers = parsers
|
57
59
|
self.errors_collector = errors_collector
|
60
|
+
self._cursor = cursor
|
58
61
|
|
59
62
|
@property
|
60
63
|
@abstractmethod
|
@@ -5,7 +5,7 @@
|
|
5
5
|
import copy
|
6
6
|
import logging
|
7
7
|
from functools import lru_cache
|
8
|
-
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
|
8
|
+
from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union
|
9
9
|
|
10
10
|
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode, Type
|
11
11
|
from airbyte_cdk.sources import AbstractSource
|
@@ -19,6 +19,7 @@ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeP
|
|
19
19
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
20
20
|
from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
|
21
21
|
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedNoopCursor
|
22
|
+
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
22
23
|
from airbyte_cdk.sources.file_based.types import StreamSlice
|
23
24
|
from airbyte_cdk.sources.message import MessageRepository
|
24
25
|
from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
|
@@ -33,6 +34,9 @@ from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
|
|
33
34
|
from airbyte_cdk.sources.utils.slice_logger import SliceLogger
|
34
35
|
from deprecated.classic import deprecated
|
35
36
|
|
37
|
+
if TYPE_CHECKING:
|
38
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
|
39
|
+
|
36
40
|
"""
|
37
41
|
This module contains adapters to help enabling concurrency on File-based Stream objects without needing to migrate to AbstractStream
|
38
42
|
"""
|
@@ -47,13 +51,14 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
|
|
47
51
|
source: AbstractSource,
|
48
52
|
logger: logging.Logger,
|
49
53
|
state: Optional[MutableMapping[str, Any]],
|
50
|
-
cursor:
|
54
|
+
cursor: "AbstractConcurrentFileBasedCursor",
|
51
55
|
) -> "FileBasedStreamFacade":
|
52
56
|
"""
|
53
57
|
Create a ConcurrentStream from a FileBasedStream object.
|
54
58
|
"""
|
55
59
|
pk = get_primary_key_from_stream(stream.primary_key)
|
56
60
|
cursor_field = get_cursor_field_from_stream(stream)
|
61
|
+
stream._cursor = cursor
|
57
62
|
|
58
63
|
if not source.message_repository:
|
59
64
|
raise ValueError(
|
@@ -62,7 +67,7 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
|
|
62
67
|
|
63
68
|
message_repository = source.message_repository
|
64
69
|
return FileBasedStreamFacade(
|
65
|
-
DefaultStream(
|
70
|
+
DefaultStream(
|
66
71
|
partition_generator=FileBasedStreamPartitionGenerator(
|
67
72
|
stream,
|
68
73
|
message_repository,
|
@@ -90,14 +95,13 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
|
|
90
95
|
self,
|
91
96
|
stream: DefaultStream,
|
92
97
|
legacy_stream: AbstractFileBasedStream,
|
93
|
-
cursor:
|
98
|
+
cursor: AbstractFileBasedCursor,
|
94
99
|
slice_logger: SliceLogger,
|
95
100
|
logger: logging.Logger,
|
96
101
|
):
|
97
102
|
"""
|
98
103
|
:param stream: The underlying AbstractStream
|
99
104
|
"""
|
100
|
-
# super().__init__(stream, legacy_stream, cursor, slice_logger, logger)
|
101
105
|
self._abstract_stream = stream
|
102
106
|
self._legacy_stream = legacy_stream
|
103
107
|
self._cursor = cursor
|
@@ -216,7 +220,7 @@ class FileBasedStreamPartition(Partition):
|
|
216
220
|
sync_mode: SyncMode,
|
217
221
|
cursor_field: Optional[List[str]],
|
218
222
|
state: Optional[MutableMapping[str, Any]],
|
219
|
-
cursor:
|
223
|
+
cursor: "AbstractConcurrentFileBasedCursor",
|
220
224
|
):
|
221
225
|
self._stream = stream
|
222
226
|
self._slice = _slice
|
@@ -292,7 +296,7 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
|
|
292
296
|
sync_mode: SyncMode,
|
293
297
|
cursor_field: Optional[List[str]],
|
294
298
|
state: Optional[MutableMapping[str, Any]],
|
295
|
-
cursor:
|
299
|
+
cursor: "AbstractConcurrentFileBasedCursor",
|
296
300
|
):
|
297
301
|
self._stream = stream
|
298
302
|
self._message_repository = message_repository
|
@@ -305,19 +309,17 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
|
|
305
309
|
pending_partitions = []
|
306
310
|
for _slice in self._stream.stream_slices(sync_mode=self._sync_mode, cursor_field=self._cursor_field, stream_state=self._state):
|
307
311
|
if _slice is not None:
|
308
|
-
|
309
|
-
|
312
|
+
for file in _slice.get("files", []):
|
313
|
+
pending_partitions.append(
|
310
314
|
FileBasedStreamPartition(
|
311
315
|
self._stream,
|
312
|
-
{"files": [copy.deepcopy(
|
316
|
+
{"files": [copy.deepcopy(file)]},
|
313
317
|
self._message_repository,
|
314
318
|
self._sync_mode,
|
315
319
|
self._cursor_field,
|
316
320
|
self._state,
|
317
321
|
self._cursor,
|
318
322
|
)
|
319
|
-
|
320
|
-
]
|
321
|
-
)
|
323
|
+
)
|
322
324
|
self._cursor.set_pending_partitions(pending_partitions)
|
323
325
|
yield from pending_partitions
|
@@ -0,0 +1,5 @@
|
|
1
|
+
from .abstract_concurrent_file_based_cursor import AbstractConcurrentFileBasedCursor
|
2
|
+
from .file_based_noop_cursor import FileBasedNoopCursor
|
3
|
+
from .file_based_concurrent_cursor import FileBasedConcurrentCursor
|
4
|
+
|
5
|
+
__all__ = ["AbstractConcurrentFileBasedCursor", "FileBasedConcurrentCursor", "FileBasedNoopCursor"]
|
@@ -1,12 +1,12 @@
|
|
1
1
|
#
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
|
+
|
4
5
|
import logging
|
5
|
-
from abc import abstractmethod
|
6
|
+
from abc import ABC, abstractmethod
|
6
7
|
from datetime import datetime
|
7
|
-
from typing import Any, Iterable, MutableMapping
|
8
|
+
from typing import TYPE_CHECKING, Any, Iterable, List, MutableMapping
|
8
9
|
|
9
|
-
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
10
10
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
11
11
|
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
12
12
|
from airbyte_cdk.sources.file_based.types import StreamState
|
@@ -14,27 +14,33 @@ from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
|
|
14
14
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
15
15
|
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
16
16
|
|
17
|
+
if TYPE_CHECKING:
|
18
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
|
19
|
+
|
20
|
+
|
21
|
+
class AbstractConcurrentFileBasedCursor(Cursor, AbstractFileBasedCursor, ABC):
|
22
|
+
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
23
|
+
pass
|
17
24
|
|
18
|
-
class AbstractFileBasedConcurrentCursor(Cursor, AbstractFileBasedCursor):
|
19
25
|
@property
|
20
26
|
@abstractmethod
|
21
27
|
def state(self) -> MutableMapping[str, Any]:
|
22
28
|
...
|
23
29
|
|
24
30
|
@abstractmethod
|
25
|
-
def
|
31
|
+
def observe(self, record: Record) -> None:
|
26
32
|
...
|
27
33
|
|
28
34
|
@abstractmethod
|
29
|
-
def
|
35
|
+
def close_partition(self, partition: Partition) -> None:
|
30
36
|
...
|
31
37
|
|
32
38
|
@abstractmethod
|
33
|
-
def
|
39
|
+
def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
|
34
40
|
...
|
35
41
|
|
36
42
|
@abstractmethod
|
37
|
-
def
|
43
|
+
def add_file(self, file: RemoteFile) -> None:
|
38
44
|
...
|
39
45
|
|
40
46
|
@abstractmethod
|
@@ -42,49 +48,21 @@ class AbstractFileBasedConcurrentCursor(Cursor, AbstractFileBasedCursor):
|
|
42
48
|
...
|
43
49
|
|
44
50
|
@abstractmethod
|
45
|
-
def
|
51
|
+
def get_state(self) -> MutableMapping[str, Any]:
|
46
52
|
...
|
47
53
|
|
48
54
|
@abstractmethod
|
49
|
-
def
|
55
|
+
def set_initial_state(self, value: StreamState) -> None:
|
50
56
|
...
|
51
57
|
|
52
58
|
@abstractmethod
|
53
|
-
def set_pending_partitions(self, partitions: Iterable[Partition]) -> None:
|
54
|
-
...
|
55
|
-
|
56
|
-
|
57
|
-
class FileBasedNoopCursor(AbstractFileBasedConcurrentCursor):
|
58
|
-
def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any):
|
59
|
-
pass
|
60
|
-
|
61
|
-
@property
|
62
|
-
def state(self) -> MutableMapping[str, Any]:
|
63
|
-
return {}
|
64
|
-
|
65
|
-
def add_file(self, file: RemoteFile) -> None:
|
66
|
-
return None
|
67
|
-
|
68
|
-
def set_initial_state(self, value: StreamState) -> None:
|
69
|
-
return None
|
70
|
-
|
71
|
-
def get_state(self) -> MutableMapping[str, Any]:
|
72
|
-
return {}
|
73
|
-
|
74
59
|
def get_start_time(self) -> datetime:
|
75
|
-
|
76
|
-
|
77
|
-
def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]:
|
78
|
-
return []
|
79
|
-
|
80
|
-
def observe(self, record: Record) -> None:
|
81
|
-
return None
|
60
|
+
...
|
82
61
|
|
83
|
-
|
84
|
-
|
62
|
+
@abstractmethod
|
63
|
+
def emit_state_message(self) -> None:
|
64
|
+
...
|
85
65
|
|
66
|
+
@abstractmethod
|
86
67
|
def ensure_at_least_one_state_emitted(self) -> None:
|
87
|
-
|
88
|
-
|
89
|
-
def set_pending_partitions(self, partitions: Iterable[Partition]) -> None:
|
90
|
-
return None
|
68
|
+
...
|