airbyte_cdk-0.61.2-py3-none-any.whl → airbyte_cdk-0.62.1-py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/sources/abstract_source.py +14 -33
- airbyte_cdk/sources/connector_state_manager.py +16 -4
- airbyte_cdk/sources/file_based/file_based_source.py +87 -35
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +15 -13
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -0
- airbyte_cdk/sources/file_based/stream/concurrent/{cursor.py → cursor/abstract_concurrent_file_based_cursor.py} +22 -44
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +279 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_noop_cursor.py +56 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +11 -2
- airbyte_cdk/test/mock_http/mocker.py +3 -1
- airbyte_cdk/test/mock_http/response.py +9 -1
- airbyte_cdk/utils/traced_exception.py +1 -16
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/METADATA +1 -1
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/RECORD +33 -26
- unit_tests/sources/file_based/helpers.py +5 -0
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +2860 -0
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +11 -0
- unit_tests/sources/file_based/scenarios/scenario_builder.py +6 -2
- unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +365 -0
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +462 -0
- unit_tests/sources/file_based/test_file_based_scenarios.py +45 -0
- unit_tests/sources/file_based/test_scenarios.py +16 -8
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +13 -2
- unit_tests/sources/test_abstract_source.py +36 -170
- unit_tests/sources/test_connector_state_manager.py +20 -13
- unit_tests/sources/test_integration_source.py +8 -25
- unit_tests/sources/test_source_read.py +1 -1
- unit_tests/test/mock_http/test_mocker.py +3 -1
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/WHEEL +0 -0
- {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/top_level.txt +0 -0
@@ -15,7 +15,6 @@ from airbyte_cdk.models import (
|
|
15
15
|
ConfiguredAirbyteCatalog,
|
16
16
|
ConfiguredAirbyteStream,
|
17
17
|
Status,
|
18
|
-
StreamDescriptor,
|
19
18
|
SyncMode,
|
20
19
|
)
|
21
20
|
from airbyte_cdk.models import Type as MessageType
|
@@ -28,7 +27,6 @@ from airbyte_cdk.sources.streams.http.http import HttpStream
|
|
28
27
|
from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
|
29
28
|
from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, split_config
|
30
29
|
from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
|
31
|
-
from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
|
32
30
|
from airbyte_cdk.utils.event_timing import create_timer
|
33
31
|
from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message
|
34
32
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
@@ -101,7 +99,7 @@ class AbstractSource(Source, ABC):
|
|
101
99
|
# TODO assert all streams exist in the connector
|
102
100
|
# get the streams once in case the connector needs to make any queries to generate them
|
103
101
|
stream_instances = {s.name: s for s in self.streams(config)}
|
104
|
-
state_manager = ConnectorStateManager(stream_instance_map=
|
102
|
+
state_manager = ConnectorStateManager(stream_instance_map={s.stream.name: s.stream for s in catalog.streams}, state=state)
|
105
103
|
self._stream_to_instance_map = stream_instances
|
106
104
|
|
107
105
|
stream_name_to_exception: MutableMapping[str, AirbyteTracedException] = {}
|
@@ -135,16 +133,11 @@ class AbstractSource(Source, ABC):
|
|
135
133
|
logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
|
136
134
|
yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.COMPLETE)
|
137
135
|
except AirbyteTracedException as e:
|
138
|
-
logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
|
139
|
-
logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
|
140
136
|
yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE)
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
f"Stopping sync on error from stream {configured_stream.stream.name} because {self.name} does not support continuing syncs on error."
|
146
|
-
)
|
147
|
-
break
|
137
|
+
if self.continue_sync_on_stream_failure:
|
138
|
+
stream_name_to_exception[stream_instance.name] = e
|
139
|
+
else:
|
140
|
+
raise e
|
148
141
|
except Exception as e:
|
149
142
|
yield from self._emit_queued_messages()
|
150
143
|
logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
|
@@ -152,27 +145,15 @@ class AbstractSource(Source, ABC):
|
|
152
145
|
yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE)
|
153
146
|
display_message = stream_instance.get_error_display_message(e)
|
154
147
|
if display_message:
|
155
|
-
|
156
|
-
|
157
|
-
traced_exception = AirbyteTracedException.from_exception(e)
|
158
|
-
yield traced_exception.as_sanitized_airbyte_message(
|
159
|
-
stream_descriptor=StreamDescriptor(name=configured_stream.stream.name)
|
160
|
-
)
|
161
|
-
stream_name_to_exception[stream_instance.name] = traced_exception
|
162
|
-
if self.stop_sync_on_stream_failure:
|
163
|
-
logger.info(f"{self.name} does not support continuing syncs on error from stream {configured_stream.stream.name}")
|
164
|
-
break
|
148
|
+
raise AirbyteTracedException.from_exception(e, message=display_message) from e
|
149
|
+
raise e
|
165
150
|
finally:
|
166
151
|
timer.finish_event()
|
167
152
|
logger.info(f"Finished syncing {configured_stream.stream.name}")
|
168
153
|
logger.info(timer.report())
|
169
154
|
|
170
|
-
if len(stream_name_to_exception) > 0:
|
171
|
-
|
172
|
-
logger.info(error_message)
|
173
|
-
# We still raise at least one exception when a stream raises an exception because the platform
|
174
|
-
# currently relies on a non-zero exit code to determine if a sync attempt has failed
|
175
|
-
raise AirbyteTracedException(message=error_message)
|
155
|
+
if self.continue_sync_on_stream_failure and len(stream_name_to_exception) > 0:
|
156
|
+
raise AirbyteTracedException(message=self._generate_failed_streams_error_message(stream_name_to_exception))
|
176
157
|
logger.info(f"Finished syncing {self.name}")
|
177
158
|
|
178
159
|
@property
|
@@ -301,17 +282,17 @@ class AbstractSource(Source, ABC):
|
|
301
282
|
return _default_message_repository
|
302
283
|
|
303
284
|
@property
|
304
|
-
def
|
285
|
+
def continue_sync_on_stream_failure(self) -> bool:
|
305
286
|
"""
|
306
287
|
WARNING: This function is in-development which means it is subject to change. Use at your own risk.
|
307
288
|
|
308
|
-
By default,
|
309
|
-
|
310
|
-
|
289
|
+
By default, a source should raise an exception and stop the sync when it encounters an error while syncing a stream. This
|
290
|
+
method can be overridden on a per-source basis so that a source will continue syncing other streams even if an
|
291
|
+
exception is raised for a stream.
|
311
292
|
"""
|
312
293
|
return False
|
313
294
|
|
314
295
|
@staticmethod
|
315
296
|
def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
|
316
|
-
failures = ", ".join([f"{stream}: {
|
297
|
+
failures = ", ".join([f"{stream}: {exception.__repr__()}" for stream, exception in stream_failures.items()])
|
317
298
|
return f"During the sync, the following streams did not sync successfully: {failures}"
|
@@ -5,7 +5,15 @@
|
|
5
5
|
import copy
|
6
6
|
from typing import Any, List, Mapping, MutableMapping, Optional, Tuple, Union
|
7
7
|
|
8
|
-
from airbyte_cdk.models import
|
8
|
+
from airbyte_cdk.models import (
|
9
|
+
AirbyteMessage,
|
10
|
+
AirbyteStateBlob,
|
11
|
+
AirbyteStateMessage,
|
12
|
+
AirbyteStateType,
|
13
|
+
AirbyteStream,
|
14
|
+
AirbyteStreamState,
|
15
|
+
StreamDescriptor,
|
16
|
+
)
|
9
17
|
from airbyte_cdk.models import Type as MessageType
|
10
18
|
from airbyte_cdk.sources.streams import Stream
|
11
19
|
from pydantic import Extra
|
@@ -29,7 +37,9 @@ class ConnectorStateManager:
|
|
29
37
|
"""
|
30
38
|
|
31
39
|
def __init__(
|
32
|
-
self,
|
40
|
+
self,
|
41
|
+
stream_instance_map: Mapping[str, Union[Stream, AirbyteStream]],
|
42
|
+
state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
|
33
43
|
):
|
34
44
|
shared_state, per_stream_states = self._extract_from_state_message(state, stream_instance_map)
|
35
45
|
|
@@ -97,7 +107,9 @@ class ConnectorStateManager:
|
|
97
107
|
|
98
108
|
@classmethod
|
99
109
|
def _extract_from_state_message(
|
100
|
-
cls,
|
110
|
+
cls,
|
111
|
+
state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]],
|
112
|
+
stream_instance_map: Mapping[str, Union[Stream, AirbyteStream]],
|
101
113
|
) -> Tuple[Optional[AirbyteStateBlob], MutableMapping[HashableStreamDescriptor, Optional[AirbyteStateBlob]]]:
|
102
114
|
"""
|
103
115
|
Takes an incoming list of state messages or the legacy state format and extracts state attributes according to type
|
@@ -149,7 +161,7 @@ class ConnectorStateManager:
|
|
149
161
|
|
150
162
|
@staticmethod
|
151
163
|
def _create_descriptor_to_stream_state_mapping(
|
152
|
-
state: MutableMapping[str, Any], stream_to_instance_map: Mapping[str, Stream]
|
164
|
+
state: MutableMapping[str, Any], stream_to_instance_map: Mapping[str, Union[Stream, AirbyteStream]]
|
153
165
|
) -> MutableMapping[HashableStreamDescriptor, Optional[AirbyteStateBlob]]:
|
154
166
|
"""
|
155
167
|
Takes incoming state received in the legacy format and transforms it into a mapping of StreamDescriptor to AirbyteStreamState
|
@@ -12,6 +12,7 @@ from airbyte_cdk.logger import AirbyteLogFormatter, init_logger
|
|
12
12
|
from airbyte_cdk.models import (
|
13
13
|
AirbyteMessage,
|
14
14
|
AirbyteStateMessage,
|
15
|
+
AirbyteStream,
|
15
16
|
ConfiguredAirbyteCatalog,
|
16
17
|
ConnectorSpecification,
|
17
18
|
FailureType,
|
@@ -20,6 +21,7 @@ from airbyte_cdk.models import (
|
|
20
21
|
)
|
21
22
|
from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
|
22
23
|
from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
|
24
|
+
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
23
25
|
from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy, DefaultFileBasedAvailabilityStrategy
|
24
26
|
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
|
25
27
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
|
@@ -31,12 +33,15 @@ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeP
|
|
31
33
|
from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
|
32
34
|
from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
|
33
35
|
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
|
34
|
-
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import
|
36
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
|
37
|
+
AbstractConcurrentFileBasedCursor,
|
38
|
+
FileBasedConcurrentCursor,
|
39
|
+
FileBasedNoopCursor,
|
40
|
+
)
|
35
41
|
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
36
|
-
from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
|
37
42
|
from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository
|
38
|
-
from airbyte_cdk.sources.source import TState
|
39
43
|
from airbyte_cdk.sources.streams import Stream
|
44
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
|
40
45
|
from airbyte_cdk.utils.analytics_message import create_analytics_message
|
41
46
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
42
47
|
from pydantic.error_wrappers import ValidationError
|
@@ -56,12 +61,12 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
56
61
|
spec_class: Type[AbstractFileBasedSpec],
|
57
62
|
catalog: Optional[ConfiguredAirbyteCatalog],
|
58
63
|
config: Optional[Mapping[str, Any]],
|
59
|
-
state: Optional[
|
64
|
+
state: Optional[MutableMapping[str, Any]],
|
60
65
|
availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None,
|
61
66
|
discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
|
62
67
|
parsers: Mapping[Type[Any], FileTypeParser] = default_parsers,
|
63
68
|
validation_policies: Mapping[ValidationPolicy, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
|
64
|
-
cursor_cls: Type[AbstractFileBasedCursor] =
|
69
|
+
cursor_cls: Type[Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]] = FileBasedConcurrentCursor,
|
65
70
|
):
|
66
71
|
self.stream_reader = stream_reader
|
67
72
|
self.spec_class = spec_class
|
@@ -137,52 +142,99 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
137
142
|
"""
|
138
143
|
Return a list of this source's streams.
|
139
144
|
"""
|
140
|
-
file_based_streams = self._get_file_based_streams(config)
|
141
|
-
|
142
|
-
configured_streams: List[Stream] = []
|
143
|
-
|
144
|
-
for stream in file_based_streams:
|
145
|
-
sync_mode = self._get_sync_mode_from_catalog(stream)
|
146
|
-
if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
|
147
|
-
configured_streams.append(
|
148
|
-
FileBasedStreamFacade.create_from_stream(stream, self, self.logger, None, FileBasedNoopCursor(stream.config))
|
149
|
-
)
|
150
|
-
else:
|
151
|
-
configured_streams.append(stream)
|
152
145
|
|
153
|
-
|
146
|
+
if self.catalog:
|
147
|
+
state_manager = ConnectorStateManager(
|
148
|
+
stream_instance_map={s.stream.name: s.stream for s in self.catalog.streams},
|
149
|
+
state=self.state,
|
150
|
+
)
|
151
|
+
else:
|
152
|
+
# During `check` operations we don't have a catalog so cannot create a state manager.
|
153
|
+
# Since the state manager is only required for incremental syncs, this is fine.
|
154
|
+
state_manager = None
|
154
155
|
|
155
|
-
def _get_file_based_streams(self, config: Mapping[str, Any]) -> List[AbstractFileBasedStream]:
|
156
156
|
try:
|
157
157
|
parsed_config = self._get_parsed_config(config)
|
158
158
|
self.stream_reader.config = parsed_config
|
159
|
-
streams: List[
|
159
|
+
streams: List[Stream] = []
|
160
160
|
for stream_config in parsed_config.streams:
|
161
|
+
# Like state_manager, `catalog_stream` may be None during `check`
|
162
|
+
catalog_stream = self._get_stream_from_catalog(stream_config)
|
163
|
+
stream_state = (
|
164
|
+
state_manager.get_stream_state(catalog_stream.name, catalog_stream.namespace)
|
165
|
+
if (state_manager and catalog_stream)
|
166
|
+
else None
|
167
|
+
)
|
161
168
|
self._validate_input_schema(stream_config)
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
parsers=self.parsers,
|
170
|
-
validation_policy=self._validate_and_get_validation_policy(stream_config),
|
171
|
-
cursor=self.cursor_cls(stream_config),
|
172
|
-
errors_collector=self.errors_collector,
|
169
|
+
|
170
|
+
sync_mode = self._get_sync_mode_from_catalog(stream_config.name)
|
171
|
+
|
172
|
+
if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
|
173
|
+
cursor = FileBasedNoopCursor(stream_config)
|
174
|
+
stream = FileBasedStreamFacade.create_from_stream(
|
175
|
+
self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
|
173
176
|
)
|
174
|
-
|
177
|
+
|
178
|
+
elif (
|
179
|
+
sync_mode == SyncMode.incremental
|
180
|
+
and issubclass(self.cursor_cls, AbstractConcurrentFileBasedCursor)
|
181
|
+
and hasattr(self, "_concurrency_level")
|
182
|
+
and self._concurrency_level is not None
|
183
|
+
):
|
184
|
+
assert (
|
185
|
+
state_manager is not None
|
186
|
+
), "No ConnectorStateManager was created, but it is required for incremental syncs. This is unexpected. Please contact Support."
|
187
|
+
|
188
|
+
cursor = self.cursor_cls(
|
189
|
+
stream_config,
|
190
|
+
stream_config.name,
|
191
|
+
None,
|
192
|
+
stream_state,
|
193
|
+
self.message_repository,
|
194
|
+
state_manager,
|
195
|
+
CursorField(DefaultFileBasedStream.ab_last_mod_col),
|
196
|
+
)
|
197
|
+
stream = FileBasedStreamFacade.create_from_stream(
|
198
|
+
self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
|
199
|
+
)
|
200
|
+
else:
|
201
|
+
cursor = self.cursor_cls(stream_config)
|
202
|
+
stream = self._make_default_stream(stream_config, cursor)
|
203
|
+
|
204
|
+
streams.append(stream)
|
175
205
|
return streams
|
176
206
|
|
177
207
|
except ValidationError as exc:
|
178
208
|
raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc
|
179
209
|
|
180
|
-
def
|
210
|
+
def _make_default_stream(
|
211
|
+
self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor]
|
212
|
+
) -> AbstractFileBasedStream:
|
213
|
+
return DefaultFileBasedStream(
|
214
|
+
config=stream_config,
|
215
|
+
catalog_schema=self.stream_schemas.get(stream_config.name),
|
216
|
+
stream_reader=self.stream_reader,
|
217
|
+
availability_strategy=self.availability_strategy,
|
218
|
+
discovery_policy=self.discovery_policy,
|
219
|
+
parsers=self.parsers,
|
220
|
+
validation_policy=self._validate_and_get_validation_policy(stream_config),
|
221
|
+
errors_collector=self.errors_collector,
|
222
|
+
cursor=cursor,
|
223
|
+
)
|
224
|
+
|
225
|
+
def _get_stream_from_catalog(self, stream_config: FileBasedStreamConfig) -> Optional[AirbyteStream]:
|
226
|
+
if self.catalog:
|
227
|
+
for stream in self.catalog.streams or []:
|
228
|
+
if stream.stream.name == stream_config.name:
|
229
|
+
return stream.stream
|
230
|
+
return None
|
231
|
+
|
232
|
+
def _get_sync_mode_from_catalog(self, stream_name: str) -> Optional[SyncMode]:
|
181
233
|
if self.catalog:
|
182
234
|
for catalog_stream in self.catalog.streams:
|
183
|
-
if
|
235
|
+
if stream_name == catalog_stream.stream.name:
|
184
236
|
return catalog_stream.sync_mode
|
185
|
-
self.logger.warning(f"No sync mode was found for {
|
237
|
+
self.logger.warning(f"No sync mode was found for {stream_name}.")
|
186
238
|
return None
|
187
239
|
|
188
240
|
def read(
|
@@ -15,6 +15,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
|
|
15
15
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
16
16
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
17
17
|
from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
|
18
|
+
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
18
19
|
from airbyte_cdk.sources.file_based.types import StreamSlice
|
19
20
|
from airbyte_cdk.sources.streams import Stream
|
20
21
|
|
@@ -45,6 +46,7 @@ class AbstractFileBasedStream(Stream):
|
|
45
46
|
parsers: Dict[Type[Any], FileTypeParser],
|
46
47
|
validation_policy: AbstractSchemaValidationPolicy,
|
47
48
|
errors_collector: FileBasedErrorsCollector,
|
49
|
+
cursor: AbstractFileBasedCursor,
|
48
50
|
):
|
49
51
|
super().__init__()
|
50
52
|
self.config = config
|
@@ -55,6 +57,7 @@ class AbstractFileBasedStream(Stream):
|
|
55
57
|
self._availability_strategy = availability_strategy
|
56
58
|
self._parsers = parsers
|
57
59
|
self.errors_collector = errors_collector
|
60
|
+
self._cursor = cursor
|
58
61
|
|
59
62
|
@property
|
60
63
|
@abstractmethod
|
@@ -5,7 +5,7 @@
|
|
5
5
|
import copy
|
6
6
|
import logging
|
7
7
|
from functools import lru_cache
|
8
|
-
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
|
8
|
+
from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union
|
9
9
|
|
10
10
|
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode, Type
|
11
11
|
from airbyte_cdk.sources import AbstractSource
|
@@ -19,6 +19,7 @@ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeP
|
|
19
19
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
20
20
|
from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
|
21
21
|
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedNoopCursor
|
22
|
+
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
22
23
|
from airbyte_cdk.sources.file_based.types import StreamSlice
|
23
24
|
from airbyte_cdk.sources.message import MessageRepository
|
24
25
|
from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
|
@@ -33,6 +34,9 @@ from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
|
|
33
34
|
from airbyte_cdk.sources.utils.slice_logger import SliceLogger
|
34
35
|
from deprecated.classic import deprecated
|
35
36
|
|
37
|
+
if TYPE_CHECKING:
|
38
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
|
39
|
+
|
36
40
|
"""
|
37
41
|
This module contains adapters to help enable concurrency on File-based Stream objects without needing to migrate to AbstractStream
|
38
42
|
"""
|
@@ -47,13 +51,14 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
|
|
47
51
|
source: AbstractSource,
|
48
52
|
logger: logging.Logger,
|
49
53
|
state: Optional[MutableMapping[str, Any]],
|
50
|
-
cursor:
|
54
|
+
cursor: "AbstractConcurrentFileBasedCursor",
|
51
55
|
) -> "FileBasedStreamFacade":
|
52
56
|
"""
|
53
57
|
Create a ConcurrentStream from a FileBasedStream object.
|
54
58
|
"""
|
55
59
|
pk = get_primary_key_from_stream(stream.primary_key)
|
56
60
|
cursor_field = get_cursor_field_from_stream(stream)
|
61
|
+
stream._cursor = cursor
|
57
62
|
|
58
63
|
if not source.message_repository:
|
59
64
|
raise ValueError(
|
@@ -62,7 +67,7 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
|
|
62
67
|
|
63
68
|
message_repository = source.message_repository
|
64
69
|
return FileBasedStreamFacade(
|
65
|
-
DefaultStream(
|
70
|
+
DefaultStream(
|
66
71
|
partition_generator=FileBasedStreamPartitionGenerator(
|
67
72
|
stream,
|
68
73
|
message_repository,
|
@@ -90,14 +95,13 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
|
|
90
95
|
self,
|
91
96
|
stream: DefaultStream,
|
92
97
|
legacy_stream: AbstractFileBasedStream,
|
93
|
-
cursor:
|
98
|
+
cursor: AbstractFileBasedCursor,
|
94
99
|
slice_logger: SliceLogger,
|
95
100
|
logger: logging.Logger,
|
96
101
|
):
|
97
102
|
"""
|
98
103
|
:param stream: The underlying AbstractStream
|
99
104
|
"""
|
100
|
-
# super().__init__(stream, legacy_stream, cursor, slice_logger, logger)
|
101
105
|
self._abstract_stream = stream
|
102
106
|
self._legacy_stream = legacy_stream
|
103
107
|
self._cursor = cursor
|
@@ -216,7 +220,7 @@ class FileBasedStreamPartition(Partition):
|
|
216
220
|
sync_mode: SyncMode,
|
217
221
|
cursor_field: Optional[List[str]],
|
218
222
|
state: Optional[MutableMapping[str, Any]],
|
219
|
-
cursor:
|
223
|
+
cursor: "AbstractConcurrentFileBasedCursor",
|
220
224
|
):
|
221
225
|
self._stream = stream
|
222
226
|
self._slice = _slice
|
@@ -292,7 +296,7 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
|
|
292
296
|
sync_mode: SyncMode,
|
293
297
|
cursor_field: Optional[List[str]],
|
294
298
|
state: Optional[MutableMapping[str, Any]],
|
295
|
-
cursor:
|
299
|
+
cursor: "AbstractConcurrentFileBasedCursor",
|
296
300
|
):
|
297
301
|
self._stream = stream
|
298
302
|
self._message_repository = message_repository
|
@@ -305,19 +309,17 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
|
|
305
309
|
pending_partitions = []
|
306
310
|
for _slice in self._stream.stream_slices(sync_mode=self._sync_mode, cursor_field=self._cursor_field, stream_state=self._state):
|
307
311
|
if _slice is not None:
|
308
|
-
|
309
|
-
|
312
|
+
for file in _slice.get("files", []):
|
313
|
+
pending_partitions.append(
|
310
314
|
FileBasedStreamPartition(
|
311
315
|
self._stream,
|
312
|
-
{"files": [copy.deepcopy(
|
316
|
+
{"files": [copy.deepcopy(file)]},
|
313
317
|
self._message_repository,
|
314
318
|
self._sync_mode,
|
315
319
|
self._cursor_field,
|
316
320
|
self._state,
|
317
321
|
self._cursor,
|
318
322
|
)
|
319
|
-
|
320
|
-
]
|
321
|
-
)
|
323
|
+
)
|
322
324
|
self._cursor.set_pending_partitions(pending_partitions)
|
323
325
|
yield from pending_partitions
|
@@ -0,0 +1,5 @@
|
|
1
|
+
from .abstract_concurrent_file_based_cursor import AbstractConcurrentFileBasedCursor
|
2
|
+
from .file_based_noop_cursor import FileBasedNoopCursor
|
3
|
+
from .file_based_concurrent_cursor import FileBasedConcurrentCursor
|
4
|
+
|
5
|
+
__all__ = ["AbstractConcurrentFileBasedCursor", "FileBasedConcurrentCursor", "FileBasedNoopCursor"]
|
@@ -1,12 +1,12 @@
|
|
1
1
|
#
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
|
+
|
4
5
|
import logging
|
5
|
-
from abc import abstractmethod
|
6
|
+
from abc import ABC, abstractmethod
|
6
7
|
from datetime import datetime
|
7
|
-
from typing import Any, Iterable, MutableMapping
|
8
|
+
from typing import TYPE_CHECKING, Any, Iterable, List, MutableMapping
|
8
9
|
|
9
|
-
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
10
10
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
11
11
|
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
12
12
|
from airbyte_cdk.sources.file_based.types import StreamState
|
@@ -14,27 +14,33 @@ from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
|
|
14
14
|
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
15
15
|
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
16
16
|
|
17
|
+
if TYPE_CHECKING:
|
18
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
|
19
|
+
|
20
|
+
|
21
|
+
class AbstractConcurrentFileBasedCursor(Cursor, AbstractFileBasedCursor, ABC):
|
22
|
+
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
23
|
+
pass
|
17
24
|
|
18
|
-
class AbstractFileBasedConcurrentCursor(Cursor, AbstractFileBasedCursor):
|
19
25
|
@property
|
20
26
|
@abstractmethod
|
21
27
|
def state(self) -> MutableMapping[str, Any]:
|
22
28
|
...
|
23
29
|
|
24
30
|
@abstractmethod
|
25
|
-
def
|
31
|
+
def observe(self, record: Record) -> None:
|
26
32
|
...
|
27
33
|
|
28
34
|
@abstractmethod
|
29
|
-
def
|
35
|
+
def close_partition(self, partition: Partition) -> None:
|
30
36
|
...
|
31
37
|
|
32
38
|
@abstractmethod
|
33
|
-
def
|
39
|
+
def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
|
34
40
|
...
|
35
41
|
|
36
42
|
@abstractmethod
|
37
|
-
def
|
43
|
+
def add_file(self, file: RemoteFile) -> None:
|
38
44
|
...
|
39
45
|
|
40
46
|
@abstractmethod
|
@@ -42,49 +48,21 @@ class AbstractFileBasedConcurrentCursor(Cursor, AbstractFileBasedCursor):
|
|
42
48
|
...
|
43
49
|
|
44
50
|
@abstractmethod
|
45
|
-
def
|
51
|
+
def get_state(self) -> MutableMapping[str, Any]:
|
46
52
|
...
|
47
53
|
|
48
54
|
@abstractmethod
|
49
|
-
def
|
55
|
+
def set_initial_state(self, value: StreamState) -> None:
|
50
56
|
...
|
51
57
|
|
52
58
|
@abstractmethod
|
53
|
-
def set_pending_partitions(self, partitions: Iterable[Partition]) -> None:
|
54
|
-
...
|
55
|
-
|
56
|
-
|
57
|
-
class FileBasedNoopCursor(AbstractFileBasedConcurrentCursor):
|
58
|
-
def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any):
|
59
|
-
pass
|
60
|
-
|
61
|
-
@property
|
62
|
-
def state(self) -> MutableMapping[str, Any]:
|
63
|
-
return {}
|
64
|
-
|
65
|
-
def add_file(self, file: RemoteFile) -> None:
|
66
|
-
return None
|
67
|
-
|
68
|
-
def set_initial_state(self, value: StreamState) -> None:
|
69
|
-
return None
|
70
|
-
|
71
|
-
def get_state(self) -> MutableMapping[str, Any]:
|
72
|
-
return {}
|
73
|
-
|
74
59
|
def get_start_time(self) -> datetime:
|
75
|
-
|
76
|
-
|
77
|
-
def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]:
|
78
|
-
return []
|
79
|
-
|
80
|
-
def observe(self, record: Record) -> None:
|
81
|
-
return None
|
60
|
+
...
|
82
61
|
|
83
|
-
|
84
|
-
|
62
|
+
@abstractmethod
|
63
|
+
def emit_state_message(self) -> None:
|
64
|
+
...
|
85
65
|
|
66
|
+
@abstractmethod
|
86
67
|
def ensure_at_least_one_state_emitted(self) -> None:
|
87
|
-
|
88
|
-
|
89
|
-
def set_pending_partitions(self, partitions: Iterable[Partition]) -> None:
|
90
|
-
return None
|
68
|
+
...
|