airbyte-cdk 0.61.2__py3-none-any.whl → 0.62.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. airbyte_cdk/sources/abstract_source.py +14 -33
  2. airbyte_cdk/sources/connector_state_manager.py +16 -4
  3. airbyte_cdk/sources/file_based/file_based_source.py +87 -35
  4. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -0
  5. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +15 -13
  6. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -0
  7. airbyte_cdk/sources/file_based/stream/concurrent/{cursor.py → cursor/abstract_concurrent_file_based_cursor.py} +22 -44
  8. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +279 -0
  9. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_noop_cursor.py +56 -0
  10. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +11 -2
  11. airbyte_cdk/test/mock_http/mocker.py +3 -1
  12. airbyte_cdk/test/mock_http/response.py +9 -1
  13. airbyte_cdk/utils/traced_exception.py +1 -16
  14. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/METADATA +1 -1
  15. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/RECORD +33 -26
  16. unit_tests/sources/file_based/helpers.py +5 -0
  17. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +2860 -0
  18. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +11 -0
  19. unit_tests/sources/file_based/scenarios/scenario_builder.py +6 -2
  20. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  21. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +365 -0
  22. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +462 -0
  23. unit_tests/sources/file_based/test_file_based_scenarios.py +45 -0
  24. unit_tests/sources/file_based/test_scenarios.py +16 -8
  25. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +13 -2
  26. unit_tests/sources/test_abstract_source.py +36 -170
  27. unit_tests/sources/test_connector_state_manager.py +20 -13
  28. unit_tests/sources/test_integration_source.py +8 -25
  29. unit_tests/sources/test_source_read.py +1 -1
  30. unit_tests/test/mock_http/test_mocker.py +3 -1
  31. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/LICENSE.txt +0 -0
  32. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/WHEEL +0 -0
  33. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/top_level.txt +0 -0
@@ -15,7 +15,6 @@ from airbyte_cdk.models import (
15
15
  ConfiguredAirbyteCatalog,
16
16
  ConfiguredAirbyteStream,
17
17
  Status,
18
- StreamDescriptor,
19
18
  SyncMode,
20
19
  )
21
20
  from airbyte_cdk.models import Type as MessageType
@@ -28,7 +27,6 @@ from airbyte_cdk.sources.streams.http.http import HttpStream
28
27
  from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
29
28
  from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, split_config
30
29
  from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
31
- from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
32
30
  from airbyte_cdk.utils.event_timing import create_timer
33
31
  from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message
34
32
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
@@ -101,7 +99,7 @@ class AbstractSource(Source, ABC):
101
99
  # TODO assert all streams exist in the connector
102
100
  # get the streams once in case the connector needs to make any queries to generate them
103
101
  stream_instances = {s.name: s for s in self.streams(config)}
104
- state_manager = ConnectorStateManager(stream_instance_map=stream_instances, state=state)
102
+ state_manager = ConnectorStateManager(stream_instance_map={s.stream.name: s.stream for s in catalog.streams}, state=state)
105
103
  self._stream_to_instance_map = stream_instances
106
104
 
107
105
  stream_name_to_exception: MutableMapping[str, AirbyteTracedException] = {}
@@ -135,16 +133,11 @@ class AbstractSource(Source, ABC):
135
133
  logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
136
134
  yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.COMPLETE)
137
135
  except AirbyteTracedException as e:
138
- logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
139
- logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
140
136
  yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE)
141
- yield e.as_sanitized_airbyte_message(stream_descriptor=StreamDescriptor(name=configured_stream.stream.name))
142
- stream_name_to_exception[stream_instance.name] = e
143
- if self.stop_sync_on_stream_failure:
144
- logger.info(
145
- f"Stopping sync on error from stream {configured_stream.stream.name} because {self.name} does not support continuing syncs on error."
146
- )
147
- break
137
+ if self.continue_sync_on_stream_failure:
138
+ stream_name_to_exception[stream_instance.name] = e
139
+ else:
140
+ raise e
148
141
  except Exception as e:
149
142
  yield from self._emit_queued_messages()
150
143
  logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
@@ -152,27 +145,15 @@ class AbstractSource(Source, ABC):
152
145
  yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE)
153
146
  display_message = stream_instance.get_error_display_message(e)
154
147
  if display_message:
155
- traced_exception = AirbyteTracedException.from_exception(e, message=display_message)
156
- else:
157
- traced_exception = AirbyteTracedException.from_exception(e)
158
- yield traced_exception.as_sanitized_airbyte_message(
159
- stream_descriptor=StreamDescriptor(name=configured_stream.stream.name)
160
- )
161
- stream_name_to_exception[stream_instance.name] = traced_exception
162
- if self.stop_sync_on_stream_failure:
163
- logger.info(f"{self.name} does not support continuing syncs on error from stream {configured_stream.stream.name}")
164
- break
148
+ raise AirbyteTracedException.from_exception(e, message=display_message) from e
149
+ raise e
165
150
  finally:
166
151
  timer.finish_event()
167
152
  logger.info(f"Finished syncing {configured_stream.stream.name}")
168
153
  logger.info(timer.report())
169
154
 
170
- if len(stream_name_to_exception) > 0:
171
- error_message = self._generate_failed_streams_error_message(stream_name_to_exception)
172
- logger.info(error_message)
173
- # We still raise at least one exception when a stream raises an exception because the platform
174
- # currently relies on a non-zero exit code to determine if a sync attempt has failed
175
- raise AirbyteTracedException(message=error_message)
155
+ if self.continue_sync_on_stream_failure and len(stream_name_to_exception) > 0:
156
+ raise AirbyteTracedException(message=self._generate_failed_streams_error_message(stream_name_to_exception))
176
157
  logger.info(f"Finished syncing {self.name}")
177
158
 
178
159
  @property
@@ -301,17 +282,17 @@ class AbstractSource(Source, ABC):
301
282
  return _default_message_repository
302
283
 
303
284
  @property
304
- def stop_sync_on_stream_failure(self) -> bool:
285
+ def continue_sync_on_stream_failure(self) -> bool:
305
286
  """
306
287
  WARNING: This function is in-development which means it is subject to change. Use at your own risk.
307
288
 
308
- By default, when a source encounters an exception while syncing a stream, it will emit an error trace message and then
309
- continue syncing the next stream. This can be overwridden on a per-source basis so that the source will stop the sync
310
- on the first error seen and emit a single error trace message for that stream.
289
+ By default, a source should raise an exception and stop the sync when it encounters an error while syncing a stream. This
290
+ method can be overridden on a per-source basis so that a source will continue syncing other streams even if an
291
+ exception is raised for a stream.
311
292
  """
312
293
  return False
313
294
 
314
295
  @staticmethod
315
296
  def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
316
- failures = ", ".join([f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exception in stream_failures.items()])
297
+ failures = ", ".join([f"{stream}: {exception.__repr__()}" for stream, exception in stream_failures.items()])
317
298
  return f"During the sync, the following streams did not sync successfully: {failures}"
@@ -5,7 +5,15 @@
5
5
  import copy
6
6
  from typing import Any, List, Mapping, MutableMapping, Optional, Tuple, Union
7
7
 
8
- from airbyte_cdk.models import AirbyteMessage, AirbyteStateBlob, AirbyteStateMessage, AirbyteStateType, AirbyteStreamState, StreamDescriptor
8
+ from airbyte_cdk.models import (
9
+ AirbyteMessage,
10
+ AirbyteStateBlob,
11
+ AirbyteStateMessage,
12
+ AirbyteStateType,
13
+ AirbyteStream,
14
+ AirbyteStreamState,
15
+ StreamDescriptor,
16
+ )
9
17
  from airbyte_cdk.models import Type as MessageType
10
18
  from airbyte_cdk.sources.streams import Stream
11
19
  from pydantic import Extra
@@ -29,7 +37,9 @@ class ConnectorStateManager:
29
37
  """
30
38
 
31
39
  def __init__(
32
- self, stream_instance_map: Mapping[str, Stream], state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None
40
+ self,
41
+ stream_instance_map: Mapping[str, Union[Stream, AirbyteStream]],
42
+ state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
33
43
  ):
34
44
  shared_state, per_stream_states = self._extract_from_state_message(state, stream_instance_map)
35
45
 
@@ -97,7 +107,9 @@ class ConnectorStateManager:
97
107
 
98
108
  @classmethod
99
109
  def _extract_from_state_message(
100
- cls, state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]], stream_instance_map: Mapping[str, Stream]
110
+ cls,
111
+ state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]],
112
+ stream_instance_map: Mapping[str, Union[Stream, AirbyteStream]],
101
113
  ) -> Tuple[Optional[AirbyteStateBlob], MutableMapping[HashableStreamDescriptor, Optional[AirbyteStateBlob]]]:
102
114
  """
103
115
  Takes an incoming list of state messages or the legacy state format and extracts state attributes according to type
@@ -149,7 +161,7 @@ class ConnectorStateManager:
149
161
 
150
162
  @staticmethod
151
163
  def _create_descriptor_to_stream_state_mapping(
152
- state: MutableMapping[str, Any], stream_to_instance_map: Mapping[str, Stream]
164
+ state: MutableMapping[str, Any], stream_to_instance_map: Mapping[str, Union[Stream, AirbyteStream]]
153
165
  ) -> MutableMapping[HashableStreamDescriptor, Optional[AirbyteStateBlob]]:
154
166
  """
155
167
  Takes incoming state received in the legacy format and transforms it into a mapping of StreamDescriptor to AirbyteStreamState
@@ -12,6 +12,7 @@ from airbyte_cdk.logger import AirbyteLogFormatter, init_logger
12
12
  from airbyte_cdk.models import (
13
13
  AirbyteMessage,
14
14
  AirbyteStateMessage,
15
+ AirbyteStream,
15
16
  ConfiguredAirbyteCatalog,
16
17
  ConnectorSpecification,
17
18
  FailureType,
@@ -20,6 +21,7 @@ from airbyte_cdk.models import (
20
21
  )
21
22
  from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
22
23
  from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
24
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
23
25
  from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy, DefaultFileBasedAvailabilityStrategy
24
26
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
25
27
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
@@ -31,12 +33,15 @@ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeP
31
33
  from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
32
34
  from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
33
35
  from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
34
- from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedNoopCursor
36
+ from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
37
+ AbstractConcurrentFileBasedCursor,
38
+ FileBasedConcurrentCursor,
39
+ FileBasedNoopCursor,
40
+ )
35
41
  from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
36
- from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
37
42
  from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository
38
- from airbyte_cdk.sources.source import TState
39
43
  from airbyte_cdk.sources.streams import Stream
44
+ from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
40
45
  from airbyte_cdk.utils.analytics_message import create_analytics_message
41
46
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
42
47
  from pydantic.error_wrappers import ValidationError
@@ -56,12 +61,12 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
56
61
  spec_class: Type[AbstractFileBasedSpec],
57
62
  catalog: Optional[ConfiguredAirbyteCatalog],
58
63
  config: Optional[Mapping[str, Any]],
59
- state: Optional[TState],
64
+ state: Optional[MutableMapping[str, Any]],
60
65
  availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None,
61
66
  discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
62
67
  parsers: Mapping[Type[Any], FileTypeParser] = default_parsers,
63
68
  validation_policies: Mapping[ValidationPolicy, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
64
- cursor_cls: Type[AbstractFileBasedCursor] = DefaultFileBasedCursor,
69
+ cursor_cls: Type[Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]] = FileBasedConcurrentCursor,
65
70
  ):
66
71
  self.stream_reader = stream_reader
67
72
  self.spec_class = spec_class
@@ -137,52 +142,99 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
137
142
  """
138
143
  Return a list of this source's streams.
139
144
  """
140
- file_based_streams = self._get_file_based_streams(config)
141
-
142
- configured_streams: List[Stream] = []
143
-
144
- for stream in file_based_streams:
145
- sync_mode = self._get_sync_mode_from_catalog(stream)
146
- if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
147
- configured_streams.append(
148
- FileBasedStreamFacade.create_from_stream(stream, self, self.logger, None, FileBasedNoopCursor(stream.config))
149
- )
150
- else:
151
- configured_streams.append(stream)
152
145
 
153
- return configured_streams
146
+ if self.catalog:
147
+ state_manager = ConnectorStateManager(
148
+ stream_instance_map={s.stream.name: s.stream for s in self.catalog.streams},
149
+ state=self.state,
150
+ )
151
+ else:
152
+ # During `check` operations we don't have a catalog so cannot create a state manager.
153
+ # Since the state manager is only required for incremental syncs, this is fine.
154
+ state_manager = None
154
155
 
155
- def _get_file_based_streams(self, config: Mapping[str, Any]) -> List[AbstractFileBasedStream]:
156
156
  try:
157
157
  parsed_config = self._get_parsed_config(config)
158
158
  self.stream_reader.config = parsed_config
159
- streams: List[AbstractFileBasedStream] = []
159
+ streams: List[Stream] = []
160
160
  for stream_config in parsed_config.streams:
161
+ # Like state_manager, `catalog_stream` may be None during `check`
162
+ catalog_stream = self._get_stream_from_catalog(stream_config)
163
+ stream_state = (
164
+ state_manager.get_stream_state(catalog_stream.name, catalog_stream.namespace)
165
+ if (state_manager and catalog_stream)
166
+ else None
167
+ )
161
168
  self._validate_input_schema(stream_config)
162
- streams.append(
163
- DefaultFileBasedStream(
164
- config=stream_config,
165
- catalog_schema=self.stream_schemas.get(stream_config.name),
166
- stream_reader=self.stream_reader,
167
- availability_strategy=self.availability_strategy,
168
- discovery_policy=self.discovery_policy,
169
- parsers=self.parsers,
170
- validation_policy=self._validate_and_get_validation_policy(stream_config),
171
- cursor=self.cursor_cls(stream_config),
172
- errors_collector=self.errors_collector,
169
+
170
+ sync_mode = self._get_sync_mode_from_catalog(stream_config.name)
171
+
172
+ if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
173
+ cursor = FileBasedNoopCursor(stream_config)
174
+ stream = FileBasedStreamFacade.create_from_stream(
175
+ self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
173
176
  )
174
- )
177
+
178
+ elif (
179
+ sync_mode == SyncMode.incremental
180
+ and issubclass(self.cursor_cls, AbstractConcurrentFileBasedCursor)
181
+ and hasattr(self, "_concurrency_level")
182
+ and self._concurrency_level is not None
183
+ ):
184
+ assert (
185
+ state_manager is not None
186
+ ), "No ConnectorStateManager was created, but it is required for incremental syncs. This is unexpected. Please contact Support."
187
+
188
+ cursor = self.cursor_cls(
189
+ stream_config,
190
+ stream_config.name,
191
+ None,
192
+ stream_state,
193
+ self.message_repository,
194
+ state_manager,
195
+ CursorField(DefaultFileBasedStream.ab_last_mod_col),
196
+ )
197
+ stream = FileBasedStreamFacade.create_from_stream(
198
+ self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
199
+ )
200
+ else:
201
+ cursor = self.cursor_cls(stream_config)
202
+ stream = self._make_default_stream(stream_config, cursor)
203
+
204
+ streams.append(stream)
175
205
  return streams
176
206
 
177
207
  except ValidationError as exc:
178
208
  raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc
179
209
 
180
- def _get_sync_mode_from_catalog(self, stream: Stream) -> Optional[SyncMode]:
210
+ def _make_default_stream(
211
+ self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor]
212
+ ) -> AbstractFileBasedStream:
213
+ return DefaultFileBasedStream(
214
+ config=stream_config,
215
+ catalog_schema=self.stream_schemas.get(stream_config.name),
216
+ stream_reader=self.stream_reader,
217
+ availability_strategy=self.availability_strategy,
218
+ discovery_policy=self.discovery_policy,
219
+ parsers=self.parsers,
220
+ validation_policy=self._validate_and_get_validation_policy(stream_config),
221
+ errors_collector=self.errors_collector,
222
+ cursor=cursor,
223
+ )
224
+
225
+ def _get_stream_from_catalog(self, stream_config: FileBasedStreamConfig) -> Optional[AirbyteStream]:
226
+ if self.catalog:
227
+ for stream in self.catalog.streams or []:
228
+ if stream.stream.name == stream_config.name:
229
+ return stream.stream
230
+ return None
231
+
232
+ def _get_sync_mode_from_catalog(self, stream_name: str) -> Optional[SyncMode]:
181
233
  if self.catalog:
182
234
  for catalog_stream in self.catalog.streams:
183
- if stream.name == catalog_stream.stream.name:
235
+ if stream_name == catalog_stream.stream.name:
184
236
  return catalog_stream.sync_mode
185
- self.logger.warning(f"No sync mode was found for {stream.name}.")
237
+ self.logger.warning(f"No sync mode was found for {stream_name}.")
186
238
  return None
187
239
 
188
240
  def read(
@@ -15,6 +15,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
15
15
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
16
16
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
17
17
  from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
18
+ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
18
19
  from airbyte_cdk.sources.file_based.types import StreamSlice
19
20
  from airbyte_cdk.sources.streams import Stream
20
21
 
@@ -45,6 +46,7 @@ class AbstractFileBasedStream(Stream):
45
46
  parsers: Dict[Type[Any], FileTypeParser],
46
47
  validation_policy: AbstractSchemaValidationPolicy,
47
48
  errors_collector: FileBasedErrorsCollector,
49
+ cursor: AbstractFileBasedCursor,
48
50
  ):
49
51
  super().__init__()
50
52
  self.config = config
@@ -55,6 +57,7 @@ class AbstractFileBasedStream(Stream):
55
57
  self._availability_strategy = availability_strategy
56
58
  self._parsers = parsers
57
59
  self.errors_collector = errors_collector
60
+ self._cursor = cursor
58
61
 
59
62
  @property
60
63
  @abstractmethod
@@ -5,7 +5,7 @@
5
5
  import copy
6
6
  import logging
7
7
  from functools import lru_cache
8
- from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
8
+ from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union
9
9
 
10
10
  from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode, Type
11
11
  from airbyte_cdk.sources import AbstractSource
@@ -19,6 +19,7 @@ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeP
19
19
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
20
20
  from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
21
21
  from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedNoopCursor
22
+ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
22
23
  from airbyte_cdk.sources.file_based.types import StreamSlice
23
24
  from airbyte_cdk.sources.message import MessageRepository
24
25
  from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
@@ -33,6 +34,9 @@ from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
33
34
  from airbyte_cdk.sources.utils.slice_logger import SliceLogger
34
35
  from deprecated.classic import deprecated
35
36
 
37
+ if TYPE_CHECKING:
38
+ from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
39
+
36
40
  """
37
41
  This module contains adapters to help enabling concurrency on File-based Stream objects without needing to migrate to AbstractStream
38
42
  """
@@ -47,13 +51,14 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
47
51
  source: AbstractSource,
48
52
  logger: logging.Logger,
49
53
  state: Optional[MutableMapping[str, Any]],
50
- cursor: FileBasedNoopCursor,
54
+ cursor: "AbstractConcurrentFileBasedCursor",
51
55
  ) -> "FileBasedStreamFacade":
52
56
  """
53
57
  Create a ConcurrentStream from a FileBasedStream object.
54
58
  """
55
59
  pk = get_primary_key_from_stream(stream.primary_key)
56
60
  cursor_field = get_cursor_field_from_stream(stream)
61
+ stream._cursor = cursor
57
62
 
58
63
  if not source.message_repository:
59
64
  raise ValueError(
@@ -62,7 +67,7 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
62
67
 
63
68
  message_repository = source.message_repository
64
69
  return FileBasedStreamFacade(
65
- DefaultStream( # type: ignore
70
+ DefaultStream(
66
71
  partition_generator=FileBasedStreamPartitionGenerator(
67
72
  stream,
68
73
  message_repository,
@@ -90,14 +95,13 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
90
95
  self,
91
96
  stream: DefaultStream,
92
97
  legacy_stream: AbstractFileBasedStream,
93
- cursor: FileBasedNoopCursor,
98
+ cursor: AbstractFileBasedCursor,
94
99
  slice_logger: SliceLogger,
95
100
  logger: logging.Logger,
96
101
  ):
97
102
  """
98
103
  :param stream: The underlying AbstractStream
99
104
  """
100
- # super().__init__(stream, legacy_stream, cursor, slice_logger, logger)
101
105
  self._abstract_stream = stream
102
106
  self._legacy_stream = legacy_stream
103
107
  self._cursor = cursor
@@ -216,7 +220,7 @@ class FileBasedStreamPartition(Partition):
216
220
  sync_mode: SyncMode,
217
221
  cursor_field: Optional[List[str]],
218
222
  state: Optional[MutableMapping[str, Any]],
219
- cursor: FileBasedNoopCursor,
223
+ cursor: "AbstractConcurrentFileBasedCursor",
220
224
  ):
221
225
  self._stream = stream
222
226
  self._slice = _slice
@@ -292,7 +296,7 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
292
296
  sync_mode: SyncMode,
293
297
  cursor_field: Optional[List[str]],
294
298
  state: Optional[MutableMapping[str, Any]],
295
- cursor: FileBasedNoopCursor,
299
+ cursor: "AbstractConcurrentFileBasedCursor",
296
300
  ):
297
301
  self._stream = stream
298
302
  self._message_repository = message_repository
@@ -305,19 +309,17 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
305
309
  pending_partitions = []
306
310
  for _slice in self._stream.stream_slices(sync_mode=self._sync_mode, cursor_field=self._cursor_field, stream_state=self._state):
307
311
  if _slice is not None:
308
- pending_partitions.extend(
309
- [
312
+ for file in _slice.get("files", []):
313
+ pending_partitions.append(
310
314
  FileBasedStreamPartition(
311
315
  self._stream,
312
- {"files": [copy.deepcopy(f)]},
316
+ {"files": [copy.deepcopy(file)]},
313
317
  self._message_repository,
314
318
  self._sync_mode,
315
319
  self._cursor_field,
316
320
  self._state,
317
321
  self._cursor,
318
322
  )
319
- for f in _slice.get("files", [])
320
- ]
321
- )
323
+ )
322
324
  self._cursor.set_pending_partitions(pending_partitions)
323
325
  yield from pending_partitions
@@ -0,0 +1,5 @@
1
+ from .abstract_concurrent_file_based_cursor import AbstractConcurrentFileBasedCursor
2
+ from .file_based_noop_cursor import FileBasedNoopCursor
3
+ from .file_based_concurrent_cursor import FileBasedConcurrentCursor
4
+
5
+ __all__ = ["AbstractConcurrentFileBasedCursor", "FileBasedConcurrentCursor", "FileBasedNoopCursor"]
@@ -1,12 +1,12 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
+
4
5
  import logging
5
- from abc import abstractmethod
6
+ from abc import ABC, abstractmethod
6
7
  from datetime import datetime
7
- from typing import Any, Iterable, MutableMapping
8
+ from typing import TYPE_CHECKING, Any, Iterable, List, MutableMapping
8
9
 
9
- from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
10
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
11
11
  from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
12
12
  from airbyte_cdk.sources.file_based.types import StreamState
@@ -14,27 +14,33 @@ from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
14
14
  from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
15
15
  from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
16
16
 
17
+ if TYPE_CHECKING:
18
+ from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
19
+
20
+
21
+ class AbstractConcurrentFileBasedCursor(Cursor, AbstractFileBasedCursor, ABC):
22
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
23
+ pass
17
24
 
18
- class AbstractFileBasedConcurrentCursor(Cursor, AbstractFileBasedCursor):
19
25
  @property
20
26
  @abstractmethod
21
27
  def state(self) -> MutableMapping[str, Any]:
22
28
  ...
23
29
 
24
30
  @abstractmethod
25
- def add_file(self, file: RemoteFile) -> None:
31
+ def observe(self, record: Record) -> None:
26
32
  ...
27
33
 
28
34
  @abstractmethod
29
- def set_initial_state(self, value: StreamState) -> None:
35
+ def close_partition(self, partition: Partition) -> None:
30
36
  ...
31
37
 
32
38
  @abstractmethod
33
- def get_state(self) -> MutableMapping[str, Any]:
39
+ def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
34
40
  ...
35
41
 
36
42
  @abstractmethod
37
- def get_start_time(self) -> datetime:
43
+ def add_file(self, file: RemoteFile) -> None:
38
44
  ...
39
45
 
40
46
  @abstractmethod
@@ -42,49 +48,21 @@ class AbstractFileBasedConcurrentCursor(Cursor, AbstractFileBasedCursor):
42
48
  ...
43
49
 
44
50
  @abstractmethod
45
- def observe(self, record: Record) -> None:
51
+ def get_state(self) -> MutableMapping[str, Any]:
46
52
  ...
47
53
 
48
54
  @abstractmethod
49
- def close_partition(self, partition: Partition) -> None:
55
+ def set_initial_state(self, value: StreamState) -> None:
50
56
  ...
51
57
 
52
58
  @abstractmethod
53
- def set_pending_partitions(self, partitions: Iterable[Partition]) -> None:
54
- ...
55
-
56
-
57
- class FileBasedNoopCursor(AbstractFileBasedConcurrentCursor):
58
- def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any):
59
- pass
60
-
61
- @property
62
- def state(self) -> MutableMapping[str, Any]:
63
- return {}
64
-
65
- def add_file(self, file: RemoteFile) -> None:
66
- return None
67
-
68
- def set_initial_state(self, value: StreamState) -> None:
69
- return None
70
-
71
- def get_state(self) -> MutableMapping[str, Any]:
72
- return {}
73
-
74
59
  def get_start_time(self) -> datetime:
75
- return datetime.min
76
-
77
- def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]:
78
- return []
79
-
80
- def observe(self, record: Record) -> None:
81
- return None
60
+ ...
82
61
 
83
- def close_partition(self, partition: Partition) -> None:
84
- return None
62
+ @abstractmethod
63
+ def emit_state_message(self) -> None:
64
+ ...
85
65
 
66
+ @abstractmethod
86
67
  def ensure_at_least_one_state_emitted(self) -> None:
87
- return None
88
-
89
- def set_pending_partitions(self, partitions: Iterable[Partition]) -> None:
90
- return None
68
+ ...