airbyte-cdk: airbyte_cdk-0.61.2-py3-none-any.whl → airbyte_cdk-0.62.1-py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (33) hide show
  1. airbyte_cdk/sources/abstract_source.py +14 -33
  2. airbyte_cdk/sources/connector_state_manager.py +16 -4
  3. airbyte_cdk/sources/file_based/file_based_source.py +87 -35
  4. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +3 -0
  5. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +15 -13
  6. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -0
  7. airbyte_cdk/sources/file_based/stream/concurrent/{cursor.py → cursor/abstract_concurrent_file_based_cursor.py} +22 -44
  8. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +279 -0
  9. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_noop_cursor.py +56 -0
  10. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +11 -2
  11. airbyte_cdk/test/mock_http/mocker.py +3 -1
  12. airbyte_cdk/test/mock_http/response.py +9 -1
  13. airbyte_cdk/utils/traced_exception.py +1 -16
  14. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/METADATA +1 -1
  15. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/RECORD +33 -26
  16. unit_tests/sources/file_based/helpers.py +5 -0
  17. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +2860 -0
  18. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +11 -0
  19. unit_tests/sources/file_based/scenarios/scenario_builder.py +6 -2
  20. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  21. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +365 -0
  22. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +462 -0
  23. unit_tests/sources/file_based/test_file_based_scenarios.py +45 -0
  24. unit_tests/sources/file_based/test_scenarios.py +16 -8
  25. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +13 -2
  26. unit_tests/sources/test_abstract_source.py +36 -170
  27. unit_tests/sources/test_connector_state_manager.py +20 -13
  28. unit_tests/sources/test_integration_source.py +8 -25
  29. unit_tests/sources/test_source_read.py +1 -1
  30. unit_tests/test/mock_http/test_mocker.py +3 -1
  31. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/LICENSE.txt +0 -0
  32. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/WHEEL +0 -0
  33. {airbyte_cdk-0.61.2.dist-info → airbyte_cdk-0.62.1.dist-info}/top_level.txt +0 -0
@@ -15,7 +15,6 @@ from airbyte_cdk.models import (
15
15
  ConfiguredAirbyteCatalog,
16
16
  ConfiguredAirbyteStream,
17
17
  Status,
18
- StreamDescriptor,
19
18
  SyncMode,
20
19
  )
21
20
  from airbyte_cdk.models import Type as MessageType
@@ -28,7 +27,6 @@ from airbyte_cdk.sources.streams.http.http import HttpStream
28
27
  from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
29
28
  from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, split_config
30
29
  from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
31
- from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
32
30
  from airbyte_cdk.utils.event_timing import create_timer
33
31
  from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message
34
32
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
@@ -101,7 +99,7 @@ class AbstractSource(Source, ABC):
101
99
  # TODO assert all streams exist in the connector
102
100
  # get the streams once in case the connector needs to make any queries to generate them
103
101
  stream_instances = {s.name: s for s in self.streams(config)}
104
- state_manager = ConnectorStateManager(stream_instance_map=stream_instances, state=state)
102
+ state_manager = ConnectorStateManager(stream_instance_map={s.stream.name: s.stream for s in catalog.streams}, state=state)
105
103
  self._stream_to_instance_map = stream_instances
106
104
 
107
105
  stream_name_to_exception: MutableMapping[str, AirbyteTracedException] = {}
@@ -135,16 +133,11 @@ class AbstractSource(Source, ABC):
135
133
  logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
136
134
  yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.COMPLETE)
137
135
  except AirbyteTracedException as e:
138
- logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
139
- logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
140
136
  yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE)
141
- yield e.as_sanitized_airbyte_message(stream_descriptor=StreamDescriptor(name=configured_stream.stream.name))
142
- stream_name_to_exception[stream_instance.name] = e
143
- if self.stop_sync_on_stream_failure:
144
- logger.info(
145
- f"Stopping sync on error from stream {configured_stream.stream.name} because {self.name} does not support continuing syncs on error."
146
- )
147
- break
137
+ if self.continue_sync_on_stream_failure:
138
+ stream_name_to_exception[stream_instance.name] = e
139
+ else:
140
+ raise e
148
141
  except Exception as e:
149
142
  yield from self._emit_queued_messages()
150
143
  logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
@@ -152,27 +145,15 @@ class AbstractSource(Source, ABC):
152
145
  yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE)
153
146
  display_message = stream_instance.get_error_display_message(e)
154
147
  if display_message:
155
- traced_exception = AirbyteTracedException.from_exception(e, message=display_message)
156
- else:
157
- traced_exception = AirbyteTracedException.from_exception(e)
158
- yield traced_exception.as_sanitized_airbyte_message(
159
- stream_descriptor=StreamDescriptor(name=configured_stream.stream.name)
160
- )
161
- stream_name_to_exception[stream_instance.name] = traced_exception
162
- if self.stop_sync_on_stream_failure:
163
- logger.info(f"{self.name} does not support continuing syncs on error from stream {configured_stream.stream.name}")
164
- break
148
+ raise AirbyteTracedException.from_exception(e, message=display_message) from e
149
+ raise e
165
150
  finally:
166
151
  timer.finish_event()
167
152
  logger.info(f"Finished syncing {configured_stream.stream.name}")
168
153
  logger.info(timer.report())
169
154
 
170
- if len(stream_name_to_exception) > 0:
171
- error_message = self._generate_failed_streams_error_message(stream_name_to_exception)
172
- logger.info(error_message)
173
- # We still raise at least one exception when a stream raises an exception because the platform
174
- # currently relies on a non-zero exit code to determine if a sync attempt has failed
175
- raise AirbyteTracedException(message=error_message)
155
+ if self.continue_sync_on_stream_failure and len(stream_name_to_exception) > 0:
156
+ raise AirbyteTracedException(message=self._generate_failed_streams_error_message(stream_name_to_exception))
176
157
  logger.info(f"Finished syncing {self.name}")
177
158
 
178
159
  @property
@@ -301,17 +282,17 @@ class AbstractSource(Source, ABC):
301
282
  return _default_message_repository
302
283
 
303
284
  @property
304
- def stop_sync_on_stream_failure(self) -> bool:
285
+ def continue_sync_on_stream_failure(self) -> bool:
305
286
  """
306
287
  WARNING: This function is in-development which means it is subject to change. Use at your own risk.
307
288
 
308
- By default, when a source encounters an exception while syncing a stream, it will emit an error trace message and then
309
- continue syncing the next stream. This can be overwridden on a per-source basis so that the source will stop the sync
310
- on the first error seen and emit a single error trace message for that stream.
289
+ By default, a source should raise an exception and stop the sync when it encounters an error while syncing a stream. This
290
+ method can be overridden on a per-source basis so that a source will continue syncing other streams even if an
291
+ exception is raised for a stream.
311
292
  """
312
293
  return False
313
294
 
314
295
  @staticmethod
315
296
  def _generate_failed_streams_error_message(stream_failures: Mapping[str, AirbyteTracedException]) -> str:
316
- failures = ", ".join([f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exception in stream_failures.items()])
297
+ failures = ", ".join([f"{stream}: {exception.__repr__()}" for stream, exception in stream_failures.items()])
317
298
  return f"During the sync, the following streams did not sync successfully: {failures}"
@@ -5,7 +5,15 @@
5
5
  import copy
6
6
  from typing import Any, List, Mapping, MutableMapping, Optional, Tuple, Union
7
7
 
8
- from airbyte_cdk.models import AirbyteMessage, AirbyteStateBlob, AirbyteStateMessage, AirbyteStateType, AirbyteStreamState, StreamDescriptor
8
+ from airbyte_cdk.models import (
9
+ AirbyteMessage,
10
+ AirbyteStateBlob,
11
+ AirbyteStateMessage,
12
+ AirbyteStateType,
13
+ AirbyteStream,
14
+ AirbyteStreamState,
15
+ StreamDescriptor,
16
+ )
9
17
  from airbyte_cdk.models import Type as MessageType
10
18
  from airbyte_cdk.sources.streams import Stream
11
19
  from pydantic import Extra
@@ -29,7 +37,9 @@ class ConnectorStateManager:
29
37
  """
30
38
 
31
39
  def __init__(
32
- self, stream_instance_map: Mapping[str, Stream], state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None
40
+ self,
41
+ stream_instance_map: Mapping[str, Union[Stream, AirbyteStream]],
42
+ state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
33
43
  ):
34
44
  shared_state, per_stream_states = self._extract_from_state_message(state, stream_instance_map)
35
45
 
@@ -97,7 +107,9 @@ class ConnectorStateManager:
97
107
 
98
108
  @classmethod
99
109
  def _extract_from_state_message(
100
- cls, state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]], stream_instance_map: Mapping[str, Stream]
110
+ cls,
111
+ state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]],
112
+ stream_instance_map: Mapping[str, Union[Stream, AirbyteStream]],
101
113
  ) -> Tuple[Optional[AirbyteStateBlob], MutableMapping[HashableStreamDescriptor, Optional[AirbyteStateBlob]]]:
102
114
  """
103
115
  Takes an incoming list of state messages or the legacy state format and extracts state attributes according to type
@@ -149,7 +161,7 @@ class ConnectorStateManager:
149
161
 
150
162
  @staticmethod
151
163
  def _create_descriptor_to_stream_state_mapping(
152
- state: MutableMapping[str, Any], stream_to_instance_map: Mapping[str, Stream]
164
+ state: MutableMapping[str, Any], stream_to_instance_map: Mapping[str, Union[Stream, AirbyteStream]]
153
165
  ) -> MutableMapping[HashableStreamDescriptor, Optional[AirbyteStateBlob]]:
154
166
  """
155
167
  Takes incoming state received in the legacy format and transforms it into a mapping of StreamDescriptor to AirbyteStreamState
@@ -12,6 +12,7 @@ from airbyte_cdk.logger import AirbyteLogFormatter, init_logger
12
12
  from airbyte_cdk.models import (
13
13
  AirbyteMessage,
14
14
  AirbyteStateMessage,
15
+ AirbyteStream,
15
16
  ConfiguredAirbyteCatalog,
16
17
  ConnectorSpecification,
17
18
  FailureType,
@@ -20,6 +21,7 @@ from airbyte_cdk.models import (
20
21
  )
21
22
  from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
22
23
  from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
24
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
23
25
  from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy, DefaultFileBasedAvailabilityStrategy
24
26
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
25
27
  from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
@@ -31,12 +33,15 @@ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeP
31
33
  from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
32
34
  from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
33
35
  from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
34
- from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedNoopCursor
36
+ from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
37
+ AbstractConcurrentFileBasedCursor,
38
+ FileBasedConcurrentCursor,
39
+ FileBasedNoopCursor,
40
+ )
35
41
  from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
36
- from airbyte_cdk.sources.file_based.stream.cursor.default_file_based_cursor import DefaultFileBasedCursor
37
42
  from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository
38
- from airbyte_cdk.sources.source import TState
39
43
  from airbyte_cdk.sources.streams import Stream
44
+ from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
40
45
  from airbyte_cdk.utils.analytics_message import create_analytics_message
41
46
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
42
47
  from pydantic.error_wrappers import ValidationError
@@ -56,12 +61,12 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
56
61
  spec_class: Type[AbstractFileBasedSpec],
57
62
  catalog: Optional[ConfiguredAirbyteCatalog],
58
63
  config: Optional[Mapping[str, Any]],
59
- state: Optional[TState],
64
+ state: Optional[MutableMapping[str, Any]],
60
65
  availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None,
61
66
  discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
62
67
  parsers: Mapping[Type[Any], FileTypeParser] = default_parsers,
63
68
  validation_policies: Mapping[ValidationPolicy, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
64
- cursor_cls: Type[AbstractFileBasedCursor] = DefaultFileBasedCursor,
69
+ cursor_cls: Type[Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]] = FileBasedConcurrentCursor,
65
70
  ):
66
71
  self.stream_reader = stream_reader
67
72
  self.spec_class = spec_class
@@ -137,52 +142,99 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
137
142
  """
138
143
  Return a list of this source's streams.
139
144
  """
140
- file_based_streams = self._get_file_based_streams(config)
141
-
142
- configured_streams: List[Stream] = []
143
-
144
- for stream in file_based_streams:
145
- sync_mode = self._get_sync_mode_from_catalog(stream)
146
- if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
147
- configured_streams.append(
148
- FileBasedStreamFacade.create_from_stream(stream, self, self.logger, None, FileBasedNoopCursor(stream.config))
149
- )
150
- else:
151
- configured_streams.append(stream)
152
145
 
153
- return configured_streams
146
+ if self.catalog:
147
+ state_manager = ConnectorStateManager(
148
+ stream_instance_map={s.stream.name: s.stream for s in self.catalog.streams},
149
+ state=self.state,
150
+ )
151
+ else:
152
+ # During `check` operations we don't have a catalog so cannot create a state manager.
153
+ # Since the state manager is only required for incremental syncs, this is fine.
154
+ state_manager = None
154
155
 
155
- def _get_file_based_streams(self, config: Mapping[str, Any]) -> List[AbstractFileBasedStream]:
156
156
  try:
157
157
  parsed_config = self._get_parsed_config(config)
158
158
  self.stream_reader.config = parsed_config
159
- streams: List[AbstractFileBasedStream] = []
159
+ streams: List[Stream] = []
160
160
  for stream_config in parsed_config.streams:
161
+ # Like state_manager, `catalog_stream` may be None during `check`
162
+ catalog_stream = self._get_stream_from_catalog(stream_config)
163
+ stream_state = (
164
+ state_manager.get_stream_state(catalog_stream.name, catalog_stream.namespace)
165
+ if (state_manager and catalog_stream)
166
+ else None
167
+ )
161
168
  self._validate_input_schema(stream_config)
162
- streams.append(
163
- DefaultFileBasedStream(
164
- config=stream_config,
165
- catalog_schema=self.stream_schemas.get(stream_config.name),
166
- stream_reader=self.stream_reader,
167
- availability_strategy=self.availability_strategy,
168
- discovery_policy=self.discovery_policy,
169
- parsers=self.parsers,
170
- validation_policy=self._validate_and_get_validation_policy(stream_config),
171
- cursor=self.cursor_cls(stream_config),
172
- errors_collector=self.errors_collector,
169
+
170
+ sync_mode = self._get_sync_mode_from_catalog(stream_config.name)
171
+
172
+ if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
173
+ cursor = FileBasedNoopCursor(stream_config)
174
+ stream = FileBasedStreamFacade.create_from_stream(
175
+ self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
173
176
  )
174
- )
177
+
178
+ elif (
179
+ sync_mode == SyncMode.incremental
180
+ and issubclass(self.cursor_cls, AbstractConcurrentFileBasedCursor)
181
+ and hasattr(self, "_concurrency_level")
182
+ and self._concurrency_level is not None
183
+ ):
184
+ assert (
185
+ state_manager is not None
186
+ ), "No ConnectorStateManager was created, but it is required for incremental syncs. This is unexpected. Please contact Support."
187
+
188
+ cursor = self.cursor_cls(
189
+ stream_config,
190
+ stream_config.name,
191
+ None,
192
+ stream_state,
193
+ self.message_repository,
194
+ state_manager,
195
+ CursorField(DefaultFileBasedStream.ab_last_mod_col),
196
+ )
197
+ stream = FileBasedStreamFacade.create_from_stream(
198
+ self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
199
+ )
200
+ else:
201
+ cursor = self.cursor_cls(stream_config)
202
+ stream = self._make_default_stream(stream_config, cursor)
203
+
204
+ streams.append(stream)
175
205
  return streams
176
206
 
177
207
  except ValidationError as exc:
178
208
  raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc
179
209
 
180
- def _get_sync_mode_from_catalog(self, stream: Stream) -> Optional[SyncMode]:
210
+ def _make_default_stream(
211
+ self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor]
212
+ ) -> AbstractFileBasedStream:
213
+ return DefaultFileBasedStream(
214
+ config=stream_config,
215
+ catalog_schema=self.stream_schemas.get(stream_config.name),
216
+ stream_reader=self.stream_reader,
217
+ availability_strategy=self.availability_strategy,
218
+ discovery_policy=self.discovery_policy,
219
+ parsers=self.parsers,
220
+ validation_policy=self._validate_and_get_validation_policy(stream_config),
221
+ errors_collector=self.errors_collector,
222
+ cursor=cursor,
223
+ )
224
+
225
+ def _get_stream_from_catalog(self, stream_config: FileBasedStreamConfig) -> Optional[AirbyteStream]:
226
+ if self.catalog:
227
+ for stream in self.catalog.streams or []:
228
+ if stream.stream.name == stream_config.name:
229
+ return stream.stream
230
+ return None
231
+
232
+ def _get_sync_mode_from_catalog(self, stream_name: str) -> Optional[SyncMode]:
181
233
  if self.catalog:
182
234
  for catalog_stream in self.catalog.streams:
183
- if stream.name == catalog_stream.stream.name:
235
+ if stream_name == catalog_stream.stream.name:
184
236
  return catalog_stream.sync_mode
185
- self.logger.warning(f"No sync mode was found for {stream.name}.")
237
+ self.logger.warning(f"No sync mode was found for {stream_name}.")
186
238
  return None
187
239
 
188
240
  def read(
@@ -15,6 +15,7 @@ from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFile
15
15
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
16
16
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
17
17
  from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
18
+ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
18
19
  from airbyte_cdk.sources.file_based.types import StreamSlice
19
20
  from airbyte_cdk.sources.streams import Stream
20
21
 
@@ -45,6 +46,7 @@ class AbstractFileBasedStream(Stream):
45
46
  parsers: Dict[Type[Any], FileTypeParser],
46
47
  validation_policy: AbstractSchemaValidationPolicy,
47
48
  errors_collector: FileBasedErrorsCollector,
49
+ cursor: AbstractFileBasedCursor,
48
50
  ):
49
51
  super().__init__()
50
52
  self.config = config
@@ -55,6 +57,7 @@ class AbstractFileBasedStream(Stream):
55
57
  self._availability_strategy = availability_strategy
56
58
  self._parsers = parsers
57
59
  self.errors_collector = errors_collector
60
+ self._cursor = cursor
58
61
 
59
62
  @property
60
63
  @abstractmethod
@@ -5,7 +5,7 @@
5
5
  import copy
6
6
  import logging
7
7
  from functools import lru_cache
8
- from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
8
+ from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union
9
9
 
10
10
  from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, SyncMode, Type
11
11
  from airbyte_cdk.sources import AbstractSource
@@ -19,6 +19,7 @@ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeP
19
19
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
20
20
  from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
21
21
  from airbyte_cdk.sources.file_based.stream.concurrent.cursor import FileBasedNoopCursor
22
+ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
22
23
  from airbyte_cdk.sources.file_based.types import StreamSlice
23
24
  from airbyte_cdk.sources.message import MessageRepository
24
25
  from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
@@ -33,6 +34,9 @@ from airbyte_cdk.sources.utils.schema_helpers import InternalConfig
33
34
  from airbyte_cdk.sources.utils.slice_logger import SliceLogger
34
35
  from deprecated.classic import deprecated
35
36
 
37
+ if TYPE_CHECKING:
38
+ from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
39
+
36
40
  """
37
41
  This module contains adapters to help enable concurrency on File-based Stream objects without needing to migrate to AbstractStream
38
42
  """
@@ -47,13 +51,14 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
47
51
  source: AbstractSource,
48
52
  logger: logging.Logger,
49
53
  state: Optional[MutableMapping[str, Any]],
50
- cursor: FileBasedNoopCursor,
54
+ cursor: "AbstractConcurrentFileBasedCursor",
51
55
  ) -> "FileBasedStreamFacade":
52
56
  """
53
57
  Create a ConcurrentStream from a FileBasedStream object.
54
58
  """
55
59
  pk = get_primary_key_from_stream(stream.primary_key)
56
60
  cursor_field = get_cursor_field_from_stream(stream)
61
+ stream._cursor = cursor
57
62
 
58
63
  if not source.message_repository:
59
64
  raise ValueError(
@@ -62,7 +67,7 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
62
67
 
63
68
  message_repository = source.message_repository
64
69
  return FileBasedStreamFacade(
65
- DefaultStream( # type: ignore
70
+ DefaultStream(
66
71
  partition_generator=FileBasedStreamPartitionGenerator(
67
72
  stream,
68
73
  message_repository,
@@ -90,14 +95,13 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
90
95
  self,
91
96
  stream: DefaultStream,
92
97
  legacy_stream: AbstractFileBasedStream,
93
- cursor: FileBasedNoopCursor,
98
+ cursor: AbstractFileBasedCursor,
94
99
  slice_logger: SliceLogger,
95
100
  logger: logging.Logger,
96
101
  ):
97
102
  """
98
103
  :param stream: The underlying AbstractStream
99
104
  """
100
- # super().__init__(stream, legacy_stream, cursor, slice_logger, logger)
101
105
  self._abstract_stream = stream
102
106
  self._legacy_stream = legacy_stream
103
107
  self._cursor = cursor
@@ -216,7 +220,7 @@ class FileBasedStreamPartition(Partition):
216
220
  sync_mode: SyncMode,
217
221
  cursor_field: Optional[List[str]],
218
222
  state: Optional[MutableMapping[str, Any]],
219
- cursor: FileBasedNoopCursor,
223
+ cursor: "AbstractConcurrentFileBasedCursor",
220
224
  ):
221
225
  self._stream = stream
222
226
  self._slice = _slice
@@ -292,7 +296,7 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
292
296
  sync_mode: SyncMode,
293
297
  cursor_field: Optional[List[str]],
294
298
  state: Optional[MutableMapping[str, Any]],
295
- cursor: FileBasedNoopCursor,
299
+ cursor: "AbstractConcurrentFileBasedCursor",
296
300
  ):
297
301
  self._stream = stream
298
302
  self._message_repository = message_repository
@@ -305,19 +309,17 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
305
309
  pending_partitions = []
306
310
  for _slice in self._stream.stream_slices(sync_mode=self._sync_mode, cursor_field=self._cursor_field, stream_state=self._state):
307
311
  if _slice is not None:
308
- pending_partitions.extend(
309
- [
312
+ for file in _slice.get("files", []):
313
+ pending_partitions.append(
310
314
  FileBasedStreamPartition(
311
315
  self._stream,
312
- {"files": [copy.deepcopy(f)]},
316
+ {"files": [copy.deepcopy(file)]},
313
317
  self._message_repository,
314
318
  self._sync_mode,
315
319
  self._cursor_field,
316
320
  self._state,
317
321
  self._cursor,
318
322
  )
319
- for f in _slice.get("files", [])
320
- ]
321
- )
323
+ )
322
324
  self._cursor.set_pending_partitions(pending_partitions)
323
325
  yield from pending_partitions
@@ -0,0 +1,5 @@
1
+ from .abstract_concurrent_file_based_cursor import AbstractConcurrentFileBasedCursor
2
+ from .file_based_noop_cursor import FileBasedNoopCursor
3
+ from .file_based_concurrent_cursor import FileBasedConcurrentCursor
4
+
5
+ __all__ = ["AbstractConcurrentFileBasedCursor", "FileBasedConcurrentCursor", "FileBasedNoopCursor"]
@@ -1,12 +1,12 @@
1
1
  #
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
+
4
5
  import logging
5
- from abc import abstractmethod
6
+ from abc import ABC, abstractmethod
6
7
  from datetime import datetime
7
- from typing import Any, Iterable, MutableMapping
8
+ from typing import TYPE_CHECKING, Any, Iterable, List, MutableMapping
8
9
 
9
- from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
10
10
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
11
11
  from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
12
12
  from airbyte_cdk.sources.file_based.types import StreamState
@@ -14,27 +14,33 @@ from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
14
14
  from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
15
15
  from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
16
16
 
17
+ if TYPE_CHECKING:
18
+ from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
19
+
20
+
21
+ class AbstractConcurrentFileBasedCursor(Cursor, AbstractFileBasedCursor, ABC):
22
+ def __init__(self, *args: Any, **kwargs: Any) -> None:
23
+ pass
17
24
 
18
- class AbstractFileBasedConcurrentCursor(Cursor, AbstractFileBasedCursor):
19
25
  @property
20
26
  @abstractmethod
21
27
  def state(self) -> MutableMapping[str, Any]:
22
28
  ...
23
29
 
24
30
  @abstractmethod
25
- def add_file(self, file: RemoteFile) -> None:
31
+ def observe(self, record: Record) -> None:
26
32
  ...
27
33
 
28
34
  @abstractmethod
29
- def set_initial_state(self, value: StreamState) -> None:
35
+ def close_partition(self, partition: Partition) -> None:
30
36
  ...
31
37
 
32
38
  @abstractmethod
33
- def get_state(self) -> MutableMapping[str, Any]:
39
+ def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
34
40
  ...
35
41
 
36
42
  @abstractmethod
37
- def get_start_time(self) -> datetime:
43
+ def add_file(self, file: RemoteFile) -> None:
38
44
  ...
39
45
 
40
46
  @abstractmethod
@@ -42,49 +48,21 @@ class AbstractFileBasedConcurrentCursor(Cursor, AbstractFileBasedCursor):
42
48
  ...
43
49
 
44
50
  @abstractmethod
45
- def observe(self, record: Record) -> None:
51
+ def get_state(self) -> MutableMapping[str, Any]:
46
52
  ...
47
53
 
48
54
  @abstractmethod
49
- def close_partition(self, partition: Partition) -> None:
55
+ def set_initial_state(self, value: StreamState) -> None:
50
56
  ...
51
57
 
52
58
  @abstractmethod
53
- def set_pending_partitions(self, partitions: Iterable[Partition]) -> None:
54
- ...
55
-
56
-
57
- class FileBasedNoopCursor(AbstractFileBasedConcurrentCursor):
58
- def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any):
59
- pass
60
-
61
- @property
62
- def state(self) -> MutableMapping[str, Any]:
63
- return {}
64
-
65
- def add_file(self, file: RemoteFile) -> None:
66
- return None
67
-
68
- def set_initial_state(self, value: StreamState) -> None:
69
- return None
70
-
71
- def get_state(self) -> MutableMapping[str, Any]:
72
- return {}
73
-
74
59
  def get_start_time(self) -> datetime:
75
- return datetime.min
76
-
77
- def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]:
78
- return []
79
-
80
- def observe(self, record: Record) -> None:
81
- return None
60
+ ...
82
61
 
83
- def close_partition(self, partition: Partition) -> None:
84
- return None
62
+ @abstractmethod
63
+ def emit_state_message(self) -> None:
64
+ ...
85
65
 
66
+ @abstractmethod
86
67
  def ensure_at_least_one_state_emitted(self) -> None:
87
- return None
88
-
89
- def set_pending_partitions(self, partitions: Iterable[Partition]) -> None:
90
- return None
68
+ ...