airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. airbyte_cdk/__init__.py +17 -2
  2. airbyte_cdk/config_observation.py +10 -3
  3. airbyte_cdk/connector.py +19 -9
  4. airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
  5. airbyte_cdk/connector_builder/main.py +26 -6
  6. airbyte_cdk/connector_builder/message_grouper.py +95 -25
  7. airbyte_cdk/destinations/destination.py +47 -14
  8. airbyte_cdk/destinations/vector_db_based/config.py +36 -14
  9. airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
  10. airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
  11. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  12. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  13. airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
  14. airbyte_cdk/entrypoint.py +82 -26
  15. airbyte_cdk/exception_handler.py +13 -3
  16. airbyte_cdk/logger.py +10 -2
  17. airbyte_cdk/models/airbyte_protocol.py +11 -5
  18. airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
  19. airbyte_cdk/models/well_known_types.py +1 -1
  20. airbyte_cdk/sources/abstract_source.py +63 -17
  21. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
  22. airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
  23. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
  24. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
  25. airbyte_cdk/sources/connector_state_manager.py +32 -10
  26. airbyte_cdk/sources/declarative/async_job/job.py +3 -1
  27. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
  28. airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
  29. airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
  30. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  31. airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
  32. airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
  33. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
  34. airbyte_cdk/sources/declarative/auth/token.py +25 -8
  35. airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
  36. airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
  37. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
  38. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
  39. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
  40. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +43 -0
  41. airbyte_cdk/sources/declarative/declarative_source.py +3 -1
  42. airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
  43. airbyte_cdk/sources/declarative/decoders/__init__.py +2 -2
  44. airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
  45. airbyte_cdk/sources/declarative/decoders/json_decoder.py +48 -13
  46. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
  47. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
  48. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
  49. airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
  50. airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
  51. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
  52. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
  53. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
  54. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
  55. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
  56. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
  57. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
  58. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
  59. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
  60. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
  61. airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
  62. airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
  63. airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
  64. airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
  65. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +14 -5
  66. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +697 -678
  67. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
  68. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
  69. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +802 -232
  70. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
  71. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
  72. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
  73. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
  74. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
  75. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
  76. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
  77. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
  78. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
  79. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
  80. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
  81. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
  82. airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
  83. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
  84. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
  85. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
  86. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
  87. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
  88. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
  89. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
  90. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
  91. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
  92. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
  93. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
  94. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
  95. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
  96. airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
  97. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
  98. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
  99. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
  100. airbyte_cdk/sources/declarative/spec/spec.py +8 -2
  101. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
  102. airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
  103. airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
  104. airbyte_cdk/sources/declarative/types.py +8 -1
  105. airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
  106. airbyte_cdk/sources/embedded/base_integration.py +14 -4
  107. airbyte_cdk/sources/embedded/catalog.py +16 -4
  108. airbyte_cdk/sources/embedded/runner.py +19 -3
  109. airbyte_cdk/sources/embedded/tools.py +3 -1
  110. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
  111. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
  112. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
  113. airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
  114. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
  115. airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
  116. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  117. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  118. airbyte_cdk/sources/file_based/exceptions.py +13 -15
  119. airbyte_cdk/sources/file_based/file_based_source.py +82 -24
  120. airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
  121. airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
  122. airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
  123. airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
  124. airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
  125. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  126. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
  127. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
  128. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
  129. airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
  130. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  131. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  132. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
  133. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
  134. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
  135. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
  136. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
  137. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  138. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
  139. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
  140. airbyte_cdk/sources/http_logger.py +5 -1
  141. airbyte_cdk/sources/message/repository.py +18 -4
  142. airbyte_cdk/sources/source.py +17 -7
  143. airbyte_cdk/sources/streams/availability_strategy.py +9 -3
  144. airbyte_cdk/sources/streams/call_rate.py +63 -19
  145. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
  146. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
  147. airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
  148. airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
  149. airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
  150. airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
  151. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
  152. airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
  153. airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
  154. airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
  155. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
  156. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
  157. airbyte_cdk/sources/streams/core.py +77 -22
  158. airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
  159. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
  160. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
  161. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
  162. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
  163. airbyte_cdk/sources/streams/http/exceptions.py +2 -2
  164. airbyte_cdk/sources/streams/http/http.py +133 -33
  165. airbyte_cdk/sources/streams/http/http_client.py +91 -29
  166. airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
  167. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
  168. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
  169. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  170. airbyte_cdk/sources/types.py +5 -1
  171. airbyte_cdk/sources/utils/record_helper.py +12 -3
  172. airbyte_cdk/sources/utils/schema_helpers.py +9 -3
  173. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  174. airbyte_cdk/sources/utils/transform.py +24 -9
  175. airbyte_cdk/sql/exceptions.py +19 -6
  176. airbyte_cdk/sql/secrets.py +3 -1
  177. airbyte_cdk/sql/shared/catalog_providers.py +13 -4
  178. airbyte_cdk/sql/shared/sql_processor.py +44 -14
  179. airbyte_cdk/test/catalog_builder.py +19 -8
  180. airbyte_cdk/test/entrypoint_wrapper.py +27 -8
  181. airbyte_cdk/test/mock_http/mocker.py +41 -11
  182. airbyte_cdk/test/mock_http/request.py +9 -3
  183. airbyte_cdk/test/mock_http/response.py +3 -1
  184. airbyte_cdk/test/mock_http/response_builder.py +29 -7
  185. airbyte_cdk/test/state_builder.py +10 -2
  186. airbyte_cdk/test/utils/data.py +6 -2
  187. airbyte_cdk/test/utils/http_mocking.py +3 -1
  188. airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
  189. airbyte_cdk/utils/analytics_message.py +10 -2
  190. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  191. airbyte_cdk/utils/mapping_helpers.py +3 -1
  192. airbyte_cdk/utils/message_utils.py +11 -4
  193. airbyte_cdk/utils/print_buffer.py +6 -1
  194. airbyte_cdk/utils/schema_inferrer.py +30 -9
  195. airbyte_cdk/utils/spec_schema_transformations.py +3 -1
  196. airbyte_cdk/utils/traced_exception.py +35 -9
  197. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/METADATA +8 -7
  198. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/RECORD +200 -200
  199. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/LICENSE.txt +0 -0
  200. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/WHEEL +0 -0
@@ -31,15 +31,27 @@ def to_configured_stream(
31
31
  primary_key: Optional[List[List[str]]] = None,
32
32
  ) -> ConfiguredAirbyteStream:
33
33
  return ConfiguredAirbyteStream(
34
- stream=stream, sync_mode=sync_mode, destination_sync_mode=destination_sync_mode, cursor_field=cursor_field, primary_key=primary_key
34
+ stream=stream,
35
+ sync_mode=sync_mode,
36
+ destination_sync_mode=destination_sync_mode,
37
+ cursor_field=cursor_field,
38
+ primary_key=primary_key,
35
39
  )
36
40
 
37
41
 
38
- def to_configured_catalog(configured_streams: List[ConfiguredAirbyteStream]) -> ConfiguredAirbyteCatalog:
42
+ def to_configured_catalog(
43
+ configured_streams: List[ConfiguredAirbyteStream],
44
+ ) -> ConfiguredAirbyteCatalog:
39
45
  return ConfiguredAirbyteCatalog(streams=configured_streams)
40
46
 
41
47
 
42
- def create_configured_catalog(stream: AirbyteStream, sync_mode: SyncMode = SyncMode.full_refresh) -> ConfiguredAirbyteCatalog:
43
- configured_streams = [to_configured_stream(stream, sync_mode=sync_mode, primary_key=stream.source_defined_primary_key)]
48
+ def create_configured_catalog(
49
+ stream: AirbyteStream, sync_mode: SyncMode = SyncMode.full_refresh
50
+ ) -> ConfiguredAirbyteCatalog:
51
+ configured_streams = [
52
+ to_configured_stream(
53
+ stream, sync_mode=sync_mode, primary_key=stream.source_defined_primary_key
54
+ )
55
+ ]
44
56
 
45
57
  return to_configured_catalog(configured_streams)
@@ -8,7 +8,13 @@ from abc import ABC, abstractmethod
8
8
  from typing import Generic, Iterable, Optional
9
9
 
10
10
  from airbyte_cdk.connector import TConfig
11
- from airbyte_cdk.models import AirbyteCatalog, AirbyteMessage, AirbyteStateMessage, ConfiguredAirbyteCatalog, ConnectorSpecification
11
+ from airbyte_cdk.models import (
12
+ AirbyteCatalog,
13
+ AirbyteMessage,
14
+ AirbyteStateMessage,
15
+ ConfiguredAirbyteCatalog,
16
+ ConnectorSpecification,
17
+ )
12
18
  from airbyte_cdk.sources.source import Source
13
19
 
14
20
 
@@ -22,7 +28,12 @@ class SourceRunner(ABC, Generic[TConfig]):
22
28
  pass
23
29
 
24
30
  @abstractmethod
25
- def read(self, config: TConfig, catalog: ConfiguredAirbyteCatalog, state: Optional[AirbyteStateMessage]) -> Iterable[AirbyteMessage]:
31
+ def read(
32
+ self,
33
+ config: TConfig,
34
+ catalog: ConfiguredAirbyteCatalog,
35
+ state: Optional[AirbyteStateMessage],
36
+ ) -> Iterable[AirbyteMessage]:
26
37
  pass
27
38
 
28
39
 
@@ -37,5 +48,10 @@ class CDKRunner(SourceRunner[TConfig]):
37
48
  def discover(self, config: TConfig) -> AirbyteCatalog:
38
49
  return self._source.discover(self._logger, config)
39
50
 
40
- def read(self, config: TConfig, catalog: ConfiguredAirbyteCatalog, state: Optional[AirbyteStateMessage]) -> Iterable[AirbyteMessage]:
51
+ def read(
52
+ self,
53
+ config: TConfig,
54
+ catalog: ConfiguredAirbyteCatalog,
55
+ state: Optional[AirbyteStateMessage],
56
+ ) -> Iterable[AirbyteMessage]:
41
57
  return self._source.read(self._logger, config, catalog, state=[state] if state else [])
@@ -8,7 +8,9 @@ import dpath
8
8
  from airbyte_cdk.models import AirbyteStream
9
9
 
10
10
 
11
- def get_first(iterable: Iterable[Any], predicate: Callable[[Any], bool] = lambda m: True) -> Optional[Any]:
11
+ def get_first(
12
+ iterable: Iterable[Any], predicate: Callable[[Any], bool] = lambda m: True
13
+ ) -> Optional[Any]:
12
14
  return next(filter(predicate, iterable), None)
13
15
 
14
16
 
@@ -22,7 +22,9 @@ if TYPE_CHECKING:
22
22
 
23
23
  class AbstractFileBasedAvailabilityStrategy(AvailabilityStrategy):
24
24
  @abstractmethod
25
- def check_availability(self, stream: Stream, logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]:
25
+ def check_availability(
26
+ self, stream: Stream, logger: logging.Logger, _: Optional[Source]
27
+ ) -> Tuple[bool, Optional[str]]:
26
28
  """
27
29
  Perform a connection check for the stream.
28
30
 
@@ -48,10 +50,16 @@ class AbstractFileBasedAvailabilityStrategyWrapper(AbstractAvailabilityStrategy)
48
50
  self.stream = stream
49
51
 
50
52
  def check_availability(self, logger: logging.Logger) -> StreamAvailability:
51
- is_available, reason = self.stream.availability_strategy.check_availability(self.stream, logger, None)
53
+ is_available, reason = self.stream.availability_strategy.check_availability(
54
+ self.stream, logger, None
55
+ )
52
56
  if is_available:
53
57
  return StreamAvailable()
54
58
  return StreamUnavailable(reason or "")
55
59
 
56
- def check_availability_and_parsability(self, logger: logging.Logger) -> Tuple[bool, Optional[str]]:
57
- return self.stream.availability_strategy.check_availability_and_parsability(self.stream, logger, None)
60
+ def check_availability_and_parsability(
61
+ self, logger: logging.Logger
62
+ ) -> Tuple[bool, Optional[str]]:
63
+ return self.stream.availability_strategy.check_availability_and_parsability(
64
+ self.stream, logger, None
65
+ )
@@ -8,8 +8,14 @@ from typing import TYPE_CHECKING, Optional, Tuple
8
8
 
9
9
  from airbyte_cdk import AirbyteTracedException
10
10
  from airbyte_cdk.sources import Source
11
- from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
12
- from airbyte_cdk.sources.file_based.exceptions import CheckAvailabilityError, CustomFileBasedException, FileBasedSourceError
11
+ from airbyte_cdk.sources.file_based.availability_strategy import (
12
+ AbstractFileBasedAvailabilityStrategy,
13
+ )
14
+ from airbyte_cdk.sources.file_based.exceptions import (
15
+ CheckAvailabilityError,
16
+ CustomFileBasedException,
17
+ FileBasedSourceError,
18
+ )
13
19
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
14
20
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
15
21
  from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema
@@ -22,7 +28,9 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
22
28
  def __init__(self, stream_reader: AbstractFileBasedStreamReader):
23
29
  self.stream_reader = stream_reader
24
30
 
25
- def check_availability(self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]: # type: ignore[override]
31
+ def check_availability(
32
+ self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]
33
+ ) -> Tuple[bool, Optional[str]]: # type: ignore[override]
26
34
  """
27
35
  Perform a connection check for the stream (verify that we can list files from the stream).
28
36
 
@@ -87,15 +95,25 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
87
95
  except CustomFileBasedException as exc:
88
96
  raise CheckAvailabilityError(str(exc), stream=stream.name) from exc
89
97
  except Exception as exc:
90
- raise CheckAvailabilityError(FileBasedSourceError.ERROR_LISTING_FILES, stream=stream.name) from exc
98
+ raise CheckAvailabilityError(
99
+ FileBasedSourceError.ERROR_LISTING_FILES, stream=stream.name
100
+ ) from exc
91
101
 
92
102
  return file
93
103
 
94
- def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFile, logger: logging.Logger) -> None:
104
+ def _check_parse_record(
105
+ self, stream: "AbstractFileBasedStream", file: RemoteFile, logger: logging.Logger
106
+ ) -> None:
95
107
  parser = stream.get_parser()
96
108
 
97
109
  try:
98
- record = next(iter(parser.parse_records(stream.config, file, self.stream_reader, logger, discovered_schema=None)))
110
+ record = next(
111
+ iter(
112
+ parser.parse_records(
113
+ stream.config, file, self.stream_reader, logger, discovered_schema=None
114
+ )
115
+ )
116
+ )
99
117
  except StopIteration:
100
118
  # The file is empty. We've verified that we can open it, so will
101
119
  # consider the connection check successful even though it means
@@ -104,7 +122,9 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
104
122
  except AirbyteTracedException as ate:
105
123
  raise ate
106
124
  except Exception as exc:
107
- raise CheckAvailabilityError(FileBasedSourceError.ERROR_READING_FILE, stream=stream.name, file=file.uri) from exc
125
+ raise CheckAvailabilityError(
126
+ FileBasedSourceError.ERROR_READING_FILE, stream=stream.name, file=file.uri
127
+ ) from exc
108
128
 
109
129
  schema = stream.catalog_schema or stream.config.input_schema
110
130
  if schema and stream.validation_policy.validate_schema_before_sync:
@@ -107,10 +107,16 @@ class AbstractFileBasedSpec(BaseModel):
107
107
 
108
108
  properties_to_change = ["validation_policy"]
109
109
  for property_to_change in properties_to_change:
110
- property_object = schema["properties"]["streams"]["items"]["properties"][property_to_change]
110
+ property_object = schema["properties"]["streams"]["items"]["properties"][
111
+ property_to_change
112
+ ]
111
113
  if "anyOf" in property_object:
112
- schema["properties"]["streams"]["items"]["properties"][property_to_change]["type"] = "object"
113
- schema["properties"]["streams"]["items"]["properties"][property_to_change]["oneOf"] = property_object.pop("anyOf")
114
+ schema["properties"]["streams"]["items"]["properties"][property_to_change][
115
+ "type"
116
+ ] = "object"
117
+ schema["properties"]["streams"]["items"]["properties"][property_to_change][
118
+ "oneOf"
119
+ ] = property_object.pop("anyOf")
114
120
  AbstractFileBasedSpec.move_enum_to_root(property_object)
115
121
 
116
122
  csv_format_schemas = list(
@@ -121,9 +127,9 @@ class AbstractFileBasedSpec(BaseModel):
121
127
  )
122
128
  if len(csv_format_schemas) != 1:
123
129
  raise ValueError(f"Expecting only one CSV format but got {csv_format_schemas}")
124
- csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0]["properties"]["header_definition"].pop(
125
- "anyOf", []
126
- )
130
+ csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0][
131
+ "properties"
132
+ ]["header_definition"].pop("anyOf", [])
127
133
  csv_format_schemas[0]["properties"]["header_definition"]["type"] = "object"
128
134
  return schema
129
135
 
@@ -70,7 +70,9 @@ class CsvHeaderUserProvided(BaseModel):
70
70
  @validator("column_names")
71
71
  def validate_column_names(cls, v: List[str]) -> List[str]:
72
72
  if not v:
73
- raise ValueError("At least one column name needs to be provided when using user provided headers")
73
+ raise ValueError(
74
+ "At least one column name needs to be provided when using user provided headers"
75
+ )
74
76
  return v
75
77
 
76
78
 
@@ -107,7 +109,9 @@ class CsvFormat(BaseModel):
107
109
  description='The character encoding of the CSV data. Leave blank to default to <strong>UTF8</strong>. See <a href="https://docs.python.org/3/library/codecs.html#standard-encodings" target="_blank">list of python encodings</a> for allowable options.',
108
110
  )
109
111
  double_quote: bool = Field(
110
- title="Double Quote", default=True, description="Whether two quotes in a quoted CSV value denote a single quote in the data."
112
+ title="Double Quote",
113
+ default=True,
114
+ description="Whether two quotes in a quoted CSV value denote a single quote in the data.",
111
115
  )
112
116
  null_values: Set[str] = Field(
113
117
  title="Null Values",
@@ -125,12 +129,16 @@ class CsvFormat(BaseModel):
125
129
  description="The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.",
126
130
  )
127
131
  skip_rows_after_header: int = Field(
128
- title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row."
132
+ title="Skip Rows After Header",
133
+ default=0,
134
+ description="The number of rows to skip after the header row.",
129
135
  )
130
- header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
131
- title="CSV Header Definition",
132
- default=CsvHeaderFromCsv(header_definition_type=CsvHeaderDefinitionType.FROM_CSV.value),
133
- description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
136
+ header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = (
137
+ Field(
138
+ title="CSV Header Definition",
139
+ default=CsvHeaderFromCsv(header_definition_type=CsvHeaderDefinitionType.FROM_CSV.value),
140
+ description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
141
+ )
134
142
  )
135
143
  true_values: Set[str] = Field(
136
144
  title="True Values",
@@ -189,9 +197,13 @@ class CsvFormat(BaseModel):
189
197
  definition_type = values.get("header_definition_type")
190
198
  column_names = values.get("user_provided_column_names")
191
199
  if definition_type == CsvHeaderDefinitionType.USER_PROVIDED and not column_names:
192
- raise ValidationError("`user_provided_column_names` should be defined if the definition 'User Provided'.", model=CsvFormat)
200
+ raise ValidationError(
201
+ "`user_provided_column_names` should be defined if the definition 'User Provided'.",
202
+ model=CsvFormat,
203
+ )
193
204
  if definition_type != CsvHeaderDefinitionType.USER_PROVIDED and column_names:
194
205
  raise ValidationError(
195
- "`user_provided_column_names` should not be defined if the definition is not 'User Provided'.", model=CsvFormat
206
+ "`user_provided_column_names` should not be defined if the definition is not 'User Provided'.",
207
+ model=CsvFormat,
196
208
  )
197
209
  return values
@@ -56,7 +56,9 @@ class FileBasedStreamConfig(BaseModel):
56
56
  description="When the state history of the file store is full, syncs will only read files that were last modified in the provided day range.",
57
57
  default=3,
58
58
  )
59
- format: Union[AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat, ExcelFormat] = Field(
59
+ format: Union[
60
+ AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat, ExcelFormat
61
+ ] = Field(
60
62
  title="Format",
61
63
  description="The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.",
62
64
  )
@@ -89,6 +91,8 @@ class FileBasedStreamConfig(BaseModel):
89
91
  if self.input_schema:
90
92
  schema = type_mapping_to_jsonschema(self.input_schema)
91
93
  if not schema:
92
- raise ValueError(f"Unable to create JSON schema from input schema {self.input_schema}")
94
+ raise ValueError(
95
+ f"Unable to create JSON schema from input schema {self.input_schema}"
96
+ )
93
97
  return schema
94
98
  return None
@@ -13,7 +13,9 @@ class LocalProcessingConfigModel(BaseModel):
13
13
 
14
14
  class Config(OneOfOptionConfig):
15
15
  title = "Local"
16
- description = "Process files locally, supporting `fast` and `ocr` modes. This is the default option."
16
+ description = (
17
+ "Process files locally, supporting `fast` and `ocr` modes. This is the default option."
18
+ )
17
19
  discriminator = "mode"
18
20
 
19
21
 
@@ -23,7 +25,9 @@ class APIParameterConfigModel(BaseModel):
23
25
  description="The name of the unstructured API parameter to use",
24
26
  examples=["combine_under_n_chars", "languages"],
25
27
  )
26
- value: str = Field(title="Value", description="The value of the parameter", examples=["true", "hi_res"])
28
+ value: str = Field(
29
+ title="Value", description="The value of the parameter", examples=["true", "hi_res"]
30
+ )
27
31
 
28
32
 
29
33
  class APIProcessingConfigModel(BaseModel):
@@ -85,7 +89,10 @@ class UnstructuredFormat(BaseModel):
85
89
  description="The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf",
86
90
  )
87
91
 
88
- processing: Union[LocalProcessingConfigModel, APIProcessingConfigModel,] = Field(
92
+ processing: Union[
93
+ LocalProcessingConfigModel,
94
+ APIProcessingConfigModel,
95
+ ] = Field(
89
96
  default=LocalProcessingConfigModel(mode="local"),
90
97
  title="Processing",
91
98
  description="Processing configuration",
@@ -15,9 +15,7 @@ class AbstractDiscoveryPolicy(ABC):
15
15
 
16
16
  @property
17
17
  @abstractmethod
18
- def n_concurrent_requests(self) -> int:
19
- ...
18
+ def n_concurrent_requests(self) -> int: ...
20
19
 
21
20
  @abstractmethod
22
- def get_max_n_files_for_schema_inference(self, parser: FileTypeParser) -> int:
23
- ...
21
+ def get_max_n_files_for_schema_inference(self, parser: FileTypeParser) -> int: ...
@@ -2,7 +2,9 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
- from airbyte_cdk.sources.file_based.discovery_policy.abstract_discovery_policy import AbstractDiscoveryPolicy
5
+ from airbyte_cdk.sources.file_based.discovery_policy.abstract_discovery_policy import (
6
+ AbstractDiscoveryPolicy,
7
+ )
6
8
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
7
9
 
8
10
  DEFAULT_N_CONCURRENT_REQUESTS = 10
@@ -23,6 +25,9 @@ class DefaultDiscoveryPolicy(AbstractDiscoveryPolicy):
23
25
  return min(
24
26
  filter(
25
27
  None,
26
- (DEFAULT_MAX_N_FILES_FOR_STREAM_SCHEMA_INFERENCE, parser.parser_max_n_files_for_schema_inference),
28
+ (
29
+ DEFAULT_MAX_N_FILES_FOR_STREAM_SCHEMA_INFERENCE,
30
+ parser.parser_max_n_files_for_schema_inference,
31
+ ),
27
32
  )
28
33
  )
@@ -11,27 +11,21 @@ from airbyte_cdk.utils import AirbyteTracedException
11
11
 
12
12
  class FileBasedSourceError(Enum):
13
13
  EMPTY_STREAM = "No files were identified in the stream. This may be because there are no files in the specified container, or because your glob patterns did not match any files. Please verify that your source contains files last modified after the start_date and that your glob patterns are not overly strict."
14
- GLOB_PARSE_ERROR = (
15
- "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
16
- )
14
+ GLOB_PARSE_ERROR = "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
17
15
  ENCODING_ERROR = "File encoding error. The configured encoding must match file encoding."
18
16
  ERROR_CASTING_VALUE = "Could not cast the value to the expected type."
19
17
  ERROR_CASTING_VALUE_UNRECOGNIZED_TYPE = "Could not cast the value to the expected type because the type is not recognized. Valid types are null, array, boolean, integer, number, object, and string."
20
18
  ERROR_DECODING_VALUE = "Expected a JSON-decodeable value but could not decode record."
21
- ERROR_LISTING_FILES = (
22
- "Error listing files. Please check the credentials provided in the config and verify that they provide permission to list files."
23
- )
24
- ERROR_READING_FILE = (
25
- "Error opening file. Please check the credentials provided in the config and verify that they provide permission to read files."
26
- )
19
+ ERROR_LISTING_FILES = "Error listing files. Please check the credentials provided in the config and verify that they provide permission to list files."
20
+ ERROR_READING_FILE = "Error opening file. Please check the credentials provided in the config and verify that they provide permission to read files."
27
21
  ERROR_PARSING_RECORD = "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable."
28
- ERROR_PARSING_USER_PROVIDED_SCHEMA = "The provided schema could not be transformed into valid JSON Schema."
22
+ ERROR_PARSING_USER_PROVIDED_SCHEMA = (
23
+ "The provided schema could not be transformed into valid JSON Schema."
24
+ )
29
25
  ERROR_VALIDATING_RECORD = "One or more records do not pass the schema validation policy. Please modify your input schema, or select a more lenient validation policy."
30
26
  ERROR_PARSING_RECORD_MISMATCHED_COLUMNS = "A header field has resolved to `None`. This indicates that the CSV has more rows than the number of header fields. If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows."
31
27
  ERROR_PARSING_RECORD_MISMATCHED_ROWS = "A row's value has resolved to `None`. This indicates that the CSV has more columns in the header field than the number of columns in the row(s). If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows."
32
- STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = (
33
- "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema."
34
- )
28
+ STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema."
35
29
  NULL_VALUE_IN_SCHEMA = "Error during schema inference: no type was detected for key."
36
30
  UNRECOGNIZED_TYPE = "Error during schema inference: unrecognized type."
37
31
  SCHEMA_INFERENCE_ERROR = "Error inferring schema from files. Are the files valid?"
@@ -39,7 +33,9 @@ class FileBasedSourceError(Enum):
39
33
  CONFIG_VALIDATION_ERROR = "Error creating stream config object."
40
34
  MISSING_SCHEMA = "Expected `json_schema` in the configured catalog but it is missing."
41
35
  UNDEFINED_PARSER = "No parser is defined for this file type."
42
- UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
36
+ UNDEFINED_VALIDATION_POLICY = (
37
+ "The validation policy defined in the config does not exist for the source."
38
+ )
43
39
 
44
40
 
45
41
  class FileBasedErrorsCollector:
@@ -70,7 +66,9 @@ class BaseFileBasedSourceError(Exception):
70
66
  def __init__(self, error: Union[FileBasedSourceError, str], **kwargs): # type: ignore # noqa
71
67
  if isinstance(error, FileBasedSourceError):
72
68
  error = FileBasedSourceError(error).value
73
- super().__init__(f"{error} Contact Support if you need assistance.\n{' '.join([f'{k}={v}' for k, v in kwargs.items()])}")
69
+ super().__init__(
70
+ f"{error} Contact Support if you need assistance.\n{' '.join([f'{k}={v}' for k, v in kwargs.items()])}"
71
+ )
74
72
 
75
73
 
76
74
  class ConfigValidationError(BaseFileBasedSourceError):
@@ -22,15 +22,31 @@ from airbyte_cdk.models import (
22
22
  from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
23
23
  from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
24
24
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
25
- from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy, DefaultFileBasedAvailabilityStrategy
25
+ from airbyte_cdk.sources.file_based.availability_strategy import (
26
+ AbstractFileBasedAvailabilityStrategy,
27
+ DefaultFileBasedAvailabilityStrategy,
28
+ )
26
29
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
27
- from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
28
- from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
29
- from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedErrorsCollector, FileBasedSourceError
30
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
31
+ FileBasedStreamConfig,
32
+ ValidationPolicy,
33
+ )
34
+ from airbyte_cdk.sources.file_based.discovery_policy import (
35
+ AbstractDiscoveryPolicy,
36
+ DefaultDiscoveryPolicy,
37
+ )
38
+ from airbyte_cdk.sources.file_based.exceptions import (
39
+ ConfigValidationError,
40
+ FileBasedErrorsCollector,
41
+ FileBasedSourceError,
42
+ )
30
43
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
31
44
  from airbyte_cdk.sources.file_based.file_types import default_parsers
32
45
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
33
- from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
46
+ from airbyte_cdk.sources.file_based.schema_validation_policies import (
47
+ DEFAULT_SCHEMA_VALIDATION_POLICIES,
48
+ AbstractSchemaValidationPolicy,
49
+ )
34
50
  from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
35
51
  from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
36
52
  from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
@@ -65,25 +81,37 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
65
81
  availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None,
66
82
  discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
67
83
  parsers: Mapping[Type[Any], FileTypeParser] = default_parsers,
68
- validation_policies: Mapping[ValidationPolicy, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
69
- cursor_cls: Type[Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]] = FileBasedConcurrentCursor,
84
+ validation_policies: Mapping[
85
+ ValidationPolicy, AbstractSchemaValidationPolicy
86
+ ] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
87
+ cursor_cls: Type[
88
+ Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]
89
+ ] = FileBasedConcurrentCursor,
70
90
  ):
71
91
  self.stream_reader = stream_reader
72
92
  self.spec_class = spec_class
73
93
  self.config = config
74
94
  self.catalog = catalog
75
95
  self.state = state
76
- self.availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(stream_reader)
96
+ self.availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(
97
+ stream_reader
98
+ )
77
99
  self.discovery_policy = discovery_policy
78
100
  self.parsers = parsers
79
101
  self.validation_policies = validation_policies
80
- self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
102
+ self.stream_schemas = (
103
+ {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
104
+ )
81
105
  self.cursor_cls = cursor_cls
82
106
  self.logger = init_logger(f"airbyte.{self.name}")
83
107
  self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
84
108
  self._message_repository: Optional[MessageRepository] = None
85
109
  concurrent_source = ConcurrentSource.create(
86
- MAX_CONCURRENCY, INITIAL_N_PARTITIONS, self.logger, self._slice_logger, self.message_repository
110
+ MAX_CONCURRENCY,
111
+ INITIAL_N_PARTITIONS,
112
+ self.logger,
113
+ self._slice_logger,
114
+ self.message_repository,
87
115
  )
88
116
  self._state = None
89
117
  super().__init__(concurrent_source)
@@ -91,10 +119,14 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
91
119
  @property
92
120
  def message_repository(self) -> MessageRepository:
93
121
  if self._message_repository is None:
94
- self._message_repository = InMemoryMessageRepository(Level(AirbyteLogFormatter.level_mapping[self.logger.level]))
122
+ self._message_repository = InMemoryMessageRepository(
123
+ Level(AirbyteLogFormatter.level_mapping[self.logger.level])
124
+ )
95
125
  return self._message_repository
96
126
 
97
- def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
127
+ def check_connection(
128
+ self, logger: logging.Logger, config: Mapping[str, Any]
129
+ ) -> Tuple[bool, Optional[Any]]:
98
130
  """
99
131
  Check that the source can be accessed using the user-provided configuration.
100
132
 
@@ -195,13 +227,21 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
195
227
 
196
228
  sync_mode = self._get_sync_mode_from_catalog(stream_config.name)
197
229
 
198
- if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
230
+ if (
231
+ sync_mode == SyncMode.full_refresh
232
+ and hasattr(self, "_concurrency_level")
233
+ and self._concurrency_level is not None
234
+ ):
199
235
  cursor = FileBasedFinalStateCursor(
200
- stream_config=stream_config, stream_namespace=None, message_repository=self.message_repository
236
+ stream_config=stream_config,
237
+ stream_namespace=None,
238
+ message_repository=self.message_repository,
201
239
  )
202
240
  stream = FileBasedStreamFacade.create_from_stream(
203
241
  stream=self._make_default_stream(
204
- stream_config=stream_config, cursor=cursor, use_file_transfer=self._use_file_transfer(parsed_config)
242
+ stream_config=stream_config,
243
+ cursor=cursor,
244
+ use_file_transfer=self._use_file_transfer(parsed_config),
205
245
  ),
206
246
  source=self,
207
247
  logger=self.logger,
@@ -230,7 +270,9 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
230
270
  )
231
271
  stream = FileBasedStreamFacade.create_from_stream(
232
272
  stream=self._make_default_stream(
233
- stream_config=stream_config, cursor=cursor, use_file_transfer=self._use_file_transfer(parsed_config)
273
+ stream_config=stream_config,
274
+ cursor=cursor,
275
+ use_file_transfer=self._use_file_transfer(parsed_config),
234
276
  ),
235
277
  source=self,
236
278
  logger=self.logger,
@@ -240,7 +282,9 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
240
282
  else:
241
283
  cursor = self.cursor_cls(stream_config)
242
284
  stream = self._make_default_stream(
243
- stream_config=stream_config, cursor=cursor, use_file_transfer=self._use_file_transfer(parsed_config)
285
+ stream_config=stream_config,
286
+ cursor=cursor,
287
+ use_file_transfer=self._use_file_transfer(parsed_config),
244
288
  )
245
289
 
246
290
  streams.append(stream)
@@ -250,7 +294,10 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
250
294
  raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc
251
295
 
252
296
  def _make_default_stream(
253
- self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor], use_file_transfer: bool = False
297
+ self,
298
+ stream_config: FileBasedStreamConfig,
299
+ cursor: Optional[AbstractFileBasedCursor],
300
+ use_file_transfer: bool = False,
254
301
  ) -> AbstractFileBasedStream:
255
302
  return DefaultFileBasedStream(
256
303
  config=stream_config,
@@ -265,7 +312,9 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
265
312
  use_file_transfer=use_file_transfer,
266
313
  )
267
314
 
268
- def _get_stream_from_catalog(self, stream_config: FileBasedStreamConfig) -> Optional[AirbyteStream]:
315
+ def _get_stream_from_catalog(
316
+ self, stream_config: FileBasedStreamConfig
317
+ ) -> Optional[AirbyteStream]:
269
318
  if self.catalog:
270
319
  for stream in self.catalog.streams or []:
271
320
  if stream.stream.name == stream_config.name:
@@ -292,7 +341,9 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
292
341
  yield from self.errors_collector.yield_and_raise_collected()
293
342
  # count streams using a certain parser
294
343
  parsed_config = self._get_parsed_config(config)
295
- for parser, count in Counter(stream.format.filetype for stream in parsed_config.streams).items():
344
+ for parser, count in Counter(
345
+ stream.format.filetype for stream in parsed_config.streams
346
+ ).items():
296
347
  yield create_analytics_message(f"file-cdk-{parser}-stream-count", count)
297
348
 
298
349
  def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
@@ -308,21 +359,28 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
308
359
  def _get_parsed_config(self, config: Mapping[str, Any]) -> AbstractFileBasedSpec:
309
360
  return self.spec_class(**config)
310
361
 
311
- def _validate_and_get_validation_policy(self, stream_config: FileBasedStreamConfig) -> AbstractSchemaValidationPolicy:
362
+ def _validate_and_get_validation_policy(
363
+ self, stream_config: FileBasedStreamConfig
364
+ ) -> AbstractSchemaValidationPolicy:
312
365
  if stream_config.validation_policy not in self.validation_policies:
313
366
  # This should never happen because we validate the config against the schema's validation_policy enum
314
367
  raise ValidationError(
315
- f"`validation_policy` must be one of {list(self.validation_policies.keys())}", model=FileBasedStreamConfig
368
+ f"`validation_policy` must be one of {list(self.validation_policies.keys())}",
369
+ model=FileBasedStreamConfig,
316
370
  )
317
371
  return self.validation_policies[stream_config.validation_policy]
318
372
 
319
373
  def _validate_input_schema(self, stream_config: FileBasedStreamConfig) -> None:
320
374
  if stream_config.schemaless and stream_config.input_schema:
321
- raise ValidationError("`input_schema` and `schemaless` options cannot both be set", model=FileBasedStreamConfig)
375
+ raise ValidationError(
376
+ "`input_schema` and `schemaless` options cannot both be set",
377
+ model=FileBasedStreamConfig,
378
+ )
322
379
 
323
380
  @staticmethod
324
381
  def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
325
382
  use_file_transfer = (
326
- hasattr(parsed_config.delivery_method, "delivery_type") and parsed_config.delivery_method.delivery_type == "use_file_transfer"
383
+ hasattr(parsed_config.delivery_method, "delivery_type")
384
+ and parsed_config.delivery_method.delivery_type == "use_file_transfer"
327
385
  )
328
386
  return use_file_transfer