airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (198)
  1. airbyte_cdk/__init__.py +17 -2
  2. airbyte_cdk/config_observation.py +10 -3
  3. airbyte_cdk/connector.py +19 -9
  4. airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
  5. airbyte_cdk/connector_builder/main.py +26 -6
  6. airbyte_cdk/connector_builder/message_grouper.py +95 -25
  7. airbyte_cdk/destinations/destination.py +47 -14
  8. airbyte_cdk/destinations/vector_db_based/config.py +36 -14
  9. airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
  10. airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
  11. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  12. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  13. airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
  14. airbyte_cdk/entrypoint.py +82 -26
  15. airbyte_cdk/exception_handler.py +13 -3
  16. airbyte_cdk/logger.py +10 -2
  17. airbyte_cdk/models/airbyte_protocol.py +11 -5
  18. airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
  19. airbyte_cdk/models/well_known_types.py +1 -1
  20. airbyte_cdk/sources/abstract_source.py +63 -17
  21. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
  22. airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
  23. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
  24. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
  25. airbyte_cdk/sources/connector_state_manager.py +32 -10
  26. airbyte_cdk/sources/declarative/async_job/job.py +3 -1
  27. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
  28. airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
  29. airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
  30. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  31. airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
  32. airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
  33. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
  34. airbyte_cdk/sources/declarative/auth/token.py +25 -8
  35. airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
  36. airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
  37. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
  38. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
  39. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
  40. airbyte_cdk/sources/declarative/declarative_source.py +3 -1
  41. airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
  42. airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
  43. airbyte_cdk/sources/declarative/decoders/json_decoder.py +3 -1
  44. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
  45. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
  46. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
  47. airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
  48. airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
  49. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
  50. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
  51. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
  52. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
  53. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
  54. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
  55. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
  56. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
  57. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
  58. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
  59. airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
  60. airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
  61. airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
  62. airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
  63. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -2
  64. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +656 -678
  65. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
  66. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
  67. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +782 -232
  68. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
  69. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
  70. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
  71. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
  72. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
  73. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
  74. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
  75. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
  76. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
  77. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
  78. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
  79. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
  80. airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
  81. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
  82. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
  83. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
  84. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
  85. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
  86. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
  87. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
  88. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
  89. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
  90. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
  91. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
  92. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
  93. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
  94. airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
  95. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
  96. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
  97. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
  98. airbyte_cdk/sources/declarative/spec/spec.py +8 -2
  99. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
  100. airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
  101. airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
  102. airbyte_cdk/sources/declarative/types.py +8 -1
  103. airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
  104. airbyte_cdk/sources/embedded/base_integration.py +14 -4
  105. airbyte_cdk/sources/embedded/catalog.py +16 -4
  106. airbyte_cdk/sources/embedded/runner.py +19 -3
  107. airbyte_cdk/sources/embedded/tools.py +3 -1
  108. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
  109. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
  110. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
  111. airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
  112. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
  113. airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
  114. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  115. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  116. airbyte_cdk/sources/file_based/exceptions.py +13 -15
  117. airbyte_cdk/sources/file_based/file_based_source.py +82 -24
  118. airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
  119. airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
  120. airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
  121. airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
  122. airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
  123. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  124. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
  125. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
  126. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
  127. airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
  128. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  129. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  130. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
  131. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
  132. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
  133. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
  134. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
  135. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  136. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
  137. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
  138. airbyte_cdk/sources/http_logger.py +5 -1
  139. airbyte_cdk/sources/message/repository.py +18 -4
  140. airbyte_cdk/sources/source.py +17 -7
  141. airbyte_cdk/sources/streams/availability_strategy.py +9 -3
  142. airbyte_cdk/sources/streams/call_rate.py +63 -19
  143. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
  144. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
  145. airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
  146. airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
  147. airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
  148. airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
  149. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
  150. airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
  151. airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
  152. airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
  153. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
  154. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
  155. airbyte_cdk/sources/streams/core.py +77 -22
  156. airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
  157. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
  158. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
  159. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
  160. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
  161. airbyte_cdk/sources/streams/http/exceptions.py +2 -2
  162. airbyte_cdk/sources/streams/http/http.py +133 -33
  163. airbyte_cdk/sources/streams/http/http_client.py +91 -29
  164. airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
  165. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
  166. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
  167. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  168. airbyte_cdk/sources/types.py +5 -1
  169. airbyte_cdk/sources/utils/record_helper.py +12 -3
  170. airbyte_cdk/sources/utils/schema_helpers.py +9 -3
  171. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  172. airbyte_cdk/sources/utils/transform.py +24 -9
  173. airbyte_cdk/sql/exceptions.py +19 -6
  174. airbyte_cdk/sql/secrets.py +3 -1
  175. airbyte_cdk/sql/shared/catalog_providers.py +13 -4
  176. airbyte_cdk/sql/shared/sql_processor.py +44 -14
  177. airbyte_cdk/test/catalog_builder.py +19 -8
  178. airbyte_cdk/test/entrypoint_wrapper.py +27 -8
  179. airbyte_cdk/test/mock_http/mocker.py +41 -11
  180. airbyte_cdk/test/mock_http/request.py +9 -3
  181. airbyte_cdk/test/mock_http/response.py +3 -1
  182. airbyte_cdk/test/mock_http/response_builder.py +29 -7
  183. airbyte_cdk/test/state_builder.py +10 -2
  184. airbyte_cdk/test/utils/data.py +6 -2
  185. airbyte_cdk/test/utils/http_mocking.py +3 -1
  186. airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
  187. airbyte_cdk/utils/analytics_message.py +10 -2
  188. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  189. airbyte_cdk/utils/mapping_helpers.py +3 -1
  190. airbyte_cdk/utils/message_utils.py +11 -4
  191. airbyte_cdk/utils/print_buffer.py +6 -1
  192. airbyte_cdk/utils/schema_inferrer.py +30 -9
  193. airbyte_cdk/utils/spec_schema_transformations.py +3 -1
  194. airbyte_cdk/utils/traced_exception.py +35 -9
  195. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/METADATA +7 -6
  196. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/RECORD +198 -198
  197. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/LICENSE.txt +0 -0
  198. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/WHEEL +0 -0
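
Nearly all of the per-file changes below are mechanical reformatting toward a shorter maximum line length, the kind of rewrap an autoformatter typically produces: single-line imports become parenthesized multi-line imports, long call expressions are split across lines, and two-line abstract-method stubs collapse into one-line bodies. A minimal sketch of the recurring before/after pattern; the module and function names here are invented for illustration and do not appear in the package:

# Before: one long line per statement.
from example_package.models import AVeryLongClassName, AnotherVeryLongClassName, YetAnotherClassName

message = state_manager.create_state_message(stream_name, stream_namespace, current_state)

# After: the same statements wrapped to the shorter line length.
from example_package.models import (
    AVeryLongClassName,
    AnotherVeryLongClassName,
    YetAnotherClassName,
)

message = state_manager.create_state_message(
    stream_name, stream_namespace, current_state
)
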
airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py

@@ -24,45 +24,36 @@ class AbstractConcurrentFileBasedCursor(Cursor, AbstractFileBasedCursor, ABC):
 
     @property
     @abstractmethod
-    def state(self) -> MutableMapping[str, Any]:
-        ...
+    def state(self) -> MutableMapping[str, Any]: ...
 
     @abstractmethod
-    def observe(self, record: Record) -> None:
-        ...
+    def observe(self, record: Record) -> None: ...
 
     @abstractmethod
-    def close_partition(self, partition: Partition) -> None:
-        ...
+    def close_partition(self, partition: Partition) -> None: ...
 
     @abstractmethod
-    def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
-        ...
+    def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None: ...
 
     @abstractmethod
-    def add_file(self, file: RemoteFile) -> None:
-        ...
+    def add_file(self, file: RemoteFile) -> None: ...
 
     @abstractmethod
-    def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]:
-        ...
+    def get_files_to_sync(
+        self, all_files: Iterable[RemoteFile], logger: logging.Logger
+    ) -> Iterable[RemoteFile]: ...
 
     @abstractmethod
-    def get_state(self) -> MutableMapping[str, Any]:
-        ...
+    def get_state(self) -> MutableMapping[str, Any]: ...
 
     @abstractmethod
-    def set_initial_state(self, value: StreamState) -> None:
-        ...
+    def set_initial_state(self, value: StreamState) -> None: ...
 
     @abstractmethod
-    def get_start_time(self) -> datetime:
-        ...
+    def get_start_time(self) -> datetime: ...
 
     @abstractmethod
-    def emit_state_message(self) -> None:
-        ...
+    def emit_state_message(self) -> None: ...
 
     @abstractmethod
-    def ensure_at_least_one_state_emitted(self) -> None:
-        ...
+    def ensure_at_least_one_state_emitted(self) -> None: ...
airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py

@@ -11,7 +11,9 @@ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import AbstractConcurrentFileBasedCursor
+from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import (
+    AbstractConcurrentFileBasedCursor,
+)
 from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor
 from airbyte_cdk.sources.file_based.types import StreamState
 from airbyte_cdk.sources.message.repository import MessageRepository
@@ -27,7 +29,9 @@ _NULL_FILE = ""
 
 class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
     CURSOR_FIELD = "_ab_source_file_last_modified"
-    DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = DefaultFileBasedCursor.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+    DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = (
+        DefaultFileBasedCursor.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+    )
     DEFAULT_MAX_HISTORY_SIZE = 10_000
     DATE_TIME_FORMAT = DefaultFileBasedCursor.DATE_TIME_FORMAT
     zero_value = datetime.min
@@ -51,7 +55,8 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
         self._connector_state_manager = connector_state_manager
         self._cursor_field = cursor_field
         self._time_window_if_history_is_full = timedelta(
-            days=stream_config.days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+            days=stream_config.days_to_sync_if_history_is_full
+            or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
         )
         self._state_lock = RLock()
         self._pending_files_lock = RLock()
@@ -70,7 +75,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
     def close_partition(self, partition: Partition) -> None:
         with self._pending_files_lock:
             if self._pending_files is None:
-                raise RuntimeError("Expected pending partitions to be set but it was not. This is unexpected. Please contact Support.")
+                raise RuntimeError(
+                    "Expected pending partitions to be set but it was not. This is unexpected. Please contact Support."
+                )
 
     def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
         with self._pending_files_lock:
@@ -81,7 +88,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
                     continue
                 for file in _slice["files"]:
                     if file.uri in self._pending_files.keys():
-                        raise RuntimeError(f"Already found file {_slice} in pending files. This is unexpected. Please contact Support.")
+                        raise RuntimeError(
+                            f"Already found file {_slice} in pending files. This is unexpected. Please contact Support."
+                        )
                     self._pending_files.update({file.uri: file})
 
     def _compute_prev_sync_cursor(self, value: Optional[StreamState]) -> Tuple[datetime, str]:
@@ -96,7 +105,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
         # represents the start time that the file was uploaded, we can usually expect that all previous
         # files have already been uploaded. If that's the case, they'll be in history and we'll skip
         # re-uploading them.
-        earliest_file_cursor_value = self._get_cursor_key_from_file(self._compute_earliest_file_in_history())
+        earliest_file_cursor_value = self._get_cursor_key_from_file(
+            self._compute_earliest_file_in_history()
+        )
         cursor_str = min(prev_cursor_str, earliest_file_cursor_value)
         cursor_dt, cursor_uri = cursor_str.split("_", 1)
         return datetime.strptime(cursor_dt, self.DATE_TIME_FORMAT), cursor_uri
@@ -109,8 +120,13 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
     def _compute_earliest_file_in_history(self) -> Optional[RemoteFile]:
         with self._state_lock:
             if self._file_to_datetime_history:
-                filename, last_modified = min(self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0]))
-                return RemoteFile(uri=filename, last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT))
+                filename, last_modified = min(
+                    self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0])
+                )
+                return RemoteFile(
+                    uri=filename,
+                    last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT),
+                )
             else:
                 return None
 
@@ -120,7 +136,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
         :param file: The file to add
         """
         if self._pending_files is None:
-            raise RuntimeError("Expected pending partitions to be set but it was not. This is unexpected. Please contact Support.")
+            raise RuntimeError(
+                "Expected pending partitions to be set but it was not. This is unexpected. Please contact Support."
+            )
         with self._pending_files_lock:
             with self._state_lock:
                 if file.uri not in self._pending_files:
@@ -135,7 +153,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
                     )
                 else:
                     self._pending_files.pop(file.uri)
-                self._file_to_datetime_history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT)
+                self._file_to_datetime_history[file.uri] = file.last_modified.strftime(
+                    self.DATE_TIME_FORMAT
+                )
                 if len(self._file_to_datetime_history) > self.DEFAULT_MAX_HISTORY_SIZE:
                     # Get the earliest file based on its last modified date and its uri
                     oldest_file = self._compute_earliest_file_in_history()
@@ -155,7 +175,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
             self._stream_namespace,
             new_state,
         )
-        state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace)
+        state_message = self._connector_state_manager.create_state_message(
+            self._stream_name, self._stream_namespace
+        )
         self._message_repository.emit_message(state_message)
 
     def _get_new_cursor_value(self) -> str:
@@ -183,12 +205,19 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
     def _compute_latest_file_in_history(self) -> Optional[RemoteFile]:
         with self._state_lock:
             if self._file_to_datetime_history:
-                filename, last_modified = max(self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0]))
-                return RemoteFile(uri=filename, last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT))
+                filename, last_modified = max(
+                    self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0])
+                )
+                return RemoteFile(
+                    uri=filename,
+                    last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT),
+                )
             else:
                 return None
 
-    def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]:
+    def get_files_to_sync(
+        self, all_files: Iterable[RemoteFile], logger: logging.Logger
+    ) -> Iterable[RemoteFile]:
         """
         Given the list of files in the source, return the files that should be synced.
         :param all_files: All files in the source
@@ -210,7 +239,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
         with self._state_lock:
             if file.uri in self._file_to_datetime_history:
                 # If the file's uri is in the history, we should sync the file if it has been modified since it was synced
-                updated_at_from_history = datetime.strptime(self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT)
+                updated_at_from_history = datetime.strptime(
+                    self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT
+                )
                 if file.last_modified < updated_at_from_history:
                     self._message_repository.emit_message(
                         AirbyteMessage(
@@ -246,7 +277,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
         """
         with self._state_lock:
             if self._file_to_datetime_history is None:
-                raise RuntimeError("The history object has not been set. This is unexpected. Please contact Support.")
+                raise RuntimeError(
+                    "The history object has not been set. This is unexpected. Please contact Support."
+                )
             return len(self._file_to_datetime_history) >= self.DEFAULT_MAX_HISTORY_SIZE
 
     def _compute_start_time(self) -> datetime:
@@ -268,7 +301,10 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
         Get the state of the cursor.
         """
         with self._state_lock:
-            return {"history": self._file_to_datetime_history, self._cursor_field.cursor_field_key: self._get_new_cursor_value()}
+            return {
+                "history": self._file_to_datetime_history,
+                self._cursor_field.cursor_field_key: self._get_new_cursor_value(),
+            }
 
     def set_initial_state(self, value: StreamState) -> None:
         pass
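
The cursor above serializes its position as a single sortable string, "<timestamp>_<uri>", and _compute_prev_sync_cursor recovers both parts with one split. A self-contained sketch of that round trip; the helper names are invented for illustration, and the format string is assumed to match DefaultFileBasedCursor.DATE_TIME_FORMAT:

from datetime import datetime

DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"  # assumption: DefaultFileBasedCursor's format

def to_cursor_value(last_modified: datetime, uri: str) -> str:
    # Join timestamp and uri; the fixed-width timestamp keeps the string sortable.
    return f"{last_modified.strftime(DATE_TIME_FORMAT)}_{uri}"

def from_cursor_value(cursor_str: str) -> tuple[datetime, str]:
    # Split on the first underscore only, because the uri may itself contain underscores.
    cursor_dt, cursor_uri = cursor_str.split("_", 1)
    return datetime.strptime(cursor_dt, DATE_TIME_FORMAT), cursor_uri

value = to_cursor_value(datetime(2024, 1, 1), "data/file_1.csv")
assert from_cursor_value(value) == (datetime(2024, 1, 1), "data/file_1.csv")
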
airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py

@@ -9,7 +9,9 @@ from typing import TYPE_CHECKING, Any, Iterable, List, MutableMapping, Optional
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import AbstractConcurrentFileBasedCursor
+from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import (
+    AbstractConcurrentFileBasedCursor,
+)
 from airbyte_cdk.sources.file_based.types import StreamState
 from airbyte_cdk.sources.message import MessageRepository
 from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY
@@ -24,7 +26,11 @@ class FileBasedFinalStateCursor(AbstractConcurrentFileBasedCursor):
     """Cursor that is used to guarantee at least one state message is emitted for a concurrent file-based stream."""
 
     def __init__(
-        self, stream_config: FileBasedStreamConfig, message_repository: MessageRepository, stream_namespace: Optional[str], **kwargs: Any
+        self,
+        stream_config: FileBasedStreamConfig,
+        message_repository: MessageRepository,
+        stream_namespace: Optional[str],
+        **kwargs: Any,
     ):
         self._stream_name = stream_config.name
         self._stream_namespace = stream_namespace
@@ -50,7 +56,9 @@ class FileBasedFinalStateCursor(AbstractConcurrentFileBasedCursor):
     def add_file(self, file: RemoteFile) -> None:
         pass
 
-    def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]:
+    def get_files_to_sync(
+        self, all_files: Iterable[RemoteFile], logger: logging.Logger
+    ) -> Iterable[RemoteFile]:
         return all_files
 
     def get_state(self) -> MutableMapping[str, Any]:
@@ -66,6 +74,10 @@ class FileBasedFinalStateCursor(AbstractConcurrentFileBasedCursor):
         pass
 
     def ensure_at_least_one_state_emitted(self) -> None:
-        self._connector_state_manager.update_state_for_stream(self._stream_name, self._stream_namespace, self.state)
-        state_message = self._connector_state_manager.create_state_message(self._stream_name, self._stream_namespace)
+        self._connector_state_manager.update_state_for_stream(
+            self._stream_name, self._stream_namespace, self.state
+        )
+        state_message = self._connector_state_manager.create_state_message(
+            self._stream_name, self._stream_namespace
+        )
         self._message_repository.emit_message(state_message)
airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py

@@ -54,7 +54,9 @@ class AbstractFileBasedCursor(ABC):
         ...
 
     @abstractmethod
-    def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]:
+    def get_files_to_sync(
+        self, all_files: Iterable[RemoteFile], logger: logging.Logger
+    ) -> Iterable[RemoteFile]:
         """
         Given the list of files in the source, return the files that should be synced.
         :param all_files: All files in the source
airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py

@@ -8,7 +8,9 @@ from typing import Any, Iterable, MutableMapping, Optional
 
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-from airbyte_cdk.sources.file_based.stream.cursor.abstract_file_based_cursor import AbstractFileBasedCursor
+from airbyte_cdk.sources.file_based.stream.cursor.abstract_file_based_cursor import (
+    AbstractFileBasedCursor,
+)
 from airbyte_cdk.sources.file_based.types import StreamState
 
 
@@ -22,11 +24,14 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
         super().__init__(stream_config)
         self._file_to_datetime_history: MutableMapping[str, str] = {}
         self._time_window_if_history_is_full = timedelta(
-            days=stream_config.days_to_sync_if_history_is_full or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
+            days=stream_config.days_to_sync_if_history_is_full
+            or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
         )
 
         if self._time_window_if_history_is_full <= timedelta():
-            raise ValueError(f"days_to_sync_if_history_is_full must be a positive timedelta, got {self._time_window_if_history_is_full}")
+            raise ValueError(
+                f"days_to_sync_if_history_is_full must be a positive timedelta, got {self._time_window_if_history_is_full}"
+            )
 
         self._start_time = self._compute_start_time()
         self._initial_earliest_file_in_history: Optional[RemoteFile] = None
@@ -37,7 +42,9 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
         self._initial_earliest_file_in_history = self._compute_earliest_file_in_history()
 
     def add_file(self, file: RemoteFile) -> None:
-        self._file_to_datetime_history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT)
+        self._file_to_datetime_history[file.uri] = file.last_modified.strftime(
+            self.DATE_TIME_FORMAT
+        )
         if len(self._file_to_datetime_history) > self.DEFAULT_MAX_HISTORY_SIZE:
             # Get the earliest file based on its last modified date and its uri
             oldest_file = self._compute_earliest_file_in_history()
@@ -60,7 +67,9 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
         a string joining the last-modified timestamp of the last synced file and the name of the file.
         """
         if self._file_to_datetime_history.items():
-            filename, timestamp = max(self._file_to_datetime_history.items(), key=lambda x: (x[1], x[0]))
+            filename, timestamp = max(
+                self._file_to_datetime_history.items(), key=lambda x: (x[1], x[0])
+            )
             return f"{timestamp}_{filename}"
         return None
 
@@ -73,7 +82,9 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
     def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
         if file.uri in self._file_to_datetime_history:
             # If the file's uri is in the history, we should sync the file if it has been modified since it was synced
-            updated_at_from_history = datetime.strptime(self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT)
+            updated_at_from_history = datetime.strptime(
+                self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT
+            )
             if file.last_modified < updated_at_from_history:
                 logger.warning(
                     f"The file {file.uri}'s last modified date is older than the last time it was synced. This is unexpected. Skipping the file."
@@ -99,7 +110,9 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
             # The file is not in the history and the history is complete. We know we need to sync the file
             return True
 
-    def get_files_to_sync(self, all_files: Iterable[RemoteFile], logger: logging.Logger) -> Iterable[RemoteFile]:
+    def get_files_to_sync(
+        self, all_files: Iterable[RemoteFile], logger: logging.Logger
+    ) -> Iterable[RemoteFile]:
         if self._is_history_full():
             logger.warning(
                 f"The state history is full. "
@@ -115,8 +128,12 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
 
     def _compute_earliest_file_in_history(self) -> Optional[RemoteFile]:
         if self._file_to_datetime_history:
-            filename, last_modified = min(self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0]))
-            return RemoteFile(uri=filename, last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT))
+            filename, last_modified = min(
+                self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0])
+            )
+            return RemoteFile(
+                uri=filename, last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT)
+            )
         else:
             return None
 
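DefaultFileBasedCursor.add_file keeps its history bounded: once the map exceeds DEFAULT_MAX_HISTORY_SIZE, the entry that sorts earliest by (last_modified, uri) is evicted, the uri acting as a deterministic tie-breaker. A compact sketch of that behavior with the limit shrunk so the eviction is visible (the CDK's real limit is 10_000; names simplified for illustration):

MAX_HISTORY_SIZE = 3  # stand-in for DEFAULT_MAX_HISTORY_SIZE

history: dict[str, str] = {}

def add_file(uri: str, last_modified: str) -> None:
    history[uri] = last_modified
    if len(history) > MAX_HISTORY_SIZE:
        # Evict the earliest (last_modified, uri) entry so the newest files survive.
        oldest_uri, _ = min(history.items(), key=lambda f: (f[1], f[0]))
        del history[oldest_uri]

for day, uri in enumerate(["a.csv", "b.csv", "c.csv", "d.csv"], start=1):
    add_file(uri, f"2024-01-0{day}T00:00:00.000000Z")

assert set(history) == {"b.csv", "c.csv", "d.csv"}  # "a.csv" was evicted
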
airbyte_cdk/sources/file_based/stream/default_file_based_stream.py

@@ -22,7 +22,12 @@ from airbyte_cdk.sources.file_based.exceptions import (
 )
 from airbyte_cdk.sources.file_based.file_types import FileTransfer
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-from airbyte_cdk.sources.file_based.schema_helpers import SchemaType, file_transfer_schema, merge_schemas, schemaless_schema
+from airbyte_cdk.sources.file_based.schema_helpers import (
+    SchemaType,
+    file_transfer_schema,
+    merge_schemas,
+    schemaless_schema,
+)
 from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
 from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
 from airbyte_cdk.sources.file_based.types import StreamSlice
@@ -33,7 +38,6 @@ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 
 
 class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
-
     """
     The default file-based stream.
     """
@@ -68,18 +72,28 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
     @cursor.setter
     def cursor(self, value: AbstractFileBasedCursor) -> None:
         if self._cursor is not None:
-            raise RuntimeError(f"Cursor for stream {self.name} is already set. This is unexpected. Please contact Support.")
+            raise RuntimeError(
+                f"Cursor for stream {self.name} is already set. This is unexpected. Please contact Support."
+            )
         self._cursor = value
 
     @property
     def primary_key(self) -> PrimaryKeyType:
-        return self.config.primary_key or self.get_parser().get_parser_defined_primary_key(self.config)
+        return self.config.primary_key or self.get_parser().get_parser_defined_primary_key(
+            self.config
+        )
 
-    def _filter_schema_invalid_properties(self, configured_catalog_json_schema: Dict[str, Any]) -> Dict[str, Any]:
+    def _filter_schema_invalid_properties(
+        self, configured_catalog_json_schema: Dict[str, Any]
+    ) -> Dict[str, Any]:
         if self.use_file_transfer:
             return {
                 "type": "object",
-                "properties": {"file_path": {"type": "string"}, "file_size": {"type": "string"}, self.ab_file_name_col: {"type": "string"}},
+                "properties": {
+                    "file_path": {"type": "string"},
+                    "file_size": {"type": "string"},
+                    self.ab_file_name_col: {"type": "string"},
+                },
             }
         else:
             return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
@@ -89,16 +103,23 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         all_files = self.list_files()
         files_to_read = self._cursor.get_files_to_sync(all_files, self.logger)
         sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri))
-        slices = [{"files": list(group[1])} for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)]
+        slices = [
+            {"files": list(group[1])}
+            for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
+        ]
         return slices
 
-    def transform_record(self, record: dict[str, Any], file: RemoteFile, last_updated: str) -> dict[str, Any]:
+    def transform_record(
+        self, record: dict[str, Any], file: RemoteFile, last_updated: str
+    ) -> dict[str, Any]:
         # adds _ab_source_file_last_modified and _ab_source_file_url to the record
         record[self.ab_last_mod_col] = last_updated
         record[self.ab_file_name_col] = file.uri
         return record
 
-    def transform_record_for_file_transfer(self, record: dict[str, Any], file: RemoteFile) -> dict[str, Any]:
+    def transform_record_for_file_transfer(
+        self, record: dict[str, Any], file: RemoteFile
+    ) -> dict[str, Any]:
         # timstamp() returns a float representing the number of seconds since the unix epoch
         record[self.modified] = int(file.last_modified.timestamp()) * 1000
         record[self.source_file_url] = file.uri
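
transform_record_for_file_transfer stamps each record with the file's last-modified time as integer epoch milliseconds; int(...) truncates to whole seconds before the * 1000 scaling, so sub-second precision is dropped. A quick worked check of that arithmetic (a timezone-aware datetime keeps timestamp() unambiguous):

from datetime import datetime, timezone

last_modified = datetime(2024, 1, 1, 12, 30, 45, 500000, tzinfo=timezone.utc)

# timestamp() yields fractional seconds since the unix epoch;
# int() drops the .5 before scaling to milliseconds.
epoch_millis = int(last_modified.timestamp()) * 1000

assert epoch_millis == 1704112245000
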
@@ -127,15 +148,21 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
                     self.logger.info(f"{self.name}: {file} file-based syncing")
                     # todo: complete here the code to not rely on local parser
                     file_transfer = FileTransfer()
-                    for record in file_transfer.get_file(self.config, file, self.stream_reader, self.logger):
+                    for record in file_transfer.get_file(
+                        self.config, file, self.stream_reader, self.logger
+                    ):
                         line_no += 1
                         if not self.record_passes_validation_policy(record):
                             n_skipped += 1
                             continue
                         record = self.transform_record_for_file_transfer(record, file)
-                        yield stream_data_to_airbyte_message(self.name, record, is_file_transfer_message=True)
+                        yield stream_data_to_airbyte_message(
+                            self.name, record, is_file_transfer_message=True
+                        )
                 else:
-                    for record in parser.parse_records(self.config, file, self.stream_reader, self.logger, schema):
+                    for record in parser.parse_records(
+                        self.config, file, self.stream_reader, self.logger, schema
+                    ):
                         line_no += 1
                         if self.config.schemaless:
                             record = {"data": record}
@@ -220,7 +247,9 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         except AirbyteTracedException as ate:
             raise ate
         except Exception as exc:
-            raise SchemaInferenceError(FileBasedSourceError.SCHEMA_INFERENCE_ERROR, stream=self.name) from exc
+            raise SchemaInferenceError(
+                FileBasedSourceError.SCHEMA_INFERENCE_ERROR, stream=self.name
+            ) from exc
         else:
             return {"type": "object", "properties": {**extra_fields, **schema["properties"]}}
 
@@ -245,14 +274,20 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         first_n_files = self.config.recent_n_files_to_read_for_schema_discovery
 
         if first_n_files == 0:
-            self.logger.warning(msg=f"No files were identified in the stream {self.name}. Setting default schema for the stream.")
+            self.logger.warning(
+                msg=f"No files were identified in the stream {self.name}. Setting default schema for the stream."
+            )
             return schemaless_schema
 
-        max_n_files_for_schema_inference = self._discovery_policy.get_max_n_files_for_schema_inference(self.get_parser())
+        max_n_files_for_schema_inference = (
+            self._discovery_policy.get_max_n_files_for_schema_inference(self.get_parser())
+        )
 
         if first_n_files > max_n_files_for_schema_inference:
             # Use the most recent files for schema inference, so we pick up schema changes during discovery.
-            self.logger.warning(msg=f"Refusing to infer schema for {first_n_files} files; using {max_n_files_for_schema_inference} files.")
+            self.logger.warning(
+                msg=f"Refusing to infer schema for {first_n_files} files; using {max_n_files_for_schema_inference} files."
+            )
             first_n_files = max_n_files_for_schema_inference
 
         files = sorted(files, key=lambda x: x.last_modified, reverse=True)[:first_n_files]
@@ -274,7 +309,9 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         """
         Return all files that belong to the stream as defined by the stream's globs.
         """
-        return self.stream_reader.get_matching_files(self.config.globs or [], self.config.legacy_prefix, self.logger)
+        return self.stream_reader.get_matching_files(
+            self.config.globs or [], self.config.legacy_prefix, self.logger
+        )
 
     def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
         loop = asyncio.get_event_loop()
@@ -312,25 +349,34 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
         n_started, n_files = 0, len(files)
         files_iterator = iter(files)
         while pending_tasks or n_started < n_files:
-            while len(pending_tasks) <= self._discovery_policy.n_concurrent_requests and (file := next(files_iterator, None)):
+            while len(pending_tasks) <= self._discovery_policy.n_concurrent_requests and (
+                file := next(files_iterator, None)
+            ):
                 pending_tasks.add(asyncio.create_task(self._infer_file_schema(file)))
                 n_started += 1
             # Return when the first task is completed so that we can enqueue a new task as soon as the
             # number of concurrent tasks drops below the number allowed.
-            done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
+            done, pending_tasks = await asyncio.wait(
+                pending_tasks, return_when=asyncio.FIRST_COMPLETED
+            )
             for task in done:
                 try:
                     base_schema = merge_schemas(base_schema, task.result())
                 except AirbyteTracedException as ate:
                     raise ate
                 except Exception as exc:
-                    self.logger.error(f"An error occurred inferring the schema. \n {traceback.format_exc()}", exc_info=exc)
+                    self.logger.error(
+                        f"An error occurred inferring the schema. \n {traceback.format_exc()}",
+                        exc_info=exc,
+                    )
 
         return base_schema
 
     async def _infer_file_schema(self, file: RemoteFile) -> SchemaType:
         try:
-            return await self.get_parser().infer_schema(self.config, file, self.stream_reader, self.logger)
+            return await self.get_parser().infer_schema(
+                self.config, file, self.stream_reader, self.logger
+            )
         except AirbyteTracedException as ate:
             raise ate
         except Exception as exc:
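
The schema-inference loop above implements bounded concurrency with asyncio.wait(..., return_when=asyncio.FIRST_COMPLETED): it tops up a set of pending tasks to the policy's limit, then wakes as soon as any one finishes so a replacement can start. A self-contained sketch of the same pattern; the worker coroutine and the limit are invented for the example:

import asyncio

MAX_CONCURRENT = 3  # stand-in for discovery_policy.n_concurrent_requests

async def infer_one(name: str) -> str:
    await asyncio.sleep(0.01)  # simulate I/O-bound schema inference
    return name

async def infer_all(names: list[str]) -> list[str]:
    results: list[str] = []
    pending: set[asyncio.Task[str]] = set()
    files = iter(names)
    started, total = 0, len(names)
    while pending or started < total:
        # Top up the pending set until the concurrency limit is reached.
        while len(pending) <= MAX_CONCURRENT and (name := next(files, None)):
            pending.add(asyncio.create_task(infer_one(name)))
            started += 1
        # Return as soon as one task completes so a new one can be enqueued.
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        results.extend(task.result() for task in done)
    return results

assert len(asyncio.run(infer_all([f"file_{i}.csv" for i in range(10)]))) == 10
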
airbyte_cdk/sources/http_logger.py

@@ -9,7 +9,11 @@ from airbyte_cdk.sources.message import LogMessage
 
 
 def format_http_message(
-    response: requests.Response, title: str, description: str, stream_name: Optional[str], is_auxiliary: bool = None
+    response: requests.Response,
+    title: str,
+    description: str,
+    stream_name: Optional[str],
+    is_auxiliary: bool = None,
 ) -> LogMessage:
     request = response.request
     log_message = {
airbyte_cdk/sources/message/repository.py

@@ -28,7 +28,9 @@ _SEVERITY_BY_LOG_LEVEL = {
 
 def _is_severe_enough(threshold: Level, level: Level) -> bool:
     if threshold not in _SEVERITY_BY_LOG_LEVEL:
-        _LOGGER.warning(f"Log level {threshold} for threshold is not supported. This is probably a CDK bug. Please contact Airbyte.")
+        _LOGGER.warning(
+            f"Log level {threshold} for threshold is not supported. This is probably a CDK bug. Please contact Airbyte."
+        )
         return True
 
     if level not in _SEVERITY_BY_LOG_LEVEL:
@@ -80,7 +82,12 @@ class InMemoryMessageRepository(MessageRepository):
     def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None:
         if _is_severe_enough(self._log_level, level):
             self.emit_message(
-                AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=level, message=filter_secrets(json.dumps(message_provider()))))
+                AirbyteMessage(
+                    type=Type.LOG,
+                    log=AirbyteLogMessage(
+                        level=level, message=filter_secrets(json.dumps(message_provider()))
+                    ),
+                )
             )
 
     def consume_queue(self) -> Iterable[AirbyteMessage]:
@@ -89,7 +96,12 @@ class InMemoryMessageRepository(MessageRepository):
 
 
 class LogAppenderMessageRepositoryDecorator(MessageRepository):
-    def __init__(self, dict_to_append: LogMessage, decorated: MessageRepository, log_level: Level = Level.INFO):
+    def __init__(
+        self,
+        dict_to_append: LogMessage,
+        decorated: MessageRepository,
+        log_level: Level = Level.INFO,
+    ):
         self._dict_to_append = dict_to_append
         self._decorated = decorated
         self._log_level = log_level
@@ -106,7 +118,9 @@ class LogAppenderMessageRepositoryDecorator(MessageRepository):
     def consume_queue(self) -> Iterable[AirbyteMessage]:
         return self._decorated.consume_queue()
 
-    def _append_second_to_first(self, first: LogMessage, second: LogMessage, path: Optional[List[str]] = None) -> LogMessage:
+    def _append_second_to_first(
+        self, first: LogMessage, second: LogMessage, path: Optional[List[str]] = None
+    ) -> LogMessage:
         if path is None:
             path = []
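
_is_severe_enough compares a message's level against the repository's threshold through the _SEVERITY_BY_LOG_LEVEL mapping, and the hunk above shows it failing open (returning True) when the threshold is unknown. A minimal sketch of that check; the severity table below is an invented stand-in for the CDK's mapping, and treating an unknown level the same way is an assumption of this sketch:

import logging

SEVERITY = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARN": logging.WARNING,
    "ERROR": logging.ERROR,
}

def is_severe_enough(threshold: str, level: str) -> bool:
    if threshold not in SEVERITY or level not in SEVERITY:
        return True  # fail open: better to over-log than to silently drop messages
    return SEVERITY[level] >= SEVERITY[threshold]

assert is_severe_enough("INFO", "ERROR")
assert not is_severe_enough("WARN", "DEBUG")
assert is_severe_enough("TRACE", "DEBUG")  # unknown threshold falls back to True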