airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.5.5__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (198)
  1. airbyte_cdk/__init__.py +17 -2
  2. airbyte_cdk/config_observation.py +10 -3
  3. airbyte_cdk/connector.py +19 -9
  4. airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
  5. airbyte_cdk/connector_builder/main.py +26 -6
  6. airbyte_cdk/connector_builder/message_grouper.py +95 -25
  7. airbyte_cdk/destinations/destination.py +47 -14
  8. airbyte_cdk/destinations/vector_db_based/config.py +36 -14
  9. airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
  10. airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
  11. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  12. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  13. airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
  14. airbyte_cdk/entrypoint.py +82 -26
  15. airbyte_cdk/exception_handler.py +13 -3
  16. airbyte_cdk/logger.py +10 -2
  17. airbyte_cdk/models/airbyte_protocol.py +11 -5
  18. airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
  19. airbyte_cdk/models/well_known_types.py +1 -1
  20. airbyte_cdk/sources/abstract_source.py +63 -17
  21. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
  22. airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
  23. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
  24. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
  25. airbyte_cdk/sources/connector_state_manager.py +32 -10
  26. airbyte_cdk/sources/declarative/async_job/job.py +3 -1
  27. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
  28. airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
  29. airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
  30. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  31. airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
  32. airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
  33. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
  34. airbyte_cdk/sources/declarative/auth/token.py +25 -8
  35. airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
  36. airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
  37. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
  38. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
  39. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
  40. airbyte_cdk/sources/declarative/declarative_source.py +3 -1
  41. airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
  42. airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
  43. airbyte_cdk/sources/declarative/decoders/json_decoder.py +3 -1
  44. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
  45. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
  46. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
  47. airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
  48. airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
  49. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
  50. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
  51. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
  52. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
  53. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
  54. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
  55. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
  56. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
  57. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
  58. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
  59. airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
  60. airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
  61. airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
  62. airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
  63. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -2
  64. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +656 -678
  65. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
  66. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
  67. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +782 -232
  68. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
  69. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
  70. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
  71. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
  72. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
  73. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
  74. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
  75. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
  76. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
  77. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
  78. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
  79. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
  80. airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
  81. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
  82. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
  83. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
  84. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
  85. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
  86. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
  87. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
  88. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
  89. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
  90. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
  91. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
  92. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
  93. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
  94. airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
  95. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
  96. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
  97. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
  98. airbyte_cdk/sources/declarative/spec/spec.py +8 -2
  99. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
  100. airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
  101. airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
  102. airbyte_cdk/sources/declarative/types.py +8 -1
  103. airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
  104. airbyte_cdk/sources/embedded/base_integration.py +14 -4
  105. airbyte_cdk/sources/embedded/catalog.py +16 -4
  106. airbyte_cdk/sources/embedded/runner.py +19 -3
  107. airbyte_cdk/sources/embedded/tools.py +3 -1
  108. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
  109. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
  110. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
  111. airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
  112. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
  113. airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
  114. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  115. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  116. airbyte_cdk/sources/file_based/exceptions.py +13 -15
  117. airbyte_cdk/sources/file_based/file_based_source.py +82 -24
  118. airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
  119. airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
  120. airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
  121. airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
  122. airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
  123. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  124. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
  125. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
  126. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
  127. airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
  128. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  129. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  130. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
  131. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
  132. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
  133. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
  134. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
  135. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  136. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
  137. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
  138. airbyte_cdk/sources/http_logger.py +5 -1
  139. airbyte_cdk/sources/message/repository.py +18 -4
  140. airbyte_cdk/sources/source.py +17 -7
  141. airbyte_cdk/sources/streams/availability_strategy.py +9 -3
  142. airbyte_cdk/sources/streams/call_rate.py +63 -19
  143. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
  144. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
  145. airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
  146. airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
  147. airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
  148. airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
  149. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
  150. airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
  151. airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
  152. airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
  153. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
  154. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
  155. airbyte_cdk/sources/streams/core.py +77 -22
  156. airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
  157. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
  158. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
  159. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
  160. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
  161. airbyte_cdk/sources/streams/http/exceptions.py +2 -2
  162. airbyte_cdk/sources/streams/http/http.py +133 -33
  163. airbyte_cdk/sources/streams/http/http_client.py +91 -29
  164. airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
  165. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
  166. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
  167. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  168. airbyte_cdk/sources/types.py +5 -1
  169. airbyte_cdk/sources/utils/record_helper.py +12 -3
  170. airbyte_cdk/sources/utils/schema_helpers.py +9 -3
  171. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  172. airbyte_cdk/sources/utils/transform.py +24 -9
  173. airbyte_cdk/sql/exceptions.py +19 -6
  174. airbyte_cdk/sql/secrets.py +3 -1
  175. airbyte_cdk/sql/shared/catalog_providers.py +13 -4
  176. airbyte_cdk/sql/shared/sql_processor.py +44 -14
  177. airbyte_cdk/test/catalog_builder.py +19 -8
  178. airbyte_cdk/test/entrypoint_wrapper.py +27 -8
  179. airbyte_cdk/test/mock_http/mocker.py +41 -11
  180. airbyte_cdk/test/mock_http/request.py +9 -3
  181. airbyte_cdk/test/mock_http/response.py +3 -1
  182. airbyte_cdk/test/mock_http/response_builder.py +29 -7
  183. airbyte_cdk/test/state_builder.py +10 -2
  184. airbyte_cdk/test/utils/data.py +6 -2
  185. airbyte_cdk/test/utils/http_mocking.py +3 -1
  186. airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
  187. airbyte_cdk/utils/analytics_message.py +10 -2
  188. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  189. airbyte_cdk/utils/mapping_helpers.py +3 -1
  190. airbyte_cdk/utils/message_utils.py +11 -4
  191. airbyte_cdk/utils/print_buffer.py +6 -1
  192. airbyte_cdk/utils/schema_inferrer.py +30 -9
  193. airbyte_cdk/utils/spec_schema_transformations.py +3 -1
  194. airbyte_cdk/utils/traced_exception.py +35 -9
  195. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/METADATA +7 -6
  196. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/RECORD +198 -198
  197. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/LICENSE.txt +0 -0
  198. {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/file_based/file_types/parquet_parser.py

@@ -10,9 +10,19 @@ from urllib.parse import unquote
 
 import pyarrow as pa
 import pyarrow.parquet as pq
-from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ParquetFormat
-from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, RecordParseError
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
+    FileBasedStreamConfig,
+    ParquetFormat,
+)
+from airbyte_cdk.sources.file_based.exceptions import (
+    ConfigValidationError,
+    FileBasedSourceError,
+    RecordParseError,
+)
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
@@ -20,7 +30,6 @@ from pyarrow import DictionaryArray, Scalar
 
 
 class ParquetParser(FileTypeParser):
-
     ENCODING = None
 
     def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
@@ -45,9 +54,15 @@ class ParquetParser(FileTypeParser):
         parquet_schema = parquet_file.schema_arrow
 
         # Inferred non-partition schema
-        schema = {field.name: ParquetParser.parquet_type_to_schema_type(field.type, parquet_format) for field in parquet_schema}
+        schema = {
+            field.name: ParquetParser.parquet_type_to_schema_type(field.type, parquet_format)
+            for field in parquet_schema
+        }
         # Inferred partition schema
-        partition_columns = {partition.split("=")[0]: {"type": "string"} for partition in self._extract_partitions(file.uri)}
+        partition_columns = {
+            partition.split("=")[0]: {"type": "string"}
+            for partition in self._extract_partitions(file.uri)
+        }
 
         schema.update(partition_columns)
         return schema
@@ -69,21 +84,27 @@ class ParquetParser(FileTypeParser):
         try:
             with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
                 reader = pq.ParquetFile(fp)
-                partition_columns = {x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)}
+                partition_columns = {
+                    x.split("=")[0]: x.split("=")[1] for x in self._extract_partitions(file.uri)
+                }
                 for row_group in range(reader.num_row_groups):
                     batch = reader.read_row_group(row_group)
                     for row in range(batch.num_rows):
                         line_no += 1
                         yield {
                             **{
-                                column: ParquetParser._to_output_value(batch.column(column)[row], parquet_format)
+                                column: ParquetParser._to_output_value(
+                                    batch.column(column)[row], parquet_format
+                                )
                                 for column in batch.column_names
                             },
                             **partition_columns,
                         }
         except Exception as exc:
             raise RecordParseError(
-                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=f"{row_group=}, {line_no=}"
+                FileBasedSourceError.ERROR_PARSING_RECORD,
+                filename=file.uri,
+                lineno=f"{row_group=}, {line_no=}",
             ) from exc
 
     @staticmethod
@@ -95,7 +116,9 @@ class ParquetParser(FileTypeParser):
         return FileReadMode.READ_BINARY
 
     @staticmethod
-    def _to_output_value(parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat) -> Any:
+    def _to_output_value(
+        parquet_value: Union[Scalar, DictionaryArray], parquet_format: ParquetFormat
+    ) -> Any:
         """
         Convert an entry in a pyarrow table to a value that can be output by the source.
         """
@@ -113,7 +136,11 @@ class ParquetParser(FileTypeParser):
             return None
 
         # Convert date and datetime objects to isoformat strings
-        if pa.types.is_time(parquet_value.type) or pa.types.is_timestamp(parquet_value.type) or pa.types.is_date(parquet_value.type):
+        if (
+            pa.types.is_time(parquet_value.type)
+            or pa.types.is_timestamp(parquet_value.type)
+            or pa.types.is_date(parquet_value.type)
+        ):
             return parquet_value.as_py().isoformat()
 
         # Convert month_day_nano_interval to array
@@ -168,7 +195,9 @@ class ParquetParser(FileTypeParser):
         }
 
     @staticmethod
-    def parquet_type_to_schema_type(parquet_type: pa.DataType, parquet_format: ParquetFormat) -> Mapping[str, str]:
+    def parquet_type_to_schema_type(
+        parquet_type: pa.DataType, parquet_format: ParquetFormat
+    ) -> Mapping[str, str]:
         """
         Convert a pyarrow data type to an Airbyte schema type.
         Parquet data types are defined at https://arrow.apache.org/docs/python/api/datatypes.html
@@ -198,7 +227,9 @@ class ParquetParser(FileTypeParser):
     @staticmethod
     def _is_binary(parquet_type: pa.DataType) -> bool:
         return bool(
-            pa.types.is_binary(parquet_type) or pa.types.is_large_binary(parquet_type) or pa.types.is_fixed_size_binary(parquet_type)
+            pa.types.is_binary(parquet_type)
+            or pa.types.is_large_binary(parquet_type)
+            or pa.types.is_fixed_size_binary(parquet_type)
         )
 
     @staticmethod
@@ -221,13 +252,23 @@ class ParquetParser(FileTypeParser):
             pa.types.is_time(parquet_type)
             or pa.types.is_string(parquet_type)
             or pa.types.is_large_string(parquet_type)
-            or ParquetParser._is_binary(parquet_type)  # Best we can do is return as a string since we do not support binary
+            or ParquetParser._is_binary(
+                parquet_type
+            )  # Best we can do is return as a string since we do not support binary
         )
 
     @staticmethod
     def _is_object(parquet_type: pa.DataType) -> bool:
-        return bool(pa.types.is_dictionary(parquet_type) or pa.types.is_struct(parquet_type) or pa.types.is_map(parquet_type))
+        return bool(
+            pa.types.is_dictionary(parquet_type)
+            or pa.types.is_struct(parquet_type)
+            or pa.types.is_map(parquet_type)
+        )
 
     @staticmethod
     def _is_list(parquet_type: pa.DataType) -> bool:
-        return bool(pa.types.is_list(parquet_type) or pa.types.is_large_list(parquet_type) or parquet_type == pa.month_day_nano_interval())
+        return bool(
+            pa.types.is_list(parquet_type)
+            or pa.types.is_large_list(parquet_type)
+            or parquet_type == pa.month_day_nano_interval()
+        )
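The parquet changes above are formatting-only, but the logic they wrap is easy to see in isolation. A minimal sketch (not from the package) of the conversion `_to_output_value` applies to temporal pyarrow scalars, using only documented pyarrow calls:

```python
import pyarrow as pa

# Build a timestamp scalar; `_to_output_value` receives values like this per cell.
scalar = pa.scalar(1_700_000_000, type=pa.timestamp("s"))

# The same predicate the diff reformats across multiple lines.
if (
    pa.types.is_time(scalar.type)
    or pa.types.is_timestamp(scalar.type)
    or pa.types.is_date(scalar.type)
):
    # Temporal values are emitted as ISO-8601 strings.
    print(scalar.as_py().isoformat())  # 2023-11-14T22:13:20
```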
airbyte_cdk/sources/file_based/file_types/unstructured_parser.py

@@ -19,13 +19,21 @@ from airbyte_cdk.sources.file_based.config.unstructured_format import (
     UnstructuredFormat,
 )
 from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
-from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
+from airbyte_cdk.sources.file_based.file_based_stream_reader import (
+    AbstractFileBasedStreamReader,
+    FileReadMode,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
 from airbyte_cdk.utils import is_cloud_environment
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
-from unstructured.file_utils.filetype import FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, FileType, detect_filetype
+from unstructured.file_utils.filetype import (
+    FILETYPE_TO_MIMETYPE,
+    STR_TO_FILETYPE,
+    FileType,
+    detect_filetype,
+)
 
 unstructured_partition_pdf = None
 unstructured_partition_docx = None
@@ -109,7 +117,10 @@ class UnstructuredParser(FileTypeParser):
                     "type": "string",
                     "description": "Content of the file as markdown. Might be null if the file could not be parsed",
                 },
-                "document_key": {"type": "string", "description": "Unique identifier of the document, e.g. the file path"},
+                "document_key": {
+                    "type": "string",
+                    "description": "Unique identifier of the document, e.g. the file path",
+                },
                 "_ab_source_file_parse_error": {
                     "type": "string",
                     "description": "Error message if the file could not be parsed even though the file is supported",
@@ -149,9 +160,19 @@ class UnstructuredParser(FileTypeParser):
             else:
                 raise e
 
-    def _read_file(self, file_handle: IOBase, remote_file: RemoteFile, format: UnstructuredFormat, logger: logging.Logger) -> str:
+    def _read_file(
+        self,
+        file_handle: IOBase,
+        remote_file: RemoteFile,
+        format: UnstructuredFormat,
+        logger: logging.Logger,
+    ) -> str:
         _import_unstructured()
-        if (not unstructured_partition_pdf) or (not unstructured_partition_docx) or (not unstructured_partition_pptx):
+        if (
+            (not unstructured_partition_pdf)
+            or (not unstructured_partition_docx)
+            or (not unstructured_partition_pptx)
+        ):
             # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
             raise Exception("unstructured library is not available")
 
@@ -167,7 +188,9 @@ class UnstructuredParser(FileTypeParser):
             return self._read_file_locally(file_handle, filetype, format.strategy, remote_file)
         elif format.processing.mode == "api":
             try:
-                result: str = self._read_file_remotely_with_retries(file_handle, format.processing, filetype, format.strategy, remote_file)
+                result: str = self._read_file_remotely_with_retries(
+                    file_handle, format.processing, filetype, format.strategy, remote_file
+                )
             except Exception as e:
                 # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
                 #
@@ -175,11 +198,15 @@ class UnstructuredParser(FileTypeParser):
                 # Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient.
                 if isinstance(e, RecordParseError):
                     raise e
-                raise AirbyteTracedException.from_exception(e, failure_type=FailureType.config_error)
+                raise AirbyteTracedException.from_exception(
+                    e, failure_type=FailureType.config_error
+                )
 
             return result
 
-    def _params_to_dict(self, params: Optional[List[APIParameterConfigModel]], strategy: str) -> Dict[str, Union[str, List[str]]]:
+    def _params_to_dict(
+        self, params: Optional[List[APIParameterConfigModel]], strategy: str
+    ) -> Dict[str, Union[str, List[str]]]:
         result_dict: Dict[str, Union[str, List[str]]] = {"strategy": strategy}
         if params is None:
             return result_dict
@@ -229,9 +256,16 @@ class UnstructuredParser(FileTypeParser):
 
         return True, None
 
-    @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error)
+    @backoff.on_exception(
+        backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error
+    )
     def _read_file_remotely_with_retries(
-        self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
+        self,
+        file_handle: IOBase,
+        format: APIProcessingConfigModel,
+        filetype: FileType,
+        strategy: str,
+        remote_file: RemoteFile,
     ) -> str:
         """
         Read a file remotely, retrying up to 5 times if the error is not caused by user error. This is useful for transient network errors or the API server being overloaded temporarily.
@@ -239,7 +273,12 @@
         return self._read_file_remotely(file_handle, format, filetype, strategy, remote_file)
 
     def _read_file_remotely(
-        self, file_handle: IOBase, format: APIProcessingConfigModel, filetype: FileType, strategy: str, remote_file: RemoteFile
+        self,
+        file_handle: IOBase,
+        format: APIProcessingConfigModel,
+        filetype: FileType,
+        strategy: str,
+        remote_file: RemoteFile,
     ) -> str:
         headers = {"accept": "application/json", "unstructured-api-key": format.api_key}
 
@@ -247,7 +286,9 @@
 
         file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}
 
-        response = requests.post(f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data)
+        response = requests.post(
+            f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
+        )
 
         if response.status_code == 422:
             # 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination).
@@ -260,9 +301,15 @@
 
         return self._render_markdown(json_response)
 
-    def _read_file_locally(self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile) -> str:
+    def _read_file_locally(
+        self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile
+    ) -> str:
         _import_unstructured()
-        if (not unstructured_partition_pdf) or (not unstructured_partition_docx) or (not unstructured_partition_pptx):
+        if (
+            (not unstructured_partition_pdf)
+            or (not unstructured_partition_docx)
+            or (not unstructured_partition_pptx)
+        ):
             # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
             raise Exception("unstructured library is not available")
 
@@ -290,7 +337,9 @@
         return self._render_markdown([element.to_dict() for element in elements])
 
     def _create_parse_error(self, remote_file: RemoteFile, message: str) -> RecordParseError:
-        return RecordParseError(FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message)
+        return RecordParseError(
+            FileBasedSourceError.ERROR_PARSING_RECORD, filename=remote_file.uri, message=message
+        )
 
     def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileType]:
         """
airbyte_cdk/sources/file_based/schema_helpers.py

@@ -8,13 +8,20 @@ from enum import Enum
 from functools import total_ordering
 from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Type, Union
 
-from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedSourceError, SchemaInferenceError
+from airbyte_cdk.sources.file_based.exceptions import (
+    ConfigValidationError,
+    FileBasedSourceError,
+    SchemaInferenceError,
+)
 
 JsonSchemaSupportedType = Union[List[str], Literal["string"], str]
 SchemaType = Mapping[str, Mapping[str, JsonSchemaSupportedType]]
 
 schemaless_schema = {"type": "object", "properties": {"data": {"type": "object"}}}
-file_transfer_schema = {"type": "object", "properties": {"data": {"type": "object"}, "file": {"type": "object"}}}
+file_transfer_schema = {
+    "type": "object",
+    "properties": {"data": {"type": "object"}, "file": {"type": "object"}},
+}
 
 
 @total_ordering
@@ -129,7 +136,12 @@ def _choose_wider_type(key: str, t1: Mapping[str, Any], t2: Mapping[str, Any]) -
             detected_types=f"{t1},{t2}",
         )
     # Schemas can still be merged if a key contains a null value in either t1 or t2, but it is still an object
-    elif (t1_type == "object" or t2_type == "object") and t1_type != "null" and t2_type != "null" and t1 != t2:
+    elif (
+        (t1_type == "object" or t2_type == "object")
+        and t1_type != "null"
+        and t2_type != "null"
+        and t1 != t2
+    ):
         raise SchemaInferenceError(
             FileBasedSourceError.SCHEMA_INFERENCE_ERROR,
             details="Cannot merge schema for unequal object types.",
@@ -137,12 +149,19 @@ def _choose_wider_type(key: str, t1: Mapping[str, Any], t2: Mapping[str, Any]) -
             detected_types=f"{t1},{t2}",
         )
     else:
-        comparable_t1 = get_comparable_type(TYPE_PYTHON_MAPPING[t1_type][0])  # accessing the type_mapping value
-        comparable_t2 = get_comparable_type(TYPE_PYTHON_MAPPING[t2_type][0])  # accessing the type_mapping value
+        comparable_t1 = get_comparable_type(
+            TYPE_PYTHON_MAPPING[t1_type][0]
+        )  # accessing the type_mapping value
+        comparable_t2 = get_comparable_type(
+            TYPE_PYTHON_MAPPING[t2_type][0]
+        )  # accessing the type_mapping value
        if not comparable_t1 and comparable_t2:
-            raise SchemaInferenceError(FileBasedSourceError.UNRECOGNIZED_TYPE, key=key, detected_types=f"{t1},{t2}")
+            raise SchemaInferenceError(
+                FileBasedSourceError.UNRECOGNIZED_TYPE, key=key, detected_types=f"{t1},{t2}"
+            )
         return max(
-            [t1, t2], key=lambda x: ComparableType(get_comparable_type(TYPE_PYTHON_MAPPING[x["type"]][0]))
+            [t1, t2],
+            key=lambda x: ComparableType(get_comparable_type(TYPE_PYTHON_MAPPING[x["type"]][0])),
         )  # accessing the type_mapping value
 
 
@@ -205,7 +224,8 @@ def _parse_json_input(input_schema: Union[str, Mapping[str, str]]) -> Optional[M
             schema = input_schema
         if not all(isinstance(s, str) for s in schema.values()):
             raise ConfigValidationError(
-                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA, details="Invalid input schema; nested schemas are not supported."
+                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA,
+                details="Invalid input schema; nested schemas are not supported.",
             )
 
     except json.decoder.JSONDecodeError:
@@ -214,7 +234,9 @@
     return schema
 
 
-def type_mapping_to_jsonschema(input_schema: Optional[Union[str, Mapping[str, str]]]) -> Optional[Mapping[str, Any]]:
+def type_mapping_to_jsonschema(
+    input_schema: Optional[Union[str, Mapping[str, str]]],
+) -> Optional[Mapping[str, Any]]:
     """
     Return the user input schema (type mapping), transformed to JSON Schema format.
 
@@ -241,7 +263,8 @@ def type_mapping_to_jsonschema(input_schema: Optional[Union[str, Mapping[str, st
 
         if not _json_schema_type:
             raise ConfigValidationError(
-                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA, details=f"Invalid type '{type_name}' for property '{col_name}'."
+                FileBasedSourceError.ERROR_PARSING_USER_PROVIDED_SCHEMA,
+                details=f"Invalid type '{type_name}' for property '{col_name}'.",
             )
 
         json_schema_type = _json_schema_type[0]
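The `max(...)` call reformatted above relies on `ComparableType` being a totally ordered enum, so the "wider" of two inferred column types is simply the larger one. A runnable sketch of that mechanism (the member names mirror the CDK's `ComparableType`, but this is an illustration, not the package code):

```python
from enum import Enum
from functools import total_ordering


@total_ordering
class ComparableType(Enum):
    NULL = 0
    BOOLEAN = 1
    INTEGER = 2
    NUMBER = 3
    STRING = 4

    def __lt__(self, other):
        # @total_ordering derives >, <=, >= from this plus Enum's __eq__.
        if self.__class__ is other.__class__:
            return self.value < other.value
        return NotImplemented


# `max` now picks the wider of two inferred column types.
print(max(ComparableType.INTEGER, ComparableType.STRING))  # ComparableType.STRING
```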
airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py

@@ -11,7 +11,9 @@ class AbstractSchemaValidationPolicy(ABC):
     validate_schema_before_sync = False  # Whether to verify that records conform to the schema during the stream's availabilty check
 
     @abstractmethod
-    def record_passes_validation_policy(self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool:
+    def record_passes_validation_policy(
+        self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]
+    ) -> bool:
         """
         Return True if the record passes the user's validation policy.
         """
airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py

@@ -5,7 +5,10 @@
 from typing import Any, Mapping, Optional
 
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import ValidationPolicy
-from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, StopSyncPerValidationPolicy
+from airbyte_cdk.sources.file_based.exceptions import (
+    FileBasedSourceError,
+    StopSyncPerValidationPolicy,
+)
 from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema
 from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSchemaValidationPolicy
 
@@ -13,14 +16,18 @@ from airbyte_cdk.sources.file_based.schema_validation_policies import AbstractSc
 class EmitRecordPolicy(AbstractSchemaValidationPolicy):
     name = "emit_record"
 
-    def record_passes_validation_policy(self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool:
+    def record_passes_validation_policy(
+        self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]
+    ) -> bool:
         return True
 
 
 class SkipRecordPolicy(AbstractSchemaValidationPolicy):
     name = "skip_record"
 
-    def record_passes_validation_policy(self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool:
+    def record_passes_validation_policy(
+        self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]
+    ) -> bool:
         return schema is not None and conforms_to_schema(record, schema)
 
 
@@ -28,9 +35,13 @@ class WaitForDiscoverPolicy(AbstractSchemaValidationPolicy):
     name = "wait_for_discover"
     validate_schema_before_sync = True
 
-    def record_passes_validation_policy(self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool:
+    def record_passes_validation_policy(
+        self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]
+    ) -> bool:
         if schema is None or not conforms_to_schema(record, schema):
-            raise StopSyncPerValidationPolicy(FileBasedSourceError.STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY)
+            raise StopSyncPerValidationPolicy(
+                FileBasedSourceError.STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY
+            )
         return True
 
 
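The three policies above share one interface and differ only in what a failed check means: emit anyway, skip the record, or abort the sync. A self-contained sketch of the skip variant, with a hypothetical `conforms` helper standing in for the CDK's `conforms_to_schema`:

```python
from typing import Any, Mapping, Optional


def conforms(record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]) -> bool:
    # Stand-in check: every record key must be a declared property.
    return schema is not None and set(record) <= set(schema.get("properties", {}))


class SkipRecordPolicy:
    name = "skip_record"

    def record_passes_validation_policy(
        self, record: Mapping[str, Any], schema: Optional[Mapping[str, Any]]
    ) -> bool:
        return conforms(record, schema)


schema = {"type": "object", "properties": {"id": {"type": "integer"}}}
policy = SkipRecordPolicy()
print(policy.record_passes_validation_policy({"id": 1}, schema))    # True
print(policy.record_passes_validation_policy({"oops": 2}, schema))  # False
```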
airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py

@@ -8,10 +8,20 @@ from typing import Any, Dict, Iterable, List, Mapping, Optional, Type
 
 from airbyte_cdk import AirbyteMessage
 from airbyte_cdk.models import SyncMode
-from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
-from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, PrimaryKeyType
+from airbyte_cdk.sources.file_based.availability_strategy import (
+    AbstractFileBasedAvailabilityStrategy,
+)
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
+    FileBasedStreamConfig,
+    PrimaryKeyType,
+)
 from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
-from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector, FileBasedSourceError, RecordParseError, UndefinedParserError
+from airbyte_cdk.sources.file_based.exceptions import (
+    FileBasedErrorsCollector,
+    FileBasedSourceError,
+    RecordParseError,
+    UndefinedParserError,
+)
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
@@ -64,8 +74,7 @@ class AbstractFileBasedStream(Stream):
 
     @property
     @abstractmethod
-    def primary_key(self) -> PrimaryKeyType:
-        ...
+    def primary_key(self) -> PrimaryKeyType: ...
 
     @cache
     def list_files(self) -> List[RemoteFile]:
@@ -102,14 +111,20 @@
         return self.read_records_from_slice(stream_slice)
 
     @abstractmethod
-    def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[Mapping[str, Any] | AirbyteMessage]:
+    def read_records_from_slice(
+        self, stream_slice: StreamSlice
+    ) -> Iterable[Mapping[str, Any] | AirbyteMessage]:
         """
         Yield all records from all remote files in `list_files_for_this_sync`.
         """
         ...
 
     def stream_slices(
-        self, *, sync_mode: SyncMode, cursor_field: Optional[List[str]] = None, stream_state: Optional[Mapping[str, Any]] = None
+        self,
+        *,
+        sync_mode: SyncMode,
+        cursor_field: Optional[List[str]] = None,
+        stream_state: Optional[Mapping[str, Any]] = None,
     ) -> Iterable[Optional[Mapping[str, Any]]]:
         """
         This method acts as an adapter between the generic Stream interface and the file-based's
@@ -144,14 +159,22 @@
         try:
             return self._parsers[type(self.config.format)]
         except KeyError:
-            raise UndefinedParserError(FileBasedSourceError.UNDEFINED_PARSER, stream=self.name, format=type(self.config.format))
+            raise UndefinedParserError(
+                FileBasedSourceError.UNDEFINED_PARSER,
+                stream=self.name,
+                format=type(self.config.format),
+            )
 
     def record_passes_validation_policy(self, record: Mapping[str, Any]) -> bool:
         if self.validation_policy:
-            return self.validation_policy.record_passes_validation_policy(record=record, schema=self.catalog_schema)
+            return self.validation_policy.record_passes_validation_policy(
+                record=record, schema=self.catalog_schema
+            )
         else:
             raise RecordParseError(
-                FileBasedSourceError.UNDEFINED_VALIDATION_POLICY, stream=self.name, validation_policy=self.config.validation_policy
+                FileBasedSourceError.UNDEFINED_VALIDATION_POLICY,
+                stream=self.name,
+                validation_policy=self.config.validation_policy,
             )
 
     @cached_property
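One detail visible in the stream above: `list_files` is wrapped in `functools.cache`, so the remote listing runs once per stream instance and later calls reuse the memoized result. A minimal sketch of that behavior (the class and file names are illustrative):

```python
from functools import cache


class FileLister:
    @cache
    def list_files(self) -> list:
        print("listing remote files (expensive, runs once per instance)")
        return ["a.parquet", "b.parquet"]


lister = FileLister()
lister.list_files()  # performs the listing
lister.list_files()  # served from the cache; note @cache keeps `self` alive
```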
airbyte_cdk/sources/file_based/stream/concurrent/adapters.py

@@ -7,7 +7,14 @@ import logging
 from functools import cache, lru_cache
 from typing import TYPE_CHECKING, Any, Iterable, List, Mapping, MutableMapping, Optional, Union
 
-from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, ConfiguredAirbyteStream, Level, SyncMode, Type
+from airbyte_cdk.models import (
+    AirbyteLogMessage,
+    AirbyteMessage,
+    ConfiguredAirbyteStream,
+    Level,
+    SyncMode,
+    Type,
+)
 from airbyte_cdk.sources import AbstractSource
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.file_based.availability_strategy import (
@@ -26,7 +33,10 @@ from airbyte_cdk.sources.source import ExperimentalClassWarning
 from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
 from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
 from airbyte_cdk.sources.streams.concurrent.exceptions import ExceptionWithDisplayMessage
-from airbyte_cdk.sources.streams.concurrent.helpers import get_cursor_field_from_stream, get_primary_key_from_stream
+from airbyte_cdk.sources.streams.concurrent.helpers import (
+    get_cursor_field_from_stream,
+    get_primary_key_from_stream,
+)
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
@@ -36,7 +46,9 @@ from airbyte_cdk.sources.utils.slice_logger import SliceLogger
 from deprecated.classic import deprecated
 
 if TYPE_CHECKING:
-    from airbyte_cdk.sources.file_based.stream.concurrent.cursor import AbstractConcurrentFileBasedCursor
+    from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
+        AbstractConcurrentFileBasedCursor,
+    )
 
 """
 This module contains adapters to help enabling concurrency on File-based Stream objects without needing to migrate to AbstractStream
@@ -72,7 +84,9 @@ class FileBasedStreamFacade(AbstractStreamFacade[DefaultStream], AbstractFileBas
                 partition_generator=FileBasedStreamPartitionGenerator(
                     stream,
                     message_repository,
-                    SyncMode.full_refresh if isinstance(cursor, FileBasedFinalStateCursor) else SyncMode.incremental,
+                    SyncMode.full_refresh
+                    if isinstance(cursor, FileBasedFinalStateCursor)
+                    else SyncMode.incremental,
                    [cursor_field] if cursor_field is not None else None,
                    state,
                    cursor,
@@ -138,7 +152,10 @@
 
     @property
     def primary_key(self) -> PrimaryKeyType:
-        return self._legacy_stream.config.primary_key or self.get_parser().get_parser_defined_primary_key(self._legacy_stream.config)
+        return (
+            self._legacy_stream.config.primary_key
+            or self.get_parser().get_parser_defined_primary_key(self._legacy_stream.config)
+        )
 
     def get_parser(self) -> FileTypeParser:
         return self._legacy_stream.get_parser()
@@ -185,7 +202,10 @@
                     # This shouldn't happen if the ConcurrentCursor was used
                     state = "unknown; no state attribute was available on the cursor"
                 yield AirbyteMessage(
-                    type=Type.LOG, log=AirbyteLogMessage(level=Level.ERROR, message=f"Cursor State at time of exception: {state}")
+                    type=Type.LOG,
+                    log=AirbyteLogMessage(
+                        level=Level.ERROR, message=f"Cursor State at time of exception: {state}"
+                    ),
                 )
                 raise exc
 
@@ -227,16 +247,30 @@ class FileBasedStreamPartition(Partition):
             ):
                 if isinstance(record_data, Mapping):
                     data_to_return = dict(record_data)
-                    self._stream.transformer.transform(data_to_return, self._stream.get_json_schema())
+                    self._stream.transformer.transform(
+                        data_to_return, self._stream.get_json_schema()
+                    )
                     yield Record(data_to_return, self)
-                elif isinstance(record_data, AirbyteMessage) and record_data.type == Type.RECORD and record_data.record is not None:
+                elif (
+                    isinstance(record_data, AirbyteMessage)
+                    and record_data.type == Type.RECORD
+                    and record_data.record is not None
+                ):
                     # `AirbyteMessage`s of type `Record` should also be yielded so they are enqueued
                     # If stream is flagged for file_transfer the record should data in file key
-                    record_message_data = record_data.record.file if self._use_file_transfer() else record_data.record.data
+                    record_message_data = (
+                        record_data.record.file
+                        if self._use_file_transfer()
+                        else record_data.record.data
+                    )
                     if not record_message_data:
                         raise ExceptionWithDisplayMessage("A record without data was found")
                     else:
-                        yield Record(data=record_message_data, partition=self, is_file_transfer_message=self._use_file_transfer())
+                        yield Record(
+                            data=record_message_data,
+                            partition=self,
+                            is_file_transfer_message=self._use_file_transfer(),
+                        )
                 else:
                     self._message_repository.emit_message(record_data)
         except Exception as e:
@@ -305,7 +339,9 @@ class FileBasedStreamPartitionGenerator(PartitionGenerator):
 
     def generate(self) -> Iterable[FileBasedStreamPartition]:
         pending_partitions = []
-        for _slice in self._stream.stream_slices(sync_mode=self._sync_mode, cursor_field=self._cursor_field, stream_state=self._state):
+        for _slice in self._stream.stream_slices(
+            sync_mode=self._sync_mode, cursor_field=self._cursor_field, stream_state=self._state
+        ):
             if _slice is not None:
                 for file in _slice.get("files", []):
                     pending_partitions.append(