airbyte-cdk 0.72.0__py3-none-any.whl → 6.13.1.dev4106__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (517) hide show
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +421 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1213 -88
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +65 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +25 -3
  79. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  80. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  81. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  82. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +159 -74
  83. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  84. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  85. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  86. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  91. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  92. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  93. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  94. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  95. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  96. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  97. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1329 -595
  98. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  99. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  100. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  101. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1699 -226
  102. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  103. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  104. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  105. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  106. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  107. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  108. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  109. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  122. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  123. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  124. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  125. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  126. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  127. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  134. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  135. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  136. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  140. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  141. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  142. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  143. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  144. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  145. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  146. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  147. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  148. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  149. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +228 -72
  150. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  151. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  152. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  153. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  154. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  155. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  156. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  157. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  158. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  159. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  160. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  163. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  164. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  165. airbyte_cdk/sources/declarative/types.py +19 -110
  166. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  167. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  168. airbyte_cdk/sources/embedded/catalog.py +16 -4
  169. airbyte_cdk/sources/embedded/runner.py +19 -3
  170. airbyte_cdk/sources/embedded/tools.py +5 -2
  171. airbyte_cdk/sources/file_based/README.md +152 -0
  172. airbyte_cdk/sources/file_based/__init__.py +24 -0
  173. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  174. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  175. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  176. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +58 -10
  177. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  178. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  179. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  180. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  181. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  182. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  184. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  185. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  186. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  187. airbyte_cdk/sources/file_based/exceptions.py +52 -15
  188. airbyte_cdk/sources/file_based/file_based_source.py +163 -33
  189. airbyte_cdk/sources/file_based/file_based_stream_reader.py +83 -5
  190. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  191. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  192. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  193. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  194. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  195. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  196. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  197. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  198. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +145 -41
  199. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  200. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  201. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  202. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  204. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  205. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  206. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  210. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  211. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  212. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +175 -45
  213. airbyte_cdk/sources/http_logger.py +8 -3
  214. airbyte_cdk/sources/message/__init__.py +7 -1
  215. airbyte_cdk/sources/message/repository.py +18 -4
  216. airbyte_cdk/sources/source.py +42 -38
  217. airbyte_cdk/sources/streams/__init__.py +2 -2
  218. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  219. airbyte_cdk/sources/streams/call_rate.py +64 -21
  220. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  221. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  222. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  223. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  224. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  225. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  226. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  227. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  228. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  229. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  230. airbyte_cdk/sources/streams/concurrent/cursor.py +298 -42
  231. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  232. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  233. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  234. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  236. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  237. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  238. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  239. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  240. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  241. airbyte_cdk/sources/streams/core.py +412 -87
  242. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  243. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  244. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  245. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  253. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  254. airbyte_cdk/sources/streams/http/http.py +369 -246
  255. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  256. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  257. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  259. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  260. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  261. airbyte_cdk/sources/types.py +154 -0
  262. airbyte_cdk/sources/utils/record_helper.py +36 -21
  263. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  264. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  265. airbyte_cdk/sources/utils/transform.py +54 -20
  266. airbyte_cdk/sql/_util/hashing.py +34 -0
  267. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  268. airbyte_cdk/sql/constants.py +32 -0
  269. airbyte_cdk/sql/exceptions.py +235 -0
  270. airbyte_cdk/sql/secrets.py +123 -0
  271. airbyte_cdk/sql/shared/__init__.py +15 -0
  272. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  273. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  274. airbyte_cdk/sql/types.py +160 -0
  275. airbyte_cdk/test/catalog_builder.py +70 -18
  276. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  277. airbyte_cdk/test/mock_http/__init__.py +1 -1
  278. airbyte_cdk/test/mock_http/matcher.py +6 -0
  279. airbyte_cdk/test/mock_http/mocker.py +57 -10
  280. airbyte_cdk/test/mock_http/request.py +19 -3
  281. airbyte_cdk/test/mock_http/response.py +3 -1
  282. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  283. airbyte_cdk/test/state_builder.py +18 -10
  284. airbyte_cdk/test/utils/__init__.py +1 -0
  285. airbyte_cdk/test/utils/data.py +24 -0
  286. airbyte_cdk/test/utils/http_mocking.py +16 -0
  287. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  288. airbyte_cdk/test/utils/reading.py +26 -0
  289. airbyte_cdk/utils/__init__.py +2 -1
  290. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  291. airbyte_cdk/utils/analytics_message.py +10 -2
  292. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  293. airbyte_cdk/utils/event_timing.py +10 -10
  294. airbyte_cdk/utils/mapping_helpers.py +3 -1
  295. airbyte_cdk/utils/message_utils.py +20 -11
  296. airbyte_cdk/utils/print_buffer.py +75 -0
  297. airbyte_cdk/utils/schema_inferrer.py +198 -28
  298. airbyte_cdk/utils/slice_hasher.py +30 -0
  299. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  300. airbyte_cdk/utils/stream_status_utils.py +8 -1
  301. airbyte_cdk/utils/traced_exception.py +61 -21
  302. airbyte_cdk-6.13.1.dev4106.dist-info/METADATA +109 -0
  303. airbyte_cdk-6.13.1.dev4106.dist-info/RECORD +349 -0
  304. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/WHEEL +1 -2
  305. airbyte_cdk-6.13.1.dev4106.dist-info/entry_points.txt +3 -0
  306. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  307. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  308. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  309. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  311. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  312. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  313. airbyte_cdk/sources/deprecated/client.py +0 -99
  314. airbyte_cdk/sources/singer/__init__.py +0 -8
  315. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  316. airbyte_cdk/sources/singer/source.py +0 -186
  317. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  318. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  319. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  320. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  321. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  322. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  323. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  324. airbyte_cdk/sources/utils/schema_models.py +0 -84
  325. airbyte_cdk-0.72.0.dist-info/METADATA +0 -243
  326. airbyte_cdk-0.72.0.dist-info/RECORD +0 -466
  327. airbyte_cdk-0.72.0.dist-info/top_level.txt +0 -3
  328. source_declarative_manifest/main.py +0 -29
  329. unit_tests/connector_builder/__init__.py +0 -3
  330. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  331. unit_tests/connector_builder/test_message_grouper.py +0 -713
  332. unit_tests/connector_builder/utils.py +0 -27
  333. unit_tests/destinations/test_destination.py +0 -243
  334. unit_tests/singer/test_singer_helpers.py +0 -56
  335. unit_tests/singer/test_singer_source.py +0 -112
  336. unit_tests/sources/__init__.py +0 -0
  337. unit_tests/sources/concurrent_source/__init__.py +0 -3
  338. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  339. unit_tests/sources/declarative/__init__.py +0 -3
  340. unit_tests/sources/declarative/auth/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  342. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  343. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  344. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  345. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  346. unit_tests/sources/declarative/checks/__init__.py +0 -3
  347. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  348. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  349. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  350. unit_tests/sources/declarative/external_component.py +0 -13
  351. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  352. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  353. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  355. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  356. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  357. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  359. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  360. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  361. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  362. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  363. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  364. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  365. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  366. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  367. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  368. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  369. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  370. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1841
  371. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  372. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  373. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  374. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  375. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  376. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  377. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  384. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  385. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  386. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  387. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  388. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  389. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  390. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  391. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  392. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  393. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  394. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  395. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  396. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  397. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  398. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  399. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  400. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  401. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  402. unit_tests/sources/declarative/schema/__init__.py +0 -6
  403. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  404. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  405. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  406. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  407. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  408. unit_tests/sources/declarative/states/__init__.py +0 -3
  409. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  411. unit_tests/sources/declarative/test_create_partial.py +0 -83
  412. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  413. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  414. unit_tests/sources/declarative/test_types.py +0 -39
  415. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  416. unit_tests/sources/file_based/__init__.py +0 -0
  417. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  419. unit_tests/sources/file_based/config/__init__.py +0 -0
  420. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  421. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  422. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  423. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  424. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  425. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  426. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  427. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  428. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  429. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  430. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  431. unit_tests/sources/file_based/helpers.py +0 -70
  432. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  433. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  434. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  435. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  436. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  437. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  438. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  439. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  440. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  441. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  442. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  443. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  444. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  445. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  446. unit_tests/sources/file_based/stream/__init__.py +0 -0
  447. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  449. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  450. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  451. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  452. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  453. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  454. unit_tests/sources/file_based/test_scenarios.py +0 -253
  455. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  456. unit_tests/sources/fixtures/__init__.py +0 -3
  457. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  458. unit_tests/sources/message/__init__.py +0 -0
  459. unit_tests/sources/message/test_repository.py +0 -153
  460. unit_tests/sources/streams/__init__.py +0 -0
  461. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  462. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  464. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  466. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  467. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  469. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  470. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  471. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  472. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  473. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  474. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  475. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  476. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  477. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  478. unit_tests/sources/streams/http/__init__.py +0 -0
  479. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  481. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  482. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  483. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  484. unit_tests/sources/streams/http/test_http.py +0 -635
  485. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  486. unit_tests/sources/streams/test_call_rate.py +0 -300
  487. unit_tests/sources/streams/test_stream_read.py +0 -405
  488. unit_tests/sources/streams/test_streams_core.py +0 -184
  489. unit_tests/sources/test_abstract_source.py +0 -1442
  490. unit_tests/sources/test_concurrent_source.py +0 -112
  491. unit_tests/sources/test_config.py +0 -92
  492. unit_tests/sources/test_connector_state_manager.py +0 -482
  493. unit_tests/sources/test_http_logger.py +0 -252
  494. unit_tests/sources/test_integration_source.py +0 -86
  495. unit_tests/sources/test_source.py +0 -684
  496. unit_tests/sources/test_source_read.py +0 -460
  497. unit_tests/test/__init__.py +0 -0
  498. unit_tests/test/mock_http/__init__.py +0 -0
  499. unit_tests/test/mock_http/test_matcher.py +0 -53
  500. unit_tests/test/mock_http/test_mocker.py +0 -214
  501. unit_tests/test/mock_http/test_request.py +0 -117
  502. unit_tests/test/mock_http/test_response_builder.py +0 -177
  503. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  504. unit_tests/utils/__init__.py +0 -0
  505. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  506. unit_tests/utils/test_mapping_helpers.py +0 -54
  507. unit_tests/utils/test_message_utils.py +0 -91
  508. unit_tests/utils/test_rate_limiting.py +0 -26
  509. unit_tests/utils/test_schema_inferrer.py +0 -202
  510. unit_tests/utils/test_secret_utils.py +0 -135
  511. unit_tests/utils/test_stream_status_utils.py +0 -61
  512. unit_tests/utils/test_traced_exception.py +0 -107
  513. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  514. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  515. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  516. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  517. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/LICENSE.txt +0 -0
@@ -11,26 +11,21 @@ from airbyte_cdk.utils import AirbyteTracedException
11
11
 
12
12
  class FileBasedSourceError(Enum):
13
13
  EMPTY_STREAM = "No files were identified in the stream. This may be because there are no files in the specified container, or because your glob patterns did not match any files. Please verify that your source contains files last modified after the start_date and that your glob patterns are not overly strict."
14
- GLOB_PARSE_ERROR = (
15
- "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
16
- )
14
+ GLOB_PARSE_ERROR = "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
15
+ ENCODING_ERROR = "File encoding error. The configured encoding must match file encoding."
17
16
  ERROR_CASTING_VALUE = "Could not cast the value to the expected type."
18
17
  ERROR_CASTING_VALUE_UNRECOGNIZED_TYPE = "Could not cast the value to the expected type because the type is not recognized. Valid types are null, array, boolean, integer, number, object, and string."
19
18
  ERROR_DECODING_VALUE = "Expected a JSON-decodeable value but could not decode record."
20
- ERROR_LISTING_FILES = (
21
- "Error listing files. Please check the credentials provided in the config and verify that they provide permission to list files."
22
- )
23
- ERROR_READING_FILE = (
24
- "Error opening file. Please check the credentials provided in the config and verify that they provide permission to read files."
25
- )
19
+ ERROR_LISTING_FILES = "Error listing files. Please check the credentials provided in the config and verify that they provide permission to list files."
20
+ ERROR_READING_FILE = "Error opening file. Please check the credentials provided in the config and verify that they provide permission to read files."
26
21
  ERROR_PARSING_RECORD = "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable."
27
- ERROR_PARSING_USER_PROVIDED_SCHEMA = "The provided schema could not be transformed into valid JSON Schema."
22
+ ERROR_PARSING_USER_PROVIDED_SCHEMA = (
23
+ "The provided schema could not be transformed into valid JSON Schema."
24
+ )
28
25
  ERROR_VALIDATING_RECORD = "One or more records do not pass the schema validation policy. Please modify your input schema, or select a more lenient validation policy."
29
26
  ERROR_PARSING_RECORD_MISMATCHED_COLUMNS = "A header field has resolved to `None`. This indicates that the CSV has more rows than the number of header fields. If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows."
30
27
  ERROR_PARSING_RECORD_MISMATCHED_ROWS = "A row's value has resolved to `None`. This indicates that the CSV has more columns in the header field than the number of columns in the row(s). If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows."
31
- STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = (
32
- "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema."
33
- )
28
+ STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema."
34
29
  NULL_VALUE_IN_SCHEMA = "Error during schema inference: no type was detected for key."
35
30
  UNRECOGNIZED_TYPE = "Error during schema inference: unrecognized type."
36
31
  SCHEMA_INFERENCE_ERROR = "Error inferring schema from files. Are the files valid?"
@@ -38,7 +33,9 @@ class FileBasedSourceError(Enum):
38
33
  CONFIG_VALIDATION_ERROR = "Error creating stream config object."
39
34
  MISSING_SCHEMA = "Expected `json_schema` in the configured catalog but it is missing."
40
35
  UNDEFINED_PARSER = "No parser is defined for this file type."
41
- UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
36
+ UNDEFINED_VALIDATION_POLICY = (
37
+ "The validation policy defined in the config does not exist for the source."
38
+ )
42
39
 
43
40
 
44
41
  class FileBasedErrorsCollector:
@@ -69,7 +66,9 @@ class BaseFileBasedSourceError(Exception):
69
66
  def __init__(self, error: Union[FileBasedSourceError, str], **kwargs): # type: ignore # noqa
70
67
  if isinstance(error, FileBasedSourceError):
71
68
  error = FileBasedSourceError(error).value
72
- super().__init__(f"{error} Contact Support if you need assistance.\n{' '.join([f'{k}={v}' for k, v in kwargs.items()])}")
69
+ super().__init__(
70
+ f"{error} Contact Support if you need assistance.\n{' '.join([f'{k}={v}' for k, v in kwargs.items()])}"
71
+ )
73
72
 
74
73
 
75
74
  class ConfigValidationError(BaseFileBasedSourceError):
@@ -112,6 +111,40 @@ class ErrorListingFiles(BaseFileBasedSourceError):
112
111
  pass
113
112
 
114
113
 
class DuplicatedFilesError(BaseFileBasedSourceError):
    """Error describing files in one stream that share the same file name.

    Args:
        duplicated_files_names: List of mappings, each from a duplicated file
            name to the list of paths of the files carrying that name.
        **kwargs: Forwarded to ``BaseFileBasedSourceError``. Must contain the
            key ``"stream"``, whose value is used as the stream name in the
            error message.

    Raises:
        KeyError: If ``stream`` is not supplied as a keyword argument.
    """

    def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
        self._duplicated_files_names = duplicated_files_names
        self._stream_name: str = kwargs["stream"]
        super().__init__(self._format_duplicate_files_error_message(), **kwargs)

    def _format_duplicate_files_error_message(self) -> str:
        """Build the message listing every duplicated file name and its paths."""
        duplicated_files_messages = []
        for duplicated_file in self._duplicated_files_names:
            for duplicated_file_name, file_paths in duplicated_file.items():
                # Emit one bullet line per path. Joining a single f-string that
                # embeds the whole list would print the list's repr instead.
                file_duplicated_message = (
                    f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
                    + "".join(f"\n - {file_path}" for file_path in file_paths)
                )
                duplicated_files_messages.append(file_duplicated_message)

        error_message = (
            f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
            "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
            "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
            + "\n".join(duplicated_files_messages)
        )

        return error_message

    def __repr__(self) -> str:
        """Return a string representation of the exception.

        Only instance attributes whose names do not start with an underscore
        are included; the attributes assigned in ``__init__`` are underscore
        prefixed and are therefore omitted.
        """
        class_name = self.__class__.__name__
        properties_str = ", ".join(
            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
        )
        return f"{class_name}({properties_str})"
146
+
147
+
115
148
  class CustomFileBasedException(AirbyteTracedException):
116
149
  """
117
150
  A specialized exception for file-based connectors.
@@ -120,3 +153,7 @@ class CustomFileBasedException(AirbyteTracedException):
120
153
  """
121
154
 
122
155
  pass
156
+
157
+
class FileSizeLimitError(CustomFileBasedException):
    """Marker subclass of ``CustomFileBasedException`` that adds no behavior.

    NOTE(review): the name suggests it signals a file exceeding a size limit;
    confirm against the code that raises it.
    """
@@ -6,7 +6,9 @@ import logging
6
6
  import traceback
7
7
  from abc import ABC
8
8
  from collections import Counter
9
- from typing import Any, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Type, Union
9
+ from typing import Any, Iterator, List, Mapping, Optional, Tuple, Type, Union
10
+
11
+ from pydantic.v1.error_wrappers import ValidationError
10
12
 
11
13
  from airbyte_cdk.logger import AirbyteLogFormatter, init_logger
12
14
  from airbyte_cdk.models import (
@@ -22,15 +24,31 @@ from airbyte_cdk.models import (
22
24
  from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
23
25
  from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
24
26
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
25
- from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy, DefaultFileBasedAvailabilityStrategy
27
+ from airbyte_cdk.sources.file_based.availability_strategy import (
28
+ AbstractFileBasedAvailabilityStrategy,
29
+ DefaultFileBasedAvailabilityStrategy,
30
+ )
26
31
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
27
- from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
28
- from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
29
- from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedErrorsCollector, FileBasedSourceError
32
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
33
+ FileBasedStreamConfig,
34
+ ValidationPolicy,
35
+ )
36
+ from airbyte_cdk.sources.file_based.discovery_policy import (
37
+ AbstractDiscoveryPolicy,
38
+ DefaultDiscoveryPolicy,
39
+ )
40
+ from airbyte_cdk.sources.file_based.exceptions import (
41
+ ConfigValidationError,
42
+ FileBasedErrorsCollector,
43
+ FileBasedSourceError,
44
+ )
30
45
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
31
46
  from airbyte_cdk.sources.file_based.file_types import default_parsers
32
47
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
33
- from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
48
+ from airbyte_cdk.sources.file_based.schema_validation_policies import (
49
+ DEFAULT_SCHEMA_VALIDATION_POLICIES,
50
+ AbstractSchemaValidationPolicy,
51
+ )
34
52
  from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
35
53
  from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
36
54
  from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
@@ -44,7 +62,6 @@ from airbyte_cdk.sources.streams import Stream
44
62
  from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
45
63
  from airbyte_cdk.utils.analytics_message import create_analytics_message
46
64
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
47
- from pydantic.error_wrappers import ValidationError
48
65
 
49
66
  DEFAULT_CONCURRENCY = 100
50
67
  MAX_CONCURRENCY = 100
@@ -61,29 +78,41 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
61
78
  spec_class: Type[AbstractFileBasedSpec],
62
79
  catalog: Optional[ConfiguredAirbyteCatalog],
63
80
  config: Optional[Mapping[str, Any]],
64
- state: Optional[MutableMapping[str, Any]],
81
+ state: Optional[List[AirbyteStateMessage]],
65
82
  availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None,
66
83
  discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
67
84
  parsers: Mapping[Type[Any], FileTypeParser] = default_parsers,
68
- validation_policies: Mapping[ValidationPolicy, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
69
- cursor_cls: Type[Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]] = FileBasedConcurrentCursor,
85
+ validation_policies: Mapping[
86
+ ValidationPolicy, AbstractSchemaValidationPolicy
87
+ ] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
88
+ cursor_cls: Type[
89
+ Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]
90
+ ] = FileBasedConcurrentCursor,
70
91
  ):
71
92
  self.stream_reader = stream_reader
72
93
  self.spec_class = spec_class
73
94
  self.config = config
74
95
  self.catalog = catalog
75
96
  self.state = state
76
- self.availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(stream_reader)
97
+ self.availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(
98
+ stream_reader
99
+ )
77
100
  self.discovery_policy = discovery_policy
78
101
  self.parsers = parsers
79
102
  self.validation_policies = validation_policies
80
- self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
103
+ self.stream_schemas = (
104
+ {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
105
+ )
81
106
  self.cursor_cls = cursor_cls
82
107
  self.logger = init_logger(f"airbyte.{self.name}")
83
108
  self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
84
109
  self._message_repository: Optional[MessageRepository] = None
85
110
  concurrent_source = ConcurrentSource.create(
86
- MAX_CONCURRENCY, INITIAL_N_PARTITIONS, self.logger, self._slice_logger, self.message_repository
111
+ MAX_CONCURRENCY,
112
+ INITIAL_N_PARTITIONS,
113
+ self.logger,
114
+ self._slice_logger,
115
+ self.message_repository,
87
116
  )
88
117
  self._state = None
89
118
  super().__init__(concurrent_source)
@@ -91,10 +120,14 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
91
120
  @property
92
121
  def message_repository(self) -> MessageRepository:
93
122
  if self._message_repository is None:
94
- self._message_repository = InMemoryMessageRepository(Level(AirbyteLogFormatter.level_mapping[self.logger.level]))
123
+ self._message_repository = InMemoryMessageRepository(
124
+ Level(AirbyteLogFormatter.level_mapping[self.logger.level])
125
+ )
95
126
  return self._message_repository
96
127
 
97
- def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
128
+ def check_connection(
129
+ self, logger: logging.Logger, config: Mapping[str, Any]
130
+ ) -> Tuple[bool, Optional[Any]]:
98
131
  """
99
132
  Check that the source can be accessed using the user-provided configuration.
100
133
 
@@ -122,20 +155,49 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
122
155
  )
123
156
 
124
157
  errors = []
158
+ tracebacks = []
125
159
  for stream in streams:
126
160
  if not isinstance(stream, AbstractFileBasedStream):
127
161
  raise ValueError(f"Stream {stream} is not a file-based stream.")
128
162
  try:
163
+ parsed_config = self._get_parsed_config(config)
164
+ availability_method = (
165
+ stream.availability_strategy.check_availability
166
+ if self._use_file_transfer(parsed_config)
167
+ else stream.availability_strategy.check_availability_and_parsability
168
+ )
129
169
  (
130
170
  stream_is_available,
131
171
  reason,
132
- ) = stream.availability_strategy.check_availability_and_parsability(stream, logger, self)
172
+ ) = availability_method(stream, logger, self)
173
+ except AirbyteTracedException as ate:
174
+ errors.append(f"Unable to connect to stream {stream.name} - {ate.message}")
175
+ tracebacks.append(traceback.format_exc())
133
176
  except Exception:
134
- errors.append(f"Unable to connect to stream {stream.name} - {''.join(traceback.format_exc())}")
177
+ errors.append(f"Unable to connect to stream {stream.name}")
178
+ tracebacks.append(traceback.format_exc())
135
179
  else:
136
180
  if not stream_is_available and reason:
137
181
  errors.append(reason)
138
182
 
183
+ if len(errors) == 1 and len(tracebacks) == 1:
184
+ raise AirbyteTracedException(
185
+ internal_message=tracebacks[0],
186
+ message=f"{errors[0]}",
187
+ failure_type=FailureType.config_error,
188
+ )
189
+ if len(errors) == 1 and len(tracebacks) == 0:
190
+ raise AirbyteTracedException(
191
+ message=f"{errors[0]}",
192
+ failure_type=FailureType.config_error,
193
+ )
194
+ elif len(errors) > 1:
195
+ raise AirbyteTracedException(
196
+ internal_message="\n".join(tracebacks),
197
+ message=f"{len(errors)} streams with errors: {', '.join(error for error in errors)}",
198
+ failure_type=FailureType.config_error,
199
+ )
200
+
139
201
  return not bool(errors), (errors or None)
140
202
 
141
203
  def streams(self, config: Mapping[str, Any]) -> List[Stream]:
@@ -144,10 +206,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
144
206
  """
145
207
 
146
208
  if self.catalog:
147
- state_manager = ConnectorStateManager(
148
- stream_instance_map={s.stream.name: s.stream for s in self.catalog.streams},
149
- state=self.state,
150
- )
209
+ state_manager = ConnectorStateManager(state=self.state)
151
210
  else:
152
211
  # During `check` operations we don't have a catalog so cannot create a state manager.
153
212
  # Since the state manager is only required for incremental syncs, this is fine.
@@ -169,12 +228,26 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
169
228
 
170
229
  sync_mode = self._get_sync_mode_from_catalog(stream_config.name)
171
230
 
172
- if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
231
+ if (
232
+ sync_mode == SyncMode.full_refresh
233
+ and hasattr(self, "_concurrency_level")
234
+ and self._concurrency_level is not None
235
+ ):
173
236
  cursor = FileBasedFinalStateCursor(
174
- stream_config=stream_config, stream_namespace=None, message_repository=self.message_repository
237
+ stream_config=stream_config,
238
+ stream_namespace=None,
239
+ message_repository=self.message_repository,
175
240
  )
176
241
  stream = FileBasedStreamFacade.create_from_stream(
177
- self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
242
+ stream=self._make_default_stream(
243
+ stream_config=stream_config,
244
+ cursor=cursor,
245
+ parsed_config=parsed_config,
246
+ ),
247
+ source=self,
248
+ logger=self.logger,
249
+ state=stream_state,
250
+ cursor=cursor,
178
251
  )
179
252
 
180
253
  elif (
@@ -197,11 +270,23 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
197
270
  CursorField(DefaultFileBasedStream.ab_last_mod_col),
198
271
  )
199
272
  stream = FileBasedStreamFacade.create_from_stream(
200
- self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
273
+ stream=self._make_default_stream(
274
+ stream_config=stream_config,
275
+ cursor=cursor,
276
+ parsed_config=parsed_config,
277
+ ),
278
+ source=self,
279
+ logger=self.logger,
280
+ state=stream_state,
281
+ cursor=cursor,
201
282
  )
202
283
  else:
203
284
  cursor = self.cursor_cls(stream_config)
204
- stream = self._make_default_stream(stream_config, cursor)
285
+ stream = self._make_default_stream(
286
+ stream_config=stream_config,
287
+ cursor=cursor,
288
+ parsed_config=parsed_config,
289
+ )
205
290
 
206
291
  streams.append(stream)
207
292
  return streams
@@ -210,7 +295,10 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
210
295
  raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc
211
296
 
212
297
  def _make_default_stream(
213
- self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor]
298
+ self,
299
+ stream_config: FileBasedStreamConfig,
300
+ cursor: Optional[AbstractFileBasedCursor],
301
+ parsed_config: AbstractFileBasedSpec,
214
302
  ) -> AbstractFileBasedStream:
215
303
  return DefaultFileBasedStream(
216
304
  config=stream_config,
@@ -222,9 +310,13 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
222
310
  validation_policy=self._validate_and_get_validation_policy(stream_config),
223
311
  errors_collector=self.errors_collector,
224
312
  cursor=cursor,
313
+ use_file_transfer=self._use_file_transfer(parsed_config),
314
+ preserve_directory_structure=self._preserve_directory_structure(parsed_config),
225
315
  )
226
316
 
227
- def _get_stream_from_catalog(self, stream_config: FileBasedStreamConfig) -> Optional[AirbyteStream]:
317
+ def _get_stream_from_catalog(
318
+ self, stream_config: FileBasedStreamConfig
319
+ ) -> Optional[AirbyteStream]:
228
320
  if self.catalog:
229
321
  for stream in self.catalog.streams or []:
230
322
  if stream.stream.name == stream_config.name:
@@ -244,14 +336,16 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
244
336
  logger: logging.Logger,
245
337
  config: Mapping[str, Any],
246
338
  catalog: ConfiguredAirbyteCatalog,
247
- state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
339
+ state: Optional[List[AirbyteStateMessage]] = None,
248
340
  ) -> Iterator[AirbyteMessage]:
249
341
  yield from super().read(logger, config, catalog, state)
250
342
  # emit all the errors collected
251
343
  yield from self.errors_collector.yield_and_raise_collected()
252
344
  # count streams using a certain parser
253
345
  parsed_config = self._get_parsed_config(config)
254
- for parser, count in Counter(stream.format.filetype for stream in parsed_config.streams).items():
346
+ for parser, count in Counter(
347
+ stream.format.filetype for stream in parsed_config.streams
348
+ ).items():
255
349
  yield create_analytics_message(f"file-cdk-{parser}-stream-count", count)
256
350
 
257
351
  def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
@@ -267,14 +361,50 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
267
361
  def _get_parsed_config(self, config: Mapping[str, Any]) -> AbstractFileBasedSpec:
268
362
  return self.spec_class(**config)
269
363
 
270
- def _validate_and_get_validation_policy(self, stream_config: FileBasedStreamConfig) -> AbstractSchemaValidationPolicy:
364
+ def _validate_and_get_validation_policy(
365
+ self, stream_config: FileBasedStreamConfig
366
+ ) -> AbstractSchemaValidationPolicy:
271
367
  if stream_config.validation_policy not in self.validation_policies:
272
368
  # This should never happen because we validate the config against the schema's validation_policy enum
273
369
  raise ValidationError(
274
- f"`validation_policy` must be one of {list(self.validation_policies.keys())}", model=FileBasedStreamConfig
370
+ f"`validation_policy` must be one of {list(self.validation_policies.keys())}",
371
+ model=FileBasedStreamConfig,
275
372
  )
276
373
  return self.validation_policies[stream_config.validation_policy]
277
374
 
278
375
  def _validate_input_schema(self, stream_config: FileBasedStreamConfig) -> None:
279
376
  if stream_config.schemaless and stream_config.input_schema:
280
- raise ValidationError("`input_schema` and `schemaless` options cannot both be set", model=FileBasedStreamConfig)
377
+ raise ValidationError(
378
+ "`input_schema` and `schemaless` options cannot both be set",
379
+ model=FileBasedStreamConfig,
380
+ )
381
+
@staticmethod
def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
    """Tell whether the parsed config selects the ``use_file_transfer`` delivery type.

    Returns ``False`` when the config's delivery method has no
    ``delivery_type`` attribute.
    """
    delivery_method = parsed_config.delivery_method
    if not hasattr(delivery_method, "delivery_type"):
        return False
    return delivery_method.delivery_type == "use_file_transfer"
389
+
@staticmethod
def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
    """
    Decide whether the directory structure is kept during file transfer.

    When enabled, files keep their subdirectory paths in the destination.
    When disabled, files are flattened to the root of the destination.

    Args:
        parsed_config: The parsed configuration containing delivery method settings

    Returns:
        The configured ``preserve_directory_structure`` value when file transfer is
        in use and that value is set; True (the default) in every other case
    """
    if not FileBasedSource._use_file_transfer(parsed_config):
        return True
    # A missing attribute and an explicit None both fall back to the default.
    configured = getattr(parsed_config.delivery_method, "preserve_directory_structure", None)
    return True if configured is None else configured
@@ -7,11 +7,13 @@ from abc import ABC, abstractmethod
7
7
  from datetime import datetime
8
8
  from enum import Enum
9
9
  from io import IOBase
10
- from typing import Iterable, List, Optional, Set
10
+ from os import makedirs, path
11
+ from typing import Any, Dict, Iterable, List, Optional, Set
12
+
13
+ from wcmatch.glob import GLOBSTAR, globmatch
11
14
 
12
15
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
13
16
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
14
- from wcmatch.glob import GLOBSTAR, globmatch
15
17
 
16
18
 
17
19
  class FileReadMode(Enum):
@@ -44,7 +46,9 @@ class AbstractFileBasedStreamReader(ABC):
44
46
  ...
45
47
 
46
48
  @abstractmethod
47
- def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
49
+ def open_file(
50
+ self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger
51
+ ) -> IOBase:
48
52
  """
49
53
  Return a file handle for reading.
50
54
 
@@ -79,11 +83,17 @@ class AbstractFileBasedStreamReader(ABC):
79
83
  """
80
84
  ...
81
85
 
82
- def filter_files_by_globs_and_start_date(self, files: List[RemoteFile], globs: List[str]) -> Iterable[RemoteFile]:
86
+ def filter_files_by_globs_and_start_date(
87
+ self, files: List[RemoteFile], globs: List[str]
88
+ ) -> Iterable[RemoteFile]:
83
89
  """
84
90
  Utility method for filtering files based on globs.
85
91
  """
86
- start_date = datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT) if self.config and self.config.start_date else None
92
+ start_date = (
93
+ datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT)
94
+ if self.config and self.config.start_date
95
+ else None
96
+ )
87
97
  seen = set()
88
98
 
89
99
  for file in files:
@@ -92,6 +102,16 @@ class AbstractFileBasedStreamReader(ABC):
92
102
  seen.add(file.uri)
93
103
  yield file
94
104
 
@abstractmethod
def file_size(self, file: RemoteFile) -> int:
    """Return the size of the given remote file.

    Connectors that support writing to files must implement this.
    A connector without that support may simply `return 0`.
    """
    ...
114
+
95
115
  @staticmethod
96
116
  def file_matches_globs(file: RemoteFile, globs: List[str]) -> bool:
97
117
  # Use the GLOBSTAR flag to enable recursive ** matching
@@ -105,3 +125,61 @@ class AbstractFileBasedStreamReader(ABC):
105
125
  """
106
126
  prefixes = {glob.split("*")[0] for glob in globs}
107
127
  return set(filter(lambda x: bool(x), prefixes))
128
+
129
+ def use_file_transfer(self) -> bool:
130
+ if self.config:
131
+ use_file_transfer = (
132
+ hasattr(self.config.delivery_method, "delivery_type")
133
+ and self.config.delivery_method.delivery_type == "use_file_transfer"
134
+ )
135
+ return use_file_transfer
136
+ return False
137
+
138
+ def preserve_directory_structure(self) -> bool:
139
+ # fall back to preserve subdirectories if config is not present or incomplete
140
+ if (
141
+ self.use_file_transfer()
142
+ and self.config
143
+ and hasattr(self.config.delivery_method, "preserve_directory_structure")
144
+ and self.config.delivery_method.preserve_directory_structure is not None
145
+ ):
146
+ return self.config.delivery_method.preserve_directory_structure
147
+ return True
148
+
149
+ @abstractmethod
150
+ def get_file(
151
+ self, file: RemoteFile, local_directory: str, logger: logging.Logger
152
+ ) -> Dict[str, Any]:
153
+ """
154
+ This is required for connectors that will support writing to
155
+ files. It will handle the logic to download,get,read,acquire or
156
+ whatever is more efficient to get a file from the source.
157
+
158
+ Args:
159
+ file (RemoteFile): The remote file object containing URI and metadata.
160
+ local_directory (str): The local directory path where the file will be downloaded.
161
+ logger (logging.Logger): Logger for logging information and errors.
162
+
163
+ Returns:
164
+ dict: A dictionary containing the following:
165
+ - "file_url" (str): The absolute path of the downloaded file.
166
+ - "bytes" (int): The file size in bytes.
167
+ - "file_relative_path" (str): The relative path of the file for local storage. Is relative to local_directory as
168
+ this a mounted volume in the pod container.
169
+
170
+ """
171
+ ...
172
+
173
+ def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
174
+ preserve_directory_structure = self.preserve_directory_structure()
175
+ if preserve_directory_structure:
176
+ # Remove left slashes from source path format to make relative path for writing locally
177
+ file_relative_path = file.uri.lstrip("/")
178
+ else:
179
+ file_relative_path = path.basename(file.uri)
180
+ local_file_path = path.join(local_directory, file_relative_path)
181
+
182
+ # Ensure the local directory exists
183
+ makedirs(path.dirname(local_file_path), exist_ok=True)
184
+ absolute_file_path = path.abspath(local_file_path)
185
+ return [file_relative_path, local_file_path, absolute_file_path]
@@ -2,12 +2,15 @@ from typing import Any, Mapping, Type
2
2
 
3
3
  from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
4
4
  from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
5
+ from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
5
6
  from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
6
7
  from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
7
8
  from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
8
9
 
9
10
  from .avro_parser import AvroParser
10
11
  from .csv_parser import CsvParser
12
+ from .excel_parser import ExcelParser
13
+ from .file_transfer import FileTransfer
11
14
  from .file_type_parser import FileTypeParser
12
15
  from .jsonl_parser import JsonlParser
13
16
  from .parquet_parser import ParquetParser
@@ -16,9 +19,19 @@ from .unstructured_parser import UnstructuredParser
16
19
  default_parsers: Mapping[Type[Any], FileTypeParser] = {
17
20
  AvroFormat: AvroParser(),
18
21
  CsvFormat: CsvParser(),
22
+ ExcelFormat: ExcelParser(),
19
23
  JsonlFormat: JsonlParser(),
20
24
  ParquetFormat: ParquetParser(),
21
25
  UnstructuredFormat: UnstructuredParser(),
22
26
  }
23
27
 
24
- __all__ = ["AvroParser", "CsvParser", "JsonlParser", "ParquetParser", "UnstructuredParser", "default_parsers"]
28
+ __all__ = [
29
+ "AvroParser",
30
+ "CsvParser",
31
+ "ExcelParser",
32
+ "JsonlParser",
33
+ "ParquetParser",
34
+ "UnstructuredParser",
35
+ "FileTransfer",
36
+ "default_parsers",
37
+ ]