airbyte-cdk 0.72.1__py3-none-any.whl → 6.17.1.dev1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (518) hide show
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +490 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1185 -85
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +63 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +31 -3
  79. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +340 -0
  80. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  81. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  82. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  83. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +174 -74
  84. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  85. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  86. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  91. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  92. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  93. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  94. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  95. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  96. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  97. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  98. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1319 -603
  99. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  100. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  101. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  102. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1759 -225
  103. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  104. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  105. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  106. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  107. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  108. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  109. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  122. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  123. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  124. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  125. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  126. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  127. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  134. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  135. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  136. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  140. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  141. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  142. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  143. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  144. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  145. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  146. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  147. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  148. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  149. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  150. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +229 -73
  151. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  152. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  153. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  154. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  155. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  156. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  157. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  158. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  159. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  160. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  163. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  164. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  165. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  166. airbyte_cdk/sources/declarative/types.py +19 -110
  167. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  168. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  169. airbyte_cdk/sources/embedded/catalog.py +16 -4
  170. airbyte_cdk/sources/embedded/runner.py +19 -3
  171. airbyte_cdk/sources/embedded/tools.py +5 -2
  172. airbyte_cdk/sources/file_based/README.md +152 -0
  173. airbyte_cdk/sources/file_based/__init__.py +24 -0
  174. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  175. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  176. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  177. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +47 -10
  178. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  179. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  180. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  181. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  182. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  184. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  185. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  186. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  187. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  188. airbyte_cdk/sources/file_based/exceptions.py +18 -15
  189. airbyte_cdk/sources/file_based/file_based_source.py +140 -33
  190. airbyte_cdk/sources/file_based/file_based_stream_reader.py +69 -5
  191. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  192. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  193. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  194. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  195. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  196. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  197. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  198. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  199. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +141 -41
  200. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  201. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  202. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  204. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  205. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  206. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  210. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  211. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  212. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  213. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +147 -45
  214. airbyte_cdk/sources/http_logger.py +8 -3
  215. airbyte_cdk/sources/message/__init__.py +7 -1
  216. airbyte_cdk/sources/message/repository.py +18 -4
  217. airbyte_cdk/sources/source.py +42 -38
  218. airbyte_cdk/sources/streams/__init__.py +2 -2
  219. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  220. airbyte_cdk/sources/streams/call_rate.py +64 -21
  221. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  222. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  223. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  224. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  225. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  226. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  227. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  228. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  229. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  230. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  231. airbyte_cdk/sources/streams/concurrent/cursor.py +313 -48
  232. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  233. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  234. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  236. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  237. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  238. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  239. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  240. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  241. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  242. airbyte_cdk/sources/streams/core.py +412 -87
  243. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  244. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  245. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  253. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  254. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  255. airbyte_cdk/sources/streams/http/http.py +369 -246
  256. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  257. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  259. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  260. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  261. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  262. airbyte_cdk/sources/types.py +154 -0
  263. airbyte_cdk/sources/utils/record_helper.py +36 -21
  264. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  265. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  266. airbyte_cdk/sources/utils/transform.py +54 -20
  267. airbyte_cdk/sql/_util/hashing.py +34 -0
  268. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  269. airbyte_cdk/sql/constants.py +32 -0
  270. airbyte_cdk/sql/exceptions.py +235 -0
  271. airbyte_cdk/sql/secrets.py +123 -0
  272. airbyte_cdk/sql/shared/__init__.py +15 -0
  273. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  274. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  275. airbyte_cdk/sql/types.py +160 -0
  276. airbyte_cdk/test/catalog_builder.py +70 -18
  277. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  278. airbyte_cdk/test/mock_http/__init__.py +1 -1
  279. airbyte_cdk/test/mock_http/matcher.py +6 -0
  280. airbyte_cdk/test/mock_http/mocker.py +57 -10
  281. airbyte_cdk/test/mock_http/request.py +19 -3
  282. airbyte_cdk/test/mock_http/response.py +3 -1
  283. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  284. airbyte_cdk/test/state_builder.py +18 -10
  285. airbyte_cdk/test/utils/__init__.py +1 -0
  286. airbyte_cdk/test/utils/data.py +24 -0
  287. airbyte_cdk/test/utils/http_mocking.py +16 -0
  288. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  289. airbyte_cdk/test/utils/reading.py +26 -0
  290. airbyte_cdk/utils/__init__.py +2 -1
  291. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  292. airbyte_cdk/utils/analytics_message.py +10 -2
  293. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  294. airbyte_cdk/utils/event_timing.py +10 -10
  295. airbyte_cdk/utils/mapping_helpers.py +3 -1
  296. airbyte_cdk/utils/message_utils.py +20 -11
  297. airbyte_cdk/utils/print_buffer.py +75 -0
  298. airbyte_cdk/utils/schema_inferrer.py +198 -28
  299. airbyte_cdk/utils/slice_hasher.py +30 -0
  300. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  301. airbyte_cdk/utils/stream_status_utils.py +8 -1
  302. airbyte_cdk/utils/traced_exception.py +61 -21
  303. airbyte_cdk-6.17.1.dev1.dist-info/METADATA +109 -0
  304. airbyte_cdk-6.17.1.dev1.dist-info/RECORD +350 -0
  305. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.17.1.dev1.dist-info}/WHEEL +1 -2
  306. airbyte_cdk-6.17.1.dev1.dist-info/entry_points.txt +3 -0
  307. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  308. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  309. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  311. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  312. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  313. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  314. airbyte_cdk/sources/deprecated/client.py +0 -99
  315. airbyte_cdk/sources/singer/__init__.py +0 -8
  316. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  317. airbyte_cdk/sources/singer/source.py +0 -186
  318. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  319. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  320. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  321. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  322. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  323. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  324. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  325. airbyte_cdk/sources/utils/schema_models.py +0 -84
  326. airbyte_cdk-0.72.1.dist-info/METADATA +0 -243
  327. airbyte_cdk-0.72.1.dist-info/RECORD +0 -466
  328. airbyte_cdk-0.72.1.dist-info/top_level.txt +0 -3
  329. source_declarative_manifest/main.py +0 -29
  330. unit_tests/connector_builder/__init__.py +0 -3
  331. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  332. unit_tests/connector_builder/test_message_grouper.py +0 -713
  333. unit_tests/connector_builder/utils.py +0 -27
  334. unit_tests/destinations/test_destination.py +0 -243
  335. unit_tests/singer/test_singer_helpers.py +0 -56
  336. unit_tests/singer/test_singer_source.py +0 -112
  337. unit_tests/sources/__init__.py +0 -0
  338. unit_tests/sources/concurrent_source/__init__.py +0 -3
  339. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  340. unit_tests/sources/declarative/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/__init__.py +0 -3
  342. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  343. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  344. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  345. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  346. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  347. unit_tests/sources/declarative/checks/__init__.py +0 -3
  348. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  349. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  350. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  351. unit_tests/sources/declarative/external_component.py +0 -13
  352. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  353. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  355. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  356. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  357. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  359. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  360. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  361. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  362. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  363. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  364. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  365. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  366. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  367. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  368. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  369. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  370. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  371. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1847
  372. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  373. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  374. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  375. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  376. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  377. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  384. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  385. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  386. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  387. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  388. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  389. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  390. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  391. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  392. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  393. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  394. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  395. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  396. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  397. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  398. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  399. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  400. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  401. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  402. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  403. unit_tests/sources/declarative/schema/__init__.py +0 -6
  404. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  405. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  406. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  407. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  408. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  409. unit_tests/sources/declarative/states/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  411. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  412. unit_tests/sources/declarative/test_create_partial.py +0 -83
  413. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  414. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  415. unit_tests/sources/declarative/test_types.py +0 -39
  416. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  417. unit_tests/sources/file_based/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  419. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  420. unit_tests/sources/file_based/config/__init__.py +0 -0
  421. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  422. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  423. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  424. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  425. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  426. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  427. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  428. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  429. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  430. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  431. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  432. unit_tests/sources/file_based/helpers.py +0 -70
  433. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  434. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  435. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  436. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  437. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  438. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  439. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  440. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  441. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  442. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  443. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  444. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  445. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  446. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  447. unit_tests/sources/file_based/stream/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  449. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  450. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  451. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  452. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  453. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  454. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  455. unit_tests/sources/file_based/test_scenarios.py +0 -253
  456. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  457. unit_tests/sources/fixtures/__init__.py +0 -3
  458. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  459. unit_tests/sources/message/__init__.py +0 -0
  460. unit_tests/sources/message/test_repository.py +0 -153
  461. unit_tests/sources/streams/__init__.py +0 -0
  462. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  464. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  466. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  467. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  469. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  470. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  471. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  472. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  473. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  474. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  475. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  476. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  477. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  478. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  479. unit_tests/sources/streams/http/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  481. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  482. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  483. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  484. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  485. unit_tests/sources/streams/http/test_http.py +0 -635
  486. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  487. unit_tests/sources/streams/test_call_rate.py +0 -300
  488. unit_tests/sources/streams/test_stream_read.py +0 -405
  489. unit_tests/sources/streams/test_streams_core.py +0 -184
  490. unit_tests/sources/test_abstract_source.py +0 -1442
  491. unit_tests/sources/test_concurrent_source.py +0 -112
  492. unit_tests/sources/test_config.py +0 -92
  493. unit_tests/sources/test_connector_state_manager.py +0 -482
  494. unit_tests/sources/test_http_logger.py +0 -252
  495. unit_tests/sources/test_integration_source.py +0 -86
  496. unit_tests/sources/test_source.py +0 -684
  497. unit_tests/sources/test_source_read.py +0 -460
  498. unit_tests/test/__init__.py +0 -0
  499. unit_tests/test/mock_http/__init__.py +0 -0
  500. unit_tests/test/mock_http/test_matcher.py +0 -53
  501. unit_tests/test/mock_http/test_mocker.py +0 -214
  502. unit_tests/test/mock_http/test_request.py +0 -117
  503. unit_tests/test/mock_http/test_response_builder.py +0 -177
  504. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  505. unit_tests/utils/__init__.py +0 -0
  506. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  507. unit_tests/utils/test_mapping_helpers.py +0 -54
  508. unit_tests/utils/test_message_utils.py +0 -91
  509. unit_tests/utils/test_rate_limiting.py +0 -26
  510. unit_tests/utils/test_schema_inferrer.py +0 -202
  511. unit_tests/utils/test_secret_utils.py +0 -135
  512. unit_tests/utils/test_stream_status_utils.py +0 -61
  513. unit_tests/utils/test_traced_exception.py +0 -107
  514. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  515. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  516. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  517. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  518. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.17.1.dev1.dist-info}/LICENSE.txt +0 -0
@@ -11,26 +11,21 @@ from airbyte_cdk.utils import AirbyteTracedException
11
11
 
12
12
  class FileBasedSourceError(Enum):
13
13
  EMPTY_STREAM = "No files were identified in the stream. This may be because there are no files in the specified container, or because your glob patterns did not match any files. Please verify that your source contains files last modified after the start_date and that your glob patterns are not overly strict."
14
- GLOB_PARSE_ERROR = (
15
- "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
16
- )
14
+ GLOB_PARSE_ERROR = "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
15
+ ENCODING_ERROR = "File encoding error. The configured encoding must match file encoding."
17
16
  ERROR_CASTING_VALUE = "Could not cast the value to the expected type."
18
17
  ERROR_CASTING_VALUE_UNRECOGNIZED_TYPE = "Could not cast the value to the expected type because the type is not recognized. Valid types are null, array, boolean, integer, number, object, and string."
19
18
  ERROR_DECODING_VALUE = "Expected a JSON-decodeable value but could not decode record."
20
- ERROR_LISTING_FILES = (
21
- "Error listing files. Please check the credentials provided in the config and verify that they provide permission to list files."
22
- )
23
- ERROR_READING_FILE = (
24
- "Error opening file. Please check the credentials provided in the config and verify that they provide permission to read files."
25
- )
19
+ ERROR_LISTING_FILES = "Error listing files. Please check the credentials provided in the config and verify that they provide permission to list files."
20
+ ERROR_READING_FILE = "Error opening file. Please check the credentials provided in the config and verify that they provide permission to read files."
26
21
  ERROR_PARSING_RECORD = "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable."
27
- ERROR_PARSING_USER_PROVIDED_SCHEMA = "The provided schema could not be transformed into valid JSON Schema."
22
+ ERROR_PARSING_USER_PROVIDED_SCHEMA = (
23
+ "The provided schema could not be transformed into valid JSON Schema."
24
+ )
28
25
  ERROR_VALIDATING_RECORD = "One or more records do not pass the schema validation policy. Please modify your input schema, or select a more lenient validation policy."
29
26
  ERROR_PARSING_RECORD_MISMATCHED_COLUMNS = "A header field has resolved to `None`. This indicates that the CSV has more rows than the number of header fields. If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows."
30
27
  ERROR_PARSING_RECORD_MISMATCHED_ROWS = "A row's value has resolved to `None`. This indicates that the CSV has more columns in the header field than the number of columns in the row(s). If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows."
31
- STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = (
32
- "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema."
33
- )
28
+ STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema."
34
29
  NULL_VALUE_IN_SCHEMA = "Error during schema inference: no type was detected for key."
35
30
  UNRECOGNIZED_TYPE = "Error during schema inference: unrecognized type."
36
31
  SCHEMA_INFERENCE_ERROR = "Error inferring schema from files. Are the files valid?"
@@ -38,7 +33,9 @@ class FileBasedSourceError(Enum):
38
33
  CONFIG_VALIDATION_ERROR = "Error creating stream config object."
39
34
  MISSING_SCHEMA = "Expected `json_schema` in the configured catalog but it is missing."
40
35
  UNDEFINED_PARSER = "No parser is defined for this file type."
41
- UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
36
+ UNDEFINED_VALIDATION_POLICY = (
37
+ "The validation policy defined in the config does not exist for the source."
38
+ )
42
39
 
43
40
 
44
41
  class FileBasedErrorsCollector:
@@ -69,7 +66,9 @@ class BaseFileBasedSourceError(Exception):
69
66
  def __init__(self, error: Union[FileBasedSourceError, str], **kwargs): # type: ignore # noqa
70
67
  if isinstance(error, FileBasedSourceError):
71
68
  error = FileBasedSourceError(error).value
72
- super().__init__(f"{error} Contact Support if you need assistance.\n{' '.join([f'{k}={v}' for k, v in kwargs.items()])}")
69
+ super().__init__(
70
+ f"{error} Contact Support if you need assistance.\n{' '.join([f'{k}={v}' for k, v in kwargs.items()])}"
71
+ )
73
72
 
74
73
 
75
74
  class ConfigValidationError(BaseFileBasedSourceError):
@@ -120,3 +119,7 @@ class CustomFileBasedException(AirbyteTracedException):
120
119
  """
121
120
 
122
121
  pass
122
+
123
+
124
+ class FileSizeLimitError(CustomFileBasedException):
125
+ pass
@@ -6,7 +6,9 @@ import logging
6
6
  import traceback
7
7
  from abc import ABC
8
8
  from collections import Counter
9
- from typing import Any, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Type, Union
9
+ from typing import Any, Iterator, List, Mapping, Optional, Tuple, Type, Union
10
+
11
+ from pydantic.v1.error_wrappers import ValidationError
10
12
 
11
13
  from airbyte_cdk.logger import AirbyteLogFormatter, init_logger
12
14
  from airbyte_cdk.models import (
@@ -22,15 +24,31 @@ from airbyte_cdk.models import (
22
24
  from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
23
25
  from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
24
26
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
25
- from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy, DefaultFileBasedAvailabilityStrategy
27
+ from airbyte_cdk.sources.file_based.availability_strategy import (
28
+ AbstractFileBasedAvailabilityStrategy,
29
+ DefaultFileBasedAvailabilityStrategy,
30
+ )
26
31
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
27
- from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
28
- from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
29
- from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedErrorsCollector, FileBasedSourceError
32
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
33
+ FileBasedStreamConfig,
34
+ ValidationPolicy,
35
+ )
36
+ from airbyte_cdk.sources.file_based.discovery_policy import (
37
+ AbstractDiscoveryPolicy,
38
+ DefaultDiscoveryPolicy,
39
+ )
40
+ from airbyte_cdk.sources.file_based.exceptions import (
41
+ ConfigValidationError,
42
+ FileBasedErrorsCollector,
43
+ FileBasedSourceError,
44
+ )
30
45
  from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
31
46
  from airbyte_cdk.sources.file_based.file_types import default_parsers
32
47
  from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
33
- from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
48
+ from airbyte_cdk.sources.file_based.schema_validation_policies import (
49
+ DEFAULT_SCHEMA_VALIDATION_POLICIES,
50
+ AbstractSchemaValidationPolicy,
51
+ )
34
52
  from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
35
53
  from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
36
54
  from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
@@ -44,7 +62,6 @@ from airbyte_cdk.sources.streams import Stream
44
62
  from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
45
63
  from airbyte_cdk.utils.analytics_message import create_analytics_message
46
64
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException
47
- from pydantic.error_wrappers import ValidationError
48
65
 
49
66
  DEFAULT_CONCURRENCY = 100
50
67
  MAX_CONCURRENCY = 100
@@ -61,29 +78,41 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
61
78
  spec_class: Type[AbstractFileBasedSpec],
62
79
  catalog: Optional[ConfiguredAirbyteCatalog],
63
80
  config: Optional[Mapping[str, Any]],
64
- state: Optional[MutableMapping[str, Any]],
81
+ state: Optional[List[AirbyteStateMessage]],
65
82
  availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None,
66
83
  discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
67
84
  parsers: Mapping[Type[Any], FileTypeParser] = default_parsers,
68
- validation_policies: Mapping[ValidationPolicy, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
69
- cursor_cls: Type[Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]] = FileBasedConcurrentCursor,
85
+ validation_policies: Mapping[
86
+ ValidationPolicy, AbstractSchemaValidationPolicy
87
+ ] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
88
+ cursor_cls: Type[
89
+ Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]
90
+ ] = FileBasedConcurrentCursor,
70
91
  ):
71
92
  self.stream_reader = stream_reader
72
93
  self.spec_class = spec_class
73
94
  self.config = config
74
95
  self.catalog = catalog
75
96
  self.state = state
76
- self.availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(stream_reader)
97
+ self.availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(
98
+ stream_reader
99
+ )
77
100
  self.discovery_policy = discovery_policy
78
101
  self.parsers = parsers
79
102
  self.validation_policies = validation_policies
80
- self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
103
+ self.stream_schemas = (
104
+ {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
105
+ )
81
106
  self.cursor_cls = cursor_cls
82
107
  self.logger = init_logger(f"airbyte.{self.name}")
83
108
  self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
84
109
  self._message_repository: Optional[MessageRepository] = None
85
110
  concurrent_source = ConcurrentSource.create(
86
- MAX_CONCURRENCY, INITIAL_N_PARTITIONS, self.logger, self._slice_logger, self.message_repository
111
+ MAX_CONCURRENCY,
112
+ INITIAL_N_PARTITIONS,
113
+ self.logger,
114
+ self._slice_logger,
115
+ self.message_repository,
87
116
  )
88
117
  self._state = None
89
118
  super().__init__(concurrent_source)
@@ -91,10 +120,14 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
91
120
  @property
92
121
  def message_repository(self) -> MessageRepository:
93
122
  if self._message_repository is None:
94
- self._message_repository = InMemoryMessageRepository(Level(AirbyteLogFormatter.level_mapping[self.logger.level]))
123
+ self._message_repository = InMemoryMessageRepository(
124
+ Level(AirbyteLogFormatter.level_mapping[self.logger.level])
125
+ )
95
126
  return self._message_repository
96
127
 
97
- def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
128
+ def check_connection(
129
+ self, logger: logging.Logger, config: Mapping[str, Any]
130
+ ) -> Tuple[bool, Optional[Any]]:
98
131
  """
99
132
  Check that the source can be accessed using the user-provided configuration.
100
133
 
@@ -122,20 +155,49 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
122
155
  )
123
156
 
124
157
  errors = []
158
+ tracebacks = []
125
159
  for stream in streams:
126
160
  if not isinstance(stream, AbstractFileBasedStream):
127
161
  raise ValueError(f"Stream {stream} is not a file-based stream.")
128
162
  try:
163
+ parsed_config = self._get_parsed_config(config)
164
+ availability_method = (
165
+ stream.availability_strategy.check_availability
166
+ if self._use_file_transfer(parsed_config)
167
+ else stream.availability_strategy.check_availability_and_parsability
168
+ )
129
169
  (
130
170
  stream_is_available,
131
171
  reason,
132
- ) = stream.availability_strategy.check_availability_and_parsability(stream, logger, self)
172
+ ) = availability_method(stream, logger, self)
173
+ except AirbyteTracedException as ate:
174
+ errors.append(f"Unable to connect to stream {stream.name} - {ate.message}")
175
+ tracebacks.append(traceback.format_exc())
133
176
  except Exception:
134
- errors.append(f"Unable to connect to stream {stream.name} - {''.join(traceback.format_exc())}")
177
+ errors.append(f"Unable to connect to stream {stream.name}")
178
+ tracebacks.append(traceback.format_exc())
135
179
  else:
136
180
  if not stream_is_available and reason:
137
181
  errors.append(reason)
138
182
 
183
+ if len(errors) == 1 and len(tracebacks) == 1:
184
+ raise AirbyteTracedException(
185
+ internal_message=tracebacks[0],
186
+ message=f"{errors[0]}",
187
+ failure_type=FailureType.config_error,
188
+ )
189
+ if len(errors) == 1 and len(tracebacks) == 0:
190
+ raise AirbyteTracedException(
191
+ message=f"{errors[0]}",
192
+ failure_type=FailureType.config_error,
193
+ )
194
+ elif len(errors) > 1:
195
+ raise AirbyteTracedException(
196
+ internal_message="\n".join(tracebacks),
197
+ message=f"{len(errors)} streams with errors: {', '.join(error for error in errors)}",
198
+ failure_type=FailureType.config_error,
199
+ )
200
+
139
201
  return not bool(errors), (errors or None)
140
202
 
141
203
  def streams(self, config: Mapping[str, Any]) -> List[Stream]:
@@ -144,10 +206,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
144
206
  """
145
207
 
146
208
  if self.catalog:
147
- state_manager = ConnectorStateManager(
148
- stream_instance_map={s.stream.name: s.stream for s in self.catalog.streams},
149
- state=self.state,
150
- )
209
+ state_manager = ConnectorStateManager(state=self.state)
151
210
  else:
152
211
  # During `check` operations we don't have a catalog so cannot create a state manager.
153
212
  # Since the state manager is only required for incremental syncs, this is fine.
@@ -169,12 +228,26 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
169
228
 
170
229
  sync_mode = self._get_sync_mode_from_catalog(stream_config.name)
171
230
 
172
- if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
231
+ if (
232
+ sync_mode == SyncMode.full_refresh
233
+ and hasattr(self, "_concurrency_level")
234
+ and self._concurrency_level is not None
235
+ ):
173
236
  cursor = FileBasedFinalStateCursor(
174
- stream_config=stream_config, stream_namespace=None, message_repository=self.message_repository
237
+ stream_config=stream_config,
238
+ stream_namespace=None,
239
+ message_repository=self.message_repository,
175
240
  )
176
241
  stream = FileBasedStreamFacade.create_from_stream(
177
- self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
242
+ stream=self._make_default_stream(
243
+ stream_config=stream_config,
244
+ cursor=cursor,
245
+ use_file_transfer=self._use_file_transfer(parsed_config),
246
+ ),
247
+ source=self,
248
+ logger=self.logger,
249
+ state=stream_state,
250
+ cursor=cursor,
178
251
  )
179
252
 
180
253
  elif (
@@ -197,11 +270,23 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
197
270
  CursorField(DefaultFileBasedStream.ab_last_mod_col),
198
271
  )
199
272
  stream = FileBasedStreamFacade.create_from_stream(
200
- self._make_default_stream(stream_config, cursor), self, self.logger, stream_state, cursor
273
+ stream=self._make_default_stream(
274
+ stream_config=stream_config,
275
+ cursor=cursor,
276
+ use_file_transfer=self._use_file_transfer(parsed_config),
277
+ ),
278
+ source=self,
279
+ logger=self.logger,
280
+ state=stream_state,
281
+ cursor=cursor,
201
282
  )
202
283
  else:
203
284
  cursor = self.cursor_cls(stream_config)
204
- stream = self._make_default_stream(stream_config, cursor)
285
+ stream = self._make_default_stream(
286
+ stream_config=stream_config,
287
+ cursor=cursor,
288
+ use_file_transfer=self._use_file_transfer(parsed_config),
289
+ )
205
290
 
206
291
  streams.append(stream)
207
292
  return streams
@@ -210,7 +295,10 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
210
295
  raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc
211
296
 
212
297
  def _make_default_stream(
213
- self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor]
298
+ self,
299
+ stream_config: FileBasedStreamConfig,
300
+ cursor: Optional[AbstractFileBasedCursor],
301
+ use_file_transfer: bool = False,
214
302
  ) -> AbstractFileBasedStream:
215
303
  return DefaultFileBasedStream(
216
304
  config=stream_config,
@@ -222,9 +310,12 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
222
310
  validation_policy=self._validate_and_get_validation_policy(stream_config),
223
311
  errors_collector=self.errors_collector,
224
312
  cursor=cursor,
313
+ use_file_transfer=use_file_transfer,
225
314
  )
226
315
 
227
- def _get_stream_from_catalog(self, stream_config: FileBasedStreamConfig) -> Optional[AirbyteStream]:
316
+ def _get_stream_from_catalog(
317
+ self, stream_config: FileBasedStreamConfig
318
+ ) -> Optional[AirbyteStream]:
228
319
  if self.catalog:
229
320
  for stream in self.catalog.streams or []:
230
321
  if stream.stream.name == stream_config.name:
@@ -244,14 +335,16 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
244
335
  logger: logging.Logger,
245
336
  config: Mapping[str, Any],
246
337
  catalog: ConfiguredAirbyteCatalog,
247
- state: Optional[Union[List[AirbyteStateMessage], MutableMapping[str, Any]]] = None,
338
+ state: Optional[List[AirbyteStateMessage]] = None,
248
339
  ) -> Iterator[AirbyteMessage]:
249
340
  yield from super().read(logger, config, catalog, state)
250
341
  # emit all the errors collected
251
342
  yield from self.errors_collector.yield_and_raise_collected()
252
343
  # count streams using a certain parser
253
344
  parsed_config = self._get_parsed_config(config)
254
- for parser, count in Counter(stream.format.filetype for stream in parsed_config.streams).items():
345
+ for parser, count in Counter(
346
+ stream.format.filetype for stream in parsed_config.streams
347
+ ).items():
255
348
  yield create_analytics_message(f"file-cdk-{parser}-stream-count", count)
256
349
 
257
350
  def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
@@ -267,14 +360,28 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
267
360
  def _get_parsed_config(self, config: Mapping[str, Any]) -> AbstractFileBasedSpec:
268
361
  return self.spec_class(**config)
269
362
 
270
- def _validate_and_get_validation_policy(self, stream_config: FileBasedStreamConfig) -> AbstractSchemaValidationPolicy:
363
+ def _validate_and_get_validation_policy(
364
+ self, stream_config: FileBasedStreamConfig
365
+ ) -> AbstractSchemaValidationPolicy:
271
366
  if stream_config.validation_policy not in self.validation_policies:
272
367
  # This should never happen because we validate the config against the schema's validation_policy enum
273
368
  raise ValidationError(
274
- f"`validation_policy` must be one of {list(self.validation_policies.keys())}", model=FileBasedStreamConfig
369
+ f"`validation_policy` must be one of {list(self.validation_policies.keys())}",
370
+ model=FileBasedStreamConfig,
275
371
  )
276
372
  return self.validation_policies[stream_config.validation_policy]
277
373
 
278
374
  def _validate_input_schema(self, stream_config: FileBasedStreamConfig) -> None:
279
375
  if stream_config.schemaless and stream_config.input_schema:
280
- raise ValidationError("`input_schema` and `schemaless` options cannot both be set", model=FileBasedStreamConfig)
376
+ raise ValidationError(
377
+ "`input_schema` and `schemaless` options cannot both be set",
378
+ model=FileBasedStreamConfig,
379
+ )
380
+
381
+ @staticmethod
382
+ def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
383
+ use_file_transfer = (
384
+ hasattr(parsed_config.delivery_method, "delivery_type")
385
+ and parsed_config.delivery_method.delivery_type == "use_file_transfer"
386
+ )
387
+ return use_file_transfer
@@ -7,11 +7,13 @@ from abc import ABC, abstractmethod
7
7
  from datetime import datetime
8
8
  from enum import Enum
9
9
  from io import IOBase
10
- from typing import Iterable, List, Optional, Set
10
+ from os import makedirs, path
11
+ from typing import Any, Dict, Iterable, List, Optional, Set
12
+
13
+ from wcmatch.glob import GLOBSTAR, globmatch
11
14
 
12
15
  from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
13
16
  from airbyte_cdk.sources.file_based.remote_file import RemoteFile
14
- from wcmatch.glob import GLOBSTAR, globmatch
15
17
 
16
18
 
17
19
  class FileReadMode(Enum):
@@ -44,7 +46,9 @@ class AbstractFileBasedStreamReader(ABC):
44
46
  ...
45
47
 
46
48
  @abstractmethod
47
- def open_file(self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger) -> IOBase:
49
+ def open_file(
50
+ self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger
51
+ ) -> IOBase:
48
52
  """
49
53
  Return a file handle for reading.
50
54
 
@@ -79,11 +83,17 @@ class AbstractFileBasedStreamReader(ABC):
79
83
  """
80
84
  ...
81
85
 
82
- def filter_files_by_globs_and_start_date(self, files: List[RemoteFile], globs: List[str]) -> Iterable[RemoteFile]:
86
+ def filter_files_by_globs_and_start_date(
87
+ self, files: List[RemoteFile], globs: List[str]
88
+ ) -> Iterable[RemoteFile]:
83
89
  """
84
90
  Utility method for filtering files based on globs.
85
91
  """
86
- start_date = datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT) if self.config and self.config.start_date else None
92
+ start_date = (
93
+ datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT)
94
+ if self.config and self.config.start_date
95
+ else None
96
+ )
87
97
  seen = set()
88
98
 
89
99
  for file in files:
@@ -92,6 +102,16 @@ class AbstractFileBasedStreamReader(ABC):
92
102
  seen.add(file.uri)
93
103
  yield file
94
104
 
105
+ @abstractmethod
106
+ def file_size(self, file: RemoteFile) -> int:
107
+ """Utility method to get size of the remote file.
108
+
109
+ This is required for connectors that will support writing to
110
+ files. If the connector does not support writing files, then the
111
+ subclass can simply `return 0`.
112
+ """
113
+ ...
114
+
95
115
  @staticmethod
96
116
  def file_matches_globs(file: RemoteFile, globs: List[str]) -> bool:
97
117
  # Use the GLOBSTAR flag to enable recursive ** matching
@@ -105,3 +125,47 @@ class AbstractFileBasedStreamReader(ABC):
105
125
  """
106
126
  prefixes = {glob.split("*")[0] for glob in globs}
107
127
  return set(filter(lambda x: bool(x), prefixes))
128
+
129
+ def use_file_transfer(self) -> bool:
130
+ if self.config:
131
+ use_file_transfer = (
132
+ hasattr(self.config.delivery_method, "delivery_type")
133
+ and self.config.delivery_method.delivery_type == "use_file_transfer"
134
+ )
135
+ return use_file_transfer
136
+ return False
137
+
138
+ @abstractmethod
139
+ def get_file(
140
+ self, file: RemoteFile, local_directory: str, logger: logging.Logger
141
+ ) -> Dict[str, Any]:
142
+ """
143
+ This is required for connectors that will support writing to
144
+ files. It will handle the logic to download,get,read,acquire or
145
+ whatever is more efficient to get a file from the source.
146
+
147
+ Args:
148
+ file (RemoteFile): The remote file object containing URI and metadata.
149
+ local_directory (str): The local directory path where the file will be downloaded.
150
+ logger (logging.Logger): Logger for logging information and errors.
151
+
152
+ Returns:
153
+ dict: A dictionary containing the following:
154
+ - "file_url" (str): The absolute path of the downloaded file.
155
+ - "bytes" (int): The file size in bytes.
156
+ - "file_relative_path" (str): The relative path of the file for local storage. Is relative to local_directory as
157
+ this a mounted volume in the pod container.
158
+
159
+ """
160
+ ...
161
+
162
+ @staticmethod
163
+ def _get_file_transfer_paths(file: RemoteFile, local_directory: str) -> List[str]:
164
+ # Remove left slashes from source path format to make relative path for writing locally
165
+ file_relative_path = file.uri.lstrip("/")
166
+ local_file_path = path.join(local_directory, file_relative_path)
167
+
168
+ # Ensure the local directory exists
169
+ makedirs(path.dirname(local_file_path), exist_ok=True)
170
+ absolute_file_path = path.abspath(local_file_path)
171
+ return [file_relative_path, local_file_path, absolute_file_path]
@@ -2,12 +2,15 @@ from typing import Any, Mapping, Type
2
2
 
3
3
  from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
4
4
  from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
5
+ from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
5
6
  from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
6
7
  from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
7
8
  from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
8
9
 
9
10
  from .avro_parser import AvroParser
10
11
  from .csv_parser import CsvParser
12
+ from .excel_parser import ExcelParser
13
+ from .file_transfer import FileTransfer
11
14
  from .file_type_parser import FileTypeParser
12
15
  from .jsonl_parser import JsonlParser
13
16
  from .parquet_parser import ParquetParser
@@ -16,9 +19,19 @@ from .unstructured_parser import UnstructuredParser
16
19
  default_parsers: Mapping[Type[Any], FileTypeParser] = {
17
20
  AvroFormat: AvroParser(),
18
21
  CsvFormat: CsvParser(),
22
+ ExcelFormat: ExcelParser(),
19
23
  JsonlFormat: JsonlParser(),
20
24
  ParquetFormat: ParquetParser(),
21
25
  UnstructuredFormat: UnstructuredParser(),
22
26
  }
23
27
 
24
- __all__ = ["AvroParser", "CsvParser", "JsonlParser", "ParquetParser", "UnstructuredParser", "default_parsers"]
28
+ __all__ = [
29
+ "AvroParser",
30
+ "CsvParser",
31
+ "ExcelParser",
32
+ "JsonlParser",
33
+ "ParquetParser",
34
+ "UnstructuredParser",
35
+ "FileTransfer",
36
+ "default_parsers",
37
+ ]