airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368) hide show
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,123 @@
1
+ #
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ from abc import ABC, abstractmethod
7
+ from typing import Any, Dict, Iterable, Optional
8
+
9
+ from airbyte_cdk.sources.file_based import AbstractFileBasedSpec
10
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
11
+
12
+
class AbstractFileBasedStreamPermissionsReader(ABC):
    """
    Interface for reading file permissions (ACLs) and identities from a source.

    Concrete readers provide the permission record of a file, the identities
    that those permissions refer to, and the schemas of both record kinds.
    """

    def __init__(self) -> None:
        # Starts out empty; a value only arrives through the `config` setter.
        self._config = None

    @property
    def config(self) -> Optional[AbstractFileBasedSpec]:
        """Return the value held in `_config` (None right after construction)."""
        return self._config

    @config.setter
    @abstractmethod
    def config(self, value: AbstractFileBasedSpec) -> None:
        """
        Receive the parsed config from the source.

        FileBasedSource reads the config from disk, parses it, and then sets it
        on its reader through this setter.

        FileBasedSource itself only needs the keys declared on the abstract
        config, while a concrete reader usually needs more (for example the keys
        that let it authenticate with the 3rd party). Concrete implementations of
        this setter should therefore assert that `value` is of the config type
        expected by that reader.
        """

    @abstractmethod
    def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]:
        """
        Return the allow list of `file`, i.e. every identity associated with the
        file together with its permission level.

        e.g.
        def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger):
            api_conn = some_api.conn(credentials=SOME_CREDENTIALS)
            result = api_conn.get_file_permissions_info(file.id)
            return MyPermissionsModel(
                id=result["id"],
                access_control_list=result["access_control_list"],
                is_public=result["is_public"],
            ).dict()
        """

    @abstractmethod
    def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]:
        """
        Return the identities of the "space" or "domain" from which the file
        metadata (ACLs) is fetched and in which the ACL items (identities) exist.

        e.g.
        def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]:
            api_conn = some_api.conn(credentials=SOME_CREDENTIALS)
            users_api = api_conn.users()
            groups_api = api_conn.groups()
            members_api = api_conn.members()
            for user in users_api.list():
                yield my_identity_model(id=user.id, name=user.name, email_address=user.email, type="user").dict()
            for group in groups_api.list():
                group_obj = my_identity_model(id=group.id, name=group.name, email_address=group.email, type="group")
                for member in members_api.list(group=group):
                    group_obj.member_email_addresses = group_obj.member_email_addresses or []
                    group_obj.member_email_addresses.append(member.email)
                yield group_obj.dict()
        """

    @property
    @abstractmethod
    def file_permissions_schema(self) -> Dict[str, Any]:
        """
        Return the schema of the records emitted by the file permissions stream.

        e.g.
        def file_permissions_schema(self) -> Dict[str, Any]:
            # the schema can also live in a json file (the pattern used by python
            # connectors, e.g. a file under schemas/) and be read from there
            return {
                "type": "object",
                "properties": {
                    "id": { "type": "string" },
                    "file_path": { "type": "string" },
                    "access_control_list": {
                        "type": "array",
                        "items": { "type": "string" }
                    },
                    "publicly_accessible": { "type": "boolean" }
                }
            }
        """

    @property
    @abstractmethod
    def identities_schema(self) -> Dict[str, Any]:
        """
        Return the schema of the records emitted by the file identity stream.

        e.g.
        def identities_schema(self) -> Dict[str, Any]:
            # the schema can also live in a json file (the pattern used by python
            # connectors, e.g. schemas/identities.json) and be read from there
            return {
                "type": "object",
                "properties": {
                    "id": { "type": "string" },
                    "remote_id": { "type": "string" },
                    "name": { "type": ["null", "string"] },
                    "email_address": { "type": ["null", "string"] },
                    "member_email_addresses": { "type": ["null", "array"] },
                    "type": { "type": "string" },
                }
            }
        """
@@ -0,0 +1,209 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ from abc import ABC, abstractmethod
7
+ from datetime import datetime
8
+ from enum import Enum
9
+ from io import IOBase
10
+ from os import makedirs, path
11
+ from typing import Any, Callable, Iterable, List, MutableMapping, Optional, Set, Tuple
12
+
13
+ from wcmatch.glob import GLOBSTAR, globmatch
14
+
15
+ from airbyte_cdk.models import AirbyteRecordMessageFileReference
16
+ from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
17
+ from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
18
+ include_identities_stream,
19
+ preserve_directory_structure,
20
+ use_file_transfer,
21
+ )
22
+ from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
23
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
24
+
25
+
26
class FileReadMode(Enum):
    """Modes a stream reader can be asked to open a file in.

    Each member's value is the mode string ("r" or "rb").
    """

    # Text mode.
    READ = "r"
    # Binary mode.
    READ_BINARY = "rb"
29
+
30
+
31
class AbstractFileBasedStreamReader(ABC):
    """
    Abstract interface for listing, opening and transferring the remote files of
    a file-based source.

    Concrete readers implement the abstract members (`config` setter, `open_file`,
    `get_matching_files`, `file_size`, `upload`); the remaining methods are shared
    utilities built on top of the configured `AbstractFileBasedSpec`.
    """

    # Format used to parse `config.start_date`.
    DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    # Keys of the mapping returned by `_get_file_transfer_paths`.
    FILE_RELATIVE_PATH = "file_relative_path"
    FILE_NAME = "file_name"
    LOCAL_FILE_PATH = "local_file_path"
    SOURCE_FILE_URI = "source_file_relative_path"
    FILE_FOLDER = "file_folder"

    def __init__(self) -> None:
        # Populated later through the `config` setter.
        self._config = None

    @property
    def config(self) -> Optional[AbstractFileBasedSpec]:
        """Return the config set on this reader, or None if it has not been set yet."""
        return self._config

    @config.setter
    @abstractmethod
    def config(self, value: AbstractFileBasedSpec) -> None:
        """
        FileBasedSource reads the config from disk and parses it, and once parsed, the source sets the config on its StreamReader.

        Note: FileBasedSource only requires the keys defined in the abstract config, whereas concrete implementations of StreamReader
        will require keys that (for example) allow it to authenticate with the 3rd party.

        Therefore, concrete implementations of AbstractFileBasedStreamReader's config setter should assert that `value` is of the correct
        config type for that type of StreamReader.
        """
        ...

    @abstractmethod
    def open_file(
        self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger
    ) -> IOBase:
        """
        Return a file handle for reading.

        Many sources will be able to use smart_open to implement this method,
        for example:

        client = boto3.Session(...)
        return smart_open.open(remote_file.uri, transport_params={"client": client})
        """
        ...

    @abstractmethod
    def get_matching_files(
        self,
        globs: List[str],
        prefix: Optional[str],
        logger: logging.Logger,
    ) -> Iterable[RemoteFile]:
        """
        Return all files that match any of the globs.

        Example:

        The source has files "a.json", "foo/a.json", "foo/bar/a.json"

        If globs = ["*.json"] then this method returns ["a.json"].

        If globs = ["foo/*.json"] then this method returns ["foo/a.json"].

        Utility methods `self.filter_files_by_globs_and_start_date` and
        `self.get_prefixes_from_globs` are available, which may be helpful when
        implementing this method.
        """
        ...

    def filter_files_by_globs_and_start_date(
        self, files: List[RemoteFile], globs: List[str]
    ) -> Iterable[RemoteFile]:
        """
        Utility method for filtering files based on globs and the configured start date.

        Lazily yields each file that matches at least one glob, whose URI has not
        been yielded before, and whose `last_modified` is not earlier than
        `config.start_date`. The date filter is skipped when no config or no start
        date is set.
        """
        start_date = (
            datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT)
            if self.config and self.config.start_date
            else None
        )
        seen: Set[str] = set()

        for file in files:
            if not self.file_matches_globs(file, globs):
                continue
            if file.uri in seen:
                continue
            if start_date and file.last_modified < start_date:
                continue
            seen.add(file.uri)
            yield file

    @abstractmethod
    def file_size(self, file: RemoteFile) -> int:
        """Utility method to get size of the remote file.

        This is required for connectors that will support writing to
        files. If the connector does not support writing files, then the
        subclass can simply `return 0`.
        """
        ...

    @staticmethod
    def file_matches_globs(file: RemoteFile, globs: List[str]) -> bool:
        """Return True if the file's URI matches at least one of the globs."""
        # Use the GLOBSTAR flag to enable recursive ** matching
        # (https://facelessuser.github.io/wcmatch/wcmatch/#globstar)
        return any(globmatch(file.uri, g, flags=GLOBSTAR) for g in globs)

    @staticmethod
    def get_prefixes_from_globs(globs: List[str]) -> Set[str]:
        """
        Utility method for extracting prefixes from the globs.

        The prefix of a glob is everything before its first "*". Globs that start
        with "*" have an empty prefix and are left out of the result.
        """
        candidates = (glob.split("*")[0] for glob in globs)
        return {prefix for prefix in candidates if prefix}

    def use_file_transfer(self) -> bool:
        """Return the config's file-transfer setting, or False when no config is set."""
        if self.config:
            return use_file_transfer(self.config)
        return False

    def preserve_directory_structure(self) -> bool:
        """Return the config's preserve-directory-structure setting, defaulting to True."""
        # fall back to preserve subdirectories if config is not present or incomplete
        if self.config:
            return preserve_directory_structure(self.config)
        return True

    def include_identities_stream(self) -> bool:
        """Return the config's identities-stream setting, or False when no config is set."""
        if self.config:
            return include_identities_stream(self.config)
        return False

    @abstractmethod
    def upload(
        self, file: RemoteFile, local_directory: str, logger: logging.Logger
    ) -> Tuple[FileRecordData, AirbyteRecordMessageFileReference]:
        """
        This is required for connectors that will support writing to
        files. It will handle the logic to download,get,read,acquire or
        whatever is more efficient to get a file from the source.

        Args:
               file (RemoteFile): The remote file object containing URI and metadata.
               local_directory (str): The local directory path where the file will be downloaded.
               logger (logging.Logger): Logger for logging information and errors.

        Returns:
               Tuple[FileRecordData, AirbyteRecordMessageFileReference]: The record data
               describing the file, together with a file reference object containing:
                - staging_file_url (str): The absolute path to the referenced file in the staging area.
                - file_size_bytes (int): The size of the referenced file in bytes.
                - source_file_relative_path (str): The relative path to the referenced file in source.
        """
        ...

    def _get_file_transfer_paths(
        self,
        file: RemoteFile,
        local_directory: str,
        parse_file_path_from_uri: Optional[Callable[[str], str]] = None,
    ) -> MutableMapping[str, Any]:
        """
        Compute the paths used to store `file` under `local_directory`.

        Creates the parent directory of the local file path as a side effect.

        Args:
            file: The remote file to be transferred.
            local_directory: Directory the file will be written under.
            parse_file_path_from_uri: Optional callable turning `file.uri` into the
                file path; when omitted the URI itself is used as the path.

        Returns:
            A mapping keyed by FILE_RELATIVE_PATH, LOCAL_FILE_PATH, FILE_NAME,
            FILE_FOLDER and SOURCE_FILE_URI.
        """
        # Named differently from the method/imported function to avoid shadowing them.
        keep_directories = self.preserve_directory_structure()
        if parse_file_path_from_uri:
            file_path = parse_file_path_from_uri(file.uri)
        else:
            file_path = file.uri

        file_name = path.basename(file_path)
        file_folder = path.dirname(file_path)
        if keep_directories:
            # Remove left slashes from source path format to make relative path for writing locally
            file_relative_path = file_path.lstrip("/")
        else:
            file_relative_path = file_name
        local_file_path = path.join(local_directory, file_relative_path)
        # Ensure the local directory exists
        makedirs(path.dirname(local_file_path), exist_ok=True)

        return {
            self.FILE_RELATIVE_PATH: file_relative_path,
            self.LOCAL_FILE_PATH: local_file_path,
            self.FILE_NAME: file_name,
            self.FILE_FOLDER: file_folder,
            self.SOURCE_FILE_URI: file.uri,
        }
@@ -0,0 +1,22 @@
1
+ #
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from datetime import datetime
6
+ from typing import Optional
7
+
8
+ from pydantic.v1 import BaseModel
9
+
10
+
11
class FileRecordData(BaseModel):
    """
    Pydantic model for a single record of a file-based stream.

    `folder`, `filename` and `bytes` are required; the remaining fields are
    optional and default to None.
    """

    # Required fields.
    folder: str
    filename: str
    bytes: int

    # Optional fields.
    id: Optional[str] = None
    updated_at: Optional[str] = None
    mime_type: Optional[str] = None
@@ -0,0 +1,37 @@
1
+ from typing import Any, Mapping, Type
2
+
3
+ from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
4
+ from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
5
+ from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
6
+ from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
7
+ from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
8
+ from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
9
+
10
+ from .avro_parser import AvroParser
11
+ from .csv_parser import CsvParser
12
+ from .excel_parser import ExcelParser
13
+ from .file_transfer import FileTransfer
14
+ from .file_type_parser import FileTypeParser
15
+ from .jsonl_parser import JsonlParser
16
+ from .parquet_parser import ParquetParser
17
+ from .unstructured_parser import UnstructuredParser
18
+
19
# Maps each format config class to the parser instance that handles it.
default_parsers: Mapping[Type[Any], FileTypeParser] = {
    AvroFormat: AvroParser(),
    CsvFormat: CsvParser(),
    ExcelFormat: ExcelParser(),
    JsonlFormat: JsonlParser(),
    ParquetFormat: ParquetParser(),
    UnstructuredFormat: UnstructuredParser(),
}

# Names exported by this package.
__all__ = [
    "AvroParser",
    "CsvParser",
    "ExcelParser",
    "JsonlParser",
    "ParquetParser",
    "UnstructuredParser",
    "FileTransfer",
    "default_parsers",
]
@@ -0,0 +1,233 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, cast
7
+
8
+ import fastavro
9
+
10
+ from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
11
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
12
+ from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
13
+ from airbyte_cdk.sources.file_based.file_based_stream_reader import (
14
+ AbstractFileBasedStreamReader,
15
+ FileReadMode,
16
+ )
17
+ from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
18
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
19
+ from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
20
+
21
# Avro primitive type name -> JSON schema type name.
AVRO_TYPE_TO_JSON_TYPE = {
    "null": "null",
    "boolean": "boolean",
    "int": "integer",
    "long": "integer",
    "float": "number",
    "double": "string",  # double -> number conversions can lose precision
    "bytes": "string",
    "string": "string",
}

# Avro logical type name -> JSON schema fragment.
AVRO_LOGICAL_TYPE_TO_JSON = {
    "decimal": {"type": "string"},
    "uuid": {"type": "string"},
    "date": {"type": "string", "format": "date"},
    "time-millis": {"type": "integer"},
    "time-micros": {"type": "integer"},
    "timestamp-millis": {"type": "string", "format": "date-time"},
    "timestamp-micros": {"type": "string"},
    "local-timestamp-millis": {"type": "string", "format": "date-time"},
    "local-timestamp-micros": {"type": "string"},
    # fastavro does not support duration https://fastavro.readthedocs.io/en/latest/logical_types.html
}
44
+
45
+
46
class AvroParser(FileTypeParser):
    """File type parser that infers a schema from, and reads records out of, Avro files via fastavro."""

    # Passed as the `encoding` argument to `stream_reader.open_file`; files are opened
    # in binary mode (see `file_read_mode`).
    ENCODING = None

    def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
        """
        AvroParser does not require config checks, implicit pydantic validation is enough.
        """
        return True, None

    async def infer_schema(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> SchemaType:
        """
        Build a mapping of field name -> JSON schema fragment from the file's Avro writer schema.

        Raises:
            ValueError: if `config.format` is not an AvroFormat, if the top-level Avro
                schema is not a record, or if a field uses an unsupported type.
        """
        avro_format = config.format
        if not isinstance(avro_format, AvroFormat):
            raise ValueError(f"Expected AvroFormat, got {avro_format}")

        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
            avro_reader = fastavro.reader(fp)  # type: ignore [arg-type]
            avro_schema = avro_reader.writer_schema
        if not avro_schema["type"] == "record":  # type: ignore [index, call-overload]
            unsupported_type = avro_schema["type"]  # type: ignore [index, call-overload]
            raise ValueError(
                f"Only record based avro files are supported. Found {unsupported_type}"
            )
        json_schema = {
            field["name"]: AvroParser._convert_avro_type_to_json(  # type: ignore [index]
                avro_format,
                field["name"],  # type: ignore [index]
                field["type"],  # type: ignore [index]
            )
            for field in avro_schema["fields"]  # type: ignore [index, call-overload]
        }
        return json_schema

    @staticmethod
    def _decimal_pattern(precision: int, scale: int) -> str:
        """Return a regex for a decimal string with the given Avro precision and scale."""
        # Clamp the whole-digit bound to 1 so the quantifier stays valid ({1,0} is not a
        # legal regex quantifier) when precision == scale.
        whole_digits = max(precision - scale, 1)
        pattern = f"^-?\\d{{1,{whole_digits}}}"
        if scale > 0:
            # Fractional digits are optional; with scale 0 no fractional part is allowed.
            pattern += f"(?:\\.\\d{{1,{scale}}})?"
        return pattern + "$"

    @classmethod
    def _convert_avro_type_to_json(
        cls, avro_format: AvroFormat, field_name: str, avro_field: str
    ) -> Mapping[str, Any]:
        """
        Convert one Avro field type into a JSON schema fragment.

        `avro_field` is either a primitive type name (a string) or a mapping describing
        a record, array, enum, map, fixed or logical type; nested types are converted
        recursively. `field_name` is only used in error messages.

        Raises:
            ValueError: if the type is unsupported or a required attribute of a complex
                type is missing.
        """
        if isinstance(avro_field, str) and avro_field in AVRO_TYPE_TO_JSON_TYPE:
            # Legacy behavior to retain backwards compatibility. Long term we should always represent doubles as strings
            if avro_field == "double" and not avro_format.double_as_string:
                return {"type": "number"}
            return {"type": AVRO_TYPE_TO_JSON_TYPE[avro_field]}
        if isinstance(avro_field, Mapping):
            if avro_field["type"] == "record":
                return {
                    "type": "object",
                    "properties": {
                        object_field["name"]: AvroParser._convert_avro_type_to_json(
                            avro_format, object_field["name"], object_field["type"]
                        )
                        for object_field in avro_field["fields"]
                    },
                }
            elif avro_field["type"] == "array":
                if "items" not in avro_field:
                    raise ValueError(
                        f"{field_name} array type does not have a required field items"
                    )
                return {
                    "type": "array",
                    "items": AvroParser._convert_avro_type_to_json(
                        avro_format, "", avro_field["items"]
                    ),
                }
            elif avro_field["type"] == "enum":
                if "symbols" not in avro_field:
                    raise ValueError(
                        f"{field_name} enum type does not have a required field symbols"
                    )
                if "name" not in avro_field:
                    raise ValueError(f"{field_name} enum type does not have a required field name")
                return {"type": "string", "enum": avro_field["symbols"]}
            elif avro_field["type"] == "map":
                if "values" not in avro_field:
                    raise ValueError(f"{field_name} map type does not have a required field values")
                return {
                    "type": "object",
                    "additionalProperties": AvroParser._convert_avro_type_to_json(
                        avro_format, "", avro_field["values"]
                    ),
                }
            elif avro_field["type"] == "fixed" and avro_field.get("logicalType") != "duration":
                if "size" not in avro_field:
                    raise ValueError(f"{field_name} fixed type does not have a required field size")
                if not isinstance(avro_field["size"], int):
                    raise ValueError(f"{field_name} fixed type size value is not an integer")
                return {
                    "type": "string",
                    "pattern": f"^[0-9A-Fa-f]{{{avro_field['size'] * 2}}}$",
                }
            elif avro_field.get("logicalType") == "decimal":
                if "precision" not in avro_field:
                    raise ValueError(
                        f"{field_name} decimal type does not have a required field precision"
                    )
                if "scale" not in avro_field:
                    raise ValueError(
                        f"{field_name} decimal type does not have a required field scale"
                    )
                # Validation for at least one whole number and optional fractional numbers.
                # For example: ^-?\d{1,5}(?:\.\d{1,3})?$ would accept 12345.123 and 123456.12345 would be rejected
                return {
                    "type": "string",
                    "pattern": AvroParser._decimal_pattern(
                        avro_field["precision"], avro_field["scale"]
                    ),
                }
            elif "logicalType" in avro_field:
                if avro_field["logicalType"] not in AVRO_LOGICAL_TYPE_TO_JSON:
                    raise ValueError(
                        f"{avro_field['logicalType']} is not a valid Avro logical type"
                    )
                return AVRO_LOGICAL_TYPE_TO_JSON[avro_field["logicalType"]]
            else:
                raise ValueError(f"Unsupported avro type: {avro_field}")
        else:
            raise ValueError(f"Unsupported avro type: {avro_field}")

    def parse_records(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        discovered_schema: Optional[Mapping[str, SchemaType]],
    ) -> Iterable[Dict[str, Any]]:
        """
        Lazily yield one dict per Avro record, with values converted by `_to_output_value`.

        Only the fields declared in the file's writer schema are emitted.

        Raises:
            ValueError: if `config.format` is set but is not an AvroFormat.
            RecordParseError: wrapping any exception raised while opening or reading
                the file; `lineno` is the 1-based index of the last record read.
        """
        avro_format = config.format or AvroFormat(filetype="avro")
        if not isinstance(avro_format, AvroFormat):
            raise ValueError(f"Expected AvroFormat, got {avro_format}")

        line_no = 0
        try:
            with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
                avro_reader = fastavro.reader(fp)  # type: ignore [arg-type]
                schema = avro_reader.writer_schema
                schema_field_name_to_type = {
                    field["name"]: cast(dict[str, Any], field["type"])  # type: ignore [index]
                    for field in schema["fields"]  # type: ignore [index, call-overload]  # If schema is not dict, it is not subscriptable by strings
                }
                for record in avro_reader:
                    line_no += 1
                    yield {
                        field_name: self._to_output_value(
                            avro_format,
                            field_type,
                            record[field_name],  # type: ignore [index]  # Any not subscriptable
                        )
                        for field_name, field_type in schema_field_name_to_type.items()
                    }
        except Exception as exc:
            raise RecordParseError(
                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no
            ) from exc

    @property
    def file_read_mode(self) -> FileReadMode:
        """Avro files are opened in binary mode."""
        return FileReadMode.READ_BINARY

    @staticmethod
    def _to_output_value(
        avro_format: AvroFormat, record_type: Mapping[str, Any], record_value: Any
    ) -> Any:
        """
        Convert a value read from an Avro record into its output representation.

        Bytes are decoded to str; doubles become strings when `double_as_string` is
        set; decimal/uuid logical types are stringified; date and timestamp logical
        types are rendered with `isoformat`. Anything else is returned unchanged.
        """
        if isinstance(record_value, bytes):
            return record_value.decode()
        if not isinstance(record_type, Mapping):
            # Primitive types are described by a plain type name rather than a mapping.
            if record_type == "double" and avro_format.double_as_string:
                return str(record_value)
            return record_value

        logical_type = record_type.get("logicalType")
        if logical_type in ("decimal", "uuid"):
            return str(record_value)
        if logical_type == "date":
            return record_value.isoformat()
        if logical_type in ("timestamp-millis", "local-timestamp-millis"):
            return record_value.isoformat(sep="T", timespec="milliseconds")
        if logical_type in ("timestamp-micros", "local-timestamp-micros"):
            return record_value.isoformat(sep="T", timespec="microseconds")
        return record_value