airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368)
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,396 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import asyncio
6
+ import itertools
7
+ import traceback
8
+ from collections import defaultdict
9
+ from copy import deepcopy
10
+ from functools import cache
11
+ from os import path
12
+ from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
13
+
14
+ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, FailureType, Level
15
+ from airbyte_cdk.models import Type as MessageType
16
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
17
+ from airbyte_cdk.sources.file_based.exceptions import (
18
+ DuplicatedFilesError,
19
+ FileBasedSourceError,
20
+ InvalidSchemaError,
21
+ MissingSchemaError,
22
+ RecordParseError,
23
+ SchemaInferenceError,
24
+ StopSyncPerValidationPolicy,
25
+ )
26
+ from airbyte_cdk.sources.file_based.file_types import FileTransfer
27
+ from airbyte_cdk.sources.file_based.remote_file import RemoteFile
28
+ from airbyte_cdk.sources.file_based.schema_helpers import (
29
+ SchemaType,
30
+ file_transfer_schema,
31
+ merge_schemas,
32
+ schemaless_schema,
33
+ )
34
+ from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
35
+ from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
36
+ from airbyte_cdk.sources.file_based.types import StreamSlice
37
+ from airbyte_cdk.sources.streams import IncrementalMixin
38
+ from airbyte_cdk.sources.streams.core import JsonSchema
39
+ from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
40
+ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
41
+
42
+
43
+ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
44
+ """
45
+ The default file-based stream.
46
+ """
47
+
48
+ FILE_TRANSFER_KW = "use_file_transfer"
49
+ PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
50
+ FILES_KEY = "files"
51
+ DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
52
+ ab_last_mod_col = "_ab_source_file_last_modified"
53
+ ab_file_name_col = "_ab_source_file_url"
54
+ modified = "modified"
55
+ source_file_url = "source_file_url"
56
+ airbyte_columns = [ab_last_mod_col, ab_file_name_col]
57
+ use_file_transfer = False
58
+ preserve_directory_structure = True
59
+ _file_transfer = FileTransfer()
60
+
61
def __init__(self, **kwargs: Any):
    """
    Consume the stream-specific keyword options and initialise the parent.

    ``use_file_transfer`` and ``preserve_directory_structure`` are only
    assigned on the instance when the matching keyword was actually passed;
    otherwise the class-level defaults stay in effect. Whatever remains in
    ``kwargs`` is forwarded to the parent initialiser.
    """
    option_to_attribute = (
        (self.FILE_TRANSFER_KW, "use_file_transfer"),
        (self.PRESERVE_DIRECTORY_STRUCTURE_KW, "preserve_directory_structure"),
    )
    for option, attribute in option_to_attribute:
        # Pop so the parent initialiser never sees these options.
        if option in kwargs:
            setattr(self, attribute, kwargs.pop(option))
    super().__init__(**kwargs)
69
+
70
@property
def state(self) -> MutableMapping[str, Any]:
    """Return whatever the stream's cursor reports as its state."""
    active_cursor = self._cursor
    return active_cursor.get_state()

@state.setter
def state(self, value: MutableMapping[str, Any]) -> None:
    """Pass ``value`` (a state as produced by the getter) to the cursor as its initial state."""
    active_cursor = self._cursor
    active_cursor.set_initial_state(value)
78
+
79
@property  # type: ignore # mypy complains wrong type, but AbstractFileBasedCursor is parent of file-based cursors
def cursor(self) -> Optional[AbstractFileBasedCursor]:
    """Return the cursor currently attached to this stream (may be ``None``)."""
    return self._cursor

@cursor.setter
def cursor(self, value: AbstractFileBasedCursor) -> None:
    """
    Attach ``value`` as the stream's cursor.

    Raises:
        RuntimeError: if a cursor has already been attached; the cursor can
            only be assigned while the current one is ``None``.
    """
    if self._cursor is None:
        self._cursor = value
        return
    raise RuntimeError(
        f"Cursor for stream {self.name} is already set. This is unexpected. Please contact Support."
    )
90
+
91
@property
def primary_key(self) -> PrimaryKeyType:
    """
    Return the primary key for this stream.

    A truthy ``primary_key`` on the stream config wins; otherwise the key
    defined by the stream's parser for this config is returned.
    """
    configured_key = self.config.primary_key
    if configured_key:
        return configured_key
    return self.get_parser().get_parser_defined_primary_key(self.config)
96
+
97
def _duplicated_files_names(
    self, slices: List[dict[str, List[RemoteFile]]]
) -> List[dict[str, List[str]]]:
    """
    Find file base names that occur under more than one URI.

    Every file listed under ``FILES_KEY`` in each slice is bucketed by the
    base name of its ``uri``. The result holds one single-entry mapping
    ``{base_name: [uri, ...]}`` per base name that was seen more than once,
    in first-seen order; it is empty when no base name repeats.
    """
    uris_by_base_name: Dict[str, List[str]] = defaultdict(list)
    every_uri = (
        remote.uri for file_slice in slices for remote in file_slice[self.FILES_KEY]
    )
    for uri in every_uri:
        uris_by_base_name[path.basename(uri)].append(uri)

    duplicates = []
    for base_name, uris in uris_by_base_name.items():
        if len(uris) > 1:
            duplicates.append({base_name: uris})
    return duplicates
108
+
109
def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
    """
    Build the slices for this sync, one per distinct ``last_modified`` value.

    The files the cursor selects from ``list_files()`` are ordered by
    ``(last_modified, uri)`` and grouped by ``last_modified``; each slice
    maps ``FILES_KEY`` to the files of one group.

    Raises:
        DuplicatedFilesError: when ``preserve_directory_structure`` is
            falsy and ``_duplicated_files_names`` reports any entries for
            the computed slices.
    """
    listed_files = self.list_files()
    selected_files = self._cursor.get_files_to_sync(listed_files, self.logger)
    ordered_files = sorted(
        selected_files, key=lambda remote: (remote.last_modified, remote.uri)
    )

    slices = []
    # groupby relies on the sort above: equal timestamps are adjacent.
    for _, same_timestamp in itertools.groupby(
        ordered_files, lambda remote: remote.last_modified
    ):
        slices.append({self.FILES_KEY: list(same_timestamp)})

    # The duplicate check only runs when directory structure is not preserved.
    if not slices or self.preserve_directory_structure:
        return slices

    clashes = self._duplicated_files_names(slices)
    if clashes:
        raise DuplicatedFilesError(stream=self.name, duplicated_files_names=clashes)
    return slices
125
+
126
def transform_record(
    self, record: dict[str, Any], file: RemoteFile, last_updated: str
) -> dict[str, Any]:
    """
    Stamp ``record`` with the source-file columns and return it.

    ``record`` is modified in place: ``ab_last_mod_col`` receives
    ``last_updated`` and ``ab_file_name_col`` receives ``file.uri``. The
    returned object is the same mapping that was passed in.
    """
    stamped = record  # same object; the caller's mapping is updated in place
    stamped[self.ab_last_mod_col] = last_updated
    stamped[self.ab_file_name_col] = file.uri
    return stamped
133
+
134
def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]:
    """
    Yield the Airbyte messages produced by every remote file listed in ``stream_slice``.

    In file-transfer mode each file is uploaded and one record (with its file reference) is
    yielded per upload result; otherwise the file is parsed and one record message is yielded
    per record that is accepted.

    If an error is encountered reading records from a file, a log message is produced and the
    rest of that file is not synced.

    :raises MissingSchemaError: if no catalog schema is available.
    :raises AirbyteTracedException: re-raised unchanged to abort the sync.
    """
    schema = self.catalog_schema
    if schema is None:
        # On read requests we should always have the catalog available
        raise MissingSchemaError(FileBasedSourceError.MISSING_SCHEMA, stream=self.name)
    # The stream only supports a single file type, so we can use the same parser for all files
    parser = self.get_parser()
    # NOTE(review): the slice is read with the literal "files" key while compute_slices builds it
    # with self.FILES_KEY — confirm both are the same value.
    for file in stream_slice["files"]:
        # only serialize the datetime once
        file_datetime_string = file.last_modified.strftime(self.DATE_TIME_FORMAT)
        # n_skipped: records rejected by the validation policy; line_no: records returned by the parser so far
        n_skipped = line_no = 0

        try:
            if self.use_file_transfer:
                for file_record_data, file_reference in self._file_transfer.upload(
                    file=file, stream_reader=self.stream_reader, logger=self.logger
                ):
                    yield stream_data_to_airbyte_message(
                        self.name,
                        file_record_data.dict(exclude_none=True),
                        file_reference=file_reference,
                    )
            else:
                for record in parser.parse_records(
                    self.config, file, self.stream_reader, self.logger, schema
                ):
                    line_no += 1
                    if self.config.schemaless:
                        # Schemaless streams wrap the whole parsed record under a single "data" key
                        record = {"data": record}
                    elif not self.record_passes_validation_policy(record):
                        n_skipped += 1
                        continue
                    record = self.transform_record(record, file, file_datetime_string)
                    yield stream_data_to_airbyte_message(self.name, record)
            # Only reached when the file was processed without raising
            self._cursor.add_file(file)

        except StopSyncPerValidationPolicy:
            yield AirbyteMessage(
                type=MessageType.LOG,
                log=AirbyteLogMessage(
                    level=Level.WARN,
                    message=f"Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream={self.name} file={file.uri} validation_policy={self.config.validation_policy.value} n_skipped={n_skipped}",
                ),
            )
            # Leave the loop: the remaining files of this slice are not read
            break

        except RecordParseError:
            # Increment line_no because the exception was raised before we could increment it
            line_no += 1
            # The error is handed to the errors collector instead of being yielded; the loop moves on to the next file
            self.errors_collector.collect(
                AirbyteMessage(
                    type=MessageType.LOG,
                    log=AirbyteLogMessage(
                        level=Level.ERROR,
                        message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
                        stack_trace=traceback.format_exc(),
                    ),
                ),
            )

        except AirbyteTracedException as exc:
            # Re-raise the exception to stop the whole sync immediately as this is a fatal error
            raise exc

        except Exception:
            # Any other failure is reported as an ERROR log message and the loop moves on to the next file
            yield AirbyteMessage(
                type=MessageType.LOG,
                log=AirbyteLogMessage(
                    level=Level.ERROR,
                    message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={file.uri} line_no={line_no} n_skipped={n_skipped}",
                    stack_trace=traceback.format_exc(),
                ),
            )

        finally:
            # Runs on every exit path of the try block: report how many records were skipped for this file
            if n_skipped:
                yield AirbyteMessage(
                    type=MessageType.LOG,
                    log=AirbyteLogMessage(
                        level=Level.WARN,
                        message=f"Records in file did not pass validation policy. stream={self.name} file={file.uri} n_skipped={n_skipped} validation_policy={self.validation_policy.name}",
                    ),
                )
223
+
224
@property
def cursor_field(self) -> Union[str, List[str]]:
    """
    Name of the record field this stream uses as its cursor.

    :return: the column name stored in ``ab_last_mod_col``.
    """
    last_modified_column = self.ab_last_mod_col
    return last_modified_column
231
+
232
@cache
def get_json_schema(self) -> JsonSchema:
    """
    Return the stream's JSON schema; the result is memoized by ``@cache``.

    In file-transfer mode the fixed file-transfer schema is returned. Otherwise the raw schema
    from ``_get_raw_json_schema`` is extended with the two string-typed metadata columns;
    properties of the raw schema take precedence over the metadata columns on a name clash.

    :raises AirbyteTracedException: config error wrapping an ``InvalidSchemaError``, or any
        traced exception raised while resolving the raw schema.
    :raises SchemaInferenceError: for any other failure while resolving the raw schema.
    """
    if self.use_file_transfer:
        return file_transfer_schema

    properties = {
        self.ab_last_mod_col: {"type": "string"},
        self.ab_file_name_col: {"type": "string"},
    }
    try:
        raw_schema = self._get_raw_json_schema()
    except InvalidSchemaError as config_exception:
        raise AirbyteTracedException(
            internal_message="Please check the logged errors for more information.",
            message=FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value,
            exception=AirbyteTracedException(exception=config_exception),
            failure_type=FailureType.config_error,
        )
    except AirbyteTracedException as traced:
        raise traced
    except Exception as exc:
        raise SchemaInferenceError(
            FileBasedSourceError.SCHEMA_INFERENCE_ERROR, stream=self.name
        ) from exc

    # Every handler above raises, so this point is reached only when the raw schema was resolved.
    properties.update(raw_schema["properties"])
    return {"type": "object", "properties": properties}
257
+
258
+ def _get_raw_json_schema(self) -> JsonSchema:
259
+ if self.config.input_schema:
260
+ return self.config.get_input_schema() # type: ignore
261
+ elif self.config.schemaless:
262
+ return schemaless_schema
263
+ else:
264
+ files = self.list_files()
265
+ first_n_files = len(files)
266
+
267
+ if self.config.recent_n_files_to_read_for_schema_discovery:
268
+ self.logger.info(
269
+ msg=(
270
+ f"Only first {self.config.recent_n_files_to_read_for_schema_discovery} files will be used to infer schema "
271
+ f"for stream {self.name} due to limitation in config."
272
+ )
273
+ )
274
+ first_n_files = self.config.recent_n_files_to_read_for_schema_discovery
275
+
276
+ if first_n_files == 0:
277
+ self.logger.warning(
278
+ msg=f"No files were identified in the stream {self.name}. Setting default schema for the stream."
279
+ )
280
+ return schemaless_schema
281
+
282
+ max_n_files_for_schema_inference = (
283
+ self._discovery_policy.get_max_n_files_for_schema_inference(self.get_parser())
284
+ )
285
+
286
+ if first_n_files > max_n_files_for_schema_inference:
287
+ # Use the most recent files for schema inference, so we pick up schema changes during discovery.
288
+ self.logger.warning(
289
+ msg=f"Refusing to infer schema for {first_n_files} files; using {max_n_files_for_schema_inference} files."
290
+ )
291
+ first_n_files = max_n_files_for_schema_inference
292
+
293
+ files = sorted(files, key=lambda x: x.last_modified, reverse=True)[:first_n_files]
294
+
295
+ inferred_schema = self.infer_schema(files)
296
+
297
+ if not inferred_schema:
298
+ raise InvalidSchemaError(
299
+ FileBasedSourceError.INVALID_SCHEMA_ERROR,
300
+ details=f"Empty schema. Please check that the files are valid for format {self.config.format}",
301
+ stream=self.name,
302
+ )
303
+
304
+ schema = {"type": "object", "properties": inferred_schema}
305
+
306
+ return schema
307
+
308
def get_files(self) -> Iterable[RemoteFile]:
    """
    Ask the stream reader for the files matching the stream's globs.

    A missing or empty glob configuration is passed to the reader as an empty list.
    """
    find_matching_files = self.stream_reader.get_matching_files
    glob_patterns = self.config.globs or []
    return find_matching_files(glob_patterns, self.config.legacy_prefix, self.logger)
315
+
316
def as_airbyte_stream(self) -> AirbyteStream:
    """Build the parent's ``AirbyteStream`` and set its ``is_file_based`` flag from ``use_file_transfer``."""
    airbyte_stream = super().as_airbyte_stream()
    file_based = self.use_file_transfer
    airbyte_stream.is_file_based = file_based
    return airbyte_stream
320
+
321
def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
    """
    Synchronously infer the schema of ``files`` and make every declared type nullable.

    Runs the ``_infer_schema`` coroutine to completion on the current event loop. When the
    calling thread has no usable loop (none set, or the current one is closed), a new loop is
    created and installed as the thread's current loop.

    :param files: the files to inspect.
    :return: a deep copy of the inferred schema with ``"null"`` added to each ``type``.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # Raised in non-main threads, and on newer Python versions whenever no loop has been set.
        loop = None
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    schema = loop.run_until_complete(self._infer_schema(files))
    # as infer schema returns a Mapping that is assumed to be immutable, we need to create a deepcopy to avoid modifying the reference
    return self._fill_nulls(deepcopy(schema))
326
+
327
+ @staticmethod
328
+ def _fill_nulls(schema: Mapping[str, Any]) -> Mapping[str, Any]:
329
+ if isinstance(schema, dict):
330
+ for k, v in schema.items():
331
+ if k == "type":
332
+ if isinstance(v, list):
333
+ if "null" not in v:
334
+ schema[k] = ["null"] + v
335
+ elif v != "null":
336
+ if isinstance(v, (str, list)):
337
+ schema[k] = ["null", v]
338
+ else:
339
+ DefaultFileBasedStream._fill_nulls(v)
340
+ else:
341
+ DefaultFileBasedStream._fill_nulls(v)
342
+ elif isinstance(schema, list):
343
+ for item in schema:
344
+ DefaultFileBasedStream._fill_nulls(item)
345
+ return schema
346
+
347
async def _infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
    """
    Infer the schema for a stream.

    One inference task is started per file and the per-file schemas are merged into a single
    schema as the tasks complete. At most ``n_concurrent_requests`` tasks (as configured on the
    discovery policy, with a floor of one) are in flight at any time.

    A traced exception raised by a task is propagated; any other task failure is logged and
    that file's schema is left out of the result.
    """
    base_schema: SchemaType = {}
    pending_tasks: Set[asyncio.tasks.Task[SchemaType]] = set()

    # Floor of one so that a non-positive setting cannot prevent any task from being scheduled.
    max_in_flight = max(1, self._discovery_policy.n_concurrent_requests)

    n_started, n_files = 0, len(files)
    files_iterator = iter(files)
    while pending_tasks or n_started < n_files:
        # Strictly below the limit: scheduling while `len == limit` would run limit + 1 tasks at once.
        while len(pending_tasks) < max_in_flight and (
            file := next(files_iterator, None)
        ):
            pending_tasks.add(asyncio.create_task(self._infer_file_schema(file)))
            n_started += 1
        # Return when the first task is completed so that we can enqueue a new task as soon as the
        # number of concurrent tasks drops below the number allowed.
        done, pending_tasks = await asyncio.wait(
            pending_tasks, return_when=asyncio.FIRST_COMPLETED
        )
        for task in done:
            try:
                base_schema = merge_schemas(base_schema, task.result())
            except AirbyteTracedException as ate:
                raise ate
            except Exception as exc:
                self.logger.error(
                    f"An error occurred inferring the schema. \n {traceback.format_exc()}",
                    exc_info=exc,
                )

    return base_schema
382
+
383
async def _infer_file_schema(self, file: RemoteFile) -> SchemaType:
    """
    Infer the schema of a single file with the stream's parser.

    :raises AirbyteTracedException: propagated unchanged from the parser.
    :raises SchemaInferenceError: wraps any other exception, carrying the file URI, the
        configured format and the stream name.
    """
    try:
        file_schema = await self.get_parser().infer_schema(
            self.config, file, self.stream_reader, self.logger
        )
    except AirbyteTracedException as traced:
        raise traced
    except Exception as exc:
        raise SchemaInferenceError(
            FileBasedSourceError.SCHEMA_INFERENCE_ERROR,
            file=file.uri,
            format=str(self.config.format),
            stream=self.name,
        ) from exc
    return file_schema
@@ -0,0 +1,49 @@
1
+ #
2
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from functools import cache
6
+ from typing import Any, Dict, Iterable, Mapping, MutableMapping, Optional
7
+
8
+ from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
9
+ from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
10
+ from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector
11
+ from airbyte_cdk.sources.file_based.file_based_stream_permissions_reader import (
12
+ AbstractFileBasedStreamPermissionsReader,
13
+ )
14
+ from airbyte_cdk.sources.streams.core import JsonSchema
15
+ from airbyte_cdk.sources.streams.permissions.identities_stream import IdentitiesStream
16
+
17
+
18
class FileIdentitiesStream(IdentitiesStream):
    """
    The identities stream: a full refresh stream that syncs identities from a certain domain.
    The stream permissions reader manages the logic to get such data, which is implemented on
    the connector side.
    """

    is_resumable = False

    def __init__(
        self,
        catalog_schema: Optional[Mapping[str, Any]],
        stream_permissions_reader: AbstractFileBasedStreamPermissionsReader,
        discovery_policy: AbstractDiscoveryPolicy,
        errors_collector: FileBasedErrorsCollector,
    ) -> None:
        """Store the collaborators of the stream and start with an empty cursor mapping."""
        super().__init__()
        self._cursor: MutableMapping[str, Any] = {}
        self._discovery_policy = discovery_policy
        self.catalog_schema = catalog_schema
        self.errors_collector = errors_collector
        self.stream_permissions_reader = stream_permissions_reader

    @property
    def primary_key(self) -> PrimaryKeyType:
        """This stream declares no primary key."""
        return None

    def load_identity_groups(self) -> Iterable[Dict[str, Any]]:
        """Delegate to the permissions reader, handing it this stream's logger."""
        permissions_reader = self.stream_permissions_reader
        return permissions_reader.load_identity_groups(logger=self.logger)

    @cache
    def get_json_schema(self) -> JsonSchema:
        """Return the identities schema exposed by the permissions reader (memoized by ``@cache``)."""
        permissions_reader = self.stream_permissions_reader
        return permissions_reader.identities_schema
@@ -0,0 +1,92 @@
1
+ #
2
+ # Copyright (c) 2025 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import traceback
6
+ from typing import Any, Dict, Iterable
7
+
8
+ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
9
+ from airbyte_cdk.models import Type as MessageType
10
+ from airbyte_cdk.sources.file_based.file_based_stream_permissions_reader import (
11
+ AbstractFileBasedStreamPermissionsReader,
12
+ )
13
+ from airbyte_cdk.sources.file_based.stream import DefaultFileBasedStream
14
+ from airbyte_cdk.sources.file_based.types import StreamSlice
15
+ from airbyte_cdk.sources.streams.core import JsonSchema
16
+ from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
17
+
18
+
19
class PermissionsFileBasedStream(DefaultFileBasedStream):
    """
    A specialized stream for handling file-based ACL permissions.

    For each file of a slice, the stream asks the permissions reader for the file's ACL
    permissions, stamps the resulting record with the file metadata columns and emits it as a
    record message.

    The permissions reader owns permission retrieval and the schema definition; this class only
    provides the streaming interface.
    """

    def __init__(
        self, stream_permissions_reader: AbstractFileBasedStreamPermissionsReader, **kwargs: Any
    ):
        """Forward ``kwargs`` to the parent stream and keep the permissions reader."""
        super().__init__(**kwargs)
        self.stream_permissions_reader = stream_permissions_reader

    def _filter_schema_invalid_properties(
        self, configured_catalog_json_schema: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Ignore the configured catalog schema and return the reader's file permissions schema."""
        permissions_reader = self.stream_permissions_reader
        return permissions_reader.file_permissions_schema

    def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]:
        """
        Yield one permissions record per remote file of the slice.

        A file for which the reader returns no permissions produces a warning (logged and
        emitted as a WARN message) instead of a record; a failure while handling a file is
        logged and emitted as an ERROR message, and the next file is processed.
        """

        for remote_file in stream_slice["files"]:
            missing_permissions = False
            last_modified_string = remote_file.last_modified.strftime(self.DATE_TIME_FORMAT)
            try:
                acl_record = self.stream_permissions_reader.get_file_acl_permissions(
                    remote_file, logger=self.logger
                )
                if acl_record:
                    acl_record = self.transform_record(
                        acl_record, remote_file, last_modified_string
                    )
                    yield stream_data_to_airbyte_message(self.name, acl_record)
                else:
                    # The flag is raised before logging so the `finally` clause still emits the message
                    missing_permissions = True
                    self.logger.warning(
                        f"Unable to fetch permissions. stream={self.name} file={remote_file.uri}"
                    )
            except Exception as error:
                self.logger.error(
                    f"Failed to retrieve permissions for file {remote_file.uri}: {error!s}"
                )
                yield AirbyteMessage(
                    type=MessageType.LOG,
                    log=AirbyteLogMessage(
                        level=Level.ERROR,
                        message=f"Error retrieving files permissions: stream={self.name} file={remote_file.uri}",
                        stack_trace=traceback.format_exc(),
                    ),
                )
            finally:
                if missing_permissions:
                    yield AirbyteMessage(
                        type=MessageType.LOG,
                        log=AirbyteLogMessage(
                            level=Level.WARN,
                            message=f"Unable to fetch permissions. stream={self.name} file={remote_file.uri}",
                        ),
                    )

    def _get_raw_json_schema(self) -> JsonSchema:
        """
        Retrieve the raw JSON schema for file permissions from the permissions reader.

        Returns:
            The reader's ``file_permissions_schema``.
        """
        permissions_reader = self.stream_permissions_reader
        return permissions_reader.file_permissions_schema
@@ -0,0 +1,10 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from __future__ import annotations
6
+
7
+ from typing import Any, Mapping, MutableMapping
8
+
9
# Type alias: a stream slice is annotated as a read-only string-keyed mapping.
StreamSlice = Mapping[str, Any]
# Type alias: a mutable string-keyed mapping (presumably the per-stream state, going by the name).
StreamState = MutableMapping[str, Any]
@@ -0,0 +1,10 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
# The goal of this variable is to make an implicit dependency explicit. As part of the Concurrent CDK work, we are facing a situation
# where the connection pool size is too small to serve all the threads (see https://github.com/airbytehq/airbyte/issues/32072). In
# order to fix that, we will increase the requests library pool_maxsize. As there are many pieces of code that set a requests.Session, we
# are creating this variable here so that a change in one affects the other. This can be removed once we merge how we do HTTP requests in
# one piece of code or once we make the connection pool size configurable for each piece of code.
MAX_CONNECTION_POOL_SIZE = 20
@@ -0,0 +1,55 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from typing import Optional, Union
6
+
7
+ import requests
8
+
9
+ from airbyte_cdk.sources.message import LogMessage
10
+
11
+
12
def format_http_message(
    response: requests.Response,
    title: str,
    description: str,
    stream_name: Optional[str],
    is_auxiliary: bool | None = None,
    type: Optional[str] = None,
) -> LogMessage:
    """
    Build a structured, debug-level log message describing an HTTP exchange.

    :param response: the response to describe; its originating request is read from ``response.request``.
    :param title: stored under ``http.title``.
    :param description: stored under ``http.description``.
    :param stream_name: when truthy, stored under ``airbyte_cdk.stream.name``.
    :param is_auxiliary: when not ``None``, stored under ``http.is_auxiliary``.
    :param type: stored under ``http.type``; a falsy value is replaced by ``"HTTP"``.
    :return: the log message as a nested dictionary.
    """
    request = response.request
    request_section = {
        "method": request.method,
        "body": {
            "content": _normalize_body_string(request.body),
        },
        "headers": dict(request.headers),
    }
    response_section = {
        "body": {
            "content": response.text,
        },
        "headers": dict(response.headers),
        "status_code": response.status_code,
    }
    http_section = {
        "title": title,
        "type": type or "HTTP",
        "description": description,
        "request": request_section,
        "response": response_section,
    }
    if is_auxiliary is not None:
        http_section["is_auxiliary"] = is_auxiliary

    log_message = {
        "http": http_section,
        "log": {
            "level": "debug",
        },
        "url": {"full": request.url},
    }
    if stream_name:
        log_message["airbyte_cdk"] = {"stream": {"name": stream_name}}
    return log_message  # type: ignore[return-value]  # got "dict[str, object]", expected "dict[str, JsonType]"
52
+
53
+
54
+ def _normalize_body_string(body_str: Optional[Union[str, bytes]]) -> Optional[str]:
55
+ return body_str.decode() if isinstance(body_str, (bytes, bytearray)) else body_str
@@ -0,0 +1,19 @@
1
+ #
2
+ # Copyright (c) 2021 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from .repository import (
6
+ InMemoryMessageRepository,
7
+ LogAppenderMessageRepositoryDecorator,
8
+ LogMessage,
9
+ MessageRepository,
10
+ NoopMessageRepository,
11
+ )
12
+
13
# Public names of this package: the names imported from `.repository` above.
__all__ = [
    "InMemoryMessageRepository",
    "LogAppenderMessageRepositoryDecorator",
    "LogMessage",
    "MessageRepository",
    "NoopMessageRepository",
]