airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368) hide show
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,255 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+ import logging
5
+ from typing import Dict, Iterable, List, Optional, Set
6
+
7
+ from airbyte_cdk.exception_handler import generate_failed_streams_error_message
8
+ from airbyte_cdk.models import AirbyteMessage, AirbyteStreamStatus, FailureType, StreamDescriptor
9
+ from airbyte_cdk.models import Type as MessageType
10
+ from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import (
11
+ PartitionGenerationCompletedSentinel,
12
+ )
13
+ from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException
14
+ from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
15
+ from airbyte_cdk.sources.message import MessageRepository
16
+ from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
17
+ from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
18
+ from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
19
+ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
20
+ from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel
21
+ from airbyte_cdk.sources.types import Record
22
+ from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
23
+ from airbyte_cdk.sources.utils.slice_logger import SliceLogger
24
+ from airbyte_cdk.utils import AirbyteTracedException
25
+ from airbyte_cdk.utils.stream_status_utils import (
26
+ as_airbyte_message as stream_status_as_airbyte_message,
27
+ )
28
+
29
+
30
+ class ConcurrentReadProcessor:
31
def __init__(
    self,
    stream_instances_to_read_from: List[AbstractStream],
    partition_enqueuer: PartitionEnqueuer,
    thread_pool_manager: ThreadPoolManager,
    logger: logging.Logger,
    slice_logger: SliceLogger,
    message_repository: MessageRepository,
    partition_reader: PartitionReader,
):
    """
    Set up the bookkeeping used to handle items produced by a concurrent stream read.

    :param stream_instances_to_read_from: List of streams to read from
    :param partition_enqueuer: PartitionEnqueuer instance
    :param thread_pool_manager: ThreadPoolManager instance
    :param logger: Logger instance
    :param slice_logger: SliceLogger instance
    :param message_repository: MessageRepository instance
    :param partition_reader: PartitionReader instance
    """
    # Collaborators are stored as-is; nothing is started from the constructor.
    self._thread_pool_manager = thread_pool_manager
    self._partition_enqueuer = partition_enqueuer
    self._partition_reader = partition_reader
    self._message_repository = message_repository
    self._logger = logger
    self._slice_logger = slice_logger

    # Per-stream state, keyed by stream name: the stream itself, the number of
    # records seen so far (starts at 0) and the partitions still in flight.
    self._stream_name_to_instance = {
        stream.name: stream for stream in stream_instances_to_read_from
    }
    self._record_counter = {stream.name: 0 for stream in stream_instances_to_read_from}
    self._streams_to_running_partitions: Dict[str, Set[Partition]] = {
        stream.name: set() for stream in stream_instances_to_read_from
    }

    # NOTE: this keeps a reference to the very list object passed by the caller
    # (no copy is made).
    self._stream_instances_to_start_partition_generation = stream_instances_to_read_from
    self._streams_currently_generating_partitions: List[str] = []
    self._streams_done: Set[str] = set()
    self._exceptions_per_stream_name: dict[str, List[Exception]] = {}
67
+
68
def on_partition_generation_completed(
    self, sentinel: PartitionGenerationCompletedSentinel
) -> Iterable[AirbyteMessage]:
    """
    Handle the sentinel signalling that a stream finished generating its partitions.

    1. Remove the stream from the list of streams currently generating partitions
    2. If the stream is done, mark it as such and yield the resulting messages
    3. If there are more streams to read from, start the next partition generator

    :param sentinel: sentinel whose ``stream.name`` identifies the stream that finished
        generating partitions
    :return: the messages produced by ``_on_stream_is_done`` (when the stream is done),
        followed by the message returned by ``start_next_partition_generator`` if any
    :raises ValueError: if the stream is not currently registered as generating partitions
        (raised by ``list.remove``)
    """
    stream_name = sentinel.stream.name
    self._streams_currently_generating_partitions.remove(stream_name)

    # It is possible for the stream to already be done if no partitions were generated.
    # If partition generation completed and no partitions are left to process, the
    # stream is done.
    if self._is_stream_done(stream_name) or not self._streams_to_running_partitions[stream_name]:
        yield from self._on_stream_is_done(stream_name)

    if self._stream_instances_to_start_partition_generation:
        # The generator start may not produce a message; never leak ``None`` into a
        # stream that is declared to contain only AirbyteMessage instances.
        next_stream_message = self.start_next_partition_generator()
        if next_stream_message is not None:
            yield next_stream_message
88
+
89
def on_partition(self, partition: Partition) -> None:
    """
    Register a freshly generated partition and schedule it for reading.

    1. Add the partition to the set of running partitions of its stream
    2. Emit a slice log message through the message repository when the slice logger
       says slices should be logged
    3. Submit the partition to the thread pool manager for processing

    :param partition: the partition that was generated
    """
    running_partitions = self._streams_to_running_partitions[partition.stream_name()]
    running_partitions.add(partition)

    slice_logger = self._slice_logger
    if slice_logger.should_log_slice_message(self._logger):
        slice_log_message = slice_logger.create_slice_log_message(partition.to_slice())
        self._message_repository.emit_message(slice_log_message)

    # The reader's process_partition callable is handed over together with the partition.
    self._thread_pool_manager.submit(self._partition_reader.process_partition, partition)
103
+
104
def on_partition_complete_sentinel(
    self, sentinel: PartitionCompleteSentinel
) -> Iterable[AirbyteMessage]:
    """
    Handle the signal that a partition has been fully read.
    1. Close the partition on the stream's cursor when the read was successful
    2. If closing fails, record the exception for the stream and emit a sanitized trace message
    3. Stop tracking the partition; if it was the last one and generation is over, close the stream
    4. Emit messages that were added to the message repository
    """
    completed = sentinel.partition

    try:
        if sentinel.is_successful:
            owning_stream = self._stream_name_to_instance[completed.stream_name()]
            owning_stream.cursor.close_partition(completed)
    except Exception as error:
        self._flag_exception(completed.stream_name(), error)
        traced = AirbyteTracedException.from_exception(
            error, stream_descriptor=StreamDescriptor(name=completed.stream_name())
        )
        yield traced.as_sanitized_airbyte_message()
    finally:
        # Runs whether or not closing the partition succeeded, so bookkeeping is always updated
        still_running = self._streams_to_running_partitions[completed.stream_name()]
        if completed in still_running:
            still_running.remove(completed)
            # If all partitions were generated and this was the last one, the stream is done
            generation_is_over = (
                completed.stream_name() not in self._streams_currently_generating_partitions
            )
            if generation_is_over and len(still_running) == 0:
                yield from self._on_stream_is_done(completed.stream_name())
        yield from self._message_repository.consume_queue()
135
+
136
def on_record(self, record: Record) -> Iterable[AirbyteMessage]:
    """
    Handle a record read from a partition.
    1. Convert the record to an AirbyteMessage
    2. For RECORD messages: emit a RUNNING status if this is the first record of the stream,
       increment the record counter and let the stream's cursor observe the record
    3. Emit the message
    4. Emit messages that were added to the message repository
    """
    # No transformer or schema is passed on purpose: AbstractStreams are expected to return data
    # in its final form, so any transformation has to happen before reaching this point.
    airbyte_message = stream_data_to_airbyte_message(
        stream_name=record.stream_name,
        data_or_message=record.data,
        file_reference=record.file_reference,
    )
    stream = self._stream_name_to_instance[record.stream_name]

    if airbyte_message.type == MessageType.RECORD:
        is_first_record = self._record_counter[stream.name] == 0
        if is_first_record:
            self._logger.info(f"Marking stream {stream.name} as RUNNING")
            yield stream_status_as_airbyte_message(
                stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING
            )
        self._record_counter[stream.name] += 1
        stream.cursor.observe(record)

    yield airbyte_message
    yield from self._message_repository.consume_queue()
166
+
167
def on_exception(self, exception: StreamThreadException) -> Iterable[AirbyteMessage]:
    """
    Handle an exception reported for a stream.
    1. Record the underlying exception against the stream
    2. Log the exception
    3. Emit a trace message tagged with the stream descriptor

    This method does not raise the exception itself.
    """
    cause = exception.exception
    self._flag_exception(exception.stream_name, cause)
    self._logger.exception(
        f"Exception while syncing stream {exception.stream_name}", exc_info=cause
    )

    descriptor = StreamDescriptor(name=exception.stream_name)
    if not isinstance(cause, AirbyteTracedException):
        # The StreamThreadException wrapper (not the inner exception) is what gets converted
        wrapped = AirbyteTracedException.from_exception(exception, stream_descriptor=descriptor)
        yield wrapped.as_airbyte_message()
    else:
        yield cause.as_airbyte_message(stream_descriptor=descriptor)
185
+
186
+ def _flag_exception(self, stream_name: str, exception: Exception) -> None:
187
+ self._exceptions_per_stream_name.setdefault(stream_name, []).append(exception)
188
+
189
def start_next_partition_generator(self) -> Optional[AirbyteMessage]:
    """
    Start partition generation for the next waiting stream, if any.
    1. Pop the next stream to read from
    2. Submit the partition generator to the thread pool manager
    3. Add the stream to the list of streams currently generating partitions
    4. Return a STARTED stream status message

    Returns None when no stream is waiting to start.
    """
    waiting_streams = self._stream_instances_to_start_partition_generation
    if not waiting_streams:
        return None

    stream = waiting_streams.pop(0)
    self._thread_pool_manager.submit(self._partition_enqueuer.generate_partitions, stream)
    self._streams_currently_generating_partitions.append(stream.name)
    self._logger.info(f"Marking stream {stream.name} as STARTED")
    self._logger.info(f"Syncing stream: {stream.name} ")
    return stream_status_as_airbyte_message(
        stream.as_airbyte_stream(),
        AirbyteStreamStatus.STARTED,
    )
209
+
210
def is_done(self) -> bool:
    """
    Tell whether the sync is done, i.e. whether every known stream has been marked as done.

    :raises AirbyteTracedException: when the sync is done and at least one exception was recorded
        for a stream
    """
    stream_states = [self._is_stream_done(name) for name in self._stream_name_to_instance]
    all_streams_done = all(stream_states)
    if not all_streams_done or not self._exceptions_per_stream_name:
        return all_streams_done

    error_message = generate_failed_streams_error_message(self._exceptions_per_stream_name)
    self._logger.info(error_message)
    # We still raise at least one exception when a stream raises an exception because the platform currently relies
    # on a non-zero exit code to determine if a sync attempt has failed. We also raise the exception as a config_error
    # type because this combined error isn't actionable, but rather the previously emitted individual errors.
    raise AirbyteTracedException(
        message=error_message,
        internal_message="Concurrent read failure",
        failure_type=FailureType.config_error,
    )
236
+
237
+ def _is_stream_done(self, stream_name: str) -> bool:
238
+ return stream_name in self._streams_done
239
+
240
def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]:
    """
    Close a stream once all of its partitions have been read.
    1. Log the number of records read
    2. Ask the stream's cursor to ensure at least one state was emitted and flush the message repository
    3. Mark the stream as done
    4. Emit the final stream status: INCOMPLETE when exceptions were recorded for the stream, COMPLETE otherwise
    """
    log = self._logger
    log.info(f"Read {self._record_counter[stream_name]} records from {stream_name} stream")
    log.info(f"Marking stream {stream_name} as STOPPED")

    stream = self._stream_name_to_instance[stream_name]
    stream.cursor.ensure_at_least_one_state_emitted()
    yield from self._message_repository.consume_queue()

    log.info(f"Finished syncing {stream.name}")
    self._streams_done.add(stream_name)

    if self._exceptions_per_stream_name.get(stream_name, []):
        final_status = AirbyteStreamStatus.INCOMPLETE
    else:
        final_status = AirbyteStreamStatus.COMPLETE
    yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), final_status)
@@ -0,0 +1,165 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+ import concurrent
5
+ import logging
6
+ from queue import Queue
7
+ from typing import Iterable, Iterator, List
8
+
9
+ from airbyte_cdk.models import AirbyteMessage
10
+ from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor
11
+ from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import (
12
+ PartitionGenerationCompletedSentinel,
13
+ )
14
+ from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException
15
+ from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
16
+ from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
17
+ from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
18
+ from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
19
+ from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
20
+ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
21
+ from airbyte_cdk.sources.streams.concurrent.partitions.types import (
22
+ PartitionCompleteSentinel,
23
+ QueueItem,
24
+ )
25
+ from airbyte_cdk.sources.types import Record
26
+ from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
27
+
28
+
29
class ConcurrentSource:
    """
    A Source that reads data from multiple AbstractStreams concurrently.
    It does so by submitting partition generation, and partition read tasks to a thread pool.
    The tasks asynchronously add their output to a shared queue.
    The read is done when all partitions for all streams were generated and read.
    """

    DEFAULT_TIMEOUT_SECONDS = 900

    @staticmethod
    def create(
        num_workers: int,
        initial_number_of_partitions_to_generate: int,
        logger: logging.Logger,
        slice_logger: SliceLogger,
        message_repository: MessageRepository,
        timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
    ) -> "ConcurrentSource":
        """
        Build a ConcurrentSource backed by a new thread pool of `num_workers` threads.

        :param num_workers: The number of worker threads in the pool
        :param initial_number_of_partitions_to_generate: The initial number of concurrent partition generation tasks. Unless both values are 1, it must be strictly lower than num_workers
        :param logger: The logger to log to
        :param slice_logger: The slice logger used to create messages on new slices
        :param message_repository: The repository to emit messages to
        :param timeout_seconds: Forwarded as-is to the ConcurrentSource constructor
        :raises AssertionError: when there are not more workers than threads generating partitions
        """
        # `import concurrent` alone does not load the `concurrent.futures` submodule; importing the
        # executor explicitly avoids depending on another module having imported it first.
        from concurrent.futures import ThreadPoolExecutor

        is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1
        too_many_generator = (
            not is_single_threaded and initial_number_of_partitions_to_generate >= num_workers
        )
        assert (
            not too_many_generator
        ), "It is required to have more workers than threads generating partitions"
        threadpool = ThreadPoolManager(
            ThreadPoolExecutor(max_workers=num_workers, thread_name_prefix="workerpool"),
            logger,
        )
        return ConcurrentSource(
            threadpool,
            logger,
            slice_logger,
            message_repository,
            initial_number_of_partitions_to_generate,
            timeout_seconds,
        )

    def __init__(
        self,
        threadpool: ThreadPoolManager,
        logger: logging.Logger,
        slice_logger: "SliceLogger | None" = None,
        message_repository: "MessageRepository | None" = None,
        initial_number_partitions_to_generate: int = 1,
        timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
    ) -> None:
        """
        :param threadpool: The threadpool to submit tasks to
        :param logger: The logger to log to
        :param slice_logger: The slice logger used to create messages on new slices. Defaults to a new DebugSliceLogger
        :param message_repository: The repository to emit messages to. Defaults to a new InMemoryMessageRepository
        :param initial_number_partitions_to_generate: The initial number of concurrent partition generation tasks. Limiting this number will limit the latency of the first records emitted. While the latency is not critical, emitting the records early allows the platform and the destination to process them as early as possible.
        :param timeout_seconds: The maximum number of seconds to wait for a record to be read from the queue. NOTE: the value is stored on the instance, but the consumption loop of this class calls `queue.get()` without a timeout.
        """
        self._threadpool = threadpool
        self._logger = logger
        # Defaults are created per instance: objects built in the signature would be evaluated once
        # and shared by every ConcurrentSource relying on them.
        self._slice_logger = slice_logger if slice_logger is not None else DebugSliceLogger()
        self._message_repository = (
            message_repository if message_repository is not None else InMemoryMessageRepository()
        )
        self._initial_number_partitions_to_generate = initial_number_partitions_to_generate
        self._timeout_seconds = timeout_seconds

    def read(
        self,
        streams: List[AbstractStream],
    ) -> Iterator[AirbyteMessage]:
        """
        Read the given streams concurrently and yield the resulting messages.

        :param streams: The streams to read
        :return: The messages produced while generating and reading the partitions of the streams
        """
        self._logger.info("Starting syncing")

        # We set a maxsize to force the main thread to process record items when the queue size grows. This assumes that there are
        # fewer threads generating partitions than the max number of workers. If it weren't the case, we could have threads only
        # generating partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be
        # changed given more information and might even need to be configurable depending on the source
        queue: Queue[QueueItem] = Queue(maxsize=10_000)
        concurrent_stream_processor = ConcurrentReadProcessor(
            streams,
            PartitionEnqueuer(queue, self._threadpool),
            self._threadpool,
            self._logger,
            self._slice_logger,
            self._message_repository,
            PartitionReader(queue),
        )

        # Enqueue initial partition generation tasks
        yield from self._submit_initial_partition_generators(concurrent_stream_processor)

        # Read from the queue until all partitions were generated and read
        yield from self._consume_from_queue(
            queue,
            concurrent_stream_processor,
        )
        self._threadpool.check_for_errors_and_shutdown()
        self._logger.info("Finished syncing")

    def _submit_initial_partition_generators(
        self, concurrent_stream_processor: ConcurrentReadProcessor
    ) -> Iterable[AirbyteMessage]:
        """
        Start up to `initial_number_partitions_to_generate` partition generators and yield the
        status message of each one that was actually started.
        """
        for _ in range(self._initial_number_partitions_to_generate):
            status_message = concurrent_stream_processor.start_next_partition_generator()
            if status_message:
                yield status_message

    def _consume_from_queue(
        self,
        queue: Queue[QueueItem],
        concurrent_stream_processor: ConcurrentReadProcessor,
    ) -> Iterable[AirbyteMessage]:
        """
        Take items from the queue and dispatch them until the processor reports that the sync is
        done and the queue is empty.
        """
        # `queue.get()` blocks until an item is available.
        # NOTE(review): the loop also stops if a queue item evaluates as falsy — confirm that no
        # QueueItem type can be falsy (e.g. an empty mapping-like record).
        while airbyte_message_or_record_or_exception := queue.get():
            yield from self._handle_item(
                airbyte_message_or_record_or_exception,
                concurrent_stream_processor,
            )
            if concurrent_stream_processor.is_done() and queue.empty():
                # all partitions were generated and processed. we're done here
                break

    def _handle_item(
        self,
        queue_item: QueueItem,
        concurrent_stream_processor: ConcurrentReadProcessor,
    ) -> Iterable[AirbyteMessage]:
        """
        Dispatch a queue item to the processor handler matching its type.

        :raises ValueError: when the type of the queue item is not supported
        """
        # handle queue item and call the appropriate handler depending on the type of the queue item
        if isinstance(queue_item, StreamThreadException):
            yield from concurrent_stream_processor.on_exception(queue_item)
        elif isinstance(queue_item, PartitionGenerationCompletedSentinel):
            yield from concurrent_stream_processor.on_partition_generation_completed(queue_item)
        elif isinstance(queue_item, Partition):
            # on_partition returns nothing: there are no messages to forward for this item
            concurrent_stream_processor.on_partition(queue_item)
        elif isinstance(queue_item, PartitionCompleteSentinel):
            yield from concurrent_stream_processor.on_partition_complete_sentinel(queue_item)
        elif isinstance(queue_item, Record):
            yield from concurrent_stream_processor.on_record(queue_item)
        else:
            raise ValueError(f"Unknown queue item type: {type(queue_item)}")
@@ -0,0 +1,147 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ from abc import ABC
7
+ from datetime import timedelta
8
+ from typing import Any, Callable, Iterator, List, Mapping, MutableMapping, Optional, Tuple
9
+
10
+ from airbyte_cdk.models import AirbyteMessage, AirbyteStateMessage, ConfiguredAirbyteCatalog
11
+ from airbyte_cdk.sources import AbstractSource
12
+ from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
13
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
14
+ from airbyte_cdk.sources.streams import Stream
15
+ from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
16
+ from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
17
+ from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
18
+ from airbyte_cdk.sources.streams.concurrent.cursor import (
19
+ ConcurrentCursor,
20
+ Cursor,
21
+ CursorField,
22
+ CursorValueType,
23
+ FinalStateCursor,
24
+ GapType,
25
+ )
26
+ from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
27
+ AbstractStreamStateConverter,
28
+ )
29
+
30
+ DEFAULT_LOOKBACK_SECONDS = 0  # seconds; fallback used by initialize_cursor when no lookback_window is given
31
+
32
+
33
class ConcurrentSourceAdapter(AbstractSource, ABC):
    def __init__(self, concurrent_source: ConcurrentSource, **kwargs: Any) -> None:
        """
        ConcurrentSourceAdapter is a Source that wraps a concurrent source and exposes it as a regular source.

        The source's streams are still defined through the streams() method.
        Streams wrapped in a StreamFacade will be processed concurrently.
        Other streams will be processed sequentially as a later step.
        """
        self._concurrent_source = concurrent_source
        super().__init__(**kwargs)

    def read(
        self,
        logger: logging.Logger,
        config: Mapping[str, Any],
        catalog: ConfiguredAirbyteCatalog,
        state: Optional[List[AirbyteStateMessage]] = None,
    ) -> Iterator[AirbyteMessage]:
        """
        Read the configured streams: streams that can be processed concurrently go through the
        concurrent source first, then the remaining streams are read by the parent implementation.
        """
        concurrent_streams = self._select_abstract_streams(config, catalog)
        concurrent_names = {concurrent_stream.name for concurrent_stream in concurrent_streams}

        # Everything the concurrent source does not handle is left to the regular read path
        regular_entries = [
            entry for entry in catalog.streams if entry.stream.name not in concurrent_names
        ]
        configured_catalog_for_regular_streams = ConfiguredAirbyteCatalog(streams=regular_entries)

        if concurrent_streams:
            yield from self._concurrent_source.read(concurrent_streams)
        if configured_catalog_for_regular_streams.streams:
            yield from super().read(logger, config, configured_catalog_for_regular_streams, state)

    def _select_abstract_streams(
        self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog
    ) -> List[AbstractStream]:
        """
        Return the underlying abstract streams of the configured streams that are exposed through an
        AbstractStreamFacade. Configured streams unknown to the source are ignored.
        """
        instances_by_name: Mapping[str, Stream] = {
            instance.name: instance for instance in self.streams(config)
        }
        selected: List[AbstractStream] = []
        for configured_stream in configured_catalog.streams:
            candidate = instances_by_name.get(configured_stream.stream.name)
            if candidate and isinstance(candidate, AbstractStreamFacade):
                selected.append(candidate.get_underlying_stream())
        return selected

    def convert_to_concurrent_stream(
        self,
        logger: logging.Logger,
        stream: Stream,
        state_manager: ConnectorStateManager,
        cursor: Optional[Cursor] = None,
    ) -> Stream:
        """
        Wrap a stream in a StreamFacade so that it can be processed concurrently.

        When a cursor is given, it is assigned to the stream (and to its `parent` attribute, if
        any) and the stream state is fetched from the state manager. Otherwise a FinalStateCursor
        is created and the state is left empty.
        """
        state: MutableMapping[str, Any]
        if not cursor:
            state = {}
            cursor = FinalStateCursor(
                stream_name=stream.name,
                stream_namespace=stream.namespace,
                message_repository=self.message_repository,  # type: ignore[arg-type]  # _default_message_repository will be returned in the worst case
            )
        else:
            state = state_manager.get_stream_state(stream.name, stream.namespace)

            stream.cursor = cursor  # type: ignore[assignment]  # cursor is of type ConcurrentCursor, which inherits from Cursor
            if hasattr(stream, "parent"):
                stream.parent.cursor = cursor
        return StreamFacade.create_from_stream(stream, self, logger, state, cursor)

    def initialize_cursor(
        self,
        stream: Stream,
        state_manager: ConnectorStateManager,
        converter: AbstractStreamStateConverter,
        slice_boundary_fields: Optional[Tuple[str, str]],
        start: Optional[CursorValueType],
        end_provider: Callable[[], CursorValueType],
        lookback_window: Optional[GapType] = None,
        slice_range: Optional[GapType] = None,
    ) -> Optional[ConcurrentCursor]:
        """
        Build a ConcurrentCursor for a stream that declares a cursor field.

        :return: The cursor, or None when the stream has no cursor field
        :raises ValueError: when the cursor field of the stream is not a string
        """
        if not lookback_window:
            lookback_window = timedelta(seconds=DEFAULT_LOOKBACK_SECONDS)

        cursor_field_name = stream.cursor_field
        if not cursor_field_name:
            return None

        if not isinstance(cursor_field_name, str):
            raise ValueError(
                f"Cursor field type must be a string, but received {type(cursor_field_name).__name__}."
            )

        return ConcurrentCursor(
            stream.name,
            stream.namespace,
            state_manager.get_stream_state(stream.name, stream.namespace),
            self.message_repository,  # type: ignore[arg-type]  # _default_message_repository will be returned in the worst case
            state_manager,
            converter,
            CursorField(cursor_field_name),
            slice_boundary_fields,
            start,
            end_provider,
            lookback_window,
            slice_range,
        )
@@ -0,0 +1,24 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+ from typing import Any
5
+
6
+ from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
7
+
8
+
9
class PartitionGenerationCompletedSentinel:
    """
    Marker signalling that every partition of a stream has been produced.
    Keeps a reference to the stream that was processed.
    """

    def __init__(self, stream: AbstractStream):
        """
        :param stream: The stream that was processed
        """
        self.stream = stream

    def __eq__(self, other: Any) -> bool:
        # Two sentinels are equal when they point to equal streams; anything else never matches
        if not isinstance(other, PartitionGenerationCompletedSentinel):
            return False
        return self.stream == other.stream
@@ -0,0 +1,25 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ from typing import Any
4
+
5
+
6
class StreamThreadException(Exception):
    """
    Wraps an exception raised while syncing a stream together with the name of that stream.
    """

    def __init__(self, exception: Exception, stream_name: str):
        """
        :param exception: The exception that was raised
        :param stream_name: The name of the stream being synced when the exception was raised
        """
        # Forward the arguments to Exception so that `args` is populated: copy/pickle rebuild an
        # exception by calling its class with `args`, which fails when `args` is empty.
        super().__init__(exception, stream_name)
        self._exception = exception
        self._stream_name = stream_name

    @property
    def stream_name(self) -> str:
        return self._stream_name

    @property
    def exception(self) -> Exception:
        return self._exception

    def __str__(self) -> str:
        return f"Exception while syncing stream {self._stream_name}: {self._exception}"

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, StreamThreadException):
            return self._exception == other._exception and self._stream_name == other._stream_name
        return False

    def __hash__(self) -> int:
        # Defining __eq__ alone makes the class unhashable; hash the same fields __eq__ compares
        return hash((self._exception, self._stream_name))