airbyte-cdk 0.72.0__py3-none-any.whl → 6.13.1.dev4106__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (517) hide show
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +421 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1213 -88
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +65 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +25 -3
  79. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  80. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  81. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  82. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +159 -74
  83. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  84. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  85. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  86. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  91. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  92. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  93. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  94. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  95. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  96. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  97. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1329 -595
  98. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  99. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  100. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  101. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1699 -226
  102. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  103. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  104. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  105. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  106. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  107. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  108. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  109. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  122. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  123. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  124. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  125. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  126. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  127. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  134. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  135. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  136. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  140. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  141. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  142. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  143. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  144. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  145. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  146. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  147. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  148. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  149. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +228 -72
  150. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  151. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  152. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  153. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  154. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  155. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  156. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  157. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  158. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  159. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  160. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  163. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  164. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  165. airbyte_cdk/sources/declarative/types.py +19 -110
  166. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  167. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  168. airbyte_cdk/sources/embedded/catalog.py +16 -4
  169. airbyte_cdk/sources/embedded/runner.py +19 -3
  170. airbyte_cdk/sources/embedded/tools.py +5 -2
  171. airbyte_cdk/sources/file_based/README.md +152 -0
  172. airbyte_cdk/sources/file_based/__init__.py +24 -0
  173. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  174. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  175. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  176. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +58 -10
  177. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  178. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  179. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  180. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  181. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  182. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  184. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  185. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  186. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  187. airbyte_cdk/sources/file_based/exceptions.py +52 -15
  188. airbyte_cdk/sources/file_based/file_based_source.py +163 -33
  189. airbyte_cdk/sources/file_based/file_based_stream_reader.py +83 -5
  190. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  191. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  192. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  193. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  194. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  195. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  196. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  197. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  198. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +145 -41
  199. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  200. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  201. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  202. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  204. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  205. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  206. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  210. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  211. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  212. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +175 -45
  213. airbyte_cdk/sources/http_logger.py +8 -3
  214. airbyte_cdk/sources/message/__init__.py +7 -1
  215. airbyte_cdk/sources/message/repository.py +18 -4
  216. airbyte_cdk/sources/source.py +42 -38
  217. airbyte_cdk/sources/streams/__init__.py +2 -2
  218. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  219. airbyte_cdk/sources/streams/call_rate.py +64 -21
  220. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  221. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  222. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  223. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  224. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  225. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  226. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  227. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  228. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  229. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  230. airbyte_cdk/sources/streams/concurrent/cursor.py +298 -42
  231. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  232. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  233. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  234. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  236. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  237. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  238. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  239. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  240. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  241. airbyte_cdk/sources/streams/core.py +412 -87
  242. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  243. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  244. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  245. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  253. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  254. airbyte_cdk/sources/streams/http/http.py +369 -246
  255. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  256. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  257. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  259. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  260. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  261. airbyte_cdk/sources/types.py +154 -0
  262. airbyte_cdk/sources/utils/record_helper.py +36 -21
  263. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  264. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  265. airbyte_cdk/sources/utils/transform.py +54 -20
  266. airbyte_cdk/sql/_util/hashing.py +34 -0
  267. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  268. airbyte_cdk/sql/constants.py +32 -0
  269. airbyte_cdk/sql/exceptions.py +235 -0
  270. airbyte_cdk/sql/secrets.py +123 -0
  271. airbyte_cdk/sql/shared/__init__.py +15 -0
  272. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  273. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  274. airbyte_cdk/sql/types.py +160 -0
  275. airbyte_cdk/test/catalog_builder.py +70 -18
  276. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  277. airbyte_cdk/test/mock_http/__init__.py +1 -1
  278. airbyte_cdk/test/mock_http/matcher.py +6 -0
  279. airbyte_cdk/test/mock_http/mocker.py +57 -10
  280. airbyte_cdk/test/mock_http/request.py +19 -3
  281. airbyte_cdk/test/mock_http/response.py +3 -1
  282. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  283. airbyte_cdk/test/state_builder.py +18 -10
  284. airbyte_cdk/test/utils/__init__.py +1 -0
  285. airbyte_cdk/test/utils/data.py +24 -0
  286. airbyte_cdk/test/utils/http_mocking.py +16 -0
  287. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  288. airbyte_cdk/test/utils/reading.py +26 -0
  289. airbyte_cdk/utils/__init__.py +2 -1
  290. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  291. airbyte_cdk/utils/analytics_message.py +10 -2
  292. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  293. airbyte_cdk/utils/event_timing.py +10 -10
  294. airbyte_cdk/utils/mapping_helpers.py +3 -1
  295. airbyte_cdk/utils/message_utils.py +20 -11
  296. airbyte_cdk/utils/print_buffer.py +75 -0
  297. airbyte_cdk/utils/schema_inferrer.py +198 -28
  298. airbyte_cdk/utils/slice_hasher.py +30 -0
  299. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  300. airbyte_cdk/utils/stream_status_utils.py +8 -1
  301. airbyte_cdk/utils/traced_exception.py +61 -21
  302. airbyte_cdk-6.13.1.dev4106.dist-info/METADATA +109 -0
  303. airbyte_cdk-6.13.1.dev4106.dist-info/RECORD +349 -0
  304. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/WHEEL +1 -2
  305. airbyte_cdk-6.13.1.dev4106.dist-info/entry_points.txt +3 -0
  306. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  307. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  308. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  309. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  311. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  312. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  313. airbyte_cdk/sources/deprecated/client.py +0 -99
  314. airbyte_cdk/sources/singer/__init__.py +0 -8
  315. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  316. airbyte_cdk/sources/singer/source.py +0 -186
  317. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  318. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  319. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  320. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  321. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  322. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  323. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  324. airbyte_cdk/sources/utils/schema_models.py +0 -84
  325. airbyte_cdk-0.72.0.dist-info/METADATA +0 -243
  326. airbyte_cdk-0.72.0.dist-info/RECORD +0 -466
  327. airbyte_cdk-0.72.0.dist-info/top_level.txt +0 -3
  328. source_declarative_manifest/main.py +0 -29
  329. unit_tests/connector_builder/__init__.py +0 -3
  330. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  331. unit_tests/connector_builder/test_message_grouper.py +0 -713
  332. unit_tests/connector_builder/utils.py +0 -27
  333. unit_tests/destinations/test_destination.py +0 -243
  334. unit_tests/singer/test_singer_helpers.py +0 -56
  335. unit_tests/singer/test_singer_source.py +0 -112
  336. unit_tests/sources/__init__.py +0 -0
  337. unit_tests/sources/concurrent_source/__init__.py +0 -3
  338. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  339. unit_tests/sources/declarative/__init__.py +0 -3
  340. unit_tests/sources/declarative/auth/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  342. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  343. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  344. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  345. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  346. unit_tests/sources/declarative/checks/__init__.py +0 -3
  347. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  348. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  349. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  350. unit_tests/sources/declarative/external_component.py +0 -13
  351. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  352. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  353. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  355. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  356. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  357. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  359. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  360. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  361. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  362. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  363. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  364. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  365. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  366. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  367. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  368. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  369. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  370. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1841
  371. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  372. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  373. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  374. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  375. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  376. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  377. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  384. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  385. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  386. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  387. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  388. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  389. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  390. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  391. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  392. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  393. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  394. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  395. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  396. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  397. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  398. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  399. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  400. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  401. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  402. unit_tests/sources/declarative/schema/__init__.py +0 -6
  403. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  404. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  405. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  406. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  407. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  408. unit_tests/sources/declarative/states/__init__.py +0 -3
  409. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  411. unit_tests/sources/declarative/test_create_partial.py +0 -83
  412. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  413. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  414. unit_tests/sources/declarative/test_types.py +0 -39
  415. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  416. unit_tests/sources/file_based/__init__.py +0 -0
  417. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  419. unit_tests/sources/file_based/config/__init__.py +0 -0
  420. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  421. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  422. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  423. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  424. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  425. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  426. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  427. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  428. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  429. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  430. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  431. unit_tests/sources/file_based/helpers.py +0 -70
  432. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  433. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  434. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  435. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  436. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  437. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  438. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  439. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  440. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  441. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  442. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  443. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  444. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  445. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  446. unit_tests/sources/file_based/stream/__init__.py +0 -0
  447. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  449. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  450. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  451. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  452. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  453. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  454. unit_tests/sources/file_based/test_scenarios.py +0 -253
  455. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  456. unit_tests/sources/fixtures/__init__.py +0 -3
  457. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  458. unit_tests/sources/message/__init__.py +0 -0
  459. unit_tests/sources/message/test_repository.py +0 -153
  460. unit_tests/sources/streams/__init__.py +0 -0
  461. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  462. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  464. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  466. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  467. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  469. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  470. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  471. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  472. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  473. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  474. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  475. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  476. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  477. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  478. unit_tests/sources/streams/http/__init__.py +0 -0
  479. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  481. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  482. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  483. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  484. unit_tests/sources/streams/http/test_http.py +0 -635
  485. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  486. unit_tests/sources/streams/test_call_rate.py +0 -300
  487. unit_tests/sources/streams/test_stream_read.py +0 -405
  488. unit_tests/sources/streams/test_streams_core.py +0 -184
  489. unit_tests/sources/test_abstract_source.py +0 -1442
  490. unit_tests/sources/test_concurrent_source.py +0 -112
  491. unit_tests/sources/test_config.py +0 -92
  492. unit_tests/sources/test_connector_state_manager.py +0 -482
  493. unit_tests/sources/test_http_logger.py +0 -252
  494. unit_tests/sources/test_integration_source.py +0 -86
  495. unit_tests/sources/test_source.py +0 -684
  496. unit_tests/sources/test_source_read.py +0 -460
  497. unit_tests/test/__init__.py +0 -0
  498. unit_tests/test/mock_http/__init__.py +0 -0
  499. unit_tests/test/mock_http/test_matcher.py +0 -53
  500. unit_tests/test/mock_http/test_mocker.py +0 -214
  501. unit_tests/test/mock_http/test_request.py +0 -117
  502. unit_tests/test/mock_http/test_response_builder.py +0 -177
  503. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  504. unit_tests/utils/__init__.py +0 -0
  505. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  506. unit_tests/utils/test_mapping_helpers.py +0 -54
  507. unit_tests/utils/test_message_utils.py +0 -91
  508. unit_tests/utils/test_rate_limiting.py +0 -26
  509. unit_tests/utils/test_schema_inferrer.py +0 -202
  510. unit_tests/utils/test_secret_utils.py +0 -135
  511. unit_tests/utils/test_stream_status_utils.py +0 -61
  512. unit_tests/utils/test_traced_exception.py +0 -107
  513. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  514. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  515. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  516. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  517. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/LICENSE.txt +0 -0
@@ -0,0 +1,497 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ import logging
4
+ import threading
5
+ import time
6
+ import traceback
7
+ import uuid
8
+ from datetime import timedelta
9
+ from typing import (
10
+ Any,
11
+ Generator,
12
+ Generic,
13
+ Iterable,
14
+ List,
15
+ Mapping,
16
+ Optional,
17
+ Set,
18
+ Tuple,
19
+ Type,
20
+ TypeVar,
21
+ )
22
+
23
+ from airbyte_cdk.logger import lazy_log
24
+ from airbyte_cdk.models import FailureType
25
+ from airbyte_cdk.sources.declarative.async_job.job import AsyncJob
26
+ from airbyte_cdk.sources.declarative.async_job.job_tracker import (
27
+ ConcurrentJobLimitReached,
28
+ JobTracker,
29
+ )
30
+ from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository
31
+ from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus
32
+ from airbyte_cdk.sources.message import MessageRepository
33
+ from airbyte_cdk.sources.types import StreamSlice
34
+ from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
35
+ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
36
+
37
+ LOGGER = logging.getLogger("airbyte")
38
+ _NO_TIMEOUT = timedelta.max
39
+ _API_SIDE_RUNNING_STATUS = {AsyncJobStatus.RUNNING, AsyncJobStatus.TIMED_OUT}
40
+
41
+
42
class AsyncPartition:
    """
    Groups the API jobs created for a single stream slice and counts the attempts used by each job.

    In this iteration a partition is created with the jobs of one slice; the grouping is meant to
    become useful once jobs can be split.
    """

    _MAX_NUMBER_OF_ATTEMPTS = 3

    def __init__(self, jobs: List[AsyncJob], stream_slice: StreamSlice) -> None:
        # every job provided at construction time is on its first attempt
        self._attempts_per_job = dict.fromkeys(jobs, 1)
        self._stream_slice = stream_slice

    def has_reached_max_attempt(self) -> bool:
        """Return True when at least one tracked job has used up all of its attempts."""
        return any(
            attempts >= self._MAX_NUMBER_OF_ATTEMPTS
            for attempts in self._attempts_per_job.values()
        )

    def replace_job(self, job_to_replace: AsyncJob, new_jobs: List[AsyncJob]) -> None:
        """
        Swap `job_to_replace` for `new_jobs`, each of them inheriting the attempt count plus one.

        Raises:
            ValueError: if the job is not tracked by this partition or if it already reached the max number of attempts.
        """
        # the job is removed from the tracking before validating the attempt count
        attempts_so_far = self._attempts_per_job.pop(job_to_replace, None)
        if attempts_so_far is None:
            raise ValueError("Could not find job to replace")
        if attempts_so_far >= self._MAX_NUMBER_OF_ATTEMPTS:
            raise ValueError(f"Max attempt reached for job in partition {self._stream_slice}")

        self._attempts_per_job.update(dict.fromkeys(new_jobs, attempts_so_far + 1))

    def should_split(self, job: AsyncJob) -> bool:
        """
        Placeholder: always False for now. Once job split is supported, the decision should rely on the number of attempts.
        """
        return False

    @property
    def jobs(self) -> Iterable[AsyncJob]:
        """Jobs currently tracked by this partition."""
        return self._attempts_per_job.keys()

    @property
    def stream_slice(self) -> StreamSlice:
        """Stream slice this partition was created for."""
        return self._stream_slice

    @property
    def status(self) -> AsyncJobStatus:
        """
        Aggregate the job statuses: COMPLETED only if every job is completed, otherwise FAILED takes precedence over TIMED_OUT
        which takes precedence over RUNNING.
        """
        observed = {job.status() for job in self.jobs}
        if observed == {AsyncJobStatus.COMPLETED}:
            return AsyncJobStatus.COMPLETED

        for error_status in (AsyncJobStatus.FAILED, AsyncJobStatus.TIMED_OUT):
            if error_status in observed:
                return error_status
        return AsyncJobStatus.RUNNING

    def __repr__(self) -> str:
        return f"AsyncPartition(stream_slice={self._stream_slice}, attempt_per_job={self._attempts_per_job})"

    def __json_serializable__(self) -> Any:
        # only the stream slice is exposed for serialization
        return self._stream_slice
106
+
107
+
108
T = TypeVar("T")


class LookaheadIterator(Generic[T]):
    """
    Iterator wrapper that can peek whether another item is available and accept items pushed back at the front.

    Items are always returned front-first: anything added with `add_at_the_beginning` is returned before
    an item that was previously buffered by `has_next`.
    """

    def __init__(self, iterable: Iterable[T]) -> None:
        self._iterator = iter(iterable)
        # items waiting to be returned before the underlying iterator is consumed again; index 0 is the next item
        self._buffer: List[T] = []

    def __iter__(self) -> "LookaheadIterator[T]":
        return self

    def __next__(self) -> T:
        """
        Return the next item, draining buffered items first.

        Raises:
            StopIteration: when both the buffer and the underlying iterator are exhausted.
        """
        if self._buffer:
            # pop from the front so that `add_at_the_beginning` really takes priority over a peeked item
            return self._buffer.pop(0)
        return next(self._iterator)

    def has_next(self) -> bool:
        """Return True if another item is available. May pull one item from the underlying iterator into the buffer."""
        if self._buffer:
            return True

        try:
            self._buffer = [next(self._iterator)]
        except StopIteration:
            return False
        return True

    def add_at_the_beginning(self, item: T) -> None:
        """Push `item` back so that it is the very next item returned."""
        self._buffer = [item] + self._buffer
138
+
139
+
140
+ class AsyncJobOrchestrator:
141
+ _WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS = 5
142
+ _KNOWN_JOB_STATUSES = {
143
+ AsyncJobStatus.COMPLETED,
144
+ AsyncJobStatus.FAILED,
145
+ AsyncJobStatus.RUNNING,
146
+ AsyncJobStatus.TIMED_OUT,
147
+ }
148
+ _RUNNING_ON_API_SIDE_STATUS = {AsyncJobStatus.RUNNING, AsyncJobStatus.TIMED_OUT}
149
+
150
+ def __init__(
151
+ self,
152
+ job_repository: AsyncJobRepository,
153
+ slices: Iterable[StreamSlice],
154
+ job_tracker: JobTracker,
155
+ message_repository: MessageRepository,
156
+ exceptions_to_break_on: Iterable[Type[Exception]] = tuple(),
157
+ has_bulk_parent: bool = False,
158
+ ) -> None:
159
+ """
160
+ If the stream slices provided as a parameters relies on a async job streams that relies on the same JobTracker, `has_bulk_parent`
161
+ needs to be set to True as jobs creation needs to be prioritized on the parent level. Doing otherwise could lead to a situation
162
+ where the child has taken up all the job budget without room to the parent to create more which would lead to an infinite loop of
163
+ "trying to start a parent job" and "ConcurrentJobLimitReached".
164
+ """
165
+ if {*AsyncJobStatus} != self._KNOWN_JOB_STATUSES:
166
+ # this is to prevent developers updating the possible statuses without updating the logic of this class
167
+ raise ValueError(
168
+ "An AsyncJobStatus has been either removed or added which means the logic of this class needs to be reviewed. Once the logic has been updated, please update _KNOWN_JOB_STATUSES"
169
+ )
170
+
171
+ self._job_repository: AsyncJobRepository = job_repository
172
+ self._slice_iterator = LookaheadIterator(slices)
173
+ self._running_partitions: List[AsyncPartition] = []
174
+ self._job_tracker = job_tracker
175
+ self._message_repository = message_repository
176
+ self._exceptions_to_break_on: Tuple[Type[Exception], ...] = tuple(exceptions_to_break_on)
177
+ self._has_bulk_parent = has_bulk_parent
178
+
179
+ self._non_breaking_exceptions: List[Exception] = []
180
+
181
+ def _replace_failed_jobs(self, partition: AsyncPartition) -> None:
182
+ failed_status_jobs = (AsyncJobStatus.FAILED, AsyncJobStatus.TIMED_OUT)
183
+ jobs_to_replace = [job for job in partition.jobs if job.status() in failed_status_jobs]
184
+ for job in jobs_to_replace:
185
+ new_job = self._start_job(job.job_parameters(), job.api_job_id())
186
+ partition.replace_job(job, [new_job])
187
+
188
+ def _start_jobs(self) -> None:
189
+ """
190
+ Retry failed jobs and start jobs for each slice in the slice iterator.
191
+ This method iterates over the running jobs and slice iterator and starts a job for each slice.
192
+ The started jobs are added to the running partitions.
193
+ Returns:
194
+ None
195
+
196
+ However, the first iteration is for sendgrid which only has one job.
197
+ """
198
+ at_least_one_slice_consumed_from_slice_iterator_during_current_iteration = False
199
+ _slice = None
200
+ try:
201
+ for partition in self._running_partitions:
202
+ self._replace_failed_jobs(partition)
203
+
204
+ if (
205
+ self._has_bulk_parent
206
+ and self._running_partitions
207
+ and self._slice_iterator.has_next()
208
+ ):
209
+ LOGGER.debug(
210
+ "This AsyncJobOrchestrator is operating as a child of a bulk stream hence we limit the number of concurrent jobs on the child until there are no more parent slices to avoid the child taking all the API job budget"
211
+ )
212
+ return
213
+
214
+ for _slice in self._slice_iterator:
215
+ at_least_one_slice_consumed_from_slice_iterator_during_current_iteration = True
216
+ job = self._start_job(_slice)
217
+ self._running_partitions.append(AsyncPartition([job], _slice))
218
+ if self._has_bulk_parent and self._slice_iterator.has_next():
219
+ break
220
+ except ConcurrentJobLimitReached:
221
+ if at_least_one_slice_consumed_from_slice_iterator_during_current_iteration:
222
+ # this means a slice has been consumed but the job couldn't be create therefore we need to put it back at the beginning of the _slice_iterator
223
+ self._slice_iterator.add_at_the_beginning(_slice) # type: ignore # we know it's not None here because `ConcurrentJobLimitReached` happens during the for loop
224
+ LOGGER.debug(
225
+ "Waiting before creating more jobs as the limit of concurrent jobs has been reached. Will try again later..."
226
+ )
227
+
228
+ def _start_job(self, _slice: StreamSlice, previous_job_id: Optional[str] = None) -> AsyncJob:
229
+ if previous_job_id:
230
+ id_to_replace = previous_job_id
231
+ lazy_log(LOGGER, logging.DEBUG, lambda: f"Attempting to replace job {id_to_replace}...")
232
+ else:
233
+ id_to_replace = self._job_tracker.try_to_get_intent()
234
+
235
+ try:
236
+ job = self._job_repository.start(_slice)
237
+ self._job_tracker.add_job(id_to_replace, job.api_job_id())
238
+ return job
239
+ except Exception as exception:
240
+ LOGGER.warning(f"Exception has occurred during job creation: {exception}")
241
+ if self._is_breaking_exception(exception):
242
+ self._job_tracker.remove_job(id_to_replace)
243
+ raise exception
244
+ return self._keep_api_budget_with_failed_job(_slice, exception, id_to_replace)
245
+
246
+ def _keep_api_budget_with_failed_job(
247
+ self, _slice: StreamSlice, exception: Exception, intent: str
248
+ ) -> AsyncJob:
249
+ """
250
+ We have a mechanism to retry job. It is used when a job status is FAILED or TIMED_OUT. The easiest way to retry is to have this job
251
+ as created in a failed state and leverage the retry for failed/timed out jobs. This way, we don't have to have another process for
252
+ retrying jobs that couldn't be started.
253
+ """
254
+ LOGGER.warning(
255
+ f"Could not start job for slice {_slice}. Job will be flagged as failed and retried if max number of attempts not reached: {exception}"
256
+ )
257
+ traced_exception = (
258
+ exception
259
+ if isinstance(exception, AirbyteTracedException)
260
+ else AirbyteTracedException.from_exception(exception)
261
+ )
262
+ # Even though we're not sure this will break the stream, we will emit here for simplicity's sake. If we wanted to be more accurate,
263
+ # we would keep the exceptions in-memory until we know that we have reached the max attempt.
264
+ self._message_repository.emit_message(traced_exception.as_airbyte_message())
265
+ job = self._create_failed_job(_slice)
266
+ self._job_tracker.add_job(intent, job.api_job_id())
267
+ return job
268
+
269
+ def _create_failed_job(self, stream_slice: StreamSlice) -> AsyncJob:
270
+ job = AsyncJob(f"{uuid.uuid4()} - Job that could not start", stream_slice, _NO_TIMEOUT)
271
+ job.update_status(AsyncJobStatus.FAILED)
272
+ return job
273
+
274
+ def _get_running_jobs(self) -> Set[AsyncJob]:
275
+ """
276
+ Returns a set of running AsyncJob objects.
277
+
278
+ Returns:
279
+ Set[AsyncJob]: A set of AsyncJob objects that are currently running.
280
+ """
281
+ return {
282
+ job
283
+ for partition in self._running_partitions
284
+ for job in partition.jobs
285
+ if job.status() == AsyncJobStatus.RUNNING
286
+ }
287
+
288
+ def _update_jobs_status(self) -> None:
289
+ """
290
+ Update the status of all running jobs in the repository.
291
+ """
292
+ running_jobs = self._get_running_jobs()
293
+ if running_jobs:
294
+ # update the status only if there are RUNNING jobs
295
+ self._job_repository.update_jobs_status(running_jobs)
296
+
297
+ def _wait_on_status_update(self) -> None:
298
+ """
299
+ Waits for a specified amount of time between status updates.
300
+
301
+
302
+ This method is used to introduce a delay between status updates in order to avoid excessive polling.
303
+ The duration of the delay is determined by the value of `_WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS`.
304
+
305
+ Returns:
306
+ None
307
+ """
308
+ lazy_log(
309
+ LOGGER,
310
+ logging.DEBUG,
311
+ lambda: f"Polling status in progress. There are currently {len(self._running_partitions)} running partitions.",
312
+ )
313
+
314
+ lazy_log(
315
+ LOGGER,
316
+ logging.DEBUG,
317
+ lambda: f"Waiting for {self._WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS} seconds before next poll...",
318
+ )
319
+ time.sleep(self._WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS)
320
+
321
+ def _process_completed_partition(self, partition: AsyncPartition) -> None:
322
+ """
323
+ Process a completed partition.
324
+ Args:
325
+ partition (AsyncPartition): The completed partition to process.
326
+ """
327
+ job_ids = list(map(lambda job: job.api_job_id(), {job for job in partition.jobs}))
328
+ LOGGER.info(
329
+ f"The following jobs for stream slice {partition.stream_slice} have been completed: {job_ids}."
330
+ )
331
+
332
+ # It is important to remove the jobs from the job tracker before yielding the partition as the caller might try to schedule jobs
333
+ # but won't be able to as all jobs slots are taken even though job is done.
334
+ for job in partition.jobs:
335
+ self._job_tracker.remove_job(job.api_job_id())
336
+
337
+ def _process_running_partitions_and_yield_completed_ones(
338
+ self,
339
+ ) -> Generator[AsyncPartition, Any, None]:
340
+ """
341
+ Process the running partitions.
342
+
343
+ Yields:
344
+ AsyncPartition: The processed partition.
345
+
346
+ Raises:
347
+ Any: Any exception raised during processing.
348
+ """
349
+ current_running_partitions: List[AsyncPartition] = []
350
+ for partition in self._running_partitions:
351
+ match partition.status:
352
+ case AsyncJobStatus.COMPLETED:
353
+ self._process_completed_partition(partition)
354
+ yield partition
355
+ case AsyncJobStatus.RUNNING:
356
+ current_running_partitions.append(partition)
357
+ case _ if partition.has_reached_max_attempt():
358
+ self._stop_partition(partition)
359
+ self._process_partitions_with_errors(partition)
360
+ case _:
361
+ self._stop_timed_out_jobs(partition)
362
+
363
+ # job will be restarted in `_start_job`
364
+ current_running_partitions.insert(0, partition)
365
+
366
+ for job in partition.jobs:
367
+ # We only remove completed jobs as we want failed/timed out jobs to be re-allocated in priority
368
+ if job.status() == AsyncJobStatus.COMPLETED:
369
+ self._job_tracker.remove_job(job.api_job_id())
370
+
371
+ # update the referenced list with running partitions
372
+ self._running_partitions = current_running_partitions
373
+
374
+ def _stop_partition(self, partition: AsyncPartition) -> None:
375
+ for job in partition.jobs:
376
+ if job.status() in _API_SIDE_RUNNING_STATUS:
377
+ self._abort_job(job, free_job_allocation=True)
378
+ else:
379
+ self._job_tracker.remove_job(job.api_job_id())
380
+
381
+ def _stop_timed_out_jobs(self, partition: AsyncPartition) -> None:
382
+ for job in partition.jobs:
383
+ if job.status() == AsyncJobStatus.TIMED_OUT:
384
+ # we don't free allocation here because it is expected to retry the job
385
+ self._abort_job(job, free_job_allocation=False)
386
+
387
+ def _abort_job(self, job: AsyncJob, free_job_allocation: bool = True) -> None:
388
+ try:
389
+ self._job_repository.abort(job)
390
+ if free_job_allocation:
391
+ self._job_tracker.remove_job(job.api_job_id())
392
+ except Exception as exception:
393
+ LOGGER.warning(f"Could not free budget for job {job.api_job_id()}: {exception}")
394
+
395
+ def _process_partitions_with_errors(self, partition: AsyncPartition) -> None:
396
+ """
397
+ Process a partition with status errors (FAILED and TIMEOUT).
398
+
399
+ Args:
400
+ partition (AsyncPartition): The partition to process.
401
+ Returns:
402
+ AirbyteTracedException: An exception indicating that at least one job could not be completed.
403
+ Raises:
404
+ AirbyteTracedException: If at least one job could not be completed.
405
+ """
406
+ status_by_job_id = {job.api_job_id(): job.status() for job in partition.jobs}
407
+ self._non_breaking_exceptions.append(
408
+ AirbyteTracedException(
409
+ internal_message=f"At least one job could not be completed for slice {partition.stream_slice}. Job statuses were: {status_by_job_id}. See warning logs for more information.",
410
+ failure_type=FailureType.config_error,
411
+ )
412
+ )
413
+
414
+ def create_and_get_completed_partitions(self) -> Iterable[AsyncPartition]:
415
+ """
416
+ Creates and retrieves completed partitions.
417
+ This method continuously starts jobs, updates job status, processes running partitions,
418
+ logs polling partitions, and waits for status updates. It yields completed partitions
419
+ as they become available.
420
+
421
+ Returns:
422
+ An iterable of completed partitions, represented as AsyncPartition objects.
423
+ Each partition is wrapped in an Optional, allowing for None values.
424
+ """
425
+ while True:
426
+ try:
427
+ lazy_log(
428
+ LOGGER,
429
+ logging.DEBUG,
430
+ lambda: f"JobOrchestrator loop - (Thread {threading.get_native_id()}, AsyncJobOrchestrator {self}) is starting the async job loop",
431
+ )
432
+ self._start_jobs()
433
+ if not self._slice_iterator.has_next() and not self._running_partitions:
434
+ break
435
+
436
+ self._update_jobs_status()
437
+ yield from self._process_running_partitions_and_yield_completed_ones()
438
+ self._wait_on_status_update()
439
+ except Exception as exception:
440
+ if self._is_breaking_exception(exception):
441
+ LOGGER.warning(
442
+ f"Caught exception that stops the processing of the jobs: {exception}"
443
+ )
444
+ self._abort_all_running_jobs()
445
+ raise exception
446
+
447
+ self._non_breaking_exceptions.append(exception)
448
+
449
+ LOGGER.info(
450
+ f"JobOrchestrator loop - Thread (Thread {threading.get_native_id()}, AsyncJobOrchestrator {self}) completed! Errors during creation were {self._non_breaking_exceptions}"
451
+ )
452
+ if self._non_breaking_exceptions:
453
+ # We emitted traced message but we didn't break on non_breaking_exception. We still need to raise an exception so that the
454
+ # call of `create_and_get_completed_partitions` knows that there was an issue with some partitions and the sync is incomplete.
455
+ raise AirbyteTracedException(
456
+ message="",
457
+ internal_message="\n".join(
458
+ [
459
+ filter_secrets(exception.__repr__())
460
+ for exception in self._non_breaking_exceptions
461
+ ]
462
+ ),
463
+ failure_type=FailureType.config_error,
464
+ )
465
+
466
+ def _handle_non_breaking_error(self, exception: Exception) -> None:
467
+ LOGGER.error(f"Failed to start the Job: {exception}, traceback: {traceback.format_exc()}")
468
+ self._non_breaking_exceptions.append(exception)
469
+
470
+ def _abort_all_running_jobs(self) -> None:
471
+ for partition in self._running_partitions:
472
+ for job in partition.jobs:
473
+ if job.status() in self._RUNNING_ON_API_SIDE_STATUS:
474
+ self._abort_job(job, free_job_allocation=True)
475
+ self._job_tracker.remove_job(job.api_job_id())
476
+
477
+ self._running_partitions = []
478
+
479
+ def _is_breaking_exception(self, exception: Exception) -> bool:
480
+ return isinstance(exception, self._exceptions_to_break_on) or (
481
+ isinstance(exception, AirbyteTracedException)
482
+ and exception.failure_type == FailureType.config_error
483
+ )
484
+
485
+ def fetch_records(self, partition: AsyncPartition) -> Iterable[Mapping[str, Any]]:
486
+ """
487
+ Fetches records from the given partition's jobs.
488
+
489
+ Args:
490
+ partition (AsyncPartition): The partition containing the jobs.
491
+
492
+ Yields:
493
+ Iterable[Mapping[str, Any]]: The fetched records from the jobs.
494
+ """
495
+ for job in partition.jobs:
496
+ yield from self._job_repository.fetch_records(job)
497
+ self._job_repository.delete(job)
@@ -0,0 +1,75 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ import logging
4
+ import threading
5
+ import uuid
6
+ from typing import Set
7
+
8
+ from airbyte_cdk.logger import lazy_log
9
+
10
+ LOGGER = logging.getLogger("airbyte")
11
+
12
+
13
class ConcurrentJobLimitReached(Exception):
    """Raised when a job can't be allocated because the limit of concurrent jobs is already reached."""
15
+
16
+
17
class JobTracker:
    """
    Thread-safe registry of the jobs (or intents to create a job) that count toward the limit of concurrent jobs.
    """

    def __init__(self, limit: int):
        # contains both intents (placeholders acquired before a job exists) and the IDs of the jobs
        self._jobs: Set[str] = set()
        self._limit = limit
        self._lock = threading.Lock()

    def try_to_get_intent(self) -> str:
        """
        Reserve a slot and return the intent that identifies it.

        Raises:
            ConcurrentJobLimitReached: if the limit of concurrent jobs is already reached.
        """
        lazy_log(
            LOGGER,
            logging.DEBUG,
            lambda: f"JobTracker - Trying to acquire lock by thread {threading.get_native_id()}...",
        )
        with self._lock:
            if self._has_reached_limit():
                raise ConcurrentJobLimitReached(
                    "Can't allocate more jobs right now: limit already reached"
                )
            intent = f"intent_{str(uuid.uuid4())}"
            lazy_log(
                LOGGER,
                logging.DEBUG,
                lambda: f"JobTracker - Thread {threading.get_native_id()} has acquired {intent}!",
            )
            self._jobs.add(intent)
            return intent

    def add_job(self, intent_or_job_id: str, job_id: str) -> None:
        """
        Replace a known intent or job ID by `job_id` without changing the number of allocated slots.

        Raises:
            ValueError: if `intent_or_job_id` is not currently tracked.
        """
        # the membership check and the swap are done under the same lock so that another thread can't remove or replace
        # `intent_or_job_id` in between
        with self._lock:
            if intent_or_job_id not in self._jobs:
                raise ValueError(
                    f"Can't add job: Unknown intent or job id, known values are {self._jobs}"
                )

            if intent_or_job_id == job_id:
                # Nothing to do here as the ID to replace is the same
                return

            lazy_log(
                LOGGER,
                logging.DEBUG,
                lambda: f"JobTracker - Thread {threading.get_native_id()} replacing job {intent_or_job_id} by {job_id}!",
            )
            self._jobs.add(job_id)
            self._jobs.remove(intent_or_job_id)

    def remove_job(self, job_id: str) -> None:
        """
        If the job is not allocated as a running job, this method does nothing and it won't raise.
        """
        lazy_log(
            LOGGER,
            logging.DEBUG,
            lambda: f"JobTracker - Thread {threading.get_native_id()} removing job {job_id}",
        )
        with self._lock:
            self._jobs.discard(job_id)

    def _has_reached_limit(self) -> bool:
        # expected to be called while holding the lock
        return len(self._jobs) >= self._limit
@@ -0,0 +1,35 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ from abc import abstractmethod
4
+ from typing import Any, Iterable, Mapping, Set
5
+
6
+ from airbyte_cdk.sources.declarative.async_job.job import AsyncJob
7
+ from airbyte_cdk.sources.types import StreamSlice
8
+
9
+
10
class AsyncJobRepository:
    """
    Interface used to manage async jobs: starting them, refreshing their status, fetching their records, aborting and
    deleting them.
    """

    @abstractmethod
    def start(self, stream_slice: StreamSlice) -> AsyncJob:
        """Start a job for the given stream slice and return it."""

    @abstractmethod
    def update_jobs_status(self, jobs: Set[AsyncJob]) -> None:
        """Update the status of the given jobs."""

    @abstractmethod
    def fetch_records(self, job: AsyncJob) -> Iterable[Mapping[str, Any]]:
        """Return the records of the given job."""

    @abstractmethod
    def abort(self, job: AsyncJob) -> None:
        """
        Called when we need to stop on the API side. This method can raise NotImplementedError as not all the APIs will support aborting
        jobs.
        """
        raise NotImplementedError(
            "Either the API or the AsyncJobRepository implementation do not support aborting jobs"
        )

    @abstractmethod
    def delete(self, job: AsyncJob) -> None:
        """Delete the given job."""
@@ -0,0 +1,24 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+
4
+ from enum import Enum
5
+
6
# Readability helper for the enum below: `_TERMINAL` / `not _TERMINAL` is used as the second member of each value.
_TERMINAL = True


class AsyncJobStatus(Enum):
    """Status of an async job. Each member is defined as a (name, is_terminal) pair."""

    RUNNING = ("RUNNING", not _TERMINAL)
    COMPLETED = ("COMPLETED", _TERMINAL)
    FAILED = ("FAILED", _TERMINAL)
    TIMED_OUT = ("TIMED_OUT", _TERMINAL)

    def __init__(self, value: str, is_terminal: bool) -> None:
        # the tuple defining the member is unpacked into these two attributes
        self._value = value
        self._is_terminal = is_terminal

    def is_terminal(self) -> bool:
        """
        A status is terminal when a job status can't be updated anymore. For example if a job is completed, it will stay completed but a
        running job might become completed, failed or timed out.
        """
        return self._is_terminal
@@ -0,0 +1,39 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+ from datetime import datetime, timedelta, timezone
3
+ from typing import Optional
4
+
5
+
6
class Timer:
    """Measures the time elapsed since `start` (in UTC) and tells whether it exceeds the configured timeout."""

    def __init__(self, timeout: timedelta) -> None:
        self._timeout = timeout
        # both stay None until `start` / `stop` are called
        self._start_datetime: Optional[datetime] = None
        self._end_datetime: Optional[datetime] = None

    def start(self) -> None:
        """Start (or restart) the timer, discarding any previous stop time."""
        self._end_datetime = None
        self._start_datetime = self._now()

    def stop(self) -> None:
        """Freeze the elapsed time. Calling it again once stopped keeps the first stop time."""
        if self._end_datetime is not None:
            return
        self._end_datetime = self._now()

    def is_started(self) -> bool:
        """Return True once `start` has been called."""
        return self._start_datetime is not None

    @property
    def elapsed_time(self) -> Optional[timedelta]:
        """Time between the start and the stop (or now if the timer is not stopped); None if the timer was never started."""
        if self._start_datetime is None:
            return None

        reference = self._now() if self._end_datetime is None else self._end_datetime
        return reference - self._start_datetime

    def has_timed_out(self) -> bool:
        """Return True if the timer is started and the elapsed time is strictly greater than the timeout."""
        elapsed = self.elapsed_time
        if elapsed is None:
            # never started
            return False
        return elapsed > self._timeout

    @staticmethod
    def _now() -> datetime:
        return datetime.now(tz=timezone.utc)
@@ -2,8 +2,7 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
+ from airbyte_cdk.sources.declarative.auth.jwt import JwtAuthenticator
5
6
  from airbyte_cdk.sources.declarative.auth.oauth import DeclarativeOauth2Authenticator
6
7
 
7
- __all__ = [
8
- "DeclarativeOauth2Authenticator",
9
- ]
8
+ __all__ = ["DeclarativeOauth2Authenticator", "JwtAuthenticator"]