airbyte-cdk 0.72.0__py3-none-any.whl → 6.17.1.dev0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (518) hide show
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +490 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1213 -88
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +63 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +31 -3
  79. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +346 -0
  80. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  81. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  82. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  83. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +173 -74
  84. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  85. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  86. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  91. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  92. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  93. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  94. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  95. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  96. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  97. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  98. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1329 -595
  99. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  100. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  101. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  102. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1763 -226
  103. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  104. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  105. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  106. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  107. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  108. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  109. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  122. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  123. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  124. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  125. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  126. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  127. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  134. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  135. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  136. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  140. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  141. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  142. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  143. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  144. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  145. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  146. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  147. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  148. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  149. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  150. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +229 -73
  151. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  152. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  153. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  154. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  155. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  156. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  157. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  158. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  159. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  160. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  163. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  164. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  165. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  166. airbyte_cdk/sources/declarative/types.py +19 -110
  167. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  168. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  169. airbyte_cdk/sources/embedded/catalog.py +16 -4
  170. airbyte_cdk/sources/embedded/runner.py +19 -3
  171. airbyte_cdk/sources/embedded/tools.py +5 -2
  172. airbyte_cdk/sources/file_based/README.md +152 -0
  173. airbyte_cdk/sources/file_based/__init__.py +24 -0
  174. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  175. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  176. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  177. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +47 -10
  178. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  179. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  180. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  181. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  182. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  184. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  185. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  186. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  187. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  188. airbyte_cdk/sources/file_based/exceptions.py +18 -15
  189. airbyte_cdk/sources/file_based/file_based_source.py +140 -33
  190. airbyte_cdk/sources/file_based/file_based_stream_reader.py +69 -5
  191. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  192. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  193. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  194. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  195. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  196. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  197. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  198. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  199. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +141 -41
  200. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  201. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  202. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  204. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  205. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  206. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  210. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  211. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  212. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  213. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +147 -45
  214. airbyte_cdk/sources/http_logger.py +8 -3
  215. airbyte_cdk/sources/message/__init__.py +7 -1
  216. airbyte_cdk/sources/message/repository.py +18 -4
  217. airbyte_cdk/sources/source.py +42 -38
  218. airbyte_cdk/sources/streams/__init__.py +2 -2
  219. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  220. airbyte_cdk/sources/streams/call_rate.py +64 -21
  221. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  222. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  223. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  224. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  225. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  226. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  227. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  228. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  229. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  230. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  231. airbyte_cdk/sources/streams/concurrent/cursor.py +298 -42
  232. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  233. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  234. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  236. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  237. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  238. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  239. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  240. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  241. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  242. airbyte_cdk/sources/streams/core.py +412 -87
  243. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  244. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  245. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  253. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  254. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  255. airbyte_cdk/sources/streams/http/http.py +369 -246
  256. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  257. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  259. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  260. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  261. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  262. airbyte_cdk/sources/types.py +154 -0
  263. airbyte_cdk/sources/utils/record_helper.py +36 -21
  264. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  265. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  266. airbyte_cdk/sources/utils/transform.py +54 -20
  267. airbyte_cdk/sql/_util/hashing.py +34 -0
  268. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  269. airbyte_cdk/sql/constants.py +32 -0
  270. airbyte_cdk/sql/exceptions.py +235 -0
  271. airbyte_cdk/sql/secrets.py +123 -0
  272. airbyte_cdk/sql/shared/__init__.py +15 -0
  273. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  274. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  275. airbyte_cdk/sql/types.py +160 -0
  276. airbyte_cdk/test/catalog_builder.py +70 -18
  277. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  278. airbyte_cdk/test/mock_http/__init__.py +1 -1
  279. airbyte_cdk/test/mock_http/matcher.py +6 -0
  280. airbyte_cdk/test/mock_http/mocker.py +57 -10
  281. airbyte_cdk/test/mock_http/request.py +19 -3
  282. airbyte_cdk/test/mock_http/response.py +3 -1
  283. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  284. airbyte_cdk/test/state_builder.py +18 -10
  285. airbyte_cdk/test/utils/__init__.py +1 -0
  286. airbyte_cdk/test/utils/data.py +24 -0
  287. airbyte_cdk/test/utils/http_mocking.py +16 -0
  288. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  289. airbyte_cdk/test/utils/reading.py +26 -0
  290. airbyte_cdk/utils/__init__.py +2 -1
  291. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  292. airbyte_cdk/utils/analytics_message.py +10 -2
  293. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  294. airbyte_cdk/utils/event_timing.py +10 -10
  295. airbyte_cdk/utils/mapping_helpers.py +3 -1
  296. airbyte_cdk/utils/message_utils.py +20 -11
  297. airbyte_cdk/utils/print_buffer.py +75 -0
  298. airbyte_cdk/utils/schema_inferrer.py +198 -28
  299. airbyte_cdk/utils/slice_hasher.py +30 -0
  300. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  301. airbyte_cdk/utils/stream_status_utils.py +8 -1
  302. airbyte_cdk/utils/traced_exception.py +61 -21
  303. airbyte_cdk-6.17.1.dev0.dist-info/METADATA +109 -0
  304. airbyte_cdk-6.17.1.dev0.dist-info/RECORD +350 -0
  305. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.17.1.dev0.dist-info}/WHEEL +1 -2
  306. airbyte_cdk-6.17.1.dev0.dist-info/entry_points.txt +3 -0
  307. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  308. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  309. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  311. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  312. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  313. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  314. airbyte_cdk/sources/deprecated/client.py +0 -99
  315. airbyte_cdk/sources/singer/__init__.py +0 -8
  316. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  317. airbyte_cdk/sources/singer/source.py +0 -186
  318. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  319. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  320. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  321. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  322. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  323. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  324. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  325. airbyte_cdk/sources/utils/schema_models.py +0 -84
  326. airbyte_cdk-0.72.0.dist-info/METADATA +0 -243
  327. airbyte_cdk-0.72.0.dist-info/RECORD +0 -466
  328. airbyte_cdk-0.72.0.dist-info/top_level.txt +0 -3
  329. source_declarative_manifest/main.py +0 -29
  330. unit_tests/connector_builder/__init__.py +0 -3
  331. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  332. unit_tests/connector_builder/test_message_grouper.py +0 -713
  333. unit_tests/connector_builder/utils.py +0 -27
  334. unit_tests/destinations/test_destination.py +0 -243
  335. unit_tests/singer/test_singer_helpers.py +0 -56
  336. unit_tests/singer/test_singer_source.py +0 -112
  337. unit_tests/sources/__init__.py +0 -0
  338. unit_tests/sources/concurrent_source/__init__.py +0 -3
  339. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  340. unit_tests/sources/declarative/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/__init__.py +0 -3
  342. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  343. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  344. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  345. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  346. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  347. unit_tests/sources/declarative/checks/__init__.py +0 -3
  348. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  349. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  350. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  351. unit_tests/sources/declarative/external_component.py +0 -13
  352. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  353. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  355. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  356. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  357. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  359. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  360. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  361. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  362. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  363. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  364. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  365. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  366. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  367. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  368. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  369. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  370. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  371. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1841
  372. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  373. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  374. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  375. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  376. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  377. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  384. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  385. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  386. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  387. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  388. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  389. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  390. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  391. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  392. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  393. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  394. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  395. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  396. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  397. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  398. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  399. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  400. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  401. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  402. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  403. unit_tests/sources/declarative/schema/__init__.py +0 -6
  404. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  405. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  406. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  407. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  408. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  409. unit_tests/sources/declarative/states/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  411. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  412. unit_tests/sources/declarative/test_create_partial.py +0 -83
  413. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  414. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  415. unit_tests/sources/declarative/test_types.py +0 -39
  416. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  417. unit_tests/sources/file_based/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  419. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  420. unit_tests/sources/file_based/config/__init__.py +0 -0
  421. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  422. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  423. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  424. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  425. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  426. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  427. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  428. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  429. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  430. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  431. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  432. unit_tests/sources/file_based/helpers.py +0 -70
  433. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  434. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  435. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  436. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  437. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  438. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  439. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  440. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  441. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  442. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  443. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  444. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  445. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  446. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  447. unit_tests/sources/file_based/stream/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  449. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  450. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  451. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  452. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  453. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  454. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  455. unit_tests/sources/file_based/test_scenarios.py +0 -253
  456. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  457. unit_tests/sources/fixtures/__init__.py +0 -3
  458. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  459. unit_tests/sources/message/__init__.py +0 -0
  460. unit_tests/sources/message/test_repository.py +0 -153
  461. unit_tests/sources/streams/__init__.py +0 -0
  462. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  464. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  466. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  467. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  469. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  470. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  471. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  472. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  473. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  474. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  475. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  476. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  477. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  478. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  479. unit_tests/sources/streams/http/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  481. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  482. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  483. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  484. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  485. unit_tests/sources/streams/http/test_http.py +0 -635
  486. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  487. unit_tests/sources/streams/test_call_rate.py +0 -300
  488. unit_tests/sources/streams/test_stream_read.py +0 -405
  489. unit_tests/sources/streams/test_streams_core.py +0 -184
  490. unit_tests/sources/test_abstract_source.py +0 -1442
  491. unit_tests/sources/test_concurrent_source.py +0 -112
  492. unit_tests/sources/test_config.py +0 -92
  493. unit_tests/sources/test_connector_state_manager.py +0 -482
  494. unit_tests/sources/test_http_logger.py +0 -252
  495. unit_tests/sources/test_integration_source.py +0 -86
  496. unit_tests/sources/test_source.py +0 -684
  497. unit_tests/sources/test_source_read.py +0 -460
  498. unit_tests/test/__init__.py +0 -0
  499. unit_tests/test/mock_http/__init__.py +0 -0
  500. unit_tests/test/mock_http/test_matcher.py +0 -53
  501. unit_tests/test/mock_http/test_mocker.py +0 -214
  502. unit_tests/test/mock_http/test_request.py +0 -117
  503. unit_tests/test/mock_http/test_response_builder.py +0 -177
  504. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  505. unit_tests/utils/__init__.py +0 -0
  506. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  507. unit_tests/utils/test_mapping_helpers.py +0 -54
  508. unit_tests/utils/test_message_utils.py +0 -91
  509. unit_tests/utils/test_rate_limiting.py +0 -26
  510. unit_tests/utils/test_schema_inferrer.py +0 -202
  511. unit_tests/utils/test_secret_utils.py +0 -135
  512. unit_tests/utils/test_stream_status_utils.py +0 -61
  513. unit_tests/utils/test_traced_exception.py +0 -107
  514. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  515. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  516. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  517. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  518. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.17.1.dev0.dist-info}/LICENSE.txt +0 -0
@@ -0,0 +1,497 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ import logging
4
+ import threading
5
+ import time
6
+ import traceback
7
+ import uuid
8
+ from datetime import timedelta
9
+ from typing import (
10
+ Any,
11
+ Generator,
12
+ Generic,
13
+ Iterable,
14
+ List,
15
+ Mapping,
16
+ Optional,
17
+ Set,
18
+ Tuple,
19
+ Type,
20
+ TypeVar,
21
+ )
22
+
23
+ from airbyte_cdk.logger import lazy_log
24
+ from airbyte_cdk.models import FailureType
25
+ from airbyte_cdk.sources.declarative.async_job.job import AsyncJob
26
+ from airbyte_cdk.sources.declarative.async_job.job_tracker import (
27
+ ConcurrentJobLimitReached,
28
+ JobTracker,
29
+ )
30
+ from airbyte_cdk.sources.declarative.async_job.repository import AsyncJobRepository
31
+ from airbyte_cdk.sources.declarative.async_job.status import AsyncJobStatus
32
+ from airbyte_cdk.sources.message import MessageRepository
33
+ from airbyte_cdk.sources.types import StreamSlice
34
+ from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
35
+ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
36
+
37
+ LOGGER = logging.getLogger("airbyte")
38
+ _NO_TIMEOUT = timedelta.max
39
+ _API_SIDE_RUNNING_STATUS = {AsyncJobStatus.RUNNING, AsyncJobStatus.TIMED_OUT}
40
+
41
+
42
class AsyncPartition:
    """
    Set of API jobs created for one stream slice, along with the number of attempts made for each job.

    Only one job per partition is created for now; the grouping is meant to become useful once jobs can be split.
    """

    _MAX_NUMBER_OF_ATTEMPTS = 3

    def __init__(self, jobs: List[AsyncJob], stream_slice: StreamSlice) -> None:
        # every job provided at creation time is on its first attempt
        self._attempts_per_job = dict.fromkeys(jobs, 1)
        self._stream_slice = stream_slice

    def has_reached_max_attempt(self) -> bool:
        """
        Tell whether at least one job of this partition has used all of its allowed attempts.
        """
        for attempts in self._attempts_per_job.values():
            if attempts >= self._MAX_NUMBER_OF_ATTEMPTS:
                return True
        return False

    def replace_job(self, job_to_replace: AsyncJob, new_jobs: List[AsyncJob]) -> None:
        """
        Swap `job_to_replace` for `new_jobs`. Each new job starts with the attempt count of the replaced job plus one.

        Raises:
            ValueError: if `job_to_replace` is not part of this partition or if it had already reached the max number of attempts.
                In both cases, `job_to_replace` is no longer tracked by the partition once the error is raised.
        """
        attempts_so_far = self._attempts_per_job.pop(job_to_replace, None)
        if attempts_so_far is None:
            raise ValueError("Could not find job to replace")
        if attempts_so_far >= self._MAX_NUMBER_OF_ATTEMPTS:
            raise ValueError(f"Max attempt reached for job in partition {self._stream_slice}")

        self._attempts_per_job.update(dict.fromkeys(new_jobs, attempts_so_far + 1))

    def should_split(self, job: AsyncJob) -> bool:
        """
        Placeholder for job splitting: always False for now. Once splitting is supported, the decision should rely on the number of
        attempts.
        """
        return False

    @property
    def jobs(self) -> Iterable[AsyncJob]:
        # live view on the tracked jobs: copy it before replacing jobs while iterating
        return self._attempts_per_job.keys()

    @property
    def stream_slice(self) -> StreamSlice:
        return self._stream_slice

    @property
    def status(self) -> AsyncJobStatus:
        """
        Aggregate the job statuses: COMPLETED only if every job is completed, else FAILED, then TIMED_OUT, then RUNNING by priority.
        """
        observed = {job.status() for job in self.jobs}
        if observed == {AsyncJobStatus.COMPLETED}:
            return AsyncJobStatus.COMPLETED
        for candidate in (AsyncJobStatus.FAILED, AsyncJobStatus.TIMED_OUT):
            if candidate in observed:
                return candidate
        return AsyncJobStatus.RUNNING

    def __repr__(self) -> str:
        return f"AsyncPartition(stream_slice={self._stream_slice}, attempt_per_job={self._attempts_per_job})"

    def __json_serializable__(self) -> Any:
        return self._stream_slice
106
+
107
+
108
T = TypeVar("T")


class LookaheadIterator(Generic[T]):
    """
    Iterator wrapper that can tell whether another item is available and accepts items pushed back at its front.

    Items waiting in the internal buffer are always served first, in buffer order, before the wrapped iterator is consumed again.
    """

    def __init__(self, iterable: Iterable[T]) -> None:
        self._iterator = iter(iterable)
        # items already pulled from (or pushed back in front of) the wrapped iterator; index 0 is the next item to serve
        self._buffer: List[T] = []

    def __iter__(self) -> "LookaheadIterator[T]":
        return self

    def __next__(self) -> T:
        if self._buffer:
            # serve from the front so that an item added with `add_at_the_beginning` comes out before an item that was only peeked
            # by `has_next`
            return self._buffer.pop(0)
        return next(self._iterator)

    def has_next(self) -> bool:
        """
        Return True if a following call to `next` will yield an item. This may pull one item from the wrapped iterator and buffer it.
        """
        if self._buffer:
            return True

        try:
            self._buffer.append(next(self._iterator))
        except StopIteration:
            return False
        return True

    def add_at_the_beginning(self, item: T) -> None:
        """
        Push `item` back so that it is the very next item returned by the iterator.
        """
        self._buffer.insert(0, item)
138
+
139
+
140
+ class AsyncJobOrchestrator:
141
+ _WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS = 5
142
+ _KNOWN_JOB_STATUSES = {
143
+ AsyncJobStatus.COMPLETED,
144
+ AsyncJobStatus.FAILED,
145
+ AsyncJobStatus.RUNNING,
146
+ AsyncJobStatus.TIMED_OUT,
147
+ }
148
+ _RUNNING_ON_API_SIDE_STATUS = {AsyncJobStatus.RUNNING, AsyncJobStatus.TIMED_OUT}
149
+
150
+ def __init__(
151
+ self,
152
+ job_repository: AsyncJobRepository,
153
+ slices: Iterable[StreamSlice],
154
+ job_tracker: JobTracker,
155
+ message_repository: MessageRepository,
156
+ exceptions_to_break_on: Iterable[Type[Exception]] = tuple(),
157
+ has_bulk_parent: bool = False,
158
+ ) -> None:
159
+ """
160
+ If the stream slices provided as a parameters relies on a async job streams that relies on the same JobTracker, `has_bulk_parent`
161
+ needs to be set to True as jobs creation needs to be prioritized on the parent level. Doing otherwise could lead to a situation
162
+ where the child has taken up all the job budget without room to the parent to create more which would lead to an infinite loop of
163
+ "trying to start a parent job" and "ConcurrentJobLimitReached".
164
+ """
165
+ if {*AsyncJobStatus} != self._KNOWN_JOB_STATUSES:
166
+ # this is to prevent developers updating the possible statuses without updating the logic of this class
167
+ raise ValueError(
168
+ "An AsyncJobStatus has been either removed or added which means the logic of this class needs to be reviewed. Once the logic has been updated, please update _KNOWN_JOB_STATUSES"
169
+ )
170
+
171
+ self._job_repository: AsyncJobRepository = job_repository
172
+ self._slice_iterator = LookaheadIterator(slices)
173
+ self._running_partitions: List[AsyncPartition] = []
174
+ self._job_tracker = job_tracker
175
+ self._message_repository = message_repository
176
+ self._exceptions_to_break_on: Tuple[Type[Exception], ...] = tuple(exceptions_to_break_on)
177
+ self._has_bulk_parent = has_bulk_parent
178
+
179
+ self._non_breaking_exceptions: List[Exception] = []
180
+
181
+ def _replace_failed_jobs(self, partition: AsyncPartition) -> None:
182
+ failed_status_jobs = (AsyncJobStatus.FAILED, AsyncJobStatus.TIMED_OUT)
183
+ jobs_to_replace = [job for job in partition.jobs if job.status() in failed_status_jobs]
184
+ for job in jobs_to_replace:
185
+ new_job = self._start_job(job.job_parameters(), job.api_job_id())
186
+ partition.replace_job(job, [new_job])
187
+
188
+ def _start_jobs(self) -> None:
189
+ """
190
+ Retry failed jobs and start jobs for each slice in the slice iterator.
191
+ This method iterates over the running jobs and slice iterator and starts a job for each slice.
192
+ The started jobs are added to the running partitions.
193
+ Returns:
194
+ None
195
+
196
+ However, the first iteration is for sendgrid which only has one job.
197
+ """
198
+ at_least_one_slice_consumed_from_slice_iterator_during_current_iteration = False
199
+ _slice = None
200
+ try:
201
+ for partition in self._running_partitions:
202
+ self._replace_failed_jobs(partition)
203
+
204
+ if (
205
+ self._has_bulk_parent
206
+ and self._running_partitions
207
+ and self._slice_iterator.has_next()
208
+ ):
209
+ LOGGER.debug(
210
+ "This AsyncJobOrchestrator is operating as a child of a bulk stream hence we limit the number of concurrent jobs on the child until there are no more parent slices to avoid the child taking all the API job budget"
211
+ )
212
+ return
213
+
214
+ for _slice in self._slice_iterator:
215
+ at_least_one_slice_consumed_from_slice_iterator_during_current_iteration = True
216
+ job = self._start_job(_slice)
217
+ self._running_partitions.append(AsyncPartition([job], _slice))
218
+ if self._has_bulk_parent and self._slice_iterator.has_next():
219
+ break
220
+ except ConcurrentJobLimitReached:
221
+ if at_least_one_slice_consumed_from_slice_iterator_during_current_iteration:
222
+ # this means a slice has been consumed but the job couldn't be create therefore we need to put it back at the beginning of the _slice_iterator
223
+ self._slice_iterator.add_at_the_beginning(_slice) # type: ignore # we know it's not None here because `ConcurrentJobLimitReached` happens during the for loop
224
+ LOGGER.debug(
225
+ "Waiting before creating more jobs as the limit of concurrent jobs has been reached. Will try again later..."
226
+ )
227
+
228
+ def _start_job(self, _slice: StreamSlice, previous_job_id: Optional[str] = None) -> AsyncJob:
229
+ if previous_job_id:
230
+ id_to_replace = previous_job_id
231
+ lazy_log(LOGGER, logging.DEBUG, lambda: f"Attempting to replace job {id_to_replace}...")
232
+ else:
233
+ id_to_replace = self._job_tracker.try_to_get_intent()
234
+
235
+ try:
236
+ job = self._job_repository.start(_slice)
237
+ self._job_tracker.add_job(id_to_replace, job.api_job_id())
238
+ return job
239
+ except Exception as exception:
240
+ LOGGER.warning(f"Exception has occurred during job creation: {exception}")
241
+ if self._is_breaking_exception(exception):
242
+ self._job_tracker.remove_job(id_to_replace)
243
+ raise exception
244
+ return self._keep_api_budget_with_failed_job(_slice, exception, id_to_replace)
245
+
246
+ def _keep_api_budget_with_failed_job(
247
+ self, _slice: StreamSlice, exception: Exception, intent: str
248
+ ) -> AsyncJob:
249
+ """
250
+ We have a mechanism to retry job. It is used when a job status is FAILED or TIMED_OUT. The easiest way to retry is to have this job
251
+ as created in a failed state and leverage the retry for failed/timed out jobs. This way, we don't have to have another process for
252
+ retrying jobs that couldn't be started.
253
+ """
254
+ LOGGER.warning(
255
+ f"Could not start job for slice {_slice}. Job will be flagged as failed and retried if max number of attempts not reached: {exception}"
256
+ )
257
+ traced_exception = (
258
+ exception
259
+ if isinstance(exception, AirbyteTracedException)
260
+ else AirbyteTracedException.from_exception(exception)
261
+ )
262
+ # Even though we're not sure this will break the stream, we will emit here for simplicity's sake. If we wanted to be more accurate,
263
+ # we would keep the exceptions in-memory until we know that we have reached the max attempt.
264
+ self._message_repository.emit_message(traced_exception.as_airbyte_message())
265
+ job = self._create_failed_job(_slice)
266
+ self._job_tracker.add_job(intent, job.api_job_id())
267
+ return job
268
+
269
+ def _create_failed_job(self, stream_slice: StreamSlice) -> AsyncJob:
270
+ job = AsyncJob(f"{uuid.uuid4()} - Job that could not start", stream_slice, _NO_TIMEOUT)
271
+ job.update_status(AsyncJobStatus.FAILED)
272
+ return job
273
+
274
+ def _get_running_jobs(self) -> Set[AsyncJob]:
275
+ """
276
+ Returns a set of running AsyncJob objects.
277
+
278
+ Returns:
279
+ Set[AsyncJob]: A set of AsyncJob objects that are currently running.
280
+ """
281
+ return {
282
+ job
283
+ for partition in self._running_partitions
284
+ for job in partition.jobs
285
+ if job.status() == AsyncJobStatus.RUNNING
286
+ }
287
+
288
+ def _update_jobs_status(self) -> None:
289
+ """
290
+ Update the status of all running jobs in the repository.
291
+ """
292
+ running_jobs = self._get_running_jobs()
293
+ if running_jobs:
294
+ # update the status only if there are RUNNING jobs
295
+ self._job_repository.update_jobs_status(running_jobs)
296
+
297
+ def _wait_on_status_update(self) -> None:
298
+ """
299
+ Waits for a specified amount of time between status updates.
300
+
301
+
302
+ This method is used to introduce a delay between status updates in order to avoid excessive polling.
303
+ The duration of the delay is determined by the value of `_WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS`.
304
+
305
+ Returns:
306
+ None
307
+ """
308
+ lazy_log(
309
+ LOGGER,
310
+ logging.DEBUG,
311
+ lambda: f"Polling status in progress. There are currently {len(self._running_partitions)} running partitions.",
312
+ )
313
+
314
+ lazy_log(
315
+ LOGGER,
316
+ logging.DEBUG,
317
+ lambda: f"Waiting for {self._WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS} seconds before next poll...",
318
+ )
319
+ time.sleep(self._WAIT_TIME_BETWEEN_STATUS_UPDATE_IN_SECONDS)
320
+
321
+ def _process_completed_partition(self, partition: AsyncPartition) -> None:
322
+ """
323
+ Process a completed partition.
324
+ Args:
325
+ partition (AsyncPartition): The completed partition to process.
326
+ """
327
+ job_ids = list(map(lambda job: job.api_job_id(), {job for job in partition.jobs}))
328
+ LOGGER.info(
329
+ f"The following jobs for stream slice {partition.stream_slice} have been completed: {job_ids}."
330
+ )
331
+
332
+ # It is important to remove the jobs from the job tracker before yielding the partition as the caller might try to schedule jobs
333
+ # but won't be able to as all jobs slots are taken even though job is done.
334
+ for job in partition.jobs:
335
+ self._job_tracker.remove_job(job.api_job_id())
336
+
337
+ def _process_running_partitions_and_yield_completed_ones(
338
+ self,
339
+ ) -> Generator[AsyncPartition, Any, None]:
340
+ """
341
+ Process the running partitions.
342
+
343
+ Yields:
344
+ AsyncPartition: The processed partition.
345
+
346
+ Raises:
347
+ Any: Any exception raised during processing.
348
+ """
349
+ current_running_partitions: List[AsyncPartition] = []
350
+ for partition in self._running_partitions:
351
+ match partition.status:
352
+ case AsyncJobStatus.COMPLETED:
353
+ self._process_completed_partition(partition)
354
+ yield partition
355
+ case AsyncJobStatus.RUNNING:
356
+ current_running_partitions.append(partition)
357
+ case _ if partition.has_reached_max_attempt():
358
+ self._stop_partition(partition)
359
+ self._process_partitions_with_errors(partition)
360
+ case _:
361
+ self._stop_timed_out_jobs(partition)
362
+
363
+ # job will be restarted in `_start_job`
364
+ current_running_partitions.insert(0, partition)
365
+
366
+ for job in partition.jobs:
367
+ # We only remove completed jobs as we want failed/timed out jobs to be re-allocated in priority
368
+ if job.status() == AsyncJobStatus.COMPLETED:
369
+ self._job_tracker.remove_job(job.api_job_id())
370
+
371
+ # update the referenced list with running partitions
372
+ self._running_partitions = current_running_partitions
373
+
374
+ def _stop_partition(self, partition: AsyncPartition) -> None:
375
+ for job in partition.jobs:
376
+ if job.status() in _API_SIDE_RUNNING_STATUS:
377
+ self._abort_job(job, free_job_allocation=True)
378
+ else:
379
+ self._job_tracker.remove_job(job.api_job_id())
380
+
381
+ def _stop_timed_out_jobs(self, partition: AsyncPartition) -> None:
382
+ for job in partition.jobs:
383
+ if job.status() == AsyncJobStatus.TIMED_OUT:
384
+ # we don't free allocation here because it is expected to retry the job
385
+ self._abort_job(job, free_job_allocation=False)
386
+
387
+ def _abort_job(self, job: AsyncJob, free_job_allocation: bool = True) -> None:
388
+ try:
389
+ self._job_repository.abort(job)
390
+ if free_job_allocation:
391
+ self._job_tracker.remove_job(job.api_job_id())
392
+ except Exception as exception:
393
+ LOGGER.warning(f"Could not free budget for job {job.api_job_id()}: {exception}")
394
+
395
+ def _process_partitions_with_errors(self, partition: AsyncPartition) -> None:
396
+ """
397
+ Process a partition with status errors (FAILED and TIMEOUT).
398
+
399
+ Args:
400
+ partition (AsyncPartition): The partition to process.
401
+ Returns:
402
+ AirbyteTracedException: An exception indicating that at least one job could not be completed.
403
+ Raises:
404
+ AirbyteTracedException: If at least one job could not be completed.
405
+ """
406
+ status_by_job_id = {job.api_job_id(): job.status() for job in partition.jobs}
407
+ self._non_breaking_exceptions.append(
408
+ AirbyteTracedException(
409
+ internal_message=f"At least one job could not be completed for slice {partition.stream_slice}. Job statuses were: {status_by_job_id}. See warning logs for more information.",
410
+ failure_type=FailureType.config_error,
411
+ )
412
+ )
413
+
414
+ def create_and_get_completed_partitions(self) -> Iterable[AsyncPartition]:
415
+ """
416
+ Creates and retrieves completed partitions.
417
+ This method continuously starts jobs, updates job status, processes running partitions,
418
+ logs polling partitions, and waits for status updates. It yields completed partitions
419
+ as they become available.
420
+
421
+ Returns:
422
+ An iterable of completed partitions, represented as AsyncPartition objects.
423
+ Each partition is wrapped in an Optional, allowing for None values.
424
+ """
425
+ while True:
426
+ try:
427
+ lazy_log(
428
+ LOGGER,
429
+ logging.DEBUG,
430
+ lambda: f"JobOrchestrator loop - (Thread {threading.get_native_id()}, AsyncJobOrchestrator {self}) is starting the async job loop",
431
+ )
432
+ self._start_jobs()
433
+ if not self._slice_iterator.has_next() and not self._running_partitions:
434
+ break
435
+
436
+ self._update_jobs_status()
437
+ yield from self._process_running_partitions_and_yield_completed_ones()
438
+ self._wait_on_status_update()
439
+ except Exception as exception:
440
+ if self._is_breaking_exception(exception):
441
+ LOGGER.warning(
442
+ f"Caught exception that stops the processing of the jobs: {exception}"
443
+ )
444
+ self._abort_all_running_jobs()
445
+ raise exception
446
+
447
+ self._non_breaking_exceptions.append(exception)
448
+
449
+ LOGGER.info(
450
+ f"JobOrchestrator loop - Thread (Thread {threading.get_native_id()}, AsyncJobOrchestrator {self}) completed! Errors during creation were {self._non_breaking_exceptions}"
451
+ )
452
+ if self._non_breaking_exceptions:
453
+ # We emitted traced message but we didn't break on non_breaking_exception. We still need to raise an exception so that the
454
+ # call of `create_and_get_completed_partitions` knows that there was an issue with some partitions and the sync is incomplete.
455
+ raise AirbyteTracedException(
456
+ message="",
457
+ internal_message="\n".join(
458
+ [
459
+ filter_secrets(exception.__repr__())
460
+ for exception in self._non_breaking_exceptions
461
+ ]
462
+ ),
463
+ failure_type=FailureType.config_error,
464
+ )
465
+
466
+ def _handle_non_breaking_error(self, exception: Exception) -> None:
467
+ LOGGER.error(f"Failed to start the Job: {exception}, traceback: {traceback.format_exc()}")
468
+ self._non_breaking_exceptions.append(exception)
469
+
470
+ def _abort_all_running_jobs(self) -> None:
471
+ for partition in self._running_partitions:
472
+ for job in partition.jobs:
473
+ if job.status() in self._RUNNING_ON_API_SIDE_STATUS:
474
+ self._abort_job(job, free_job_allocation=True)
475
+ self._job_tracker.remove_job(job.api_job_id())
476
+
477
+ self._running_partitions = []
478
+
479
+ def _is_breaking_exception(self, exception: Exception) -> bool:
480
+ return isinstance(exception, self._exceptions_to_break_on) or (
481
+ isinstance(exception, AirbyteTracedException)
482
+ and exception.failure_type == FailureType.config_error
483
+ )
484
+
485
+ def fetch_records(self, partition: AsyncPartition) -> Iterable[Mapping[str, Any]]:
486
+ """
487
+ Fetches records from the given partition's jobs.
488
+
489
+ Args:
490
+ partition (AsyncPartition): The partition containing the jobs.
491
+
492
+ Yields:
493
+ Iterable[Mapping[str, Any]]: The fetched records from the jobs.
494
+ """
495
+ for job in partition.jobs:
496
+ yield from self._job_repository.fetch_records(job)
497
+ self._job_repository.delete(job)
@@ -0,0 +1,75 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ import logging
4
+ import threading
5
+ import uuid
6
+ from typing import Set
7
+
8
+ from airbyte_cdk.logger import lazy_log
9
+
10
+ LOGGER = logging.getLogger("airbyte")
11
+
12
+
13
class ConcurrentJobLimitReached(Exception):
    """
    Raised by `JobTracker.try_to_get_intent` when the limit of tracked jobs is already reached.
    """
15
+
16
+
17
class JobTracker:
    """
    Keep track of the jobs (or intents to create a job) that count against a limit of concurrent jobs.

    A slot is first reserved with `try_to_get_intent`, then bound to the actual job id with `add_job` and finally released with
    `remove_job`. Every read and write of the tracked ids is done while holding the lock.
    """

    def __init__(self, limit: int):
        self._jobs: Set[str] = set()
        self._limit = limit
        self._lock = threading.Lock()

    def try_to_get_intent(self) -> str:
        """
        Reserve a slot and return the intent identifying it.

        Raises:
            ConcurrentJobLimitReached: if the number of tracked ids has already reached the limit.
        """
        lazy_log(
            LOGGER,
            logging.DEBUG,
            lambda: f"JobTracker - Trying to acquire lock by thread {threading.get_native_id()}...",
        )
        with self._lock:
            if self._has_reached_limit():
                raise ConcurrentJobLimitReached(
                    "Can't allocate more jobs right now: limit already reached"
                )
            intent = f"intent_{uuid.uuid4()}"
            lazy_log(
                LOGGER,
                logging.DEBUG,
                lambda: f"JobTracker - Thread {threading.get_native_id()} has acquired {intent}!",
            )
            self._jobs.add(intent)
            return intent

    def add_job(self, intent_or_job_id: str, job_id: str) -> None:
        """
        Replace a tracked intent (or job id) by `job_id`, keeping the number of used slots unchanged.

        Raises:
            ValueError: if `intent_or_job_id` is not currently tracked.
        """
        # The membership check and the swap are done under the same lock acquisition: checking outside of the lock would let another
        # thread remove `intent_or_job_id` in between, making `set.remove` fail with a KeyError.
        with self._lock:
            if intent_or_job_id not in self._jobs:
                raise ValueError(
                    f"Can't add job: Unknown intent or job id, known values are {self._jobs}"
                )

            if intent_or_job_id == job_id:
                # Nothing to do here as the ID to replace is the same
                return

            lazy_log(
                LOGGER,
                logging.DEBUG,
                lambda: f"JobTracker - Thread {threading.get_native_id()} replacing job {intent_or_job_id} by {job_id}!",
            )
            self._jobs.add(job_id)
            self._jobs.remove(intent_or_job_id)

    def remove_job(self, job_id: str) -> None:
        """
        If the job is not allocated as a running job, this method does nothing and it won't raise.
        """
        lazy_log(
            LOGGER,
            logging.DEBUG,
            lambda: f"JobTracker - Thread {threading.get_native_id()} removing job {job_id}",
        )
        with self._lock:
            self._jobs.discard(job_id)

    def _has_reached_limit(self) -> bool:
        # expected to be called while holding the lock
        return len(self._jobs) >= self._limit
@@ -0,0 +1,35 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ from abc import abstractmethod
4
+ from typing import Any, Iterable, Mapping, Set
5
+
6
+ from airbyte_cdk.sources.declarative.async_job.job import AsyncJob
7
+ from airbyte_cdk.sources.types import StreamSlice
8
+
9
+
10
class AsyncJobRepository:
    """
    Interface between the async job logic and the API that actually runs the jobs.

    Note that the class does not derive from `abc.ABC`, hence `@abstractmethod` is not enforced when instantiating: an
    implementation that leaves a method out only fails (or silently does nothing) when that method is called.
    """

    @abstractmethod
    def start(self, stream_slice: StreamSlice) -> AsyncJob:
        """
        Create a job for `stream_slice` and return it.
        """

    @abstractmethod
    def update_jobs_status(self, jobs: Set[AsyncJob]) -> None:
        """
        Refresh the status of each job of `jobs`.
        """

    @abstractmethod
    def fetch_records(self, job: AsyncJob) -> Iterable[Mapping[str, Any]]:
        """
        Return the records produced by `job`.
        """

    @abstractmethod
    def abort(self, job: AsyncJob) -> None:
        """
        Called when we need to stop on the API side. This method can raise NotImplementedError as not all the APIs will support aborting
        jobs.
        """
        raise NotImplementedError(
            "Either the API or the AsyncJobRepository implementation do not support aborting jobs"
        )

    @abstractmethod
    def delete(self, job: AsyncJob) -> None:
        """
        Delete `job`.
        """
@@ -0,0 +1,24 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+
4
+ from enum import Enum
5
+
6
_TERMINAL = True


class AsyncJobStatus(Enum):
    """
    Status of an async job. The value of each member is a `(name, is_terminal)` tuple.
    """

    RUNNING = ("RUNNING", not _TERMINAL)
    COMPLETED = ("COMPLETED", _TERMINAL)
    FAILED = ("FAILED", _TERMINAL)
    TIMED_OUT = ("TIMED_OUT", _TERMINAL)

    def __init__(self, value: str, is_terminal: bool) -> None:
        # the member value tuple is unpacked into these two attributes
        self._is_terminal = is_terminal
        self._value = value

    def is_terminal(self) -> bool:
        """
        A terminal status is a status that can't change anymore. For example, a completed job stays completed whereas a running job
        might become completed, failed or timed out.
        """
        return self._is_terminal
@@ -0,0 +1,39 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+ from datetime import datetime, timedelta, timezone
3
+ from typing import Optional
4
+
5
+
6
class Timer:
    """
    Stopwatch based on UTC datetimes that can tell whether more time than `timeout` has elapsed since it was started.
    """

    def __init__(self, timeout: timedelta) -> None:
        self._timeout = timeout
        self._start_datetime: Optional[datetime] = None
        self._end_datetime: Optional[datetime] = None

    def start(self) -> None:
        """
        Start (or restart) the timer. Any previous stop is forgotten.
        """
        self._end_datetime = None
        self._start_datetime = self._now()

    def stop(self) -> None:
        """
        Freeze the elapsed time. Stopping an already stopped timer keeps the first stop time.
        """
        if self._end_datetime is not None:
            return
        self._end_datetime = self._now()

    def is_started(self) -> bool:
        return self._start_datetime is not None

    @property
    def elapsed_time(self) -> Optional[timedelta]:
        """
        Time between the start and the stop (or now if the timer is not stopped). None if the timer was never started.
        """
        if not self._start_datetime:
            return None

        reference = self._end_datetime if self._end_datetime else self._now()
        return reference - self._start_datetime

    def has_timed_out(self) -> bool:
        """
        True only if the timer was started and the elapsed time is strictly greater than the timeout.
        """
        # a started timer always has an elapsed time, hence the comparison is safe
        return self.is_started() and self.elapsed_time > self._timeout  # type: ignore

    @staticmethod
    def _now() -> datetime:
        return datetime.now(tz=timezone.utc)
@@ -2,8 +2,7 @@
2
2
  # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
3
  #
4
4
 
5
+ from airbyte_cdk.sources.declarative.auth.jwt import JwtAuthenticator
5
6
  from airbyte_cdk.sources.declarative.auth.oauth import DeclarativeOauth2Authenticator
6
7
 
7
- __all__ = [
8
- "DeclarativeOauth2Authenticator",
9
- ]
8
+ __all__ = ["DeclarativeOauth2Authenticator", "JwtAuthenticator"]