airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368) hide show
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,380 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import logging
6
+ from collections import OrderedDict
7
+ from typing import Any, Callable, Iterable, Mapping, Optional, Union
8
+
9
+ from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
10
+ from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
11
+ from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
12
+ PerPartitionKeySerializer,
13
+ )
14
+ from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
15
+
16
+ logger = logging.getLogger("airbyte")
17
+
18
+
19
class CursorFactory:
    """
    Thin wrapper around a zero-argument callable that builds cursors.

    Every call to `create` invokes the wrapped callable again and returns its result.
    """

    def __init__(self, create_function: Callable[[], DeclarativeCursor]):
        # Only stored here; the callable is invoked on demand by `create`.
        self._create_function = create_function

    def create(self) -> DeclarativeCursor:
        """Invoke the stored callable and return the cursor it produces."""
        build_cursor = self._create_function
        return build_cursor()
25
+
26
+
27
+ class PerPartitionCursor(DeclarativeCursor):
28
+ """
29
+ Manages state per partition when a stream has many partitions, to prevent data loss or duplication.
30
+
31
+ **Partition Limitation and Limit Reached Logic**
32
+
33
+ - **DEFAULT_MAX_PARTITIONS_NUMBER**: The maximum number of partitions to keep in memory (default is 10,000).
34
+ - **_cursor_per_partition**: An ordered dictionary that stores cursors for each partition.
35
+ - **_over_limit**: A counter that increments each time an oldest partition is removed when the limit is exceeded.
36
+
37
+ The class ensures that the number of partitions tracked does not exceed the `DEFAULT_MAX_PARTITIONS_NUMBER` to prevent excessive memory usage.
38
+
39
+ - When the number of partitions exceeds the limit, the oldest partitions are removed from `_cursor_per_partition`, and `_over_limit` is incremented accordingly.
40
+ - The `limit_reached` method returns `True` when `_over_limit` exceeds `DEFAULT_MAX_PARTITIONS_NUMBER`, indicating that the global cursor should be used instead of per-partition cursors.
41
+
42
+ This approach avoids unnecessary switching to a global cursor due to temporary spikes in partition counts, ensuring that switching is only done when a sustained high number of partitions is observed.
43
+ """
44
+
45
+ DEFAULT_MAX_PARTITIONS_NUMBER = 10000
46
+ _NO_STATE: Mapping[str, Any] = {}
47
+ _NO_CURSOR_STATE: Mapping[str, Any] = {}
48
+ _KEY = 0
49
+ _VALUE = 1
50
+ _state_to_migrate_from: Mapping[str, Any] = {}
51
+
52
def __init__(self, cursor_factory: CursorFactory, partition_router: PartitionRouter):
    """
    Store the collaborators and start with no tracked partitions.

    Args:
        cursor_factory (CursorFactory): kept on the instance for building per-partition cursors.
        partition_router (PartitionRouter): provides the partitions iterated by `stream_slices`.
    """
    self._partition_router = partition_router
    self._cursor_factory = cursor_factory
    self._partition_serializer = PerPartitionKeySerializer()
    # Counts how many partitions have been evicted because the cap was hit.
    self._over_limit = 0
    # Insertion order is what lets the oldest partition be popped cheaply once the
    # maximum number of partitions is reached, keeping the most recent ones.
    self._cursor_per_partition: OrderedDict[str, DeclarativeCursor] = OrderedDict()
60
+
61
def stream_slices(self) -> Iterable[StreamSlice]:
    """
    Yield the slices of every partition produced by the partition router.

    Partitions are processed in the order the router returns them; each one is expanded
    through `generate_slices_from_partition`.
    """
    for routed_partition in self._partition_router.stream_slices():
        yield from self.generate_slices_from_partition(routed_partition)
65
+
66
def generate_slices_from_partition(self, partition: StreamSlice) -> Iterable[StreamSlice]:
    """
    Yield one slice for each cursor slice of the cursor that tracks `partition`.

    If no cursor is registered for the partition yet, one is created and stored. It is seeded
    with `_state_to_migrate_from` when that is non-empty, otherwise with `_NO_CURSOR_STATE`.

    Args:
        partition (StreamSlice): the partition to expand; its `partition` mapping is used to
            build the lookup key and its `extra_fields` are copied onto every yielded slice.

    Returns:
        Iterable[StreamSlice]: slices combining `partition` with each cursor slice.
    """
    # Ensure the maximum number of partitions is not exceeded. This runs before the lookup,
    # so the oldest entry may be evicted even when `partition` is already tracked.
    self._ensure_partition_limit()

    # Serialize the partition once and reuse the key for both the lookup and the insert.
    partition_key = self._to_partition_key(partition.partition)
    cursor = self._cursor_per_partition.get(partition_key)
    # `.get` returns None for an untracked partition; test for that explicitly so an
    # existing cursor is never replaced merely because it evaluates as falsy.
    if cursor is None:
        partition_state = self._state_to_migrate_from or self._NO_CURSOR_STATE
        cursor = self._create_cursor(partition_state)
        self._cursor_per_partition[partition_key] = cursor

    for cursor_slice in cursor.stream_slices():
        yield StreamSlice(
            partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
        )
84
+
85
def _ensure_partition_limit(self) -> None:
    """
    Evict the oldest tracked partitions until there is room for one more.

    Entries are removed from the front of `_cursor_per_partition` while it holds
    `DEFAULT_MAX_PARTITIONS_NUMBER` entries or more. Each eviction increments `_over_limit`
    and logs a warning.
    """
    max_partitions = self.DEFAULT_MAX_PARTITIONS_NUMBER
    while len(self._cursor_per_partition) >= max_partitions:
        self._over_limit += 1
        # last=False pops in FIFO order, i.e. the entry that was inserted first.
        oldest_partition, _dropped_cursor = self._cursor_per_partition.popitem(last=False)
        logger.warning(
            f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._over_limit}."
        )
97
+
98
def limit_reached(self) -> bool:
    """
    Report whether the eviction counter has gone past the partition cap.

    Returns:
        bool: True only when `_over_limit` is strictly greater than `DEFAULT_MAX_PARTITIONS_NUMBER`.
    """
    evicted_so_far = self._over_limit
    return evicted_so_far > self.DEFAULT_MAX_PARTITIONS_NUMBER
100
+
101
def set_initial_state(self, stream_state: StreamState) -> None:
    """
    Seed the per-partition cursors (and the partition router) from a saved stream state.

    An empty state is ignored entirely. A state without a "states" key is treated as a global
    format and kept as the state every new partition cursor starts from. Otherwise a cursor
    is created for each listed partition, and an optional top-level "state" entry becomes the
    starting state for partitions that are not listed.

    Finally the whole state is handed to the partition router so that routers based on parent
    streams can restore their parent state. Routers without parent streams skip this step
    through the default PartitionRouter implementation.

    Args:
        stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
            {
                "states": [
                    {
                        "partition": {
                            "partition_key": "value"
                        },
                        "cursor": {
                            "last_updated": "2023-05-27T00:00:00Z"
                        }
                    }
                ],
                "parent_state": {
                    "parent_stream_name": {
                        "last_updated": "2023-05-27T00:00:00Z"
                    }
                }
            }
    """
    if not stream_state:
        return

    if "states" in stream_state:
        for partition_entry in stream_state["states"]:
            # Build the cursor first, then the key, to mirror the evaluation order of a
            # subscript assignment (right-hand side before the target).
            restored_cursor = self._create_cursor(partition_entry["cursor"])
            partition_key = self._to_partition_key(partition_entry["partition"])
            self._cursor_per_partition[partition_key] = restored_cursor

        # set default state for missing partitions if it is per partition with fallback to global
        if "state" in stream_state:
            self._state_to_migrate_from = stream_state["state"]
    else:
        # We assume that `stream_state` is in a global format that can be applied to all partitions.
        # Example: {"global_state_format_key": "global_state_format_value"}
        self._state_to_migrate_from = stream_state

    # Set parent state for partition routers based on parent streams
    self._partition_router.set_initial_state(stream_state)
151
+
152
def observe(self, stream_slice: StreamSlice, record: Record) -> None:
    """
    Forward ``record`` to the cursor registered for the partition of ``stream_slice``.

    The per-partition cursor is given a slice with an empty partition and the original cursor slice.
    A missing cursor for the partition surfaces as a ``KeyError``.
    """
    partition_key = self._to_partition_key(stream_slice.partition)
    partition_cursor = self._cursor_per_partition[partition_key]
    cursor_only_slice = StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice)
    partition_cursor.observe(cursor_only_slice, record)
156
+
157
def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
    """
    Close ``stream_slice`` on the cursor that tracks its partition.

    The per-partition cursor receives a slice that carries only the cursor part of
    ``stream_slice``; any extra positional arguments are forwarded unchanged.

    Raises:
        ValueError: If a ``KeyError`` surfaces while closing the slice, typically because no cursor
            is registered for the partition. The original ``KeyError`` is attached as the cause.
    """
    try:
        self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].close_slice(
            StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice), *args
        )
    except KeyError as exception:
        # Chain explicitly so the traceback shows the lookup failure as the direct cause.
        raise ValueError(
            f"Partition {str(exception)} could not be found in current state based on the record. This is unexpected because "
            f"we should only update state for partitions that were emitted during `stream_slices`"
        ) from exception
167
+
168
def get_stream_state(self) -> StreamState:
    """
    Assemble the state of every tracked partition plus the partition router's state.

    Partitions whose cursor reports an empty state are left out of ``"states"``. The router's
    state is added under ``"parent_state"`` only when it is non-empty.
    """
    partition_states = [
        {"partition": self._to_dict(partition_key), "cursor": cursor_state}
        for partition_key, partition_cursor in self._cursor_per_partition.items()
        if (cursor_state := partition_cursor.get_stream_state())
    ]
    combined_state: dict[str, Any] = {"states": partition_states}

    router_state = self._partition_router.get_stream_state()
    if router_state:
        combined_state["parent_state"] = router_state
    return combined_state
185
+
186
+ def _get_state_for_partition(self, partition: Mapping[str, Any]) -> Optional[StreamState]:
187
+ cursor = self._cursor_per_partition.get(self._to_partition_key(partition))
188
+ if cursor:
189
+ return cursor.get_stream_state()
190
+
191
+ return None
192
+
193
+ @staticmethod
194
+ def _is_new_state(stream_state: Mapping[str, Any]) -> bool:
195
+ return not bool(stream_state)
196
+
197
+ def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
198
+ return self._partition_serializer.to_partition_key(partition)
199
+
200
+ def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
201
+ return self._partition_serializer.to_partition(partition_key)
202
+
203
def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
    """
    Return the state of the cursor tracking the partition of ``stream_slice``.

    Args:
        stream_slice: The slice whose partition identifies the cursor to read. Required in
            practice, even though the signature allows ``None``.

    Returns:
        The state of the matching per-partition cursor, or ``None`` when no cursor is registered
        for that partition.

    Raises:
        ValueError: If ``stream_slice`` is missing or falsy.
    """
    if not stream_slice:
        raise ValueError("A partition needs to be provided in order to extract a state")

    return self._get_state_for_partition(stream_slice.partition)
211
+
212
+ def _create_cursor(self, cursor_state: Any) -> DeclarativeCursor:
213
+ cursor = self._cursor_factory.create()
214
+ cursor.set_initial_state(cursor_state)
215
+ return cursor
216
+
217
def get_request_params(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    """
    Merge the request params of the partition router with those of the partition's cursor.

    The router is queried with a slice holding only the partition, the cursor with a slice holding
    only the cursor slice. A cursor is created on the fly when the partition has none yet.

    Raises:
        ValueError: If ``stream_slice`` is missing or falsy.
    """
    if not stream_slice:
        raise ValueError("A partition needs to be provided in order to get request params")

    partition_key = self._to_partition_key(stream_slice.partition)
    if partition_key not in self._cursor_per_partition:
        self._create_cursor_for_partition(partition_key)

    router_params = self._partition_router.get_request_params(
        stream_state=stream_state,
        stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
        next_page_token=next_page_token,
    )
    cursor_params = self._cursor_per_partition[partition_key].get_request_params(
        stream_state=stream_state,
        stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
        next_page_token=next_page_token,
    )
    return router_params | cursor_params  # type: ignore # this always returns a mapping
240
+
241
def get_request_headers(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    """
    Merge the request headers of the partition router with those of the partition's cursor.

    The router is queried with a slice holding only the partition, the cursor with a slice holding
    only the cursor slice. A cursor is created on the fly when the partition has none yet.

    Raises:
        ValueError: If ``stream_slice`` is missing or falsy.
    """
    if not stream_slice:
        raise ValueError("A partition needs to be provided in order to get request headers")

    partition_key = self._to_partition_key(stream_slice.partition)
    if partition_key not in self._cursor_per_partition:
        self._create_cursor_for_partition(partition_key)

    router_headers = self._partition_router.get_request_headers(
        stream_state=stream_state,
        stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
        next_page_token=next_page_token,
    )
    cursor_headers = self._cursor_per_partition[partition_key].get_request_headers(
        stream_state=stream_state,
        stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
        next_page_token=next_page_token,
    )
    return router_headers | cursor_headers  # type: ignore # this always returns a mapping
264
+
265
def get_request_body_data(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Union[Mapping[str, Any], str]:
    """
    Merge the request body data of the partition router with that of the partition's cursor.

    The router is queried with a slice holding only the partition, the cursor with a slice holding
    only the cursor slice. A cursor is created on the fly when the partition has none yet.

    Raises:
        ValueError: If ``stream_slice`` is missing or falsy.
    """
    if not stream_slice:
        raise ValueError("A partition needs to be provided in order to get request body data")

    partition_key = self._to_partition_key(stream_slice.partition)
    if partition_key not in self._cursor_per_partition:
        self._create_cursor_for_partition(partition_key)

    router_body = self._partition_router.get_request_body_data(
        stream_state=stream_state,
        stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
        next_page_token=next_page_token,
    )
    cursor_body = self._cursor_per_partition[partition_key].get_request_body_data(
        stream_state=stream_state,
        stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
        next_page_token=next_page_token,
    )
    return router_body | cursor_body  # type: ignore # this always returns a mapping
288
+
289
def get_request_body_json(
    self,
    *,
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    """
    Merge the JSON request body of the partition router with that of the partition's cursor.

    The router is queried with a slice holding only the partition, the cursor with a slice holding
    only the cursor slice. A cursor is created on the fly when the partition has none yet.

    Raises:
        ValueError: If ``stream_slice`` is missing or falsy.
    """
    if not stream_slice:
        raise ValueError("A partition needs to be provided in order to get request body json")

    partition_key = self._to_partition_key(stream_slice.partition)
    if partition_key not in self._cursor_per_partition:
        self._create_cursor_for_partition(partition_key)

    router_json = self._partition_router.get_request_body_json(
        stream_state=stream_state,
        stream_slice=StreamSlice(partition=stream_slice.partition, cursor_slice={}),
        next_page_token=next_page_token,
    )
    cursor_json = self._cursor_per_partition[partition_key].get_request_body_json(
        stream_state=stream_state,
        stream_slice=StreamSlice(partition={}, cursor_slice=stream_slice.cursor_slice),
        next_page_token=next_page_token,
    )
    return router_json | cursor_json  # type: ignore # this always returns a mapping
312
+
313
def should_be_synced(self, record: Record) -> bool:
    """Ask the cursor of the record's partition whether the partition-stripped record should be synced."""
    partition_cursor = self._get_cursor(record)
    cursor_record = self._convert_record_to_cursor_record(record)
    return partition_cursor.should_be_synced(cursor_record)
317
+
318
def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
    """
    Compare two records of the same partition using that partition's cursor.

    Raises:
        ValueError: If either record has no associated slice, or if the records belong to
            different partitions.
    """
    if not (first.associated_slice and second.associated_slice):
        raise ValueError(
            f"Both records should have an associated slice but got {first.associated_slice} and {second.associated_slice}"
        )
    if first.associated_slice.partition != second.associated_slice.partition:
        raise ValueError(
            f"To compare records, partition should be the same but got {first.associated_slice.partition} and {second.associated_slice.partition}"
        )

    partition_cursor = self._get_cursor(first)
    converted_first = self._convert_record_to_cursor_record(first)
    converted_second = self._convert_record_to_cursor_record(second)
    return partition_cursor.is_greater_than_or_equal(converted_first, converted_second)
332
+
333
@staticmethod
def _convert_record_to_cursor_record(record: Record) -> Record:
    """
    Copy ``record`` with the partition removed from its associated slice.

    The copy keeps the data and stream name. Its associated slice has an empty partition and the
    original cursor slice, or is ``None`` when the record has no associated slice.
    """
    cursor_only_slice = None
    if record.associated_slice:
        cursor_only_slice = StreamSlice(
            partition={}, cursor_slice=record.associated_slice.cursor_slice
        )
    return Record(
        data=record.data,
        stream_name=record.stream_name,
        associated_slice=cursor_only_slice,
    )
344
+
345
+ def _get_cursor(self, record: Record) -> DeclarativeCursor:
346
+ if not record.associated_slice:
347
+ raise ValueError(
348
+ "Invalid state as stream slices that are emitted should refer to an existing cursor"
349
+ )
350
+ partition_key = self._to_partition_key(record.associated_slice.partition)
351
+ if partition_key not in self._cursor_per_partition:
352
+ self._create_cursor_for_partition(partition_key)
353
+ cursor = self._cursor_per_partition[partition_key]
354
+ return cursor
355
+
356
+ def _create_cursor_for_partition(self, partition_key: str) -> None:
357
+ """
358
+ Dynamically creates and initializes a cursor for the specified partition.
359
+
360
+ This method is required for `ConcurrentPerPartitionCursor`. For concurrent cursors,
361
+ stream_slices is executed only for the concurrent cursor, so cursors per partition
362
+ are not created for the declarative cursor. This method ensures that a cursor is available
363
+ to create requests for the specified partition. The cursor is initialized
364
+ with the per-partition state if present in the initial state, or with the global state
365
+ adjusted by the lookback window, or with the state to migrate from.
366
+
367
+ Note:
368
+ This is a temporary workaround and should be removed once the declarative cursor
369
+ is decoupled from the concurrent cursor implementation.
370
+
371
+ Args:
372
+ partition_key (str): The unique identifier for the partition for which the cursor
373
+ needs to be created.
374
+ """
375
+ partition_state = (
376
+ self._state_to_migrate_from if self._state_to_migrate_from else self._NO_CURSOR_STATE
377
+ )
378
+ cursor = self._create_cursor(partition_state)
379
+
380
+ self._cursor_per_partition[partition_key] = cursor
@@ -0,0 +1,200 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+ from typing import Any, Iterable, Mapping, MutableMapping, Optional, Union
5
+
6
+ from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
7
+ from airbyte_cdk.sources.declarative.incremental.declarative_cursor import DeclarativeCursor
8
+ from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
9
+ GlobalSubstreamCursor,
10
+ iterate_with_last_flag_and_state,
11
+ )
12
+ from airbyte_cdk.sources.declarative.incremental.per_partition_cursor import (
13
+ CursorFactory,
14
+ PerPartitionCursor,
15
+ )
16
+ from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
17
+ from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
18
+
19
+
20
class PerPartitionWithGlobalCursor(DeclarativeCursor):
    """
    Tracks state for partitioned streams, falling back to a single global cursor when needed.

    State is first kept per partition through a `PerPartitionCursor`. Once the number of partitions
    exceeds the defined limit, the cursor switches to a `GlobalSubstreamCursor` to manage state more efficiently.

    **Overview**

    - **Partition-Based State**: State starts out per partition so each partition's data is processed accurately.
    - **Global Fallback**: When the partition limit is exceeded, a global cursor takes over state management.

    **Switching Logic**

    - The number of partitions is monitored.
    - When `PerPartitionCursor.limit_reached()` returns `True`, `_use_global_cursor` is set to `True`, which activates the global cursor.

    **Active Cursor Selection**

    - The `_get_active_cursor()` helper picks the active cursor from the `_use_global_cursor` flag.
    - Routing through one helper keeps cursor usage consistent across methods.

    **State Structure Example**

    ```json
    {
        "states": [
            {
                "partition": {"partition_key": "partition_1"},
                "cursor": {"cursor_field": "2021-01-15"}
            },
            {
                "partition": {"partition_key": "partition_2"},
                "cursor": {"cursor_field": "2021-02-14"}
            }
        ],
        "state": {
            "cursor_field": "2021-02-15"
        },
        "use_global_cursor": false
    }
    ```

    In this example the cursor manages state per partition (`"use_global_cursor": false`), keeping a separate cursor state for each partition.

    **Usage Scenario**

    Suited to streams whose partition count can vary widely, where switching dynamically between per-partition and global state management keeps data consistent and synchronization efficient.
    """

    def __init__(
        self,
        cursor_factory: CursorFactory,
        partition_router: PartitionRouter,
        stream_cursor: DatetimeBasedCursor,
    ):
        """Build the per-partition and global cursors around the shared partition router."""
        self._partition_router = partition_router
        self._per_partition_cursor = PerPartitionCursor(cursor_factory, partition_router)
        self._global_cursor = GlobalSubstreamCursor(stream_cursor, partition_router)
        # Per-partition tracking is the starting mode; the flag flips once the limit is reached.
        self._use_global_cursor = False
        self._current_partition: Optional[Mapping[str, Any]] = None
        self._last_slice: bool = False
        self._parent_state: Optional[Mapping[str, Any]] = None

    def _get_active_cursor(self) -> Union[PerPartitionCursor, GlobalSubstreamCursor]:
        """Return the global cursor when the fallback is active, the per-partition cursor otherwise."""
        if self._use_global_cursor:
            return self._global_cursor
        return self._per_partition_cursor

    def stream_slices(self) -> Iterable[StreamSlice]:
        """
        Yield the slices of every partition using whichever cursor is active for that partition.

        Each slice is registered with the global cursor, flagged as last only for the last slice of
        the last partition. The router's state is recorded as parent state along the way.
        """
        self._global_cursor.start_slices_generation()

        # Iterate through partitions and process slices
        partitions = iterate_with_last_flag_and_state(
            self._partition_router.stream_slices(), self._partition_router.get_stream_state
        )
        for partition, is_last_partition, parent_state in partitions:
            # Generate slices for the current cursor and handle the last slice using the flag
            self._parent_state = parent_state
            partition_slices = iterate_with_last_flag_and_state(
                self._get_active_cursor().generate_slices_from_partition(partition=partition),
                lambda: None,
            )
            for stream_slice, is_last_slice, _ in partition_slices:
                self._global_cursor.register_slice(is_last_slice and is_last_partition)
                yield stream_slice
        self._parent_state = self._partition_router.get_stream_state()

    def set_initial_state(self, stream_state: StreamState) -> None:
        """
        Restore the cursors from ``stream_state``.

        The global cursor is always initialised; the per-partition cursor only when the state does
        not request the global cursor.
        """
        self._use_global_cursor = stream_state.get("use_global_cursor", False)
        self._parent_state = stream_state.get("parent_state", {})

        self._global_cursor.set_initial_state(stream_state)
        if self._use_global_cursor:
            return
        self._per_partition_cursor.set_initial_state(stream_state)

    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
        """
        Feed ``record`` to the cursors, switching to the global cursor once the limit is reached.

        The global cursor always observes the record; the per-partition cursor only while the
        fallback is inactive.
        """
        if not self._use_global_cursor:
            if self._per_partition_cursor.limit_reached():
                self._use_global_cursor = True
            else:
                self._per_partition_cursor.observe(stream_slice, record)
        self._global_cursor.observe(stream_slice, record)

    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
        """Close the slice on the per-partition cursor (while it is in use) and on the global cursor."""
        if not self._use_global_cursor:
            self._per_partition_cursor.close_slice(stream_slice, *args)
        self._global_cursor.close_slice(stream_slice, *args)

    def get_stream_state(self) -> StreamState:
        """
        Build the combined state: the mode flag, the global state and, while active, the per-partition state.

        ``"parent_state"`` always reflects the parent state tracked by this cursor and is left out
        when that state is empty.
        """
        final_state: MutableMapping[str, Any] = {"use_global_cursor": self._use_global_cursor}

        final_state.update(self._global_cursor.get_stream_state())
        if not self._use_global_cursor:
            final_state.update(self._per_partition_cursor.get_stream_state())

        if self._parent_state:
            final_state["parent_state"] = self._parent_state
        else:
            # Drop whatever parent state the nested cursors may have contributed.
            final_state.pop("parent_state", None)

        return final_state

    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
        """Delegate state selection for ``stream_slice`` to the active cursor."""
        active_cursor = self._get_active_cursor()
        return active_cursor.select_state(stream_slice)

    def get_request_params(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        """Delegate request params to the active cursor."""
        active_cursor = self._get_active_cursor()
        return active_cursor.get_request_params(
            stream_state=stream_state,
            stream_slice=stream_slice,
            next_page_token=next_page_token,
        )

    def get_request_headers(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        """Delegate request headers to the active cursor."""
        active_cursor = self._get_active_cursor()
        return active_cursor.get_request_headers(
            stream_state=stream_state,
            stream_slice=stream_slice,
            next_page_token=next_page_token,
        )

    def get_request_body_data(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Union[Mapping[str, Any], str]:
        """Delegate request body data to the active cursor."""
        active_cursor = self._get_active_cursor()
        return active_cursor.get_request_body_data(
            stream_state=stream_state,
            stream_slice=stream_slice,
            next_page_token=next_page_token,
        )

    def get_request_body_json(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        """Delegate the JSON request body to the active cursor."""
        active_cursor = self._get_active_cursor()
        return active_cursor.get_request_body_json(
            stream_state=stream_state,
            stream_slice=stream_slice,
            next_page_token=next_page_token,
        )

    def should_be_synced(self, record: Record) -> bool:
        """Ask the active cursor whether ``record`` should be synced."""
        active_cursor = self._get_active_cursor()
        return active_cursor.should_be_synced(record)

    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
        """Compare two records using the global cursor, regardless of which cursor is active."""
        return self._global_cursor.is_greater_than_or_equal(first, second)