airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368) hide show
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,497 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import copy
6
+ import logging
7
+ import threading
8
+ import time
9
+ from collections import OrderedDict
10
+ from copy import deepcopy
11
+ from datetime import timedelta
12
+ from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
13
+
14
+ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
15
+ from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
16
+ Timer,
17
+ iterate_with_last_flag_and_state,
18
+ )
19
+ from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
20
+ from airbyte_cdk.sources.message import MessageRepository
21
+ from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
22
+ PerPartitionKeySerializer,
23
+ )
24
+ from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, Cursor, CursorField
25
+ from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
26
+ from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
27
+ AbstractStreamStateConverter,
28
+ )
29
+ from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
30
+
31
+ logger = logging.getLogger("airbyte")
32
+
33
+
34
class ConcurrentCursorFactory:
    """Thin wrapper that builds ``ConcurrentCursor`` objects through an injected callable."""

    def __init__(self, create_function: Callable[..., ConcurrentCursor]):
        # The callable is always invoked with keyword arguments only (see ``create``).
        self._create_function = create_function

    def create(
        self, stream_state: Mapping[str, Any], runtime_lookback_window: Optional[timedelta]
    ) -> ConcurrentCursor:
        """
        Return whatever the wrapped callable produces for the given inputs.

        Both arguments are forwarded by keyword, as ``stream_state`` and
        ``runtime_lookback_window``.
        """
        factory_kwargs = {
            "stream_state": stream_state,
            "runtime_lookback_window": runtime_lookback_window,
        }
        return self._create_function(**factory_kwargs)
44
+
45
+
46
+ class ConcurrentPerPartitionCursor(Cursor):
47
+ """
48
+ Manages state per partition when a stream has many partitions, preventing data loss or duplication.
49
+
50
+ Attributes:
51
+ DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 25,000).
52
+
53
+ - **Partition Limitation Logic**
54
+ Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. Oldest partitions are removed when the limit is reached.
55
+
56
+ - **Global Cursor Fallback**
57
+ New partitions use global state as the initial state to progress the state for deleted or new partitions. The history data added after the initial sync will be missing.
58
+
59
+ ConcurrentPerPartitionCursor expects the state of the ConcurrentCursor to follow the format {cursor_field: cursor_value}.
60
+ """
61
+
62
+ DEFAULT_MAX_PARTITIONS_NUMBER = 25_000
63
+ SWITCH_TO_GLOBAL_LIMIT = 10_000
64
+ _NO_STATE: Mapping[str, Any] = {}
65
+ _NO_CURSOR_STATE: Mapping[str, Any] = {}
66
+ _GLOBAL_STATE_KEY = "state"
67
+ _PERPARTITION_STATE_KEY = "states"
68
+ _KEY = 0
69
+ _VALUE = 1
70
+
71
def __init__(
    self,
    cursor_factory: ConcurrentCursorFactory,
    partition_router: PartitionRouter,
    stream_name: str,
    stream_namespace: Optional[str],
    stream_state: Any,
    message_repository: MessageRepository,
    connector_state_manager: ConnectorStateManager,
    connector_state_converter: AbstractStreamStateConverter,
    cursor_field: CursorField,
) -> None:
    """
    Store the collaborators, create empty bookkeeping structures and load the incoming state.

    ``stream_state`` is passed unchanged to ``_set_initial_state`` as the very last step,
    once every attribute below has been initialised.
    """
    # Collaborators and stream identity handed in by the caller.
    self._cursor_factory = cursor_factory
    self._partition_router = partition_router
    self._stream_name = stream_name
    self._stream_namespace = stream_namespace
    self._message_repository = message_repository
    self._connector_state_manager = connector_state_manager
    self._connector_state_converter = connector_state_converter
    self._cursor_field = cursor_field

    # Global-cursor bookkeeping; everything starts out empty / disabled.
    self._global_cursor: Optional[StreamState] = {}
    self._new_global_cursor: Optional[StreamState] = None
    self._lookback_window: int = 0
    self._parent_state: Optional[StreamState] = None
    self._use_global_cursor: bool = False
    self._number_of_partitions: int = 0

    # Insertion-ordered maps so that, once the partition limit is hit, the oldest
    # entries can be dropped first while the most recent partitions are kept.
    self._cursor_per_partition: OrderedDict[str, ConcurrentCursor] = OrderedDict()
    self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()

    # Parent state recorded for each partition, in creation order.
    self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict()

    self._finished_partitions: set[str] = set()
    self._lock = threading.Lock()
    self._timer = Timer()
    self._partition_serializer = PerPartitionKeySerializer()

    # Timestamp of the most recently emitted state message.
    self._last_emission_time: float = 0.0

    self._set_initial_state(stream_state)
115
+
116
@property
def cursor_field(self) -> CursorField:
    """Expose the ``CursorField`` stored on this instance as ``_cursor_field``."""
    field = self._cursor_field
    return field
119
+
120
@property
def state(self) -> MutableMapping[str, Any]:
    """
    Build a fresh mapping describing the current cursor state.

    The mapping always contains ``use_global_cursor``. Per-partition entries are listed
    only while the global cursor is not in use, and only for partitions whose cursor
    reports a non-empty state. The global cursor, lookback window and parent state are
    added when they are set.
    """
    snapshot: dict[str, Any] = {"use_global_cursor": self._use_global_cursor}

    if not self._use_global_cursor:
        # Each cursor state is deep-copied so the result does not alias cursor internals.
        snapshot[self._PERPARTITION_STATE_KEY] = [
            {
                "partition": self._to_dict(partition_key),
                "cursor": copy.deepcopy(partition_cursor.state),
            }
            for partition_key, partition_cursor in self._cursor_per_partition.items()
            if partition_cursor.state
        ]

    if self._global_cursor:
        snapshot[self._GLOBAL_STATE_KEY] = self._global_cursor

    optional_entries = (
        ("lookback_window", self._lookback_window),
        ("parent_state", self._parent_state),
    )
    for entry_name, entry_value in optional_entries:
        if entry_value is not None:
            snapshot[entry_name] = entry_value

    return snapshot
142
+
143
def close_partition(self, partition: Partition) -> None:
    """
    Record that one slice of a partition has been fully processed.

    Under ``self._lock`` this acquires the partition's semaphore once, closes the
    slice on the partition's own cursor when per-partition cursors are in use,
    advances the candidate global cursor once the partition is completely done,
    then refreshes the parent state and emits a (throttled) state message.

    Raises:
        ValueError: if the partition does not expose a stream slice.
    """
    stream_slice: Optional[StreamSlice] = partition.to_slice()  # type: ignore[assignment]
    if stream_slice is None:
        raise ValueError("stream_slice cannot be None")

    partition_key = self._to_partition_key(stream_slice.partition)
    with self._lock:
        # One acquire per closed slice, mirroring the release done when the slice was generated.
        self._semaphore_per_partition[partition_key].acquire()
        if not self._use_global_cursor:
            partition_cursor = self._cursor_per_partition[partition_key]
            partition_cursor.close_partition(partition=partition)
            # Fully processed: all slices were generated and none is still outstanding.
            fully_processed = (
                partition_key in self._finished_partitions
                and self._semaphore_per_partition[partition_key]._value == 0
            )
            if fully_processed:
                self._update_global_cursor(
                    partition_cursor.state[self.cursor_field.cursor_field_key]
                )

        self._check_and_update_parent_state()

        self._emit_state_message()
166
+
167
def _check_and_update_parent_state(self) -> None:
    """
    Advance ``_parent_state`` past every leading partition that is fully finished.

    Pop the leftmost partition state from _partition_parent_state_map only if
    *all partitions* up to (and including) that partition key in _semaphore_per_partition
    are fully finished (i.e. in _finished_partitions and semaphore._value == 0).
    Additionally, delete finished semaphores with a value of 0 to free up memory,
    as they are only needed to track errors and completion status.

    The most recently popped parent state, if any, becomes the new ``_parent_state``;
    when nothing can be popped, ``_parent_state`` is left untouched.

    NOTE(review): this method does not take ``self._lock`` itself; presumably callers
    already hold it - confirm.
    """
    last_closed_state = None

    while self._partition_parent_state_map:
        # Look at the earliest partition key in creation order
        earliest_key = next(iter(self._partition_parent_state_map))

        # Verify ALL partitions from the left up to earliest_key are finished
        all_left_finished = True
        for p_key, sem in list(
            self._semaphore_per_partition.items()
        ):  # Use list to allow modification during iteration
            # If any earlier partition is still not finished, we must stop
            # (`_value` is the semaphore's internal counter: slices generated but not yet closed)
            if p_key not in self._finished_partitions or sem._value != 0:
                all_left_finished = False
                break
            # Once we've reached earliest_key in the semaphore order, we can stop checking
            if p_key == earliest_key:
                break

        # If the partitions up to earliest_key are not all finished, break the while-loop
        if not all_left_finished:
            break

        # Pop the leftmost entry from parent-state map
        _, closed_parent_state = self._partition_parent_state_map.popitem(last=False)
        last_closed_state = closed_parent_state

        # Clean up finished semaphores with value 0 up to and including earliest_key
        for p_key in list(self._semaphore_per_partition.keys()):
            sem = self._semaphore_per_partition[p_key]
            if p_key in self._finished_partitions and sem._value == 0:
                del self._semaphore_per_partition[p_key]
                logger.debug(f"Deleted finished semaphore for partition {p_key} with value 0")
            if p_key == earliest_key:
                break

    # Update _parent_state if we popped at least one partition
    if last_closed_state is not None:
        self._parent_state = last_closed_state
214
+
215
def ensure_at_least_one_state_emitted(self) -> None:
    """
    Emit a final, unthrottled state message.

    The platform expects at least one state message on successful syncs, so this is
    expected to be called whatever happens. When no tracked semaphore has a non-zero
    counter, the candidate global cursor is promoted, the lookback window is taken
    from the timer and the parent state is read from the partition router before the
    message is emitted. Otherwise the state is emitted as it currently stands.
    """
    has_outstanding_slices = any(
        semaphore._value for semaphore in self._semaphore_per_partition.values()
    )
    if not has_outstanding_slices:
        self._global_cursor = self._new_global_cursor
        self._lookback_window = self._timer.finish()
        self._parent_state = self._partition_router.get_stream_state()
    self._emit_state_message(throttle=False)
227
+
228
def _throttle_state_message(self) -> Optional[float]:
    """
    Decide whether a state message may be emitted now.

    Returns the current ``time.time()`` value when more than 60 seconds have passed
    since ``_last_emission_time``; otherwise returns ``None`` so the caller skips
    the emission.
    """
    now = time.time()
    seconds_since_last_emission = now - self._last_emission_time
    return now if seconds_since_last_emission > 60 else None
236
+
237
def _emit_state_message(self, throttle: bool = True) -> None:
    """
    Push the current state to the state manager and emit it as a state message.

    Args:
        throttle: when True, the emission is skipped unless `_throttle_state_message`
            returns a timestamp; that timestamp is then stored as the last emission time.
    """
    if throttle:
        emission_time = self._throttle_state_message()
        if emission_time is None:
            return
        self._last_emission_time = emission_time

    stream_name = self._stream_name
    stream_namespace = self._stream_namespace
    state_manager = self._connector_state_manager
    state_manager.update_state_for_stream(stream_name, stream_namespace, self.state)
    state_message = state_manager.create_state_message(stream_name, stream_namespace)
    self._message_repository.emit_message(state_message)
252
+
253
def stream_slices(self) -> Iterable[StreamSlice]:
    """
    Yield every stream slice for every partition produced by the partition router.

    Starts the timer before iterating. Each partition is paired with the parent state
    obtained from the router's ``get_stream_state`` and expanded by
    `_generate_slices_from_partition`.

    Raises:
        RuntimeError: if the timer is already running, i.e. this method was called before.
    """
    if self._timer.is_running():
        raise RuntimeError("stream_slices has been executed more than once.")

    partitions = self._partition_router.stream_slices()
    self._timer.start()
    # The "is last" flag produced by the helper is not needed here.
    for partition, _, parent_state in iterate_with_last_flag_and_state(
        partitions, self._partition_router.get_stream_state
    ):
        yield from self._generate_slices_from_partition(partition, parent_state)
263
+
264
def _generate_slices_from_partition(
    self, partition: StreamSlice, parent_state: Mapping[str, Any]
) -> Iterable[StreamSlice]:
    """
    Yield one StreamSlice per cursor slice of ``partition``, registering its bookkeeping first.

    Args:
        partition: the partition to expand into cursor slices.
        parent_state: the parent state passed in alongside this partition; a deep copy is
            recorded under the partition key when it differs from the last recorded one.

    Yields:
        StreamSlice objects combining ``partition``, each cursor slice and the
        partition's ``extra_fields``.
    """
    # Ensure the maximum number of partitions is not exceeded
    self._ensure_partition_limit()

    partition_key = self._to_partition_key(partition.partition)

    # Reuse a cursor already registered for this partition, otherwise create one seeded
    # from the global cursor (the lookback window only applies when a global cursor is set).
    cursor = self._cursor_per_partition.get(self._to_partition_key(partition.partition))
    if not cursor:
        cursor = self._create_cursor(
            self._global_cursor,
            self._lookback_window if self._global_cursor else 0,
        )
        with self._lock:
            self._number_of_partitions += 1
            self._cursor_per_partition[partition_key] = cursor
    # Fresh counter starting at 0: released once per generated slice below.
    self._semaphore_per_partition[partition_key] = threading.Semaphore(0)

    with self._lock:
        # Record the parent state only when it differs from the most recently recorded one.
        if (
            len(self._partition_parent_state_map) == 0
            or self._partition_parent_state_map[
                next(reversed(self._partition_parent_state_map))
            ]
            != parent_state
        ):
            self._partition_parent_state_map[partition_key] = deepcopy(parent_state)

    for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
        cursor.stream_slices(),
        lambda: None,
    ):
        self._semaphore_per_partition[partition_key].release()
        # Mark the partition as finished before its last slice is handed out.
        if is_last_slice:
            self._finished_partitions.add(partition_key)
        yield StreamSlice(
            partition=partition, cursor_slice=cursor_slice, extra_fields=partition.extra_fields
        )
303
+
304
def _ensure_partition_limit(self) -> None:
    """
    Ensure the maximum number of partitions does not exceed the predefined limit.

    Steps:
    1. If the global cursor is not yet in use and `limit_reached()` is true, switch to
       the global cursor (logged at info level).
    2. While the number of stored cursors exceeds `DEFAULT_MAX_PARTITIONS_NUMBER - 1`,
       attempt to remove a partition that is marked as finished in `_finished_partitions`
       and whose semaphore is either gone or at 0. These partitions are considered
       processed and safe to delete.
    3. If no finished partition is available for removal, remove the oldest partition
       unconditionally. We expect failed partitions to be removed.

    Logging:
    - Logs a warning each time a partition is removed, indicating whether it was finished
      or removed due to being the oldest.
    - NOTE(review): the value interpolated into the warning is the object popped from
      `_cursor_per_partition` (the cursor), not the partition key - confirm this is intended.
    """
    if not self._use_global_cursor and self.limit_reached():
        logger.info(
            f"Exceeded the 'SWITCH_TO_GLOBAL_LIMIT' of {self.SWITCH_TO_GLOBAL_LIMIT}. "
            f"Switching to global cursor for {self._stream_name}."
        )
        self._use_global_cursor = True

    with self._lock:
        while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
            # Try removing finished partitions first
            for partition_key in list(self._cursor_per_partition.keys()):
                if partition_key in self._finished_partitions and (
                    partition_key not in self._semaphore_per_partition
                    or self._semaphore_per_partition[partition_key]._value == 0
                ):
                    oldest_partition = self._cursor_per_partition.pop(
                        partition_key
                    )  # Remove the oldest partition
                    logger.warning(
                        f"The maximum number of partitions has been reached. Dropping the oldest finished partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
                    )
                    break
            else:
                # for/else: runs only when the loop above did not break.
                # If no finished partitions can be removed, fall back to removing the oldest partition
                oldest_partition = self._cursor_per_partition.popitem(last=False)[
                    1
                ]  # Remove the oldest partition
                logger.warning(
                    f"The maximum number of partitions has been reached. Dropping the oldest partition: {oldest_partition}. Over limit: {self._number_of_partitions - self.DEFAULT_MAX_PARTITIONS_NUMBER}."
                )
348
+
349
def _set_initial_state(self, stream_state: StreamState) -> None:
    """
    Seed the cursor from a previously saved `stream_state`.

    Two layouts are accepted:

    - **Global format**: when neither the per-partition key nor the global key is present,
      the whole mapping is treated as a global state and applied to all partitions.
    - **Structured format**: otherwise ``use_global_cursor`` (default False) and
      ``lookback_window`` (seconds, default 0) are read, one cursor is created per entry
      of the per-partition list, and the global state is applied when present.

    In both layouts a truthy ``parent_state`` is stored, and the full `stream_state` is
    handed to the partition router so it can initialise its parent streams.
    An empty `stream_state` leaves everything untouched.

    Args:
        stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
            {
                "states": [
                    {
                        "partition": {
                            "partition_key": "value"
                        },
                        "cursor": {
                            "last_updated": "2023-05-27T00:00:00Z"
                        }
                    }
                ],
                "state": {
                    "last_updated": "2023-05-27T00:00:00Z"
                },
                "lookback_window": 10,
                "parent_state": {
                    "parent_stream_name": {
                        "last_updated": "2023-05-27T00:00:00Z"
                    }
                }
            }
    """
    if not stream_state:
        return

    has_per_partition_state = self._PERPARTITION_STATE_KEY in stream_state
    has_global_state = self._GLOBAL_STATE_KEY in stream_state

    if has_per_partition_state or has_global_state:
        self._use_global_cursor = stream_state.get("use_global_cursor", False)
        self._lookback_window = int(stream_state.get("lookback_window", 0))

        for partition_state in stream_state.get(self._PERPARTITION_STATE_KEY, []):
            self._number_of_partitions += 1
            restored_cursor = self._create_cursor(partition_state["cursor"])
            self._cursor_per_partition[
                self._to_partition_key(partition_state["partition"])
            ] = restored_cursor

        # Default state for partitions missing from the per-partition list.
        if has_global_state:
            self._set_global_state(stream_state[self._GLOBAL_STATE_KEY])
    else:
        # Assume a global format that can be applied to all partitions.
        # Example: {"global_state_format_key": "global_state_format_value"}
        self._set_global_state(stream_state)

    # Set initial parent state
    initial_parent_state = stream_state.get("parent_state")
    if initial_parent_state:
        self._parent_state = initial_parent_state

    # Set parent state for partition routers based on parent streams
    self._partition_router.set_initial_state(stream_state)
421
+
422
def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
    """
    Initialise the global cursor from `stream_state`.

    When the cursor field key is present, its value is parsed and re-formatted through
    the state converter, and independent copies of the resulting single-key mapping are
    stored as both the current and the candidate global cursor. When the key is absent
    nothing changes.
    """
    cursor_key = self.cursor_field.cursor_field_key
    if cursor_key not in stream_state:
        return

    converter = self._connector_state_converter
    normalized_value = converter.output_format(converter.parse_value(stream_state[cursor_key]))
    normalized_state = {cursor_key: normalized_value}

    self._global_cursor = deepcopy(normalized_state)
    self._new_global_cursor = deepcopy(normalized_state)
442
+
443
def observe(self, record: Record) -> None:
    """
    Take a record's cursor value into account.

    The value is extracted, parsed and re-formatted through the state converter, then
    offered to the candidate global cursor. When per-partition cursors are in use the
    record is also forwarded to the cursor of its partition.

    Raises:
        ValueError: if the record has no associated slice.
    """
    associated_slice = record.associated_slice
    if not associated_slice:
        raise ValueError(
            "Invalid state as stream slices that are emitted should refer to an existing cursor"
        )

    converter = self._connector_state_converter
    raw_cursor_value = self._cursor_field.extract_value(record)
    record_cursor = converter.output_format(converter.parse_value(raw_cursor_value))
    self._update_global_cursor(record_cursor)

    if self._use_global_cursor:
        return
    partition_key = self._to_partition_key(associated_slice.partition)
    self._cursor_per_partition[partition_key].observe(record)
457
+
458
def _update_global_cursor(self, value: Any) -> None:
    """
    Raise the candidate global cursor to `value` when `value` is strictly greater.

    If no candidate exists yet, `value` is adopted unconditionally. The stored value
    is a deep copy of `value`.
    """
    cursor_key = self.cursor_field.cursor_field_key
    candidate = self._new_global_cursor
    if candidate is not None and not (candidate[cursor_key] < value):
        return
    self._new_global_cursor = {cursor_key: copy.deepcopy(value)}
464
+
465
def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
    """Serialize a partition mapping into its string key using the partition serializer."""
    serializer = self._partition_serializer
    return serializer.to_partition_key(partition)
467
+
468
def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
    """Turn a serialized partition key back into its partition mapping using the partition serializer."""
    serializer = self._partition_serializer
    return serializer.to_partition(partition_key)
470
+
471
def _create_cursor(
    self, cursor_state: Any, runtime_lookback_window: int = 0
) -> ConcurrentCursor:
    """
    Create a cursor through the cursor factory.

    Args:
        cursor_state: initial state for the new cursor; a deep copy is passed to the factory.
        runtime_lookback_window: lookback window in seconds, passed on as a ``timedelta``.
    """
    lookback = timedelta(seconds=runtime_lookback_window)
    return self._cursor_factory.create(
        stream_state=deepcopy(cursor_state),
        runtime_lookback_window=lookback,
    )
479
+
480
def should_be_synced(self, record: Record) -> bool:
    """Delegate the sync decision for `record` to the cursor of the record's partition."""
    partition_cursor = self._get_cursor(record)
    return partition_cursor.should_be_synced(record)
482
+
483
def _get_cursor(self, record: Record) -> ConcurrentCursor:
    """
    Look up the cursor registered for the partition of `record`.

    Raises:
        ValueError: if the record has no associated slice, or no cursor is registered
            for its partition.
    """
    missing_cursor_message = (
        "Invalid state as stream slices that are emitted should refer to an existing cursor"
    )

    associated_slice = record.associated_slice
    if not associated_slice:
        raise ValueError(missing_cursor_message)

    partition_key = self._to_partition_key(associated_slice.partition)
    if partition_key in self._cursor_per_partition:
        return self._cursor_per_partition[partition_key]
    raise ValueError(missing_cursor_message)
495
+
496
def limit_reached(self) -> bool:
    """Return True once the partition count strictly exceeds ``SWITCH_TO_GLOBAL_LIMIT``."""
    partition_count = self._number_of_partitions
    return partition_count > self.SWITCH_TO_GLOBAL_LIMIT