airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368)
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,26 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+
4
+ from .checkpoint_reader import (
5
+ CheckpointMode,
6
+ CheckpointReader,
7
+ CursorBasedCheckpointReader,
8
+ FullRefreshCheckpointReader,
9
+ IncrementalCheckpointReader,
10
+ LegacyCursorBasedCheckpointReader,
11
+ ResumableFullRefreshCheckpointReader,
12
+ )
13
+ from .cursor import Cursor
14
+ from .resumable_full_refresh_cursor import ResumableFullRefreshCursor
15
+
16
+ __all__ = [
17
+ "CheckpointMode",
18
+ "CheckpointReader",
19
+ "Cursor",
20
+ "CursorBasedCheckpointReader",
21
+ "FullRefreshCheckpointReader",
22
+ "IncrementalCheckpointReader",
23
+ "LegacyCursorBasedCheckpointReader",
24
+ "ResumableFullRefreshCheckpointReader",
25
+ "ResumableFullRefreshCursor",
26
+ ]
@@ -0,0 +1,335 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ from abc import ABC, abstractmethod
4
+ from enum import Enum
5
+ from typing import Any, Iterable, Mapping, Optional
6
+
7
+ from airbyte_cdk.sources.types import StreamSlice
8
+
9
+ from .cursor import Cursor
10
+
11
+
12
class CheckpointMode(Enum):
    """Checkpoint modes: incremental, resumable full refresh, and full refresh."""

    INCREMENTAL = "incremental"
    RESUMABLE_FULL_REFRESH = "resumable_full_refresh"
    FULL_REFRESH = "full_refresh"
16
+
17
+
18
# Sentinel state mapping: a single flag that marks a full refresh sync as complete.
FULL_REFRESH_COMPLETE_STATE: Mapping[str, Any] = {
    "__ab_full_refresh_sync_complete": True,
}
19
+
20
+
21
class CheckpointReader(ABC):
    """
    Abstract interface for iterating over a stream's partitions and for interpreting the stream's
    current state so that it can be emitted back to the platform.
    """

    @abstractmethod
    def next(self) -> Optional[Mapping[str, Any]]:
        """
        Return the slice that will be used to fetch the next group of records.

        :return: the next slice, or None once the reader has finished iterating over all slices.
        """

    @abstractmethod
    def observe(self, new_state: Mapping[str, Any]) -> None:
        """
        Update the reader's internal state from the incoming stream state of a connector.

        WARNING: This exists to retain backwards compatibility with streams that use the legacy
        get_stream_state() method. To uptake Resumable Full Refresh, connectors must migrate
        streams to the state setter/getter methods.

        :param new_state: the stream state reported by the connector.
        """

    @abstractmethod
    def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
        """
        Return the current state value of the stream.

        :return: the state to checkpoint; when it is None the connector does not emit a state message.
        """
48
+
49
+
50
class IncrementalCheckpointReader(CheckpointReader):
    """
    Checkpoint reader that walks a stream through partitioned windows of data which are
    determined before syncing data.
    """

    def __init__(
        self,
        stream_state: Mapping[str, Any],
        stream_slices: Iterable[Optional[Mapping[str, Any]]],
    ):
        """
        :param stream_state: the initial state reported by get_checkpoint() until it is replaced or cleared.
        :param stream_slices: the slices to hand out, one per call to next().
        """
        self._state: Optional[Mapping[str, Any]] = stream_state
        self._stream_slices = iter(stream_slices)
        # Becomes True once at least one slice has been handed out by next().
        self._has_slices = False

    def next(self) -> Optional[Mapping[str, Any]]:
        """
        Return the next slice, or None when the slice iterator is exhausted.
        """
        try:
            upcoming_slice = next(self._stream_slices)
        except StopIteration:
            # The stream already emitted state at the end of each slice, so the state is cleared here to
            # avoid sending a duplicate state message at the end of the sync. The alternative to this
            # extra complexity would be accepting that every sync emits a final duplicate state.
            if self._has_slices:
                self._state = None
            return None
        else:
            self._has_slices = True
            return upcoming_slice

    def observe(self, new_state: Mapping[str, Any]) -> None:
        """
        Replace the tracked state with new_state.
        """
        self._state = new_state

    def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
        """
        Return the currently tracked state; None after a stream that produced slices is exhausted.
        """
        return self._state
81
+
82
+
83
class CursorBasedCheckpointReader(CheckpointReader):
    """
    CursorBasedCheckpointReader is used by streams that implement a Cursor in order to manage state. This allows the
    checkpoint reader to delegate the complexity of fetching state to the cursor and focus on the iteration over a
    stream's partitions.

    This reader supports the Cursor interface used by Python and low-code sources. Not to be confused with the Cursor
    interface that belongs to the Concurrent CDK.
    """

    def __init__(
        self,
        cursor: Cursor,
        stream_slices: Iterable[Optional[Mapping[str, Any]]],
        read_state_from_cursor: bool = False,
    ):
        """
        :param cursor: cursor queried for state through select_state() and get_stream_state()
        :param stream_slices: slices to iterate over; wrapped in an iterator and consumed one slice at a time
        :param read_state_from_cursor: when True, partitions determine when to stop syncing dynamically according
            to the value of the state at runtime. This currently only applies to streams that use resumable
            full refresh.
        """
        self._cursor = cursor
        self._stream_slices = iter(stream_slices)
        self._read_state_from_cursor = read_state_from_cursor
        self._current_slice: Optional[StreamSlice] = None
        self._finished_sync = False
        self._previous_state: Optional[Mapping[str, Any]] = None

    def next(self) -> Optional[Mapping[str, Any]]:
        """
        Advance to the next slice to sync and return it.

        :return: the next slice, or None once there are no slices left (the reader is then marked as finished)
        """
        try:
            self.current_slice = self._find_next_slice()
            return self.current_slice
        except StopIteration:
            self._finished_sync = True
            return None

    def observe(self, new_state: Mapping[str, Any]) -> None:
        """
        No-op: cursor based checkpoint readers don't need to observe the new state because it has already been
        updated by the cursor while processing records.
        """
        pass

    def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
        """
        Return the cursor's current stream state, but only when it differs from the state returned last time.

        :return: the new state, or None when it equals the previously returned state (avoids emitting duplicate
            state messages)
        """
        new_state = self._cursor.get_stream_state()
        if new_state != self._previous_state:
            self._previous_state = new_state
            return new_state
        else:
            return None

    def _find_next_slice(self) -> StreamSlice:
        """
        _find_next_slice() returns the next slice of data that should be synced for the current stream according to
        its cursor. This function supports iterating over a stream's slices across two dimensions. The first dimension
        is the stream's partitions like parent records for a substream. The inner dimension iterates over the cursor
        value like a date range for incremental streams or a pagination checkpoint for resumable full refresh.

        The basic algorithm for iterating through a stream's slices is:
        1. The first time next() is invoked we get the first partition
        2. If the current partition is already complete as a result of a previous sync attempt, continue iterating
           until we find an un-synced partition.
        3. For streams whose cursor value is determined dynamically using stream state
            1. Get the state for the current partition
            2. If the current partition's state is complete, continue iterating over partitions
            3. If the current partition's state is still in progress, emit the next cursor value
            4. If the current partition is complete as delineated by the sentinel value, get the next incomplete
               partition
        4. When the stream has processed all partitions, the iterator will raise a StopIteration exception signaling
           there are no more slices left for extracting more records.

        :raises StopIteration: when the underlying slice iterator is exhausted
        """
        if not self._read_state_from_cursor:
            # Unlike RFR cursors that iterate dynamically according to how stream state is updated, most cursors
            # operate on a fixed set of slices determined before reading records. They just iterate to the next slice
            return self.read_and_convert_slice()

        candidate_slice = self.current_slice
        if candidate_slice is None:
            # No current slice means this is the first time we are iterating over the stream's slices: the first
            # slice to sync has not been assigned yet and must first be read from the iterator
            candidate_slice = self.read_and_convert_slice()
        state_for_slice = self._cursor.select_state(candidate_slice)

        # Skip every slice that already has the terminal complete value indicating that a previous attempt
        # successfully synced it. When the current slice itself just completed, this moves on to the next one.
        while state_for_slice == FULL_REFRESH_COMPLETE_STATE:
            candidate_slice = self.read_and_convert_slice()
            state_for_slice = self._cursor.select_state(candidate_slice)

        # Either a fresh partition, or the current partition whose state is still in progress
        return StreamSlice(
            cursor_slice=state_for_slice or {},
            partition=candidate_slice.partition,
            extra_fields=candidate_slice.extra_fields,
        )

    @property
    def current_slice(self) -> Optional[StreamSlice]:
        """The slice most recently selected by next(), or None before the first call."""
        return self._current_slice

    @current_slice.setter
    def current_slice(self, value: StreamSlice) -> None:
        self._current_slice = value

    def read_and_convert_slice(self) -> StreamSlice:
        """
        Read the next item from the slice iterator and check that it is a StreamSlice.

        :raises StopIteration: when the slice iterator is exhausted
        :raises ValueError: when the item read is not a StreamSlice
        """
        next_slice = next(self._stream_slices)
        if not isinstance(next_slice, StreamSlice):
            # Report the value that was just read; the current slice is the previous (valid) one, or None
            raise ValueError(
                f"{next_slice} should be of type StreamSlice. This is likely a bug in the CDK, please contact Airbyte support"
            )
        return next_slice
211
+
212
+
213
class LegacyCursorBasedCheckpointReader(CursorBasedCheckpointReader):
    """
    This (unfortunate) class operates like an adapter to retain backwards compatibility with legacy sources that take in
    stream_slice in the form of a Mapping instead of the StreamSlice object. Internally, the reader still operates over
    StreamSlices, but it is instantiated with and emits stream slices in the form of a Mapping[str, Any]. The logic of
    how partitions and cursors are iterated over is the same as in CursorBasedCheckpointReader.

    We also retain the existing top level fields defined by the connector so the fields are present on dependent
    methods. For example, the resulting mapping structure passed back to the stream's read_records() method looks like:
    {
      "cursor_slice": {
        "next_page_token": 10
      },
      "partition": {
        "repository": "airbytehq/airbyte"
      },
      "next_page_token": 10,
      "repository": "airbytehq/airbyte"
    }
    """

    def __init__(
        self,
        cursor: Cursor,
        stream_slices: Iterable[Optional[Mapping[str, Any]]],
        read_state_from_cursor: bool = False,
    ):
        """Forward all arguments unchanged to CursorBasedCheckpointReader."""
        super().__init__(
            cursor=cursor,
            stream_slices=stream_slices,
            read_state_from_cursor=read_state_from_cursor,
        )

    def next(self) -> Optional[Mapping[str, Any]]:
        """
        Advance to the next slice and return it as a plain mapping.

        :return: a dict holding "partition" and "cursor_slice" plus every field of the slice at the top level, or
            None once there are no slices left (the reader is then marked as finished)
        :raises ValueError: when the slice itself contains a "partition" or "cursor_slice" key, which would clash
            with the keys added here
        """
        try:
            self.current_slice = self._find_next_slice()
        except StopIteration:
            self._finished_sync = True
            return None

        # Flatten once and reuse for both the reserved-key checks and the returned mapping
        slice_fields = dict(self.current_slice)
        if "partition" in slice_fields:
            raise ValueError("Stream is configured to use invalid stream slice key 'partition'")
        elif "cursor_slice" in slice_fields:
            raise ValueError(
                "Stream is configured to use invalid stream slice key 'cursor_slice'"
            )

        # We convert StreamSlice to a regular mapping because legacy connectors operate on the basic Mapping object. We
        # also duplicate all fields at the top level for backwards compatibility for existing Python sources
        return {
            "partition": self.current_slice.partition,
            "cursor_slice": self.current_slice.cursor_slice,
            **slice_fields,
        }

    def read_and_convert_slice(self) -> StreamSlice:
        """
        Read the next mapping from the slice iterator and wrap it in a StreamSlice.

        :return: a StreamSlice whose partition is the mapping that was read and whose cursor_slice is empty
        :raises StopIteration: when the slice iterator is exhausted
        :raises ValueError: when the item read is not a Mapping
        """
        next_mapping_slice = next(self._stream_slices)
        if not isinstance(next_mapping_slice, Mapping):
            # Report the value that was just read; the current slice is the previous (valid) one, or None
            raise ValueError(
                f"{next_mapping_slice} should be of type Mapping. This is likely a bug in the CDK, please contact Airbyte support"
            )

        # The legacy reader is instantiated with an iterable of stream slice mappings. We convert each into a StreamSlice
        # to sanely process them during the sync and to reuse the existing Python defined cursors
        return StreamSlice(
            partition=next_mapping_slice,
            cursor_slice={},
        )
281
+
282
+
283
class ResumableFullRefreshCheckpointReader(CheckpointReader):
    """
    Checkpoint reader for resumable full refresh streams, which iterate over an unbounded set of records based on the
    stream's pagination strategy. Because the number of pages is unknown, the stream's current state decides whether
    more pages are fetched or the sync stops.
    """

    def __init__(self, stream_state: Mapping[str, Any]):
        """
        :param stream_state: incoming state of the stream; an empty {} marks the first attempt
        """
        self._state: Mapping[str, Any] = stream_state
        # The first attempt of an RFR stream has an empty {} incoming state, but should still make a first attempt
        # to read records from the first page in next().
        self._first_page = bool(stream_state == {})

    def next(self) -> Optional[Mapping[str, Any]]:
        """
        Return the state to read the next page with, or None once the state equals the completion sentinel.
        """
        if self._first_page:
            # Only the very first call is treated as the first page
            self._first_page = False
            return self._state
        if self._state == FULL_REFRESH_COMPLETE_STATE:
            return None
        return self._state

    def observe(self, new_state: Mapping[str, Any]) -> None:
        """Replace the tracked state with the state just reported by the stream."""
        self._state = new_state

    def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
        """Return the tracked state, substituting an empty dict when the state is falsy."""
        return self._state or {}
310
+
311
+
312
class FullRefreshCheckpointReader(CheckpointReader):
    """
    Checkpoint reader for data that cannot be checkpointed incrementally during the sync because the stream is not
    capable of managing state. At the end of a sync, a final state message is emitted to signal completion.
    """

    def __init__(self, stream_slices: Iterable[Optional[Mapping[str, Any]]]):
        """
        :param stream_slices: slices to iterate over; wrapped in an iterator and consumed one slice at a time
        """
        self._final_checkpoint = False
        self._stream_slices = iter(stream_slices)

    def next(self) -> Optional[Mapping[str, Any]]:
        """
        Return the next slice, or None once the slices are exhausted (which enables the final checkpoint).
        """
        try:
            upcoming_slice = next(self._stream_slices)
        except StopIteration:
            self._final_checkpoint = True
            return None
        return upcoming_slice

    def observe(self, new_state: Mapping[str, Any]) -> None:
        """No-op: this reader does not track the state reported by the stream."""
        pass

    def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
        """
        Return the no-cursor marker state once all slices have been read, and None before that.
        """
        return {"__ab_no_cursor_state_message": True} if self._final_checkpoint else None
@@ -0,0 +1,77 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import Any, Optional
7
+
8
+ from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
9
+
10
+
11
class Cursor(ABC):
    """
    Cursors are components that allow for checkpointing the current state of a sync. They keep track of what data has
    been consumed and allow syncs to be resumed from a specific point based on that information.
    """

    @abstractmethod
    def set_initial_state(self, stream_state: StreamState) -> None:
        """
        Provide the cursor with its starting state.

        Cursors are not initialized with their state. As state is needed in order to function properly, this method
        should be called before calling anything else.

        :param stream_state: The state of the stream as returned by get_stream_state
        """

    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
        """
        Register a record with the cursor; the cursor instance can then use it to manage the state of the in-progress
        stream read. The default implementation does nothing.

        :param stream_slice: The current slice, which may or may not contain the most recently observed record
        :param record: the most recently-read record, which the cursor can use to update the stream state.
            Outwardly-visible changes to the stream state may need to be deferred depending on whether the source
            reliably orders records by the cursor field.
        """

    @abstractmethod
    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
        """
        Update state based on the stream slice.

        Note that `stream_slice.cursor_slice` and `most_recent_record.associated_slice` are expected to be the same,
        but we make it explicit here that `stream_slice` should be leveraged to update the state. We do not pass in
        the latest record, since cursor instances should maintain the relevant internal state on their own.

        :param stream_slice: slice to close
        """

    @abstractmethod
    def get_stream_state(self) -> StreamState:
        """
        Returns the current stream state.

        We would like to restrict its usage since it exposes internals of the state. As of 2023-06-14, it is used
        for:
        * Interpolation of the requests
        * Transformation of records
        * Saving the state

        Reading the state for interpolation probably has to stay. For saving the state, we can probably expose a
        method that allows for emitting the state to the platform instead.
        """

    @abstractmethod
    def should_be_synced(self, record: Record) -> bool:
        """
        Evaluating if a record should be synced allows for filtering and a stop condition on pagination.
        """

    @abstractmethod
    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
        """
        Evaluating which record is greater in terms of cursor. This is used to avoid having to capture all the
        records to close a slice.
        """

    @abstractmethod
    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
        """
        Get the state value of a specific stream_slice.

        For incremental or resumable full refresh cursors which only manage state in a single dimension this is the
        entire state object. For per-partition cursors used by substreams, this returns the state of a specific
        parent delineated by the incoming slice's partition object.
        """
@@ -0,0 +1,22 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ import json
4
+ from typing import Any, Mapping
5
+
6
+
7
class PerPartitionKeySerializer:
    """
    Converts partitions to and from string keys.

    We are concerned about the performance of looping through the `states` list and evaluating equality on the
    partition. To reduce this concern, we want to use dictionaries mapping `partition -> cursor`. However, partitions
    are dicts, and dicts can't be used as dict keys since they are not hashable. Serializing the partition to a JSON
    string gives a hashable stand-in that can be used as the key.
    """

    @staticmethod
    def to_partition_key(to_serialize: Any) -> str:
        """Serialize a partition to compact JSON with sorted keys."""
        # separators have changed in Python 3.4. To avoid being impacted by further change, we explicitly specify our own value
        compact_separators = (",", ":")
        return json.dumps(
            to_serialize,
            sort_keys=True,
            separators=compact_separators,
            indent=None,
        )

    @staticmethod
    def to_partition(to_deserialize: Any) -> Mapping[str, Any]:
        """Parse a JSON partition key back into the partition it was built from."""
        partition = json.loads(to_deserialize)
        return partition  # type: ignore # The partition is known to be a dict, but the type hint is Any
@@ -0,0 +1,51 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Optional
5
+
6
+ from airbyte_cdk.sources.streams.checkpoint import Cursor
7
+ from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
8
+
9
+
10
@dataclass
class ResumableFullRefreshCursor(Cursor):
    """
    Cursor that checkpoints sync progress with a synthetic cursor based on the pagination state of the stream.
    Resumable full refresh syncs are only intended to retain state in between sync attempts of the same job, with the
    platform responsible for removing said state.
    """

    def __init__(self) -> None:
        # Starts empty; replaced by set_initial_state() and close_slice()
        self._cursor: StreamState = {}

    def get_stream_state(self) -> StreamState:
        """Return the state currently held by the cursor."""
        return self._cursor

    def set_initial_state(self, stream_state: StreamState) -> None:
        """Replace the held state with the incoming stream state."""
        self._cursor = stream_state

    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
        """
        No-op: resumable full refresh manages state using a page number, so it does not need to update state by
        observing incoming records.
        """
        pass

    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
        """Adopt the closed slice's cursor_slice as the new state."""
        self._cursor = stream_slice.cursor_slice

    def should_be_synced(self, record: Record) -> bool:
        """
        Always True: unlike date-based cursors which filter out records outside slice boundaries, resumable full
        refresh records exist within pages that don't have filterable bounds.
        """
        return True

    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
        """
        Always False: RFR records don't have an ordering to be compared between one another.
        """
        return False

    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
        """Return the whole state; the stream_slice argument is not consulted."""
        # A top-level RFR cursor only manages the state of a single partition
        return self._cursor
@@ -0,0 +1,110 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Mapping, MutableMapping, Optional
5
+
6
+ from airbyte_cdk.models import FailureType
7
+ from airbyte_cdk.sources.streams.checkpoint import Cursor
8
+ from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
9
+ PerPartitionKeySerializer,
10
+ )
11
+ from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
12
+ from airbyte_cdk.utils import AirbyteTracedException
13
+
14
# Sentinel cursor value stored for a partition once it has been closed
FULL_REFRESH_COMPLETE_STATE: Mapping[str, Any] = {"__ab_full_refresh_sync_complete": True}


@dataclass
class SubstreamResumableFullRefreshCursor(Cursor):
    """
    Resumable full refresh cursor that keeps one state entry per partition, keyed by the serialized partition.
    """

    def __init__(self) -> None:
        self._partition_serializer = PerPartitionKeySerializer()
        # Serialized partition key -> {"partition": ..., "cursor": ...} entry
        self._per_partition_state: MutableMapping[str, StreamState] = {}

    def get_stream_state(self) -> StreamState:
        """Return every per-partition entry under the "states" key."""
        partition_entries = list(self._per_partition_state.values())
        return {"states": partition_entries}

    def set_initial_state(self, stream_state: StreamState) -> None:
        """
        Set the initial state for the cursors.

        Each entry of the incoming "states" list is stored under the key derived from its partition. An empty
        incoming state leaves the cursor untouched.

        To simplify processing and state management, we do not maintain the checkpointed state of the parent
        partitions. Instead, we are tracking whether a parent has already successfully synced on a prior attempt and
        skipping over it, allowing the sync to continue making progress. This works for RFR because the platform
        will dispose of this state on the next sync job.

        Args:
            stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
                {
                    "states": [
                        {
                            "partition": {
                                "partition_key": "value_0"
                            },
                            "cursor": {
                                "__ab_full_refresh_sync_complete": True
                            }
                        },
                        {
                            "partition": {
                                "partition_key": "value_1"
                            },
                            "cursor": {},
                        },
                    ]
                }

        Raises:
            AirbyteTracedException: when a non-empty state has no "states" key
        """
        if not stream_state:
            return

        if "states" not in stream_state:
            raise AirbyteTracedException(
                internal_message=f"Could not sync parse the following state: {stream_state}",
                message="The state for is format invalid. Validate that the migration steps included a reset and that it was performed "
                "properly. Otherwise, please contact Airbyte support.",
                failure_type=FailureType.config_error,
            )

        for partition_entry in stream_state["states"]:
            partition_key = self._to_partition_key(partition_entry["partition"])
            self._per_partition_state[partition_key] = partition_entry

    def observe(self, stream_slice: StreamSlice, record: Record) -> None:
        """
        No-op: substream resumable full refresh manages state by closing the slice after syncing a parent, so
        observe is not used.
        """
        pass

    def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
        """Mark the slice's partition as complete by storing the completion sentinel as its cursor."""
        partition_key = self._to_partition_key(stream_slice.partition)
        self._per_partition_state[partition_key] = {
            "partition": stream_slice.partition,
            "cursor": FULL_REFRESH_COMPLETE_STATE,
        }

    def should_be_synced(self, record: Record) -> bool:
        """
        Always True: unlike date-based cursors which filter out records outside slice boundaries, resumable full
        refresh records exist within pages that don't have filterable bounds.
        """
        return True

    def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
        """
        Always False: RFR records don't have an ordering to be compared between one another.
        """
        return False

    def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
        """
        Return the "cursor" value stored for the slice's partition, or None when nothing is stored for it.

        Raises:
            ValueError: when no (or a falsy) stream_slice is given
        """
        if not stream_slice:
            raise ValueError("A partition needs to be provided in order to extract a state")

        partition_key = self._to_partition_key(stream_slice.partition)
        partition_entry = self._per_partition_state.get(partition_key, {})
        return partition_entry.get("cursor")

    def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
        """Serialize a partition into its dictionary key."""
        return self._partition_serializer.to_partition_key(partition)

    def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
        """Deserialize a dictionary key back into its partition."""
        return self._partition_serializer.to_partition(partition_key)
@@ -0,0 +1,7 @@
1
+ ## Breaking Changes & Limitations
2
+
3
+ - [bigger scope than Concurrent CDK] checkpointing state used to be based on the number of records per slice. It is now based on the number of records per sync
4
+ - `Source.read_state` and `Source._emit_legacy_state_format` are now classmethods to allow for developers to have access to the state before instantiating the source
5
+ - send_per_stream_state is always True for Concurrent CDK
6
+ - Using stream_state during read_records: The concern is that today, stream_instance.get_updated_state is called on every record and read_records on every slice. The implication is that the argument stream_state passed to read_records will have the value after the last stream_instance.get_updated_state of the previous slice. For Concurrent CDK, this is not possible as slices are processed in an unordered way.
7
+ - Cursor fields can only be date-time values formatted as epoch. Eventually, we want to move to ISO 8601 as it provides more flexibility, but for the first iteration on Stripe, it was easier to use the same format that was already used
@@ -0,0 +1,3 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #