airbyte-cdk 0.72.1__py3-none-any.whl → 6.17.1.dev0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (518) hide show
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +490 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1185 -85
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +63 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +31 -3
  79. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +346 -0
  80. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  81. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  82. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  83. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +173 -74
  84. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  85. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  86. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  91. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  92. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  93. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  94. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  95. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  96. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  97. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  98. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1319 -603
  99. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  100. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  101. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  102. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1759 -225
  103. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  104. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  105. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  106. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  107. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  108. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  109. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  122. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  123. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  124. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  125. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  126. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  127. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  134. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  135. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  136. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  140. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  141. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  142. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  143. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  144. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  145. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  146. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  147. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  148. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  149. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  150. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +229 -73
  151. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  152. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  153. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  154. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  155. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  156. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  157. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  158. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  159. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  160. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  163. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  164. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  165. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  166. airbyte_cdk/sources/declarative/types.py +19 -110
  167. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  168. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  169. airbyte_cdk/sources/embedded/catalog.py +16 -4
  170. airbyte_cdk/sources/embedded/runner.py +19 -3
  171. airbyte_cdk/sources/embedded/tools.py +5 -2
  172. airbyte_cdk/sources/file_based/README.md +152 -0
  173. airbyte_cdk/sources/file_based/__init__.py +24 -0
  174. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  175. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  176. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  177. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +47 -10
  178. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  179. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  180. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  181. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  182. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  184. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  185. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  186. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  187. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  188. airbyte_cdk/sources/file_based/exceptions.py +18 -15
  189. airbyte_cdk/sources/file_based/file_based_source.py +140 -33
  190. airbyte_cdk/sources/file_based/file_based_stream_reader.py +69 -5
  191. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  192. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  193. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  194. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  195. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  196. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  197. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  198. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  199. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +141 -41
  200. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  201. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  202. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  204. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  205. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  206. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  210. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  211. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  212. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  213. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +147 -45
  214. airbyte_cdk/sources/http_logger.py +8 -3
  215. airbyte_cdk/sources/message/__init__.py +7 -1
  216. airbyte_cdk/sources/message/repository.py +18 -4
  217. airbyte_cdk/sources/source.py +42 -38
  218. airbyte_cdk/sources/streams/__init__.py +2 -2
  219. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  220. airbyte_cdk/sources/streams/call_rate.py +64 -21
  221. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  222. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  223. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  224. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  225. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  226. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  227. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  228. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  229. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  230. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  231. airbyte_cdk/sources/streams/concurrent/cursor.py +298 -42
  232. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  233. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  234. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  236. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  237. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  238. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  239. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  240. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  241. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  242. airbyte_cdk/sources/streams/core.py +412 -87
  243. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  244. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  245. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  253. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  254. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  255. airbyte_cdk/sources/streams/http/http.py +369 -246
  256. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  257. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  259. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  260. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  261. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  262. airbyte_cdk/sources/types.py +154 -0
  263. airbyte_cdk/sources/utils/record_helper.py +36 -21
  264. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  265. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  266. airbyte_cdk/sources/utils/transform.py +54 -20
  267. airbyte_cdk/sql/_util/hashing.py +34 -0
  268. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  269. airbyte_cdk/sql/constants.py +32 -0
  270. airbyte_cdk/sql/exceptions.py +235 -0
  271. airbyte_cdk/sql/secrets.py +123 -0
  272. airbyte_cdk/sql/shared/__init__.py +15 -0
  273. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  274. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  275. airbyte_cdk/sql/types.py +160 -0
  276. airbyte_cdk/test/catalog_builder.py +70 -18
  277. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  278. airbyte_cdk/test/mock_http/__init__.py +1 -1
  279. airbyte_cdk/test/mock_http/matcher.py +6 -0
  280. airbyte_cdk/test/mock_http/mocker.py +57 -10
  281. airbyte_cdk/test/mock_http/request.py +19 -3
  282. airbyte_cdk/test/mock_http/response.py +3 -1
  283. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  284. airbyte_cdk/test/state_builder.py +18 -10
  285. airbyte_cdk/test/utils/__init__.py +1 -0
  286. airbyte_cdk/test/utils/data.py +24 -0
  287. airbyte_cdk/test/utils/http_mocking.py +16 -0
  288. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  289. airbyte_cdk/test/utils/reading.py +26 -0
  290. airbyte_cdk/utils/__init__.py +2 -1
  291. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  292. airbyte_cdk/utils/analytics_message.py +10 -2
  293. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  294. airbyte_cdk/utils/event_timing.py +10 -10
  295. airbyte_cdk/utils/mapping_helpers.py +3 -1
  296. airbyte_cdk/utils/message_utils.py +20 -11
  297. airbyte_cdk/utils/print_buffer.py +75 -0
  298. airbyte_cdk/utils/schema_inferrer.py +198 -28
  299. airbyte_cdk/utils/slice_hasher.py +30 -0
  300. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  301. airbyte_cdk/utils/stream_status_utils.py +8 -1
  302. airbyte_cdk/utils/traced_exception.py +61 -21
  303. airbyte_cdk-6.17.1.dev0.dist-info/METADATA +109 -0
  304. airbyte_cdk-6.17.1.dev0.dist-info/RECORD +350 -0
  305. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.17.1.dev0.dist-info}/WHEEL +1 -2
  306. airbyte_cdk-6.17.1.dev0.dist-info/entry_points.txt +3 -0
  307. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  308. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  309. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  311. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  312. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  313. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  314. airbyte_cdk/sources/deprecated/client.py +0 -99
  315. airbyte_cdk/sources/singer/__init__.py +0 -8
  316. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  317. airbyte_cdk/sources/singer/source.py +0 -186
  318. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  319. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  320. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  321. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  322. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  323. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  324. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  325. airbyte_cdk/sources/utils/schema_models.py +0 -84
  326. airbyte_cdk-0.72.1.dist-info/METADATA +0 -243
  327. airbyte_cdk-0.72.1.dist-info/RECORD +0 -466
  328. airbyte_cdk-0.72.1.dist-info/top_level.txt +0 -3
  329. source_declarative_manifest/main.py +0 -29
  330. unit_tests/connector_builder/__init__.py +0 -3
  331. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  332. unit_tests/connector_builder/test_message_grouper.py +0 -713
  333. unit_tests/connector_builder/utils.py +0 -27
  334. unit_tests/destinations/test_destination.py +0 -243
  335. unit_tests/singer/test_singer_helpers.py +0 -56
  336. unit_tests/singer/test_singer_source.py +0 -112
  337. unit_tests/sources/__init__.py +0 -0
  338. unit_tests/sources/concurrent_source/__init__.py +0 -3
  339. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  340. unit_tests/sources/declarative/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/__init__.py +0 -3
  342. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  343. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  344. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  345. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  346. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  347. unit_tests/sources/declarative/checks/__init__.py +0 -3
  348. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  349. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  350. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  351. unit_tests/sources/declarative/external_component.py +0 -13
  352. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  353. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  355. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  356. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  357. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  359. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  360. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  361. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  362. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  363. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  364. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  365. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  366. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  367. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  368. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  369. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  370. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  371. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1847
  372. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  373. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  374. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  375. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  376. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  377. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  384. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  385. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  386. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  387. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  388. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  389. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  390. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  391. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  392. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  393. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  394. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  395. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  396. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  397. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  398. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  399. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  400. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  401. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  402. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  403. unit_tests/sources/declarative/schema/__init__.py +0 -6
  404. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  405. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  406. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  407. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  408. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  409. unit_tests/sources/declarative/states/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  411. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  412. unit_tests/sources/declarative/test_create_partial.py +0 -83
  413. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  414. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  415. unit_tests/sources/declarative/test_types.py +0 -39
  416. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  417. unit_tests/sources/file_based/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  419. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  420. unit_tests/sources/file_based/config/__init__.py +0 -0
  421. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  422. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  423. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  424. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  425. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  426. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  427. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  428. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  429. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  430. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  431. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  432. unit_tests/sources/file_based/helpers.py +0 -70
  433. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  434. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  435. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  436. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  437. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  438. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  439. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  440. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  441. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  442. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  443. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  444. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  445. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  446. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  447. unit_tests/sources/file_based/stream/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  449. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  450. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  451. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  452. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  453. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  454. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  455. unit_tests/sources/file_based/test_scenarios.py +0 -253
  456. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  457. unit_tests/sources/fixtures/__init__.py +0 -3
  458. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  459. unit_tests/sources/message/__init__.py +0 -0
  460. unit_tests/sources/message/test_repository.py +0 -153
  461. unit_tests/sources/streams/__init__.py +0 -0
  462. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  464. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  466. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  467. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  469. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  470. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  471. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  472. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  473. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  474. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  475. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  476. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  477. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  478. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  479. unit_tests/sources/streams/http/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  481. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  482. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  483. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  484. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  485. unit_tests/sources/streams/http/test_http.py +0 -635
  486. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  487. unit_tests/sources/streams/test_call_rate.py +0 -300
  488. unit_tests/sources/streams/test_stream_read.py +0 -405
  489. unit_tests/sources/streams/test_streams_core.py +0 -184
  490. unit_tests/sources/test_abstract_source.py +0 -1442
  491. unit_tests/sources/test_concurrent_source.py +0 -112
  492. unit_tests/sources/test_config.py +0 -92
  493. unit_tests/sources/test_connector_state_manager.py +0 -482
  494. unit_tests/sources/test_http_logger.py +0 -252
  495. unit_tests/sources/test_integration_source.py +0 -86
  496. unit_tests/sources/test_source.py +0 -684
  497. unit_tests/sources/test_source_read.py +0 -460
  498. unit_tests/test/__init__.py +0 -0
  499. unit_tests/test/mock_http/__init__.py +0 -0
  500. unit_tests/test/mock_http/test_matcher.py +0 -53
  501. unit_tests/test/mock_http/test_mocker.py +0 -214
  502. unit_tests/test/mock_http/test_request.py +0 -117
  503. unit_tests/test/mock_http/test_response_builder.py +0 -177
  504. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  505. unit_tests/utils/__init__.py +0 -0
  506. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  507. unit_tests/utils/test_mapping_helpers.py +0 -54
  508. unit_tests/utils/test_message_utils.py +0 -91
  509. unit_tests/utils/test_rate_limiting.py +0 -26
  510. unit_tests/utils/test_schema_inferrer.py +0 -202
  511. unit_tests/utils/test_secret_utils.py +0 -135
  512. unit_tests/utils/test_stream_status_utils.py +0 -61
  513. unit_tests/utils/test_traced_exception.py +0 -107
  514. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  515. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  516. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  517. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  518. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.17.1.dev0.dist-info}/LICENSE.txt +0 -0
@@ -0,0 +1,335 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ from abc import ABC, abstractmethod
4
+ from enum import Enum
5
+ from typing import Any, Iterable, Mapping, Optional
6
+
7
+ from airbyte_cdk.sources.types import StreamSlice
8
+
9
+ from .cursor import Cursor
10
+
11
+
12
+ class CheckpointMode(Enum):
13
+ INCREMENTAL = "incremental"
14
+ RESUMABLE_FULL_REFRESH = "resumable_full_refresh"
15
+ FULL_REFRESH = "full_refresh"
16
+
17
+
18
+ FULL_REFRESH_COMPLETE_STATE: Mapping[str, Any] = {"__ab_full_refresh_sync_complete": True}
19
+
20
+
21
+ class CheckpointReader(ABC):
22
+ """
23
+ CheckpointReader manages how to iterate over a stream's partitions and serves as the bridge for interpreting the current state
24
+ of the stream that should be emitted back to the platform.
25
+ """
26
+
27
+ @abstractmethod
28
+ def next(self) -> Optional[Mapping[str, Any]]:
29
+ """
30
+ Returns the next slice that will be used to fetch the next group of records. Returning None indicates that the reader
31
+ has finished iterating over all slices.
32
+ """
33
+
34
+ @abstractmethod
35
+ def observe(self, new_state: Mapping[str, Any]) -> None:
36
+ """
37
+ Updates the internal state of the checkpoint reader based on the incoming stream state from a connector.
38
+
39
+ WARNING: This is used to retain backwards compatibility with streams using the legacy get_stream_state() method.
40
+ In order to uptake Resumable Full Refresh, connectors must migrate streams to use the state setter/getter methods.
41
+ """
42
+
43
+ @abstractmethod
44
+ def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
45
+ """
46
+ Retrieves the current state value of the stream. The connector does not emit state messages if the checkpoint value is None.
47
+ """
48
+
49
+
50
+ class IncrementalCheckpointReader(CheckpointReader):
51
+ """
52
+ IncrementalCheckpointReader handles iterating through a stream based on partitioned windows of data that are determined
53
+ before syncing data.
54
+ """
55
+
56
+ def __init__(
57
+ self, stream_state: Mapping[str, Any], stream_slices: Iterable[Optional[Mapping[str, Any]]]
58
+ ):
59
+ self._state: Optional[Mapping[str, Any]] = stream_state
60
+ self._stream_slices = iter(stream_slices)
61
+ self._has_slices = False
62
+
63
+ def next(self) -> Optional[Mapping[str, Any]]:
64
+ try:
65
+ next_slice = next(self._stream_slices)
66
+ self._has_slices = True
67
+ return next_slice
68
+ except StopIteration:
69
+ # This is used to avoid sending a duplicate state message at the end of a sync since the stream has already
70
+ # emitted state at the end of each slice. If we want to avoid this extra complexity, we can also just accept
71
+ # that every sync emits a final duplicate state
72
+ if self._has_slices:
73
+ self._state = None
74
+ return None
75
+
76
+ def observe(self, new_state: Mapping[str, Any]) -> None:
77
+ self._state = new_state
78
+
79
+ def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
80
+ return self._state
81
+
82
+
83
+ class CursorBasedCheckpointReader(CheckpointReader):
84
+ """
85
+ CursorBasedCheckpointReader is used by streams that implement a Cursor in order to manage state. This allows the checkpoint
86
+ reader to delegate the complexity of fetching state to the cursor and focus on the iteration over a stream's partitions.
87
+
88
+ This reader supports the Cursor interface used by Python and low-code sources. Not to be confused with the Cursor interface
89
+ that belongs to the Concurrent CDK.
90
+ """
91
+
92
+ def __init__(
93
+ self,
94
+ cursor: Cursor,
95
+ stream_slices: Iterable[Optional[Mapping[str, Any]]],
96
+ read_state_from_cursor: bool = False,
97
+ ):
98
+ self._cursor = cursor
99
+ self._stream_slices = iter(stream_slices)
100
+ # read_state_from_cursor is used to delineate that partitions should determine when to stop syncing dynamically according
101
+ # to the value of the state at runtime. This currently only applies to streams that use resumable full refresh.
102
+ self._read_state_from_cursor = read_state_from_cursor
103
+ self._current_slice: Optional[StreamSlice] = None
104
+ self._finished_sync = False
105
+ self._previous_state: Optional[Mapping[str, Any]] = None
106
+
107
+ def next(self) -> Optional[Mapping[str, Any]]:
108
+ try:
109
+ self.current_slice = self._find_next_slice()
110
+ return self.current_slice
111
+ except StopIteration:
112
+ self._finished_sync = True
113
+ return None
114
+
115
+ def observe(self, new_state: Mapping[str, Any]) -> None:
116
+ # Cursor based checkpoint readers don't need to observe the new state because it has already been updated by the cursor
117
+ # while processing records
118
+ pass
119
+
120
+ def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
121
+ # This is used to avoid sending duplicate state messages
122
+ new_state = self._cursor.get_stream_state()
123
+ if new_state != self._previous_state:
124
+ self._previous_state = new_state
125
+ return new_state
126
+ else:
127
+ return None
128
+
129
+ def _find_next_slice(self) -> StreamSlice:
130
+ """
131
+ _find_next_slice() returns the next slice of data that should be synced for the current stream according to its cursor.
132
+ This function supports iterating over a stream's slices across two dimensions. The first dimension is the stream's
133
+ partitions like parent records for a substream. The inner dimension iterates over the cursor value like a date
134
+ range for incremental streams or a pagination checkpoint for resumable full refresh.
135
+
136
+ The basic algorithm for iterating through a stream's slices is:
137
+ 1. The first time next() is invoked we get the first partition
138
+ 2. If the current partition is already complete as a result of a previous sync attempt, continue iterating until
139
+ we find an un-synced partition.
140
+ 3. For streams whose cursor value is determined dynamically using stream state
141
+ 1. Get the state for the current partition
142
+ 2. If the current partition's state is complete, continue iterating over partitions
143
+ 3. If the current partition's state is still in progress, emit the next cursor value
144
+ 4. If the current partition is complete as delineated by the sentinel value, get the next incomplete partition
145
+ 4. When the stream has processed all partitions, the iterator will raise a StopIteration exception signaling there are no more
146
+ slices left for extracting more records.
147
+ """
148
+
149
+ if self._read_state_from_cursor:
150
+ if self.current_slice is None:
151
+ # current_slice being None means this is the first time we are iterating over a stream's slices. The first slice to
152
+ # sync has not been assigned yet and must first be read from the iterator
153
+ next_slice = self.read_and_convert_slice()
154
+ state_for_slice = self._cursor.select_state(next_slice)
155
+ if state_for_slice == FULL_REFRESH_COMPLETE_STATE:
156
+ # Skip every slice that already has the terminal complete value indicating that a previous attempt
157
+ # successfully synced the slice
158
+ has_more = True
159
+ while has_more:
160
+ next_slice = self.read_and_convert_slice()
161
+ state_for_slice = self._cursor.select_state(next_slice)
162
+ has_more = state_for_slice == FULL_REFRESH_COMPLETE_STATE
163
+ return StreamSlice(
164
+ cursor_slice=state_for_slice or {},
165
+ partition=next_slice.partition,
166
+ extra_fields=next_slice.extra_fields,
167
+ )
168
+ else:
169
+ state_for_slice = self._cursor.select_state(self.current_slice)
170
+ if state_for_slice == FULL_REFRESH_COMPLETE_STATE:
171
+ # If the current slice is complete, move to the next slice and skip the next slices that already
172
+ # have the terminal complete value indicating that a previous attempt was successfully read.
173
+ # Dummy initialization for mypy since we'll iterate at least once to get the next slice
174
+ next_candidate_slice = StreamSlice(cursor_slice={}, partition={})
175
+ has_more = True
176
+ while has_more:
177
+ next_candidate_slice = self.read_and_convert_slice()
178
+ state_for_slice = self._cursor.select_state(next_candidate_slice)
179
+ has_more = state_for_slice == FULL_REFRESH_COMPLETE_STATE
180
+ return StreamSlice(
181
+ cursor_slice=state_for_slice or {},
182
+ partition=next_candidate_slice.partition,
183
+ extra_fields=next_candidate_slice.extra_fields,
184
+ )
185
+ # The reader continues to process the current partition if its state is still in progress
186
+ return StreamSlice(
187
+ cursor_slice=state_for_slice or {},
188
+ partition=self.current_slice.partition,
189
+ extra_fields=self.current_slice.extra_fields,
190
+ )
191
+ else:
192
+ # Unlike RFR cursors that iterate dynamically according to how stream state is updated, most cursors operate
193
+ # on a fixed set of slices determined before reading records. They just iterate to the next slice
194
+ return self.read_and_convert_slice()
195
+
196
+ @property
197
+ def current_slice(self) -> Optional[StreamSlice]:
198
+ return self._current_slice
199
+
200
+ @current_slice.setter
201
+ def current_slice(self, value: StreamSlice) -> None:
202
+ self._current_slice = value
203
+
204
+ def read_and_convert_slice(self) -> StreamSlice:
205
+ next_slice = next(self._stream_slices)
206
+ if not isinstance(next_slice, StreamSlice):
207
+ raise ValueError(
208
+ f"{self.current_slice} should be of type StreamSlice. This is likely a bug in the CDK, please contact Airbyte support"
209
+ )
210
+ return next_slice
211
+
212
+
213
+ class LegacyCursorBasedCheckpointReader(CursorBasedCheckpointReader):
214
+ """
215
+ This (unfortunate) class operates like an adapter to retain backwards compatibility with legacy sources that take in stream_slice
216
+ in the form of a Mapping instead of the StreamSlice object. Internally, the reader still operates over StreamSlices, but it
217
+ is instantiated with and emits stream slices in the form of a Mapping[str, Any]. The logic of how partitions and cursors
218
+ are iterated over is synonymous with CursorBasedCheckpointReader.
219
+
220
+ We also retain the existing top level fields defined by the connector so the fields are present on dependent methods. For example,
221
+ the resulting mapping structure passed back to the stream's read_records() method looks like:
222
+ {
223
+ "cursor_slice": {
224
+ "next_page_token": 10
225
+ },
226
+ "partition": {
227
+ "repository": "airbytehq/airbyte"
228
+ },
229
+ "next_page_token": 10,
230
+ "repository": "airbytehq/airbyte"
231
+ }
232
+ """
233
+
234
+ def __init__(
235
+ self,
236
+ cursor: Cursor,
237
+ stream_slices: Iterable[Optional[Mapping[str, Any]]],
238
+ read_state_from_cursor: bool = False,
239
+ ):
240
+ super().__init__(
241
+ cursor=cursor,
242
+ stream_slices=stream_slices,
243
+ read_state_from_cursor=read_state_from_cursor,
244
+ )
245
+
246
+ def next(self) -> Optional[Mapping[str, Any]]:
247
+ try:
248
+ self.current_slice = self._find_next_slice()
249
+
250
+ if "partition" in dict(self.current_slice):
251
+ raise ValueError("Stream is configured to use invalid stream slice key 'partition'")
252
+ elif "cursor_slice" in dict(self.current_slice):
253
+ raise ValueError(
254
+ "Stream is configured to use invalid stream slice key 'cursor_slice'"
255
+ )
256
+
257
+ # We convert StreamSlice to a regular mapping because legacy connectors operate on the basic Mapping object. We
258
+ # also duplicate all fields at the top level for backwards compatibility for existing Python sources
259
+ return {
260
+ "partition": self.current_slice.partition,
261
+ "cursor_slice": self.current_slice.cursor_slice,
262
+ **dict(self.current_slice),
263
+ }
264
+ except StopIteration:
265
+ self._finished_sync = True
266
+ return None
267
+
268
+ def read_and_convert_slice(self) -> StreamSlice:
269
+ next_mapping_slice = next(self._stream_slices)
270
+ if not isinstance(next_mapping_slice, Mapping):
271
+ raise ValueError(
272
+ f"{self.current_slice} should be of type Mapping. This is likely a bug in the CDK, please contact Airbyte support"
273
+ )
274
+
275
+ # The legacy reader is instantiated with an iterable of stream slice mappings. We convert each into a StreamSlice
276
+ # to sanely process them during the sync and to reuse the existing Python defined cursors
277
+ return StreamSlice(
278
+ partition=next_mapping_slice,
279
+ cursor_slice={},
280
+ )
281
+
282
+
283
+ class ResumableFullRefreshCheckpointReader(CheckpointReader):
284
+ """
285
+ ResumableFullRefreshCheckpointReader allows for iteration over an unbounded set of records based on the pagination strategy
286
+ of the stream. Because the number of pages is unknown, the stream's current state is used to determine whether to continue
287
+ fetching more pages or stopping the sync.
288
+ """
289
+
290
+ def __init__(self, stream_state: Mapping[str, Any]):
291
+ # The first attempt of an RFR stream has an empty {} incoming state, but should still make a first attempt to read records
292
+ # from the first page in next().
293
+ self._first_page = bool(stream_state == {})
294
+ self._state: Mapping[str, Any] = stream_state
295
+
296
+ def next(self) -> Optional[Mapping[str, Any]]:
297
+ if self._first_page:
298
+ self._first_page = False
299
+ return self._state
300
+ elif self._state == FULL_REFRESH_COMPLETE_STATE:
301
+ return None
302
+ else:
303
+ return self._state
304
+
305
+ def observe(self, new_state: Mapping[str, Any]) -> None:
306
+ self._state = new_state
307
+
308
+ def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
309
+ return self._state or {}
310
+
311
+
312
+ class FullRefreshCheckpointReader(CheckpointReader):
313
+ """
314
+ FullRefreshCheckpointReader iterates over data that cannot be checkpointed incrementally during the sync because the stream
315
+ is not capable of managing state. At the end of a sync, a final state message is emitted to signal completion.
316
+ """
317
+
318
+ def __init__(self, stream_slices: Iterable[Optional[Mapping[str, Any]]]):
319
+ self._stream_slices = iter(stream_slices)
320
+ self._final_checkpoint = False
321
+
322
+ def next(self) -> Optional[Mapping[str, Any]]:
323
+ try:
324
+ return next(self._stream_slices)
325
+ except StopIteration:
326
+ self._final_checkpoint = True
327
+ return None
328
+
329
+ def observe(self, new_state: Mapping[str, Any]) -> None:
330
+ pass
331
+
332
+ def get_checkpoint(self) -> Optional[Mapping[str, Any]]:
333
+ if self._final_checkpoint:
334
+ return {"__ab_no_cursor_state_message": True}
335
+ return None
@@ -3,16 +3,15 @@
3
3
  #
4
4
 
5
5
  from abc import ABC, abstractmethod
6
- from typing import Optional
6
+ from typing import Any, Optional
7
7
 
8
- from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer import StreamSlicer
9
- from airbyte_cdk.sources.declarative.types import Record, StreamSlice, StreamState
8
+ from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
10
9
 
11
10
 
12
- class Cursor(ABC, StreamSlicer):
11
+ class Cursor(ABC):
13
12
  """
14
- Cursors are components that allow for incremental syncs. They keep track of what data has been consumed and slices the requests based on
15
- that information.
13
+ Cursors are components that allow for checkpointing the current state of a sync. They keep track of what data has been consumed
14
+ and allow syncs to be resumed from a specific point based on that information.
16
15
  """
17
16
 
18
17
  @abstractmethod
@@ -35,17 +34,13 @@ class Cursor(ABC, StreamSlicer):
35
34
  pass
36
35
 
37
36
  @abstractmethod
38
- def close_slice(self, stream_slice: StreamSlice, most_recent_record: Optional[Record]) -> None:
37
+ def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
39
38
  """
40
- Update state based on the stream slice and the latest record. Note that `stream_slice.cursor_slice` and
41
- `most_recent_record.associated_slice` are expected to be the same but we make it explicit here that `stream_slice` should be leveraged to
42
- update the state.
39
+ Update state based on the stream slice. Note that `stream_slice.cursor_slice` and `most_recent_record.associated_slice` are expected
40
+ to be the same but we make it explicit here that `stream_slice` should be leveraged to update the state. We do not pass in the
41
+ latest record, since cursor instances should maintain the relevant internal state on their own.
43
42
 
44
43
  :param stream_slice: slice to close
45
- :param most_recent_record: the latest record we have received for the slice. This is important to consider because even if the
46
- cursor emits a slice, some APIs are not able to enforce the upper boundary. The outcome is that the last_record might have a
47
- higher cursor value than the slice upper boundary and if we want to reduce the duplication as much as possible, we need to
48
- consider the highest value between the internal cursor, the stream slice upper boundary and the record cursor value.
49
44
  """
50
45
 
51
46
  @abstractmethod
@@ -72,3 +67,11 @@ class Cursor(ABC, StreamSlicer):
72
67
  """
73
68
  Evaluating which record is greater in terms of cursor. This is used to avoid having to capture all the records to close a slice
74
69
  """
70
+
71
+ @abstractmethod
72
+ def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
73
+ """
74
+ Get the state value of a specific stream_slice. For incremental or resumable full refresh cursors which only manage state in
75
+ a single dimension this is the entire state object. For per-partition cursors used by substreams, this returns the state of
76
+ a specific parent delineated by the incoming slice's partition object.
77
+ """
@@ -0,0 +1,22 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ import json
4
+ from typing import Any, Mapping
5
+
6
+
7
+ class PerPartitionKeySerializer:
8
+ """
9
+ We are concerned about the performance of looping through the `states` list and evaluating equality on the partition. To reduce this
10
+ concern, we wanted to use dictionaries to map `partition -> cursor`. However, partitions are dicts and dicts can't be used as dict keys
11
+ since they are not hashable. By creating a JSON string from the dict, we can use that string as a key to the dict since strings are
12
+ hashable.
13
+ """
14
+
15
+ @staticmethod
16
+ def to_partition_key(to_serialize: Any) -> str:
17
+ # separators have changed in Python 3.4. To avoid being impacted by further change, we explicitly specify our own value
18
+ return json.dumps(to_serialize, indent=None, separators=(",", ":"), sort_keys=True)
19
+
20
+ @staticmethod
21
+ def to_partition(to_deserialize: Any) -> Mapping[str, Any]:
22
+ return json.loads(to_deserialize) # type: ignore # The partition is known to be a dict, but the type hint is Any
@@ -0,0 +1,51 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Optional
5
+
6
+ from airbyte_cdk.sources.streams.checkpoint import Cursor
7
+ from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
8
+
9
+
10
+ @dataclass
11
+ class ResumableFullRefreshCursor(Cursor):
12
+ """
13
+ Cursor that allows for the checkpointing of sync progress according to a synthetic cursor based on the pagination state
14
+ of the stream. Resumable full refresh syncs are only intended to retain state in between sync attempts of the same job
15
+ with the platform responsible for removing said state.
16
+ """
17
+
18
+ def __init__(self) -> None:
19
+ self._cursor: StreamState = {}
20
+
21
+ def get_stream_state(self) -> StreamState:
22
+ return self._cursor
23
+
24
+ def set_initial_state(self, stream_state: StreamState) -> None:
25
+ self._cursor = stream_state
26
+
27
+ def observe(self, stream_slice: StreamSlice, record: Record) -> None:
28
+ """
29
+ Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records.
30
+ """
31
+ pass
32
+
33
+ def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
34
+ self._cursor = stream_slice.cursor_slice
35
+
36
+ def should_be_synced(self, record: Record) -> bool:
37
+ """
38
+ Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages
39
+ that don't have filterable bounds. We should always return them.
40
+ """
41
+ return True
42
+
43
+ def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
44
+ """
45
+ RFR records don't have an ordering that can be compared between one another.
46
+ """
47
+ return False
48
+
49
+ def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
50
+ # A top-level RFR cursor only manages the state of a single partition
51
+ return self._cursor
@@ -0,0 +1,110 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Mapping, MutableMapping, Optional
5
+
6
+ from airbyte_cdk.models import FailureType
7
+ from airbyte_cdk.sources.streams.checkpoint import Cursor
8
+ from airbyte_cdk.sources.streams.checkpoint.per_partition_key_serializer import (
9
+ PerPartitionKeySerializer,
10
+ )
11
+ from airbyte_cdk.sources.types import Record, StreamSlice, StreamState
12
+ from airbyte_cdk.utils import AirbyteTracedException
13
+
14
+ FULL_REFRESH_COMPLETE_STATE: Mapping[str, Any] = {"__ab_full_refresh_sync_complete": True}
15
+
16
+
17
+ @dataclass
18
+ class SubstreamResumableFullRefreshCursor(Cursor):
19
+ def __init__(self) -> None:
20
+ self._per_partition_state: MutableMapping[str, StreamState] = {}
21
+ self._partition_serializer = PerPartitionKeySerializer()
22
+
23
+ def get_stream_state(self) -> StreamState:
24
+ return {"states": list(self._per_partition_state.values())}
25
+
26
+ def set_initial_state(self, stream_state: StreamState) -> None:
27
+ """
28
+ Set the initial state for the cursors.
29
+
30
+ This method initializes the state for each partition cursor using the provided stream state.
31
+ If a partition state is provided in the stream state, it will update the corresponding partition cursor with this state.
32
+
33
+ To simplify processing and state management, we do not maintain the checkpointed state of the parent partitions.
34
+ Instead, we are tracking whether a parent has already successfully synced on a prior attempt and skipping over it
35
+ allowing the sync to continue making progress. And this works for RFR because the platform will dispose of this
36
+ state on the next sync job.
37
+
38
+ Args:
39
+ stream_state (StreamState): The state of the streams to be set. The format of the stream state should be:
40
+ {
41
+ "states": [
42
+ {
43
+ "partition": {
44
+ "partition_key": "value_0"
45
+ },
46
+ "cursor": {
47
+ "__ab_full_refresh_sync_complete": True
48
+ }
49
+ },
50
+ {
51
+ "partition": {
52
+ "partition_key": "value_1"
53
+ },
54
+ "cursor": {},
55
+ },
56
+ ]
57
+ }
58
+ """
59
+ if not stream_state:
60
+ return
61
+
62
+ if "states" not in stream_state:
63
+ raise AirbyteTracedException(
64
+ internal_message=f"Could not sync parse the following state: {stream_state}",
65
+ message="The state for is format invalid. Validate that the migration steps included a reset and that it was performed "
66
+ "properly. Otherwise, please contact Airbyte support.",
67
+ failure_type=FailureType.config_error,
68
+ )
69
+
70
+ for state in stream_state["states"]:
71
+ self._per_partition_state[self._to_partition_key(state["partition"])] = state
72
+
73
+ def observe(self, stream_slice: StreamSlice, record: Record) -> None:
74
+ """
75
+ Substream resumable full refresh manages state by closing the slice after syncing a parent so observe is not used.
76
+ """
77
+ pass
78
+
79
+ def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None:
80
+ self._per_partition_state[self._to_partition_key(stream_slice.partition)] = {
81
+ "partition": stream_slice.partition,
82
+ "cursor": FULL_REFRESH_COMPLETE_STATE,
83
+ }
84
+
85
+ def should_be_synced(self, record: Record) -> bool:
86
+ """
87
+ Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages
88
+ that don't have filterable bounds. We should always return them.
89
+ """
90
+ return True
91
+
92
+ def is_greater_than_or_equal(self, first: Record, second: Record) -> bool:
93
+ """
94
+ RFR records don't have an ordering that can be compared between one another.
95
+ """
96
+ return False
97
+
98
+ def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]:
99
+ if not stream_slice:
100
+ raise ValueError("A partition needs to be provided in order to extract a state")
101
+
102
+ return self._per_partition_state.get(
103
+ self._to_partition_key(stream_slice.partition), {}
104
+ ).get("cursor")
105
+
106
+ def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
107
+ return self._partition_serializer.to_partition_key(partition)
108
+
109
+ def _to_dict(self, partition_key: str) -> Mapping[str, Any]:
110
+ return self._partition_serializer.to_partition(partition_key)
@@ -0,0 +1,7 @@
1
+ ## Breaking Changes & Limitations
2
+
3
+ - [bigger scope than Concurrent CDK] checkpointing state was acting on the number of records per slice. This has been changed to consider the number of records per sync
4
+ - `Source.read_state` and `Source._emit_legacy_state_format` are now classmethods to allow for developers to have access to the state before instantiating the source
5
+ - send_per_stream_state is always True for Concurrent CDK
6
+ - Using stream_state during read_records: The concern is that today, stream_instance.get_updated_state is called on every record and read_records on every slice. The implication is that the argument stream_state passed to read_records will have the value after the last stream_instance.get_updated_state of the previous slice. For Concurrent CDK, this is not possible as slices are processed in an unordered way.
7
+ - Cursor fields can only be date-time formatted as epoch. Eventually, we want to move to ISO 8601 as it provides more flexibility, but for the first iteration on Stripe, it was easier to use the same format that was already used
@@ -5,14 +5,19 @@
5
5
  from abc import ABC, abstractmethod
6
6
  from typing import Any, Iterable, Mapping, Optional
7
7
 
8
+ from typing_extensions import deprecated
9
+
8
10
  from airbyte_cdk.models import AirbyteStream
11
+ from airbyte_cdk.sources.source import ExperimentalClassWarning
9
12
  from airbyte_cdk.sources.streams.concurrent.availability_strategy import StreamAvailability
10
13
  from airbyte_cdk.sources.streams.concurrent.cursor import Cursor
11
14
  from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
12
- from deprecated.classic import deprecated
13
15
 
14
16
 
15
- @deprecated("This class is experimental. Use at your own risk.")
17
+ @deprecated(
18
+ "This class is experimental. Use at your own risk.",
19
+ category=ExperimentalClassWarning,
20
+ )
16
21
  class AbstractStream(ABC):
17
22
  """
18
23
  AbstractStream is an experimental interface for streams developed as part of the Concurrent CDK.