airbyte-cdk 0.0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (368) hide show
  1. airbyte_cdk/__init__.py +358 -0
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +104 -0
  7. airbyte_cdk/connector.py +123 -0
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/__init__.py +3 -0
  10. airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
  11. airbyte_cdk/connector_builder/main.py +107 -0
  12. airbyte_cdk/connector_builder/models.py +73 -0
  13. airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
  14. airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
  15. airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
  16. airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
  17. airbyte_cdk/connector_builder/test_reader/types.py +83 -0
  18. airbyte_cdk/destinations/__init__.py +8 -0
  19. airbyte_cdk/destinations/destination.py +154 -0
  20. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  21. airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
  22. airbyte_cdk/destinations/vector_db_based/config.py +298 -0
  23. airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
  24. airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
  25. airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
  26. airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
  27. airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
  28. airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
  29. airbyte_cdk/entrypoint.py +414 -0
  30. airbyte_cdk/exception_handler.py +56 -0
  31. airbyte_cdk/logger.py +109 -0
  32. airbyte_cdk/models/__init__.py +72 -0
  33. airbyte_cdk/models/airbyte_protocol.py +88 -0
  34. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  35. airbyte_cdk/models/well_known_types.py +5 -0
  36. airbyte_cdk/py.typed +0 -0
  37. airbyte_cdk/sources/__init__.py +26 -0
  38. airbyte_cdk/sources/abstract_source.py +326 -0
  39. airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
  40. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
  41. airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
  42. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
  43. airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
  44. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  45. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
  46. airbyte_cdk/sources/config.py +27 -0
  47. airbyte_cdk/sources/connector_state_manager.py +161 -0
  48. airbyte_cdk/sources/declarative/__init__.py +3 -0
  49. airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
  50. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  51. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
  52. airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
  53. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  54. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  55. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  56. airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
  57. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
  58. airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
  59. airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
  60. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
  61. airbyte_cdk/sources/declarative/auth/token.py +267 -0
  62. airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
  63. airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
  64. airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
  65. airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
  66. airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
  67. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  68. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  69. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
  70. airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
  71. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
  72. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
  73. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
  74. airbyte_cdk/sources/declarative/declarative_source.py +36 -0
  75. airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
  76. airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
  77. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
  78. airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
  79. airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
  80. airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
  81. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  82. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  83. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  84. airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
  85. airbyte_cdk/sources/declarative/exceptions.py +9 -0
  86. airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
  87. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
  88. airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
  89. airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
  90. airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
  91. airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
  92. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
  93. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  94. airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
  95. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
  96. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
  97. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  98. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
  99. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
  100. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  101. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  102. airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
  103. airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
  104. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
  105. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
  106. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
  107. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
  108. airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
  109. airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
  110. airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
  111. airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
  112. airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
  113. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  114. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  115. airbyte_cdk/sources/declarative/models/__init__.py +2 -0
  116. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
  117. airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
  118. airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
  119. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
  120. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
  121. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
  122. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
  123. airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
  124. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  125. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  126. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
  127. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  128. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
  129. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
  130. airbyte_cdk/sources/declarative/requesters/README.md +56 -0
  131. airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
  132. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
  133. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
  134. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
  135. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
  136. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
  137. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
  138. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
  139. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
  140. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
  141. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
  142. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  143. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
  144. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
  145. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
  146. airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
  147. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
  148. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
  149. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
  150. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
  151. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
  152. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
  153. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
  154. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
  155. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
  156. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
  157. airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
  158. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
  159. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
  160. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  161. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
  162. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
  163. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
  164. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
  165. airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
  166. airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
  167. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  168. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  169. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  170. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  171. airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
  172. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
  173. airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
  174. airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
  175. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
  176. airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
  177. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
  178. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
  179. airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
  180. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
  181. airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
  182. airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
  183. airbyte_cdk/sources/declarative/spec/spec.py +48 -0
  184. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
  185. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
  186. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
  187. airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
  188. airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
  189. airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
  190. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  191. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  192. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  193. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  194. airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
  195. airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
  196. airbyte_cdk/sources/declarative/types.py +25 -0
  197. airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
  198. airbyte_cdk/sources/file_based/README.md +152 -0
  199. airbyte_cdk/sources/file_based/__init__.py +24 -0
  200. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
  201. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
  202. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
  203. airbyte_cdk/sources/file_based/config/__init__.py +0 -0
  204. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
  205. airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
  206. airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
  207. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  208. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
  209. airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
  210. airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
  211. airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
  212. airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
  213. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
  214. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
  215. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
  216. airbyte_cdk/sources/file_based/exceptions.py +159 -0
  217. airbyte_cdk/sources/file_based/file_based_source.py +466 -0
  218. airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
  219. airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
  220. airbyte_cdk/sources/file_based/file_record_data.py +22 -0
  221. airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
  222. airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
  223. airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
  224. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  225. airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
  226. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
  227. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
  228. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
  229. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
  230. airbyte_cdk/sources/file_based/remote_file.py +18 -0
  231. airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
  232. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
  233. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
  234. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
  235. airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
  236. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
  237. airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
  238. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
  239. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
  240. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
  241. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
  242. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
  243. airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
  244. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
  245. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
  246. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
  247. airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
  248. airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
  249. airbyte_cdk/sources/file_based/types.py +10 -0
  250. airbyte_cdk/sources/http_config.py +10 -0
  251. airbyte_cdk/sources/http_logger.py +55 -0
  252. airbyte_cdk/sources/message/__init__.py +19 -0
  253. airbyte_cdk/sources/message/repository.py +137 -0
  254. airbyte_cdk/sources/source.py +95 -0
  255. airbyte_cdk/sources/specs/transfer_modes.py +26 -0
  256. airbyte_cdk/sources/streams/__init__.py +8 -0
  257. airbyte_cdk/sources/streams/availability_strategy.py +84 -0
  258. airbyte_cdk/sources/streams/call_rate.py +704 -0
  259. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  260. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  261. airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
  262. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  263. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  264. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  265. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  266. airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
  267. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
  268. airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
  269. airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
  270. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
  271. airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
  272. airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
  273. airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
  274. airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
  275. airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
  276. airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
  277. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
  278. airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
  279. airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
  280. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
  281. airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
  282. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  283. airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
  284. airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
  285. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
  286. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
  287. airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
  288. airbyte_cdk/sources/streams/core.py +703 -0
  289. airbyte_cdk/sources/streams/http/__init__.py +10 -0
  290. airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
  291. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  292. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  293. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  294. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  295. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  296. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  297. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  298. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  299. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  300. airbyte_cdk/sources/streams/http/exceptions.py +61 -0
  301. airbyte_cdk/sources/streams/http/http.py +673 -0
  302. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  303. airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
  304. airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
  305. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
  306. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
  307. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
  308. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
  309. airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
  310. airbyte_cdk/sources/streams/utils/__init__.py +3 -0
  311. airbyte_cdk/sources/types.py +169 -0
  312. airbyte_cdk/sources/utils/__init__.py +7 -0
  313. airbyte_cdk/sources/utils/casing.py +12 -0
  314. airbyte_cdk/sources/utils/files_directory.py +15 -0
  315. airbyte_cdk/sources/utils/record_helper.py +53 -0
  316. airbyte_cdk/sources/utils/schema_helpers.py +230 -0
  317. airbyte_cdk/sources/utils/slice_logger.py +57 -0
  318. airbyte_cdk/sources/utils/transform.py +277 -0
  319. airbyte_cdk/sources/utils/types.py +7 -0
  320. airbyte_cdk/sql/__init__.py +0 -0
  321. airbyte_cdk/sql/_util/__init__.py +0 -0
  322. airbyte_cdk/sql/_util/hashing.py +34 -0
  323. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  324. airbyte_cdk/sql/constants.py +32 -0
  325. airbyte_cdk/sql/exceptions.py +235 -0
  326. airbyte_cdk/sql/secrets.py +123 -0
  327. airbyte_cdk/sql/shared/__init__.py +15 -0
  328. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  329. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  330. airbyte_cdk/sql/types.py +160 -0
  331. airbyte_cdk/test/__init__.py +7 -0
  332. airbyte_cdk/test/catalog_builder.py +81 -0
  333. airbyte_cdk/test/entrypoint_wrapper.py +250 -0
  334. airbyte_cdk/test/mock_http/__init__.py +6 -0
  335. airbyte_cdk/test/mock_http/matcher.py +41 -0
  336. airbyte_cdk/test/mock_http/mocker.py +185 -0
  337. airbyte_cdk/test/mock_http/request.py +103 -0
  338. airbyte_cdk/test/mock_http/response.py +28 -0
  339. airbyte_cdk/test/mock_http/response_builder.py +237 -0
  340. airbyte_cdk/test/state_builder.py +33 -0
  341. airbyte_cdk/test/utils/__init__.py +1 -0
  342. airbyte_cdk/test/utils/data.py +24 -0
  343. airbyte_cdk/test/utils/http_mocking.py +16 -0
  344. airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
  345. airbyte_cdk/test/utils/reading.py +26 -0
  346. airbyte_cdk/utils/__init__.py +10 -0
  347. airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
  348. airbyte_cdk/utils/analytics_message.py +25 -0
  349. airbyte_cdk/utils/constants.py +5 -0
  350. airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
  351. airbyte_cdk/utils/datetime_helpers.py +499 -0
  352. airbyte_cdk/utils/event_timing.py +85 -0
  353. airbyte_cdk/utils/is_cloud_environment.py +18 -0
  354. airbyte_cdk/utils/mapping_helpers.py +162 -0
  355. airbyte_cdk/utils/message_utils.py +26 -0
  356. airbyte_cdk/utils/oneof_option_config.py +33 -0
  357. airbyte_cdk/utils/print_buffer.py +75 -0
  358. airbyte_cdk/utils/schema_inferrer.py +270 -0
  359. airbyte_cdk/utils/slice_hasher.py +37 -0
  360. airbyte_cdk/utils/spec_schema_transformations.py +26 -0
  361. airbyte_cdk/utils/stream_status_utils.py +43 -0
  362. airbyte_cdk/utils/traced_exception.py +145 -0
  363. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
  364. airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
  365. airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
  366. airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
  367. airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
  368. airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,162 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+
6
+ import copy
7
+ from typing import Any, Dict, List, Mapping, Optional, Union
8
+
9
+ from airbyte_cdk.sources.declarative.requesters.request_option import (
10
+ RequestOption,
11
+ RequestOptionType,
12
+ )
13
+ from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
14
+
15
+
16
+ def _merge_mappings(
17
+ target: Dict[str, Any],
18
+ source: Mapping[str, Any],
19
+ path: Optional[List[str]] = None,
20
+ allow_same_value_merge: bool = False,
21
+ ) -> None:
22
+ """
23
+ Recursively merge two dictionaries, raising an error if there are any conflicts.
24
+ For body_json requests (allow_same_value_merge=True), a conflict occurs only when the same path has different values.
25
+ For other request types (allow_same_value_merge=False), any duplicate key is a conflict, regardless of value.
26
+
27
+ Args:
28
+ target: The dictionary to merge into
29
+ source: The dictionary to merge from
30
+ path: The current path in the nested structure (for error messages)
31
+ allow_same_value_merge: Whether to allow merging the same value into the same key. Set to false by default, should only be true for body_json injections
32
+ """
33
+ path = path or []
34
+ for key, source_value in source.items():
35
+ current_path = path + [str(key)]
36
+
37
+ if key in target:
38
+ target_value = target[key]
39
+ if isinstance(target_value, dict) and isinstance(source_value, dict):
40
+ # Only body_json supports nested_structures
41
+ if not allow_same_value_merge:
42
+ raise ValueError(
43
+ f"Request body collision, duplicate keys detected at key path: {'.'.join(current_path)}. Please ensure that all keys in the request are unique."
44
+ )
45
+ # If both are dictionaries, recursively merge them
46
+ _merge_mappings(target_value, source_value, current_path, allow_same_value_merge)
47
+
48
+ elif not allow_same_value_merge or target_value != source_value:
49
+ # If same key has different values, that's a conflict
50
+ raise ValueError(
51
+ f"Request body collision, duplicate keys detected at key path: {'.'.join(current_path)}. Please ensure that all keys in the request are unique."
52
+ )
53
+ else:
54
+ # No conflict, just copy the value (using deepcopy for nested structures)
55
+ target[key] = copy.deepcopy(source_value)
56
+
57
+
58
def combine_mappings(
    mappings: List[Optional[Union[Mapping[str, Any], str]]],
    allow_same_value_merge: bool = False,
) -> Union[Mapping[str, Any], str]:
    """
    Collapse a list of request-option inputs into a single value.

    ``None`` entries and empty mappings are ignored. A lone string entry is
    returned unchanged; otherwise every remaining mapping is merged into one
    fresh dictionary.

    With ``allow_same_value_merge=True`` (body_json requests):
    - Nested structures are merged (e.g., {"data": {"user": {"id": 1}}})
    - A key may repeat as long as its value is the same
    - The same path with different values raises an error

    With ``allow_same_value_merge=False`` (all other request types):
    - Only flat structures are supported
    - Any repeated key raises an error, regardless of value

    Args:
        mappings: List of mappings (or strings / None) to combine
        allow_same_value_merge: Whether repeated keys with matching values are accepted.
            Should only be True for body_json requests.

    Returns:
        The combined mapping, or the string itself when the input holds exactly
        one string and no other non-empty entries.

    Raises:
        ValueError: If there are:
            - Multiple string entries
            - A string entry together with a non-empty mapping
            - Conflicting keys/paths according to allow_same_value_merge
    """
    if not mappings:
        return {}

    # None is never a str, so this counts exactly the string options.
    string_entries = [entry for entry in mappings if isinstance(entry, str)]
    if len(string_entries) > 1:
        raise ValueError("Cannot combine multiple string options")

    # Keep everything except None and empty mappings.
    candidates = [
        entry
        for entry in mappings
        if entry is not None and (not isinstance(entry, Mapping) or entry)
    ]

    # A string cannot be merged with anything else; it must stand alone.
    if string_entries:
        if len(candidates) > 1:
            raise ValueError("Cannot combine multiple options if one is a string")
        return string_entries[0]

    merged: Dict[str, Any] = {}
    for entry in candidates:
        if not entry or not isinstance(entry, Mapping):
            continue
        _merge_mappings(merged, entry, allow_same_value_merge=allow_same_value_merge)

    return merged
115
+
116
+
117
+ def _validate_component_request_option_paths(
118
+ config: Config, *request_options: Optional[RequestOption]
119
+ ) -> None:
120
+ """
121
+ Validates that a component with multiple request options does not have conflicting paths.
122
+ Uses dummy values for validation since actual values might not be available at init time.
123
+ """
124
+ grouped_options: Dict[RequestOptionType, List[RequestOption]] = {}
125
+ for option in request_options:
126
+ if option:
127
+ grouped_options.setdefault(option.inject_into, []).append(option)
128
+
129
+ for inject_type, options in grouped_options.items():
130
+ if len(options) <= 1:
131
+ continue
132
+
133
+ option_dicts: List[Optional[Union[Mapping[str, Any], str]]] = []
134
+ for i, option in enumerate(options):
135
+ option_dict: Dict[str, Any] = {}
136
+ # Use indexed dummy values to ensure we catch conflicts
137
+ option.inject_into_request(option_dict, f"dummy_value_{i}", config)
138
+ option_dicts.append(option_dict)
139
+
140
+ try:
141
+ combine_mappings(
142
+ option_dicts, allow_same_value_merge=(inject_type == RequestOptionType.body_json)
143
+ )
144
+ except ValueError as error:
145
+ raise ValueError(error)
146
+
147
+
148
def get_interpolation_context(
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    """
    Build the context dictionary handed to interpolation.

    The result always holds ``stream_slice`` and ``next_page_token``. When the
    slice is not None and exposes an ``extra_fields`` attribute, those fields
    are spread on top of the two base entries.

    Args:
        stream_state: Accepted as part of the signature; not included in the returned context
        stream_slice: The current slice, stored under "stream_slice"
        next_page_token: The pagination token, stored under "next_page_token"

    Returns:
        A new mapping containing the base entries plus any extra fields of the slice.
    """
    base_context: Dict[str, Any] = {
        "stream_slice": stream_slice,
        "next_page_token": next_page_token,
    }

    if stream_slice is None or not hasattr(stream_slice, "extra_fields"):
        return base_context

    # update the context with extra fields, if passed.
    return {**base_context, **stream_slice.extra_fields}
@@ -0,0 +1,26 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ from airbyte_cdk.models import AirbyteMessage, Type
4
+ from airbyte_cdk.sources.connector_state_manager import HashableStreamDescriptor
5
+
6
+
7
def get_stream_descriptor(message: AirbyteMessage) -> HashableStreamDescriptor:
    """
    Return the stream descriptor (name and namespace) a RECORD or STATE message refers to.

    :param message: the message to inspect
    :return: a hashable descriptor built from the record, or from the per-stream state
    :raises ValueError: for a STATE message without a stream or stream descriptor
    :raises NotImplementedError: for any message type other than RECORD and STATE
    """
    message_type = message.type

    if message_type == Type.RECORD:
        record = message.record
        return HashableStreamDescriptor(
            name=record.stream,  # type: ignore[union-attr] # record has `stream`
            namespace=record.namespace,  # type: ignore[union-attr] # record has `namespace`
        )

    if message_type == Type.STATE:
        stream_state = message.state.stream  # type: ignore[union-attr] # state has `stream`
        if not stream_state or not stream_state.stream_descriptor:
            raise ValueError(
                "State message was not in per-stream state format, which is required for record counts."
            )
        descriptor = stream_state.stream_descriptor
        return HashableStreamDescriptor(
            name=descriptor.name,
            namespace=descriptor.namespace,
        )

    raise NotImplementedError(
        f"get_stream_descriptor is not implemented for message type '{message.type}'."
    )
@@ -0,0 +1,33 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from typing import Any, Dict
6
+
7
+
8
class OneOfOptionConfig:
    """
    Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.

    Inherit from this class in the nested Config class in a model and set title and description (these show up in the UI) and discriminator (this makes sure the discriminator field is marked as required in the schema).

    Usage:

        ```python
        class OptionModel(BaseModel):
            mode: Literal["option_a"] = Field("option_a", const=True)
            option_a_field: str = Field(...)

            class Config(OneOfOptionConfig):
                title = "Option A"
                description = "Option A description"
                discriminator = "mode"
        ```
    """

    @staticmethod
    def schema_extra(schema: Dict[str, Any], model: Any) -> None:
        """
        Post-process the JSON schema generated for `model`, in place.

        - copies `model.Config.description` into the schema when it is set
        - adds `model.Config.discriminator` to the schema's `required` list when it is set

        :param schema: the generated schema; mutated in place
        :param model: the model class whose nested `Config` carries the settings
        """
        config = model.Config

        if hasattr(config, "description"):
            schema["description"] = config.description

        if hasattr(config, "discriminator"):
            required = schema.setdefault("required", [])
            # JSON Schema requires the entries of `required` to be unique, so the discriminator
            # is only added when the schema does not list it already
            if config.discriminator not in required:
                required.append(config.discriminator)
@@ -0,0 +1,75 @@
1
+ # Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2
+
3
+ import sys
4
+ import time
5
+ from io import StringIO
6
+ from threading import RLock
7
+ from types import TracebackType
8
+ from typing import Optional
9
+
10
+
11
class PrintBuffer:
    """
    A class to buffer print statements and flush them at a specified interval.

    The PrintBuffer class is designed to capture and buffer output that would
    normally be printed to the standard output (stdout). This can be useful for
    scenarios where you want to minimize the number of I/O operations by grouping
    multiple print statements together and flushing them as a single operation.

    When used as a context manager, both `sys.stdout` and `sys.stderr` are replaced by this
    object, so everything written to either stream ends up in the same buffer and is flushed
    to the original standard output (`sys.__stdout__`).

    Attributes:
        buffer (StringIO): A buffer to store the messages before flushing.
        flush_interval (float): The time interval (in seconds) after which the buffer is flushed.
        last_flush_time (float): The last time the buffer was flushed.
        lock (RLock): A reentrant lock guarding `write` and `flush`.

    Methods:
        write(message: str) -> None:
            Writes a message to the buffer and flushes if the interval has passed.

        flush() -> None:
            Flushes the buffer content to the standard output.

        __enter__() -> "PrintBuffer":
            Enters the runtime context related to this object, redirecting stdout and stderr.

        __exit__(exc_type, exc_val, exc_tb) -> None:
            Exits the runtime context and restores the original stdout and stderr.
    """

    def __init__(self, flush_interval: float = 0.1):
        self.buffer = StringIO()
        self.flush_interval = flush_interval
        # A monotonic clock is used so that wall-clock adjustments do not affect the interval
        self.last_flush_time = time.monotonic()
        self.lock = RLock()

    def write(self, message: str) -> None:
        """Append `message` to the buffer and flush when `flush_interval` seconds have elapsed."""
        with self.lock:
            self.buffer.write(message)
            current_time = time.monotonic()
            if (current_time - self.last_flush_time) >= self.flush_interval:
                # `flush` takes the same lock again, which is why the lock is reentrant
                self.flush()
                self.last_flush_time = current_time

    def flush(self) -> None:
        """Write everything buffered so far to `sys.__stdout__` and start a new empty buffer."""
        with self.lock:
            combined_message = self.buffer.getvalue()
            sys.__stdout__.write(combined_message)  # type: ignore[union-attr]
            self.buffer = StringIO()

    def __enter__(self) -> "PrintBuffer":
        """Remember the current stdout/stderr and replace both with this buffer."""
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        # Used to disable buffering during the pytest session, because it is not compatible with capsys
        if "pytest" not in str(type(sys.stdout)).lower():
            sys.stdout = self
            sys.stderr = self
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        """Flush the remaining output and put the previous stdout/stderr back."""
        try:
            self.flush()
        finally:
            # Restore the streams even when the final flush fails; otherwise the process would
            # keep writing into this buffer after the `with` block has ended
            sys.stdout, sys.stderr = self.old_stdout, self.old_stderr
@@ -0,0 +1,270 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ from collections import defaultdict
6
+ from typing import Any, Dict, List, Mapping, Optional
7
+
8
+ from genson import SchemaBuilder, SchemaNode
9
+ from genson.schema.strategies.object import Object
10
+ from genson.schema.strategies.scalar import Number
11
+
12
+ from airbyte_cdk.models import AirbyteRecordMessage
13
+
14
# JSON schema keywords (and keyword values) referenced while post-processing inferred schemas
_TYPE = "type"
_NULL_TYPE = "null"  # value of `type` for null
_OBJECT_TYPE = "object"  # value of `type` for objects
_ANY_OF = "anyOf"
_ITEMS = "items"
_PROPERTIES = "properties"
_REQUIRED = "required"
22
+
23
+
24
class NoRequiredObj(Object):
    """
    Object strategy that behaves like genson's `Object` but drops the "required" list from
    the schema it generates.

    The reasoning is that having read all the data from a source does not guarantee that a
    record added later will contain the same fields, so no field is reported as required.
    """

    def to_schema(self) -> Mapping[str, Any]:
        """Return the schema built by the parent strategy, without its "required" entry."""
        generated: Dict[str, Any] = super().to_schema()
        if "required" in generated:
            del generated["required"]
        return generated
37
+
38
+
39
class IntegerToNumber(Number):
    """
    Number strategy that starts out with the "number" type, so that integers are reported as
    "number" as well.
    """

    def __init__(self, node_class: SchemaNode):
        super(IntegerToNumber, self).__init__(node_class)
        # Overrides the type set by the parent initializer
        # NOTE(review): relies on genson's `Number` storing its type in `_type` - confirm when upgrading genson
        self._type = "number"
47
+
48
+
49
class NoRequiredSchemaBuilder(SchemaBuilder):
    """Schema builder that registers the `NoRequiredObj` and `IntegerToNumber` strategies."""

    EXTRA_STRATEGIES = (NoRequiredObj, IntegerToNumber)
51
+
52
+
53
# This type is inferred from the genson lib, but there is no alias provided for it - creating it here for type safety
InferredSchema = Dict[str, Any]


class SchemaValidationException(Exception):
    """
    Raised when an inferred schema cannot satisfy the requested constraints.

    Carries the schema that was being processed together with every error that was collected,
    so that callers can report all problems at once.
    """

    @classmethod
    def merge_exceptions(
        cls, exceptions: List["SchemaValidationException"]
    ) -> "SchemaValidationException":
        """
        Combine several exceptions into a single one holding all of their validation errors.

        :param exceptions: the exceptions to merge; must contain at least one element
        :return: a new exception using the schema of the first element
        """
        # We assume the schema is the same for all SchemaValidationException
        return SchemaValidationException(
            exceptions[0].schema,
            [error for exception in exceptions for error in exception._validation_errors],
        )

    def __init__(self, schema: InferredSchema, validation_errors: List[Exception]):
        # Forward the arguments to `Exception` so that `args` is populated; without this the
        # exception has empty `args` and cannot be re-created from them (e.g. when copied)
        super().__init__(schema, validation_errors)
        self._schema = schema
        self._validation_errors = validation_errors

    def __str__(self) -> str:
        # Give tracebacks and logs a readable message instead of an empty string
        details = "; ".join(self.validation_errors)
        if details:
            return f"Schema validation failed: {details}"
        return "Schema validation failed"

    @property
    def schema(self) -> InferredSchema:
        """The schema that failed validation."""
        return self._schema

    @property
    def validation_errors(self) -> List[str]:
        """The collected errors, rendered as strings."""
        return [str(error) for error in self._validation_errors]
79
+
80
+
81
class SchemaInferrer:
    """
    This class is used to infer a JSON schema which fits all the records passed into it
    throughout its lifecycle via the accumulate method.

    Instances of this class are stateful, meaning they build their inferred schemas
    from every record passed into the accumulate method. One schema builder is kept per
    stream name.
    """

    stream_to_builder: Dict[str, SchemaBuilder]

    def __init__(
        self, pk: Optional[List[List[str]]] = None, cursor_field: Optional[List[List[str]]] = None
    ) -> None:
        """
        :param pk: paths of the primary key fields; each path is the list of nested keys leading to a field
        :param cursor_field: paths of the cursor fields, in the same format as `pk`
        """
        self.stream_to_builder = defaultdict(NoRequiredSchemaBuilder)
        self._pk = [] if pk is None else pk
        self._cursor_field = [] if cursor_field is None else cursor_field

    def accumulate(self, record: AirbyteRecordMessage) -> None:
        """Uses the input record to add to the inferred schemas maintained by this object"""
        self.stream_to_builder[record.stream].add_object(record.data)

    def _null_type_in_any_of(self, node: InferredSchema) -> bool:
        """Tell whether `node` has an `anyOf` containing the bare `{"type": "null"}` option."""
        return _ANY_OF in node and {_TYPE: _NULL_TYPE} in node[_ANY_OF]

    def _remove_type_from_any_of(self, node: InferredSchema) -> None:
        """Drop `type` from a node that still carries `anyOf`."""
        if _ANY_OF in node:
            node.pop(_TYPE, None)

    def _clean_any_of(self, node: InferredSchema) -> None:
        """
        Simplify a two-option `anyOf`.

        When one of the two options is the bare null type, the node is replaced by the other
        option made nullable. Otherwise a temporary `type` is set on the node; `_clean` removes
        it again once the remaining checks have run.
        """
        options = node[_ANY_OF]
        if len(options) != 2:
            return

        if self._null_type_in_any_of(node):
            real_type = options[1] if options[0].get(_TYPE) == _NULL_TYPE else options[0]
            node.update(real_type)
            node[_TYPE] = [node[_TYPE], _NULL_TYPE]
            node.pop(_ANY_OF)
        else:
            # populate `type` for `anyOf` if it's not present to pass all other checks
            node[_TYPE] = [_NULL_TYPE]

    def _clean_properties(self, node: InferredSchema) -> None:
        """Remove the properties whose type is exactly "null" and clean all the others."""
        # Iterate over a copy because properties are removed while looping
        for key, value in list(node[_PROPERTIES].items()):
            if isinstance(value, dict) and value.get(_TYPE, None) == _NULL_TYPE:
                node[_PROPERTIES].pop(key)
            else:
                self._clean(value)

    def _ensure_null_type_on_top(self, node: InferredSchema) -> None:
        """Make the node nullable, keeping "null" as the last entry of its type list."""
        if _TYPE not in node:
            # Nodes without a type (an `anyOf` with more than two options, or an empty
            # schema) have nothing to reorder; leave them untouched instead of failing
            return

        if isinstance(node[_TYPE], list):
            if _NULL_TYPE in node[_TYPE]:
                # we want to make sure null is always at the end as it makes schemas more readable
                node[_TYPE].remove(_NULL_TYPE)
            node[_TYPE].append(_NULL_TYPE)
        else:
            node[_TYPE] = [node[_TYPE], _NULL_TYPE]

    def _clean(self, node: InferredSchema) -> InferredSchema:
        """
        Recursively cleans up a produced schema:
        - remove anyOf if one of them is just a null value
        - remove properties of type "null"
        - make every typed node nullable, with "null" listed last

        The node is modified in place and returned.
        """

        if isinstance(node, dict):
            if _ANY_OF in node:
                self._clean_any_of(node)

            if _PROPERTIES in node and isinstance(node[_PROPERTIES], dict):
                self._clean_properties(node)

            if _ITEMS in node:
                self._clean(node[_ITEMS])

            # this check needs to follow the "anyOf" cleaning as it might populate `type`
            self._ensure_null_type_on_top(node)

            # remove added `type: ["null"]` for `anyOf` nested node
            self._remove_type_from_any_of(node)

        return node

    def _add_required_properties(self, node: InferredSchema) -> InferredSchema:
        """
        This method takes properties that should be marked as required (self._pk and self._cursor_field) and travels the schema to mark
        every node on their paths as required.

        :raises SchemaValidationException: when at least one path cannot be marked as required; all failures are reported together
        """
        # Removing nullable for the root as when we call `_clean`, we make everything nullable
        node[_TYPE] = _OBJECT_TYPE

        exceptions = []
        for field in [x for x in [self._pk, self._cursor_field] if x]:
            try:
                self._add_fields_as_required(node, field)
            except SchemaValidationException as exception:
                exceptions.append(exception)

        if exceptions:
            raise SchemaValidationException.merge_exceptions(exceptions)

        return node

    def _add_fields_as_required(self, node: InferredSchema, composite_key: List[List[str]]) -> None:
        """
        Take a list of nested keys (this list represents a composite key) and travel the schema to mark every node as required.

        :raises SchemaValidationException: carrying one error per path that could not be marked
        """
        errors: List[Exception] = []

        for path in composite_key:
            try:
                self._add_field_as_required(node, path)
            except ValueError as exception:
                errors.append(exception)

        if errors:
            raise SchemaValidationException(node, errors)

    def _add_field_as_required(
        self, node: InferredSchema, path: List[str], traveled_path: Optional[List[str]] = None
    ) -> None:
        """
        Take a nested key and travel the schema to mark every node as required.

        :param node: schema node currently being visited
        :param path: remaining keys to follow from `node`
        :param traveled_path: keys already followed; only used in error messages
        :raises ValueError: when `path` cannot be followed through `node`
        """
        self._remove_null_from_type(node)
        if self._is_leaf(path):
            return

        if not traveled_path:
            traveled_path = []

        if _PROPERTIES not in node:
            raise ValueError(
                f"Path {traveled_path} does not refer to an object but is `{node}` and hence {path} can't be marked as required."
            )

        next_node = path[0]
        if next_node not in node[_PROPERTIES]:
            raise ValueError(
                f"Path {traveled_path} does not have field `{next_node}` in the schema and hence can't be marked as required."
            )

        if _TYPE not in node:
            # We do not expect this case to happen but we added a specific error message just in case
            raise ValueError(
                f"Unknown schema error: {traveled_path} is expected to have a type but did not. Schema inferrence is probably broken"
            )

        if node[_TYPE] not in [
            _OBJECT_TYPE,
            [_NULL_TYPE, _OBJECT_TYPE],
            [_OBJECT_TYPE, _NULL_TYPE],
        ]:
            # Report the type of the node that failed the check, not the type of its child
            raise ValueError(
                f"Path {traveled_path} is expected to be an object but was of type `{node[_TYPE]}`"
            )

        if _REQUIRED not in node or not node[_REQUIRED]:
            node[_REQUIRED] = [next_node]
        elif next_node not in node[_REQUIRED]:
            node[_REQUIRED].append(next_node)

        traveled_path.append(next_node)
        self._add_field_as_required(node[_PROPERTIES][next_node], path[1:], traveled_path)

    def _is_leaf(self, path: List[str]) -> bool:
        """A path is exhausted once no key is left to follow."""
        return len(path) == 0

    def _remove_null_from_type(self, node: InferredSchema) -> None:
        """Make the node non-nullable; a type list left with a single entry is collapsed to that entry."""
        # `.get` keeps nodes without a type (e.g. `anyOf` nodes) from raising a KeyError, which
        # would escape the ValueError handling of `_add_fields_as_required`
        node_type = node.get(_TYPE)
        if isinstance(node_type, list):
            if _NULL_TYPE in node_type:
                node_type.remove(_NULL_TYPE)
            if len(node_type) == 1:
                node[_TYPE] = node_type[0]

    def get_stream_schema(self, stream_name: str) -> Optional[InferredSchema]:
        """
        Returns the inferred JSON schema for the specified stream. Might be `None` if there were no records for the given stream name.

        :raises SchemaValidationException: when the primary key or cursor field paths cannot be marked as required
        """
        # Membership test first: indexing the defaultdict would create a builder for unknown streams
        if stream_name not in self.stream_to_builder:
            return None

        inferred = self.stream_to_builder[stream_name].to_schema()
        return self._add_required_properties(self._clean(inferred))
@@ -0,0 +1,37 @@
1
+ import hashlib
2
+ import json
3
+ from typing import Any, Final, Mapping, Optional
4
+
5
+
6
class SliceEncoder(json.JSONEncoder):
    """JSON encoder that also serializes objects exposing a `__json_serializable__()` method."""

    def default(self, obj: Any) -> Any:
        """Return the serializable form of `obj`, or defer to `json.JSONEncoder.default`."""
        try:
            to_serializable = obj.__json_serializable__
        except AttributeError:
            # Let the base class default method raise the TypeError
            return super().default(obj)

        return to_serializable()
13
+
14
+
15
class SliceHasher:
    """Computes 64-bit integer hashes for a stream name and an optional stream slice."""

    _ENCODING: Final = "utf-8"

    @classmethod
    def hash(
        cls,
        stream_name: str = "<stream name not provided>",
        stream_slice: Optional[Mapping[str, Any]] = None,
    ) -> int:
        """
        Hash a stream name together with its slice.

        Note that streams partition with the same slicing value but with different names might collapse if stream name is not provided

        :param stream_name: name of the stream the slice belongs to
        :param stream_slice: the slice to hash; when empty or `None`, only the stream name is hashed
        :return: an integer built from the last 8 bytes of the SHA-256 digest
        :raises ValueError: when the slice cannot be serialized to JSON
        """
        if not stream_slice:
            payload = stream_name.encode(cls._ENCODING)
        else:
            try:
                # Keys are sorted so that the order of the slice entries does not matter
                serialized = json.dumps(stream_slice, sort_keys=True, cls=SliceEncoder)
                payload = f"{stream_name}:{serialized}".encode(cls._ENCODING)
            except TypeError as e:
                raise ValueError(f"Failed to serialize stream slice: {e}")

        # Use last 8 bytes as 64-bit integer for better distribution
        digest = hashlib.sha256(payload).digest()
        return int.from_bytes(digest[-8:], "big")
@@ -0,0 +1,26 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+ import json
6
+ import re
7
+ from typing import Any
8
+
9
+ from jsonschema import RefResolver
10
+
11
+
12
def resolve_refs(schema: dict[str, Any]) -> dict[str, Any]:
    """
    For spec schemas generated using Pydantic models, the resulting JSON schema can contain refs between object
    relationships.

    Every `{"$ref": "#/definitions/..."}` block found in the serialized schema is replaced by
    the definition it points to, and the top-level `definitions` section is removed from the
    result. The input schema itself is not modified.

    :param schema: the JSON schema, possibly containing `#/definitions/` references
    :return: a new schema with those references inlined and without `definitions`
    """
    json_schema_ref_resolver = RefResolver.from_schema(schema)
    # The substitution is textual: it works on the JSON dump of the schema
    str_schema = json.dumps(schema)
    for ref_block in re.findall(r'{"\$ref": "#\/definitions\/.+?(?="})"}', str_schema):
        ref = json.loads(ref_block)["$ref"]
        str_schema = str_schema.replace(
            ref_block, json.dumps(json_schema_ref_resolver.resolve(ref)[1])
        )
    pyschema: dict[str, Any] = json.loads(str_schema)
    # A schema without any reference may not have a `definitions` section at all
    pyschema.pop("definitions", None)
    return pyschema
@@ -0,0 +1,43 @@
1
+ #
2
+ # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
3
+ #
4
+
5
+
6
+ from datetime import datetime
7
+ from typing import List, Optional, Union
8
+
9
+ from airbyte_cdk.models import (
10
+ AirbyteMessage,
11
+ AirbyteStream,
12
+ AirbyteStreamStatus,
13
+ AirbyteStreamStatusReason,
14
+ AirbyteStreamStatusTraceMessage,
15
+ AirbyteTraceMessage,
16
+ StreamDescriptor,
17
+ TraceType,
18
+ )
19
+ from airbyte_cdk.models import Type as MessageType
20
+
21
+
22
def as_airbyte_message(
    stream: Union[AirbyteStream, StreamDescriptor],
    current_status: AirbyteStreamStatus,
    reasons: Optional[List[AirbyteStreamStatusReason]] = None,
) -> AirbyteMessage:
    """
    Wrap a stream status into a TRACE AirbyteMessage for the provided stream.

    :param stream: the stream (or its descriptor) the status applies to; only `name` and `namespace` are read
    :param current_status: the status to report
    :param reasons: optional reasons attached to the status
    :return: a TRACE message of type STREAM_STATUS, emitted at the current time in milliseconds
    """
    emitted_at_millis = datetime.now().timestamp() * 1000.0

    descriptor = StreamDescriptor(name=stream.name, namespace=stream.namespace)
    status_payload = AirbyteStreamStatusTraceMessage(
        stream_descriptor=descriptor,
        status=current_status,
        reasons=reasons,
    )
    trace = AirbyteTraceMessage(
        type=TraceType.STREAM_STATUS,
        emitted_at=emitted_at_millis,
        stream_status=status_payload,
    )

    return AirbyteMessage(type=MessageType.TRACE, trace=trace)