airbyte-cdk 0.72.1__py3-none-any.whl → 6.13.1.dev4106__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (517) hide show
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +421 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1185 -85
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +65 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +25 -3
  79. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  80. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  81. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  82. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +159 -74
  83. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  84. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  85. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  86. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  91. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  92. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  93. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  94. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  95. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  96. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  97. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1319 -603
  98. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  99. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  100. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  101. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1695 -225
  102. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  103. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  104. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  105. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  106. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  107. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  108. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  109. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  122. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  123. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  124. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  125. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  126. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  127. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  134. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  135. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  136. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  140. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  141. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  142. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  143. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  144. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  145. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  146. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  147. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  148. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  149. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +228 -72
  150. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  151. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  152. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  153. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  154. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  155. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  156. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  157. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  158. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  159. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  160. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  163. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  164. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  165. airbyte_cdk/sources/declarative/types.py +19 -110
  166. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  167. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  168. airbyte_cdk/sources/embedded/catalog.py +16 -4
  169. airbyte_cdk/sources/embedded/runner.py +19 -3
  170. airbyte_cdk/sources/embedded/tools.py +5 -2
  171. airbyte_cdk/sources/file_based/README.md +152 -0
  172. airbyte_cdk/sources/file_based/__init__.py +24 -0
  173. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  174. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  175. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  176. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +58 -10
  177. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  178. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  179. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  180. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  181. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  182. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  184. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  185. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  186. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  187. airbyte_cdk/sources/file_based/exceptions.py +52 -15
  188. airbyte_cdk/sources/file_based/file_based_source.py +163 -33
  189. airbyte_cdk/sources/file_based/file_based_stream_reader.py +83 -5
  190. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  191. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  192. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  193. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  194. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  195. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  196. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  197. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  198. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +145 -41
  199. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  200. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  201. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  202. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  204. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  205. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  206. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  210. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  211. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  212. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +175 -45
  213. airbyte_cdk/sources/http_logger.py +8 -3
  214. airbyte_cdk/sources/message/__init__.py +7 -1
  215. airbyte_cdk/sources/message/repository.py +18 -4
  216. airbyte_cdk/sources/source.py +42 -38
  217. airbyte_cdk/sources/streams/__init__.py +2 -2
  218. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  219. airbyte_cdk/sources/streams/call_rate.py +64 -21
  220. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  221. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  222. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  223. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  224. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  225. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  226. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  227. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  228. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  229. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  230. airbyte_cdk/sources/streams/concurrent/cursor.py +298 -42
  231. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  232. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  233. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  234. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  236. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  237. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  238. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  239. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  240. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  241. airbyte_cdk/sources/streams/core.py +412 -87
  242. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  243. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  244. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  245. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  253. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  254. airbyte_cdk/sources/streams/http/http.py +369 -246
  255. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  256. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  257. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  259. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  260. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  261. airbyte_cdk/sources/types.py +154 -0
  262. airbyte_cdk/sources/utils/record_helper.py +36 -21
  263. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  264. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  265. airbyte_cdk/sources/utils/transform.py +54 -20
  266. airbyte_cdk/sql/_util/hashing.py +34 -0
  267. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  268. airbyte_cdk/sql/constants.py +32 -0
  269. airbyte_cdk/sql/exceptions.py +235 -0
  270. airbyte_cdk/sql/secrets.py +123 -0
  271. airbyte_cdk/sql/shared/__init__.py +15 -0
  272. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  273. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  274. airbyte_cdk/sql/types.py +160 -0
  275. airbyte_cdk/test/catalog_builder.py +70 -18
  276. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  277. airbyte_cdk/test/mock_http/__init__.py +1 -1
  278. airbyte_cdk/test/mock_http/matcher.py +6 -0
  279. airbyte_cdk/test/mock_http/mocker.py +57 -10
  280. airbyte_cdk/test/mock_http/request.py +19 -3
  281. airbyte_cdk/test/mock_http/response.py +3 -1
  282. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  283. airbyte_cdk/test/state_builder.py +18 -10
  284. airbyte_cdk/test/utils/__init__.py +1 -0
  285. airbyte_cdk/test/utils/data.py +24 -0
  286. airbyte_cdk/test/utils/http_mocking.py +16 -0
  287. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  288. airbyte_cdk/test/utils/reading.py +26 -0
  289. airbyte_cdk/utils/__init__.py +2 -1
  290. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  291. airbyte_cdk/utils/analytics_message.py +10 -2
  292. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  293. airbyte_cdk/utils/event_timing.py +10 -10
  294. airbyte_cdk/utils/mapping_helpers.py +3 -1
  295. airbyte_cdk/utils/message_utils.py +20 -11
  296. airbyte_cdk/utils/print_buffer.py +75 -0
  297. airbyte_cdk/utils/schema_inferrer.py +198 -28
  298. airbyte_cdk/utils/slice_hasher.py +30 -0
  299. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  300. airbyte_cdk/utils/stream_status_utils.py +8 -1
  301. airbyte_cdk/utils/traced_exception.py +61 -21
  302. airbyte_cdk-6.13.1.dev4106.dist-info/METADATA +109 -0
  303. airbyte_cdk-6.13.1.dev4106.dist-info/RECORD +349 -0
  304. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/WHEEL +1 -2
  305. airbyte_cdk-6.13.1.dev4106.dist-info/entry_points.txt +3 -0
  306. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  307. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  308. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  309. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  311. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  312. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  313. airbyte_cdk/sources/deprecated/client.py +0 -99
  314. airbyte_cdk/sources/singer/__init__.py +0 -8
  315. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  316. airbyte_cdk/sources/singer/source.py +0 -186
  317. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  318. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  319. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  320. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  321. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  322. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  323. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  324. airbyte_cdk/sources/utils/schema_models.py +0 -84
  325. airbyte_cdk-0.72.1.dist-info/METADATA +0 -243
  326. airbyte_cdk-0.72.1.dist-info/RECORD +0 -466
  327. airbyte_cdk-0.72.1.dist-info/top_level.txt +0 -3
  328. source_declarative_manifest/main.py +0 -29
  329. unit_tests/connector_builder/__init__.py +0 -3
  330. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  331. unit_tests/connector_builder/test_message_grouper.py +0 -713
  332. unit_tests/connector_builder/utils.py +0 -27
  333. unit_tests/destinations/test_destination.py +0 -243
  334. unit_tests/singer/test_singer_helpers.py +0 -56
  335. unit_tests/singer/test_singer_source.py +0 -112
  336. unit_tests/sources/__init__.py +0 -0
  337. unit_tests/sources/concurrent_source/__init__.py +0 -3
  338. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  339. unit_tests/sources/declarative/__init__.py +0 -3
  340. unit_tests/sources/declarative/auth/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  342. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  343. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  344. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  345. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  346. unit_tests/sources/declarative/checks/__init__.py +0 -3
  347. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  348. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  349. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  350. unit_tests/sources/declarative/external_component.py +0 -13
  351. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  352. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  353. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  355. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  356. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  357. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  359. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  360. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  361. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  362. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  363. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  364. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  365. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  366. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  367. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  368. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  369. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  370. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1847
  371. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  372. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  373. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  374. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  375. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  376. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  377. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  384. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  385. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  386. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  387. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  388. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  389. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  390. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  391. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  392. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  393. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  394. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  395. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  396. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  397. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  398. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  399. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  400. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  401. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  402. unit_tests/sources/declarative/schema/__init__.py +0 -6
  403. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  404. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  405. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  406. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  407. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  408. unit_tests/sources/declarative/states/__init__.py +0 -3
  409. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  411. unit_tests/sources/declarative/test_create_partial.py +0 -83
  412. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  413. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  414. unit_tests/sources/declarative/test_types.py +0 -39
  415. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  416. unit_tests/sources/file_based/__init__.py +0 -0
  417. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  419. unit_tests/sources/file_based/config/__init__.py +0 -0
  420. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  421. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  422. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  423. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  424. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  425. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  426. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  427. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  428. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  429. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  430. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  431. unit_tests/sources/file_based/helpers.py +0 -70
  432. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  433. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  434. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  435. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  436. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  437. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  438. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  439. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  440. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  441. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  442. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  443. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  444. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  445. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  446. unit_tests/sources/file_based/stream/__init__.py +0 -0
  447. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  449. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  450. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  451. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  452. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  453. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  454. unit_tests/sources/file_based/test_scenarios.py +0 -253
  455. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  456. unit_tests/sources/fixtures/__init__.py +0 -3
  457. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  458. unit_tests/sources/message/__init__.py +0 -0
  459. unit_tests/sources/message/test_repository.py +0 -153
  460. unit_tests/sources/streams/__init__.py +0 -0
  461. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  462. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  464. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  466. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  467. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  469. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  470. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  471. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  472. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  473. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  474. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  475. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  476. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  477. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  478. unit_tests/sources/streams/http/__init__.py +0 -0
  479. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  481. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  482. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  483. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  484. unit_tests/sources/streams/http/test_http.py +0 -635
  485. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  486. unit_tests/sources/streams/test_call_rate.py +0 -300
  487. unit_tests/sources/streams/test_stream_read.py +0 -405
  488. unit_tests/sources/streams/test_streams_core.py +0 -184
  489. unit_tests/sources/test_abstract_source.py +0 -1442
  490. unit_tests/sources/test_concurrent_source.py +0 -112
  491. unit_tests/sources/test_config.py +0 -92
  492. unit_tests/sources/test_connector_state_manager.py +0 -482
  493. unit_tests/sources/test_http_logger.py +0 -252
  494. unit_tests/sources/test_integration_source.py +0 -86
  495. unit_tests/sources/test_source.py +0 -684
  496. unit_tests/sources/test_source_read.py +0 -460
  497. unit_tests/test/__init__.py +0 -0
  498. unit_tests/test/mock_http/__init__.py +0 -0
  499. unit_tests/test/mock_http/test_matcher.py +0 -53
  500. unit_tests/test/mock_http/test_mocker.py +0 -214
  501. unit_tests/test/mock_http/test_request.py +0 -117
  502. unit_tests/test/mock_http/test_response_builder.py +0 -177
  503. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  504. unit_tests/utils/__init__.py +0 -0
  505. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  506. unit_tests/utils/test_mapping_helpers.py +0 -54
  507. unit_tests/utils/test_message_utils.py +0 -91
  508. unit_tests/utils/test_rate_limiting.py +0 -26
  509. unit_tests/utils/test_schema_inferrer.py +0 -202
  510. unit_tests/utils/test_secret_utils.py +0 -135
  511. unit_tests/utils/test_stream_status_utils.py +0 -61
  512. unit_tests/utils/test_traced_exception.py +0 -107
  513. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  514. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  515. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  516. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  517. {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/LICENSE.txt +0 -0
@@ -4,10 +4,11 @@
4
4
 
5
5
  from typing import Any, Dict, List, Literal, Optional, Union
6
6
 
7
- import dpath.util
7
+ import dpath
8
+ from pydantic.v1 import BaseModel, Field
9
+
8
10
  from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
9
11
  from airbyte_cdk.utils.spec_schema_transformations import resolve_refs
10
- from pydantic import BaseModel, Field
11
12
 
12
13
 
13
14
  class SeparatorSplitterConfigModel(BaseModel):
@@ -17,7 +18,11 @@ class SeparatorSplitterConfigModel(BaseModel):
17
18
  title="Separators",
18
19
  description='List of separator strings to split text fields by. The separator itself needs to be wrapped in double quotes, e.g. to split by the dot character, use ".". To split by a newline, use "\\n".',
19
20
  )
20
- keep_separator: bool = Field(default=False, title="Keep separator", description="Whether to keep the separator in the resulting chunks")
21
+ keep_separator: bool = Field(
22
+ default=False,
23
+ title="Keep separator",
24
+ description="Whether to keep the separator in the resulting chunks",
25
+ )
21
26
 
22
27
  class Config(OneOfOptionConfig):
23
28
  title = "By Separator"
@@ -68,18 +73,20 @@ class CodeSplitterConfigModel(BaseModel):
68
73
 
69
74
  class Config(OneOfOptionConfig):
70
75
  title = "By Programming Language"
71
- description = (
72
- "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
73
- )
76
+ description = "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
74
77
  discriminator = "mode"
75
78
 
76
79
 
77
- TextSplitterConfigModel = Union[SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel]
80
+ TextSplitterConfigModel = Union[
81
+ SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel
82
+ ]
78
83
 
79
84
 
80
85
  class FieldNameMappingConfigModel(BaseModel):
81
86
  from_field: str = Field(title="From field name", description="The field name in the source")
82
- to_field: str = Field(title="To field name", description="The field name to use in the destination")
87
+ to_field: str = Field(
88
+ title="To field name", description="The field name to use in the destination"
89
+ )
83
90
 
84
91
 
85
92
  class ProcessingConfigModel(BaseModel):
@@ -132,9 +139,7 @@ class OpenAIEmbeddingConfigModel(BaseModel):
132
139
 
133
140
  class Config(OneOfOptionConfig):
134
141
  title = "OpenAI"
135
- description = (
136
- "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
137
- )
142
+ description = "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
138
143
  discriminator = "mode"
139
144
 
140
145
 
@@ -142,7 +147,10 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
142
147
  mode: Literal["openai_compatible"] = Field("openai_compatible", const=True)
143
148
  api_key: str = Field(title="API key", default="", airbyte_secret=True)
144
149
  base_url: str = Field(
145
- ..., title="Base URL", description="The base URL for your OpenAI-compatible service", examples=["https://your-service-name.com"]
150
+ ...,
151
+ title="Base URL",
152
+ description="The base URL for your OpenAI-compatible service",
153
+ examples=["https://your-service-name.com"],
146
154
  )
147
155
  model_name: str = Field(
148
156
  title="Model name",
@@ -151,7 +159,9 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
151
159
  examples=["text-embedding-ada-002"],
152
160
  )
153
161
  dimensions: int = Field(
154
- title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
162
+ title="Embedding dimensions",
163
+ description="The number of dimensions the embedding model is generating",
164
+ examples=[1536, 384],
155
165
  )
156
166
 
157
167
  class Config(OneOfOptionConfig):
@@ -199,10 +209,16 @@ class FakeEmbeddingConfigModel(BaseModel):
199
209
  class FromFieldEmbeddingConfigModel(BaseModel):
200
210
  mode: Literal["from_field"] = Field("from_field", const=True)
201
211
  field_name: str = Field(
202
- ..., title="Field name", description="Name of the field in the record that contains the embedding", examples=["embedding", "vector"]
212
+ ...,
213
+ title="Field name",
214
+ description="Name of the field in the record that contains the embedding",
215
+ examples=["embedding", "vector"],
203
216
  )
204
217
  dimensions: int = Field(
205
- ..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
218
+ ...,
219
+ title="Embedding dimensions",
220
+ description="The number of dimensions the embedding model is generating",
221
+ examples=[1536, 384],
206
222
  )
207
223
 
208
224
  class Config(OneOfOptionConfig):
@@ -241,7 +257,14 @@ class VectorDBConfigModel(BaseModel):
241
257
  FakeEmbeddingConfigModel,
242
258
  AzureOpenAIEmbeddingConfigModel,
243
259
  OpenAICompatibleEmbeddingConfigModel,
244
- ] = Field(..., title="Embedding", description="Embedding configuration", discriminator="mode", group="embedding", type="object")
260
+ ] = Field(
261
+ ...,
262
+ title="Embedding",
263
+ description="Embedding configuration",
264
+ discriminator="mode",
265
+ group="embedding",
266
+ type="object",
267
+ )
245
268
  processing: ProcessingConfigModel
246
269
  omit_raw_text: bool = Field(
247
270
  default=False,
@@ -264,7 +287,7 @@ class VectorDBConfigModel(BaseModel):
264
287
  @staticmethod
265
288
  def remove_discriminator(schema: Dict[str, Any]) -> None:
266
289
  """pydantic adds "discriminator" to the schema for oneOfs, which is not treated right by the platform as we inline all references"""
267
- dpath.util.delete(schema, "properties/**/discriminator")
290
+ dpath.delete(schema, "properties/**/discriminator")
268
291
 
269
292
  @classmethod
270
293
  def schema(cls, by_alias: bool = True, ref_template: str = "") -> Dict[str, Any]:
@@ -7,14 +7,24 @@ import logging
7
7
  from dataclasses import dataclass
8
8
  from typing import Any, Dict, List, Mapping, Optional, Tuple
9
9
 
10
- import dpath.util
11
- from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel, SeparatorSplitterConfigModel, TextSplitterConfigModel
12
- from airbyte_cdk.destinations.vector_db_based.utils import create_stream_identifier
13
- from airbyte_cdk.models import AirbyteRecordMessage, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode
14
- from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
15
- from langchain.document_loaders.base import Document
10
+ import dpath
16
11
  from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
17
12
  from langchain.utils import stringify_dict
13
+ from langchain_core.documents.base import Document
14
+
15
+ from airbyte_cdk.destinations.vector_db_based.config import (
16
+ ProcessingConfigModel,
17
+ SeparatorSplitterConfigModel,
18
+ TextSplitterConfigModel,
19
+ )
20
+ from airbyte_cdk.destinations.vector_db_based.utils import create_stream_identifier
21
+ from airbyte_cdk.models import (
22
+ AirbyteRecordMessage,
23
+ ConfiguredAirbyteCatalog,
24
+ ConfiguredAirbyteStream,
25
+ DestinationSyncMode,
26
+ )
27
+ from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
18
28
 
19
29
  METADATA_STREAM_FIELD = "_ab_stream"
20
30
  METADATA_RECORD_ID_FIELD = "_ab_record_id"
@@ -30,7 +40,14 @@ class Chunk:
30
40
  embedding: Optional[List[float]] = None
31
41
 
32
42
 
33
- headers_to_split_on = ["(?:^|\n)# ", "(?:^|\n)## ", "(?:^|\n)### ", "(?:^|\n)#### ", "(?:^|\n)##### ", "(?:^|\n)###### "]
43
+ headers_to_split_on = [
44
+ "(?:^|\n)# ",
45
+ "(?:^|\n)## ",
46
+ "(?:^|\n)### ",
47
+ "(?:^|\n)#### ",
48
+ "(?:^|\n)##### ",
49
+ "(?:^|\n)###### ",
50
+ ]
34
51
 
35
52
 
36
53
  class DocumentProcessor:
@@ -64,7 +81,10 @@ class DocumentProcessor:
64
81
  return None
65
82
 
66
83
  def _get_text_splitter(
67
- self, chunk_size: int, chunk_overlap: int, splitter_config: Optional[TextSplitterConfigModel]
84
+ self,
85
+ chunk_size: int,
86
+ chunk_overlap: int,
87
+ splitter_config: Optional[TextSplitterConfigModel],
68
88
  ) -> RecursiveCharacterTextSplitter:
69
89
  if splitter_config is None:
70
90
  splitter_config = SeparatorSplitterConfigModel(mode="separator")
@@ -89,14 +109,20 @@ class DocumentProcessor:
89
109
  return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
90
110
  chunk_size=chunk_size,
91
111
  chunk_overlap=chunk_overlap,
92
- separators=RecursiveCharacterTextSplitter.get_separators_for_language(Language(splitter_config.language)),
112
+ separators=RecursiveCharacterTextSplitter.get_separators_for_language(
113
+ Language(splitter_config.language)
114
+ ),
93
115
  disallowed_special=(),
94
116
  )
95
117
 
96
118
  def __init__(self, config: ProcessingConfigModel, catalog: ConfiguredAirbyteCatalog):
97
- self.streams = {create_stream_identifier(stream.stream): stream for stream in catalog.streams}
119
+ self.streams = {
120
+ create_stream_identifier(stream.stream): stream for stream in catalog.streams
121
+ }
98
122
 
99
- self.splitter = self._get_text_splitter(config.chunk_size, config.chunk_overlap, config.text_splitter)
123
+ self.splitter = self._get_text_splitter(
124
+ config.chunk_size, config.chunk_overlap, config.text_splitter
125
+ )
100
126
  self.text_fields = config.text_fields
101
127
  self.metadata_fields = config.metadata_fields
102
128
  self.field_name_mappings = config.field_name_mappings
@@ -119,10 +145,18 @@ class DocumentProcessor:
119
145
  failure_type=FailureType.config_error,
120
146
  )
121
147
  chunks = [
122
- Chunk(page_content=chunk_document.page_content, metadata=chunk_document.metadata, record=record)
148
+ Chunk(
149
+ page_content=chunk_document.page_content,
150
+ metadata=chunk_document.metadata,
151
+ record=record,
152
+ )
123
153
  for chunk_document in self._split_document(doc)
124
154
  ]
125
- id_to_delete = doc.metadata[METADATA_RECORD_ID_FIELD] if METADATA_RECORD_ID_FIELD in doc.metadata else None
155
+ id_to_delete = (
156
+ doc.metadata[METADATA_RECORD_ID_FIELD]
157
+ if METADATA_RECORD_ID_FIELD in doc.metadata
158
+ else None
159
+ )
126
160
  return chunks, id_to_delete
127
161
 
128
162
  def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]:
@@ -133,11 +167,13 @@ class DocumentProcessor:
133
167
  metadata = self._extract_metadata(record)
134
168
  return Document(page_content=text, metadata=metadata)
135
169
 
136
- def _extract_relevant_fields(self, record: AirbyteRecordMessage, fields: Optional[List[str]]) -> Dict[str, Any]:
170
+ def _extract_relevant_fields(
171
+ self, record: AirbyteRecordMessage, fields: Optional[List[str]]
172
+ ) -> Dict[str, Any]:
137
173
  relevant_fields = {}
138
174
  if fields and len(fields) > 0:
139
175
  for field in fields:
140
- values = dpath.util.values(record.data, field, separator=".")
176
+ values = dpath.values(record.data, field, separator=".")
141
177
  if values and len(values) > 0:
142
178
  relevant_fields[field] = values if len(values) > 1 else values[0]
143
179
  else:
@@ -156,13 +192,16 @@ class DocumentProcessor:
156
192
  stream_identifier = create_stream_identifier(record)
157
193
  current_stream: ConfiguredAirbyteStream = self.streams[stream_identifier]
158
194
  # if the sync mode is deduping, use the primary key to upsert existing records instead of appending new ones
159
- if not current_stream.primary_key or current_stream.destination_sync_mode != DestinationSyncMode.append_dedup:
195
+ if (
196
+ not current_stream.primary_key
197
+ or current_stream.destination_sync_mode != DestinationSyncMode.append_dedup
198
+ ):
160
199
  return None
161
200
 
162
201
  primary_key = []
163
202
  for key in current_stream.primary_key:
164
203
  try:
165
- primary_key.append(str(dpath.util.get(record.data, key)))
204
+ primary_key.append(str(dpath.get(record.data, key)))
166
205
  except KeyError:
167
206
  primary_key.append("__not_found__")
168
207
  stringified_primary_key = "_".join(primary_key)
@@ -7,6 +7,11 @@ from abc import ABC, abstractmethod
7
7
  from dataclasses import dataclass
8
8
  from typing import List, Optional, Union, cast
9
9
 
10
+ from langchain.embeddings.cohere import CohereEmbeddings
11
+ from langchain.embeddings.fake import FakeEmbeddings
12
+ from langchain.embeddings.localai import LocalAIEmbeddings
13
+ from langchain.embeddings.openai import OpenAIEmbeddings
14
+
10
15
  from airbyte_cdk.destinations.vector_db_based.config import (
11
16
  AzureOpenAIEmbeddingConfigModel,
12
17
  CohereEmbeddingConfigModel,
@@ -19,10 +24,6 @@ from airbyte_cdk.destinations.vector_db_based.config import (
19
24
  from airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception
20
25
  from airbyte_cdk.models import AirbyteRecordMessage
21
26
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
22
- from langchain.embeddings.cohere import CohereEmbeddings
23
- from langchain.embeddings.fake import FakeEmbeddings
24
- from langchain.embeddings.localai import LocalAIEmbeddings
25
- from langchain.embeddings.openai import OpenAIEmbeddings
26
27
 
27
28
 
28
29
  @dataclass
@@ -92,7 +93,9 @@ class BaseOpenAIEmbedder(Embedder):
92
93
  batches = create_chunks(documents, batch_size=embedding_batch_size)
93
94
  embeddings: List[Optional[List[float]]] = []
94
95
  for batch in batches:
95
- embeddings.extend(self.embeddings.embed_documents([chunk.page_content for chunk in batch]))
96
+ embeddings.extend(
97
+ self.embeddings.embed_documents([chunk.page_content for chunk in batch])
98
+ )
96
99
  return embeddings
97
100
 
98
101
  @property
@@ -103,13 +106,30 @@ class BaseOpenAIEmbedder(Embedder):
103
106
 
104
107
  class OpenAIEmbedder(BaseOpenAIEmbedder):
105
108
  def __init__(self, config: OpenAIEmbeddingConfigModel, chunk_size: int):
106
- super().__init__(OpenAIEmbeddings(openai_api_key=config.openai_key, max_retries=15, disallowed_special=()), chunk_size) # type: ignore
109
+ super().__init__(
110
+ OpenAIEmbeddings( # type: ignore [call-arg]
111
+ openai_api_key=config.openai_key, max_retries=15, disallowed_special=()
112
+ ),
113
+ chunk_size,
114
+ ) # type: ignore
107
115
 
108
116
 
109
117
  class AzureOpenAIEmbedder(BaseOpenAIEmbedder):
110
118
  def __init__(self, config: AzureOpenAIEmbeddingConfigModel, chunk_size: int):
111
119
  # Azure OpenAI API has — as of 20230927 — a limit of 16 documents per request
112
- super().__init__(OpenAIEmbeddings(openai_api_key=config.openai_key, chunk_size=16, max_retries=15, openai_api_type="azure", openai_api_version="2023-05-15", openai_api_base=config.api_base, deployment=config.deployment, disallowed_special=()), chunk_size) # type: ignore
120
+ super().__init__(
121
+ OpenAIEmbeddings( # type: ignore [call-arg]
122
+ openai_api_key=config.openai_key,
123
+ chunk_size=16,
124
+ max_retries=15,
125
+ openai_api_type="azure",
126
+ openai_api_version="2023-05-15",
127
+ openai_api_base=config.api_base,
128
+ deployment=config.deployment,
129
+ disallowed_special=(),
130
+ ),
131
+ chunk_size,
132
+ ) # type: ignore
113
133
 
114
134
 
115
135
  COHERE_VECTOR_SIZE = 1024
@@ -119,7 +139,9 @@ class CohereEmbedder(Embedder):
119
139
  def __init__(self, config: CohereEmbeddingConfigModel):
120
140
  super().__init__()
121
141
  # Client is set internally
122
- self.embeddings = CohereEmbeddings(cohere_api_key=config.cohere_key, model="embed-english-light-v2.0") # type: ignore
142
+ self.embeddings = CohereEmbeddings(
143
+ cohere_api_key=config.cohere_key, model="embed-english-light-v2.0"
144
+ ) # type: ignore
123
145
 
124
146
  def check(self) -> Optional[str]:
125
147
  try:
@@ -129,7 +151,10 @@ class CohereEmbedder(Embedder):
129
151
  return None
130
152
 
131
153
  def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
132
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
154
+ return cast(
155
+ List[Optional[List[float]]],
156
+ self.embeddings.embed_documents([document.page_content for document in documents]),
157
+ )
133
158
 
134
159
  @property
135
160
  def embedding_dimensions(self) -> int:
@@ -150,7 +175,10 @@ class FakeEmbedder(Embedder):
150
175
  return None
151
176
 
152
177
  def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
153
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
178
+ return cast(
179
+ List[Optional[List[float]]],
180
+ self.embeddings.embed_documents([document.page_content for document in documents]),
181
+ )
154
182
 
155
183
  @property
156
184
  def embedding_dimensions(self) -> int:
@@ -167,11 +195,20 @@ class OpenAICompatibleEmbedder(Embedder):
167
195
  self.config = config
168
196
  # Client is set internally
169
197
  # Always set an API key even if there is none defined in the config because the validator will fail otherwise. Embedding APIs that don't require an API key don't fail if one is provided, so this is not breaking usage.
170
- self.embeddings = LocalAIEmbeddings(model=config.model_name, openai_api_key=config.api_key or "dummy-api-key", openai_api_base=config.base_url, max_retries=15, disallowed_special=()) # type: ignore
198
+ self.embeddings = LocalAIEmbeddings(
199
+ model=config.model_name,
200
+ openai_api_key=config.api_key or "dummy-api-key",
201
+ openai_api_base=config.base_url,
202
+ max_retries=15,
203
+ disallowed_special=(),
204
+ ) # type: ignore
171
205
 
172
206
  def check(self) -> Optional[str]:
173
207
  deployment_mode = os.environ.get("DEPLOYMENT_MODE", "")
174
- if deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE and not self.config.base_url.startswith("https://"):
208
+ if (
209
+ deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE
210
+ and not self.config.base_url.startswith("https://")
211
+ ):
175
212
  return "Base URL must start with https://"
176
213
 
177
214
  try:
@@ -181,7 +218,10 @@ class OpenAICompatibleEmbedder(Embedder):
181
218
  return None
182
219
 
183
220
  def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
184
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
221
+ return cast(
222
+ List[Optional[List[float]]],
223
+ self.embeddings.embed_documents([document.page_content for document in documents]),
224
+ )
185
225
 
186
226
  @property
187
227
  def embedding_dimensions(self) -> int:
@@ -254,8 +294,10 @@ def create_from_config(
254
294
  ],
255
295
  processing_config: ProcessingConfigModel,
256
296
  ) -> Embedder:
257
-
258
297
  if embedding_config.mode == "azure_openai" or embedding_config.mode == "openai":
259
- return cast(Embedder, embedder_map[embedding_config.mode](embedding_config, processing_config.chunk_size))
298
+ return cast(
299
+ Embedder,
300
+ embedder_map[embedding_config.mode](embedding_config, processing_config.chunk_size),
301
+ )
260
302
  else:
261
303
  return cast(Embedder, embedder_map[embedding_config.mode](embedding_config))
@@ -26,12 +26,19 @@ class BaseIntegrationTest(unittest.TestCase):
26
26
  It provides helper methods to create Airbyte catalogs, records and state messages.
27
27
  """
28
28
 
29
- def _get_configured_catalog(self, destination_mode: DestinationSyncMode) -> ConfiguredAirbyteCatalog:
30
- stream_schema = {"type": "object", "properties": {"str_col": {"type": "str"}, "int_col": {"type": "integer"}}}
29
+ def _get_configured_catalog(
30
+ self, destination_mode: DestinationSyncMode
31
+ ) -> ConfiguredAirbyteCatalog:
32
+ stream_schema = {
33
+ "type": "object",
34
+ "properties": {"str_col": {"type": "str"}, "int_col": {"type": "integer"}},
35
+ }
31
36
 
32
37
  overwrite_stream = ConfiguredAirbyteStream(
33
38
  stream=AirbyteStream(
34
- name="mystream", json_schema=stream_schema, supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh]
39
+ name="mystream",
40
+ json_schema=stream_schema,
41
+ supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh],
35
42
  ),
36
43
  primary_key=[["int_col"]],
37
44
  sync_mode=SyncMode.incremental,
@@ -45,7 +52,10 @@ class BaseIntegrationTest(unittest.TestCase):
45
52
 
46
53
  def _record(self, stream: str, str_value: str, int_value: int) -> AirbyteMessage:
47
54
  return AirbyteMessage(
48
- type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0)
55
+ type=Type.RECORD,
56
+ record=AirbyteRecordMessage(
57
+ stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0
58
+ ),
49
59
  )
50
60
 
51
61
  def setUp(self) -> None:
@@ -10,7 +10,11 @@ from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream
10
10
 
11
11
 
12
12
  def format_exception(exception: Exception) -> str:
13
- return str(exception) + "\n" + "".join(traceback.TracebackException.from_exception(exception).format())
13
+ return (
14
+ str(exception)
15
+ + "\n"
16
+ + "".join(traceback.TracebackException.from_exception(exception).format())
17
+ )
14
18
 
15
19
 
16
20
  def create_chunks(iterable: Iterable[Any], batch_size: int) -> Iterator[Tuple[Any, ...]]:
@@ -26,4 +30,6 @@ def create_stream_identifier(stream: Union[AirbyteStream, AirbyteRecordMessage])
26
30
  if isinstance(stream, AirbyteStream):
27
31
  return str(stream.name if stream.namespace is None else f"{stream.namespace}_{stream.name}")
28
32
  else:
29
- return str(stream.stream if stream.namespace is None else f"{stream.namespace}_{stream.stream}")
33
+ return str(
34
+ stream.stream if stream.namespace is None else f"{stream.namespace}_{stream.stream}"
35
+ )
@@ -27,7 +27,12 @@ class Writer:
27
27
  """
28
28
 
29
29
  def __init__(
30
- self, processing_config: ProcessingConfigModel, indexer: Indexer, embedder: Embedder, batch_size: int, omit_raw_text: bool
30
+ self,
31
+ processing_config: ProcessingConfigModel,
32
+ indexer: Indexer,
33
+ embedder: Embedder,
34
+ batch_size: int,
35
+ omit_raw_text: bool,
31
36
  ) -> None:
32
37
  self.processing_config = processing_config
33
38
  self.indexer = indexer
@@ -54,7 +59,9 @@ class Writer:
54
59
  self.indexer.delete(ids, namespace, stream)
55
60
 
56
61
  for (namespace, stream), chunks in self.chunks.items():
57
- embeddings = self.embedder.embed_documents([self._convert_to_document(chunk) for chunk in chunks])
62
+ embeddings = self.embedder.embed_documents(
63
+ [self._convert_to_document(chunk) for chunk in chunks]
64
+ )
58
65
  for i, document in enumerate(chunks):
59
66
  document.embedding = embeddings[i]
60
67
  if self.omit_raw_text:
@@ -63,7 +70,9 @@ class Writer:
63
70
 
64
71
  self._init_batch()
65
72
 
66
- def write(self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]) -> Iterable[AirbyteMessage]:
73
+ def write(
74
+ self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]
75
+ ) -> Iterable[AirbyteMessage]:
67
76
  self.processor = DocumentProcessor(self.processing_config, configured_catalog)
68
77
  self.indexer.pre_sync(configured_catalog)
69
78
  for message in input_messages:
@@ -74,9 +83,19 @@ class Writer:
74
83
  yield message
75
84
  elif message.type == Type.RECORD:
76
85
  record_chunks, record_id_to_delete = self.processor.process(message.record)
77
- self.chunks[(message.record.namespace, message.record.stream)].extend(record_chunks)
86
+ self.chunks[
87
+ ( # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
88
+ message.record.namespace, # type: ignore [union-attr] # record not None
89
+ message.record.stream, # type: ignore [union-attr] # record not None
90
+ )
91
+ ].extend(record_chunks)
78
92
  if record_id_to_delete is not None:
79
- self.ids_to_delete[(message.record.namespace, message.record.stream)].append(record_id_to_delete)
93
+ self.ids_to_delete[
94
+ ( # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
95
+ message.record.namespace, # type: ignore [union-attr] # record not None
96
+ message.record.stream, # type: ignore [union-attr] # record not None
97
+ )
98
+ ].append(record_id_to_delete)
80
99
  self.number_of_chunks += len(record_chunks)
81
100
  if self.number_of_chunks >= self.batch_size:
82
101
  self._process_batch()