airbyte-cdk: airbyte_cdk-0.72.0-py3-none-any.whl → airbyte_cdk-6.17.1.dev0-py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (518)
  1. airbyte_cdk/__init__.py +355 -6
  2. airbyte_cdk/cli/__init__.py +1 -0
  3. airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
  4. airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
  5. airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
  6. airbyte_cdk/config_observation.py +29 -10
  7. airbyte_cdk/connector.py +24 -24
  8. airbyte_cdk/connector_builder/README.md +53 -0
  9. airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
  10. airbyte_cdk/connector_builder/main.py +45 -13
  11. airbyte_cdk/connector_builder/message_grouper.py +189 -50
  12. airbyte_cdk/connector_builder/models.py +3 -2
  13. airbyte_cdk/destinations/__init__.py +4 -3
  14. airbyte_cdk/destinations/destination.py +54 -20
  15. airbyte_cdk/destinations/vector_db_based/README.md +37 -0
  16. airbyte_cdk/destinations/vector_db_based/config.py +40 -17
  17. airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
  18. airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
  19. airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
  20. airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
  21. airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
  22. airbyte_cdk/entrypoint.py +153 -44
  23. airbyte_cdk/exception_handler.py +21 -3
  24. airbyte_cdk/logger.py +30 -44
  25. airbyte_cdk/models/__init__.py +13 -2
  26. airbyte_cdk/models/airbyte_protocol.py +86 -1
  27. airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
  28. airbyte_cdk/models/file_transfer_record_message.py +13 -0
  29. airbyte_cdk/models/well_known_types.py +1 -1
  30. airbyte_cdk/sources/__init__.py +5 -1
  31. airbyte_cdk/sources/abstract_source.py +125 -79
  32. airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
  33. airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
  34. airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
  35. airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
  36. airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
  37. airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
  38. airbyte_cdk/sources/config.py +3 -2
  39. airbyte_cdk/sources/connector_state_manager.py +49 -83
  40. airbyte_cdk/sources/declarative/async_job/job.py +52 -0
  41. airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
  42. airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
  43. airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
  44. airbyte_cdk/sources/declarative/async_job/status.py +24 -0
  45. airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
  46. airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
  47. airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
  48. airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
  49. airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
  50. airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
  51. airbyte_cdk/sources/declarative/auth/token.py +28 -10
  52. airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
  53. airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
  54. airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
  55. airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
  56. airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
  57. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +490 -0
  58. airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
  59. airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
  60. airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1213 -88
  61. airbyte_cdk/sources/declarative/declarative_source.py +5 -2
  62. airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
  63. airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
  64. airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
  65. airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
  66. airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
  67. airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
  68. airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
  69. airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
  70. airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
  71. airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
  72. airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
  73. airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
  74. airbyte_cdk/sources/declarative/extractors/record_filter.py +63 -8
  75. airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
  76. airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
  77. airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
  78. airbyte_cdk/sources/declarative/incremental/__init__.py +31 -3
  79. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +346 -0
  80. airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
  81. airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
  82. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
  83. airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +173 -74
  84. airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
  85. airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
  86. airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
  87. airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
  88. airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
  89. airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
  90. airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
  91. airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
  92. airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
  93. airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
  94. airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
  95. airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
  96. airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
  97. airbyte_cdk/sources/declarative/models/__init__.py +1 -1
  98. airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1329 -595
  99. airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
  100. airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
  101. airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
  102. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1763 -226
  103. airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
  104. airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
  105. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
  106. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
  107. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
  108. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
  109. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
  110. airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
  111. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
  112. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
  113. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
  114. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
  115. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
  116. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
  117. airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
  118. airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
  119. airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
  120. airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
  121. airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
  122. airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
  123. airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
  124. airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
  125. airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
  126. airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
  127. airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
  128. airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
  129. airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
  130. airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
  131. airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
  132. airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
  133. airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
  134. airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
  135. airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
  136. airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
  137. airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
  138. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
  139. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
  140. airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
  141. airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
  142. airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
  143. airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
  144. airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
  145. airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
  146. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
  147. airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
  148. airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
  149. airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
  150. airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +229 -73
  151. airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
  152. airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
  153. airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
  154. airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
  155. airbyte_cdk/sources/declarative/spec/spec.py +12 -5
  156. airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
  157. airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
  158. airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
  159. airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
  160. airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
  161. airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
  162. airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
  163. airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
  164. airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
  165. airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
  166. airbyte_cdk/sources/declarative/types.py +19 -110
  167. airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
  168. airbyte_cdk/sources/embedded/base_integration.py +16 -5
  169. airbyte_cdk/sources/embedded/catalog.py +16 -4
  170. airbyte_cdk/sources/embedded/runner.py +19 -3
  171. airbyte_cdk/sources/embedded/tools.py +5 -2
  172. airbyte_cdk/sources/file_based/README.md +152 -0
  173. airbyte_cdk/sources/file_based/__init__.py +24 -0
  174. airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
  175. airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
  176. airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
  177. airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +47 -10
  178. airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
  179. airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
  180. airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
  181. airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
  182. airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
  183. airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
  184. airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
  185. airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
  186. airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
  187. airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
  188. airbyte_cdk/sources/file_based/exceptions.py +18 -15
  189. airbyte_cdk/sources/file_based/file_based_source.py +140 -33
  190. airbyte_cdk/sources/file_based/file_based_stream_reader.py +69 -5
  191. airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
  192. airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
  193. airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
  194. airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
  195. airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
  196. airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
  197. airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
  198. airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
  199. airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +141 -41
  200. airbyte_cdk/sources/file_based/remote_file.py +1 -1
  201. airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
  202. airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
  203. airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
  204. airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
  205. airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
  206. airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
  207. airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
  208. airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
  209. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
  210. airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
  211. airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
  212. airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
  213. airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +147 -45
  214. airbyte_cdk/sources/http_logger.py +8 -3
  215. airbyte_cdk/sources/message/__init__.py +7 -1
  216. airbyte_cdk/sources/message/repository.py +18 -4
  217. airbyte_cdk/sources/source.py +42 -38
  218. airbyte_cdk/sources/streams/__init__.py +2 -2
  219. airbyte_cdk/sources/streams/availability_strategy.py +54 -3
  220. airbyte_cdk/sources/streams/call_rate.py +64 -21
  221. airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
  222. airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
  223. airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
  224. airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
  225. airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
  226. airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
  227. airbyte_cdk/sources/streams/concurrent/README.md +7 -0
  228. airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
  229. airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
  230. airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
  231. airbyte_cdk/sources/streams/concurrent/cursor.py +298 -42
  232. airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
  233. airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
  234. airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
  235. airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
  236. airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
  237. airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
  238. airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
  239. airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
  240. airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
  241. airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
  242. airbyte_cdk/sources/streams/core.py +412 -87
  243. airbyte_cdk/sources/streams/http/__init__.py +2 -1
  244. airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
  245. airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
  246. airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
  247. airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
  248. airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
  249. airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
  250. airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
  251. airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
  252. airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
  253. airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
  254. airbyte_cdk/sources/streams/http/exceptions.py +27 -7
  255. airbyte_cdk/sources/streams/http/http.py +369 -246
  256. airbyte_cdk/sources/streams/http/http_client.py +531 -0
  257. airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
  258. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
  259. airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
  260. airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
  261. airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
  262. airbyte_cdk/sources/types.py +154 -0
  263. airbyte_cdk/sources/utils/record_helper.py +36 -21
  264. airbyte_cdk/sources/utils/schema_helpers.py +13 -6
  265. airbyte_cdk/sources/utils/slice_logger.py +4 -1
  266. airbyte_cdk/sources/utils/transform.py +54 -20
  267. airbyte_cdk/sql/_util/hashing.py +34 -0
  268. airbyte_cdk/sql/_util/name_normalizers.py +92 -0
  269. airbyte_cdk/sql/constants.py +32 -0
  270. airbyte_cdk/sql/exceptions.py +235 -0
  271. airbyte_cdk/sql/secrets.py +123 -0
  272. airbyte_cdk/sql/shared/__init__.py +15 -0
  273. airbyte_cdk/sql/shared/catalog_providers.py +145 -0
  274. airbyte_cdk/sql/shared/sql_processor.py +786 -0
  275. airbyte_cdk/sql/types.py +160 -0
  276. airbyte_cdk/test/catalog_builder.py +70 -18
  277. airbyte_cdk/test/entrypoint_wrapper.py +117 -42
  278. airbyte_cdk/test/mock_http/__init__.py +1 -1
  279. airbyte_cdk/test/mock_http/matcher.py +6 -0
  280. airbyte_cdk/test/mock_http/mocker.py +57 -10
  281. airbyte_cdk/test/mock_http/request.py +19 -3
  282. airbyte_cdk/test/mock_http/response.py +3 -1
  283. airbyte_cdk/test/mock_http/response_builder.py +32 -16
  284. airbyte_cdk/test/state_builder.py +18 -10
  285. airbyte_cdk/test/utils/__init__.py +1 -0
  286. airbyte_cdk/test/utils/data.py +24 -0
  287. airbyte_cdk/test/utils/http_mocking.py +16 -0
  288. airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
  289. airbyte_cdk/test/utils/reading.py +26 -0
  290. airbyte_cdk/utils/__init__.py +2 -1
  291. airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
  292. airbyte_cdk/utils/analytics_message.py +10 -2
  293. airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
  294. airbyte_cdk/utils/event_timing.py +10 -10
  295. airbyte_cdk/utils/mapping_helpers.py +3 -1
  296. airbyte_cdk/utils/message_utils.py +20 -11
  297. airbyte_cdk/utils/print_buffer.py +75 -0
  298. airbyte_cdk/utils/schema_inferrer.py +198 -28
  299. airbyte_cdk/utils/slice_hasher.py +30 -0
  300. airbyte_cdk/utils/spec_schema_transformations.py +6 -3
  301. airbyte_cdk/utils/stream_status_utils.py +8 -1
  302. airbyte_cdk/utils/traced_exception.py +61 -21
  303. airbyte_cdk-6.17.1.dev0.dist-info/METADATA +109 -0
  304. airbyte_cdk-6.17.1.dev0.dist-info/RECORD +350 -0
  305. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.17.1.dev0.dist-info}/WHEEL +1 -2
  306. airbyte_cdk-6.17.1.dev0.dist-info/entry_points.txt +3 -0
  307. airbyte_cdk/sources/declarative/create_partial.py +0 -92
  308. airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
  309. airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
  310. airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
  311. airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
  312. airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
  313. airbyte_cdk/sources/deprecated/base_source.py +0 -94
  314. airbyte_cdk/sources/deprecated/client.py +0 -99
  315. airbyte_cdk/sources/singer/__init__.py +0 -8
  316. airbyte_cdk/sources/singer/singer_helpers.py +0 -304
  317. airbyte_cdk/sources/singer/source.py +0 -186
  318. airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
  319. airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
  320. airbyte_cdk/sources/streams/http/auth/core.py +0 -29
  321. airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
  322. airbyte_cdk/sources/streams/http/auth/token.py +0 -47
  323. airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
  324. airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
  325. airbyte_cdk/sources/utils/schema_models.py +0 -84
  326. airbyte_cdk-0.72.0.dist-info/METADATA +0 -243
  327. airbyte_cdk-0.72.0.dist-info/RECORD +0 -466
  328. airbyte_cdk-0.72.0.dist-info/top_level.txt +0 -3
  329. source_declarative_manifest/main.py +0 -29
  330. unit_tests/connector_builder/__init__.py +0 -3
  331. unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
  332. unit_tests/connector_builder/test_message_grouper.py +0 -713
  333. unit_tests/connector_builder/utils.py +0 -27
  334. unit_tests/destinations/test_destination.py +0 -243
  335. unit_tests/singer/test_singer_helpers.py +0 -56
  336. unit_tests/singer/test_singer_source.py +0 -112
  337. unit_tests/sources/__init__.py +0 -0
  338. unit_tests/sources/concurrent_source/__init__.py +0 -3
  339. unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
  340. unit_tests/sources/declarative/__init__.py +0 -3
  341. unit_tests/sources/declarative/auth/__init__.py +0 -3
  342. unit_tests/sources/declarative/auth/test_oauth.py +0 -331
  343. unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
  344. unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
  345. unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
  346. unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
  347. unit_tests/sources/declarative/checks/__init__.py +0 -3
  348. unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
  349. unit_tests/sources/declarative/decoders/__init__.py +0 -0
  350. unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
  351. unit_tests/sources/declarative/external_component.py +0 -13
  352. unit_tests/sources/declarative/extractors/__init__.py +0 -3
  353. unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
  354. unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
  355. unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
  356. unit_tests/sources/declarative/incremental/__init__.py +0 -0
  357. unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
  358. unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
  359. unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
  360. unit_tests/sources/declarative/interpolation/__init__.py +0 -3
  361. unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
  362. unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
  363. unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
  364. unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
  365. unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
  366. unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
  367. unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
  368. unit_tests/sources/declarative/parsers/__init__.py +0 -3
  369. unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
  370. unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
  371. unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1841
  372. unit_tests/sources/declarative/parsers/testing_components.py +0 -36
  373. unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
  374. unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
  375. unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
  376. unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
  377. unit_tests/sources/declarative/requesters/__init__.py +0 -3
  378. unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
  379. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
  380. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
  381. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
  382. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
  383. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
  384. unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
  385. unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
  386. unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
  387. unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
  388. unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
  389. unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
  390. unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
  391. unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
  392. unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
  393. unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
  394. unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
  395. unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
  396. unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
  397. unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
  398. unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
  399. unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
  400. unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
  401. unit_tests/sources/declarative/retrievers/__init__.py +0 -3
  402. unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
  403. unit_tests/sources/declarative/schema/__init__.py +0 -6
  404. unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
  405. unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
  406. unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
  407. unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
  408. unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
  409. unit_tests/sources/declarative/states/__init__.py +0 -3
  410. unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
  411. unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
  412. unit_tests/sources/declarative/test_create_partial.py +0 -83
  413. unit_tests/sources/declarative/test_declarative_stream.py +0 -103
  414. unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
  415. unit_tests/sources/declarative/test_types.py +0 -39
  416. unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
  417. unit_tests/sources/file_based/__init__.py +0 -0
  418. unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
  419. unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
  420. unit_tests/sources/file_based/config/__init__.py +0 -0
  421. unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
  422. unit_tests/sources/file_based/config/test_csv_format.py +0 -34
  423. unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
  424. unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
  425. unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
  426. unit_tests/sources/file_based/file_types/__init__.py +0 -0
  427. unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
  428. unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
  429. unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
  430. unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
  431. unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
  432. unit_tests/sources/file_based/helpers.py +0 -70
  433. unit_tests/sources/file_based/in_memory_files_source.py +0 -211
  434. unit_tests/sources/file_based/scenarios/__init__.py +0 -0
  435. unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
  436. unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
  437. unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
  438. unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
  439. unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
  440. unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
  441. unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
  442. unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
  443. unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
  444. unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
  445. unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
  446. unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
  447. unit_tests/sources/file_based/stream/__init__.py +0 -0
  448. unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
  449. unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
  450. unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
  451. unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
  452. unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
  453. unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
  454. unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
  455. unit_tests/sources/file_based/test_scenarios.py +0 -253
  456. unit_tests/sources/file_based/test_schema_helpers.py +0 -346
  457. unit_tests/sources/fixtures/__init__.py +0 -3
  458. unit_tests/sources/fixtures/source_test_fixture.py +0 -153
  459. unit_tests/sources/message/__init__.py +0 -0
  460. unit_tests/sources/message/test_repository.py +0 -153
  461. unit_tests/sources/streams/__init__.py +0 -0
  462. unit_tests/sources/streams/concurrent/__init__.py +0 -3
  463. unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
  464. unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
  465. unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
  466. unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
  467. unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
  468. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
  469. unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
  470. unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
  471. unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
  472. unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
  473. unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
  474. unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
  475. unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
  476. unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
  477. unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
  478. unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
  479. unit_tests/sources/streams/http/__init__.py +0 -0
  480. unit_tests/sources/streams/http/auth/__init__.py +0 -0
  481. unit_tests/sources/streams/http/auth/test_auth.py +0 -173
  482. unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
  483. unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
  484. unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
  485. unit_tests/sources/streams/http/test_http.py +0 -635
  486. unit_tests/sources/streams/test_availability_strategy.py +0 -70
  487. unit_tests/sources/streams/test_call_rate.py +0 -300
  488. unit_tests/sources/streams/test_stream_read.py +0 -405
  489. unit_tests/sources/streams/test_streams_core.py +0 -184
  490. unit_tests/sources/test_abstract_source.py +0 -1442
  491. unit_tests/sources/test_concurrent_source.py +0 -112
  492. unit_tests/sources/test_config.py +0 -92
  493. unit_tests/sources/test_connector_state_manager.py +0 -482
  494. unit_tests/sources/test_http_logger.py +0 -252
  495. unit_tests/sources/test_integration_source.py +0 -86
  496. unit_tests/sources/test_source.py +0 -684
  497. unit_tests/sources/test_source_read.py +0 -460
  498. unit_tests/test/__init__.py +0 -0
  499. unit_tests/test/mock_http/__init__.py +0 -0
  500. unit_tests/test/mock_http/test_matcher.py +0 -53
  501. unit_tests/test/mock_http/test_mocker.py +0 -214
  502. unit_tests/test/mock_http/test_request.py +0 -117
  503. unit_tests/test/mock_http/test_response_builder.py +0 -177
  504. unit_tests/test/test_entrypoint_wrapper.py +0 -240
  505. unit_tests/utils/__init__.py +0 -0
  506. unit_tests/utils/test_datetime_format_inferrer.py +0 -60
  507. unit_tests/utils/test_mapping_helpers.py +0 -54
  508. unit_tests/utils/test_message_utils.py +0 -91
  509. unit_tests/utils/test_rate_limiting.py +0 -26
  510. unit_tests/utils/test_schema_inferrer.py +0 -202
  511. unit_tests/utils/test_secret_utils.py +0 -135
  512. unit_tests/utils/test_stream_status_utils.py +0 -61
  513. unit_tests/utils/test_traced_exception.py +0 -107
  514. /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
  515. {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
  516. {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
  517. {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
  518. {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.17.1.dev0.dist-info}/LICENSE.txt +0 -0
@@ -4,10 +4,11 @@
4
4
 
5
5
  from typing import Any, Dict, List, Literal, Optional, Union
6
6
 
7
- import dpath.util
7
+ import dpath
8
+ from pydantic.v1 import BaseModel, Field
9
+
8
10
  from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
9
11
  from airbyte_cdk.utils.spec_schema_transformations import resolve_refs
10
- from pydantic import BaseModel, Field
11
12
 
12
13
 
13
14
  class SeparatorSplitterConfigModel(BaseModel):
@@ -17,7 +18,11 @@ class SeparatorSplitterConfigModel(BaseModel):
17
18
  title="Separators",
18
19
  description='List of separator strings to split text fields by. The separator itself needs to be wrapped in double quotes, e.g. to split by the dot character, use ".". To split by a newline, use "\\n".',
19
20
  )
20
- keep_separator: bool = Field(default=False, title="Keep separator", description="Whether to keep the separator in the resulting chunks")
21
+ keep_separator: bool = Field(
22
+ default=False,
23
+ title="Keep separator",
24
+ description="Whether to keep the separator in the resulting chunks",
25
+ )
21
26
 
22
27
  class Config(OneOfOptionConfig):
23
28
  title = "By Separator"
@@ -68,18 +73,20 @@ class CodeSplitterConfigModel(BaseModel):
68
73
 
69
74
  class Config(OneOfOptionConfig):
70
75
  title = "By Programming Language"
71
- description = (
72
- "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
73
- )
76
+ description = "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
74
77
  discriminator = "mode"
75
78
 
76
79
 
77
- TextSplitterConfigModel = Union[SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel]
80
+ TextSplitterConfigModel = Union[
81
+ SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel
82
+ ]
78
83
 
79
84
 
80
85
  class FieldNameMappingConfigModel(BaseModel):
81
86
  from_field: str = Field(title="From field name", description="The field name in the source")
82
- to_field: str = Field(title="To field name", description="The field name to use in the destination")
87
+ to_field: str = Field(
88
+ title="To field name", description="The field name to use in the destination"
89
+ )
83
90
 
84
91
 
85
92
  class ProcessingConfigModel(BaseModel):
@@ -132,9 +139,7 @@ class OpenAIEmbeddingConfigModel(BaseModel):
132
139
 
133
140
  class Config(OneOfOptionConfig):
134
141
  title = "OpenAI"
135
- description = (
136
- "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
137
- )
142
+ description = "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
138
143
  discriminator = "mode"
139
144
 
140
145
 
@@ -142,7 +147,10 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
142
147
  mode: Literal["openai_compatible"] = Field("openai_compatible", const=True)
143
148
  api_key: str = Field(title="API key", default="", airbyte_secret=True)
144
149
  base_url: str = Field(
145
- ..., title="Base URL", description="The base URL for your OpenAI-compatible service", examples=["https://your-service-name.com"]
150
+ ...,
151
+ title="Base URL",
152
+ description="The base URL for your OpenAI-compatible service",
153
+ examples=["https://your-service-name.com"],
146
154
  )
147
155
  model_name: str = Field(
148
156
  title="Model name",
@@ -151,7 +159,9 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
151
159
  examples=["text-embedding-ada-002"],
152
160
  )
153
161
  dimensions: int = Field(
154
- title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
162
+ title="Embedding dimensions",
163
+ description="The number of dimensions the embedding model is generating",
164
+ examples=[1536, 384],
155
165
  )
156
166
 
157
167
  class Config(OneOfOptionConfig):
@@ -199,10 +209,16 @@ class FakeEmbeddingConfigModel(BaseModel):
199
209
  class FromFieldEmbeddingConfigModel(BaseModel):
200
210
  mode: Literal["from_field"] = Field("from_field", const=True)
201
211
  field_name: str = Field(
202
- ..., title="Field name", description="Name of the field in the record that contains the embedding", examples=["embedding", "vector"]
212
+ ...,
213
+ title="Field name",
214
+ description="Name of the field in the record that contains the embedding",
215
+ examples=["embedding", "vector"],
203
216
  )
204
217
  dimensions: int = Field(
205
- ..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
218
+ ...,
219
+ title="Embedding dimensions",
220
+ description="The number of dimensions the embedding model is generating",
221
+ examples=[1536, 384],
206
222
  )
207
223
 
208
224
  class Config(OneOfOptionConfig):
@@ -241,7 +257,14 @@ class VectorDBConfigModel(BaseModel):
241
257
  FakeEmbeddingConfigModel,
242
258
  AzureOpenAIEmbeddingConfigModel,
243
259
  OpenAICompatibleEmbeddingConfigModel,
244
- ] = Field(..., title="Embedding", description="Embedding configuration", discriminator="mode", group="embedding", type="object")
260
+ ] = Field(
261
+ ...,
262
+ title="Embedding",
263
+ description="Embedding configuration",
264
+ discriminator="mode",
265
+ group="embedding",
266
+ type="object",
267
+ )
245
268
  processing: ProcessingConfigModel
246
269
  omit_raw_text: bool = Field(
247
270
  default=False,
@@ -264,7 +287,7 @@ class VectorDBConfigModel(BaseModel):
264
287
  @staticmethod
265
288
  def remove_discriminator(schema: Dict[str, Any]) -> None:
266
289
  """pydantic adds "discriminator" to the schema for oneOfs, which is not treated right by the platform as we inline all references"""
267
- dpath.util.delete(schema, "properties/**/discriminator")
290
+ dpath.delete(schema, "properties/**/discriminator")
268
291
 
269
292
  @classmethod
270
293
  def schema(cls, by_alias: bool = True, ref_template: str = "") -> Dict[str, Any]:
@@ -7,14 +7,24 @@ import logging
7
7
  from dataclasses import dataclass
8
8
  from typing import Any, Dict, List, Mapping, Optional, Tuple
9
9
 
10
- import dpath.util
11
- from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel, SeparatorSplitterConfigModel, TextSplitterConfigModel
12
- from airbyte_cdk.destinations.vector_db_based.utils import create_stream_identifier
13
- from airbyte_cdk.models import AirbyteRecordMessage, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode
14
- from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
15
- from langchain.document_loaders.base import Document
10
+ import dpath
16
11
  from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
17
12
  from langchain.utils import stringify_dict
13
+ from langchain_core.documents.base import Document
14
+
15
+ from airbyte_cdk.destinations.vector_db_based.config import (
16
+ ProcessingConfigModel,
17
+ SeparatorSplitterConfigModel,
18
+ TextSplitterConfigModel,
19
+ )
20
+ from airbyte_cdk.destinations.vector_db_based.utils import create_stream_identifier
21
+ from airbyte_cdk.models import (
22
+ AirbyteRecordMessage,
23
+ ConfiguredAirbyteCatalog,
24
+ ConfiguredAirbyteStream,
25
+ DestinationSyncMode,
26
+ )
27
+ from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
18
28
 
19
29
  METADATA_STREAM_FIELD = "_ab_stream"
20
30
  METADATA_RECORD_ID_FIELD = "_ab_record_id"
@@ -30,7 +40,14 @@ class Chunk:
30
40
  embedding: Optional[List[float]] = None
31
41
 
32
42
 
33
- headers_to_split_on = ["(?:^|\n)# ", "(?:^|\n)## ", "(?:^|\n)### ", "(?:^|\n)#### ", "(?:^|\n)##### ", "(?:^|\n)###### "]
43
+ headers_to_split_on = [
44
+ "(?:^|\n)# ",
45
+ "(?:^|\n)## ",
46
+ "(?:^|\n)### ",
47
+ "(?:^|\n)#### ",
48
+ "(?:^|\n)##### ",
49
+ "(?:^|\n)###### ",
50
+ ]
34
51
 
35
52
 
36
53
  class DocumentProcessor:
@@ -64,7 +81,10 @@ class DocumentProcessor:
64
81
  return None
65
82
 
66
83
  def _get_text_splitter(
67
- self, chunk_size: int, chunk_overlap: int, splitter_config: Optional[TextSplitterConfigModel]
84
+ self,
85
+ chunk_size: int,
86
+ chunk_overlap: int,
87
+ splitter_config: Optional[TextSplitterConfigModel],
68
88
  ) -> RecursiveCharacterTextSplitter:
69
89
  if splitter_config is None:
70
90
  splitter_config = SeparatorSplitterConfigModel(mode="separator")
@@ -89,14 +109,20 @@ class DocumentProcessor:
89
109
  return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
90
110
  chunk_size=chunk_size,
91
111
  chunk_overlap=chunk_overlap,
92
- separators=RecursiveCharacterTextSplitter.get_separators_for_language(Language(splitter_config.language)),
112
+ separators=RecursiveCharacterTextSplitter.get_separators_for_language(
113
+ Language(splitter_config.language)
114
+ ),
93
115
  disallowed_special=(),
94
116
  )
95
117
 
96
118
  def __init__(self, config: ProcessingConfigModel, catalog: ConfiguredAirbyteCatalog):
97
- self.streams = {create_stream_identifier(stream.stream): stream for stream in catalog.streams}
119
+ self.streams = {
120
+ create_stream_identifier(stream.stream): stream for stream in catalog.streams
121
+ }
98
122
 
99
- self.splitter = self._get_text_splitter(config.chunk_size, config.chunk_overlap, config.text_splitter)
123
+ self.splitter = self._get_text_splitter(
124
+ config.chunk_size, config.chunk_overlap, config.text_splitter
125
+ )
100
126
  self.text_fields = config.text_fields
101
127
  self.metadata_fields = config.metadata_fields
102
128
  self.field_name_mappings = config.field_name_mappings
@@ -119,10 +145,18 @@ class DocumentProcessor:
119
145
  failure_type=FailureType.config_error,
120
146
  )
121
147
  chunks = [
122
- Chunk(page_content=chunk_document.page_content, metadata=chunk_document.metadata, record=record)
148
+ Chunk(
149
+ page_content=chunk_document.page_content,
150
+ metadata=chunk_document.metadata,
151
+ record=record,
152
+ )
123
153
  for chunk_document in self._split_document(doc)
124
154
  ]
125
- id_to_delete = doc.metadata[METADATA_RECORD_ID_FIELD] if METADATA_RECORD_ID_FIELD in doc.metadata else None
155
+ id_to_delete = (
156
+ doc.metadata[METADATA_RECORD_ID_FIELD]
157
+ if METADATA_RECORD_ID_FIELD in doc.metadata
158
+ else None
159
+ )
126
160
  return chunks, id_to_delete
127
161
 
128
162
  def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]:
@@ -133,11 +167,13 @@ class DocumentProcessor:
133
167
  metadata = self._extract_metadata(record)
134
168
  return Document(page_content=text, metadata=metadata)
135
169
 
136
- def _extract_relevant_fields(self, record: AirbyteRecordMessage, fields: Optional[List[str]]) -> Dict[str, Any]:
170
+ def _extract_relevant_fields(
171
+ self, record: AirbyteRecordMessage, fields: Optional[List[str]]
172
+ ) -> Dict[str, Any]:
137
173
  relevant_fields = {}
138
174
  if fields and len(fields) > 0:
139
175
  for field in fields:
140
- values = dpath.util.values(record.data, field, separator=".")
176
+ values = dpath.values(record.data, field, separator=".")
141
177
  if values and len(values) > 0:
142
178
  relevant_fields[field] = values if len(values) > 1 else values[0]
143
179
  else:
@@ -156,13 +192,16 @@ class DocumentProcessor:
156
192
  stream_identifier = create_stream_identifier(record)
157
193
  current_stream: ConfiguredAirbyteStream = self.streams[stream_identifier]
158
194
  # if the sync mode is deduping, use the primary key to upsert existing records instead of appending new ones
159
- if not current_stream.primary_key or current_stream.destination_sync_mode != DestinationSyncMode.append_dedup:
195
+ if (
196
+ not current_stream.primary_key
197
+ or current_stream.destination_sync_mode != DestinationSyncMode.append_dedup
198
+ ):
160
199
  return None
161
200
 
162
201
  primary_key = []
163
202
  for key in current_stream.primary_key:
164
203
  try:
165
- primary_key.append(str(dpath.util.get(record.data, key)))
204
+ primary_key.append(str(dpath.get(record.data, key)))
166
205
  except KeyError:
167
206
  primary_key.append("__not_found__")
168
207
  stringified_primary_key = "_".join(primary_key)
@@ -7,6 +7,11 @@ from abc import ABC, abstractmethod
7
7
  from dataclasses import dataclass
8
8
  from typing import List, Optional, Union, cast
9
9
 
10
+ from langchain.embeddings.cohere import CohereEmbeddings
11
+ from langchain.embeddings.fake import FakeEmbeddings
12
+ from langchain.embeddings.localai import LocalAIEmbeddings
13
+ from langchain.embeddings.openai import OpenAIEmbeddings
14
+
10
15
  from airbyte_cdk.destinations.vector_db_based.config import (
11
16
  AzureOpenAIEmbeddingConfigModel,
12
17
  CohereEmbeddingConfigModel,
@@ -19,10 +24,6 @@ from airbyte_cdk.destinations.vector_db_based.config import (
19
24
  from airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception
20
25
  from airbyte_cdk.models import AirbyteRecordMessage
21
26
  from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
22
- from langchain.embeddings.cohere import CohereEmbeddings
23
- from langchain.embeddings.fake import FakeEmbeddings
24
- from langchain.embeddings.localai import LocalAIEmbeddings
25
- from langchain.embeddings.openai import OpenAIEmbeddings
26
27
 
27
28
 
28
29
  @dataclass
@@ -92,7 +93,9 @@ class BaseOpenAIEmbedder(Embedder):
92
93
  batches = create_chunks(documents, batch_size=embedding_batch_size)
93
94
  embeddings: List[Optional[List[float]]] = []
94
95
  for batch in batches:
95
- embeddings.extend(self.embeddings.embed_documents([chunk.page_content for chunk in batch]))
96
+ embeddings.extend(
97
+ self.embeddings.embed_documents([chunk.page_content for chunk in batch])
98
+ )
96
99
  return embeddings
97
100
 
98
101
  @property
@@ -103,13 +106,30 @@ class BaseOpenAIEmbedder(Embedder):
103
106
 
104
107
  class OpenAIEmbedder(BaseOpenAIEmbedder):
105
108
  def __init__(self, config: OpenAIEmbeddingConfigModel, chunk_size: int):
106
- super().__init__(OpenAIEmbeddings(openai_api_key=config.openai_key, max_retries=15, disallowed_special=()), chunk_size) # type: ignore
109
+ super().__init__(
110
+ OpenAIEmbeddings( # type: ignore [call-arg]
111
+ openai_api_key=config.openai_key, max_retries=15, disallowed_special=()
112
+ ),
113
+ chunk_size,
114
+ ) # type: ignore
107
115
 
108
116
 
109
117
  class AzureOpenAIEmbedder(BaseOpenAIEmbedder):
110
118
  def __init__(self, config: AzureOpenAIEmbeddingConfigModel, chunk_size: int):
111
119
  # Azure OpenAI API has — as of 20230927 — a limit of 16 documents per request
112
- super().__init__(OpenAIEmbeddings(openai_api_key=config.openai_key, chunk_size=16, max_retries=15, openai_api_type="azure", openai_api_version="2023-05-15", openai_api_base=config.api_base, deployment=config.deployment, disallowed_special=()), chunk_size) # type: ignore
120
+ super().__init__(
121
+ OpenAIEmbeddings( # type: ignore [call-arg]
122
+ openai_api_key=config.openai_key,
123
+ chunk_size=16,
124
+ max_retries=15,
125
+ openai_api_type="azure",
126
+ openai_api_version="2023-05-15",
127
+ openai_api_base=config.api_base,
128
+ deployment=config.deployment,
129
+ disallowed_special=(),
130
+ ),
131
+ chunk_size,
132
+ ) # type: ignore
113
133
 
114
134
 
115
135
  COHERE_VECTOR_SIZE = 1024
@@ -119,7 +139,9 @@ class CohereEmbedder(Embedder):
119
139
  def __init__(self, config: CohereEmbeddingConfigModel):
120
140
  super().__init__()
121
141
  # Client is set internally
122
- self.embeddings = CohereEmbeddings(cohere_api_key=config.cohere_key, model="embed-english-light-v2.0") # type: ignore
142
+ self.embeddings = CohereEmbeddings(
143
+ cohere_api_key=config.cohere_key, model="embed-english-light-v2.0"
144
+ ) # type: ignore
123
145
 
124
146
  def check(self) -> Optional[str]:
125
147
  try:
@@ -129,7 +151,10 @@ class CohereEmbedder(Embedder):
129
151
  return None
130
152
 
131
153
  def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
132
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
154
+ return cast(
155
+ List[Optional[List[float]]],
156
+ self.embeddings.embed_documents([document.page_content for document in documents]),
157
+ )
133
158
 
134
159
  @property
135
160
  def embedding_dimensions(self) -> int:
@@ -150,7 +175,10 @@ class FakeEmbedder(Embedder):
150
175
  return None
151
176
 
152
177
  def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
153
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
178
+ return cast(
179
+ List[Optional[List[float]]],
180
+ self.embeddings.embed_documents([document.page_content for document in documents]),
181
+ )
154
182
 
155
183
  @property
156
184
  def embedding_dimensions(self) -> int:
@@ -167,11 +195,20 @@ class OpenAICompatibleEmbedder(Embedder):
167
195
  self.config = config
168
196
  # Client is set internally
169
197
  # Always set an API key even if there is none defined in the config because the validator will fail otherwise. Embedding APIs that don't require an API key don't fail if one is provided, so this is not breaking usage.
170
- self.embeddings = LocalAIEmbeddings(model=config.model_name, openai_api_key=config.api_key or "dummy-api-key", openai_api_base=config.base_url, max_retries=15, disallowed_special=()) # type: ignore
198
+ self.embeddings = LocalAIEmbeddings(
199
+ model=config.model_name,
200
+ openai_api_key=config.api_key or "dummy-api-key",
201
+ openai_api_base=config.base_url,
202
+ max_retries=15,
203
+ disallowed_special=(),
204
+ ) # type: ignore
171
205
 
172
206
  def check(self) -> Optional[str]:
173
207
  deployment_mode = os.environ.get("DEPLOYMENT_MODE", "")
174
- if deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE and not self.config.base_url.startswith("https://"):
208
+ if (
209
+ deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE
210
+ and not self.config.base_url.startswith("https://")
211
+ ):
175
212
  return "Base URL must start with https://"
176
213
 
177
214
  try:
@@ -181,7 +218,10 @@ class OpenAICompatibleEmbedder(Embedder):
181
218
  return None
182
219
 
183
220
  def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
184
- return cast(List[Optional[List[float]]], self.embeddings.embed_documents([document.page_content for document in documents]))
221
+ return cast(
222
+ List[Optional[List[float]]],
223
+ self.embeddings.embed_documents([document.page_content for document in documents]),
224
+ )
185
225
 
186
226
  @property
187
227
  def embedding_dimensions(self) -> int:
@@ -254,8 +294,10 @@ def create_from_config(
254
294
  ],
255
295
  processing_config: ProcessingConfigModel,
256
296
  ) -> Embedder:
257
-
258
297
  if embedding_config.mode == "azure_openai" or embedding_config.mode == "openai":
259
- return cast(Embedder, embedder_map[embedding_config.mode](embedding_config, processing_config.chunk_size))
298
+ return cast(
299
+ Embedder,
300
+ embedder_map[embedding_config.mode](embedding_config, processing_config.chunk_size),
301
+ )
260
302
  else:
261
303
  return cast(Embedder, embedder_map[embedding_config.mode](embedding_config))
@@ -26,12 +26,19 @@ class BaseIntegrationTest(unittest.TestCase):
26
26
  It provides helper methods to create Airbyte catalogs, records and state messages.
27
27
  """
28
28
 
29
- def _get_configured_catalog(self, destination_mode: DestinationSyncMode) -> ConfiguredAirbyteCatalog:
30
- stream_schema = {"type": "object", "properties": {"str_col": {"type": "str"}, "int_col": {"type": "integer"}}}
29
+ def _get_configured_catalog(
30
+ self, destination_mode: DestinationSyncMode
31
+ ) -> ConfiguredAirbyteCatalog:
32
+ stream_schema = {
33
+ "type": "object",
34
+ "properties": {"str_col": {"type": "str"}, "int_col": {"type": "integer"}},
35
+ }
31
36
 
32
37
  overwrite_stream = ConfiguredAirbyteStream(
33
38
  stream=AirbyteStream(
34
- name="mystream", json_schema=stream_schema, supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh]
39
+ name="mystream",
40
+ json_schema=stream_schema,
41
+ supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh],
35
42
  ),
36
43
  primary_key=[["int_col"]],
37
44
  sync_mode=SyncMode.incremental,
@@ -45,7 +52,10 @@ class BaseIntegrationTest(unittest.TestCase):
45
52
 
46
53
  def _record(self, stream: str, str_value: str, int_value: int) -> AirbyteMessage:
47
54
  return AirbyteMessage(
48
- type=Type.RECORD, record=AirbyteRecordMessage(stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0)
55
+ type=Type.RECORD,
56
+ record=AirbyteRecordMessage(
57
+ stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0
58
+ ),
49
59
  )
50
60
 
51
61
  def setUp(self) -> None:
@@ -10,7 +10,11 @@ from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream
10
10
 
11
11
 
12
12
  def format_exception(exception: Exception) -> str:
13
- return str(exception) + "\n" + "".join(traceback.TracebackException.from_exception(exception).format())
13
+ return (
14
+ str(exception)
15
+ + "\n"
16
+ + "".join(traceback.TracebackException.from_exception(exception).format())
17
+ )
14
18
 
15
19
 
16
20
  def create_chunks(iterable: Iterable[Any], batch_size: int) -> Iterator[Tuple[Any, ...]]:
@@ -26,4 +30,6 @@ def create_stream_identifier(stream: Union[AirbyteStream, AirbyteRecordMessage])
26
30
  if isinstance(stream, AirbyteStream):
27
31
  return str(stream.name if stream.namespace is None else f"{stream.namespace}_{stream.name}")
28
32
  else:
29
- return str(stream.stream if stream.namespace is None else f"{stream.namespace}_{stream.stream}")
33
+ return str(
34
+ stream.stream if stream.namespace is None else f"{stream.namespace}_{stream.stream}"
35
+ )
@@ -27,7 +27,12 @@ class Writer:
27
27
  """
28
28
 
29
29
  def __init__(
30
- self, processing_config: ProcessingConfigModel, indexer: Indexer, embedder: Embedder, batch_size: int, omit_raw_text: bool
30
+ self,
31
+ processing_config: ProcessingConfigModel,
32
+ indexer: Indexer,
33
+ embedder: Embedder,
34
+ batch_size: int,
35
+ omit_raw_text: bool,
31
36
  ) -> None:
32
37
  self.processing_config = processing_config
33
38
  self.indexer = indexer
@@ -54,7 +59,9 @@ class Writer:
54
59
  self.indexer.delete(ids, namespace, stream)
55
60
 
56
61
  for (namespace, stream), chunks in self.chunks.items():
57
- embeddings = self.embedder.embed_documents([self._convert_to_document(chunk) for chunk in chunks])
62
+ embeddings = self.embedder.embed_documents(
63
+ [self._convert_to_document(chunk) for chunk in chunks]
64
+ )
58
65
  for i, document in enumerate(chunks):
59
66
  document.embedding = embeddings[i]
60
67
  if self.omit_raw_text:
@@ -63,7 +70,9 @@ class Writer:
63
70
 
64
71
  self._init_batch()
65
72
 
66
- def write(self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]) -> Iterable[AirbyteMessage]:
73
+ def write(
74
+ self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]
75
+ ) -> Iterable[AirbyteMessage]:
67
76
  self.processor = DocumentProcessor(self.processing_config, configured_catalog)
68
77
  self.indexer.pre_sync(configured_catalog)
69
78
  for message in input_messages:
@@ -74,9 +83,19 @@ class Writer:
74
83
  yield message
75
84
  elif message.type == Type.RECORD:
76
85
  record_chunks, record_id_to_delete = self.processor.process(message.record)
77
- self.chunks[(message.record.namespace, message.record.stream)].extend(record_chunks)
86
+ self.chunks[
87
+ ( # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
88
+ message.record.namespace, # type: ignore [union-attr] # record not None
89
+ message.record.stream, # type: ignore [union-attr] # record not None
90
+ )
91
+ ].extend(record_chunks)
78
92
  if record_id_to_delete is not None:
79
- self.ids_to_delete[(message.record.namespace, message.record.stream)].append(record_id_to_delete)
93
+ self.ids_to_delete[
94
+ ( # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
95
+ message.record.namespace, # type: ignore [union-attr] # record not None
96
+ message.record.stream, # type: ignore [union-attr] # record not None
97
+ )
98
+ ].append(record_id_to_delete)
80
99
  self.number_of_chunks += len(record_chunks)
81
100
  if self.number_of_chunks >= self.batch_size:
82
101
  self._process_batch()