airbyte-cdk 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +358 -0
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +104 -0
- airbyte_cdk/connector.py +123 -0
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/__init__.py +3 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
- airbyte_cdk/connector_builder/main.py +107 -0
- airbyte_cdk/connector_builder/models.py +73 -0
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +83 -0
- airbyte_cdk/destinations/__init__.py +8 -0
- airbyte_cdk/destinations/destination.py +154 -0
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
- airbyte_cdk/destinations/vector_db_based/config.py +298 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
- airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
- airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
- airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
- airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
- airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
- airbyte_cdk/entrypoint.py +414 -0
- airbyte_cdk/exception_handler.py +56 -0
- airbyte_cdk/logger.py +109 -0
- airbyte_cdk/models/__init__.py +72 -0
- airbyte_cdk/models/airbyte_protocol.py +88 -0
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/well_known_types.py +5 -0
- airbyte_cdk/py.typed +0 -0
- airbyte_cdk/sources/__init__.py +26 -0
- airbyte_cdk/sources/abstract_source.py +326 -0
- airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
- airbyte_cdk/sources/config.py +27 -0
- airbyte_cdk/sources/connector_state_manager.py +161 -0
- airbyte_cdk/sources/declarative/__init__.py +3 -0
- airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
- airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
- airbyte_cdk/sources/declarative/auth/token.py +267 -0
- airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
- airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
- airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
- airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
- airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
- airbyte_cdk/sources/declarative/declarative_source.py +36 -0
- airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
- airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
- airbyte_cdk/sources/declarative/exceptions.py +9 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
- airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
- airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
- airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
- airbyte_cdk/sources/declarative/requesters/README.md +56 -0
- airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
- airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
- airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
- airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
- airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
- airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
- airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
- airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
- airbyte_cdk/sources/declarative/spec/spec.py +48 -0
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
- airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
- airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
- airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
- airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
- airbyte_cdk/sources/declarative/types.py +25 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
- airbyte_cdk/sources/file_based/config/__init__.py +0 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
- airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
- airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
- airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
- airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
- airbyte_cdk/sources/file_based/exceptions.py +159 -0
- airbyte_cdk/sources/file_based/file_based_source.py +466 -0
- airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
- airbyte_cdk/sources/file_based/file_record_data.py +22 -0
- airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
- airbyte_cdk/sources/file_based/remote_file.py +18 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
- airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
- airbyte_cdk/sources/file_based/types.py +10 -0
- airbyte_cdk/sources/http_config.py +10 -0
- airbyte_cdk/sources/http_logger.py +55 -0
- airbyte_cdk/sources/message/__init__.py +19 -0
- airbyte_cdk/sources/message/repository.py +137 -0
- airbyte_cdk/sources/source.py +95 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/__init__.py +8 -0
- airbyte_cdk/sources/streams/availability_strategy.py +84 -0
- airbyte_cdk/sources/streams/call_rate.py +704 -0
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
- airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
- airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
- airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/core.py +703 -0
- airbyte_cdk/sources/streams/http/__init__.py +10 -0
- airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +61 -0
- airbyte_cdk/sources/streams/http/http.py +673 -0
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/sources/streams/utils/__init__.py +3 -0
- airbyte_cdk/sources/types.py +169 -0
- airbyte_cdk/sources/utils/__init__.py +7 -0
- airbyte_cdk/sources/utils/casing.py +12 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +53 -0
- airbyte_cdk/sources/utils/schema_helpers.py +230 -0
- airbyte_cdk/sources/utils/slice_logger.py +57 -0
- airbyte_cdk/sources/utils/transform.py +277 -0
- airbyte_cdk/sources/utils/types.py +7 -0
- airbyte_cdk/sql/__init__.py +0 -0
- airbyte_cdk/sql/_util/__init__.py +0 -0
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/__init__.py +7 -0
- airbyte_cdk/test/catalog_builder.py +81 -0
- airbyte_cdk/test/entrypoint_wrapper.py +250 -0
- airbyte_cdk/test/mock_http/__init__.py +6 -0
- airbyte_cdk/test/mock_http/matcher.py +41 -0
- airbyte_cdk/test/mock_http/mocker.py +185 -0
- airbyte_cdk/test/mock_http/request.py +103 -0
- airbyte_cdk/test/mock_http/response.py +28 -0
- airbyte_cdk/test/mock_http/response_builder.py +237 -0
- airbyte_cdk/test/state_builder.py +33 -0
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +10 -0
- airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
- airbyte_cdk/utils/analytics_message.py +25 -0
- airbyte_cdk/utils/constants.py +5 -0
- airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/event_timing.py +85 -0
- airbyte_cdk/utils/is_cloud_environment.py +18 -0
- airbyte_cdk/utils/mapping_helpers.py +162 -0
- airbyte_cdk/utils/message_utils.py +26 -0
- airbyte_cdk/utils/oneof_option_config.py +33 -0
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +270 -0
- airbyte_cdk/utils/slice_hasher.py +37 -0
- airbyte_cdk/utils/spec_schema_transformations.py +26 -0
- airbyte_cdk/utils/stream_status_utils.py +43 -0
- airbyte_cdk/utils/traced_exception.py +145 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
- airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
- airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
- airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
- airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Any, Dict, List, Mapping, Optional, Tuple
|
|
9
|
+
|
|
10
|
+
import dpath
|
|
11
|
+
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
|
|
12
|
+
from langchain.utils import stringify_dict
|
|
13
|
+
from langchain_core.documents.base import Document
|
|
14
|
+
|
|
15
|
+
from airbyte_cdk.destinations.vector_db_based.config import (
|
|
16
|
+
ProcessingConfigModel,
|
|
17
|
+
SeparatorSplitterConfigModel,
|
|
18
|
+
TextSplitterConfigModel,
|
|
19
|
+
)
|
|
20
|
+
from airbyte_cdk.destinations.vector_db_based.utils import create_stream_identifier
|
|
21
|
+
from airbyte_cdk.models import (
|
|
22
|
+
AirbyteRecordMessage,
|
|
23
|
+
ConfiguredAirbyteCatalog,
|
|
24
|
+
ConfiguredAirbyteStream,
|
|
25
|
+
DestinationSyncMode,
|
|
26
|
+
)
|
|
27
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
|
28
|
+
|
|
29
|
+
METADATA_STREAM_FIELD = "_ab_stream"
|
|
30
|
+
METADATA_RECORD_ID_FIELD = "_ab_record_id"
|
|
31
|
+
|
|
32
|
+
CDC_DELETED_FIELD = "_ab_cdc_deleted_at"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class Chunk:
    """
    One piece of a split document, kept together with the record it came from.
    """

    # Text of this chunk (may be None).
    page_content: Optional[str]
    # Metadata carried over from the document the chunk was split from.
    metadata: Dict[str, Any]
    # The Airbyte record the chunk was generated from.
    record: AirbyteRecordMessage
    # Embedding vector for the chunk; unset (None) when the chunk is created.
    # NOTE(review): presumably populated later by an embedder - confirm against callers.
    embedding: Optional[List[float]] = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Regex separators for markdown headings, ordered from level 1 ("# ") to level 6 ("###### ").
# Each pattern matches the heading marker at the very start of the text or right after a newline.
headers_to_split_on = [f"(?:^|\n){'#' * level} " for level in range(1, 7)]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class DocumentProcessor:
|
|
54
|
+
"""
|
|
55
|
+
DocumentProcessor is a helper class that generates documents from Airbyte records.
|
|
56
|
+
|
|
57
|
+
It is used to generate documents from records before writing them to the destination:
|
|
58
|
+
* The text fields are extracted from the record and concatenated to a single string.
|
|
59
|
+
* The metadata fields are extracted from the record and added to the document metadata.
|
|
60
|
+
* The document is split into chunks of a given size using a langchain text splitter.
|
|
61
|
+
|
|
62
|
+
The Writer class uses the DocumentProcessor class to internally generate documents from records - in most cases you don't need to use it directly,
|
|
63
|
+
except if you want to implement a custom writer.
|
|
64
|
+
|
|
65
|
+
The config parameters specified by the ProcessingConfigModel have to be made part of the connector spec to allow the user to configure the document processor.
|
|
66
|
+
Calling DocumentProcessor.check_config(config) will validate the config and return an error message if the config is invalid.
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
streams: Mapping[str, ConfiguredAirbyteStream]
|
|
70
|
+
|
|
71
|
+
@staticmethod
|
|
72
|
+
def check_config(config: ProcessingConfigModel) -> Optional[str]:
|
|
73
|
+
if config.text_splitter is not None and config.text_splitter.mode == "separator":
|
|
74
|
+
for s in config.text_splitter.separators:
|
|
75
|
+
try:
|
|
76
|
+
separator = json.loads(s)
|
|
77
|
+
if not isinstance(separator, str):
|
|
78
|
+
return f"Invalid separator: {s}. Separator needs to be a valid JSON string using double quotes."
|
|
79
|
+
except json.decoder.JSONDecodeError:
|
|
80
|
+
return f"Invalid separator: {s}. Separator needs to be a valid JSON string using double quotes."
|
|
81
|
+
return None
|
|
82
|
+
|
|
83
|
+
def _get_text_splitter(
    self,
    chunk_size: int,
    chunk_overlap: int,
    splitter_config: Optional[TextSplitterConfigModel],
) -> RecursiveCharacterTextSplitter:
    """
    Build the langchain text splitter matching the configured splitter mode.

    :param chunk_size: Chunk size handed to the splitter
    :param chunk_overlap: Chunk overlap handed to the splitter
    :param splitter_config: Splitter configuration; when None, a separator splitter config with default values is used
    :return: A RecursiveCharacterTextSplitter created via from_tiktoken_encoder
    """
    # No splitter configured: fall back to a separator config built from the model's own defaults.
    active_config = (
        SeparatorSplitterConfigModel(mode="separator")
        if splitter_config is None
        else splitter_config
    )

    def build_splitter(**mode_options: Any) -> RecursiveCharacterTextSplitter:
        # Arguments shared by every mode; only the separator-related options differ.
        return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            disallowed_special=(),
            **mode_options,
        )

    mode = active_config.mode
    if mode == "separator":
        # Separators are configured as JSON-encoded strings and are decoded before use.
        return build_splitter(
            separators=[json.loads(encoded) for encoded in active_config.separators],
            keep_separator=active_config.keep_separator,
        )
    if mode == "markdown":
        # Split on heading patterns up to the configured heading level.
        return build_splitter(
            separators=headers_to_split_on[: active_config.split_level],
            is_separator_regex=True,
            keep_separator=True,
        )
    if mode == "code":
        return build_splitter(
            separators=RecursiveCharacterTextSplitter.get_separators_for_language(
                Language(active_config.language)
            ),
        )
    # Any other mode is not handled by a branch above, so the method falls through and returns None.
|
|
117
|
+
|
|
118
|
+
def __init__(self, config: ProcessingConfigModel, catalog: ConfiguredAirbyteCatalog):
    """
    Set up the processor from the processing config and the configured catalog.

    :param config: Processing config providing chunk size, chunk overlap, text splitter, text fields, metadata fields and field name mappings
    :param catalog: Configured catalog whose streams are indexed by their stream identifier
    """
    # Index the configured streams by the identifier derived from each stream definition.
    streams_by_identifier = {}
    for configured_stream in catalog.streams:
        streams_by_identifier[create_stream_identifier(configured_stream.stream)] = (
            configured_stream
        )
    self.streams = streams_by_identifier

    self.splitter = self._get_text_splitter(
        config.chunk_size,
        config.chunk_overlap,
        config.text_splitter,
    )

    self.text_fields = config.text_fields
    self.metadata_fields = config.metadata_fields
    self.field_name_mappings = config.field_name_mappings
    self.logger = logging.getLogger("airbyte.document_processor")
|
|
130
|
+
|
|
131
|
+
def process(self, record: AirbyteRecordMessage) -> Tuple[List[Chunk], Optional[str]]:
    """
    Generate document chunks from a single record.

    :param record: The AirbyteRecordMessage to turn into chunks
    :return: Tuple of (List of document chunks, record id to delete if a stream is in dedup mode to avoid stale documents in the vector store)
    :raises AirbyteTracedException: If no document could be generated because none of the text fields were found in the record
    """
    # A record with a truthy CDC_DELETED_FIELD value yields no chunks, only the id (if any) of the documents to remove
    if record.data.get(CDC_DELETED_FIELD):
        return [], self._extract_primary_key(record)

    doc = self._generate_document(record)
    if doc is None:
        text_fields = "all fields" if not self.text_fields else ", ".join(self.text_fields)
        raise AirbyteTracedException(
            internal_message="No text fields found in record",
            message=f"Record {str(record.data)[:250]}... does not contain any of the configured text fields: {text_fields}. Please check your processing configuration, there has to be at least one text field set in each record.",
            failure_type=FailureType.config_error,
        )

    # Every chunk keeps a reference to the record it was produced from
    chunks = []
    for chunk_document in self._split_document(doc):
        chunks.append(
            Chunk(
                page_content=chunk_document.page_content,
                metadata=chunk_document.metadata,
                record=record,
            )
        )

    # The record id is only part of the metadata when a primary key could be extracted
    id_to_delete = doc.metadata.get(METADATA_RECORD_ID_FIELD)
    return chunks, id_to_delete
|
|
161
|
+
|
|
162
|
+
def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]:
    """
    Build a single document from the text fields and metadata of a record.

    :param record: The record to convert
    :return: The document, or None if no text field could be extracted from the record
    """
    text_source = self._extract_relevant_fields(record, self.text_fields)
    if not text_source:
        return None
    # The text is stringified before the metadata is extracted, same order as the keyword arguments
    return Document(
        page_content=stringify_dict(text_source),
        metadata=self._extract_metadata(record),
    )
|
|
169
|
+
|
|
170
|
+
def _extract_relevant_fields(
    self, record: AirbyteRecordMessage, fields: Optional[List[str]]
) -> Dict[str, Any]:
    """
    Collect the requested fields of a record and apply the configured field name mappings.

    :param record: The record to read from
    :param fields: Dot-separated paths to extract; if None or empty, all fields of the record are used
    :return: Dict from path to value (a list of values if the path matched more than once), with field names remapped
    """
    if not fields:
        # Hand out a shallow copy instead of record.data itself: callers add keys to the result
        # (stream identifier, record id), which must not end up in the payload of the record.
        return self._remap_field_names(dict(record.data))

    relevant_fields: Dict[str, Any] = {}
    for field in fields:
        values = dpath.values(record.data, field, separator=".")
        if values:
            # A single match is stored as the bare value, several matches as a list
            relevant_fields[field] = values if len(values) > 1 else values[0]
    return self._remap_field_names(relevant_fields)
|
|
182
|
+
|
|
183
|
+
def _extract_metadata(self, record: AirbyteRecordMessage) -> Dict[str, Any]:
    """
    Assemble the metadata of a record.

    Starts from the configured metadata fields, then adds the stream identifier and,
    if a primary key could be extracted, the record id. The dict returned by
    _extract_relevant_fields is modified in place.

    :param record: The record to read from
    :return: The metadata dict
    """
    result = self._extract_relevant_fields(record, self.metadata_fields)
    result[METADATA_STREAM_FIELD] = create_stream_identifier(record)

    record_id = self._extract_primary_key(record)
    if not record_id:
        return result
    result[METADATA_RECORD_ID_FIELD] = record_id
    return result
|
|
190
|
+
|
|
191
|
+
def _extract_primary_key(self, record: AirbyteRecordMessage) -> Optional[str]:
    """
    Build the record id from the primary key of the stream the record belongs to.

    :param record: The record to read the key values from
    :return: "<stream identifier>_<key values joined by underscore>", or None if the stream has no primary key or is not synced in append_dedup mode
    """
    stream_identifier = create_stream_identifier(record)
    current_stream: ConfiguredAirbyteStream = self.streams[stream_identifier]

    # if the sync mode is deduping, use the primary key to upsert existing records instead of appending new ones
    if not current_stream.primary_key:
        return None
    if current_stream.destination_sync_mode != DestinationSyncMode.append_dedup:
        return None

    key_parts = []
    for key_path in current_stream.primary_key:
        try:
            part = str(dpath.get(record.data, key_path))
        except KeyError:
            # Missing key fields are replaced by a placeholder instead of failing the record
            part = "__not_found__"
        key_parts.append(part)

    stringified_primary_key = "_".join(key_parts)
    return f"{stream_identifier}_{stringified_primary_key}"
|
|
209
|
+
|
|
210
|
+
def _split_document(self, doc: Document) -> List[Document]:
    """
    Split a document into chunks using the configured text splitter.

    :param doc: The document to split
    :return: The documents produced by the splitter
    """
    # The annotated local pins the return type of the splitter call
    pieces: List[Document] = self.splitter.split_documents([doc])
    return pieces
|
|
213
|
+
|
|
214
|
+
def _remap_field_names(self, fields: Dict[str, Any]) -> Dict[str, Any]:
    """
    Rename fields according to the configured field name mappings.

    :param fields: The fields to rename; never modified
    :return: The very same dict if no mappings are configured, otherwise a copy with every present from_field moved to its to_field
    """
    mappings = self.field_name_mappings
    if not mappings:
        return fields

    renamed = fields.copy()
    for rule in mappings:
        source_name = rule.from_field
        if source_name not in renamed:
            continue
        # Remove the old name first, then store the value under the new one
        value = renamed.pop(source_name)
        renamed[rule.to_field] = value
    return renamed
|
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import List, Optional, Union, cast
|
|
9
|
+
|
|
10
|
+
from langchain.embeddings.cohere import CohereEmbeddings
|
|
11
|
+
from langchain.embeddings.fake import FakeEmbeddings
|
|
12
|
+
from langchain.embeddings.localai import LocalAIEmbeddings
|
|
13
|
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
|
14
|
+
|
|
15
|
+
from airbyte_cdk.destinations.vector_db_based.config import (
|
|
16
|
+
AzureOpenAIEmbeddingConfigModel,
|
|
17
|
+
CohereEmbeddingConfigModel,
|
|
18
|
+
FakeEmbeddingConfigModel,
|
|
19
|
+
FromFieldEmbeddingConfigModel,
|
|
20
|
+
OpenAICompatibleEmbeddingConfigModel,
|
|
21
|
+
OpenAIEmbeddingConfigModel,
|
|
22
|
+
ProcessingConfigModel,
|
|
23
|
+
)
|
|
24
|
+
from airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception
|
|
25
|
+
from airbyte_cdk.models import AirbyteRecordMessage
|
|
26
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class Document:
    """Text to embed, together with the record it belongs to."""

    # Text that embedders pass on to the embedding model
    page_content: str
    # Record the text belongs to; FromFieldEmbedder reads the embedding vector from its data
    record: AirbyteRecordMessage
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Embedder(ABC):
    """
    Abstract interface for embedding text.

    Indexers use an embedder internally: each indexer is responsible for passing the text of all documents to the embedder and for storing the resulting embeddings in the destination.
    The destination connector creates the embedder instance and passes it to the writer.
    The CDK ships the basic embedders that every destination should support; custom embedders can be implemented for special destinations if needed.
    """

    def __init__(self) -> None:
        """Nothing to set up at this level; subclasses call this through super()."""

    @abstractmethod
    def check(self) -> Optional[str]:
        """
        Check whether the embedder is usable.

        The implementations in this module return None on success and an error description otherwise.
        """

    @abstractmethod
    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
        """
        Embed the text of each chunk and return the resulting embedding vectors.

        If a chunk cannot be embedded or is configured to not be embedded, return None for that chunk.
        """

    @property
    @abstractmethod
    def embedding_dimensions(self) -> int:
        """Size of the embedding vectors this embedder produces."""
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
OPEN_AI_VECTOR_SIZE = 1536

OPEN_AI_TOKEN_LIMIT = 150_000  # limit of tokens per minute


class BaseOpenAIEmbedder(Embedder):
    """Shared implementation for embedders that wrap an OpenAIEmbeddings client."""

    def __init__(self, embeddings: OpenAIEmbeddings, chunk_size: int):
        """
        :param embeddings: Ready-to-use embeddings client
        :param chunk_size: Maximum number of tokens per chunk; used to size the embedding batches
        """
        super().__init__()
        self.chunk_size = chunk_size
        self.embeddings = embeddings

    def check(self) -> Optional[str]:
        """Embed a test query; return None on success and the formatted exception otherwise."""
        try:
            self.embeddings.embed_query("test")
        except Exception as e:
            return format_exception(e)
        else:
            return None

    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
        """
        Embed the text of each chunk and return the resulting embedding vectors.

        The OpenAI API fails if more than the per-minute limit worth of tokens is sent at once, so the request is split into batches which are embedded separately.
        The rate limit can still be hit between two embed calls because the available token budget has not recovered yet,
        but the built-in retry mechanism of the OpenAI client handles that.
        """
        # A chunk holds at most self.chunk_size tokens, so dividing the tokens-per-minute limit by it gives
        # the number of documents that fit into a single request without exhausting the limit
        documents_per_request = OPEN_AI_TOKEN_LIMIT // self.chunk_size

        vectors: List[Optional[List[float]]] = []
        for group in create_chunks(documents, batch_size=documents_per_request):
            texts = [item.page_content for item in group]
            vectors.extend(self.embeddings.embed_documents(texts))
        return vectors

    @property
    def embedding_dimensions(self) -> int:
        # vector size produced by text-embedding-ada-002 model
        return OPEN_AI_VECTOR_SIZE
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class OpenAIEmbedder(BaseOpenAIEmbedder):
    """Embedder backed by an OpenAIEmbeddings client created from the OpenAI config."""

    def __init__(self, config: OpenAIEmbeddingConfigModel, chunk_size: int):
        """
        :param config: Embedding config; its openai_key is used as API key
        :param chunk_size: Maximum number of tokens per chunk, passed on to the base class
        """
        client = OpenAIEmbeddings(  # type: ignore [call-arg]
            openai_api_key=config.openai_key,
            max_retries=15,
            disallowed_special=(),
        )
        super().__init__(client, chunk_size)  # type: ignore
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class AzureOpenAIEmbedder(BaseOpenAIEmbedder):
    """Embedder backed by an OpenAIEmbeddings client configured for the Azure API type."""

    def __init__(self, config: AzureOpenAIEmbeddingConfigModel, chunk_size: int):
        """
        :param config: Embedding config; openai_key, api_base and deployment are read from it
        :param chunk_size: Maximum number of tokens per chunk, passed on to the base class
        """
        # Azure OpenAI API has (as of 2023-09-27) a limit of 16 documents per request,
        # hence the client-side chunk_size of 16
        client = OpenAIEmbeddings(  # type: ignore [call-arg]
            openai_api_type="azure",
            openai_api_version="2023-05-15",
            openai_api_base=config.api_base,
            openai_api_key=config.openai_key,
            deployment=config.deployment,
            chunk_size=16,
            max_retries=15,
            disallowed_special=(),
        )
        super().__init__(client, chunk_size)  # type: ignore
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
COHERE_VECTOR_SIZE = 1024


class CohereEmbedder(Embedder):
    """Embedder backed by a CohereEmbeddings client."""

    def __init__(self, config: CohereEmbeddingConfigModel):
        """
        :param config: Embedding config; its cohere_key is used as API key
        """
        super().__init__()
        # Client is set internally
        self.embeddings = CohereEmbeddings(
            model="embed-english-light-v2.0",
            cohere_api_key=config.cohere_key,
        )  # type: ignore

    def check(self) -> Optional[str]:
        """Embed a test query; return None on success and the formatted exception otherwise."""
        try:
            self.embeddings.embed_query("test")
        except Exception as e:
            return format_exception(e)
        else:
            return None

    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
        """Embed the page content of every document with a single client call."""
        texts = [document.page_content for document in documents]
        vectors = self.embeddings.embed_documents(texts)
        return cast(List[Optional[List[float]]], vectors)

    @property
    def embedding_dimensions(self) -> int:
        # vector size reported for the Cohere model configured in __init__
        return COHERE_VECTOR_SIZE
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class FakeEmbedder(Embedder):
    """Embedder backed by FakeEmbeddings instead of a real embedding service."""

    def __init__(self, config: FakeEmbeddingConfigModel):
        """
        :param config: Embedding config; accepted for a uniform constructor signature, no value is read from it
        """
        super().__init__()
        self.embeddings = FakeEmbeddings(size=OPEN_AI_VECTOR_SIZE)

    def check(self) -> Optional[str]:
        """Embed a test query; return None on success and the formatted exception otherwise."""
        try:
            self.embeddings.embed_query("test")
        except Exception as e:
            return format_exception(e)
        else:
            return None

    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
        """Embed the page content of every document with a single client call."""
        texts = [document.page_content for document in documents]
        vectors = self.embeddings.embed_documents(texts)
        return cast(List[Optional[List[float]]], vectors)

    @property
    def embedding_dimensions(self) -> int:
        # use same vector size as for OpenAI embeddings to keep it realistic
        return OPEN_AI_VECTOR_SIZE
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
CLOUD_DEPLOYMENT_MODE = "cloud"


class OpenAICompatibleEmbedder(Embedder):
    """Embedder backed by a LocalAIEmbeddings client pointed at a configurable base URL."""

    def __init__(self, config: OpenAICompatibleEmbeddingConfigModel):
        """
        :param config: Embedding config; model_name, api_key, base_url and dimensions are read from it
        """
        super().__init__()
        self.config = config
        # Client is set internally
        # Always set an API key even if there is none defined in the config because the validator will fail otherwise.
        # Embedding APIs that don't require an API key don't fail if one is provided, so this is not breaking usage.
        api_key = config.api_key or "dummy-api-key"
        self.embeddings = LocalAIEmbeddings(
            model=config.model_name,
            openai_api_key=api_key,
            openai_api_base=config.base_url,
            max_retries=15,
            disallowed_special=(),
        )  # type: ignore

    def check(self) -> Optional[str]:
        """
        Check the base URL and embed a test query.

        :return: None on success; an error description if the DEPLOYMENT_MODE environment variable is "cloud" (compared case-insensitively) and the base URL is not https, or if the test query fails
        """
        in_cloud = os.environ.get("DEPLOYMENT_MODE", "").casefold() == CLOUD_DEPLOYMENT_MODE
        # The base URL is only inspected in cloud deployments
        if in_cloud and not self.config.base_url.startswith("https://"):
            return "Base URL must start with https://"

        try:
            self.embeddings.embed_query("test")
        except Exception as e:
            return format_exception(e)
        else:
            return None

    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
        """Embed the page content of every document with a single client call."""
        texts = [document.page_content for document in documents]
        vectors = self.embeddings.embed_documents(texts)
        return cast(List[Optional[List[float]]], vectors)

    @property
    def embedding_dimensions(self) -> int:
        # vector size produced by the model
        return self.config.dimensions
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
class FromFieldEmbedder(Embedder):
    """Embedder that takes precomputed embedding vectors from a field of the record."""

    def __init__(self, config: FromFieldEmbeddingConfigModel):
        """
        :param config: Embedding config; field_name and dimensions are read from it
        """
        super().__init__()
        self.config = config

    def check(self) -> Optional[str]:
        """Always return None; nothing is checked for this embedder."""
        return None

    def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
        """
        From each chunk, pull the embedding from the field specified in the config.
        Check that the field exists, is a list of numbers and is the correct size. If not, raise an AirbyteTracedException explaining the problem.
        """
        return [self._read_vector(document) for document in documents]

    def _read_vector(self, document: Document) -> List[float]:
        """Return the validated embedding vector stored on the record of a single document."""
        data = document.record.data
        if self.config.field_name not in data:
            raise AirbyteTracedException(
                message=f"Record {str(data)[:250]}... in stream {document.record.stream} does not contain embedding vector field {self.config.field_name}. Please check your embedding configuration, the embedding vector field has to be set correctly on every record.",
                internal_message="Embedding vector field not found",
                failure_type=FailureType.config_error,
            )

        field = data[self.config.field_name]
        is_number_list = isinstance(field, list) and all(
            isinstance(x, (int, float)) for x in field
        )
        if not is_number_list:
            raise AirbyteTracedException(
                message=f"Record {str(data)[:250]}... in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it is not a list of numbers. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
                internal_message="Embedding vector field not a list of numbers",
                failure_type=FailureType.config_error,
            )

        if len(field) != self.config.dimensions:
            raise AirbyteTracedException(
                message=f"Record {str(data)[:250]}... in stream {document.record.stream} does contain embedding vector field {self.config.field_name}, but it has length {len(field)} instead of the configured {self.config.dimensions}. Please check your embedding configuration, the embedding vector field has to be a list of numbers of length {self.config.dimensions} on every record.",
                internal_message="Embedding vector field has wrong length",
                failure_type=FailureType.config_error,
            )
        return field

    @property
    def embedding_dimensions(self) -> int:
        # vector size as configured, the same value every record is validated against
        return self.config.dimensions
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
# Embedder class to instantiate for each embedding config mode
embedder_map = {
    "openai": OpenAIEmbedder,
    "cohere": CohereEmbedder,
    "fake": FakeEmbedder,
    "azure_openai": AzureOpenAIEmbedder,
    "from_field": FromFieldEmbedder,
    "openai_compatible": OpenAICompatibleEmbedder,
}


def create_from_config(
    embedding_config: Union[
        AzureOpenAIEmbeddingConfigModel,
        CohereEmbeddingConfigModel,
        FakeEmbeddingConfigModel,
        FromFieldEmbeddingConfigModel,
        OpenAIEmbeddingConfigModel,
        OpenAICompatibleEmbeddingConfigModel,
    ],
    processing_config: ProcessingConfigModel,
) -> Embedder:
    """
    Instantiate the embedder that matches the mode of the embedding config.

    :param embedding_config: Embedding config; its mode selects the embedder class from embedder_map
    :param processing_config: Processing config; its chunk_size is passed to the OpenAI and Azure OpenAI embedders
    :return: The embedder instance
    """
    mode = embedding_config.mode
    embedder_class = embedder_map[mode]
    # Only the two OpenAI embedders take the chunk size as a second constructor argument
    if mode in ("azure_openai", "openai"):
        return cast(Embedder, embedder_class(embedding_config, processing_config.chunk_size))
    return cast(Embedder, embedder_class(embedding_config))
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import itertools
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from typing import Any, Generator, Iterable, List, Optional, Tuple, TypeVar
|
|
8
|
+
|
|
9
|
+
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk
|
|
10
|
+
from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Indexer(ABC):
    """
    Abstract interface for indexing documents.

    The Writer class uses an Indexer internally to index the documents generated by the document processor.
    A destination connector implements its own indexer by extending this class and implementing the abstract methods.
    """

    def __init__(self, config: Any):
        """Keep the indexer configuration available as self.config."""
        self.config = config

    def pre_sync(self, catalog: ConfiguredAirbyteCatalog) -> None:
        """
        Run before the sync starts; does nothing unless overridden.

        Use this method to make sure all records in the destination that belong to streams with a destination mode of overwrite are deleted.

        Each record has a metadata field with the name airbyte_cdk.destinations.vector_db_based.document_processor.METADATA_STREAM_FIELD which can be used to filter documents for deletion.
        Use the airbyte_cdk.destinations.vector_db_based.utils.create_stream_identifier method to create the stream identifier based on the stream definition to use for filtering.
        """

    def post_sync(self) -> List[AirbyteMessage]:
        """
        Run after the sync finishes; returns an empty list unless overridden.

        Use this method to perform any cleanup operations. It can return a list of AirbyteMessages to be logged.
        """
        return []

    @abstractmethod
    def index(self, document_chunks: List[Chunk], namespace: str, stream: str) -> None:
        """
        Index a list of document chunks in the destination.

        If page_content is None, the document should be indexed without the raw text.
        All chunks belong to the stream and namespace specified in the parameters.
        """

    @abstractmethod
    def delete(self, delete_ids: List[str], namespace: str, stream: str) -> None:
        """
        Delete the document chunks belonging to certain record ids from the destination.

        The delete_ids parameter contains a list of record ids - all chunks with a record id in this list should be deleted from the destination.
        All ids belong to the stream and namespace specified in the parameters.
        """

    @abstractmethod
    def check(self) -> Optional[str]:
        """
        Check if the indexer is configured correctly and return an error message if it is not.
        """
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
T = TypeVar("T")


def chunks(iterable: Iterable[T], batch_size: int) -> Generator[Tuple[T, ...], None, None]:
    """
    Break an iterable into consecutive tuples of at most batch_size items.

    The last tuple may be shorter; an exhausted or empty iterable yields nothing.
    """
    source = iter(iterable)
    while True:
        batch = tuple(itertools.islice(source, batch_size))
        if not batch:
            # Nothing left to read from the iterator
            return
        yield batch
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import unittest
|
|
7
|
+
from typing import Any, Dict
|
|
8
|
+
|
|
9
|
+
from airbyte_cdk.models import (
|
|
10
|
+
AirbyteMessage,
|
|
11
|
+
AirbyteRecordMessage,
|
|
12
|
+
AirbyteStateMessage,
|
|
13
|
+
AirbyteStream,
|
|
14
|
+
ConfiguredAirbyteCatalog,
|
|
15
|
+
ConfiguredAirbyteStream,
|
|
16
|
+
DestinationSyncMode,
|
|
17
|
+
SyncMode,
|
|
18
|
+
Type,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class BaseIntegrationTest(unittest.TestCase):
    """
    BaseIntegrationTest is a base class for integration tests for vector db destinations.

    It provides helper methods to create Airbyte catalogs, records and state messages.
    """

    def _get_configured_catalog(
        self, destination_mode: DestinationSyncMode
    ) -> ConfiguredAirbyteCatalog:
        """
        Create a catalog with a single incremental stream named "mystream".

        :param destination_mode: Destination sync mode to configure the stream with
        :return: Catalog whose stream has the columns str_col and int_col and uses int_col as primary key
        """
        stream_schema = {
            "type": "object",
            # JSON Schema names the text type "string"; "str" is not a valid type name
            "properties": {"str_col": {"type": "string"}, "int_col": {"type": "integer"}},
        }

        configured_stream = ConfiguredAirbyteStream(
            stream=AirbyteStream(
                name="mystream",
                json_schema=stream_schema,
                supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh],
            ),
            primary_key=[["int_col"]],
            sync_mode=SyncMode.incremental,
            destination_sync_mode=destination_mode,
        )

        return ConfiguredAirbyteCatalog(streams=[configured_stream])

    def _state(self, data: Dict[str, Any]) -> AirbyteMessage:
        """Wrap the given data in a STATE message."""
        return AirbyteMessage(type=Type.STATE, state=AirbyteStateMessage(data=data))

    def _record(self, stream: str, str_value: str, int_value: int) -> AirbyteMessage:
        """Create a RECORD message for the given stream with the two column values and emitted_at set to 0."""
        return AirbyteMessage(
            type=Type.RECORD,
            record=AirbyteRecordMessage(
                stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0
            ),
        )

    def setUp(self) -> None:
        """Load the connector config from secrets/config.json, relative to the working directory."""
        # Read as UTF-8 explicitly so the result does not depend on the platform default encoding
        with open("secrets/config.json", "r", encoding="utf-8") as f:
            self.config = json.load(f)
|