airbyte-cdk 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +358 -0
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +104 -0
- airbyte_cdk/connector.py +123 -0
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/__init__.py +3 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
- airbyte_cdk/connector_builder/main.py +107 -0
- airbyte_cdk/connector_builder/models.py +73 -0
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +83 -0
- airbyte_cdk/destinations/__init__.py +8 -0
- airbyte_cdk/destinations/destination.py +154 -0
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
- airbyte_cdk/destinations/vector_db_based/config.py +298 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
- airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
- airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
- airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
- airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
- airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
- airbyte_cdk/entrypoint.py +414 -0
- airbyte_cdk/exception_handler.py +56 -0
- airbyte_cdk/logger.py +109 -0
- airbyte_cdk/models/__init__.py +72 -0
- airbyte_cdk/models/airbyte_protocol.py +88 -0
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/well_known_types.py +5 -0
- airbyte_cdk/py.typed +0 -0
- airbyte_cdk/sources/__init__.py +26 -0
- airbyte_cdk/sources/abstract_source.py +326 -0
- airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
- airbyte_cdk/sources/config.py +27 -0
- airbyte_cdk/sources/connector_state_manager.py +161 -0
- airbyte_cdk/sources/declarative/__init__.py +3 -0
- airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
- airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
- airbyte_cdk/sources/declarative/auth/token.py +267 -0
- airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
- airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
- airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
- airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
- airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
- airbyte_cdk/sources/declarative/declarative_source.py +36 -0
- airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
- airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
- airbyte_cdk/sources/declarative/exceptions.py +9 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
- airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
- airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
- airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
- airbyte_cdk/sources/declarative/requesters/README.md +56 -0
- airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
- airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
- airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
- airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
- airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
- airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
- airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
- airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
- airbyte_cdk/sources/declarative/spec/spec.py +48 -0
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
- airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
- airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
- airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
- airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
- airbyte_cdk/sources/declarative/types.py +25 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
- airbyte_cdk/sources/file_based/config/__init__.py +0 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
- airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
- airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
- airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
- airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
- airbyte_cdk/sources/file_based/exceptions.py +159 -0
- airbyte_cdk/sources/file_based/file_based_source.py +466 -0
- airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
- airbyte_cdk/sources/file_based/file_record_data.py +22 -0
- airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
- airbyte_cdk/sources/file_based/remote_file.py +18 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
- airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
- airbyte_cdk/sources/file_based/types.py +10 -0
- airbyte_cdk/sources/http_config.py +10 -0
- airbyte_cdk/sources/http_logger.py +55 -0
- airbyte_cdk/sources/message/__init__.py +19 -0
- airbyte_cdk/sources/message/repository.py +137 -0
- airbyte_cdk/sources/source.py +95 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/__init__.py +8 -0
- airbyte_cdk/sources/streams/availability_strategy.py +84 -0
- airbyte_cdk/sources/streams/call_rate.py +704 -0
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
- airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
- airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
- airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/core.py +703 -0
- airbyte_cdk/sources/streams/http/__init__.py +10 -0
- airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +61 -0
- airbyte_cdk/sources/streams/http/http.py +673 -0
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/sources/streams/utils/__init__.py +3 -0
- airbyte_cdk/sources/types.py +169 -0
- airbyte_cdk/sources/utils/__init__.py +7 -0
- airbyte_cdk/sources/utils/casing.py +12 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +53 -0
- airbyte_cdk/sources/utils/schema_helpers.py +230 -0
- airbyte_cdk/sources/utils/slice_logger.py +57 -0
- airbyte_cdk/sources/utils/transform.py +277 -0
- airbyte_cdk/sources/utils/types.py +7 -0
- airbyte_cdk/sql/__init__.py +0 -0
- airbyte_cdk/sql/_util/__init__.py +0 -0
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/__init__.py +7 -0
- airbyte_cdk/test/catalog_builder.py +81 -0
- airbyte_cdk/test/entrypoint_wrapper.py +250 -0
- airbyte_cdk/test/mock_http/__init__.py +6 -0
- airbyte_cdk/test/mock_http/matcher.py +41 -0
- airbyte_cdk/test/mock_http/mocker.py +185 -0
- airbyte_cdk/test/mock_http/request.py +103 -0
- airbyte_cdk/test/mock_http/response.py +28 -0
- airbyte_cdk/test/mock_http/response_builder.py +237 -0
- airbyte_cdk/test/state_builder.py +33 -0
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +10 -0
- airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
- airbyte_cdk/utils/analytics_message.py +25 -0
- airbyte_cdk/utils/constants.py +5 -0
- airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/event_timing.py +85 -0
- airbyte_cdk/utils/is_cloud_environment.py +18 -0
- airbyte_cdk/utils/mapping_helpers.py +162 -0
- airbyte_cdk/utils/message_utils.py +26 -0
- airbyte_cdk/utils/oneof_option_config.py +33 -0
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +270 -0
- airbyte_cdk/utils/slice_hasher.py +37 -0
- airbyte_cdk/utils/spec_schema_transformations.py +26 -0
- airbyte_cdk/utils/stream_status_utils.py +43 -0
- airbyte_cdk/utils/traced_exception.py +145 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
- airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
- airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
- airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
- airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
"""
|
|
6
|
+
This module defines type aliases utilized in the Airbyte Connector Builder's test reader.
|
|
7
|
+
These aliases streamline type-checking for heterogeneous message groups and schema outputs,
|
|
8
|
+
ensuring consistency throughout the processing of stream data and associated messages.
|
|
9
|
+
|
|
10
|
+
Type Aliases:
|
|
11
|
+
MESSAGE_GROUPS:
|
|
12
|
+
An iterable union of message-like objects which may include:
|
|
13
|
+
- StreamReadSlices: Represents slices used to read data from a stream.
|
|
14
|
+
- AirbyteControlMessage: Represents control commands used in the Airbyte protocol.
|
|
15
|
+
- AirbyteLogMessage: Represents log messages generated by the system.
|
|
16
|
+
- AirbyteTraceMessage: Represents trace messages typically used for debugging.
|
|
17
|
+
- AuxiliaryRequest: Represents any supplementary request issued during processing.
|
|
18
|
+
|
|
19
|
+
INFERRED_SCHEMA_OUTPUT_TYPE:
|
|
20
|
+
A tuple where:
|
|
21
|
+
- The first element is either an InferredSchema instance or None, denoting the inferred JSON schema.
|
|
22
|
+
- The second element is a list of LogMessage instances capturing logs produced during inference.
|
|
23
|
+
|
|
24
|
+
GROUPED_MESSAGES:
|
|
25
|
+
A tuple representing grouped messages divided as follows:
|
|
26
|
+
- A list of StreamReadSlices.
|
|
27
|
+
- A list of LogMessage instances.
|
|
28
|
+
- A list of AuxiliaryRequest instances.
|
|
29
|
+
- An optional AirbyteControlMessage that, if present, governs control flow in message processing.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from typing import Any, Iterable, List
|
|
33
|
+
|
|
34
|
+
from airbyte_cdk.connector_builder.models import (
|
|
35
|
+
AuxiliaryRequest,
|
|
36
|
+
HttpRequest,
|
|
37
|
+
HttpResponse,
|
|
38
|
+
LogMessage,
|
|
39
|
+
StreamReadSlices,
|
|
40
|
+
)
|
|
41
|
+
from airbyte_cdk.models import (
|
|
42
|
+
AirbyteControlMessage,
|
|
43
|
+
AirbyteLogMessage,
|
|
44
|
+
AirbyteTraceMessage,
|
|
45
|
+
)
|
|
46
|
+
from airbyte_cdk.utils.schema_inferrer import (
|
|
47
|
+
InferredSchema,
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
# Iterable of the heterogeneous items handled by the test reader: stream read
# slices, Airbyte control/log/trace messages, and auxiliary requests.
MESSAGE_GROUPS = Iterable[
    StreamReadSlices
    | AirbyteControlMessage
    | AirbyteLogMessage
    | AirbyteTraceMessage
    | AuxiliaryRequest,
]

# 2-tuple: (inferred schema or None, list of LogMessage instances).
INFERRED_SCHEMA_OUTPUT_TYPE = tuple[
    InferredSchema | None,
    List[LogMessage],
]

# 4-tuple: (stream read slices, log messages, auxiliary requests,
# optional AirbyteControlMessage).
GROUPED_MESSAGES = tuple[
    List[StreamReadSlices],
    List[LogMessage],
    List[AuxiliaryRequest],
    AirbyteControlMessage | None,
]

# 5-tuple: (bool flag, optional HttpRequest, optional HttpResponse,
# optional AuxiliaryRequest, optional AirbyteLogMessage).
# NOTE(review): the meaning of the leading bool is not visible in this module;
# confirm against the helper that returns this type.
LOG_MESSAGES_OUTPUT_TYPE = tuple[
    bool,
    HttpRequest | None,
    HttpResponse | None,
    AuxiliaryRequest | None,
    AirbyteLogMessage | None,
]

# String labels for async-related auxiliary request types.
# NOTE(review): presumably compared against an AuxiliaryRequest's type field;
# verify against callers.
ASYNC_AUXILIARY_REQUEST_TYPES = [
    "ASYNC_CREATE",
    "ASYNC_POLL",
    "ASYNC_ABORT",
    "ASYNC_DELETE",
]
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import io
|
|
7
|
+
import logging
|
|
8
|
+
import sys
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
from typing import Any, Iterable, List, Mapping
|
|
11
|
+
|
|
12
|
+
import orjson
|
|
13
|
+
|
|
14
|
+
from airbyte_cdk.connector import Connector
|
|
15
|
+
from airbyte_cdk.exception_handler import init_uncaught_exception_handler
|
|
16
|
+
from airbyte_cdk.models import (
|
|
17
|
+
AirbyteMessage,
|
|
18
|
+
AirbyteMessageSerializer,
|
|
19
|
+
ConfiguredAirbyteCatalog,
|
|
20
|
+
ConfiguredAirbyteCatalogSerializer,
|
|
21
|
+
Type,
|
|
22
|
+
)
|
|
23
|
+
from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit
|
|
24
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
|
25
|
+
|
|
26
|
+
# Module-level logger, obtained under the fixed name "airbyte".
logger = logging.getLogger("airbyte")
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class Destination(Connector, ABC):
|
|
30
|
+
VALID_CMDS = {"spec", "check", "write"}
|
|
31
|
+
|
|
32
|
+
@abstractmethod
def write(
    self,
    config: Mapping[str, Any],
    configured_catalog: ConfiguredAirbyteCatalog,
    input_messages: Iterable[AirbyteMessage],
) -> Iterable[AirbyteMessage]:
    """Implement to define how the connector writes data to the destination.

    :param config: the connector configuration mapping
    :param configured_catalog: the configured catalog for this write
    :param input_messages: the Airbyte messages to be written
    :return: an iterable of Airbyte messages produced by the implementation
    """
|
|
40
|
+
|
|
41
|
+
def _run_check(self, config: Mapping[str, Any]) -> AirbyteMessage:
    """Run the connector's ``check`` and wrap its result in a message.

    :param config: configuration passed to ``self.check`` with the module logger
    :return: an AirbyteMessage of type ``Type.CONNECTION_STATUS`` whose
        ``connectionStatus`` is the value returned by ``self.check``
    """
    return AirbyteMessage(
        type=Type.CONNECTION_STATUS,
        connectionStatus=self.check(logger, config),
    )
|
|
44
|
+
|
|
45
|
+
def _parse_input_stream(self, input_stream: io.TextIOWrapper) -> Iterable[AirbyteMessage]:
    """Lazily turn each line of ``input_stream`` into an Airbyte message.

    Lines that raise ``orjson.JSONDecodeError`` are logged at info level and
    skipped; any other exception propagates to the caller.

    :param input_stream: text stream iterated line by line
    :return: a generator of deserialized AirbyteMessage objects
    """
    for line in input_stream:
        try:
            decoded = orjson.loads(line)
            yield AirbyteMessageSerializer.load(decoded)
        except orjson.JSONDecodeError:
            logger.info(
                f"ignoring input which can't be deserialized as Airbyte Message: {line}"
            )
|
|
54
|
+
|
|
55
|
+
def _run_write(
    self,
    config: Mapping[str, Any],
    configured_catalog_path: str,
    input_stream: io.TextIOWrapper,
) -> Iterable[AirbyteMessage]:
    """Load the configured catalog and stream the output of ``self.write``.

    :param config: connector configuration forwarded to ``self.write``
    :param configured_catalog_path: path of the configured catalog JSON file
    :param input_stream: text stream whose lines are parsed into Airbyte messages
    :return: a generator yielding every message produced by ``self.write``
    """
    # Read the catalog inside a context manager so the file handle is closed
    # right away instead of being left for the garbage collector.
    with open(configured_catalog_path) as catalog_file:
        raw_catalog = catalog_file.read()
    catalog = ConfiguredAirbyteCatalogSerializer.load(orjson.loads(raw_catalog))

    # Parsing is lazy: input lines are only consumed as ``write`` iterates.
    input_messages = self._parse_input_stream(input_stream)
    logger.info("Begin writing to the destination...")
    yield from self.write(
        config=config, configured_catalog=catalog, input_messages=input_messages
    )
    logger.info("Writing complete.")
|
|
70
|
+
|
|
71
|
+
def parse_args(self, args: List[str]) -> argparse.Namespace:
|
|
72
|
+
"""
|
|
73
|
+
:param args: commandline arguments
|
|
74
|
+
:return:
|
|
75
|
+
"""
|
|
76
|
+
|
|
77
|
+
parent_parser = argparse.ArgumentParser(add_help=False)
|
|
78
|
+
main_parser = argparse.ArgumentParser()
|
|
79
|
+
subparsers = main_parser.add_subparsers(title="commands", dest="command")
|
|
80
|
+
|
|
81
|
+
# spec
|
|
82
|
+
subparsers.add_parser(
|
|
83
|
+
"spec", help="outputs the json configuration specification", parents=[parent_parser]
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
# check
|
|
87
|
+
check_parser = subparsers.add_parser(
|
|
88
|
+
"check", help="checks the config can be used to connect", parents=[parent_parser]
|
|
89
|
+
)
|
|
90
|
+
required_check_parser = check_parser.add_argument_group("required named arguments")
|
|
91
|
+
required_check_parser.add_argument(
|
|
92
|
+
"--config", type=str, required=True, help="path to the json configuration file"
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
# write
|
|
96
|
+
write_parser = subparsers.add_parser(
|
|
97
|
+
"write", help="Writes data to the destination", parents=[parent_parser]
|
|
98
|
+
)
|
|
99
|
+
write_required = write_parser.add_argument_group("required named arguments")
|
|
100
|
+
write_required.add_argument(
|
|
101
|
+
"--config", type=str, required=True, help="path to the JSON configuration file"
|
|
102
|
+
)
|
|
103
|
+
write_required.add_argument(
|
|
104
|
+
"--catalog", type=str, required=True, help="path to the configured catalog JSON file"
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
parsed_args = main_parser.parse_args(args)
|
|
108
|
+
cmd = parsed_args.command
|
|
109
|
+
if not cmd:
|
|
110
|
+
raise Exception("No command entered. ")
|
|
111
|
+
elif cmd not in ["spec", "check", "write"]:
|
|
112
|
+
# This is technically dead code since parse_args() would fail if this was the case
|
|
113
|
+
# But it's non-obvious enough to warrant placing it here anyways
|
|
114
|
+
raise Exception(f"Unknown command entered: {cmd}")
|
|
115
|
+
|
|
116
|
+
return parsed_args
|
|
117
|
+
|
|
118
|
+
def run_cmd(self, parsed_args: argparse.Namespace) -> Iterable[AirbyteMessage]:
|
|
119
|
+
cmd = parsed_args.command
|
|
120
|
+
if cmd not in self.VALID_CMDS:
|
|
121
|
+
raise Exception(f"Unrecognized command: {cmd}")
|
|
122
|
+
|
|
123
|
+
spec = self.spec(logger)
|
|
124
|
+
if cmd == "spec":
|
|
125
|
+
yield AirbyteMessage(type=Type.SPEC, spec=spec)
|
|
126
|
+
return
|
|
127
|
+
config = self.read_config(config_path=parsed_args.config)
|
|
128
|
+
if self.check_config_against_spec or cmd == "check":
|
|
129
|
+
try:
|
|
130
|
+
check_config_against_spec_or_exit(config, spec)
|
|
131
|
+
except AirbyteTracedException as traced_exc:
|
|
132
|
+
connection_status = traced_exc.as_connection_status_message()
|
|
133
|
+
if connection_status and cmd == "check":
|
|
134
|
+
yield connection_status
|
|
135
|
+
return
|
|
136
|
+
raise traced_exc
|
|
137
|
+
|
|
138
|
+
if cmd == "check":
|
|
139
|
+
yield self._run_check(config=config)
|
|
140
|
+
elif cmd == "write":
|
|
141
|
+
# Wrap in UTF-8 to override any other input encodings
|
|
142
|
+
wrapped_stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf-8")
|
|
143
|
+
yield from self._run_write(
|
|
144
|
+
config=config,
|
|
145
|
+
configured_catalog_path=parsed_args.catalog,
|
|
146
|
+
input_stream=wrapped_stdin,
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
def run(self, args: List[str]) -> None:
|
|
150
|
+
init_uncaught_exception_handler(logger)
|
|
151
|
+
parsed_args = self.parse_args(args)
|
|
152
|
+
output_messages = self.run_cmd(parsed_args)
|
|
153
|
+
for message in output_messages:
|
|
154
|
+
print(orjson.dumps(AirbyteMessageSerializer.dump(message)).decode())
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# Vector DB based destinations
|
|
2
|
+
|
|
3
|
+
## Note: All helpers in this directory are experimental and subject to change
|
|
4
|
+
|
|
5
|
+
This directory contains several helpers that can be used to create a destination that processes and chunks records, embeds their text part and loads them into a vector database.
|
|
6
|
+
The specific loading behavior is defined by the destination connector itself, but chunking and embedding behavior is handled by the helpers.
|
|
7
|
+
|
|
8
|
+
To use these helpers, install the CDK with the `vector-db-based` extra:
|
|
9
|
+
|
|
10
|
+
```bash
|
|
11
|
+
pip install airbyte-cdk[vector-db-based]
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
The helpers can be used in the following way:
|
|
15
|
+
|
|
16
|
+
- Add the config models to the spec of the connector
|
|
17
|
+
- Implement the `Indexer` interface for your specific database
|
|
18
|
+
- In the check implementation of the destination, initialize the indexer and the embedder and call `check` on them
|
|
19
|
+
- In the write implementation of the destination, initialize the indexer and the embedder and pass them to a new instance of the writer. Then call the writer's `write` method with the iterable of incoming messages
|
|
20
|
+
|
|
21
|
+
If there are no connector-specific embedders, the `airbyte_cdk.destinations.vector_db_based.embedder.create_from_config` function can be used to get an embedder instance from the config.
|
|
22
|
+
|
|
23
|
+
This is how the components interact:
|
|
24
|
+
|
|
25
|
+
```text
|
|
26
|
+
┌─────────────┐
|
|
27
|
+
│MyDestination│
|
|
28
|
+
└┬────────────┘
|
|
29
|
+
┌▽───────────────────────────────┐
|
|
30
|
+
│Writer │
|
|
31
|
+
└┬─────────┬──────────┬──────────┘
|
|
32
|
+
┌▽───────┐┌▽────────┐┌▽────────────────┐
|
|
33
|
+
│Embedder││MyIndexer││DocumentProcessor│
|
|
34
|
+
└────────┘└─────────┘└─────────────────┘
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Normally, only the `MyDestination` class and the `MyIndexer` class have to be implemented specifically for the destination. The other classes are provided as-is by the helpers.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
from .config import (
    AzureOpenAIEmbeddingConfigModel,
    CohereEmbeddingConfigModel,
    FakeEmbeddingConfigModel,
    FromFieldEmbeddingConfigModel,
    OpenAICompatibleEmbeddingConfigModel,
    OpenAIEmbeddingConfigModel,
    ProcessingConfigModel,
)
from .document_processor import Chunk, DocumentProcessor

# AzureOpenAIEmbedder, FromFieldEmbedder and OpenAICompatibleEmbedder are listed
# in __all__ below, so they have to be bound in this namespace as well.
# NOTE(review): they are expected to be defined in .embedder next to the other
# embedders - confirm against that module.
from .embedder import (
    AzureOpenAIEmbedder,
    CohereEmbedder,
    Embedder,
    FakeEmbedder,
    FromFieldEmbedder,
    OpenAICompatibleEmbedder,
    OpenAIEmbedder,
)
from .indexer import Indexer
from .writer import Writer

# Public API of the package. Every entry must name something imported above;
# otherwise a star-import of this package fails with AttributeError.
__all__ = [
    "AzureOpenAIEmbedder",
    "AzureOpenAIEmbeddingConfigModel",
    "Chunk",
    "CohereEmbedder",
    "CohereEmbeddingConfigModel",
    "DocumentProcessor",
    "Embedder",
    "FakeEmbedder",
    "FakeEmbeddingConfigModel",
    "FromFieldEmbedder",
    "FromFieldEmbeddingConfigModel",
    "Indexer",
    "OpenAICompatibleEmbedder",
    "OpenAICompatibleEmbeddingConfigModel",
    "OpenAIEmbedder",
    "OpenAIEmbeddingConfigModel",
    "ProcessingConfigModel",
    "Writer",
]
|
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
6
|
+
|
|
7
|
+
import dpath
|
|
8
|
+
from pydantic.v1 import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
|
11
|
+
from airbyte_cdk.utils.spec_schema_transformations import resolve_refs
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SeparatorSplitterConfigModel(BaseModel):
    # Text-splitter option identified by mode == "separator".

    # Constant tag used to tell this option apart from the other splitters.
    mode: Literal["separator"] = Field(default="separator", const=True)

    # Separator strings; each entry carries its own surrounding double quotes.
    separators: List[str] = Field(
        title="Separators",
        description='List of separator strings to split text fields by. The separator itself needs to be wrapped in double quotes, e.g. to split by the dot character, use ".". To split by a newline, use "\\n".',
        default=['"\\n\\n"', '"\\n"', '" "', '""'],
    )

    # Whether the matched separator stays in the produced chunks.
    keep_separator: bool = Field(
        title="Keep separator",
        description="Whether to keep the separator in the resulting chunks",
        default=False,
    )

    class Config(OneOfOptionConfig):
        discriminator = "mode"
        title = "By Separator"
        description = "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class MarkdownHeaderSplitterConfigModel(BaseModel):
    # Text-splitter option identified by mode == "markdown".

    # Constant tag used to tell this option apart from the other splitters.
    mode: Literal["markdown"] = Field(default="markdown", const=True)

    # Header depth used as split point; constrained to the range 1..6.
    split_level: int = Field(
        title="Split level",
        description="Level of markdown headers to split text fields by. Headings down to the specified level will be used as split points",
        default=1,
        ge=1,
        le=6,
    )

    class Config(OneOfOptionConfig):
        discriminator = "mode"
        title = "By Markdown header"
        description = "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class CodeSplitterConfigModel(BaseModel):
    # Text-splitter option identified by mode == "code".

    # Constant tag used to tell this option apart from the other splitters.
    mode: Literal["code"] = Field(default="code", const=True)

    # Language identifier; the allowed values are advertised through `enum`.
    language: str = Field(
        description="Split code in suitable places based on the programming language",
        title="Language",
        enum=[
            "cpp", "go", "java", "js",
            "php", "proto", "python", "rst",
            "ruby", "rust", "scala", "swift",
            "markdown", "latex", "html", "sol",
        ],
    )

    class Config(OneOfOptionConfig):
        discriminator = "mode"
        title = "By Programming Language"
        description = "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# Union of every splitter option; the member order is part of the definition.
TextSplitterConfigModel = Union[
    SeparatorSplitterConfigModel,
    MarkdownHeaderSplitterConfigModel,
    CodeSplitterConfigModel,
]
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class FieldNameMappingConfigModel(BaseModel):
    # One rename rule: a source field name and the name to use in the destination.

    from_field: str = Field(
        description="The field name in the source",
        title="From field name",
    )
    to_field: str = Field(
        description="The field name to use in the destination",
        title="To field name",
    )
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class ProcessingConfigModel(BaseModel):
    # Settings for chunking records and selecting text/metadata fields.
    # The schema is tagged with the "processing" group through Config.schema_extra.

    # Required; `maximum`/`minimum` are passed as extra schema keywords.
    chunk_size: int = Field(
        default=...,
        title="Chunk size",
        description="Size of chunks in tokens to store in vector store (make sure it is not too big for the context if your LLM)",
        maximum=8191,
        minimum=1,
    )

    chunk_overlap: int = Field(
        0,
        title="Chunk overlap",
        description="Size of overlap between chunks in tokens to store in vector store to better capture relevant context",
    )

    # Field paths whose content is embedded; defaults to an empty list.
    text_fields: Optional[List[str]] = Field(
        [],
        title="Text fields to embed",
        description="List of fields in the record that should be used to calculate the embedding. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered text fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array.",
        always_show=True,
        examples=["text", "user.name", "users.*.name"],
    )

    # Field paths stored as metadata; defaults to an empty list.
    metadata_fields: Optional[List[str]] = Field(
        [],
        title="Fields to store as metadata",
        description="List of fields in the record that should be stored as metadata. The field list is applied to all streams in the same way and non-existing fields are ignored. If none are defined, all fields are considered metadata fields. When specifying text fields, you can access nested fields in the record by using dot notation, e.g. `user.name` will access the `name` field in the `user` object. It's also possible to use wildcards to access all fields in an object, e.g. `users.*.name` will access all `names` fields in all entries of the `users` array. When specifying nested paths, all matching values are flattened into an array set to a field named by the path.",
        always_show=True,
        examples=["age", "user", "user.name"],
    )

    # Optional splitter choice, discriminated by each option's `mode` field.
    text_splitter: TextSplitterConfigModel = Field(
        title="Text splitter",
        description="Split text fields into chunks based on the specified method.",
        default=None,
        discriminator="mode",
        type="object",
    )

    field_name_mappings: Optional[List[FieldNameMappingConfigModel]] = Field(
        [],
        title="Field name mappings",
        description="List of fields to rename. Not applicable for nested fields, but can be used to rename fields already flattened via dot notation.",
    )

    class Config:
        schema_extra = {"group": "processing"}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class OpenAIEmbeddingConfigModel(BaseModel):
    # Embedding option identified by mode == "openai".

    # Constant tag used to tell this option apart from the other embedders.
    mode: Literal["openai"] = Field(default="openai", const=True)

    # Required API key; flagged with airbyte_secret in the schema.
    openai_key: str = Field(default=..., title="OpenAI API key", airbyte_secret=True)

    class Config(OneOfOptionConfig):
        discriminator = "mode"
        title = "OpenAI"
        description = "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class OpenAICompatibleEmbeddingConfigModel(BaseModel):
    # Embedding option identified by mode == "openai_compatible".

    # Constant tag used to tell this option apart from the other embedders.
    mode: Literal["openai_compatible"] = Field(default="openai_compatible", const=True)

    # Defaults to an empty string; flagged with airbyte_secret in the schema.
    api_key: str = Field("", title="API key", airbyte_secret=True)

    base_url: str = Field(
        default=...,
        title="Base URL",
        description="The base URL for your OpenAI-compatible service",
        examples=["https://your-service-name.com"],
    )

    model_name: str = Field(
        "text-embedding-ada-002",
        title="Model name",
        description="The name of the model to use for embedding",
        examples=["text-embedding-ada-002"],
    )

    # No default is given for the number of embedding dimensions.
    dimensions: int = Field(
        description="The number of dimensions the embedding model is generating",
        title="Embedding dimensions",
        examples=[1536, 384],
    )

    class Config(OneOfOptionConfig):
        discriminator = "mode"
        title = "OpenAI-compatible"
        description = "Use a service that's compatible with the OpenAI API to embed text."
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class AzureOpenAIEmbeddingConfigModel(BaseModel):
    # Embedding option identified by mode == "azure_openai".

    # Constant tag used to tell this option apart from the other embedders.
    mode: Literal["azure_openai"] = Field(default="azure_openai", const=True)

    # Required API key; flagged with airbyte_secret in the schema.
    openai_key: str = Field(
        default=...,
        title="Azure OpenAI API key",
        description="The API key for your Azure OpenAI resource. You can find this in the Azure portal under your Azure OpenAI resource",
        airbyte_secret=True,
    )

    api_base: str = Field(
        default=...,
        title="Resource base URL",
        description="The base URL for your Azure OpenAI resource. You can find this in the Azure portal under your Azure OpenAI resource",
        examples=["https://your-resource-name.openai.azure.com"],
    )

    deployment: str = Field(
        default=...,
        title="Deployment",
        description="The deployment for your Azure OpenAI resource. You can find this in the Azure portal under your Azure OpenAI resource",
        examples=["your-resource-name"],
    )

    class Config(OneOfOptionConfig):
        discriminator = "mode"
        title = "Azure OpenAI"
        description = "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
class FakeEmbeddingConfigModel(BaseModel):
    # Embedding option identified by mode == "fake"; it has no further settings.

    mode: Literal["fake"] = Field(default="fake", const=True)

    class Config(OneOfOptionConfig):
        discriminator = "mode"
        title = "Fake"
        description = "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
class FromFieldEmbeddingConfigModel(BaseModel):
    # Embedding option identified by mode == "from_field".

    # Constant tag used to tell this option apart from the other embedders.
    mode: Literal["from_field"] = Field(default="from_field", const=True)

    # Required name of the record field that holds the embedding.
    field_name: str = Field(
        default=...,
        title="Field name",
        description="Name of the field in the record that contains the embedding",
        examples=["embedding", "vector"],
    )

    # Required number of embedding dimensions.
    dimensions: int = Field(
        default=...,
        title="Embedding dimensions",
        description="The number of dimensions the embedding model is generating",
        examples=[1536, 384],
    )

    class Config(OneOfOptionConfig):
        discriminator = "mode"
        title = "From Field"
        description = "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class CohereEmbeddingConfigModel(BaseModel):
    # Embedding option identified by mode == "cohere".

    # Constant tag used to tell this option apart from the other embedders.
    mode: Literal["cohere"] = Field(default="cohere", const=True)

    # Required API key; flagged with airbyte_secret in the schema.
    cohere_key: str = Field(default=..., title="Cohere API key", airbyte_secret=True)

    class Config(OneOfOptionConfig):
        discriminator = "mode"
        title = "Cohere"
        description = "Use the Cohere API to embed text."
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
class VectorDBConfigModel(BaseModel):
    """
    The configuration model for the Vector DB based destinations. This model is used to generate the UI for the destination configuration,
    as well as to provide type safety for the configuration passed to the destination.

    The configuration model is composed of four parts:
        * Processing configuration
        * Embedding configuration
        * Indexing configuration
        * Advanced configuration

    Processing, embedding and advanced configuration are provided by this base class, while the indexing configuration is provided by the destination connector in the sub class.
    """

    # Required embedder choice, discriminated by each option's `mode` field.
    embedding: Union[
        OpenAIEmbeddingConfigModel,
        CohereEmbeddingConfigModel,
        FakeEmbeddingConfigModel,
        AzureOpenAIEmbeddingConfigModel,
        OpenAICompatibleEmbeddingConfigModel,
    ] = Field(
        ...,
        title="Embedding",
        description="Embedding configuration",
        discriminator="mode",
        group="embedding",
        type="object",
    )
    # Required chunking / field-selection settings.
    processing: ProcessingConfigModel
    # Advanced flag, off by default.
    omit_raw_text: bool = Field(
        default=False,
        title="Do not store raw text",
        group="advanced",
        description="Do not store the text that gets embedded along with the vector and the metadata in the destination. If set to true, only the vector and the metadata will be stored - in this case raw text for LLM use cases needs to be retrieved from another source.",
    )

    class Config:
        title = "Destination Config"
        # Declares the group ids that individual fields refer to via `group=`.
        schema_extra = {
            "groups": [
                {"id": "processing", "title": "Processing"},
                {"id": "embedding", "title": "Embedding"},
                {"id": "indexing", "title": "Indexing"},
                {"id": "advanced", "title": "Advanced"},
            ]
        }

    @staticmethod
    def remove_discriminator(schema: Dict[str, Any]) -> None:
        """pydantic adds "discriminator" to the schema for oneOfs, which is not treated right by the platform as we inline all references"""
        # Mutates `schema` in place: drops every "discriminator" key below "properties".
        dpath.delete(schema, "properties/**/discriminator")

    @classmethod
    def schema(cls, by_alias: bool = True, ref_template: str = "") -> Dict[str, Any]:
        """we're overriding the schema classmethod to enable some post-processing

        :param by_alias: forwarded to the parent ``schema()``; selects whether
            field aliases are used as property names
        :param ref_template: accepted for signature compatibility and not used;
            references are inlined by ``resolve_refs`` after generation
        :return: the schema with references resolved and discriminators removed
        """
        # `by_alias` used to be dropped silently; pass it on so callers get what
        # they asked for. `ref_template` stays unforwarded: its default here is an
        # empty string, which is not a usable template.
        # NOTE(review): presumably resolve_refs relies on the parent's default
        # reference layout - confirm before ever forwarding ref_template.
        schema: Dict[str, Any] = super().schema(by_alias=by_alias)
        schema = resolve_refs(schema)
        cls.remove_discriminator(schema)
        return schema
|