airbyte-cdk 0.72.0__py3-none-any.whl → 6.13.1.dev4106__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/__init__.py +355 -6
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +29 -10
- airbyte_cdk/connector.py +24 -24
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
- airbyte_cdk/connector_builder/main.py +45 -13
- airbyte_cdk/connector_builder/message_grouper.py +189 -50
- airbyte_cdk/connector_builder/models.py +3 -2
- airbyte_cdk/destinations/__init__.py +4 -3
- airbyte_cdk/destinations/destination.py +54 -20
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/config.py +40 -17
- airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
- airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
- airbyte_cdk/entrypoint.py +153 -44
- airbyte_cdk/exception_handler.py +21 -3
- airbyte_cdk/logger.py +30 -44
- airbyte_cdk/models/__init__.py +13 -2
- airbyte_cdk/models/airbyte_protocol.py +86 -1
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/file_transfer_record_message.py +13 -0
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/__init__.py +5 -1
- airbyte_cdk/sources/abstract_source.py +125 -79
- airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
- airbyte_cdk/sources/config.py +3 -2
- airbyte_cdk/sources/connector_state_manager.py +49 -83
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
- airbyte_cdk/sources/declarative/auth/token.py +28 -10
- airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
- airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1213 -88
- airbyte_cdk/sources/declarative/declarative_source.py +5 -2
- airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
- airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
- airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
- airbyte_cdk/sources/declarative/extractors/record_filter.py +65 -8
- airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +25 -3
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +159 -74
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
- airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
- airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +1 -1
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1329 -595
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1699 -226
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
- airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +228 -72
- airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
- airbyte_cdk/sources/declarative/spec/spec.py +12 -5
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
- airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
- airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
- airbyte_cdk/sources/declarative/types.py +19 -110
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
- airbyte_cdk/sources/embedded/base_integration.py +16 -5
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +5 -2
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +58 -10
- airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
- airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
- airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
- airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
- airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +52 -15
- airbyte_cdk/sources/file_based/file_based_source.py +163 -33
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +83 -5
- airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +145 -41
- airbyte_cdk/sources/file_based/remote_file.py +1 -1
- airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +175 -45
- airbyte_cdk/sources/http_logger.py +8 -3
- airbyte_cdk/sources/message/__init__.py +7 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +42 -38
- airbyte_cdk/sources/streams/__init__.py +2 -2
- airbyte_cdk/sources/streams/availability_strategy.py +54 -3
- airbyte_cdk/sources/streams/call_rate.py +64 -21
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
- airbyte_cdk/sources/streams/concurrent/cursor.py +298 -42
- airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
- airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
- airbyte_cdk/sources/streams/core.py +412 -87
- airbyte_cdk/sources/streams/http/__init__.py +2 -1
- airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +27 -7
- airbyte_cdk/sources/streams/http/http.py +369 -246
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +154 -0
- airbyte_cdk/sources/utils/record_helper.py +36 -21
- airbyte_cdk/sources/utils/schema_helpers.py +13 -6
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +54 -20
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/catalog_builder.py +70 -18
- airbyte_cdk/test/entrypoint_wrapper.py +117 -42
- airbyte_cdk/test/mock_http/__init__.py +1 -1
- airbyte_cdk/test/mock_http/matcher.py +6 -0
- airbyte_cdk/test/mock_http/mocker.py +57 -10
- airbyte_cdk/test/mock_http/request.py +19 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +32 -16
- airbyte_cdk/test/state_builder.py +18 -10
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +2 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/event_timing.py +10 -10
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +20 -11
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +198 -28
- airbyte_cdk/utils/slice_hasher.py +30 -0
- airbyte_cdk/utils/spec_schema_transformations.py +6 -3
- airbyte_cdk/utils/stream_status_utils.py +8 -1
- airbyte_cdk/utils/traced_exception.py +61 -21
- airbyte_cdk-6.13.1.dev4106.dist-info/METADATA +109 -0
- airbyte_cdk-6.13.1.dev4106.dist-info/RECORD +349 -0
- {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/WHEEL +1 -2
- airbyte_cdk-6.13.1.dev4106.dist-info/entry_points.txt +3 -0
- airbyte_cdk/sources/declarative/create_partial.py +0 -92
- airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
- airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
- airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
- airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
- airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
- airbyte_cdk/sources/deprecated/base_source.py +0 -94
- airbyte_cdk/sources/deprecated/client.py +0 -99
- airbyte_cdk/sources/singer/__init__.py +0 -8
- airbyte_cdk/sources/singer/singer_helpers.py +0 -304
- airbyte_cdk/sources/singer/source.py +0 -186
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
- airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
- airbyte_cdk/sources/streams/http/auth/core.py +0 -29
- airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
- airbyte_cdk/sources/streams/http/auth/token.py +0 -47
- airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
- airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
- airbyte_cdk/sources/utils/schema_models.py +0 -84
- airbyte_cdk-0.72.0.dist-info/METADATA +0 -243
- airbyte_cdk-0.72.0.dist-info/RECORD +0 -466
- airbyte_cdk-0.72.0.dist-info/top_level.txt +0 -3
- source_declarative_manifest/main.py +0 -29
- unit_tests/connector_builder/__init__.py +0 -3
- unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
- unit_tests/connector_builder/test_message_grouper.py +0 -713
- unit_tests/connector_builder/utils.py +0 -27
- unit_tests/destinations/test_destination.py +0 -243
- unit_tests/singer/test_singer_helpers.py +0 -56
- unit_tests/singer/test_singer_source.py +0 -112
- unit_tests/sources/__init__.py +0 -0
- unit_tests/sources/concurrent_source/__init__.py +0 -3
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
- unit_tests/sources/declarative/__init__.py +0 -3
- unit_tests/sources/declarative/auth/__init__.py +0 -3
- unit_tests/sources/declarative/auth/test_oauth.py +0 -331
- unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
- unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
- unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
- unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
- unit_tests/sources/declarative/checks/__init__.py +0 -3
- unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
- unit_tests/sources/declarative/decoders/__init__.py +0 -0
- unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
- unit_tests/sources/declarative/external_component.py +0 -13
- unit_tests/sources/declarative/extractors/__init__.py +0 -3
- unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
- unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
- unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
- unit_tests/sources/declarative/incremental/__init__.py +0 -0
- unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
- unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
- unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
- unit_tests/sources/declarative/interpolation/__init__.py +0 -3
- unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
- unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
- unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
- unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
- unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
- unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
- unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
- unit_tests/sources/declarative/parsers/__init__.py +0 -3
- unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
- unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
- unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1841
- unit_tests/sources/declarative/parsers/testing_components.py +0 -36
- unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
- unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
- unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
- unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
- unit_tests/sources/declarative/requesters/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
- unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
- unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
- unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
- unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
- unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
- unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
- unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
- unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
- unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
- unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
- unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
- unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
- unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
- unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
- unit_tests/sources/declarative/retrievers/__init__.py +0 -3
- unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
- unit_tests/sources/declarative/schema/__init__.py +0 -6
- unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
- unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
- unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
- unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
- unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
- unit_tests/sources/declarative/states/__init__.py +0 -3
- unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
- unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
- unit_tests/sources/declarative/test_create_partial.py +0 -83
- unit_tests/sources/declarative/test_declarative_stream.py +0 -103
- unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
- unit_tests/sources/declarative/test_types.py +0 -39
- unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
- unit_tests/sources/file_based/__init__.py +0 -0
- unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
- unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
- unit_tests/sources/file_based/config/__init__.py +0 -0
- unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
- unit_tests/sources/file_based/config/test_csv_format.py +0 -34
- unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
- unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
- unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
- unit_tests/sources/file_based/file_types/__init__.py +0 -0
- unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
- unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
- unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
- unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
- unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
- unit_tests/sources/file_based/helpers.py +0 -70
- unit_tests/sources/file_based/in_memory_files_source.py +0 -211
- unit_tests/sources/file_based/scenarios/__init__.py +0 -0
- unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
- unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
- unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
- unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
- unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
- unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
- unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
- unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
- unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
- unit_tests/sources/file_based/stream/__init__.py +0 -0
- unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
- unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
- unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
- unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
- unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
- unit_tests/sources/file_based/test_scenarios.py +0 -253
- unit_tests/sources/file_based/test_schema_helpers.py +0 -346
- unit_tests/sources/fixtures/__init__.py +0 -3
- unit_tests/sources/fixtures/source_test_fixture.py +0 -153
- unit_tests/sources/message/__init__.py +0 -0
- unit_tests/sources/message/test_repository.py +0 -153
- unit_tests/sources/streams/__init__.py +0 -0
- unit_tests/sources/streams/concurrent/__init__.py +0 -3
- unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
- unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
- unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
- unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
- unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
- unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
- unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
- unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
- unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
- unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
- unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
- unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
- unit_tests/sources/streams/http/__init__.py +0 -0
- unit_tests/sources/streams/http/auth/__init__.py +0 -0
- unit_tests/sources/streams/http/auth/test_auth.py +0 -173
- unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
- unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
- unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
- unit_tests/sources/streams/http/test_http.py +0 -635
- unit_tests/sources/streams/test_availability_strategy.py +0 -70
- unit_tests/sources/streams/test_call_rate.py +0 -300
- unit_tests/sources/streams/test_stream_read.py +0 -405
- unit_tests/sources/streams/test_streams_core.py +0 -184
- unit_tests/sources/test_abstract_source.py +0 -1442
- unit_tests/sources/test_concurrent_source.py +0 -112
- unit_tests/sources/test_config.py +0 -92
- unit_tests/sources/test_connector_state_manager.py +0 -482
- unit_tests/sources/test_http_logger.py +0 -252
- unit_tests/sources/test_integration_source.py +0 -86
- unit_tests/sources/test_source.py +0 -684
- unit_tests/sources/test_source_read.py +0 -460
- unit_tests/test/__init__.py +0 -0
- unit_tests/test/mock_http/__init__.py +0 -0
- unit_tests/test/mock_http/test_matcher.py +0 -53
- unit_tests/test/mock_http/test_mocker.py +0 -214
- unit_tests/test/mock_http/test_request.py +0 -117
- unit_tests/test/mock_http/test_response_builder.py +0 -177
- unit_tests/test/test_entrypoint_wrapper.py +0 -240
- unit_tests/utils/__init__.py +0 -0
- unit_tests/utils/test_datetime_format_inferrer.py +0 -60
- unit_tests/utils/test_mapping_helpers.py +0 -54
- unit_tests/utils/test_message_utils.py +0 -91
- unit_tests/utils/test_rate_limiting.py +0 -26
- unit_tests/utils/test_schema_inferrer.py +0 -202
- unit_tests/utils/test_secret_utils.py +0 -135
- unit_tests/utils/test_stream_status_utils.py +0 -61
- unit_tests/utils/test_traced_exception.py +0 -107
- /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
- {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
- {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
- {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
- {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/LICENSE.txt +0 -0
@@ -4,10 +4,11 @@
|
|
4
4
|
|
5
5
|
from typing import Any, Dict, List, Literal, Optional, Union
|
6
6
|
|
7
|
-
import dpath
|
7
|
+
import dpath
|
8
|
+
from pydantic.v1 import BaseModel, Field
|
9
|
+
|
8
10
|
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
|
9
11
|
from airbyte_cdk.utils.spec_schema_transformations import resolve_refs
|
10
|
-
from pydantic import BaseModel, Field
|
11
12
|
|
12
13
|
|
13
14
|
class SeparatorSplitterConfigModel(BaseModel):
|
@@ -17,7 +18,11 @@ class SeparatorSplitterConfigModel(BaseModel):
|
|
17
18
|
title="Separators",
|
18
19
|
description='List of separator strings to split text fields by. The separator itself needs to be wrapped in double quotes, e.g. to split by the dot character, use ".". To split by a newline, use "\\n".',
|
19
20
|
)
|
20
|
-
keep_separator: bool = Field(
|
21
|
+
keep_separator: bool = Field(
|
22
|
+
default=False,
|
23
|
+
title="Keep separator",
|
24
|
+
description="Whether to keep the separator in the resulting chunks",
|
25
|
+
)
|
21
26
|
|
22
27
|
class Config(OneOfOptionConfig):
|
23
28
|
title = "By Separator"
|
@@ -68,18 +73,20 @@ class CodeSplitterConfigModel(BaseModel):
|
|
68
73
|
|
69
74
|
class Config(OneOfOptionConfig):
|
70
75
|
title = "By Programming Language"
|
71
|
-
description =
|
72
|
-
"Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
|
73
|
-
)
|
76
|
+
description = "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
|
74
77
|
discriminator = "mode"
|
75
78
|
|
76
79
|
|
77
|
-
TextSplitterConfigModel = Union[
|
80
|
+
TextSplitterConfigModel = Union[
|
81
|
+
SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel
|
82
|
+
]
|
78
83
|
|
79
84
|
|
80
85
|
class FieldNameMappingConfigModel(BaseModel):
|
81
86
|
from_field: str = Field(title="From field name", description="The field name in the source")
|
82
|
-
to_field: str = Field(
|
87
|
+
to_field: str = Field(
|
88
|
+
title="To field name", description="The field name to use in the destination"
|
89
|
+
)
|
83
90
|
|
84
91
|
|
85
92
|
class ProcessingConfigModel(BaseModel):
|
@@ -132,9 +139,7 @@ class OpenAIEmbeddingConfigModel(BaseModel):
|
|
132
139
|
|
133
140
|
class Config(OneOfOptionConfig):
|
134
141
|
title = "OpenAI"
|
135
|
-
description =
|
136
|
-
"Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
|
137
|
-
)
|
142
|
+
description = "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
|
138
143
|
discriminator = "mode"
|
139
144
|
|
140
145
|
|
@@ -142,7 +147,10 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
|
|
142
147
|
mode: Literal["openai_compatible"] = Field("openai_compatible", const=True)
|
143
148
|
api_key: str = Field(title="API key", default="", airbyte_secret=True)
|
144
149
|
base_url: str = Field(
|
145
|
-
...,
|
150
|
+
...,
|
151
|
+
title="Base URL",
|
152
|
+
description="The base URL for your OpenAI-compatible service",
|
153
|
+
examples=["https://your-service-name.com"],
|
146
154
|
)
|
147
155
|
model_name: str = Field(
|
148
156
|
title="Model name",
|
@@ -151,7 +159,9 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
|
|
151
159
|
examples=["text-embedding-ada-002"],
|
152
160
|
)
|
153
161
|
dimensions: int = Field(
|
154
|
-
title="Embedding dimensions",
|
162
|
+
title="Embedding dimensions",
|
163
|
+
description="The number of dimensions the embedding model is generating",
|
164
|
+
examples=[1536, 384],
|
155
165
|
)
|
156
166
|
|
157
167
|
class Config(OneOfOptionConfig):
|
@@ -199,10 +209,16 @@ class FakeEmbeddingConfigModel(BaseModel):
|
|
199
209
|
class FromFieldEmbeddingConfigModel(BaseModel):
|
200
210
|
mode: Literal["from_field"] = Field("from_field", const=True)
|
201
211
|
field_name: str = Field(
|
202
|
-
...,
|
212
|
+
...,
|
213
|
+
title="Field name",
|
214
|
+
description="Name of the field in the record that contains the embedding",
|
215
|
+
examples=["embedding", "vector"],
|
203
216
|
)
|
204
217
|
dimensions: int = Field(
|
205
|
-
...,
|
218
|
+
...,
|
219
|
+
title="Embedding dimensions",
|
220
|
+
description="The number of dimensions the embedding model is generating",
|
221
|
+
examples=[1536, 384],
|
206
222
|
)
|
207
223
|
|
208
224
|
class Config(OneOfOptionConfig):
|
@@ -241,7 +257,14 @@ class VectorDBConfigModel(BaseModel):
|
|
241
257
|
FakeEmbeddingConfigModel,
|
242
258
|
AzureOpenAIEmbeddingConfigModel,
|
243
259
|
OpenAICompatibleEmbeddingConfigModel,
|
244
|
-
] = Field(
|
260
|
+
] = Field(
|
261
|
+
...,
|
262
|
+
title="Embedding",
|
263
|
+
description="Embedding configuration",
|
264
|
+
discriminator="mode",
|
265
|
+
group="embedding",
|
266
|
+
type="object",
|
267
|
+
)
|
245
268
|
processing: ProcessingConfigModel
|
246
269
|
omit_raw_text: bool = Field(
|
247
270
|
default=False,
|
@@ -264,7 +287,7 @@ class VectorDBConfigModel(BaseModel):
|
|
264
287
|
@staticmethod
|
265
288
|
def remove_discriminator(schema: Dict[str, Any]) -> None:
|
266
289
|
"""pydantic adds "discriminator" to the schema for oneOfs, which is not treated right by the platform as we inline all references"""
|
267
|
-
dpath.
|
290
|
+
dpath.delete(schema, "properties/**/discriminator")
|
268
291
|
|
269
292
|
@classmethod
|
270
293
|
def schema(cls, by_alias: bool = True, ref_template: str = "") -> Dict[str, Any]:
|
@@ -7,14 +7,24 @@ import logging
|
|
7
7
|
from dataclasses import dataclass
|
8
8
|
from typing import Any, Dict, List, Mapping, Optional, Tuple
|
9
9
|
|
10
|
-
import dpath
|
11
|
-
from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel, SeparatorSplitterConfigModel, TextSplitterConfigModel
|
12
|
-
from airbyte_cdk.destinations.vector_db_based.utils import create_stream_identifier
|
13
|
-
from airbyte_cdk.models import AirbyteRecordMessage, ConfiguredAirbyteCatalog, ConfiguredAirbyteStream, DestinationSyncMode
|
14
|
-
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
15
|
-
from langchain.document_loaders.base import Document
|
10
|
+
import dpath
|
16
11
|
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
|
17
12
|
from langchain.utils import stringify_dict
|
13
|
+
from langchain_core.documents.base import Document
|
14
|
+
|
15
|
+
from airbyte_cdk.destinations.vector_db_based.config import (
|
16
|
+
ProcessingConfigModel,
|
17
|
+
SeparatorSplitterConfigModel,
|
18
|
+
TextSplitterConfigModel,
|
19
|
+
)
|
20
|
+
from airbyte_cdk.destinations.vector_db_based.utils import create_stream_identifier
|
21
|
+
from airbyte_cdk.models import (
|
22
|
+
AirbyteRecordMessage,
|
23
|
+
ConfiguredAirbyteCatalog,
|
24
|
+
ConfiguredAirbyteStream,
|
25
|
+
DestinationSyncMode,
|
26
|
+
)
|
27
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
18
28
|
|
19
29
|
METADATA_STREAM_FIELD = "_ab_stream"
|
20
30
|
METADATA_RECORD_ID_FIELD = "_ab_record_id"
|
@@ -30,7 +40,14 @@ class Chunk:
|
|
30
40
|
embedding: Optional[List[float]] = None
|
31
41
|
|
32
42
|
|
33
|
-
headers_to_split_on = [
|
43
|
+
headers_to_split_on = [
|
44
|
+
"(?:^|\n)# ",
|
45
|
+
"(?:^|\n)## ",
|
46
|
+
"(?:^|\n)### ",
|
47
|
+
"(?:^|\n)#### ",
|
48
|
+
"(?:^|\n)##### ",
|
49
|
+
"(?:^|\n)###### ",
|
50
|
+
]
|
34
51
|
|
35
52
|
|
36
53
|
class DocumentProcessor:
|
@@ -64,7 +81,10 @@ class DocumentProcessor:
|
|
64
81
|
return None
|
65
82
|
|
66
83
|
def _get_text_splitter(
|
67
|
-
self,
|
84
|
+
self,
|
85
|
+
chunk_size: int,
|
86
|
+
chunk_overlap: int,
|
87
|
+
splitter_config: Optional[TextSplitterConfigModel],
|
68
88
|
) -> RecursiveCharacterTextSplitter:
|
69
89
|
if splitter_config is None:
|
70
90
|
splitter_config = SeparatorSplitterConfigModel(mode="separator")
|
@@ -89,14 +109,20 @@ class DocumentProcessor:
|
|
89
109
|
return RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
90
110
|
chunk_size=chunk_size,
|
91
111
|
chunk_overlap=chunk_overlap,
|
92
|
-
separators=RecursiveCharacterTextSplitter.get_separators_for_language(
|
112
|
+
separators=RecursiveCharacterTextSplitter.get_separators_for_language(
|
113
|
+
Language(splitter_config.language)
|
114
|
+
),
|
93
115
|
disallowed_special=(),
|
94
116
|
)
|
95
117
|
|
96
118
|
def __init__(self, config: ProcessingConfigModel, catalog: ConfiguredAirbyteCatalog):
|
97
|
-
self.streams = {
|
119
|
+
self.streams = {
|
120
|
+
create_stream_identifier(stream.stream): stream for stream in catalog.streams
|
121
|
+
}
|
98
122
|
|
99
|
-
self.splitter = self._get_text_splitter(
|
123
|
+
self.splitter = self._get_text_splitter(
|
124
|
+
config.chunk_size, config.chunk_overlap, config.text_splitter
|
125
|
+
)
|
100
126
|
self.text_fields = config.text_fields
|
101
127
|
self.metadata_fields = config.metadata_fields
|
102
128
|
self.field_name_mappings = config.field_name_mappings
|
@@ -119,10 +145,18 @@ class DocumentProcessor:
|
|
119
145
|
failure_type=FailureType.config_error,
|
120
146
|
)
|
121
147
|
chunks = [
|
122
|
-
Chunk(
|
148
|
+
Chunk(
|
149
|
+
page_content=chunk_document.page_content,
|
150
|
+
metadata=chunk_document.metadata,
|
151
|
+
record=record,
|
152
|
+
)
|
123
153
|
for chunk_document in self._split_document(doc)
|
124
154
|
]
|
125
|
-
id_to_delete =
|
155
|
+
id_to_delete = (
|
156
|
+
doc.metadata[METADATA_RECORD_ID_FIELD]
|
157
|
+
if METADATA_RECORD_ID_FIELD in doc.metadata
|
158
|
+
else None
|
159
|
+
)
|
126
160
|
return chunks, id_to_delete
|
127
161
|
|
128
162
|
def _generate_document(self, record: AirbyteRecordMessage) -> Optional[Document]:
|
@@ -133,11 +167,13 @@ class DocumentProcessor:
|
|
133
167
|
metadata = self._extract_metadata(record)
|
134
168
|
return Document(page_content=text, metadata=metadata)
|
135
169
|
|
136
|
-
def _extract_relevant_fields(
|
170
|
+
def _extract_relevant_fields(
|
171
|
+
self, record: AirbyteRecordMessage, fields: Optional[List[str]]
|
172
|
+
) -> Dict[str, Any]:
|
137
173
|
relevant_fields = {}
|
138
174
|
if fields and len(fields) > 0:
|
139
175
|
for field in fields:
|
140
|
-
values = dpath.
|
176
|
+
values = dpath.values(record.data, field, separator=".")
|
141
177
|
if values and len(values) > 0:
|
142
178
|
relevant_fields[field] = values if len(values) > 1 else values[0]
|
143
179
|
else:
|
@@ -156,13 +192,16 @@ class DocumentProcessor:
|
|
156
192
|
stream_identifier = create_stream_identifier(record)
|
157
193
|
current_stream: ConfiguredAirbyteStream = self.streams[stream_identifier]
|
158
194
|
# if the sync mode is deduping, use the primary key to upsert existing records instead of appending new ones
|
159
|
-
if
|
195
|
+
if (
|
196
|
+
not current_stream.primary_key
|
197
|
+
or current_stream.destination_sync_mode != DestinationSyncMode.append_dedup
|
198
|
+
):
|
160
199
|
return None
|
161
200
|
|
162
201
|
primary_key = []
|
163
202
|
for key in current_stream.primary_key:
|
164
203
|
try:
|
165
|
-
primary_key.append(str(dpath.
|
204
|
+
primary_key.append(str(dpath.get(record.data, key)))
|
166
205
|
except KeyError:
|
167
206
|
primary_key.append("__not_found__")
|
168
207
|
stringified_primary_key = "_".join(primary_key)
|
@@ -7,6 +7,11 @@ from abc import ABC, abstractmethod
|
|
7
7
|
from dataclasses import dataclass
|
8
8
|
from typing import List, Optional, Union, cast
|
9
9
|
|
10
|
+
from langchain.embeddings.cohere import CohereEmbeddings
|
11
|
+
from langchain.embeddings.fake import FakeEmbeddings
|
12
|
+
from langchain.embeddings.localai import LocalAIEmbeddings
|
13
|
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
14
|
+
|
10
15
|
from airbyte_cdk.destinations.vector_db_based.config import (
|
11
16
|
AzureOpenAIEmbeddingConfigModel,
|
12
17
|
CohereEmbeddingConfigModel,
|
@@ -19,10 +24,6 @@ from airbyte_cdk.destinations.vector_db_based.config import (
|
|
19
24
|
from airbyte_cdk.destinations.vector_db_based.utils import create_chunks, format_exception
|
20
25
|
from airbyte_cdk.models import AirbyteRecordMessage
|
21
26
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException, FailureType
|
22
|
-
from langchain.embeddings.cohere import CohereEmbeddings
|
23
|
-
from langchain.embeddings.fake import FakeEmbeddings
|
24
|
-
from langchain.embeddings.localai import LocalAIEmbeddings
|
25
|
-
from langchain.embeddings.openai import OpenAIEmbeddings
|
26
27
|
|
27
28
|
|
28
29
|
@dataclass
|
@@ -92,7 +93,9 @@ class BaseOpenAIEmbedder(Embedder):
|
|
92
93
|
batches = create_chunks(documents, batch_size=embedding_batch_size)
|
93
94
|
embeddings: List[Optional[List[float]]] = []
|
94
95
|
for batch in batches:
|
95
|
-
embeddings.extend(
|
96
|
+
embeddings.extend(
|
97
|
+
self.embeddings.embed_documents([chunk.page_content for chunk in batch])
|
98
|
+
)
|
96
99
|
return embeddings
|
97
100
|
|
98
101
|
@property
|
@@ -103,13 +106,30 @@ class BaseOpenAIEmbedder(Embedder):
|
|
103
106
|
|
104
107
|
class OpenAIEmbedder(BaseOpenAIEmbedder):
|
105
108
|
def __init__(self, config: OpenAIEmbeddingConfigModel, chunk_size: int):
|
106
|
-
super().__init__(
|
109
|
+
super().__init__(
|
110
|
+
OpenAIEmbeddings( # type: ignore [call-arg]
|
111
|
+
openai_api_key=config.openai_key, max_retries=15, disallowed_special=()
|
112
|
+
),
|
113
|
+
chunk_size,
|
114
|
+
) # type: ignore
|
107
115
|
|
108
116
|
|
109
117
|
class AzureOpenAIEmbedder(BaseOpenAIEmbedder):
|
110
118
|
def __init__(self, config: AzureOpenAIEmbeddingConfigModel, chunk_size: int):
|
111
119
|
# Azure OpenAI API has — as of 20230927 — a limit of 16 documents per request
|
112
|
-
super().__init__(
|
120
|
+
super().__init__(
|
121
|
+
OpenAIEmbeddings( # type: ignore [call-arg]
|
122
|
+
openai_api_key=config.openai_key,
|
123
|
+
chunk_size=16,
|
124
|
+
max_retries=15,
|
125
|
+
openai_api_type="azure",
|
126
|
+
openai_api_version="2023-05-15",
|
127
|
+
openai_api_base=config.api_base,
|
128
|
+
deployment=config.deployment,
|
129
|
+
disallowed_special=(),
|
130
|
+
),
|
131
|
+
chunk_size,
|
132
|
+
) # type: ignore
|
113
133
|
|
114
134
|
|
115
135
|
COHERE_VECTOR_SIZE = 1024
|
@@ -119,7 +139,9 @@ class CohereEmbedder(Embedder):
|
|
119
139
|
def __init__(self, config: CohereEmbeddingConfigModel):
|
120
140
|
super().__init__()
|
121
141
|
# Client is set internally
|
122
|
-
self.embeddings = CohereEmbeddings(
|
142
|
+
self.embeddings = CohereEmbeddings(
|
143
|
+
cohere_api_key=config.cohere_key, model="embed-english-light-v2.0"
|
144
|
+
) # type: ignore
|
123
145
|
|
124
146
|
def check(self) -> Optional[str]:
|
125
147
|
try:
|
@@ -129,7 +151,10 @@ class CohereEmbedder(Embedder):
|
|
129
151
|
return None
|
130
152
|
|
131
153
|
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
132
|
-
return cast(
|
154
|
+
return cast(
|
155
|
+
List[Optional[List[float]]],
|
156
|
+
self.embeddings.embed_documents([document.page_content for document in documents]),
|
157
|
+
)
|
133
158
|
|
134
159
|
@property
|
135
160
|
def embedding_dimensions(self) -> int:
|
@@ -150,7 +175,10 @@ class FakeEmbedder(Embedder):
|
|
150
175
|
return None
|
151
176
|
|
152
177
|
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
153
|
-
return cast(
|
178
|
+
return cast(
|
179
|
+
List[Optional[List[float]]],
|
180
|
+
self.embeddings.embed_documents([document.page_content for document in documents]),
|
181
|
+
)
|
154
182
|
|
155
183
|
@property
|
156
184
|
def embedding_dimensions(self) -> int:
|
@@ -167,11 +195,20 @@ class OpenAICompatibleEmbedder(Embedder):
|
|
167
195
|
self.config = config
|
168
196
|
# Client is set internally
|
169
197
|
# Always set an API key even if there is none defined in the config because the validator will fail otherwise. Embedding APIs that don't require an API key don't fail if one is provided, so this is not breaking usage.
|
170
|
-
self.embeddings = LocalAIEmbeddings(
|
198
|
+
self.embeddings = LocalAIEmbeddings(
|
199
|
+
model=config.model_name,
|
200
|
+
openai_api_key=config.api_key or "dummy-api-key",
|
201
|
+
openai_api_base=config.base_url,
|
202
|
+
max_retries=15,
|
203
|
+
disallowed_special=(),
|
204
|
+
) # type: ignore
|
171
205
|
|
172
206
|
def check(self) -> Optional[str]:
|
173
207
|
deployment_mode = os.environ.get("DEPLOYMENT_MODE", "")
|
174
|
-
if
|
208
|
+
if (
|
209
|
+
deployment_mode.casefold() == CLOUD_DEPLOYMENT_MODE
|
210
|
+
and not self.config.base_url.startswith("https://")
|
211
|
+
):
|
175
212
|
return "Base URL must start with https://"
|
176
213
|
|
177
214
|
try:
|
@@ -181,7 +218,10 @@ class OpenAICompatibleEmbedder(Embedder):
|
|
181
218
|
return None
|
182
219
|
|
183
220
|
def embed_documents(self, documents: List[Document]) -> List[Optional[List[float]]]:
|
184
|
-
return cast(
|
221
|
+
return cast(
|
222
|
+
List[Optional[List[float]]],
|
223
|
+
self.embeddings.embed_documents([document.page_content for document in documents]),
|
224
|
+
)
|
185
225
|
|
186
226
|
@property
|
187
227
|
def embedding_dimensions(self) -> int:
|
@@ -254,8 +294,10 @@ def create_from_config(
|
|
254
294
|
],
|
255
295
|
processing_config: ProcessingConfigModel,
|
256
296
|
) -> Embedder:
|
257
|
-
|
258
297
|
if embedding_config.mode == "azure_openai" or embedding_config.mode == "openai":
|
259
|
-
return cast(
|
298
|
+
return cast(
|
299
|
+
Embedder,
|
300
|
+
embedder_map[embedding_config.mode](embedding_config, processing_config.chunk_size),
|
301
|
+
)
|
260
302
|
else:
|
261
303
|
return cast(Embedder, embedder_map[embedding_config.mode](embedding_config))
|
@@ -26,12 +26,19 @@ class BaseIntegrationTest(unittest.TestCase):
|
|
26
26
|
It provides helper methods to create Airbyte catalogs, records and state messages.
|
27
27
|
"""
|
28
28
|
|
29
|
-
def _get_configured_catalog(
|
30
|
-
|
29
|
+
def _get_configured_catalog(
|
30
|
+
self, destination_mode: DestinationSyncMode
|
31
|
+
) -> ConfiguredAirbyteCatalog:
|
32
|
+
stream_schema = {
|
33
|
+
"type": "object",
|
34
|
+
"properties": {"str_col": {"type": "str"}, "int_col": {"type": "integer"}},
|
35
|
+
}
|
31
36
|
|
32
37
|
overwrite_stream = ConfiguredAirbyteStream(
|
33
38
|
stream=AirbyteStream(
|
34
|
-
name="mystream",
|
39
|
+
name="mystream",
|
40
|
+
json_schema=stream_schema,
|
41
|
+
supported_sync_modes=[SyncMode.incremental, SyncMode.full_refresh],
|
35
42
|
),
|
36
43
|
primary_key=[["int_col"]],
|
37
44
|
sync_mode=SyncMode.incremental,
|
@@ -45,7 +52,10 @@ class BaseIntegrationTest(unittest.TestCase):
|
|
45
52
|
|
46
53
|
def _record(self, stream: str, str_value: str, int_value: int) -> AirbyteMessage:
|
47
54
|
return AirbyteMessage(
|
48
|
-
type=Type.RECORD,
|
55
|
+
type=Type.RECORD,
|
56
|
+
record=AirbyteRecordMessage(
|
57
|
+
stream=stream, data={"str_col": str_value, "int_col": int_value}, emitted_at=0
|
58
|
+
),
|
49
59
|
)
|
50
60
|
|
51
61
|
def setUp(self) -> None:
|
@@ -10,7 +10,11 @@ from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream
|
|
10
10
|
|
11
11
|
|
12
12
|
def format_exception(exception: Exception) -> str:
|
13
|
-
return
|
13
|
+
return (
|
14
|
+
str(exception)
|
15
|
+
+ "\n"
|
16
|
+
+ "".join(traceback.TracebackException.from_exception(exception).format())
|
17
|
+
)
|
14
18
|
|
15
19
|
|
16
20
|
def create_chunks(iterable: Iterable[Any], batch_size: int) -> Iterator[Tuple[Any, ...]]:
|
@@ -26,4 +30,6 @@ def create_stream_identifier(stream: Union[AirbyteStream, AirbyteRecordMessage])
|
|
26
30
|
if isinstance(stream, AirbyteStream):
|
27
31
|
return str(stream.name if stream.namespace is None else f"{stream.namespace}_{stream.name}")
|
28
32
|
else:
|
29
|
-
return str(
|
33
|
+
return str(
|
34
|
+
stream.stream if stream.namespace is None else f"{stream.namespace}_{stream.stream}"
|
35
|
+
)
|
@@ -27,7 +27,12 @@ class Writer:
|
|
27
27
|
"""
|
28
28
|
|
29
29
|
def __init__(
|
30
|
-
self,
|
30
|
+
self,
|
31
|
+
processing_config: ProcessingConfigModel,
|
32
|
+
indexer: Indexer,
|
33
|
+
embedder: Embedder,
|
34
|
+
batch_size: int,
|
35
|
+
omit_raw_text: bool,
|
31
36
|
) -> None:
|
32
37
|
self.processing_config = processing_config
|
33
38
|
self.indexer = indexer
|
@@ -54,7 +59,9 @@ class Writer:
|
|
54
59
|
self.indexer.delete(ids, namespace, stream)
|
55
60
|
|
56
61
|
for (namespace, stream), chunks in self.chunks.items():
|
57
|
-
embeddings = self.embedder.embed_documents(
|
62
|
+
embeddings = self.embedder.embed_documents(
|
63
|
+
[self._convert_to_document(chunk) for chunk in chunks]
|
64
|
+
)
|
58
65
|
for i, document in enumerate(chunks):
|
59
66
|
document.embedding = embeddings[i]
|
60
67
|
if self.omit_raw_text:
|
@@ -63,7 +70,9 @@ class Writer:
|
|
63
70
|
|
64
71
|
self._init_batch()
|
65
72
|
|
66
|
-
def write(
|
73
|
+
def write(
|
74
|
+
self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]
|
75
|
+
) -> Iterable[AirbyteMessage]:
|
67
76
|
self.processor = DocumentProcessor(self.processing_config, configured_catalog)
|
68
77
|
self.indexer.pre_sync(configured_catalog)
|
69
78
|
for message in input_messages:
|
@@ -74,9 +83,19 @@ class Writer:
|
|
74
83
|
yield message
|
75
84
|
elif message.type == Type.RECORD:
|
76
85
|
record_chunks, record_id_to_delete = self.processor.process(message.record)
|
77
|
-
self.chunks[
|
86
|
+
self.chunks[
|
87
|
+
( # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
|
88
|
+
message.record.namespace, # type: ignore [union-attr] # record not None
|
89
|
+
message.record.stream, # type: ignore [union-attr] # record not None
|
90
|
+
)
|
91
|
+
].extend(record_chunks)
|
78
92
|
if record_id_to_delete is not None:
|
79
|
-
self.ids_to_delete[
|
93
|
+
self.ids_to_delete[
|
94
|
+
( # type: ignore [index] # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
|
95
|
+
message.record.namespace, # type: ignore [union-attr] # record not None
|
96
|
+
message.record.stream, # type: ignore [union-attr] # record not None
|
97
|
+
)
|
98
|
+
].append(record_id_to_delete)
|
80
99
|
self.number_of_chunks += len(record_chunks)
|
81
100
|
if self.number_of_chunks >= self.batch_size:
|
82
101
|
self._process_batch()
|