airbyte-cdk 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +358 -0
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +104 -0
- airbyte_cdk/connector.py +123 -0
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/__init__.py +3 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
- airbyte_cdk/connector_builder/main.py +107 -0
- airbyte_cdk/connector_builder/models.py +73 -0
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +83 -0
- airbyte_cdk/destinations/__init__.py +8 -0
- airbyte_cdk/destinations/destination.py +154 -0
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
- airbyte_cdk/destinations/vector_db_based/config.py +298 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
- airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
- airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
- airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
- airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
- airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
- airbyte_cdk/entrypoint.py +414 -0
- airbyte_cdk/exception_handler.py +56 -0
- airbyte_cdk/logger.py +109 -0
- airbyte_cdk/models/__init__.py +72 -0
- airbyte_cdk/models/airbyte_protocol.py +88 -0
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/well_known_types.py +5 -0
- airbyte_cdk/py.typed +0 -0
- airbyte_cdk/sources/__init__.py +26 -0
- airbyte_cdk/sources/abstract_source.py +326 -0
- airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
- airbyte_cdk/sources/config.py +27 -0
- airbyte_cdk/sources/connector_state_manager.py +161 -0
- airbyte_cdk/sources/declarative/__init__.py +3 -0
- airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
- airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
- airbyte_cdk/sources/declarative/auth/token.py +267 -0
- airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
- airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
- airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
- airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
- airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
- airbyte_cdk/sources/declarative/declarative_source.py +36 -0
- airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
- airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
- airbyte_cdk/sources/declarative/exceptions.py +9 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
- airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
- airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
- airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
- airbyte_cdk/sources/declarative/requesters/README.md +56 -0
- airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
- airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
- airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
- airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
- airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
- airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
- airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
- airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
- airbyte_cdk/sources/declarative/spec/spec.py +48 -0
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
- airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
- airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
- airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
- airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
- airbyte_cdk/sources/declarative/types.py +25 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
- airbyte_cdk/sources/file_based/config/__init__.py +0 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
- airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
- airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
- airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
- airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
- airbyte_cdk/sources/file_based/exceptions.py +159 -0
- airbyte_cdk/sources/file_based/file_based_source.py +466 -0
- airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
- airbyte_cdk/sources/file_based/file_record_data.py +22 -0
- airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
- airbyte_cdk/sources/file_based/remote_file.py +18 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
- airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
- airbyte_cdk/sources/file_based/types.py +10 -0
- airbyte_cdk/sources/http_config.py +10 -0
- airbyte_cdk/sources/http_logger.py +55 -0
- airbyte_cdk/sources/message/__init__.py +19 -0
- airbyte_cdk/sources/message/repository.py +137 -0
- airbyte_cdk/sources/source.py +95 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/__init__.py +8 -0
- airbyte_cdk/sources/streams/availability_strategy.py +84 -0
- airbyte_cdk/sources/streams/call_rate.py +704 -0
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
- airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
- airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
- airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/core.py +703 -0
- airbyte_cdk/sources/streams/http/__init__.py +10 -0
- airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +61 -0
- airbyte_cdk/sources/streams/http/http.py +673 -0
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/sources/streams/utils/__init__.py +3 -0
- airbyte_cdk/sources/types.py +169 -0
- airbyte_cdk/sources/utils/__init__.py +7 -0
- airbyte_cdk/sources/utils/casing.py +12 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +53 -0
- airbyte_cdk/sources/utils/schema_helpers.py +230 -0
- airbyte_cdk/sources/utils/slice_logger.py +57 -0
- airbyte_cdk/sources/utils/transform.py +277 -0
- airbyte_cdk/sources/utils/types.py +7 -0
- airbyte_cdk/sql/__init__.py +0 -0
- airbyte_cdk/sql/_util/__init__.py +0 -0
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/__init__.py +7 -0
- airbyte_cdk/test/catalog_builder.py +81 -0
- airbyte_cdk/test/entrypoint_wrapper.py +250 -0
- airbyte_cdk/test/mock_http/__init__.py +6 -0
- airbyte_cdk/test/mock_http/matcher.py +41 -0
- airbyte_cdk/test/mock_http/mocker.py +185 -0
- airbyte_cdk/test/mock_http/request.py +103 -0
- airbyte_cdk/test/mock_http/response.py +28 -0
- airbyte_cdk/test/mock_http/response_builder.py +237 -0
- airbyte_cdk/test/state_builder.py +33 -0
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +10 -0
- airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
- airbyte_cdk/utils/analytics_message.py +25 -0
- airbyte_cdk/utils/constants.py +5 -0
- airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/event_timing.py +85 -0
- airbyte_cdk/utils/is_cloud_environment.py +18 -0
- airbyte_cdk/utils/mapping_helpers.py +162 -0
- airbyte_cdk/utils/message_utils.py +26 -0
- airbyte_cdk/utils/oneof_option_config.py +33 -0
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +270 -0
- airbyte_cdk/utils/slice_hasher.py +37 -0
- airbyte_cdk/utils/spec_schema_transformations.py +26 -0
- airbyte_cdk/utils/stream_status_utils.py +43 -0
- airbyte_cdk/utils/traced_exception.py +145 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
- airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
- airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
- airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
- airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Any, List, Union
|
|
7
|
+
|
|
8
|
+
from airbyte_cdk.models import AirbyteMessage, FailureType
|
|
9
|
+
from airbyte_cdk.utils import AirbyteTracedException
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FileBasedSourceError(Enum):
    """User-facing error messages used by the file-based source exceptions.

    Each member's value is the human-readable message text.
    """

    # --- discovery / listing ---
    EMPTY_STREAM = (
        "No files were identified in the stream. "
        "This may be because there are no files in the specified container, "
        "or because your glob patterns did not match any files. "
        "Please verify that your source contains files last modified after the start_date "
        "and that your glob patterns are not overly strict."
    )
    GLOB_PARSE_ERROR = (
        "Error parsing glob pattern. "
        "Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
    )

    # --- reading / casting / parsing ---
    ENCODING_ERROR = (
        "File encoding error. The configured encoding must match file encoding."
    )
    ERROR_CASTING_VALUE = "Could not cast the value to the expected type."
    ERROR_CASTING_VALUE_UNRECOGNIZED_TYPE = (
        "Could not cast the value to the expected type because the type is not recognized. "
        "Valid types are null, array, boolean, integer, number, object, and string."
    )
    ERROR_DECODING_VALUE = (
        "Expected a JSON-decodeable value but could not decode record."
    )
    ERROR_LISTING_FILES = (
        "Error listing files. "
        "Please check the credentials provided in the config and verify that they provide permission to list files."
    )
    ERROR_READING_FILE = (
        "Error opening file. "
        "Please check the credentials provided in the config and verify that they provide permission to read files."
    )
    ERROR_PARSING_RECORD = (
        "Error parsing record. "
        "This could be due to a mismatch between the config's file type and the actual file type, "
        "or because the file or record is not parseable."
    )
    ERROR_PARSING_USER_PROVIDED_SCHEMA = "The provided schema could not be transformed into valid JSON Schema."
    ERROR_VALIDATING_RECORD = (
        "One or more records do not pass the schema validation policy. "
        "Please modify your input schema, or select a more lenient validation policy."
    )
    ERROR_PARSING_RECORD_MISMATCHED_COLUMNS = (
        "A header field has resolved to `None`. "
        "This indicates that the CSV has more rows than the number of header fields. "
        "If you input your schema or headers, please verify that the number of columns "
        "corresponds to the number of columns in your CSV's rows."
    )
    ERROR_PARSING_RECORD_MISMATCHED_ROWS = (
        "A row's value has resolved to `None`. "
        "This indicates that the CSV has more columns in the header field than the number of columns in the row(s). "
        "If you input your schema or headers, please verify that the number of columns "
        "corresponds to the number of columns in your CSV's rows."
    )
    STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = (
        "Stopping sync in accordance with the configured validation policy. "
        "Records in file did not conform to the schema."
    )

    # --- schema inference / configuration ---
    NULL_VALUE_IN_SCHEMA = (
        "Error during schema inference: no type was detected for key."
    )
    UNRECOGNIZED_TYPE = "Error during schema inference: unrecognized type."
    SCHEMA_INFERENCE_ERROR = (
        "Error inferring schema from files. Are the files valid?"
    )
    INVALID_SCHEMA_ERROR = (
        "No fields were identified for this schema. "
        "This may happen if the stream is empty. "
        "Please check your configuration to verify that there are files that match the stream's glob patterns."
    )
    CONFIG_VALIDATION_ERROR = "Error creating stream config object."
    MISSING_SCHEMA = (
        "Expected `json_schema` in the configured catalog but it is missing."
    )
    UNDEFINED_PARSER = "No parser is defined for this file type."
    UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class FileBasedErrorsCollector:
    """
    Accumulates logged error messages so they can be emitted together,
    followed by a single summarizing exception.
    """

    # Declared here for type checkers only; the list is created per instance
    # because a class-level `[]` would be one list shared by every collector.
    errors: List[AirbyteMessage]

    def __init__(self) -> None:
        self.errors = []

    def yield_and_raise_collected(self) -> Any:
        """
        Yield every collected message, then raise one AirbyteTracedException.

        This is a generator: nothing happens until it is iterated. When no
        errors were collected it yields nothing and does not raise.

        :raises AirbyteTracedException: after the last collected message has
            been yielded, with failure type ``config_error``.
        """
        if self.errors:
            # emit collected logged messages
            yield from self.errors
            # clean the collector (only reached once all messages were consumed)
            self.errors.clear()
            # raising the single exception
            raise AirbyteTracedException(
                internal_message="Please check the logged errors for more information.",
                message="Some errors occured while reading from the source.",
                failure_type=FailureType.config_error,
            )

    def collect(self, logged_error: AirbyteMessage) -> None:
        """Append one logged error message to this collector."""
        self.errors.append(logged_error)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class BaseFileBasedSourceError(Exception):
    """Base exception for the file-based source errors defined in this module."""

    def __init__(self, error: Union[FileBasedSourceError, str], **kwargs: Any) -> None:
        """
        Build the exception message from an error and optional context.

        :param error: a FileBasedSourceError member (its value is used as the
            message text) or an already-formatted string.
        :param kwargs: extra context, rendered as space-separated ``key=value``
            pairs on a second line of the message.
        """
        headline = error.value if isinstance(error, FileBasedSourceError) else error
        context = " ".join(f"{key}={value}" for key, value in kwargs.items())
        super().__init__(f"{headline} Contact Support if you need assistance.\n{context}")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class ConfigValidationError(BaseFileBasedSourceError):
    """Marker subclass of BaseFileBasedSourceError; adds no behavior of its own.

    NOTE(review): presumably raised when a config fails validation -- confirm at call sites.
    """
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class InvalidSchemaError(BaseFileBasedSourceError):
    """Marker subclass of BaseFileBasedSourceError; adds no behavior of its own.

    NOTE(review): presumably raised for an unusable schema -- confirm at call sites.
    """
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class MissingSchemaError(BaseFileBasedSourceError):
    """Marker subclass of BaseFileBasedSourceError; adds no behavior of its own.

    NOTE(review): presumably raised when an expected schema is absent -- confirm at call sites.
    """
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class NoFilesMatchingError(BaseFileBasedSourceError):
    """Marker subclass of BaseFileBasedSourceError; adds no behavior of its own.

    NOTE(review): presumably raised when no files match a stream -- confirm at call sites.
    """
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class RecordParseError(BaseFileBasedSourceError):
    """Marker subclass of BaseFileBasedSourceError; adds no behavior of its own.

    NOTE(review): presumably raised when a record cannot be parsed -- confirm at call sites.
    """
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class SchemaInferenceError(BaseFileBasedSourceError):
    """Marker subclass of BaseFileBasedSourceError; adds no behavior of its own.

    NOTE(review): presumably raised when schema inference fails -- confirm at call sites.
    """
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class CheckAvailabilityError(BaseFileBasedSourceError):
    """Marker subclass of BaseFileBasedSourceError; adds no behavior of its own.

    NOTE(review): presumably raised by availability checks -- confirm at call sites.
    """
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class UndefinedParserError(BaseFileBasedSourceError):
    """Marker subclass of BaseFileBasedSourceError; adds no behavior of its own.

    NOTE(review): presumably raised when no parser exists for a file type -- confirm at call sites.
    """
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class StopSyncPerValidationPolicy(BaseFileBasedSourceError):
    """Marker subclass of BaseFileBasedSourceError; adds no behavior of its own.

    NOTE(review): presumably raised to stop a sync under the validation policy -- confirm at call sites.
    """
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class ErrorListingFiles(BaseFileBasedSourceError):
    """Marker subclass of BaseFileBasedSourceError; adds no behavior of its own.

    NOTE(review): presumably raised when listing files fails -- confirm at call sites.
    """
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class DuplicatedFilesError(BaseFileBasedSourceError):
    """Error reporting that several files in one stream share the same file name."""

    def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
        """
        :param duplicated_files_names: list of mappings from a duplicated file
            name to the paths at which it was found.
        :param kwargs: context forwarded to the base exception; must contain
            the key ``stream``.
        :raises KeyError: if ``stream`` is not passed as a keyword argument.
        """
        self._duplicated_files_names = duplicated_files_names
        self._stream_name: str = kwargs["stream"]
        # Both attributes must be set before formatting, which reads them.
        super().__init__(self._format_duplicate_files_error_message(), **kwargs)

    def _format_duplicate_files_error_message(self) -> str:
        """Return the full message: a header followed by one section per duplicated name."""
        duplicated_files_messages = [
            f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
            # One bullet per path; joining a single f-string of the whole list
            # would only print the list's repr on one line.
            + "".join(f"\n - {file_path}" for file_path in file_paths)
            for duplicated_file in self._duplicated_files_names
            for duplicated_file_name, file_paths in duplicated_file.items()
        ]

        error_message = (
            f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
            "Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
            "Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
            + "\n".join(duplicated_files_messages)
        )

        return error_message

    def __repr__(self) -> str:
        """Return a string representation of the exception.

        Only instance attributes whose names do not start with an underscore
        are included; the attributes set in ``__init__`` are all
        underscore-prefixed and therefore omitted.
        """
        class_name = self.__class__.__name__
        properties_str = ", ".join(
            f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
        )
        return f"{class_name}({properties_str})"
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class CustomFileBasedException(AirbyteTracedException):
    """
    Exception type dedicated to file-based connectors.

    It exists so that connectors can raise errors carrying their own custom
    messages, bypassing the default error handling of the file-based CDK.
    It adds no behavior on top of ``AirbyteTracedException``.
    """
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class FileSizeLimitError(CustomFileBasedException):
    """Marker subclass of ``CustomFileBasedException`` that adds no behavior.

    NOTE(review): the name suggests it reports a file exceeding a size limit;
    the raise sites are not visible here.
    """
|
|
@@ -0,0 +1,466 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import traceback
|
|
7
|
+
from abc import ABC
|
|
8
|
+
from collections import Counter
|
|
9
|
+
from typing import Any, Iterator, List, Mapping, Optional, Tuple, Type, Union
|
|
10
|
+
|
|
11
|
+
from pydantic.v1.error_wrappers import ValidationError
|
|
12
|
+
|
|
13
|
+
from airbyte_cdk.logger import AirbyteLogFormatter, init_logger
|
|
14
|
+
from airbyte_cdk.models import (
|
|
15
|
+
AirbyteMessage,
|
|
16
|
+
AirbyteStateMessage,
|
|
17
|
+
AirbyteStream,
|
|
18
|
+
ConfiguredAirbyteCatalog,
|
|
19
|
+
ConnectorSpecification,
|
|
20
|
+
FailureType,
|
|
21
|
+
Level,
|
|
22
|
+
SyncMode,
|
|
23
|
+
)
|
|
24
|
+
from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
|
|
25
|
+
from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
|
|
26
|
+
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
|
27
|
+
from airbyte_cdk.sources.file_based.availability_strategy import (
|
|
28
|
+
AbstractFileBasedAvailabilityStrategy,
|
|
29
|
+
DefaultFileBasedAvailabilityStrategy,
|
|
30
|
+
)
|
|
31
|
+
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
|
|
32
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
|
|
33
|
+
FileBasedStreamConfig,
|
|
34
|
+
ValidationPolicy,
|
|
35
|
+
)
|
|
36
|
+
from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
|
|
37
|
+
include_identities_stream,
|
|
38
|
+
preserve_directory_structure,
|
|
39
|
+
use_file_transfer,
|
|
40
|
+
use_permissions_transfer,
|
|
41
|
+
)
|
|
42
|
+
from airbyte_cdk.sources.file_based.discovery_policy import (
|
|
43
|
+
AbstractDiscoveryPolicy,
|
|
44
|
+
DefaultDiscoveryPolicy,
|
|
45
|
+
)
|
|
46
|
+
from airbyte_cdk.sources.file_based.exceptions import (
|
|
47
|
+
ConfigValidationError,
|
|
48
|
+
FileBasedErrorsCollector,
|
|
49
|
+
FileBasedSourceError,
|
|
50
|
+
)
|
|
51
|
+
from airbyte_cdk.sources.file_based.file_based_stream_permissions_reader import (
|
|
52
|
+
AbstractFileBasedStreamPermissionsReader,
|
|
53
|
+
)
|
|
54
|
+
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
|
55
|
+
from airbyte_cdk.sources.file_based.file_types import default_parsers
|
|
56
|
+
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
|
57
|
+
from airbyte_cdk.sources.file_based.schema_validation_policies import (
|
|
58
|
+
DEFAULT_SCHEMA_VALIDATION_POLICIES,
|
|
59
|
+
AbstractSchemaValidationPolicy,
|
|
60
|
+
)
|
|
61
|
+
from airbyte_cdk.sources.file_based.stream import (
|
|
62
|
+
AbstractFileBasedStream,
|
|
63
|
+
DefaultFileBasedStream,
|
|
64
|
+
FileIdentitiesStream,
|
|
65
|
+
PermissionsFileBasedStream,
|
|
66
|
+
)
|
|
67
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
|
|
68
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
|
|
69
|
+
AbstractConcurrentFileBasedCursor,
|
|
70
|
+
FileBasedConcurrentCursor,
|
|
71
|
+
FileBasedFinalStateCursor,
|
|
72
|
+
)
|
|
73
|
+
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
|
74
|
+
from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository
|
|
75
|
+
from airbyte_cdk.sources.streams import Stream
|
|
76
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
|
|
77
|
+
from airbyte_cdk.utils.analytics_message import create_analytics_message
|
|
78
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
|
79
|
+
|
|
80
|
+
# Concurrency settings for file-based sources.
DEFAULT_CONCURRENCY = 100
MAX_CONCURRENCY = 100
# Half of the maximum concurrency (integer division).
INITIAL_N_PARTITIONS = MAX_CONCURRENCY // 2
# Name constant for the identities stream.
IDENTITIES_STREAM = "identities"
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
87
|
+
# We make each source override the concurrency level to give control over when they are upgraded.
|
|
88
|
+
_concurrency_level = None
|
|
89
|
+
|
|
90
|
+
def __init__(
    self,
    stream_reader: AbstractFileBasedStreamReader,
    spec_class: Type[AbstractFileBasedSpec],
    catalog: Optional[ConfiguredAirbyteCatalog],
    config: Optional[Mapping[str, Any]],
    state: Optional[List[AirbyteStateMessage]],
    availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None,
    discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
    parsers: Mapping[Type[Any], FileTypeParser] = default_parsers,
    validation_policies: Mapping[
        ValidationPolicy, AbstractSchemaValidationPolicy
    ] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
    cursor_cls: Type[
        Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]
    ] = FileBasedConcurrentCursor,
    stream_permissions_reader: Optional[AbstractFileBasedStreamPermissionsReader] = None,
):
    """
    Store the collaborators of the source and create its concurrent source.

    :param stream_reader: reader kept on the source; also handed to the default
        availability strategy when none is supplied.
    :param spec_class: class used to parse raw configs (see ``_get_parsed_config``).
    :param catalog: optional configured catalog; when truthy, its streams'
        JSON schemas are indexed by stream name in ``stream_schemas``.
    :param config: raw configuration, stored as-is.
    :param state: incoming state messages, stored as-is.
    :param availability_strategy: falls back to ``DefaultFileBasedAvailabilityStrategy``
        when falsy.
    :param discovery_policy: stored as-is.
    :param parsers: stored as-is.
    :param validation_policies: stored as-is.
    :param cursor_cls: cursor class instantiated per stream in ``streams``.
    :param stream_permissions_reader: optional reader required by the permissions
        and identities streams.
    """
    self.stream_reader = stream_reader
    self.stream_permissions_reader = stream_permissions_reader
    self.spec_class = spec_class
    self.config = config
    self.catalog = catalog
    self.state = state
    self.availability_strategy = (
        availability_strategy
        if availability_strategy
        else DefaultFileBasedAvailabilityStrategy(stream_reader)
    )
    self.discovery_policy = discovery_policy
    self.parsers = parsers
    self.validation_policies = validation_policies

    # Index the catalog's JSON schemas by stream name (empty without a catalog).
    schemas_by_stream = {}
    if catalog:
        for configured_stream in catalog.streams:
            schemas_by_stream[configured_stream.stream.name] = (
                configured_stream.stream.json_schema
            )
    self.stream_schemas = schemas_by_stream

    self.cursor_cls = cursor_cls
    self.logger = init_logger(f"airbyte.{self.name}")
    self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
    # Must be set before `self.message_repository` is read below: the property
    # lazily fills this attribute and relies on `self.logger`.
    self._message_repository: Optional[MessageRepository] = None
    concurrent_source = ConcurrentSource.create(
        MAX_CONCURRENCY,
        INITIAL_N_PARTITIONS,
        self.logger,
        self._slice_logger,
        self.message_repository,
    )
    self._state = None
    super().__init__(concurrent_source)
|
|
136
|
+
|
|
137
|
+
@property
def message_repository(self) -> MessageRepository:
    """Return the source's message repository, creating it on first access.

    The repository is an ``InMemoryMessageRepository`` whose level is looked
    up from the logger's current level via ``AirbyteLogFormatter.level_mapping``.
    """
    repository = self._message_repository
    if repository is None:
        log_level = Level(AirbyteLogFormatter.level_mapping[self.logger.level])
        repository = InMemoryMessageRepository(log_level)
        self._message_repository = repository
    return repository
|
|
144
|
+
|
|
145
|
+
def check_connection(
    self, logger: logging.Logger, config: Mapping[str, Any]
) -> Tuple[bool, Optional[Any]]:
    """
    Check that the source can be accessed using the user-provided configuration.

    For each stream, verify that we can list and read files. Identities streams
    are checked by loading their first identity group instead.

    :param logger: logger forwarded to the availability strategy.
    :param config: raw user-provided configuration.
    :return: ``(True, None)`` if the connection check is successful, or
        ``(False, <message>)`` when the configuration yields no streams.
    :raises AirbyteTracedException: with ``FailureType.config_error`` when the
        streams cannot be built from the config, or when at least one stream
        check recorded an error.
    :raises ValueError: if a non-identities stream is not an ``AbstractFileBasedStream``.
    """
    try:
        streams = self.streams(config)
    except Exception as config_exception:
        raise AirbyteTracedException(
            internal_message="Please check the logged errors for more information.",
            message=FileBasedSourceError.CONFIG_VALIDATION_ERROR.value,
            exception=AirbyteTracedException(exception=config_exception),
            failure_type=FailureType.config_error,
        )
    if len(streams) == 0:
        return (
            False,
            f"No streams are available for source {self.name}. This is probably an issue with the connector. Please verify that your "
            f"configuration provides permissions to list and read files from the source. Contact support if you are unable to "
            f"resolve this issue.",
        )

    errors = []
    tracebacks = []
    for stream in streams:
        if isinstance(stream, FileIdentitiesStream):
            # Default to None so that an empty result is reported as an error
            # below instead of escaping as a bare StopIteration.
            identity = next(iter(stream.load_identity_groups()), None)
            if not identity:
                errors.append(
                    "Unable to get identities for current configuration, please check your credentials"
                )
            continue
        if not isinstance(stream, AbstractFileBasedStream):
            raise ValueError(f"Stream {stream} is not a file-based stream.")
        try:
            parsed_config = self._get_parsed_config(config)
            # File/permissions transfer only needs the files to be reachable;
            # otherwise the stream must also be parsable.
            availability_method = (
                stream.availability_strategy.check_availability
                if use_file_transfer(parsed_config) or use_permissions_transfer(parsed_config)
                else stream.availability_strategy.check_availability_and_parsability
            )
            (
                stream_is_available,
                reason,
            ) = availability_method(stream, logger, self)
        except AirbyteTracedException as ate:
            errors.append(f"Unable to connect to stream {stream.name} - {ate.message}")
            tracebacks.append(traceback.format_exc())
        except Exception:
            errors.append(f"Unable to connect to stream {stream.name}")
            tracebacks.append(traceback.format_exc())
        else:
            if not stream_is_available and reason:
                errors.append(reason)

    if len(errors) == 1 and len(tracebacks) == 1:
        raise AirbyteTracedException(
            internal_message=tracebacks[0],
            message=f"{errors[0]}",
            failure_type=FailureType.config_error,
        )
    if len(errors) == 1 and len(tracebacks) == 0:
        raise AirbyteTracedException(
            message=f"{errors[0]}",
            failure_type=FailureType.config_error,
        )
    elif len(errors) > 1:
        raise AirbyteTracedException(
            internal_message="\n".join(tracebacks),
            message=f"{len(errors)} streams with errors: {', '.join(errors)}",
            failure_type=FailureType.config_error,
        )

    # Every non-empty `errors` list raised above (a traceback is only ever
    # recorded together with an error), so this is reached with no errors.
    return not bool(errors), (errors or None)
|
|
226
|
+
|
|
227
|
+
def streams(self, config: Mapping[str, Any]) -> List[Stream]:
    """
    Return a list of this source's streams.

    One stream is built per entry of the parsed config's ``streams``; an
    identities stream is appended when the parsed config asks for it.

    :param config: raw user-provided configuration.
    :raises ConfigValidationError: if a pydantic ``ValidationError`` is raised
        while parsing the config or building the streams.
    """

    # During `check` operations we don't have a catalog so cannot create a state manager.
    # Since the state manager is only required for incremental syncs, this is fine.
    state_manager = ConnectorStateManager(state=self.state) if self.catalog else None

    try:
        parsed_config = self._get_parsed_config(config)
        self.stream_reader.config = parsed_config
        if self.stream_permissions_reader:
            self.stream_permissions_reader.config = parsed_config

        built_streams: List[Stream] = []
        for stream_config in parsed_config.streams:
            # Like state_manager, `catalog_stream` may be None during `check`
            catalog_stream = self._get_stream_from_catalog(stream_config)
            if state_manager and catalog_stream:
                stream_state = state_manager.get_stream_state(
                    catalog_stream.name, catalog_stream.namespace
                )
            else:
                stream_state = None
            self._validate_input_schema(stream_config)

            sync_mode = self._get_sync_mode_from_catalog(stream_config.name)

            # Pick the cursor and decide whether the stream is wrapped in the
            # concurrent facade.
            if (
                sync_mode == SyncMode.full_refresh
                and hasattr(self, "_concurrency_level")
                and self._concurrency_level is not None
            ):
                cursor = FileBasedFinalStateCursor(
                    stream_config=stream_config,
                    stream_namespace=None,
                    message_repository=self.message_repository,
                )
                wrap_in_facade = True
            elif (
                sync_mode == SyncMode.incremental
                and issubclass(self.cursor_cls, AbstractConcurrentFileBasedCursor)
                and hasattr(self, "_concurrency_level")
                and self._concurrency_level is not None
            ):
                assert (
                    state_manager is not None
                ), "No ConnectorStateManager was created, but it is required for incremental syncs. This is unexpected. Please contact Support."

                cursor = self.cursor_cls(
                    stream_config,
                    stream_config.name,
                    None,
                    stream_state,
                    self.message_repository,
                    state_manager,
                    CursorField(DefaultFileBasedStream.ab_last_mod_col),
                )
                wrap_in_facade = True
            else:
                cursor = self.cursor_cls(stream_config)
                wrap_in_facade = False

            file_based_stream = self._make_file_based_stream(
                stream_config=stream_config,
                cursor=cursor,
                parsed_config=parsed_config,
            )
            if wrap_in_facade:
                stream = FileBasedStreamFacade.create_from_stream(
                    stream=file_based_stream,
                    source=self,
                    logger=self.logger,
                    state=stream_state,
                    cursor=cursor,
                )
            else:
                stream = file_based_stream

            built_streams.append(stream)

        if include_identities_stream(parsed_config):
            built_streams.append(self._make_identities_stream())
        return built_streams

    except ValidationError as exc:
        raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc
|
|
326
|
+
|
|
327
|
+
def _make_default_stream(
    self,
    stream_config: FileBasedStreamConfig,
    cursor: Optional[AbstractFileBasedCursor],
    parsed_config: AbstractFileBasedSpec,
) -> AbstractFileBasedStream:
    """
    Build a ``DefaultFileBasedStream`` for ``stream_config``.

    :param stream_config: configuration of the stream to build.
    :param cursor: cursor handed to the stream.
    :param parsed_config: parsed source config, used to derive the file-transfer
        and preserve-directory-structure flags.
    """
    schema_from_catalog = self.stream_schemas.get(stream_config.name)
    policy = self._validate_and_get_validation_policy(stream_config)
    file_transfer_enabled = use_file_transfer(parsed_config)
    keep_directory_structure = preserve_directory_structure(parsed_config)
    return DefaultFileBasedStream(
        config=stream_config,
        catalog_schema=schema_from_catalog,
        stream_reader=self.stream_reader,
        availability_strategy=self.availability_strategy,
        discovery_policy=self.discovery_policy,
        parsers=self.parsers,
        validation_policy=policy,
        errors_collector=self.errors_collector,
        cursor=cursor,
        use_file_transfer=file_transfer_enabled,
        preserve_directory_structure=keep_directory_structure,
    )
|
|
346
|
+
|
|
347
|
+
def _ensure_permissions_reader_available(self) -> None:
    """
    Check that this source was given a stream permissions reader.

    :raises ValueError: if ``stream_permissions_reader`` is falsy.
    """
    if self.stream_permissions_reader:
        return
    raise ValueError(
        "Stream permissions reader is required for streams that use permissions transfer mode."
    )
|
|
356
|
+
|
|
357
|
+
def _make_permissions_stream(
    self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor]
) -> AbstractFileBasedStream:
    """
    Build a ``PermissionsFileBasedStream`` for ``stream_config``.

    :raises ValueError: if no stream permissions reader was provided.
    """
    self._ensure_permissions_reader_available()
    schema_from_catalog = self.stream_schemas.get(stream_config.name)
    policy = self._validate_and_get_validation_policy(stream_config)
    return PermissionsFileBasedStream(
        config=stream_config,
        catalog_schema=schema_from_catalog,
        stream_reader=self.stream_reader,
        availability_strategy=self.availability_strategy,
        discovery_policy=self.discovery_policy,
        parsers=self.parsers,
        validation_policy=policy,
        errors_collector=self.errors_collector,
        cursor=cursor,
        stream_permissions_reader=self.stream_permissions_reader,  # type: ignore
    )
|
|
376
|
+
|
|
377
|
+
def _make_file_based_stream(
    self,
    stream_config: FileBasedStreamConfig,
    cursor: Optional[AbstractFileBasedCursor],
    parsed_config: AbstractFileBasedSpec,
) -> AbstractFileBasedStream:
    """
    Build the stream matching the transfer mode selected in ``parsed_config``:
    a permissions stream for permissions transfer, the default stream otherwise.
    """
    if use_permissions_transfer(parsed_config):
        return self._make_permissions_stream(stream_config, cursor)
    # we should have a stream for File transfer mode to decouple from DefaultFileBasedStream
    return self._make_default_stream(stream_config, cursor, parsed_config)
|
|
391
|
+
|
|
392
|
+
def _make_identities_stream(
    self,
) -> Stream:
    """
    Build the ``FileIdentitiesStream`` of this source.

    :raises ValueError: if no stream permissions reader was provided.
    """
    self._ensure_permissions_reader_available()
    identities_schema = self.stream_schemas.get(FileIdentitiesStream.IDENTITIES_STREAM_NAME)
    return FileIdentitiesStream(
        catalog_schema=identities_schema,
        stream_permissions_reader=self.stream_permissions_reader,  # type: ignore
        discovery_policy=self.discovery_policy,
        errors_collector=self.errors_collector,
    )
|
|
402
|
+
|
|
403
|
+
def _get_stream_from_catalog(
    self, stream_config: FileBasedStreamConfig
) -> Optional[AirbyteStream]:
    """
    Return the catalog's stream whose name equals ``stream_config.name``.

    Returns ``None`` when there is no catalog or no stream with that name.
    """
    if not self.catalog:
        return None
    configured_streams = self.catalog.streams or []
    return next(
        (
            configured.stream
            for configured in configured_streams
            if configured.stream.name == stream_config.name
        ),
        None,
    )
|
|
411
|
+
|
|
412
|
+
def _get_sync_mode_from_catalog(self, stream_name: str) -> Optional[SyncMode]:
    """
    Return the sync mode configured in the catalog for ``stream_name``.

    Returns ``None`` when there is no catalog. When a catalog exists but holds
    no stream with that name, a warning is logged and ``None`` is returned.
    """
    if not self.catalog:
        return None
    for configured in self.catalog.streams:
        if configured.stream.name == stream_name:
            return configured.sync_mode
    self.logger.warning(f"No sync mode was found for {stream_name}.")
    return None
|
|
419
|
+
|
|
420
|
+
def read(
    self,
    logger: logging.Logger,
    config: Mapping[str, Any],
    catalog: ConfiguredAirbyteCatalog,
    state: Optional[List[AirbyteStateMessage]] = None,
) -> Iterator[AirbyteMessage]:
    """
    Yield the parent class's read output, then the collected errors, then one
    analytics message per file type giving the number of configured streams
    that use it.
    """
    yield from super().read(logger, config, catalog, state)
    # emit all the errors collected
    yield from self.errors_collector.yield_and_raise_collected()
    # count streams using a certain parser
    parsed_config = self._get_parsed_config(config)
    streams_per_filetype = Counter(
        stream_config.format.filetype for stream_config in parsed_config.streams
    )
    for filetype, stream_count in streams_per_filetype.items():
        yield create_analytics_message(f"file-cdk-{filetype}-stream-count", stream_count)
|
|
436
|
+
|
|
437
|
+
def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
    """
    Return the specification describing what fields a user can configure when
    setting up a file-based source.

    The documentation URL and the connection schema both come from ``spec_class``;
    positional and keyword arguments are accepted but not used.
    """
    documentation_url = self.spec_class.documentation_url()
    connection_schema = self.spec_class.schema()
    return ConnectorSpecification(
        documentationUrl=documentation_url,
        connectionSpecification=connection_schema,
    )
|
|
446
|
+
|
|
447
|
+
def _get_parsed_config(self, config: Mapping[str, Any]) -> AbstractFileBasedSpec:
    """Instantiate ``spec_class`` with the entries of ``config`` as keyword arguments."""
    spec_cls = self.spec_class
    return spec_cls(**config)
|
|
449
|
+
|
|
450
|
+
def _validate_and_get_validation_policy(
    self, stream_config: FileBasedStreamConfig
) -> AbstractSchemaValidationPolicy:
    """
    Return the schema validation policy registered for the stream's
    ``validation_policy``.

    :raises ValidationError: if that policy is not a key of ``validation_policies``.
    """
    requested_policy = stream_config.validation_policy
    if requested_policy in self.validation_policies:
        return self.validation_policies[requested_policy]
    # This should never happen because we validate the config against the schema's validation_policy enum
    raise ValidationError(
        f"`validation_policy` must be one of {list(self.validation_policies.keys())}",
        model=FileBasedStreamConfig,
    )
|
|
460
|
+
|
|
461
|
+
def _validate_input_schema(self, stream_config: FileBasedStreamConfig) -> None:
    """
    Reject stream configs that set both ``schemaless`` and ``input_schema``.

    :raises ValidationError: if both options are truthy.
    """
    both_set = stream_config.schemaless and stream_config.input_schema
    if not both_set:
        return
    raise ValidationError(
        "`input_schema` and `schemaless` options cannot both be set",
        model=FileBasedStreamConfig,
    )
|