airbyte-cdk 0.72.0__py3-none-any.whl → 6.13.1.dev4106__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/__init__.py +355 -6
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +29 -10
- airbyte_cdk/connector.py +24 -24
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
- airbyte_cdk/connector_builder/main.py +45 -13
- airbyte_cdk/connector_builder/message_grouper.py +189 -50
- airbyte_cdk/connector_builder/models.py +3 -2
- airbyte_cdk/destinations/__init__.py +4 -3
- airbyte_cdk/destinations/destination.py +54 -20
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/config.py +40 -17
- airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
- airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
- airbyte_cdk/entrypoint.py +153 -44
- airbyte_cdk/exception_handler.py +21 -3
- airbyte_cdk/logger.py +30 -44
- airbyte_cdk/models/__init__.py +13 -2
- airbyte_cdk/models/airbyte_protocol.py +86 -1
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/file_transfer_record_message.py +13 -0
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/__init__.py +5 -1
- airbyte_cdk/sources/abstract_source.py +125 -79
- airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
- airbyte_cdk/sources/config.py +3 -2
- airbyte_cdk/sources/connector_state_manager.py +49 -83
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
- airbyte_cdk/sources/declarative/auth/token.py +28 -10
- airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
- airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1213 -88
- airbyte_cdk/sources/declarative/declarative_source.py +5 -2
- airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
- airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
- airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
- airbyte_cdk/sources/declarative/extractors/record_filter.py +65 -8
- airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +25 -3
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +159 -74
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
- airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
- airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +1 -1
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1329 -595
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1699 -226
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
- airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +228 -72
- airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
- airbyte_cdk/sources/declarative/spec/spec.py +12 -5
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
- airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
- airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
- airbyte_cdk/sources/declarative/types.py +19 -110
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
- airbyte_cdk/sources/embedded/base_integration.py +16 -5
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +5 -2
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +58 -10
- airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
- airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
- airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
- airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
- airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +52 -15
- airbyte_cdk/sources/file_based/file_based_source.py +163 -33
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +83 -5
- airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +145 -41
- airbyte_cdk/sources/file_based/remote_file.py +1 -1
- airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +175 -45
- airbyte_cdk/sources/http_logger.py +8 -3
- airbyte_cdk/sources/message/__init__.py +7 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +42 -38
- airbyte_cdk/sources/streams/__init__.py +2 -2
- airbyte_cdk/sources/streams/availability_strategy.py +54 -3
- airbyte_cdk/sources/streams/call_rate.py +64 -21
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
- airbyte_cdk/sources/streams/concurrent/cursor.py +298 -42
- airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
- airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
- airbyte_cdk/sources/streams/core.py +412 -87
- airbyte_cdk/sources/streams/http/__init__.py +2 -1
- airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +27 -7
- airbyte_cdk/sources/streams/http/http.py +369 -246
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +154 -0
- airbyte_cdk/sources/utils/record_helper.py +36 -21
- airbyte_cdk/sources/utils/schema_helpers.py +13 -6
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +54 -20
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/catalog_builder.py +70 -18
- airbyte_cdk/test/entrypoint_wrapper.py +117 -42
- airbyte_cdk/test/mock_http/__init__.py +1 -1
- airbyte_cdk/test/mock_http/matcher.py +6 -0
- airbyte_cdk/test/mock_http/mocker.py +57 -10
- airbyte_cdk/test/mock_http/request.py +19 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +32 -16
- airbyte_cdk/test/state_builder.py +18 -10
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +2 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/event_timing.py +10 -10
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +20 -11
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +198 -28
- airbyte_cdk/utils/slice_hasher.py +30 -0
- airbyte_cdk/utils/spec_schema_transformations.py +6 -3
- airbyte_cdk/utils/stream_status_utils.py +8 -1
- airbyte_cdk/utils/traced_exception.py +61 -21
- airbyte_cdk-6.13.1.dev4106.dist-info/METADATA +109 -0
- airbyte_cdk-6.13.1.dev4106.dist-info/RECORD +349 -0
- {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/WHEEL +1 -2
- airbyte_cdk-6.13.1.dev4106.dist-info/entry_points.txt +3 -0
- airbyte_cdk/sources/declarative/create_partial.py +0 -92
- airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
- airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
- airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
- airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
- airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
- airbyte_cdk/sources/deprecated/base_source.py +0 -94
- airbyte_cdk/sources/deprecated/client.py +0 -99
- airbyte_cdk/sources/singer/__init__.py +0 -8
- airbyte_cdk/sources/singer/singer_helpers.py +0 -304
- airbyte_cdk/sources/singer/source.py +0 -186
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
- airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
- airbyte_cdk/sources/streams/http/auth/core.py +0 -29
- airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
- airbyte_cdk/sources/streams/http/auth/token.py +0 -47
- airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
- airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
- airbyte_cdk/sources/utils/schema_models.py +0 -84
- airbyte_cdk-0.72.0.dist-info/METADATA +0 -243
- airbyte_cdk-0.72.0.dist-info/RECORD +0 -466
- airbyte_cdk-0.72.0.dist-info/top_level.txt +0 -3
- source_declarative_manifest/main.py +0 -29
- unit_tests/connector_builder/__init__.py +0 -3
- unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
- unit_tests/connector_builder/test_message_grouper.py +0 -713
- unit_tests/connector_builder/utils.py +0 -27
- unit_tests/destinations/test_destination.py +0 -243
- unit_tests/singer/test_singer_helpers.py +0 -56
- unit_tests/singer/test_singer_source.py +0 -112
- unit_tests/sources/__init__.py +0 -0
- unit_tests/sources/concurrent_source/__init__.py +0 -3
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
- unit_tests/sources/declarative/__init__.py +0 -3
- unit_tests/sources/declarative/auth/__init__.py +0 -3
- unit_tests/sources/declarative/auth/test_oauth.py +0 -331
- unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
- unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
- unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
- unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
- unit_tests/sources/declarative/checks/__init__.py +0 -3
- unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
- unit_tests/sources/declarative/decoders/__init__.py +0 -0
- unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
- unit_tests/sources/declarative/external_component.py +0 -13
- unit_tests/sources/declarative/extractors/__init__.py +0 -3
- unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
- unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
- unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
- unit_tests/sources/declarative/incremental/__init__.py +0 -0
- unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
- unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
- unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
- unit_tests/sources/declarative/interpolation/__init__.py +0 -3
- unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
- unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
- unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
- unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
- unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
- unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
- unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
- unit_tests/sources/declarative/parsers/__init__.py +0 -3
- unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
- unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
- unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1841
- unit_tests/sources/declarative/parsers/testing_components.py +0 -36
- unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
- unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
- unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
- unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
- unit_tests/sources/declarative/requesters/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
- unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
- unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
- unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
- unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
- unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
- unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
- unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
- unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
- unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
- unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
- unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
- unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
- unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
- unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
- unit_tests/sources/declarative/retrievers/__init__.py +0 -3
- unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
- unit_tests/sources/declarative/schema/__init__.py +0 -6
- unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
- unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
- unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
- unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
- unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
- unit_tests/sources/declarative/states/__init__.py +0 -3
- unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
- unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
- unit_tests/sources/declarative/test_create_partial.py +0 -83
- unit_tests/sources/declarative/test_declarative_stream.py +0 -103
- unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
- unit_tests/sources/declarative/test_types.py +0 -39
- unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
- unit_tests/sources/file_based/__init__.py +0 -0
- unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
- unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
- unit_tests/sources/file_based/config/__init__.py +0 -0
- unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
- unit_tests/sources/file_based/config/test_csv_format.py +0 -34
- unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
- unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
- unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
- unit_tests/sources/file_based/file_types/__init__.py +0 -0
- unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
- unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
- unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
- unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
- unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
- unit_tests/sources/file_based/helpers.py +0 -70
- unit_tests/sources/file_based/in_memory_files_source.py +0 -211
- unit_tests/sources/file_based/scenarios/__init__.py +0 -0
- unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
- unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
- unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
- unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
- unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
- unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
- unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
- unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
- unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
- unit_tests/sources/file_based/stream/__init__.py +0 -0
- unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
- unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
- unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
- unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
- unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
- unit_tests/sources/file_based/test_scenarios.py +0 -253
- unit_tests/sources/file_based/test_schema_helpers.py +0 -346
- unit_tests/sources/fixtures/__init__.py +0 -3
- unit_tests/sources/fixtures/source_test_fixture.py +0 -153
- unit_tests/sources/message/__init__.py +0 -0
- unit_tests/sources/message/test_repository.py +0 -153
- unit_tests/sources/streams/__init__.py +0 -0
- unit_tests/sources/streams/concurrent/__init__.py +0 -3
- unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
- unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
- unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
- unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
- unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
- unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
- unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
- unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
- unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
- unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
- unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
- unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
- unit_tests/sources/streams/http/__init__.py +0 -0
- unit_tests/sources/streams/http/auth/__init__.py +0 -0
- unit_tests/sources/streams/http/auth/test_auth.py +0 -173
- unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
- unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
- unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
- unit_tests/sources/streams/http/test_http.py +0 -635
- unit_tests/sources/streams/test_availability_strategy.py +0 -70
- unit_tests/sources/streams/test_call_rate.py +0 -300
- unit_tests/sources/streams/test_stream_read.py +0 -405
- unit_tests/sources/streams/test_streams_core.py +0 -184
- unit_tests/sources/test_abstract_source.py +0 -1442
- unit_tests/sources/test_concurrent_source.py +0 -112
- unit_tests/sources/test_config.py +0 -92
- unit_tests/sources/test_connector_state_manager.py +0 -482
- unit_tests/sources/test_http_logger.py +0 -252
- unit_tests/sources/test_integration_source.py +0 -86
- unit_tests/sources/test_source.py +0 -684
- unit_tests/sources/test_source_read.py +0 -460
- unit_tests/test/__init__.py +0 -0
- unit_tests/test/mock_http/__init__.py +0 -0
- unit_tests/test/mock_http/test_matcher.py +0 -53
- unit_tests/test/mock_http/test_mocker.py +0 -214
- unit_tests/test/mock_http/test_request.py +0 -117
- unit_tests/test/mock_http/test_response_builder.py +0 -177
- unit_tests/test/test_entrypoint_wrapper.py +0 -240
- unit_tests/utils/__init__.py +0 -0
- unit_tests/utils/test_datetime_format_inferrer.py +0 -60
- unit_tests/utils/test_mapping_helpers.py +0 -54
- unit_tests/utils/test_message_utils.py +0 -91
- unit_tests/utils/test_rate_limiting.py +0 -26
- unit_tests/utils/test_schema_inferrer.py +0 -202
- unit_tests/utils/test_secret_utils.py +0 -135
- unit_tests/utils/test_stream_status_utils.py +0 -61
- unit_tests/utils/test_traced_exception.py +0 -107
- /airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
- {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
- {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
- {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
- {airbyte_cdk-0.72.0.dist-info → airbyte_cdk-6.13.1.dev4106.dist-info}/LICENSE.txt +0 -0
@@ -11,26 +11,21 @@ from airbyte_cdk.utils import AirbyteTracedException
|
|
11
11
|
|
12
12
|
class FileBasedSourceError(Enum):
|
13
13
|
EMPTY_STREAM = "No files were identified in the stream. This may be because there are no files in the specified container, or because your glob patterns did not match any files. Please verify that your source contains files last modified after the start_date and that your glob patterns are not overly strict."
|
14
|
-
GLOB_PARSE_ERROR =
|
15
|
-
|
16
|
-
)
|
14
|
+
GLOB_PARSE_ERROR = "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
|
15
|
+
ENCODING_ERROR = "File encoding error. The configured encoding must match file encoding."
|
17
16
|
ERROR_CASTING_VALUE = "Could not cast the value to the expected type."
|
18
17
|
ERROR_CASTING_VALUE_UNRECOGNIZED_TYPE = "Could not cast the value to the expected type because the type is not recognized. Valid types are null, array, boolean, integer, number, object, and string."
|
19
18
|
ERROR_DECODING_VALUE = "Expected a JSON-decodeable value but could not decode record."
|
20
|
-
ERROR_LISTING_FILES =
|
21
|
-
|
22
|
-
)
|
23
|
-
ERROR_READING_FILE = (
|
24
|
-
"Error opening file. Please check the credentials provided in the config and verify that they provide permission to read files."
|
25
|
-
)
|
19
|
+
ERROR_LISTING_FILES = "Error listing files. Please check the credentials provided in the config and verify that they provide permission to list files."
|
20
|
+
ERROR_READING_FILE = "Error opening file. Please check the credentials provided in the config and verify that they provide permission to read files."
|
26
21
|
ERROR_PARSING_RECORD = "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable."
|
27
|
-
ERROR_PARSING_USER_PROVIDED_SCHEMA =
|
22
|
+
ERROR_PARSING_USER_PROVIDED_SCHEMA = (
|
23
|
+
"The provided schema could not be transformed into valid JSON Schema."
|
24
|
+
)
|
28
25
|
ERROR_VALIDATING_RECORD = "One or more records do not pass the schema validation policy. Please modify your input schema, or select a more lenient validation policy."
|
29
26
|
ERROR_PARSING_RECORD_MISMATCHED_COLUMNS = "A header field has resolved to `None`. This indicates that the CSV has more rows than the number of header fields. If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows."
|
30
27
|
ERROR_PARSING_RECORD_MISMATCHED_ROWS = "A row's value has resolved to `None`. This indicates that the CSV has more columns in the header field than the number of columns in the row(s). If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows."
|
31
|
-
STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY =
|
32
|
-
"Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema."
|
33
|
-
)
|
28
|
+
STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema."
|
34
29
|
NULL_VALUE_IN_SCHEMA = "Error during schema inference: no type was detected for key."
|
35
30
|
UNRECOGNIZED_TYPE = "Error during schema inference: unrecognized type."
|
36
31
|
SCHEMA_INFERENCE_ERROR = "Error inferring schema from files. Are the files valid?"
|
@@ -38,7 +33,9 @@ class FileBasedSourceError(Enum):
|
|
38
33
|
CONFIG_VALIDATION_ERROR = "Error creating stream config object."
|
39
34
|
MISSING_SCHEMA = "Expected `json_schema` in the configured catalog but it is missing."
|
40
35
|
UNDEFINED_PARSER = "No parser is defined for this file type."
|
41
|
-
UNDEFINED_VALIDATION_POLICY =
|
36
|
+
UNDEFINED_VALIDATION_POLICY = (
|
37
|
+
"The validation policy defined in the config does not exist for the source."
|
38
|
+
)
|
42
39
|
|
43
40
|
|
44
41
|
class FileBasedErrorsCollector:
|
@@ -69,7 +66,9 @@ class BaseFileBasedSourceError(Exception):
|
|
69
66
|
def __init__(self, error: Union[FileBasedSourceError, str], **kwargs): # type: ignore # noqa
|
70
67
|
if isinstance(error, FileBasedSourceError):
|
71
68
|
error = FileBasedSourceError(error).value
|
72
|
-
super().__init__(
|
69
|
+
super().__init__(
|
70
|
+
f"{error} Contact Support if you need assistance.\n{' '.join([f'{k}={v}' for k, v in kwargs.items()])}"
|
71
|
+
)
|
73
72
|
|
74
73
|
|
75
74
|
class ConfigValidationError(BaseFileBasedSourceError):
|
@@ -112,6 +111,40 @@ class ErrorListingFiles(BaseFileBasedSourceError):
|
|
112
111
|
pass
|
113
112
|
|
114
113
|
|
114
|
+
class DuplicatedFilesError(BaseFileBasedSourceError):
|
115
|
+
def __init__(self, duplicated_files_names: List[dict[str, List[str]]], **kwargs: Any):
|
116
|
+
self._duplicated_files_names = duplicated_files_names
|
117
|
+
self._stream_name: str = kwargs["stream"]
|
118
|
+
super().__init__(self._format_duplicate_files_error_message(), **kwargs)
|
119
|
+
|
120
|
+
def _format_duplicate_files_error_message(self) -> str:
|
121
|
+
duplicated_files_messages = []
|
122
|
+
for duplicated_file in self._duplicated_files_names:
|
123
|
+
for duplicated_file_name, file_paths in duplicated_file.items():
|
124
|
+
file_duplicated_message = (
|
125
|
+
f"{len(file_paths)} duplicates found for file name {duplicated_file_name}:\n\n"
|
126
|
+
+ "".join(f"\n - {file_paths}")
|
127
|
+
)
|
128
|
+
duplicated_files_messages.append(file_duplicated_message)
|
129
|
+
|
130
|
+
error_message = (
|
131
|
+
f"ERROR: Duplicate filenames found for stream {self._stream_name}. "
|
132
|
+
"Duplicate file names are not allowed if the Preserve Sub-Directories in File Paths option is disabled. "
|
133
|
+
"Please remove or rename the duplicate files before attempting to re-run the sync.\n\n"
|
134
|
+
+ "\n".join(duplicated_files_messages)
|
135
|
+
)
|
136
|
+
|
137
|
+
return error_message
|
138
|
+
|
139
|
+
def __repr__(self) -> str:
|
140
|
+
"""Return a string representation of the exception."""
|
141
|
+
class_name = self.__class__.__name__
|
142
|
+
properties_str = ", ".join(
|
143
|
+
f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
|
144
|
+
)
|
145
|
+
return f"{class_name}({properties_str})"
|
146
|
+
|
147
|
+
|
115
148
|
class CustomFileBasedException(AirbyteTracedException):
|
116
149
|
"""
|
117
150
|
A specialized exception for file-based connectors.
|
@@ -120,3 +153,7 @@ class CustomFileBasedException(AirbyteTracedException):
|
|
120
153
|
"""
|
121
154
|
|
122
155
|
pass
|
156
|
+
|
157
|
+
|
158
|
+
class FileSizeLimitError(CustomFileBasedException):
|
159
|
+
pass
|
@@ -6,7 +6,9 @@ import logging
|
|
6
6
|
import traceback
|
7
7
|
from abc import ABC
|
8
8
|
from collections import Counter
|
9
|
-
from typing import Any, Iterator, List, Mapping,
|
9
|
+
from typing import Any, Iterator, List, Mapping, Optional, Tuple, Type, Union
|
10
|
+
|
11
|
+
from pydantic.v1.error_wrappers import ValidationError
|
10
12
|
|
11
13
|
from airbyte_cdk.logger import AirbyteLogFormatter, init_logger
|
12
14
|
from airbyte_cdk.models import (
|
@@ -22,15 +24,31 @@ from airbyte_cdk.models import (
|
|
22
24
|
from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
|
23
25
|
from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
|
24
26
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
25
|
-
from airbyte_cdk.sources.file_based.availability_strategy import
|
27
|
+
from airbyte_cdk.sources.file_based.availability_strategy import (
|
28
|
+
AbstractFileBasedAvailabilityStrategy,
|
29
|
+
DefaultFileBasedAvailabilityStrategy,
|
30
|
+
)
|
26
31
|
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
|
27
|
-
from airbyte_cdk.sources.file_based.config.file_based_stream_config import
|
28
|
-
|
29
|
-
|
32
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
|
33
|
+
FileBasedStreamConfig,
|
34
|
+
ValidationPolicy,
|
35
|
+
)
|
36
|
+
from airbyte_cdk.sources.file_based.discovery_policy import (
|
37
|
+
AbstractDiscoveryPolicy,
|
38
|
+
DefaultDiscoveryPolicy,
|
39
|
+
)
|
40
|
+
from airbyte_cdk.sources.file_based.exceptions import (
|
41
|
+
ConfigValidationError,
|
42
|
+
FileBasedErrorsCollector,
|
43
|
+
FileBasedSourceError,
|
44
|
+
)
|
30
45
|
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
31
46
|
from airbyte_cdk.sources.file_based.file_types import default_parsers
|
32
47
|
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
33
|
-
from airbyte_cdk.sources.file_based.schema_validation_policies import
|
48
|
+
from airbyte_cdk.sources.file_based.schema_validation_policies import (
|
49
|
+
DEFAULT_SCHEMA_VALIDATION_POLICIES,
|
50
|
+
AbstractSchemaValidationPolicy,
|
51
|
+
)
|
34
52
|
from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
|
35
53
|
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
|
36
54
|
from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
|
@@ -44,7 +62,6 @@ from airbyte_cdk.sources.streams import Stream
|
|
44
62
|
from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
|
45
63
|
from airbyte_cdk.utils.analytics_message import create_analytics_message
|
46
64
|
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
47
|
-
from pydantic.error_wrappers import ValidationError
|
48
65
|
|
49
66
|
DEFAULT_CONCURRENCY = 100
|
50
67
|
MAX_CONCURRENCY = 100
|
@@ -61,29 +78,41 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
61
78
|
spec_class: Type[AbstractFileBasedSpec],
|
62
79
|
catalog: Optional[ConfiguredAirbyteCatalog],
|
63
80
|
config: Optional[Mapping[str, Any]],
|
64
|
-
state: Optional[
|
81
|
+
state: Optional[List[AirbyteStateMessage]],
|
65
82
|
availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None,
|
66
83
|
discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
|
67
84
|
parsers: Mapping[Type[Any], FileTypeParser] = default_parsers,
|
68
|
-
validation_policies: Mapping[
|
69
|
-
|
85
|
+
validation_policies: Mapping[
|
86
|
+
ValidationPolicy, AbstractSchemaValidationPolicy
|
87
|
+
] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
|
88
|
+
cursor_cls: Type[
|
89
|
+
Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]
|
90
|
+
] = FileBasedConcurrentCursor,
|
70
91
|
):
|
71
92
|
self.stream_reader = stream_reader
|
72
93
|
self.spec_class = spec_class
|
73
94
|
self.config = config
|
74
95
|
self.catalog = catalog
|
75
96
|
self.state = state
|
76
|
-
self.availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(
|
97
|
+
self.availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(
|
98
|
+
stream_reader
|
99
|
+
)
|
77
100
|
self.discovery_policy = discovery_policy
|
78
101
|
self.parsers = parsers
|
79
102
|
self.validation_policies = validation_policies
|
80
|
-
self.stream_schemas =
|
103
|
+
self.stream_schemas = (
|
104
|
+
{s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
|
105
|
+
)
|
81
106
|
self.cursor_cls = cursor_cls
|
82
107
|
self.logger = init_logger(f"airbyte.{self.name}")
|
83
108
|
self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
|
84
109
|
self._message_repository: Optional[MessageRepository] = None
|
85
110
|
concurrent_source = ConcurrentSource.create(
|
86
|
-
MAX_CONCURRENCY,
|
111
|
+
MAX_CONCURRENCY,
|
112
|
+
INITIAL_N_PARTITIONS,
|
113
|
+
self.logger,
|
114
|
+
self._slice_logger,
|
115
|
+
self.message_repository,
|
87
116
|
)
|
88
117
|
self._state = None
|
89
118
|
super().__init__(concurrent_source)
|
@@ -91,10 +120,14 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
91
120
|
@property
|
92
121
|
def message_repository(self) -> MessageRepository:
|
93
122
|
if self._message_repository is None:
|
94
|
-
self._message_repository = InMemoryMessageRepository(
|
123
|
+
self._message_repository = InMemoryMessageRepository(
|
124
|
+
Level(AirbyteLogFormatter.level_mapping[self.logger.level])
|
125
|
+
)
|
95
126
|
return self._message_repository
|
96
127
|
|
97
|
-
def check_connection(
|
128
|
+
def check_connection(
|
129
|
+
self, logger: logging.Logger, config: Mapping[str, Any]
|
130
|
+
) -> Tuple[bool, Optional[Any]]:
|
98
131
|
"""
|
99
132
|
Check that the source can be accessed using the user-provided configuration.
|
100
133
|
|
@@ -122,20 +155,49 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
122
155
|
)
|
123
156
|
|
124
157
|
errors = []
|
158
|
+
tracebacks = []
|
125
159
|
for stream in streams:
|
126
160
|
if not isinstance(stream, AbstractFileBasedStream):
|
127
161
|
raise ValueError(f"Stream {stream} is not a file-based stream.")
|
128
162
|
try:
|
163
|
+
parsed_config = self._get_parsed_config(config)
|
164
|
+
availability_method = (
|
165
|
+
stream.availability_strategy.check_availability
|
166
|
+
if self._use_file_transfer(parsed_config)
|
167
|
+
else stream.availability_strategy.check_availability_and_parsability
|
168
|
+
)
|
129
169
|
(
|
130
170
|
stream_is_available,
|
131
171
|
reason,
|
132
|
-
) =
|
172
|
+
) = availability_method(stream, logger, self)
|
173
|
+
except AirbyteTracedException as ate:
|
174
|
+
errors.append(f"Unable to connect to stream {stream.name} - {ate.message}")
|
175
|
+
tracebacks.append(traceback.format_exc())
|
133
176
|
except Exception:
|
134
|
-
errors.append(f"Unable to connect to stream {stream.name}
|
177
|
+
errors.append(f"Unable to connect to stream {stream.name}")
|
178
|
+
tracebacks.append(traceback.format_exc())
|
135
179
|
else:
|
136
180
|
if not stream_is_available and reason:
|
137
181
|
errors.append(reason)
|
138
182
|
|
183
|
+
if len(errors) == 1 and len(tracebacks) == 1:
|
184
|
+
raise AirbyteTracedException(
|
185
|
+
internal_message=tracebacks[0],
|
186
|
+
message=f"{errors[0]}",
|
187
|
+
failure_type=FailureType.config_error,
|
188
|
+
)
|
189
|
+
if len(errors) == 1 and len(tracebacks) == 0:
|
190
|
+
raise AirbyteTracedException(
|
191
|
+
message=f"{errors[0]}",
|
192
|
+
failure_type=FailureType.config_error,
|
193
|
+
)
|
194
|
+
elif len(errors) > 1:
|
195
|
+
raise AirbyteTracedException(
|
196
|
+
internal_message="\n".join(tracebacks),
|
197
|
+
message=f"{len(errors)} streams with errors: {', '.join(error for error in errors)}",
|
198
|
+
failure_type=FailureType.config_error,
|
199
|
+
)
|
200
|
+
|
139
201
|
return not bool(errors), (errors or None)
|
140
202
|
|
141
203
|
def streams(self, config: Mapping[str, Any]) -> List[Stream]:
|
@@ -144,10 +206,7 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
144
206
|
"""
|
145
207
|
|
146
208
|
if self.catalog:
|
147
|
-
state_manager = ConnectorStateManager(
|
148
|
-
stream_instance_map={s.stream.name: s.stream for s in self.catalog.streams},
|
149
|
-
state=self.state,
|
150
|
-
)
|
209
|
+
state_manager = ConnectorStateManager(state=self.state)
|
151
210
|
else:
|
152
211
|
# During `check` operations we don't have a catalog so cannot create a state manager.
|
153
212
|
# Since the state manager is only required for incremental syncs, this is fine.
|
@@ -169,12 +228,26 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
169
228
|
|
170
229
|
sync_mode = self._get_sync_mode_from_catalog(stream_config.name)
|
171
230
|
|
172
|
-
if
|
231
|
+
if (
|
232
|
+
sync_mode == SyncMode.full_refresh
|
233
|
+
and hasattr(self, "_concurrency_level")
|
234
|
+
and self._concurrency_level is not None
|
235
|
+
):
|
173
236
|
cursor = FileBasedFinalStateCursor(
|
174
|
-
stream_config=stream_config,
|
237
|
+
stream_config=stream_config,
|
238
|
+
stream_namespace=None,
|
239
|
+
message_repository=self.message_repository,
|
175
240
|
)
|
176
241
|
stream = FileBasedStreamFacade.create_from_stream(
|
177
|
-
self._make_default_stream(
|
242
|
+
stream=self._make_default_stream(
|
243
|
+
stream_config=stream_config,
|
244
|
+
cursor=cursor,
|
245
|
+
parsed_config=parsed_config,
|
246
|
+
),
|
247
|
+
source=self,
|
248
|
+
logger=self.logger,
|
249
|
+
state=stream_state,
|
250
|
+
cursor=cursor,
|
178
251
|
)
|
179
252
|
|
180
253
|
elif (
|
@@ -197,11 +270,23 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
197
270
|
CursorField(DefaultFileBasedStream.ab_last_mod_col),
|
198
271
|
)
|
199
272
|
stream = FileBasedStreamFacade.create_from_stream(
|
200
|
-
self._make_default_stream(
|
273
|
+
stream=self._make_default_stream(
|
274
|
+
stream_config=stream_config,
|
275
|
+
cursor=cursor,
|
276
|
+
parsed_config=parsed_config,
|
277
|
+
),
|
278
|
+
source=self,
|
279
|
+
logger=self.logger,
|
280
|
+
state=stream_state,
|
281
|
+
cursor=cursor,
|
201
282
|
)
|
202
283
|
else:
|
203
284
|
cursor = self.cursor_cls(stream_config)
|
204
|
-
stream = self._make_default_stream(
|
285
|
+
stream = self._make_default_stream(
|
286
|
+
stream_config=stream_config,
|
287
|
+
cursor=cursor,
|
288
|
+
parsed_config=parsed_config,
|
289
|
+
)
|
205
290
|
|
206
291
|
streams.append(stream)
|
207
292
|
return streams
|
@@ -210,7 +295,10 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
210
295
|
raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc
|
211
296
|
|
212
297
|
def _make_default_stream(
|
213
|
-
self,
|
298
|
+
self,
|
299
|
+
stream_config: FileBasedStreamConfig,
|
300
|
+
cursor: Optional[AbstractFileBasedCursor],
|
301
|
+
parsed_config: AbstractFileBasedSpec,
|
214
302
|
) -> AbstractFileBasedStream:
|
215
303
|
return DefaultFileBasedStream(
|
216
304
|
config=stream_config,
|
@@ -222,9 +310,13 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
222
310
|
validation_policy=self._validate_and_get_validation_policy(stream_config),
|
223
311
|
errors_collector=self.errors_collector,
|
224
312
|
cursor=cursor,
|
313
|
+
use_file_transfer=self._use_file_transfer(parsed_config),
|
314
|
+
preserve_directory_structure=self._preserve_directory_structure(parsed_config),
|
225
315
|
)
|
226
316
|
|
227
|
-
def _get_stream_from_catalog(
|
317
|
+
def _get_stream_from_catalog(
|
318
|
+
self, stream_config: FileBasedStreamConfig
|
319
|
+
) -> Optional[AirbyteStream]:
|
228
320
|
if self.catalog:
|
229
321
|
for stream in self.catalog.streams or []:
|
230
322
|
if stream.stream.name == stream_config.name:
|
@@ -244,14 +336,16 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
244
336
|
logger: logging.Logger,
|
245
337
|
config: Mapping[str, Any],
|
246
338
|
catalog: ConfiguredAirbyteCatalog,
|
247
|
-
state: Optional[
|
339
|
+
state: Optional[List[AirbyteStateMessage]] = None,
|
248
340
|
) -> Iterator[AirbyteMessage]:
|
249
341
|
yield from super().read(logger, config, catalog, state)
|
250
342
|
# emit all the errors collected
|
251
343
|
yield from self.errors_collector.yield_and_raise_collected()
|
252
344
|
# count streams using a certain parser
|
253
345
|
parsed_config = self._get_parsed_config(config)
|
254
|
-
for parser, count in Counter(
|
346
|
+
for parser, count in Counter(
|
347
|
+
stream.format.filetype for stream in parsed_config.streams
|
348
|
+
).items():
|
255
349
|
yield create_analytics_message(f"file-cdk-{parser}-stream-count", count)
|
256
350
|
|
257
351
|
def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
|
@@ -267,14 +361,50 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
|
|
267
361
|
def _get_parsed_config(self, config: Mapping[str, Any]) -> AbstractFileBasedSpec:
|
268
362
|
return self.spec_class(**config)
|
269
363
|
|
270
|
-
def _validate_and_get_validation_policy(
|
364
|
+
def _validate_and_get_validation_policy(
|
365
|
+
self, stream_config: FileBasedStreamConfig
|
366
|
+
) -> AbstractSchemaValidationPolicy:
|
271
367
|
if stream_config.validation_policy not in self.validation_policies:
|
272
368
|
# This should never happen because we validate the config against the schema's validation_policy enum
|
273
369
|
raise ValidationError(
|
274
|
-
f"`validation_policy` must be one of {list(self.validation_policies.keys())}",
|
370
|
+
f"`validation_policy` must be one of {list(self.validation_policies.keys())}",
|
371
|
+
model=FileBasedStreamConfig,
|
275
372
|
)
|
276
373
|
return self.validation_policies[stream_config.validation_policy]
|
277
374
|
|
278
375
|
def _validate_input_schema(self, stream_config: FileBasedStreamConfig) -> None:
|
279
376
|
if stream_config.schemaless and stream_config.input_schema:
|
280
|
-
raise ValidationError(
|
377
|
+
raise ValidationError(
|
378
|
+
"`input_schema` and `schemaless` options cannot both be set",
|
379
|
+
model=FileBasedStreamConfig,
|
380
|
+
)
|
381
|
+
|
382
|
+
@staticmethod
|
383
|
+
def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
|
384
|
+
use_file_transfer = (
|
385
|
+
hasattr(parsed_config.delivery_method, "delivery_type")
|
386
|
+
and parsed_config.delivery_method.delivery_type == "use_file_transfer"
|
387
|
+
)
|
388
|
+
return use_file_transfer
|
389
|
+
|
390
|
+
@staticmethod
|
391
|
+
def _preserve_directory_structure(parsed_config: AbstractFileBasedSpec) -> bool:
|
392
|
+
"""
|
393
|
+
Determines whether to preserve directory structure during file transfer.
|
394
|
+
|
395
|
+
When enabled, files maintain their subdirectory paths in the destination.
|
396
|
+
When disabled, files are flattened to the root of the destination.
|
397
|
+
|
398
|
+
Args:
|
399
|
+
parsed_config: The parsed configuration containing delivery method settings
|
400
|
+
|
401
|
+
Returns:
|
402
|
+
True if directory structure should be preserved (default), False otherwise
|
403
|
+
"""
|
404
|
+
if (
|
405
|
+
FileBasedSource._use_file_transfer(parsed_config)
|
406
|
+
and hasattr(parsed_config.delivery_method, "preserve_directory_structure")
|
407
|
+
and parsed_config.delivery_method.preserve_directory_structure is not None
|
408
|
+
):
|
409
|
+
return parsed_config.delivery_method.preserve_directory_structure
|
410
|
+
return True
|
@@ -7,11 +7,13 @@ from abc import ABC, abstractmethod
|
|
7
7
|
from datetime import datetime
|
8
8
|
from enum import Enum
|
9
9
|
from io import IOBase
|
10
|
-
from
|
10
|
+
from os import makedirs, path
|
11
|
+
from typing import Any, Dict, Iterable, List, Optional, Set
|
12
|
+
|
13
|
+
from wcmatch.glob import GLOBSTAR, globmatch
|
11
14
|
|
12
15
|
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
|
13
16
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
14
|
-
from wcmatch.glob import GLOBSTAR, globmatch
|
15
17
|
|
16
18
|
|
17
19
|
class FileReadMode(Enum):
|
@@ -44,7 +46,9 @@ class AbstractFileBasedStreamReader(ABC):
|
|
44
46
|
...
|
45
47
|
|
46
48
|
@abstractmethod
|
47
|
-
def open_file(
|
49
|
+
def open_file(
|
50
|
+
self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger
|
51
|
+
) -> IOBase:
|
48
52
|
"""
|
49
53
|
Return a file handle for reading.
|
50
54
|
|
@@ -79,11 +83,17 @@ class AbstractFileBasedStreamReader(ABC):
|
|
79
83
|
"""
|
80
84
|
...
|
81
85
|
|
82
|
-
def filter_files_by_globs_and_start_date(
|
86
|
+
def filter_files_by_globs_and_start_date(
|
87
|
+
self, files: List[RemoteFile], globs: List[str]
|
88
|
+
) -> Iterable[RemoteFile]:
|
83
89
|
"""
|
84
90
|
Utility method for filtering files based on globs.
|
85
91
|
"""
|
86
|
-
start_date =
|
92
|
+
start_date = (
|
93
|
+
datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT)
|
94
|
+
if self.config and self.config.start_date
|
95
|
+
else None
|
96
|
+
)
|
87
97
|
seen = set()
|
88
98
|
|
89
99
|
for file in files:
|
@@ -92,6 +102,16 @@ class AbstractFileBasedStreamReader(ABC):
|
|
92
102
|
seen.add(file.uri)
|
93
103
|
yield file
|
94
104
|
|
105
|
+
@abstractmethod
|
106
|
+
def file_size(self, file: RemoteFile) -> int:
|
107
|
+
"""Utility method to get size of the remote file.
|
108
|
+
|
109
|
+
This is required for connectors that will support writing to
|
110
|
+
files. If the connector does not support writing files, then the
|
111
|
+
subclass can simply `return 0`.
|
112
|
+
"""
|
113
|
+
...
|
114
|
+
|
95
115
|
@staticmethod
|
96
116
|
def file_matches_globs(file: RemoteFile, globs: List[str]) -> bool:
|
97
117
|
# Use the GLOBSTAR flag to enable recursive ** matching
|
@@ -105,3 +125,61 @@ class AbstractFileBasedStreamReader(ABC):
|
|
105
125
|
"""
|
106
126
|
prefixes = {glob.split("*")[0] for glob in globs}
|
107
127
|
return set(filter(lambda x: bool(x), prefixes))
|
128
|
+
|
129
|
+
def use_file_transfer(self) -> bool:
|
130
|
+
if self.config:
|
131
|
+
use_file_transfer = (
|
132
|
+
hasattr(self.config.delivery_method, "delivery_type")
|
133
|
+
and self.config.delivery_method.delivery_type == "use_file_transfer"
|
134
|
+
)
|
135
|
+
return use_file_transfer
|
136
|
+
return False
|
137
|
+
|
138
|
+
def preserve_directory_structure(self) -> bool:
|
139
|
+
# fall back to preserve subdirectories if config is not present or incomplete
|
140
|
+
if (
|
141
|
+
self.use_file_transfer()
|
142
|
+
and self.config
|
143
|
+
and hasattr(self.config.delivery_method, "preserve_directory_structure")
|
144
|
+
and self.config.delivery_method.preserve_directory_structure is not None
|
145
|
+
):
|
146
|
+
return self.config.delivery_method.preserve_directory_structure
|
147
|
+
return True
|
148
|
+
|
149
|
+
@abstractmethod
|
150
|
+
def get_file(
|
151
|
+
self, file: RemoteFile, local_directory: str, logger: logging.Logger
|
152
|
+
) -> Dict[str, Any]:
|
153
|
+
"""
|
154
|
+
This is required for connectors that will support writing to
|
155
|
+
files. It will handle the logic to download,get,read,acquire or
|
156
|
+
whatever is more efficient to get a file from the source.
|
157
|
+
|
158
|
+
Args:
|
159
|
+
file (RemoteFile): The remote file object containing URI and metadata.
|
160
|
+
local_directory (str): The local directory path where the file will be downloaded.
|
161
|
+
logger (logging.Logger): Logger for logging information and errors.
|
162
|
+
|
163
|
+
Returns:
|
164
|
+
dict: A dictionary containing the following:
|
165
|
+
- "file_url" (str): The absolute path of the downloaded file.
|
166
|
+
- "bytes" (int): The file size in bytes.
|
167
|
+
- "file_relative_path" (str): The relative path of the file for local storage. Is relative to local_directory as
|
168
|
+
this a mounted volume in the pod container.
|
169
|
+
|
170
|
+
"""
|
171
|
+
...
|
172
|
+
|
173
|
+
def _get_file_transfer_paths(self, file: RemoteFile, local_directory: str) -> List[str]:
|
174
|
+
preserve_directory_structure = self.preserve_directory_structure()
|
175
|
+
if preserve_directory_structure:
|
176
|
+
# Remove left slashes from source path format to make relative path for writing locally
|
177
|
+
file_relative_path = file.uri.lstrip("/")
|
178
|
+
else:
|
179
|
+
file_relative_path = path.basename(file.uri)
|
180
|
+
local_file_path = path.join(local_directory, file_relative_path)
|
181
|
+
|
182
|
+
# Ensure the local directory exists
|
183
|
+
makedirs(path.dirname(local_file_path), exist_ok=True)
|
184
|
+
absolute_file_path = path.abspath(local_file_path)
|
185
|
+
return [file_relative_path, local_file_path, absolute_file_path]
|
@@ -2,12 +2,15 @@ from typing import Any, Mapping, Type
|
|
2
2
|
|
3
3
|
from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
|
4
4
|
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
|
5
|
+
from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
|
5
6
|
from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
|
6
7
|
from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
|
7
8
|
from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
|
8
9
|
|
9
10
|
from .avro_parser import AvroParser
|
10
11
|
from .csv_parser import CsvParser
|
12
|
+
from .excel_parser import ExcelParser
|
13
|
+
from .file_transfer import FileTransfer
|
11
14
|
from .file_type_parser import FileTypeParser
|
12
15
|
from .jsonl_parser import JsonlParser
|
13
16
|
from .parquet_parser import ParquetParser
|
@@ -16,9 +19,19 @@ from .unstructured_parser import UnstructuredParser
|
|
16
19
|
default_parsers: Mapping[Type[Any], FileTypeParser] = {
|
17
20
|
AvroFormat: AvroParser(),
|
18
21
|
CsvFormat: CsvParser(),
|
22
|
+
ExcelFormat: ExcelParser(),
|
19
23
|
JsonlFormat: JsonlParser(),
|
20
24
|
ParquetFormat: ParquetParser(),
|
21
25
|
UnstructuredFormat: UnstructuredParser(),
|
22
26
|
}
|
23
27
|
|
24
|
-
__all__ = [
|
28
|
+
__all__ = [
|
29
|
+
"AvroParser",
|
30
|
+
"CsvParser",
|
31
|
+
"ExcelParser",
|
32
|
+
"JsonlParser",
|
33
|
+
"ParquetParser",
|
34
|
+
"UnstructuredParser",
|
35
|
+
"FileTransfer",
|
36
|
+
"default_parsers",
|
37
|
+
]
|