airbyte-cdk 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +358 -0
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +104 -0
- airbyte_cdk/connector.py +123 -0
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/__init__.py +3 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
- airbyte_cdk/connector_builder/main.py +107 -0
- airbyte_cdk/connector_builder/models.py +73 -0
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +83 -0
- airbyte_cdk/destinations/__init__.py +8 -0
- airbyte_cdk/destinations/destination.py +154 -0
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
- airbyte_cdk/destinations/vector_db_based/config.py +298 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
- airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
- airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
- airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
- airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
- airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
- airbyte_cdk/entrypoint.py +414 -0
- airbyte_cdk/exception_handler.py +56 -0
- airbyte_cdk/logger.py +109 -0
- airbyte_cdk/models/__init__.py +72 -0
- airbyte_cdk/models/airbyte_protocol.py +88 -0
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/well_known_types.py +5 -0
- airbyte_cdk/py.typed +0 -0
- airbyte_cdk/sources/__init__.py +26 -0
- airbyte_cdk/sources/abstract_source.py +326 -0
- airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
- airbyte_cdk/sources/config.py +27 -0
- airbyte_cdk/sources/connector_state_manager.py +161 -0
- airbyte_cdk/sources/declarative/__init__.py +3 -0
- airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
- airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
- airbyte_cdk/sources/declarative/auth/token.py +267 -0
- airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
- airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
- airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
- airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
- airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
- airbyte_cdk/sources/declarative/declarative_source.py +36 -0
- airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
- airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
- airbyte_cdk/sources/declarative/exceptions.py +9 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
- airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
- airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
- airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
- airbyte_cdk/sources/declarative/requesters/README.md +56 -0
- airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
- airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
- airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
- airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
- airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
- airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
- airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
- airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
- airbyte_cdk/sources/declarative/spec/spec.py +48 -0
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
- airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
- airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
- airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
- airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
- airbyte_cdk/sources/declarative/types.py +25 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
- airbyte_cdk/sources/file_based/config/__init__.py +0 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
- airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
- airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
- airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
- airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
- airbyte_cdk/sources/file_based/exceptions.py +159 -0
- airbyte_cdk/sources/file_based/file_based_source.py +466 -0
- airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
- airbyte_cdk/sources/file_based/file_record_data.py +22 -0
- airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
- airbyte_cdk/sources/file_based/remote_file.py +18 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
- airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
- airbyte_cdk/sources/file_based/types.py +10 -0
- airbyte_cdk/sources/http_config.py +10 -0
- airbyte_cdk/sources/http_logger.py +55 -0
- airbyte_cdk/sources/message/__init__.py +19 -0
- airbyte_cdk/sources/message/repository.py +137 -0
- airbyte_cdk/sources/source.py +95 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/__init__.py +8 -0
- airbyte_cdk/sources/streams/availability_strategy.py +84 -0
- airbyte_cdk/sources/streams/call_rate.py +704 -0
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
- airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
- airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
- airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/core.py +703 -0
- airbyte_cdk/sources/streams/http/__init__.py +10 -0
- airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +61 -0
- airbyte_cdk/sources/streams/http/http.py +673 -0
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/sources/streams/utils/__init__.py +3 -0
- airbyte_cdk/sources/types.py +169 -0
- airbyte_cdk/sources/utils/__init__.py +7 -0
- airbyte_cdk/sources/utils/casing.py +12 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +53 -0
- airbyte_cdk/sources/utils/schema_helpers.py +230 -0
- airbyte_cdk/sources/utils/slice_logger.py +57 -0
- airbyte_cdk/sources/utils/transform.py +277 -0
- airbyte_cdk/sources/utils/types.py +7 -0
- airbyte_cdk/sql/__init__.py +0 -0
- airbyte_cdk/sql/_util/__init__.py +0 -0
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/__init__.py +7 -0
- airbyte_cdk/test/catalog_builder.py +81 -0
- airbyte_cdk/test/entrypoint_wrapper.py +250 -0
- airbyte_cdk/test/mock_http/__init__.py +6 -0
- airbyte_cdk/test/mock_http/matcher.py +41 -0
- airbyte_cdk/test/mock_http/mocker.py +185 -0
- airbyte_cdk/test/mock_http/request.py +103 -0
- airbyte_cdk/test/mock_http/response.py +28 -0
- airbyte_cdk/test/mock_http/response_builder.py +237 -0
- airbyte_cdk/test/state_builder.py +33 -0
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +10 -0
- airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
- airbyte_cdk/utils/analytics_message.py +25 -0
- airbyte_cdk/utils/constants.py +5 -0
- airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/event_timing.py +85 -0
- airbyte_cdk/utils/is_cloud_environment.py +18 -0
- airbyte_cdk/utils/mapping_helpers.py +162 -0
- airbyte_cdk/utils/message_utils.py +26 -0
- airbyte_cdk/utils/oneof_option_config.py +33 -0
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +270 -0
- airbyte_cdk/utils/slice_hasher.py +37 -0
- airbyte_cdk/utils/spec_schema_transformations.py +26 -0
- airbyte_cdk/utils/stream_status_utils.py +43 -0
- airbyte_cdk/utils/traced_exception.py +145 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
- airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
- airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
- airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
- airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from typing import Any, Dict, Iterable, Optional
|
|
8
|
+
|
|
9
|
+
from airbyte_cdk.sources.file_based import AbstractFileBasedSpec
|
|
10
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class AbstractFileBasedStreamPermissionsReader(ABC):
    """
    Interface for reading file permissions (ACLs) and identities from a source.

    Subclasses must provide the `config` setter, the two data-loading methods
    (`get_file_acl_permissions`, `load_identity_groups`) and the two schema
    properties (`file_permissions_schema`, `identities_schema`).
    """

    def __init__(self) -> None:
        # No config is known at construction time; the abstract `config`
        # setter is the place where a concrete reader is expected to store one.
        self._config: Optional[AbstractFileBasedSpec] = None

    @property
    def config(self) -> Optional[AbstractFileBasedSpec]:
        """Return the value held in `self._config` (`None` right after `__init__`)."""
        return self._config

    @config.setter
    @abstractmethod
    def config(self, value: AbstractFileBasedSpec) -> None:
        """
        FileBasedSource reads the config from disk and parses it, and once parsed, the source sets the config on its StreamReader.

        Note: FileBasedSource only requires the keys defined in the abstract config, whereas concrete implementations of StreamReader
        will require keys that (for example) allow it to authenticate with the 3rd party.

        Therefore, the config setter of a concrete AbstractFileBasedStreamPermissionsReader implementation should assert that
        `value` is of the correct config type for that type of StreamReader.
        """
        ...

    @abstractmethod
    def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger) -> Dict[str, Any]:
        """
        Return the allow list for a given file, i.e. all identities and their permission levels associated with it.

        :param file: the remote file whose permissions are requested
        :param logger: logger available to the implementation
        :return: a dictionary describing the file's permissions

        e.g.
        def get_file_acl_permissions(self, file: RemoteFile, logger: logging.Logger):
            api_conn = some_api.conn(credentials=SOME_CREDENTIALS)
            result = api_conn.get_file_permissions_info(file.id)
            return MyPermissionsModel(
                id=result["id"],
                access_control_list=result["access_control_list"],
                is_public=result["is_public"],
            ).dict()
        """
        ...

    @abstractmethod
    def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]:
        """
        Return the identities that exist in the "space" or "domain" from which the file metadata (ACLs) is fetched,
        i.e. the identities that the ACL entries refer to.

        :param logger: logger available to the implementation
        :return: an iterable of dictionaries, one per identity

        e.g.
        def load_identity_groups(self, logger: logging.Logger) -> Iterable[Dict[str, Any]]:
            api_conn = some_api.conn(credentials=SOME_CREDENTIALS)
            users_api = api_conn.users()
            groups_api = api_conn.groups()
            members_api = api_conn.members()
            for user in users_api.list():
                yield my_identity_model(id=user.id, name=user.name, email_address=user.email, type="user").dict()
            for group in groups_api.list():
                group_obj = my_identity_model(id=group.id, name=group.name, email_address=group.email, type="group")
                for member in members_api.list(group=group):
                    group_obj.member_email_addresses = group_obj.member_email_addresses or []
                    group_obj.member_email_addresses.append(member.email)
                yield group_obj.dict()
        """
        ...

    @property
    @abstractmethod
    def file_permissions_schema(self) -> Dict[str, Any]:
        """
        Return the schema for the file permissions stream.

        e.g.
        def file_permissions_schema(self) -> Dict[str, Any]:
            # you can also follow the pattern we have for python connectors and have a json file and read from there e.g. schemas/file_permissions.json
            return {
                "type": "object",
                "properties": {
                    "id": { "type": "string" },
                    "file_path": { "type": "string" },
                    "access_control_list": {
                        "type": "array",
                        "items": { "type": "string" }
                    },
                    "publicly_accessible": { "type": "boolean" }
                }
            }
        """
        ...

    @property
    @abstractmethod
    def identities_schema(self) -> Dict[str, Any]:
        """
        Return the schema for the file identities stream.

        e.g.
        def identities_schema(self) -> Dict[str, Any]:
            # you can also follow the pattern we have for python connectors and have a json file and read from there e.g. schemas/identities.json
            return {
                "type": "object",
                "properties": {
                    "id": { "type": "string" },
                    "remote_id": { "type": "string" },
                    "name": { "type": ["null", "string"] },
                    "email_address": { "type": ["null", "string"] },
                    "member_email_addresses": { "type": ["null", "array"] },
                    "type": { "type": "string" },
                }
            }
        """
        ...
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from io import IOBase
|
|
10
|
+
from os import makedirs, path
|
|
11
|
+
from typing import Any, Callable, Iterable, List, MutableMapping, Optional, Set, Tuple
|
|
12
|
+
|
|
13
|
+
from wcmatch.glob import GLOBSTAR, globmatch
|
|
14
|
+
|
|
15
|
+
from airbyte_cdk.models import AirbyteRecordMessageFileReference
|
|
16
|
+
from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
|
|
17
|
+
from airbyte_cdk.sources.file_based.config.validate_config_transfer_modes import (
|
|
18
|
+
include_identities_stream,
|
|
19
|
+
preserve_directory_structure,
|
|
20
|
+
use_file_transfer,
|
|
21
|
+
)
|
|
22
|
+
from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
|
|
23
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class FileReadMode(Enum):
    """
    Mode in which a stream reader is asked to open a file.

    The member values are spelled like the built-in `open()` mode strings;
    how a concrete reader interprets them is up to its `open_file` implementation.
    """

    READ = "r"
    READ_BINARY = "rb"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class AbstractFileBasedStreamReader(ABC):
    """
    Base class for the stream readers of file-based sources.

    Concrete readers supply the storage-specific pieces (the ``config`` setter, ``open_file``,
    ``get_matching_files``, ``file_size`` and ``upload``). The glob / start-date filtering and
    the local-path helpers defined here are shared by all of them.
    """

    # Format in which the config's ``start_date`` is parsed.
    DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    # Keys of the mapping built by ``_get_file_transfer_paths``.
    FILE_RELATIVE_PATH = "file_relative_path"
    FILE_NAME = "file_name"
    LOCAL_FILE_PATH = "local_file_path"
    SOURCE_FILE_URI = "source_file_relative_path"
    FILE_FOLDER = "file_folder"

    def __init__(self) -> None:
        # No config is known until one is assigned through the ``config`` setter.
        self._config = None

    @property
    def config(self) -> Optional[AbstractFileBasedSpec]:
        """The config currently held by this reader, or None if none has been set."""
        return self._config

    @config.setter
    @abstractmethod
    def config(self, value: AbstractFileBasedSpec) -> None:
        """
        Store the parsed config on the reader.

        FileBasedSource reads the config from disk, parses it, and then sets it on its
        StreamReader through this setter.

        Note: FileBasedSource itself only needs the keys declared by the abstract config, while
        a concrete StreamReader needs additional keys (for example, the ones that let it
        authenticate with the 3rd party).

        Concrete implementations of this setter should therefore assert that `value` is of the
        config type expected by that particular StreamReader.
        """
        ...

    @abstractmethod
    def open_file(
        self, file: RemoteFile, mode: FileReadMode, encoding: Optional[str], logger: logging.Logger
    ) -> IOBase:
        """
        Open `file` and return a handle to read it from.

        smart_open is enough to implement this method for many sources, e.g.:

        client = boto3.Session(...)
        return smart_open.open(remote_file.uri, transport_params={"client": client})
        """
        ...

    @abstractmethod
    def get_matching_files(
        self,
        globs: List[str],
        prefix: Optional[str],
        logger: logging.Logger,
    ) -> Iterable[RemoteFile]:
        """
        Return every file matched by at least one of the globs.

        Example, for a source holding "a.json", "foo/a.json" and "foo/bar/a.json":

        - globs = ["*.json"] returns ["a.json"]
        - globs = ["foo/*.json"] returns ["foo/a.json"]

        The `self.filter_files_by_globs` and `self.get_prefixes_from_globs` utility methods
        may be helpful when implementing this method.
        """
        ...

    def filter_files_by_globs_and_start_date(
        self, files: List[RemoteFile], globs: List[str]
    ) -> Iterable[RemoteFile]:
        """
        Yield the files that match at least one glob and, when the config has a start date,
        were last modified on or after it. A given URI is yielded at most once.
        """
        cutoff = None
        if self.config and self.config.start_date:
            cutoff = datetime.strptime(self.config.start_date, self.DATE_TIME_FORMAT)

        emitted_uris = set()
        for candidate in files:
            if not self.file_matches_globs(candidate, globs):
                continue
            if candidate.uri in emitted_uris:
                continue
            recent_enough = not cutoff or candidate.last_modified >= cutoff
            if recent_enough:
                emitted_uris.add(candidate.uri)
                yield candidate

    @abstractmethod
    def file_size(self, file: RemoteFile) -> int:
        """Return the size of the remote file.

        Needed by connectors that support writing to files; a subclass whose connector does
        not support writing files can simply `return 0`.
        """
        ...

    @staticmethod
    def file_matches_globs(file: RemoteFile, globs: List[str]) -> bool:
        """Tell whether the file's URI matches at least one of the globs."""
        for pattern in globs:
            # GLOBSTAR turns on recursive ** matching
            # (https://facelessuser.github.io/wcmatch/wcmatch/#globstar)
            if globmatch(file.uri, pattern, flags=GLOBSTAR):
                return True
        return False

    @staticmethod
    def get_prefixes_from_globs(globs: List[str]) -> Set[str]:
        """
        Return the non-empty prefixes of the globs, i.e. whatever precedes the first "*".
        """
        leading_parts = (pattern.split("*")[0] for pattern in globs)
        return {prefix for prefix in leading_parts if prefix}

    def use_file_transfer(self) -> bool:
        """Delegate to the `use_file_transfer` config helper; False when no config is set."""
        return use_file_transfer(self.config) if self.config else False

    def preserve_directory_structure(self) -> bool:
        """Delegate to the `preserve_directory_structure` config helper; True when no config is set."""
        # Without a (complete) config, default to keeping subdirectories.
        return preserve_directory_structure(self.config) if self.config else True

    def include_identities_stream(self) -> bool:
        """Delegate to the `include_identities_stream` config helper; False when no config is set."""
        return include_identities_stream(self.config) if self.config else False

    @abstractmethod
    def upload(
        self, file: RemoteFile, local_directory: str, logger: logging.Logger
    ) -> Tuple[FileRecordData, AirbyteRecordMessageFileReference]:
        """
        Fetch a file from the source, by whatever means (download, get, read, acquire, ...) is
        most efficient. Needed by connectors that support writing to files.

        Args:
            file (RemoteFile): The remote file object containing URI and metadata.
            local_directory (str): The local directory path where the file will be downloaded.
            logger (logging.Logger): Logger for logging information and errors.

        Returns:
            A (FileRecordData, AirbyteRecordMessageFileReference) tuple. The file reference
            contains:
                - staging_file_url (str): The absolute path to the referenced file in the staging area.
                - file_size_bytes (int): The size of the referenced file in bytes.
                - source_file_relative_path (str): The relative path to the referenced file in source.
        """
        ...

    def _get_file_transfer_paths(
        self,
        file: RemoteFile,
        local_directory: str,
        parse_file_path_from_uri: Optional[Callable[[str], str]] = None,
    ) -> MutableMapping[str, Any]:
        """
        Work out where `file` is to be written under `local_directory`, create the parent
        directory of that location, and return the related paths keyed by the class-level
        ``FILE_*`` / ``LOCAL_FILE_PATH`` / ``SOURCE_FILE_URI`` constants.

        When `parse_file_path_from_uri` is given, it is applied to the file's URI to obtain the
        file path; otherwise the URI itself is used as the path.
        """
        keep_directories = self.preserve_directory_structure()
        source_path = parse_file_path_from_uri(file.uri) if parse_file_path_from_uri else file.uri

        base_name = path.basename(source_path)
        parent_folder = path.dirname(source_path)

        # With the directory structure preserved, drop the leading slashes of the source path
        # so that it becomes a relative path for writing locally; otherwise keep the name only.
        relative_path = source_path.lstrip("/") if keep_directories else base_name
        destination = path.join(local_directory, relative_path)

        # Make sure the destination's directory exists.
        makedirs(path.dirname(destination), exist_ok=True)

        return {
            self.FILE_RELATIVE_PATH: relative_path,
            self.LOCAL_FILE_PATH: destination,
            self.FILE_NAME: base_name,
            self.FILE_FOLDER: parent_folder,
            self.SOURCE_FILE_URI: file.uri,
        }
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from pydantic.v1 import BaseModel
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class FileRecordData(BaseModel):
    """
    A record in a file-based stream.
    """

    # Required fields: a value must be supplied for each of them.
    folder: str
    filename: str
    bytes: int

    # Optional fields: each one defaults to None when omitted.
    id: Optional[str] = None
    updated_at: Optional[str] = None
    mime_type: Optional[str] = None
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from typing import Any, Mapping, Type
|
|
2
|
+
|
|
3
|
+
from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
|
|
4
|
+
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat
|
|
5
|
+
from airbyte_cdk.sources.file_based.config.excel_format import ExcelFormat
|
|
6
|
+
from airbyte_cdk.sources.file_based.config.jsonl_format import JsonlFormat
|
|
7
|
+
from airbyte_cdk.sources.file_based.config.parquet_format import ParquetFormat
|
|
8
|
+
from airbyte_cdk.sources.file_based.config.unstructured_format import UnstructuredFormat
|
|
9
|
+
|
|
10
|
+
from .avro_parser import AvroParser
|
|
11
|
+
from .csv_parser import CsvParser
|
|
12
|
+
from .excel_parser import ExcelParser
|
|
13
|
+
from .file_transfer import FileTransfer
|
|
14
|
+
from .file_type_parser import FileTypeParser
|
|
15
|
+
from .jsonl_parser import JsonlParser
|
|
16
|
+
from .parquet_parser import ParquetParser
|
|
17
|
+
from .unstructured_parser import UnstructuredParser
|
|
18
|
+
|
|
19
|
+
# One ready-made parser instance per supported format config class.
default_parsers: Mapping[Type[Any], FileTypeParser] = {
    AvroFormat: AvroParser(),
    CsvFormat: CsvParser(),
    ExcelFormat: ExcelParser(),
    JsonlFormat: JsonlParser(),
    ParquetFormat: ParquetParser(),
    UnstructuredFormat: UnstructuredParser(),
}

# Names exported by `from <this package> import *`.
__all__ = [
    "AvroParser",
    "CsvParser",
    "ExcelParser",
    "JsonlParser",
    "ParquetParser",
    "UnstructuredParser",
    "FileTransfer",
    "default_parsers",
]
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, cast
|
|
7
|
+
|
|
8
|
+
import fastavro
|
|
9
|
+
|
|
10
|
+
from airbyte_cdk.sources.file_based.config.avro_format import AvroFormat
|
|
11
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
|
12
|
+
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
|
|
13
|
+
from airbyte_cdk.sources.file_based.file_based_stream_reader import (
|
|
14
|
+
AbstractFileBasedStreamReader,
|
|
15
|
+
FileReadMode,
|
|
16
|
+
)
|
|
17
|
+
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
|
18
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
19
|
+
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
|
|
20
|
+
|
|
21
|
+
# Avro primitive type name -> JSON schema type name.
AVRO_TYPE_TO_JSON_TYPE = {
    "null": "null",
    "boolean": "boolean",
    # Both Avro integer widths are typed as JSON "integer".
    "int": "integer",
    "long": "integer",
    "float": "number",
    # double -> number conversions can lose precision
    "double": "string",
    "bytes": "string",
    "string": "string",
}

# Avro logical type name -> JSON schema fragment describing the emitted value.
# fastavro does not support duration https://fastavro.readthedocs.io/en/latest/logical_types.html
AVRO_LOGICAL_TYPE_TO_JSON = {
    "decimal": {"type": "string"},
    "uuid": {"type": "string"},
    "date": {"type": "string", "format": "date"},
    "time-millis": {"type": "integer"},
    "time-micros": {"type": "integer"},
    "timestamp-millis": {"type": "string", "format": "date-time"},
    "timestamp-micros": {"type": "string"},
    "local-timestamp-millis": {"type": "string", "format": "date-time"},
    "local-timestamp-micros": {"type": "string"},
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class AvroParser(FileTypeParser):
    """
    Parser for Avro files: derives a JSON schema from the Avro writer schema and reads the
    records through ``fastavro``.
    """

    # Passed as the ``encoding`` argument of ``open_file``; the files are opened in binary
    # mode (see ``file_read_mode``).
    ENCODING = None

    def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
        """
        AvroParser does not require config checks, implicit pydantic validation is enough.
        """
        return True, None

    async def infer_schema(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> SchemaType:
        """
        Build a JSON schema for `file` from its Avro writer schema.

        Returns:
            A mapping from each top-level Avro field name to its JSON schema fragment.

        Raises:
            ValueError: if `config.format` is not an AvroFormat, if the top-level Avro type is
                not "record", or if a field uses an Avro type that cannot be converted.
        """
        avro_format = config.format
        if not isinstance(avro_format, AvroFormat):
            raise ValueError(f"Expected AvroFormat, got {avro_format}")

        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
            avro_reader = fastavro.reader(fp)  # type: ignore [arg-type]
            avro_schema = avro_reader.writer_schema
        if not avro_schema["type"] == "record":  # type: ignore [index, call-overload]
            unsupported_type = avro_schema["type"]  # type: ignore [index, call-overload]
            raise ValueError(
                f"Only record based avro files are supported. Found {unsupported_type}"
            )
        json_schema = {
            field["name"]: AvroParser._convert_avro_type_to_json(  # type: ignore [index]
                avro_format,
                field["name"],  # type: ignore [index]
                field["type"],  # type: ignore [index]
            )
            for field in avro_schema["fields"]  # type: ignore [index, call-overload]
        }
        return json_schema

    @staticmethod
    def _decimal_pattern(precision: int, scale: int) -> str:
        """Return the regex that a stringified decimal of this precision and scale must match."""
        # At least one whole digit is always expected, even when scale == precision; this also
        # avoids producing an invalid `{1,0}` quantifier.
        whole_digits = max(1, precision - scale)
        # Literal regex braces are doubled inside the f-string; only the digit counts are
        # interpolated. For precision=8, scale=3 this yields ^-?\d{1,5}(?:\.\d{1,3})?$ which
        # accepts 12345.123 and rejects 123456.12345.
        pattern = f"^-?\\d{{1,{whole_digits}}}"
        if scale > 0:
            # With a scale of 0 there are no fractional digits, so the group is left out.
            pattern += f"(?:\\.\\d{{1,{scale}}})?"
        return pattern + "$"

    @classmethod
    def _convert_avro_type_to_json(
        cls, avro_format: AvroFormat, field_name: str, avro_field: str
    ) -> Mapping[str, Any]:
        """
        Convert one Avro type declaration into a JSON schema fragment.

        Although annotated as `str`, `avro_field` is handled both as a primitive type name and
        as a mapping (record, array, enum, map, fixed or logical type). Records, array items
        and map values are converted recursively. `field_name` is only used in error messages.

        Raises:
            ValueError: if a required attribute of a complex type is missing, if the logical
                type is unknown, or if the declaration is of any other unsupported form.
        """
        if isinstance(avro_field, str) and avro_field in AVRO_TYPE_TO_JSON_TYPE:
            # Legacy behavior to retain backwards compatibility. Long term we should always represent doubles as strings
            if avro_field == "double" and not avro_format.double_as_string:
                return {"type": "number"}
            return {"type": AVRO_TYPE_TO_JSON_TYPE[avro_field]}
        if not isinstance(avro_field, Mapping):
            raise ValueError(f"Unsupported avro type: {avro_field}")

        avro_type = avro_field["type"]
        logical_type = avro_field.get("logicalType")

        if avro_type == "record":
            return {
                "type": "object",
                "properties": {
                    object_field["name"]: AvroParser._convert_avro_type_to_json(
                        avro_format, object_field["name"], object_field["type"]
                    )
                    for object_field in avro_field["fields"]
                },
            }
        if avro_type == "array":
            if "items" not in avro_field:
                raise ValueError(f"{field_name} array type does not have a required field items")
            return {
                "type": "array",
                "items": AvroParser._convert_avro_type_to_json(
                    avro_format, "", avro_field["items"]
                ),
            }
        if avro_type == "enum":
            if "symbols" not in avro_field:
                raise ValueError(f"{field_name} enum type does not have a required field symbols")
            if "name" not in avro_field:
                raise ValueError(f"{field_name} enum type does not have a required field name")
            return {"type": "string", "enum": avro_field["symbols"]}
        if avro_type == "map":
            if "values" not in avro_field:
                raise ValueError(f"{field_name} map type does not have a required field values")
            return {
                "type": "object",
                "additionalProperties": AvroParser._convert_avro_type_to_json(
                    avro_format, "", avro_field["values"]
                ),
            }
        if avro_type == "fixed" and logical_type != "duration":
            if "size" not in avro_field:
                raise ValueError(f"{field_name} fixed type does not have a required field size")
            if not isinstance(avro_field["size"], int):
                raise ValueError(f"{field_name} fixed type size value is not an integer")
            return {
                "type": "string",
                # Two hexadecimal characters per byte of the fixed size.
                "pattern": f"^[0-9A-Fa-f]{{{avro_field['size'] * 2}}}$",
            }
        if logical_type == "decimal":
            if "precision" not in avro_field:
                raise ValueError(
                    f"{field_name} decimal type does not have a required field precision"
                )
            if "scale" not in avro_field:
                raise ValueError(f"{field_name} decimal type does not have a required field scale")
            return {
                "type": "string",
                "pattern": cls._decimal_pattern(avro_field["precision"], avro_field["scale"]),
            }
        if "logicalType" in avro_field:
            if logical_type not in AVRO_LOGICAL_TYPE_TO_JSON:
                raise ValueError(f"{logical_type} is not a valid Avro logical type")
            return AVRO_LOGICAL_TYPE_TO_JSON[logical_type]
        raise ValueError(f"Unsupported avro type: {avro_field}")

    def parse_records(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        discovered_schema: Optional[Mapping[str, SchemaType]],
    ) -> Iterable[Dict[str, Any]]:
        """
        Yield one dict per Avro record of `file`, keyed by the field names of the writer schema,
        with each value passed through `_to_output_value`.

        Falls back to a default AvroFormat when `config.format` is not set. `discovered_schema`
        is not used by this parser.

        Raises:
            ValueError: if `config.format` is set but is not an AvroFormat.
            RecordParseError: if opening or reading the file, or converting a record, fails;
                it carries the file URI and the number of the record being read.
        """
        avro_format = config.format or AvroFormat(filetype="avro")
        if not isinstance(avro_format, AvroFormat):
            raise ValueError(f"Expected AvroFormat, got {avro_format}")

        line_no = 0
        try:
            with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
                avro_reader = fastavro.reader(fp)  # type: ignore [arg-type]
                schema = avro_reader.writer_schema
                schema_field_name_to_type = {
                    field["name"]: cast(dict[str, Any], field["type"])  # type: ignore [index]
                    for field in schema["fields"]  # type: ignore [index, call-overload]  # If schema is not dict, it is not subscriptable by strings
                }
                for record in avro_reader:
                    line_no += 1
                    yield {
                        field_name: self._to_output_value(
                            avro_format,
                            field_type,
                            record[field_name],  # type: ignore [index]  # Any not subscriptable
                        )
                        for field_name, field_type in schema_field_name_to_type.items()
                    }
        except Exception as exc:
            raise RecordParseError(
                FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no
            ) from exc

    @property
    def file_read_mode(self) -> FileReadMode:
        """Mode in which this parser asks the stream reader to open files: binary."""
        return FileReadMode.READ_BINARY

    @staticmethod
    def _to_output_value(
        avro_format: AvroFormat, record_type: Mapping[str, Any], record_value: Any
    ) -> Any:
        """
        Convert a value read from an Avro record into the value that is emitted.

        - bytes are decoded to str;
        - for a non-mapping type, a "double" is stringified when `avro_format.double_as_string`
          is set and any other value is returned unchanged;
        - "decimal" and "uuid" logical types are stringified;
        - date and timestamp logical types are rendered with `isoformat`, using millisecond or
          microsecond precision for the timestamps according to the logical type;
        - everything else is returned unchanged.
        """
        if isinstance(record_value, bytes):
            return record_value.decode()
        if not isinstance(record_type, Mapping):
            if record_type == "double" and avro_format.double_as_string:
                return str(record_value)
            return record_value

        logical_type = record_type.get("logicalType")
        if logical_type in ("decimal", "uuid"):
            return str(record_value)
        if logical_type == "date":
            return record_value.isoformat()
        if logical_type in ("timestamp-millis", "local-timestamp-millis"):
            return record_value.isoformat(sep="T", timespec="milliseconds")
        if logical_type in ("timestamp-micros", "local-timestamp-micros"):
            return record_value.isoformat(sep="T", timespec="microseconds")
        return record_value
|