airbyte-cdk 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +358 -0
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +104 -0
- airbyte_cdk/connector.py +123 -0
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/__init__.py +3 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
- airbyte_cdk/connector_builder/main.py +107 -0
- airbyte_cdk/connector_builder/models.py +73 -0
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +83 -0
- airbyte_cdk/destinations/__init__.py +8 -0
- airbyte_cdk/destinations/destination.py +154 -0
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
- airbyte_cdk/destinations/vector_db_based/config.py +298 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
- airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
- airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
- airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
- airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
- airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
- airbyte_cdk/entrypoint.py +414 -0
- airbyte_cdk/exception_handler.py +56 -0
- airbyte_cdk/logger.py +109 -0
- airbyte_cdk/models/__init__.py +72 -0
- airbyte_cdk/models/airbyte_protocol.py +88 -0
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/well_known_types.py +5 -0
- airbyte_cdk/py.typed +0 -0
- airbyte_cdk/sources/__init__.py +26 -0
- airbyte_cdk/sources/abstract_source.py +326 -0
- airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
- airbyte_cdk/sources/config.py +27 -0
- airbyte_cdk/sources/connector_state_manager.py +161 -0
- airbyte_cdk/sources/declarative/__init__.py +3 -0
- airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
- airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
- airbyte_cdk/sources/declarative/auth/token.py +267 -0
- airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
- airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
- airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
- airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
- airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
- airbyte_cdk/sources/declarative/declarative_source.py +36 -0
- airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
- airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
- airbyte_cdk/sources/declarative/exceptions.py +9 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
- airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
- airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
- airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
- airbyte_cdk/sources/declarative/requesters/README.md +56 -0
- airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
- airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
- airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
- airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
- airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
- airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
- airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
- airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
- airbyte_cdk/sources/declarative/spec/spec.py +48 -0
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
- airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
- airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
- airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
- airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
- airbyte_cdk/sources/declarative/types.py +25 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
- airbyte_cdk/sources/file_based/config/__init__.py +0 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
- airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
- airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
- airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
- airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
- airbyte_cdk/sources/file_based/exceptions.py +159 -0
- airbyte_cdk/sources/file_based/file_based_source.py +466 -0
- airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
- airbyte_cdk/sources/file_based/file_record_data.py +22 -0
- airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
- airbyte_cdk/sources/file_based/remote_file.py +18 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
- airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
- airbyte_cdk/sources/file_based/types.py +10 -0
- airbyte_cdk/sources/http_config.py +10 -0
- airbyte_cdk/sources/http_logger.py +55 -0
- airbyte_cdk/sources/message/__init__.py +19 -0
- airbyte_cdk/sources/message/repository.py +137 -0
- airbyte_cdk/sources/source.py +95 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/__init__.py +8 -0
- airbyte_cdk/sources/streams/availability_strategy.py +84 -0
- airbyte_cdk/sources/streams/call_rate.py +704 -0
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
- airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
- airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
- airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/core.py +703 -0
- airbyte_cdk/sources/streams/http/__init__.py +10 -0
- airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +61 -0
- airbyte_cdk/sources/streams/http/http.py +673 -0
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/sources/streams/utils/__init__.py +3 -0
- airbyte_cdk/sources/types.py +169 -0
- airbyte_cdk/sources/utils/__init__.py +7 -0
- airbyte_cdk/sources/utils/casing.py +12 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +53 -0
- airbyte_cdk/sources/utils/schema_helpers.py +230 -0
- airbyte_cdk/sources/utils/slice_logger.py +57 -0
- airbyte_cdk/sources/utils/transform.py +277 -0
- airbyte_cdk/sources/utils/types.py +7 -0
- airbyte_cdk/sql/__init__.py +0 -0
- airbyte_cdk/sql/_util/__init__.py +0 -0
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/__init__.py +7 -0
- airbyte_cdk/test/catalog_builder.py +81 -0
- airbyte_cdk/test/entrypoint_wrapper.py +250 -0
- airbyte_cdk/test/mock_http/__init__.py +6 -0
- airbyte_cdk/test/mock_http/matcher.py +41 -0
- airbyte_cdk/test/mock_http/mocker.py +185 -0
- airbyte_cdk/test/mock_http/request.py +103 -0
- airbyte_cdk/test/mock_http/response.py +28 -0
- airbyte_cdk/test/mock_http/response_builder.py +237 -0
- airbyte_cdk/test/state_builder.py +33 -0
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +10 -0
- airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
- airbyte_cdk/utils/analytics_message.py +25 -0
- airbyte_cdk/utils/constants.py +5 -0
- airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/event_timing.py +85 -0
- airbyte_cdk/utils/is_cloud_environment.py +18 -0
- airbyte_cdk/utils/mapping_helpers.py +162 -0
- airbyte_cdk/utils/message_utils.py +26 -0
- airbyte_cdk/utils/oneof_option_config.py +33 -0
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +270 -0
- airbyte_cdk/utils/slice_hasher.py +37 -0
- airbyte_cdk/utils/spec_schema_transformations.py +26 -0
- airbyte_cdk/utils/stream_status_utils.py +43 -0
- airbyte_cdk/utils/traced_exception.py +145 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
- airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
- airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
- airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
- airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import itertools
|
|
7
|
+
import traceback
|
|
8
|
+
from collections import defaultdict
|
|
9
|
+
from copy import deepcopy
|
|
10
|
+
from functools import cache
|
|
11
|
+
from os import path
|
|
12
|
+
from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Set, Tuple, Union
|
|
13
|
+
|
|
14
|
+
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteStream, FailureType, Level
|
|
15
|
+
from airbyte_cdk.models import Type as MessageType
|
|
16
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
|
|
17
|
+
from airbyte_cdk.sources.file_based.exceptions import (
|
|
18
|
+
DuplicatedFilesError,
|
|
19
|
+
FileBasedSourceError,
|
|
20
|
+
InvalidSchemaError,
|
|
21
|
+
MissingSchemaError,
|
|
22
|
+
RecordParseError,
|
|
23
|
+
SchemaInferenceError,
|
|
24
|
+
StopSyncPerValidationPolicy,
|
|
25
|
+
)
|
|
26
|
+
from airbyte_cdk.sources.file_based.file_types import FileTransfer
|
|
27
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
28
|
+
from airbyte_cdk.sources.file_based.schema_helpers import (
|
|
29
|
+
SchemaType,
|
|
30
|
+
file_transfer_schema,
|
|
31
|
+
merge_schemas,
|
|
32
|
+
schemaless_schema,
|
|
33
|
+
)
|
|
34
|
+
from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
|
|
35
|
+
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
|
36
|
+
from airbyte_cdk.sources.file_based.types import StreamSlice
|
|
37
|
+
from airbyte_cdk.sources.streams import IncrementalMixin
|
|
38
|
+
from airbyte_cdk.sources.streams.core import JsonSchema
|
|
39
|
+
from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
|
|
40
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
44
|
+
"""
|
|
45
|
+
The default file-based stream.
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
FILE_TRANSFER_KW = "use_file_transfer"
|
|
49
|
+
PRESERVE_DIRECTORY_STRUCTURE_KW = "preserve_directory_structure"
|
|
50
|
+
FILES_KEY = "files"
|
|
51
|
+
DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
|
|
52
|
+
ab_last_mod_col = "_ab_source_file_last_modified"
|
|
53
|
+
ab_file_name_col = "_ab_source_file_url"
|
|
54
|
+
modified = "modified"
|
|
55
|
+
source_file_url = "source_file_url"
|
|
56
|
+
airbyte_columns = [ab_last_mod_col, ab_file_name_col]
|
|
57
|
+
use_file_transfer = False
|
|
58
|
+
preserve_directory_structure = True
|
|
59
|
+
_file_transfer = FileTransfer()
|
|
60
|
+
|
|
61
|
+
def __init__(self, **kwargs: Any):
    """
    Consume the stream-specific keyword options, then delegate to the parent initializer.

    ``use_file_transfer`` and ``preserve_directory_structure`` are only set on the instance
    when the matching keyword is present in ``kwargs``; otherwise the class-level values
    remain in effect.
    """
    file_transfer_kw = self.FILE_TRANSFER_KW
    if file_transfer_kw in kwargs:
        self.use_file_transfer = kwargs.pop(file_transfer_kw, False)

    preserve_structure_kw = self.PRESERVE_DIRECTORY_STRUCTURE_KW
    if preserve_structure_kw in kwargs:
        self.preserve_directory_structure = kwargs.pop(preserve_structure_kw, True)

    # The keywords consumed above have been popped, so only the remaining ones are forwarded.
    super().__init__(**kwargs)
|
|
69
|
+
|
|
70
|
+
@property
def state(self) -> MutableMapping[str, Any]:
    """State getter, returns whatever the stream's cursor reports via ``get_state``."""
    current_state = self._cursor.get_state()
    return current_state

@state.setter
def state(self, value: MutableMapping[str, Any]) -> None:
    """State setter, accept state serialized by state getter."""
    # The mapping is passed to the cursor unchanged as its initial state.
    initial_state = value
    self._cursor.set_initial_state(initial_state)
|
|
78
|
+
|
|
79
|
+
@property  # type: ignore # mypy complains wrong type, but AbstractFileBasedCursor is parent of file-based cursors
def cursor(self) -> Optional[AbstractFileBasedCursor]:
    """Return the cursor stored on this stream (``None`` when no cursor has been set)."""
    return self._cursor

@cursor.setter
def cursor(self, value: AbstractFileBasedCursor) -> None:
    """
    Store ``value`` as the stream's cursor.

    Raises a RuntimeError when a cursor is already stored, so an existing cursor is never replaced.
    """
    cursor_is_unset = self._cursor is None
    if cursor_is_unset:
        self._cursor = value
        return
    raise RuntimeError(
        f"Cursor for stream {self.name} is already set. This is unexpected. Please contact Support."
    )
|
|
90
|
+
|
|
91
|
+
@property
def primary_key(self) -> PrimaryKeyType:
    """
    Primary key of the stream.

    A truthy ``primary_key`` on the stream config takes precedence; otherwise the value
    returned by the parser's ``get_parser_defined_primary_key`` is used.
    """
    configured_key = self.config.primary_key
    if configured_key:
        return configured_key
    parser = self.get_parser()
    return parser.get_parser_defined_primary_key(self.config)
|
|
96
|
+
|
|
97
|
+
def _duplicated_files_names(
    self, slices: List[dict[str, List[RemoteFile]]]
) -> List[dict[str, List[str]]]:
    """
    Report the base names that are shared by more than one file across ``slices``.

    Every returned entry is a single-key dict mapping a base name to all URIs, in the order
    they were encountered, whose ``path.basename`` equals that name. Names that occur only
    once are left out.
    """
    uris_by_name: Dict[str, List[str]] = defaultdict(list)
    for current_slice in slices:
        for remote_file in current_slice[self.FILES_KEY]:
            uri = remote_file.uri
            uris_by_name[path.basename(uri)].append(uri)

    duplicates: List[dict[str, List[str]]] = []
    for base_name, uris in uris_by_name.items():
        if len(uris) > 1:
            duplicates.append({base_name: uris})
    return duplicates
|
|
108
|
+
|
|
109
|
+
def compute_slices(self) -> Iterable[Optional[Mapping[str, Any]]]:
    """
    Build the slices for this sync, one slice per distinct ``last_modified`` value.

    The files returned by the cursor's ``get_files_to_sync`` are ordered by
    ``(last_modified, uri)`` and bucketed by ``last_modified``. When the directory structure
    is not preserved, files sharing a base name cause a ``DuplicatedFilesError``.
    """
    listed_files = self.list_files()
    pending_files = list(self._cursor.get_files_to_sync(listed_files, self.logger))
    pending_files.sort(key=lambda remote_file: (remote_file.last_modified, remote_file.uri))

    # groupby only merges adjacent items, which is why the files are sorted first.
    slices = []
    for _, same_timestamp_files in itertools.groupby(
        pending_files, lambda remote_file: remote_file.last_modified
    ):
        slices.append({self.FILES_KEY: list(same_timestamp_files)})

    # The duplicate-name check only applies when there are slices and the directory
    # structure is not preserved.
    if not slices or self.preserve_directory_structure:
        return slices

    duplicated_files_names = self._duplicated_files_names(slices)
    if duplicated_files_names:
        raise DuplicatedFilesError(
            stream=self.name, duplicated_files_names=duplicated_files_names
        )
    return slices
|
|
125
|
+
|
|
126
|
+
def transform_record(
    self, record: dict[str, Any], file: RemoteFile, last_updated: str
) -> dict[str, Any]:
    """
    Stamp ``record`` in place with the Airbyte source-file columns and return the same dict.

    ``last_updated`` is written under ``ab_last_mod_col`` (_ab_source_file_last_modified) and
    ``file.uri`` under ``ab_file_name_col`` (_ab_source_file_url).
    """
    last_modified_column, file_url_column = self.ab_last_mod_col, self.ab_file_name_col
    record[last_modified_column] = last_updated
    record[file_url_column] = file.uri
    return record
|
|
133
|
+
|
|
134
|
+
def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]:
    """
    Emit the records of every file listed under `stream_slice["files"]`.

    With `use_file_transfer` enabled each file is uploaded through `_file_transfer`; otherwise
    it is parsed record by record. Parsed records are wrapped under `"data"` for schemaless
    streams, or checked against the validation policy, and then enriched by `transform_record`.
    A file is added to the cursor only if it was processed without raising.

    Error handling, per file:
    - `StopSyncPerValidationPolicy`: a WARN log is emitted and the remaining files are skipped.
    - `RecordParseError`: an ERROR log is handed to `errors_collector`; reading continues with
      the next file.
    - `AirbyteTracedException`: propagated to the caller.
    - any other exception: an ERROR log is emitted; reading continues with the next file.

    :raises MissingSchemaError: if `catalog_schema` is None.
    """
    catalog_schema = self.catalog_schema
    if catalog_schema is None:
        # A read request is expected to always come with a catalog.
        raise MissingSchemaError(FileBasedSourceError.MISSING_SCHEMA, stream=self.name)

    # A stream supports a single file type, so one parser serves every file.
    file_parser = self.get_parser()
    for remote_file in stream_slice["files"]:
        # Format the timestamp once per file instead of once per record.
        last_modified_string = remote_file.last_modified.strftime(self.DATE_TIME_FORMAT)
        skipped_count = 0
        line_number = 0

        try:
            if self.use_file_transfer:
                uploads = self._file_transfer.upload(
                    file=remote_file, stream_reader=self.stream_reader, logger=self.logger
                )
                for file_record_data, file_reference in uploads:
                    yield stream_data_to_airbyte_message(
                        self.name,
                        file_record_data.dict(exclude_none=True),
                        file_reference=file_reference,
                    )
            else:
                parsed_records = file_parser.parse_records(
                    self.config, remote_file, self.stream_reader, self.logger, catalog_schema
                )
                for parsed_record in parsed_records:
                    line_number += 1
                    if self.config.schemaless:
                        parsed_record = {"data": parsed_record}
                    elif not self.record_passes_validation_policy(parsed_record):
                        skipped_count += 1
                        continue
                    parsed_record = self.transform_record(
                        parsed_record, remote_file, last_modified_string
                    )
                    yield stream_data_to_airbyte_message(self.name, parsed_record)
            self._cursor.add_file(remote_file)

        except StopSyncPerValidationPolicy:
            yield AirbyteMessage(
                type=MessageType.LOG,
                log=AirbyteLogMessage(
                    level=Level.WARN,
                    message=f"Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema. stream={self.name} file={remote_file.uri} validation_policy={self.config.validation_policy.value} n_skipped={skipped_count}",
                ),
            )
            break

        except RecordParseError:
            # The parser raised before the counter could be advanced for the failing line.
            line_number += 1
            self.errors_collector.collect(
                AirbyteMessage(
                    type=MessageType.LOG,
                    log=AirbyteLogMessage(
                        level=Level.ERROR,
                        message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={remote_file.uri} line_no={line_number} n_skipped={skipped_count}",
                        stack_trace=traceback.format_exc(),
                    ),
                ),
            )

        except AirbyteTracedException as traced_error:
            # Fatal: let it propagate so the whole sync stops immediately.
            raise traced_error

        except Exception:
            yield AirbyteMessage(
                type=MessageType.LOG,
                log=AirbyteLogMessage(
                    level=Level.ERROR,
                    message=f"{FileBasedSourceError.ERROR_PARSING_RECORD.value} stream={self.name} file={remote_file.uri} line_no={line_number} n_skipped={skipped_count}",
                    stack_trace=traceback.format_exc(),
                ),
            )

        finally:
            # Report skipped records whichever way the file ended.
            if skipped_count:
                yield AirbyteMessage(
                    type=MessageType.LOG,
                    log=AirbyteLogMessage(
                        level=Level.WARN,
                        message=f"Records in file did not pass validation policy. stream={self.name} file={remote_file.uri} n_skipped={skipped_count} validation_policy={self.validation_policy.name}",
                    ),
                )
|
|
223
|
+
|
|
224
|
+
@property
def cursor_field(self) -> Union[str, List[str]]:
    """
    Name of the field used as the cursor of this stream.

    :return: the value of `ab_last_mod_col`.
    """
    cursor_column = self.ab_last_mod_col
    return cursor_column
|
|
231
|
+
|
|
232
|
+
@cache
def get_json_schema(self) -> JsonSchema:
    """
    Return the JSON schema of the stream; the result is memoized by the `cache` decorator.

    With `use_file_transfer` enabled the file transfer schema is returned as is. Otherwise the
    properties of the raw schema are combined with the two string-typed metadata columns
    (`ab_last_mod_col`, `ab_file_name_col`); on a name clash the raw schema property wins.

    :raises AirbyteTracedException: as a config error when the raw schema is invalid, or
        unchanged when one is raised while building the raw schema.
    :raises SchemaInferenceError: for any other failure while building the raw schema.
    """
    if self.use_file_transfer:
        return file_transfer_schema

    metadata_properties = {
        self.ab_last_mod_col: {"type": "string"},
        self.ab_file_name_col: {"type": "string"},
    }
    try:
        raw_schema = self._get_raw_json_schema()
    except InvalidSchemaError as invalid_schema_error:
        raise AirbyteTracedException(
            internal_message="Please check the logged errors for more information.",
            message=FileBasedSourceError.SCHEMA_INFERENCE_ERROR.value,
            exception=AirbyteTracedException(exception=invalid_schema_error),
            failure_type=FailureType.config_error,
        )
    except AirbyteTracedException:
        raise
    except Exception as unexpected_error:
        raise SchemaInferenceError(
            FileBasedSourceError.SCHEMA_INFERENCE_ERROR, stream=self.name
        ) from unexpected_error

    return {
        "type": "object",
        "properties": {**metadata_properties, **raw_schema["properties"]},
    }
|
|
257
|
+
|
|
258
|
+
def _get_raw_json_schema(self) -> JsonSchema:
    """
    Build the schema of the stream before the metadata columns are added.

    Resolution order:
    1. the input schema from the config, when one is set;
    2. the schemaless schema, when the stream is schemaless;
    3. the schemaless schema, with a warning, when there is no file to infer from;
    4. a schema inferred from the most recently modified files. The number of files is bounded
       by the files available, by `recent_n_files_to_read_for_schema_discovery` and by the
       discovery policy.

    :raises InvalidSchemaError: if inference over the selected files yields an empty schema.
    """
    if self.config.input_schema:
        return self.config.get_input_schema()  # type: ignore
    if self.config.schemaless:
        return schemaless_schema

    files = self.list_files()
    first_n_files = len(files)

    if self.config.recent_n_files_to_read_for_schema_discovery:
        self.logger.info(
            msg=(
                f"Only first {self.config.recent_n_files_to_read_for_schema_discovery} files will be used to infer schema "
                f"for stream {self.name} due to limitation in config."
            )
        )
        # Clamp to the files that actually exist. Taking the configured value as is would let an
        # empty listing skip the "no files" fallback below and fail with an "Empty schema"
        # error, and would make the warning further down report a count that does not exist.
        first_n_files = min(
            first_n_files, self.config.recent_n_files_to_read_for_schema_discovery
        )

    if first_n_files == 0:
        self.logger.warning(
            msg=f"No files were identified in the stream {self.name}. Setting default schema for the stream."
        )
        return schemaless_schema

    max_n_files_for_schema_inference = (
        self._discovery_policy.get_max_n_files_for_schema_inference(self.get_parser())
    )

    if first_n_files > max_n_files_for_schema_inference:
        # Use the most recent files for schema inference, so we pick up schema changes during discovery.
        self.logger.warning(
            msg=f"Refusing to infer schema for {first_n_files} files; using {max_n_files_for_schema_inference} files."
        )
        first_n_files = max_n_files_for_schema_inference

    # Newest first, so the slice keeps the most recently modified files.
    files = sorted(files, key=lambda x: x.last_modified, reverse=True)[:first_n_files]

    inferred_schema = self.infer_schema(files)

    if not inferred_schema:
        raise InvalidSchemaError(
            FileBasedSourceError.INVALID_SCHEMA_ERROR,
            details=f"Empty schema. Please check that the files are valid for format {self.config.format}",
            stream=self.name,
        )

    return {"type": "object", "properties": inferred_schema}
|
|
307
|
+
|
|
308
|
+
def get_files(self) -> Iterable[RemoteFile]:
    """
    List the files of the stream by asking the stream reader for the files matching the
    configured globs (an empty list when none are configured) and the legacy prefix.
    """
    reader = self.stream_reader
    globs = self.config.globs or []
    legacy_prefix = self.config.legacy_prefix
    return reader.get_matching_files(globs, legacy_prefix, self.logger)
|
|
315
|
+
|
|
316
|
+
def as_airbyte_stream(self) -> AirbyteStream:
    """
    Return the parent class's `AirbyteStream` with `is_file_based` set to `use_file_transfer`.
    """
    airbyte_stream = super().as_airbyte_stream()
    airbyte_stream.is_file_based = self.use_file_transfer
    return airbyte_stream
|
|
320
|
+
|
|
321
|
+
def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
    """
    Synchronous entry point for schema inference.

    Runs `_infer_schema` to completion on the current event loop and makes the types of the
    resulting schema nullable.

    :param files: the files to infer the schema from.
    :return: a nullable copy of the inferred schema.
    """
    event_loop = asyncio.get_event_loop()
    inferred = event_loop.run_until_complete(self._infer_schema(files))
    # The inferred mapping is treated as immutable, so nulls are filled on a deep copy.
    nullable_copy = deepcopy(inferred)
    return self._fill_nulls(nullable_copy)
|
|
326
|
+
|
|
327
|
+
@staticmethod
|
|
328
|
+
def _fill_nulls(schema: Mapping[str, Any]) -> Mapping[str, Any]:
|
|
329
|
+
if isinstance(schema, dict):
|
|
330
|
+
for k, v in schema.items():
|
|
331
|
+
if k == "type":
|
|
332
|
+
if isinstance(v, list):
|
|
333
|
+
if "null" not in v:
|
|
334
|
+
schema[k] = ["null"] + v
|
|
335
|
+
elif v != "null":
|
|
336
|
+
if isinstance(v, (str, list)):
|
|
337
|
+
schema[k] = ["null", v]
|
|
338
|
+
else:
|
|
339
|
+
DefaultFileBasedStream._fill_nulls(v)
|
|
340
|
+
else:
|
|
341
|
+
DefaultFileBasedStream._fill_nulls(v)
|
|
342
|
+
elif isinstance(schema, list):
|
|
343
|
+
for item in schema:
|
|
344
|
+
DefaultFileBasedStream._fill_nulls(item)
|
|
345
|
+
return schema
|
|
346
|
+
|
|
347
|
+
async def _infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
    """
    Infer the schema of the stream by merging the schemas inferred from each file.

    One task per file is scheduled through `_infer_file_schema`. New tasks are started while
    the number of pending tasks is at most `n_concurrent_requests`, so up to
    `n_concurrent_requests + 1` tasks can be in flight. An `AirbyteTracedException` raised by
    a task is propagated; any other task failure is logged and that file is left out of the
    merged schema.

    :param files: the files to infer the schema from.
    :return: the merged schema; empty when no file produced one.
    """
    merged_schema: SchemaType = {}
    in_flight: Set[asyncio.tasks.Task[SchemaType]] = set()

    started_count = 0
    total_files = len(files)
    remaining_files = iter(files)
    while in_flight or started_count < total_files:
        while len(in_flight) <= self._discovery_policy.n_concurrent_requests and (
            next_file := next(remaining_files, None)
        ):
            in_flight.add(asyncio.create_task(self._infer_file_schema(next_file)))
            started_count += 1
        # Wake up as soon as one task finishes, so a new one can be scheduled once the number
        # of concurrent tasks drops below the allowed amount.
        finished, in_flight = await asyncio.wait(
            in_flight, return_when=asyncio.FIRST_COMPLETED
        )
        for finished_task in finished:
            try:
                merged_schema = merge_schemas(merged_schema, finished_task.result())
            except AirbyteTracedException:
                raise
            except Exception as task_error:
                self.logger.error(
                    f"An error occurred inferring the schema. \n {traceback.format_exc()}",
                    exc_info=task_error,
                )

    return merged_schema
|
|
382
|
+
|
|
383
|
+
async def _infer_file_schema(self, file: RemoteFile) -> SchemaType:
    """
    Infer the schema of a single file with the parser of the stream.

    :param file: the file to inspect.
    :raises AirbyteTracedException: propagated unchanged when the parser raises one.
    :raises SchemaInferenceError: wrapping any other exception raised by the parser, with the
        file URI, the configured format and the stream name attached.
    """
    file_parser = self.get_parser()
    try:
        return await file_parser.infer_schema(
            self.config, file, self.stream_reader, self.logger
        )
    except AirbyteTracedException:
        raise
    except Exception as parser_error:
        raise SchemaInferenceError(
            FileBasedSourceError.SCHEMA_INFERENCE_ERROR,
            file=file.uri,
            format=str(self.config.format),
            stream=self.name,
        ) from parser_error
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
from functools import cache
|
|
6
|
+
from typing import Any, Dict, Iterable, Mapping, MutableMapping, Optional
|
|
7
|
+
|
|
8
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import PrimaryKeyType
|
|
9
|
+
from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy
|
|
10
|
+
from airbyte_cdk.sources.file_based.exceptions import FileBasedErrorsCollector
|
|
11
|
+
from airbyte_cdk.sources.file_based.file_based_stream_permissions_reader import (
|
|
12
|
+
AbstractFileBasedStreamPermissionsReader,
|
|
13
|
+
)
|
|
14
|
+
from airbyte_cdk.sources.streams.core import JsonSchema
|
|
15
|
+
from airbyte_cdk.sources.streams.permissions.identities_stream import IdentitiesStream
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FileIdentitiesStream(IdentitiesStream):
    """
    Full refresh stream that syncs the identities of a domain.

    Fetching the identities is delegated to the stream permissions reader, which is
    implemented on the connector side.
    """

    # Declared as not resumable.
    is_resumable = False

    def __init__(
        self,
        catalog_schema: Optional[Mapping[str, Any]],
        stream_permissions_reader: AbstractFileBasedStreamPermissionsReader,
        discovery_policy: AbstractDiscoveryPolicy,
        errors_collector: FileBasedErrorsCollector,
    ) -> None:
        """
        :param catalog_schema: schema of the stream taken from the catalog, if any.
        :param stream_permissions_reader: provides the identity groups and their schema.
        :param discovery_policy: stored on the stream as `_discovery_policy`.
        :param errors_collector: stored on the stream as `errors_collector`.
        """
        super().__init__()
        self.catalog_schema = catalog_schema
        self.stream_permissions_reader = stream_permissions_reader
        self._discovery_policy = discovery_policy
        self.errors_collector = errors_collector
        # Starts out empty.
        self._cursor: MutableMapping[str, Any] = {}

    @property
    def primary_key(self) -> PrimaryKeyType:
        """This stream declares no primary key."""
        return None

    def load_identity_groups(self) -> Iterable[Dict[str, Any]]:
        """Return the identity groups provided by the stream permissions reader."""
        permissions_reader = self.stream_permissions_reader
        return permissions_reader.load_identity_groups(logger=self.logger)

    @cache
    def get_json_schema(self) -> JsonSchema:
        """Return the identities schema exposed by the stream permissions reader (memoized)."""
        return self.stream_permissions_reader.identities_schema
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import traceback
|
|
6
|
+
from typing import Any, Dict, Iterable
|
|
7
|
+
|
|
8
|
+
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level
|
|
9
|
+
from airbyte_cdk.models import Type as MessageType
|
|
10
|
+
from airbyte_cdk.sources.file_based.file_based_stream_permissions_reader import (
|
|
11
|
+
AbstractFileBasedStreamPermissionsReader,
|
|
12
|
+
)
|
|
13
|
+
from airbyte_cdk.sources.file_based.stream import DefaultFileBasedStream
|
|
14
|
+
from airbyte_cdk.sources.file_based.types import StreamSlice
|
|
15
|
+
from airbyte_cdk.sources.streams.core import JsonSchema
|
|
16
|
+
from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class PermissionsFileBasedStream(DefaultFileBasedStream):
    """
    File-based stream that emits the ACL permissions of each file instead of its content.

    For every file of a slice, the stream asks the stream permissions reader for the
    permissions, stamps the result with the file metadata and emits it as a record. The
    permissions reader owns both the retrieval of the permissions and the schema of the
    records; this class only provides the streaming interface.
    """

    def __init__(
        self, stream_permissions_reader: AbstractFileBasedStreamPermissionsReader, **kwargs: Any
    ):
        """
        :param stream_permissions_reader: source of the file permissions and of their schema.
        :param kwargs: forwarded unchanged to `DefaultFileBasedStream`.
        """
        super().__init__(**kwargs)
        self.stream_permissions_reader = stream_permissions_reader

    def _filter_schema_invalid_properties(
        self, configured_catalog_json_schema: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Ignore the configured schema and return the file permissions schema of the reader."""
        return self.stream_permissions_reader.file_permissions_schema

    def read_records_from_slice(self, stream_slice: StreamSlice) -> Iterable[AirbyteMessage]:
        """
        Emit one permissions record per file listed under `stream_slice["files"]`.

        A file without permissions produces a logger warning and a WARN log message; a failure
        while handling a file produces a logger error and an ERROR log message. In both cases
        reading continues with the next file.
        """
        for remote_file in stream_slice["files"]:
            permissions_missing = False
            last_modified_string = remote_file.last_modified.strftime(self.DATE_TIME_FORMAT)
            try:
                acl_record = self.stream_permissions_reader.get_file_acl_permissions(
                    remote_file, logger=self.logger
                )
                if acl_record:
                    acl_record = self.transform_record(
                        acl_record, remote_file, last_modified_string
                    )
                    yield stream_data_to_airbyte_message(self.name, acl_record)
                else:
                    # Nothing to emit for this file; the WARN message is sent from `finally`.
                    permissions_missing = True
                    self.logger.warning(
                        f"Unable to fetch permissions. stream={self.name} file={remote_file.uri}"
                    )
            except Exception as error:
                self.logger.error(
                    f"Failed to retrieve permissions for file {remote_file.uri}: {error!s}"
                )
                yield AirbyteMessage(
                    type=MessageType.LOG,
                    log=AirbyteLogMessage(
                        level=Level.ERROR,
                        message=f"Error retrieving files permissions: stream={self.name} file={remote_file.uri}",
                        stack_trace=traceback.format_exc(),
                    ),
                )
            finally:
                if permissions_missing:
                    yield AirbyteMessage(
                        type=MessageType.LOG,
                        log=AirbyteLogMessage(
                            level=Level.WARN,
                            message=f"Unable to fetch permissions. stream={self.name} file={remote_file.uri}",
                        ),
                    )

    def _get_raw_json_schema(self) -> JsonSchema:
        """
        Return the file permissions schema exposed by the stream permissions reader.

        Returns:
            The schema that defines the structure of the permission records.
        """
        return self.stream_permissions_reader.file_permissions_schema
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
# The goal of this variable is to make an implicit dependency explicit. As part of the Concurrent CDK work, we are facing a situation
|
|
6
|
+
# where the connection pool size is too small to serve all the threads (see https://github.com/airbytehq/airbyte/issues/32072). In
|
|
7
|
+
# order to fix that, we will increase the requests library pool_maxsize. As there are many pieces of code that set a requests.Session, we
|
|
8
|
+
# are creating this variable here so that a change in one affects the other. This can be removed once we merge how we do HTTP requests in
|
|
9
|
+
# one piece of code or once we make connection pool size configurable for each piece of code
|
|
10
|
+
MAX_CONNECTION_POOL_SIZE = 20  # shared value for the `requests` connection pool size (pool_maxsize)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
from typing import Optional, Union
|
|
6
|
+
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
from airbyte_cdk.sources.message import LogMessage
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def format_http_message(
    response: requests.Response,
    title: str,
    description: str,
    stream_name: Optional[str],
    is_auxiliary: bool | None = None,
    type: Optional[str] = None,
) -> LogMessage:
    """
    Describe an HTTP exchange as a structured, debug-level log message.

    :param response: the response to describe; the request is read from `response.request`.
    :param title: stored under `http.title`.
    :param description: stored under `http.description`.
    :param stream_name: when truthy, stored under `airbyte_cdk.stream.name`.
    :param is_auxiliary: when not None, stored under `http.is_auxiliary`.
    :param type: stored under `http.type`; `"HTTP"` is used when it is empty or None.
    :return: a dict with the `http`, `log` and `url` sections, plus `airbyte_cdk` when a
        stream name is given.
    """
    request = response.request
    http_section = {
        "title": title,
        "type": type or "HTTP",
        "description": description,
        "request": {
            "method": request.method,
            "body": {"content": _normalize_body_string(request.body)},
            "headers": dict(request.headers),
        },
        "response": {
            "body": {"content": response.text},
            "headers": dict(response.headers),
            "status_code": response.status_code,
        },
    }
    if is_auxiliary is not None:
        http_section["is_auxiliary"] = is_auxiliary

    log_message = {
        "http": http_section,
        "log": {"level": "debug"},
        "url": {"full": request.url},
    }
    if stream_name:
        log_message["airbyte_cdk"] = {"stream": {"name": stream_name}}
    return log_message  # type: ignore[return-value]  # got "dict[str, object]", expected "dict[str, JsonType]"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _normalize_body_string(body_str: Optional[Union[str, bytes]]) -> Optional[str]:
|
|
55
|
+
return body_str.decode() if isinstance(body_str, (bytes, bytearray)) else body_str
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
from .repository import (
|
|
6
|
+
InMemoryMessageRepository,
|
|
7
|
+
LogAppenderMessageRepositoryDecorator,
|
|
8
|
+
LogMessage,
|
|
9
|
+
MessageRepository,
|
|
10
|
+
NoopMessageRepository,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
# Names re-exported from `.repository` as the public API of this package.
__all__ = [
    "InMemoryMessageRepository",
    "LogAppenderMessageRepositoryDecorator",
    "LogMessage",
    "MessageRepository",
    "NoopMessageRepository",
]
|