airbyte-cdk 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +358 -0
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +104 -0
- airbyte_cdk/connector.py +123 -0
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/__init__.py +3 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
- airbyte_cdk/connector_builder/main.py +107 -0
- airbyte_cdk/connector_builder/models.py +73 -0
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +83 -0
- airbyte_cdk/destinations/__init__.py +8 -0
- airbyte_cdk/destinations/destination.py +154 -0
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
- airbyte_cdk/destinations/vector_db_based/config.py +298 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
- airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
- airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
- airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
- airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
- airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
- airbyte_cdk/entrypoint.py +414 -0
- airbyte_cdk/exception_handler.py +56 -0
- airbyte_cdk/logger.py +109 -0
- airbyte_cdk/models/__init__.py +72 -0
- airbyte_cdk/models/airbyte_protocol.py +88 -0
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/well_known_types.py +5 -0
- airbyte_cdk/py.typed +0 -0
- airbyte_cdk/sources/__init__.py +26 -0
- airbyte_cdk/sources/abstract_source.py +326 -0
- airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
- airbyte_cdk/sources/config.py +27 -0
- airbyte_cdk/sources/connector_state_manager.py +161 -0
- airbyte_cdk/sources/declarative/__init__.py +3 -0
- airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
- airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
- airbyte_cdk/sources/declarative/auth/token.py +267 -0
- airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
- airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
- airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
- airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
- airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
- airbyte_cdk/sources/declarative/declarative_source.py +36 -0
- airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
- airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
- airbyte_cdk/sources/declarative/exceptions.py +9 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
- airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
- airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
- airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
- airbyte_cdk/sources/declarative/requesters/README.md +56 -0
- airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
- airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
- airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
- airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
- airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
- airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
- airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
- airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
- airbyte_cdk/sources/declarative/spec/spec.py +48 -0
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
- airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
- airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
- airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
- airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
- airbyte_cdk/sources/declarative/types.py +25 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
- airbyte_cdk/sources/file_based/config/__init__.py +0 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
- airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
- airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
- airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
- airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
- airbyte_cdk/sources/file_based/exceptions.py +159 -0
- airbyte_cdk/sources/file_based/file_based_source.py +466 -0
- airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
- airbyte_cdk/sources/file_based/file_record_data.py +22 -0
- airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
- airbyte_cdk/sources/file_based/remote_file.py +18 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
- airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
- airbyte_cdk/sources/file_based/types.py +10 -0
- airbyte_cdk/sources/http_config.py +10 -0
- airbyte_cdk/sources/http_logger.py +55 -0
- airbyte_cdk/sources/message/__init__.py +19 -0
- airbyte_cdk/sources/message/repository.py +137 -0
- airbyte_cdk/sources/source.py +95 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/__init__.py +8 -0
- airbyte_cdk/sources/streams/availability_strategy.py +84 -0
- airbyte_cdk/sources/streams/call_rate.py +704 -0
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
- airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
- airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
- airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/core.py +703 -0
- airbyte_cdk/sources/streams/http/__init__.py +10 -0
- airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +61 -0
- airbyte_cdk/sources/streams/http/http.py +673 -0
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/sources/streams/utils/__init__.py +3 -0
- airbyte_cdk/sources/types.py +169 -0
- airbyte_cdk/sources/utils/__init__.py +7 -0
- airbyte_cdk/sources/utils/casing.py +12 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +53 -0
- airbyte_cdk/sources/utils/schema_helpers.py +230 -0
- airbyte_cdk/sources/utils/slice_logger.py +57 -0
- airbyte_cdk/sources/utils/transform.py +277 -0
- airbyte_cdk/sources/utils/types.py +7 -0
- airbyte_cdk/sql/__init__.py +0 -0
- airbyte_cdk/sql/_util/__init__.py +0 -0
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/__init__.py +7 -0
- airbyte_cdk/test/catalog_builder.py +81 -0
- airbyte_cdk/test/entrypoint_wrapper.py +250 -0
- airbyte_cdk/test/mock_http/__init__.py +6 -0
- airbyte_cdk/test/mock_http/matcher.py +41 -0
- airbyte_cdk/test/mock_http/mocker.py +185 -0
- airbyte_cdk/test/mock_http/request.py +103 -0
- airbyte_cdk/test/mock_http/response.py +28 -0
- airbyte_cdk/test/mock_http/response_builder.py +237 -0
- airbyte_cdk/test/state_builder.py +33 -0
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +10 -0
- airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
- airbyte_cdk/utils/analytics_message.py +25 -0
- airbyte_cdk/utils/constants.py +5 -0
- airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/event_timing.py +85 -0
- airbyte_cdk/utils/is_cloud_environment.py +18 -0
- airbyte_cdk/utils/mapping_helpers.py +162 -0
- airbyte_cdk/utils/message_utils.py +26 -0
- airbyte_cdk/utils/oneof_option_config.py +33 -0
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +270 -0
- airbyte_cdk/utils/slice_hasher.py +37 -0
- airbyte_cdk/utils/spec_schema_transformations.py +26 -0
- airbyte_cdk/utils/stream_status_utils.py +43 -0
- airbyte_cdk/utils/traced_exception.py +145 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
- airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
- airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
- airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
- airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import copy
|
|
7
|
+
from typing import Any, Dict, List, Mapping, Optional, Union
|
|
8
|
+
|
|
9
|
+
from airbyte_cdk.sources.declarative.requesters.request_option import (
|
|
10
|
+
RequestOption,
|
|
11
|
+
RequestOptionType,
|
|
12
|
+
)
|
|
13
|
+
from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _merge_mappings(
    target: Dict[str, Any],
    source: Mapping[str, Any],
    path: Optional[List[str]] = None,
    allow_same_value_merge: bool = False,
) -> None:
    """
    Merge ``source`` into ``target`` in place, failing loudly on key collisions.

    Keys missing from ``target`` are deep-copied over from ``source``. For keys present
    on both sides the outcome depends on ``allow_same_value_merge``:

    - ``True`` (body_json injections): two dict values are merged recursively, and two
      equal non-dict values are tolerated; anything else is a conflict.
    - ``False`` (every other request type): any key present on both sides is a conflict,
      whatever the values are.

    Args:
        target: Dictionary that receives the merged content (mutated in place)
        source: Mapping whose entries are merged into ``target``
        path: Keys already traversed to reach this level (only used for error messages)
        allow_same_value_merge: Whether matching values / nested dicts may share a key

    Raises:
        ValueError: If a key collision is detected; the message contains the dotted key path.
    """
    parent_path = path or []

    for name, incoming in source.items():
        key_path = parent_path + [str(name)]

        # Fresh key: nothing to reconcile, store an independent copy of the value.
        if name not in target:
            target[name] = copy.deepcopy(incoming)
            continue

        existing = target[name]
        both_are_dicts = isinstance(existing, dict) and isinstance(incoming, dict)

        # Nested structures are only supported when same-value merging is enabled.
        if both_are_dicts and allow_same_value_merge:
            _merge_mappings(existing, incoming, key_path, allow_same_value_merge)
            continue

        # Reaching this point with two dicts implies same-value merging is disabled,
        # so the first operand already covers that case without comparing the values.
        if not allow_same_value_merge or existing != incoming:
            raise ValueError(
                f"Request body collision, duplicate keys detected at key path: {'.'.join(key_path)}. Please ensure that all keys in the request are unique."
            )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def combine_mappings(
    mappings: List[Optional[Union[Mapping[str, Any], str]]],
    allow_same_value_merge: bool = False,
) -> Union[Mapping[str, Any], str]:
    """
    Fold a list of request-option mappings into one.

    ``None`` entries and empty mappings are ignored. A lone string entry is passed
    through unchanged; otherwise every remaining mapping is merged into a fresh dict.

    With ``allow_same_value_merge=True`` (body_json requests):
        - nested structures are merged (e.g., {"data": {"user": {"id": 1}}})
        - a key may repeat as long as its values are equal
        - the same path with different values is an error

    With ``allow_same_value_merge=False`` (all other request types):
        - only flat structures are supported
        - any repeated key is an error, whatever its value

    Args:
        mappings: The mappings (or a single string option) to combine
        allow_same_value_merge: Whether repeated keys with equal values are accepted.
            Should only be True for body_json requests.

    Returns:
        The merged mapping, or the string itself when the input holds exactly one
        string and no other non-empty mapping.

    Raises:
        ValueError: If more than one string is supplied, if a string is supplied together
            with a non-empty mapping, or if keys/paths conflict under the chosen
            ``allow_same_value_merge`` setting.
    """
    if not mappings:
        return {}

    # Tally the string entries (None is never a string, so it needs no special casing).
    string_count = 0
    for candidate in mappings:
        if isinstance(candidate, str):
            string_count += 1
    if string_count > 1:
        raise ValueError("Cannot combine multiple string options")

    # Keep everything except None values and empty mappings.
    meaningful = []
    for candidate in mappings:
        if candidate is None:
            continue
        if isinstance(candidate, Mapping) and not candidate:
            continue
        meaningful.append(candidate)

    if string_count == 1:
        if len(meaningful) > 1:
            raise ValueError("Cannot combine multiple options if one is a string")
        # Strings always survive the filter above, so the single survivor is the string.
        return meaningful[0]

    # No string involved: merge each surviving mapping into a fresh dictionary.
    merged: Dict[str, Any] = {}
    for candidate in meaningful:
        if not candidate or not isinstance(candidate, Mapping):
            continue
        _merge_mappings(merged, candidate, allow_same_value_merge=allow_same_value_merge)

    return merged
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _validate_component_request_option_paths(
    config: Config, *request_options: Optional[RequestOption]
) -> None:
    """
    Validates that a component with multiple request options does not have conflicting paths.
    Uses dummy values for validation since actual values might not be available at init time.

    Options are grouped by their ``inject_into`` target; each group holding more than one
    option is injected into throwaway dictionaries which are then combined. Same-value
    merging is only enabled for the ``body_json`` group.

    Args:
        config: Connector configuration forwarded to ``RequestOption.inject_into_request``
        request_options: The request options to check; falsy entries (e.g. ``None``) are skipped

    Raises:
        ValueError: If combining the options of one injection target reports a conflict.
            The underlying error is attached as the explicit cause.
    """
    grouped_options: Dict[RequestOptionType, List[RequestOption]] = {}
    for option in request_options:
        if option:
            grouped_options.setdefault(option.inject_into, []).append(option)

    for inject_type, options in grouped_options.items():
        # A single option cannot collide with anything.
        if len(options) <= 1:
            continue

        option_dicts: List[Optional[Union[Mapping[str, Any], str]]] = []
        for i, option in enumerate(options):
            option_dict: Dict[str, Any] = {}
            # Use indexed dummy values to ensure we catch conflicts
            option.inject_into_request(option_dict, f"dummy_value_{i}", config)
            option_dicts.append(option_dict)

        try:
            combine_mappings(
                option_dicts, allow_same_value_merge=(inject_type == RequestOptionType.body_json)
            )
        except ValueError as error:
            # Chain explicitly so the traceback shows the original collision as the cause.
            raise ValueError(error) from error
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def get_interpolation_context(
    stream_state: Optional[StreamState] = None,
    stream_slice: Optional[StreamSlice] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> Mapping[str, Any]:
    """
    Assemble the interpolation context mapping.

    The result always contains the `stream_slice` and `next_page_token` keys. When the slice
    exposes an `extra_fields` attribute, its entries are merged on top and therefore win over
    same-named keys. `stream_state` is accepted but is not part of the returned mapping.
    """
    has_extra_fields = stream_slice is not None and hasattr(stream_slice, "extra_fields")
    # update the context with extra fields, if passed.
    extra_fields = stream_slice.extra_fields if has_extra_fields else {}
    return {
        "stream_slice": stream_slice,
        "next_page_token": next_page_token,
        **extra_fields,
    }
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
|
2
|
+
|
|
3
|
+
from airbyte_cdk.models import AirbyteMessage, Type
|
|
4
|
+
from airbyte_cdk.sources.connector_state_manager import HashableStreamDescriptor
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def get_stream_descriptor(message: AirbyteMessage) -> HashableStreamDescriptor:
    """
    Return the (name, namespace) descriptor of the stream a message belongs to.

    Only RECORD and STATE messages are supported. STATE messages must carry a per-stream
    state with a stream descriptor.

    :raises ValueError: if a STATE message has no stream state or no stream descriptor
    :raises NotImplementedError: for every other message type
    """
    message_type = message.type

    if message_type == Type.RECORD:
        record = message.record
        return HashableStreamDescriptor(
            name=record.stream,  # type: ignore[union-attr] # record has `stream`
            namespace=record.namespace,  # type: ignore[union-attr] # record has `namespace`
        )

    if message_type == Type.STATE:
        stream_state = message.state.stream  # type: ignore[union-attr] # state has `stream`
        if not stream_state or not stream_state.stream_descriptor:
            raise ValueError(
                "State message was not in per-stream state format, which is required for record counts."
            )
        descriptor = stream_state.stream_descriptor
        return HashableStreamDescriptor(name=descriptor.name, namespace=descriptor.namespace)

    raise NotImplementedError(
        f"get_stream_descriptor is not implemented for message type '{message.type}'."
    )
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
from typing import Any, Dict
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class OneOfOptionConfig:
    """
    Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.

    Inherit from this class in the nested Config class in a model and set title and description (these show up in the UI) and discriminator (this is making sure it's marked as required in the schema).

    Usage:

        ```python
        class OptionModel(BaseModel):
            mode: Literal["option_a"] = Field("option_a", const=True)
            option_a_field: str = Field(...)

            class Config(OneOfOptionConfig):
                title = "Option A"
                description = "Option A description"
                discriminator = "mode"
        ```
    """

    @staticmethod
    def schema_extra(schema: Dict[str, Any], model: Any) -> None:
        """
        Post-process the schema generated for `model`, in place.

        - copies `model.Config.description` into the schema's "description", if defined
        - makes sure `model.Config.discriminator` is listed under "required", if defined

        :param schema: the generated schema; mutated in place
        :param model: the model class whose nested `Config` carries the settings
        """
        config = model.Config

        if hasattr(config, "description"):
            schema["description"] = config.description

        if hasattr(config, "discriminator"):
            required = schema.setdefault("required", [])
            # JSON Schema demands unique entries in `required`; the discriminator may already be
            # listed (e.g. when the field has no default), so only add it when it is missing.
            if config.discriminator not in required:
                required.append(config.discriminator)
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
import time
|
|
5
|
+
from io import StringIO
|
|
6
|
+
from threading import RLock
|
|
7
|
+
from types import TracebackType
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PrintBuffer:
    """
    A class to buffer print statements and flush them at a specified interval.

    The PrintBuffer class is designed to capture and buffer output that would
    normally be printed to the standard output (stdout). This can be useful for
    scenarios where you want to minimize the number of I/O operations by grouping
    multiple print statements together and flushing them as a single operation.

    Attributes:
        buffer (StringIO): A buffer to store the messages before flushing.
        flush_interval (float): The time interval (in seconds) after which the buffer is flushed.
        last_flush_time (float): The last time the buffer was flushed by `write`.
        lock (RLock): A reentrant lock guarding the buffer (`write` calls `flush` while holding it).

    Methods:
        write(message: str) -> None:
            Writes a message to the buffer and flushes if the interval has passed.

        flush() -> None:
            Flushes the buffer content to the original standard output (`sys.__stdout__`).

        __enter__() -> "PrintBuffer":
            Enters the runtime context related to this object, redirecting stdout and stderr.

        __exit__(exc_type, exc_val, exc_tb) -> None:
            Exits the runtime context and restores the original stdout and stderr.
    """

    def __init__(self, flush_interval: float = 0.1):
        self.buffer = StringIO()
        self.flush_interval = flush_interval
        self.last_flush_time = time.monotonic()
        self.lock = RLock()

    def write(self, message: str) -> None:
        """Append `message` to the buffer and flush once `flush_interval` seconds have elapsed."""
        with self.lock:
            self.buffer.write(message)
            current_time = time.monotonic()
            if (current_time - self.last_flush_time) >= self.flush_interval:
                self.flush()
                self.last_flush_time = current_time

    def flush(self) -> None:
        """Write everything buffered so far to `sys.__stdout__` and start a fresh buffer."""
        with self.lock:
            combined_message = self.buffer.getvalue()
            sys.__stdout__.write(combined_message)  # type: ignore[union-attr]
            self.buffer = StringIO()

    def __enter__(self) -> "PrintBuffer":
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        # Used to disable buffering during the pytest session, because it is not compatible with capsys
        if "pytest" not in str(type(sys.stdout)).lower():
            sys.stdout = self
            sys.stderr = self
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        try:
            self.flush()
        finally:
            # Restore the streams even when the final flush fails (e.g. stdout was closed);
            # otherwise the process would keep printing into this buffer forever.
            sys.stdout, sys.stderr = self.old_stdout, self.old_stderr
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
from typing import Any, Dict, List, Mapping, Optional
|
|
7
|
+
|
|
8
|
+
from genson import SchemaBuilder, SchemaNode
|
|
9
|
+
from genson.schema.strategies.object import Object
|
|
10
|
+
from genson.schema.strategies.scalar import Number
|
|
11
|
+
|
|
12
|
+
from airbyte_cdk.models import AirbyteRecordMessage
|
|
13
|
+
|
|
14
|
+
# JSON schema keywords and type names used while post-processing inferred schemas
_TYPE = "type"  # keyword holding a node's type (a string or a list of strings)
_NULL_TYPE = "null"  # type name marking a node as nullable
_OBJECT_TYPE = "object"  # type name of nodes that carry `properties`
_ANY_OF = "anyOf"  # keyword listing alternative sub-schemas
_ITEMS = "items"  # keyword holding an array's element schema
_PROPERTIES = "properties"  # keyword holding an object's field schemas
_REQUIRED = "required"  # keyword listing an object's mandatory fields
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class NoRequiredObj(Object):
    """
    Object strategy whose generated schema never carries a "required" entry.

    Having read all the data available from a source does not rule out that a later record
    omits some of those fields, so no field is reported as required and everything is treated
    as optional.
    """

    def to_schema(self) -> Mapping[str, Any]:
        """Return the parent strategy's schema with its "required" key removed, if present."""
        generated: Dict[str, Any] = super().to_schema()
        if "required" in generated:
            del generated["required"]
        return generated
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class IntegerToNumber(Number):
    """
    Number strategy that behaves like the regular one except that it never emits an
    integer type.
    """

    def __init__(self, node_class: SchemaNode):
        """Initialise the parent strategy, then set the strategy's `_type` to "number"."""
        super(IntegerToNumber, self).__init__(node_class)
        self._type = "number"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class NoRequiredSchemaBuilder(SchemaBuilder):
    """Schema builder that registers this module's `NoRequiredObj` and `IntegerToNumber` strategies."""

    EXTRA_STRATEGIES = (NoRequiredObj, IntegerToNumber)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# genson does not export an alias for the schemas it produces, so one is declared here for type safety
InferredSchema = Dict[str, Any]


class SchemaValidationException(Exception):
    """
    Raised when an inferred schema cannot satisfy the requested constraints.

    Carries the schema that was being processed together with every underlying error.
    """

    def __init__(self, schema: InferredSchema, validation_errors: List[Exception]):
        self._schema = schema
        self._validation_errors = validation_errors

    @classmethod
    def merge_exceptions(
        cls, exceptions: List["SchemaValidationException"]
    ) -> "SchemaValidationException":
        """
        Fold several exceptions into one that holds all of their errors, in order.

        The schema of the first exception is reused, on the assumption that every exception
        refers to the same schema. `exceptions` must not be empty.
        """
        collected_errors: List[Exception] = []
        for exception in exceptions:
            collected_errors.extend(exception._validation_errors)
        return SchemaValidationException(exceptions[0].schema, collected_errors)

    @property
    def schema(self) -> InferredSchema:
        """The schema that failed validation."""
        return self._schema

    @property
    def validation_errors(self) -> List[str]:
        """The underlying errors rendered as strings."""
        return [str(error) for error in self._validation_errors]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class SchemaInferrer:
    """
    This class is used to infer a JSON schema which fits all the records passed into it
    throughout its lifecycle via the accumulate method.

    Instances of this class are stateful, meaning they build their inferred schemas
    from every record passed into the accumulate method.

    One schema builder is kept per stream name. `pk` and `cursor_field` are composite keys, i.e.
    lists of paths where each path is a list of nested field names. When a schema is retrieved
    through `get_stream_schema`, every field along those paths is marked as required and stops
    being nullable.
    """

    stream_to_builder: Dict[str, SchemaBuilder]

    def __init__(
        self, pk: Optional[List[List[str]]] = None, cursor_field: Optional[List[List[str]]] = None
    ) -> None:
        self.stream_to_builder = defaultdict(NoRequiredSchemaBuilder)
        self._pk = [] if pk is None else pk
        self._cursor_field = [] if cursor_field is None else cursor_field

    def accumulate(self, record: AirbyteRecordMessage) -> None:
        """Uses the input record to add to the inferred schemas maintained by this object"""
        self.stream_to_builder[record.stream].add_object(record.data)

    def _null_type_in_any_of(self, node: InferredSchema) -> bool:
        """Tell whether the node's `anyOf` holds an entry that is exactly `{"type": "null"}`."""
        if _ANY_OF in node:
            return {_TYPE: _NULL_TYPE} in node[_ANY_OF]
        else:
            return False

    def _remove_type_from_any_of(self, node: InferredSchema) -> None:
        """Drop `type` from a node that still has an `anyOf` (undoes the placeholder set by `_clean_any_of`)."""
        if _ANY_OF in node:
            node.pop(_TYPE, None)

    def _clean_any_of(self, node: InferredSchema) -> None:
        """
        Simplify a two-entry `anyOf`.

        If one of the two entries is exactly `{"type": "null"}`, the node is replaced by the other
        entry made nullable and `anyOf` is removed. Otherwise a placeholder `type` is set so the
        remaining checks in `_clean` can run; `_remove_type_from_any_of` removes it afterwards.
        `anyOf` lists of any other length are left untouched.
        """
        if len(node[_ANY_OF]) == 2 and self._null_type_in_any_of(node):
            real_type = (
                node[_ANY_OF][1] if node[_ANY_OF][0][_TYPE] == _NULL_TYPE else node[_ANY_OF][0]
            )
            node.update(real_type)
            node[_TYPE] = [node[_TYPE], _NULL_TYPE]
            node.pop(_ANY_OF)
        # populate `type` for `anyOf` if it's not present to pass all other checks
        elif len(node[_ANY_OF]) == 2 and not self._null_type_in_any_of(node):
            node[_TYPE] = [_NULL_TYPE]

    def _clean_properties(self, node: InferredSchema) -> None:
        """Remove properties whose type is exactly "null" and recursively clean all the others."""
        # Iterate over a copy because entries are popped during the loop
        for key, value in list(node[_PROPERTIES].items()):
            if isinstance(value, dict) and value.get(_TYPE, None) == _NULL_TYPE:
                node[_PROPERTIES].pop(key)
            else:
                self._clean(value)

    def _ensure_null_type_on_top(self, node: InferredSchema) -> None:
        """Make the node nullable, keeping "null" as the last entry of its type list."""
        if _TYPE not in node:
            # Nodes without a `type` (e.g. an `anyOf` with more than two alternatives) have
            # nothing to make nullable; indexing them used to raise a KeyError.
            return

        if isinstance(node[_TYPE], list):
            if _NULL_TYPE in node[_TYPE]:
                # we want to make sure null is always at the end as it makes schemas more readable
                node[_TYPE].remove(_NULL_TYPE)
            node[_TYPE].append(_NULL_TYPE)
        else:
            node[_TYPE] = [node[_TYPE], _NULL_TYPE]

    def _clean(self, node: InferredSchema) -> InferredSchema:
        """
        Recursively cleans up a produced schema:
        - remove anyOf if one of them is just a null value
        - remove properties of type "null"
        - make every remaining typed node nullable

        The node is modified in place and returned. Non-dict nodes are returned untouched.
        """

        if isinstance(node, dict):
            if _ANY_OF in node:
                self._clean_any_of(node)

            if _PROPERTIES in node and isinstance(node[_PROPERTIES], dict):
                self._clean_properties(node)

            if _ITEMS in node:
                self._clean(node[_ITEMS])

            # this check needs to follow the "anyOf" cleaning as it might populate `type`
            self._ensure_null_type_on_top(node)

            # remove added `type: ["null"]` for `anyOf` nested node
            self._remove_type_from_any_of(node)

        return node

    def _add_required_properties(self, node: InferredSchema) -> InferredSchema:
        """
        This method takes properties that should be marked as required (self._pk and self._cursor_field) and travel the schema to mark every
        node as required.

        :raises SchemaValidationException: aggregating the errors of every key that could not be marked as required
        """
        # Removing nullable for the root as when we call `_clean`, we make everything nullable
        node[_TYPE] = _OBJECT_TYPE

        exceptions = []
        # Empty composite keys are skipped
        for field in [x for x in [self._pk, self._cursor_field] if x]:
            try:
                self._add_fields_as_required(node, field)
            except SchemaValidationException as exception:
                exceptions.append(exception)

        if exceptions:
            raise SchemaValidationException.merge_exceptions(exceptions)

        return node

    def _add_fields_as_required(self, node: InferredSchema, composite_key: List[List[str]]) -> None:
        """
        Take a list of nested keys (this list represents a composite key) and travel the schema to mark every node as required.

        Every path is attempted even if an earlier one failed, so that all problems are reported at once.

        :raises SchemaValidationException: holding one error per path that could not be marked as required
        """
        errors: List[Exception] = []

        for path in composite_key:
            try:
                self._add_field_as_required(node, path)
            except ValueError as exception:
                errors.append(exception)

        if errors:
            raise SchemaValidationException(node, errors)

    def _add_field_as_required(
        self, node: InferredSchema, path: List[str], traveled_path: Optional[List[str]] = None
    ) -> None:
        """
        Take a nested key and travel the schema to mark every node as required.

        :param node: schema node reached so far; modified in place
        :param path: field names still to be traversed below `node`
        :param traveled_path: field names already traversed, only used in error messages
        :raises ValueError: if the path cannot be followed through object nodes of the schema
        """
        self._remove_null_from_type(node)
        if self._is_leaf(path):
            return

        if not traveled_path:
            traveled_path = []

        if _PROPERTIES not in node:
            # This validation is only relevant when `traveled_path` is empty
            raise ValueError(
                f"Path {traveled_path} does not refer to an object but is `{node}` and hence {path} can't be marked as required."
            )

        next_node = path[0]
        if next_node not in node[_PROPERTIES]:
            raise ValueError(
                f"Path {traveled_path} does not have field `{next_node}` in the schema and hence can't be marked as required."
            )

        if _TYPE not in node:
            # We do not expect this case to happen but we added a specific error message just in case
            raise ValueError(
                f"Unknown schema error: {traveled_path} is expected to have a type but did not. Schema inferrence is probably broken"
            )

        if node[_TYPE] not in [
            _OBJECT_TYPE,
            [_NULL_TYPE, _OBJECT_TYPE],
            [_OBJECT_TYPE, _NULL_TYPE],
        ]:
            # Report the type of the node located at `traveled_path`, i.e. the one that failed the check
            raise ValueError(
                f"Path {traveled_path} is expected to be an object but was of type `{node[_TYPE]}`"
            )

        if _REQUIRED not in node or not node[_REQUIRED]:
            node[_REQUIRED] = [next_node]
        elif next_node not in node[_REQUIRED]:
            node[_REQUIRED].append(next_node)

        traveled_path.append(next_node)
        self._add_field_as_required(node[_PROPERTIES][next_node], path[1:], traveled_path)

    def _is_leaf(self, path: List[str]) -> bool:
        """A path with no remaining field names means the target node has been reached."""
        return len(path) == 0

    def _remove_null_from_type(self, node: InferredSchema) -> None:
        """
        Make the node non-nullable: drop "null" from a list-valued `type` and unwrap the list
        when a single type remains.

        Nodes without a `type` (e.g. nodes that kept an `anyOf`) are left untouched instead of
        raising a KeyError, so callers can report them through their own ValueError checks.
        """
        node_type = node.get(_TYPE)
        if not isinstance(node_type, list):
            return

        if _NULL_TYPE in node_type:
            node_type.remove(_NULL_TYPE)
        if len(node_type) == 1:
            node[_TYPE] = node_type[0]

    def get_stream_schema(self, stream_name: str) -> Optional[InferredSchema]:
        """
        Returns the inferred JSON schema for the specified stream. Might be `None` if there were no records for the given stream name.

        :raises SchemaValidationException: if the primary key or cursor field can't be marked as required
        """
        # The membership test avoids creating a builder as a side effect of the defaultdict lookup
        return (
            self._add_required_properties(
                self._clean(self.stream_to_builder[stream_name].to_schema())
            )
            if stream_name in self.stream_to_builder
            else None
        )
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
from typing import Any, Final, Mapping, Optional
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class SliceEncoder(json.JSONEncoder):
    """JSON encoder that supports objects exposing a `__json_serializable__()` method."""

    def default(self, obj: Any) -> Any:
        """Serialize `obj` through its `__json_serializable__()` hook when it has one."""
        if not hasattr(obj, "__json_serializable__"):
            # Let the base class default method raise the TypeError
            return super().default(obj)

        return obj.__json_serializable__()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SliceHasher:
    """Computes a deterministic 64-bit integer hash for a stream name and an optional slice."""

    _ENCODING: Final = "utf-8"

    @classmethod
    def hash(
        cls,
        stream_name: str = "<stream name not provided>",
        stream_slice: Optional[Mapping[str, Any]] = None,
    ) -> int:
        """
        Hash the stream name together with the key-sorted JSON form of the slice.

        A missing or empty slice hashes the stream name alone.

        Note that streams partition with the same slicing value but with different names might collapse if stream name is not provided

        :raises ValueError: if the slice cannot be serialized to JSON
        """
        if not stream_slice:
            payload = stream_name.encode(cls._ENCODING)
        else:
            try:
                serialized_slice = json.dumps(stream_slice, sort_keys=True, cls=SliceEncoder)
                payload = f"{stream_name}:{serialized_slice}".encode(cls._ENCODING)
            except TypeError as e:
                raise ValueError(f"Failed to serialize stream slice: {e}")

        # Use last 8 bytes as 64-bit integer for better distribution
        digest = hashlib.sha256(payload).digest()
        return int.from_bytes(digest[-8:], "big")
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from jsonschema import RefResolver
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def resolve_refs(schema: dict[str, Any]) -> dict[str, Any]:
    """
    For spec schemas generated using Pydantic models, the resulting JSON schema can contain refs between object
    relationships.

    Every `{"$ref": "#/definitions/..."}` block found in the serialized schema is replaced by the
    definition it points to, and the top-level "definitions" section is dropped from the result.
    The input schema is not modified.

    :param schema: JSON schema, possibly holding a "definitions" section and refs into it
    :return: a new schema with the refs inlined and without the "definitions" section
    """
    json_schema_ref_resolver = RefResolver.from_schema(schema)
    # The substitution works on the serialized form so that a ref is replaced wherever it is nested
    str_schema = json.dumps(schema)
    for ref_block in re.findall(r'{"\$ref": "#\/definitions\/.+?(?="})"}', str_schema):
        ref = json.loads(ref_block)["$ref"]
        # `resolve` returns a (url, resolved document) pair; only the document is needed
        str_schema = str_schema.replace(
            ref_block, json.dumps(json_schema_ref_resolver.resolve(ref)[1])
        )
    pyschema: dict[str, Any] = json.loads(str_schema)
    # Schemas without nested models have no "definitions" section; tolerate that instead of raising KeyError
    pyschema.pop("definitions", None)
    return pyschema
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from typing import List, Optional, Union
|
|
8
|
+
|
|
9
|
+
from airbyte_cdk.models import (
|
|
10
|
+
AirbyteMessage,
|
|
11
|
+
AirbyteStream,
|
|
12
|
+
AirbyteStreamStatus,
|
|
13
|
+
AirbyteStreamStatusReason,
|
|
14
|
+
AirbyteStreamStatusTraceMessage,
|
|
15
|
+
AirbyteTraceMessage,
|
|
16
|
+
StreamDescriptor,
|
|
17
|
+
TraceType,
|
|
18
|
+
)
|
|
19
|
+
from airbyte_cdk.models import Type as MessageType
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def as_airbyte_message(
    stream: Union[AirbyteStream, StreamDescriptor],
    current_status: AirbyteStreamStatus,
    reasons: Optional[List[AirbyteStreamStatusReason]] = None,
) -> AirbyteMessage:
    """
    Wrap a stream status into a TRACE AirbyteMessage for the provided stream.

    :param stream: the stream (or its descriptor) the status refers to; only `name` and `namespace` are read
    :param current_status: the status to report
    :param reasons: optional reasons attached to the status
    :return: a TRACE message of trace type STREAM_STATUS, stamped with the current time in milliseconds
    """
    emitted_at_millis = datetime.now().timestamp() * 1000.0

    descriptor = StreamDescriptor(name=stream.name, namespace=stream.namespace)
    status_message = AirbyteStreamStatusTraceMessage(
        stream_descriptor=descriptor,
        status=current_status,
        reasons=reasons,
    )
    trace = AirbyteTraceMessage(
        type=TraceType.STREAM_STATUS,
        emitted_at=emitted_at_millis,
        stream_status=status_message,
    )

    return AirbyteMessage(type=MessageType.TRACE, trace=trace)
|