airbyte-cdk 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +358 -0
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +104 -0
- airbyte_cdk/connector.py +123 -0
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/__init__.py +3 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
- airbyte_cdk/connector_builder/main.py +107 -0
- airbyte_cdk/connector_builder/models.py +73 -0
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +83 -0
- airbyte_cdk/destinations/__init__.py +8 -0
- airbyte_cdk/destinations/destination.py +154 -0
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
- airbyte_cdk/destinations/vector_db_based/config.py +298 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
- airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
- airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
- airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
- airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
- airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
- airbyte_cdk/entrypoint.py +414 -0
- airbyte_cdk/exception_handler.py +56 -0
- airbyte_cdk/logger.py +109 -0
- airbyte_cdk/models/__init__.py +72 -0
- airbyte_cdk/models/airbyte_protocol.py +88 -0
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/well_known_types.py +5 -0
- airbyte_cdk/py.typed +0 -0
- airbyte_cdk/sources/__init__.py +26 -0
- airbyte_cdk/sources/abstract_source.py +326 -0
- airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
- airbyte_cdk/sources/config.py +27 -0
- airbyte_cdk/sources/connector_state_manager.py +161 -0
- airbyte_cdk/sources/declarative/__init__.py +3 -0
- airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
- airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
- airbyte_cdk/sources/declarative/auth/token.py +267 -0
- airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
- airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
- airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
- airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
- airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
- airbyte_cdk/sources/declarative/declarative_source.py +36 -0
- airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
- airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
- airbyte_cdk/sources/declarative/exceptions.py +9 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
- airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
- airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
- airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
- airbyte_cdk/sources/declarative/requesters/README.md +56 -0
- airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
- airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
- airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
- airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
- airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
- airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
- airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
- airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
- airbyte_cdk/sources/declarative/spec/spec.py +48 -0
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
- airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
- airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
- airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
- airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
- airbyte_cdk/sources/declarative/types.py +25 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
- airbyte_cdk/sources/file_based/config/__init__.py +0 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
- airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
- airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
- airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
- airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
- airbyte_cdk/sources/file_based/exceptions.py +159 -0
- airbyte_cdk/sources/file_based/file_based_source.py +466 -0
- airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
- airbyte_cdk/sources/file_based/file_record_data.py +22 -0
- airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
- airbyte_cdk/sources/file_based/remote_file.py +18 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
- airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
- airbyte_cdk/sources/file_based/types.py +10 -0
- airbyte_cdk/sources/http_config.py +10 -0
- airbyte_cdk/sources/http_logger.py +55 -0
- airbyte_cdk/sources/message/__init__.py +19 -0
- airbyte_cdk/sources/message/repository.py +137 -0
- airbyte_cdk/sources/source.py +95 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/__init__.py +8 -0
- airbyte_cdk/sources/streams/availability_strategy.py +84 -0
- airbyte_cdk/sources/streams/call_rate.py +704 -0
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
- airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
- airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
- airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/core.py +703 -0
- airbyte_cdk/sources/streams/http/__init__.py +10 -0
- airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +61 -0
- airbyte_cdk/sources/streams/http/http.py +673 -0
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/sources/streams/utils/__init__.py +3 -0
- airbyte_cdk/sources/types.py +169 -0
- airbyte_cdk/sources/utils/__init__.py +7 -0
- airbyte_cdk/sources/utils/casing.py +12 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +53 -0
- airbyte_cdk/sources/utils/schema_helpers.py +230 -0
- airbyte_cdk/sources/utils/slice_logger.py +57 -0
- airbyte_cdk/sources/utils/transform.py +277 -0
- airbyte_cdk/sources/utils/types.py +7 -0
- airbyte_cdk/sql/__init__.py +0 -0
- airbyte_cdk/sql/_util/__init__.py +0 -0
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/__init__.py +7 -0
- airbyte_cdk/test/catalog_builder.py +81 -0
- airbyte_cdk/test/entrypoint_wrapper.py +250 -0
- airbyte_cdk/test/mock_http/__init__.py +6 -0
- airbyte_cdk/test/mock_http/matcher.py +41 -0
- airbyte_cdk/test/mock_http/mocker.py +185 -0
- airbyte_cdk/test/mock_http/request.py +103 -0
- airbyte_cdk/test/mock_http/response.py +28 -0
- airbyte_cdk/test/mock_http/response_builder.py +237 -0
- airbyte_cdk/test/state_builder.py +33 -0
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +10 -0
- airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
- airbyte_cdk/utils/analytics_message.py +25 -0
- airbyte_cdk/utils/constants.py +5 -0
- airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/event_timing.py +85 -0
- airbyte_cdk/utils/is_cloud_environment.py +18 -0
- airbyte_cdk/utils/mapping_helpers.py +162 -0
- airbyte_cdk/utils/message_utils.py +26 -0
- airbyte_cdk/utils/oneof_option_config.py +33 -0
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +270 -0
- airbyte_cdk/utils/slice_hasher.py +37 -0
- airbyte_cdk/utils/spec_schema_transformations.py +26 -0
- airbyte_cdk/utils/stream_status_utils.py +43 -0
- airbyte_cdk/utils/traced_exception.py +145 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
- airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
- airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
- airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
- airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import itertools
|
|
6
|
+
import traceback
|
|
7
|
+
from typing import Any, Iterable, Iterator, Tuple, Union
|
|
8
|
+
|
|
9
|
+
from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStream
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def format_exception(exception: Exception) -> str:
    """Render an exception as its message, a newline, and its formatted traceback."""
    message = str(exception)
    # TracebackException.format() yields the traceback text piece by piece; glue the pieces together.
    trace_text = "".join(traceback.TracebackException.from_exception(exception).format())
    return "\n".join((message, trace_text))
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def create_chunks(iterable: Iterable[Any], batch_size: int) -> Iterator[Tuple[Any, ...]]:
    """Lazily split an iterable into consecutive tuples holding at most batch_size items each.

    Iteration stops as soon as an empty tuple would be produced, so an empty iterable
    (or a batch_size of 0) yields nothing.
    """
    source = iter(iterable)

    def take_batch() -> Tuple[Any, ...]:
        # Pull the next (up to) batch_size items from the shared iterator.
        return tuple(itertools.islice(source, batch_size))

    # Two-argument iter() keeps calling take_batch until it returns the empty-tuple sentinel.
    for batch in iter(take_batch, ()):
        yield batch
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def create_stream_identifier(stream: Union[AirbyteStream, AirbyteRecordMessage]) -> str:
    """
    Build the identifier string for a stream definition or a record message.

    The identifier is the stream name, prefixed with "<namespace>_" when a namespace is set.
    An AirbyteStream exposes its name as `name`; otherwise the name is read from `stream`.
    """
    stream_name = stream.name if isinstance(stream, AirbyteStream) else stream.stream
    if stream.namespace is None:
        return str(stream_name)
    return str(f"{stream.namespace}_{stream_name}")
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from typing import Dict, Iterable, List, Tuple
|
|
8
|
+
|
|
9
|
+
from airbyte_cdk.destinations.vector_db_based.config import ProcessingConfigModel
|
|
10
|
+
from airbyte_cdk.destinations.vector_db_based.document_processor import Chunk, DocumentProcessor
|
|
11
|
+
from airbyte_cdk.destinations.vector_db_based.embedder import Document, Embedder
|
|
12
|
+
from airbyte_cdk.destinations.vector_db_based.indexer import Indexer
|
|
13
|
+
from airbyte_cdk.models import AirbyteMessage, ConfiguredAirbyteCatalog, Type
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Writer:
    """
    The Writer class coordinates the document processor, the embedder and the indexer:
    * Each incoming record is passed through the document processor, which turns it into chunks
    * Chunks are buffered per (namespace, stream); once the configured batch size is reached, the buffer is flushed
    * On a flush, the indexer first deletes the ids collected for the buffered records
    * The embedder then embeds the buffered chunks and the indexer indexes them

    The destination connector is responsible for creating a writer instance and passing the input messages iterable to the write method.
    The batch size can be configured by the destination connector to give the freedom of either letting the user configure it or hardcoding it to a sensible value depending on the destination.
    The omit_raw_text parameter can be used to omit the raw text from the chunks. This can be useful if the raw text is very large and not needed for the destination.
    """

    def __init__(
        self,
        processing_config: ProcessingConfigModel,
        indexer: Indexer,
        embedder: Embedder,
        batch_size: int,
        omit_raw_text: bool,
    ) -> None:
        self.processing_config = processing_config
        self.indexer = indexer
        self.embedder = embedder
        self.batch_size = batch_size
        self.omit_raw_text = omit_raw_text
        # Start with empty buffers.
        self._init_batch()

    def _init_batch(self) -> None:
        """
        Reset the per-(namespace, stream) buffers and the chunk counter.
        """
        self.chunks: Dict[Tuple[str, str], List[Chunk]] = defaultdict(list)
        self.ids_to_delete: Dict[Tuple[str, str], List[str]] = defaultdict(list)
        self.number_of_chunks = 0

    def _convert_to_document(self, chunk: Chunk) -> Document:
        """
        Wrap a chunk into the Document shape handed to the embedder.

        Raises a ValueError if the chunk has no page content.
        """
        content = chunk.page_content
        if content is None:
            raise ValueError("Cannot embed a chunk without page content")
        return Document(page_content=content, record=chunk.record)

    def _process_batch(self) -> None:
        """
        Flush the buffers: delete the collected ids, embed and index the buffered chunks, then reset.
        """
        # All deletions are issued before anything new gets indexed.
        for stream_key, stale_ids in self.ids_to_delete.items():
            namespace, stream = stream_key
            self.indexer.delete(stale_ids, namespace, stream)

        for stream_key, buffered in self.chunks.items():
            namespace, stream = stream_key
            documents = [self._convert_to_document(chunk) for chunk in buffered]
            embeddings = self.embedder.embed_documents(documents)
            for position, chunk in enumerate(buffered):
                chunk.embedding = embeddings[position]
                if self.omit_raw_text:
                    # The text was only needed to compute the embedding.
                    chunk.page_content = None
            self.indexer.index(buffered, namespace, stream)

        self._init_batch()

    def write(
        self, configured_catalog: ConfiguredAirbyteCatalog, input_messages: Iterable[AirbyteMessage]
    ) -> Iterable[AirbyteMessage]:
        """
        Consume the input messages, buffer and flush record chunks, and yield state messages back.

        State messages are yielded only after the buffered records have been flushed. Once the input is
        exhausted, the remaining buffer is flushed and the messages returned by the indexer's post_sync are yielded.
        """
        self.processor = DocumentProcessor(self.processing_config, configured_catalog)
        self.indexer.pre_sync(configured_catalog)
        for message in input_messages:
            if message.type == Type.STATE:
                # A state message signals that every record before it has been written to the destination,
                # so flush the buffers first and only then pass the state on as safe to checkpoint.
                self._process_batch()
                yield message
                continue
            if message.type != Type.RECORD:
                # Other message types are ignored.
                continue

            record_chunks, record_id_to_delete = self.processor.process(message.record)
            stream_key = (
                message.record.namespace,  # type: ignore [union-attr]  # record not None
                message.record.stream,  # type: ignore [union-attr]  # record not None
            )
            self.chunks[stream_key].extend(record_chunks)  # type: ignore [index]  # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
            if record_id_to_delete is not None:
                self.ids_to_delete[stream_key].append(record_id_to_delete)  # type: ignore [index]  # expected "tuple[str, str]", got "tuple[str | Any | None, str | Any]"
            self.number_of_chunks += len(record_chunks)
            if self.number_of_chunks >= self.batch_size:
                self._process_batch()

        # Flush whatever is still buffered once the input is exhausted.
        self._process_batch()
        yield from self.indexer.post_sync()
|
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import importlib
|
|
7
|
+
import ipaddress
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
import os.path
|
|
11
|
+
import socket
|
|
12
|
+
import sys
|
|
13
|
+
import tempfile
|
|
14
|
+
from collections import defaultdict
|
|
15
|
+
from functools import wraps
|
|
16
|
+
from typing import Any, DefaultDict, Iterable, List, Mapping, Optional
|
|
17
|
+
from urllib.parse import urlparse
|
|
18
|
+
|
|
19
|
+
import orjson
|
|
20
|
+
import requests
|
|
21
|
+
from requests import PreparedRequest, Response, Session
|
|
22
|
+
|
|
23
|
+
from airbyte_cdk.connector import TConfig
|
|
24
|
+
from airbyte_cdk.exception_handler import init_uncaught_exception_handler
|
|
25
|
+
from airbyte_cdk.logger import PRINT_BUFFER, init_logger
|
|
26
|
+
from airbyte_cdk.models import (
|
|
27
|
+
AirbyteConnectionStatus,
|
|
28
|
+
AirbyteMessage,
|
|
29
|
+
AirbyteMessageSerializer,
|
|
30
|
+
AirbyteStateStats,
|
|
31
|
+
ConnectorSpecification,
|
|
32
|
+
FailureType,
|
|
33
|
+
Status,
|
|
34
|
+
Type,
|
|
35
|
+
)
|
|
36
|
+
from airbyte_cdk.sources import Source
|
|
37
|
+
from airbyte_cdk.sources.connector_state_manager import HashableStreamDescriptor
|
|
38
|
+
from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit, split_config
|
|
39
|
+
|
|
40
|
+
# from airbyte_cdk.utils import PrintBuffer, is_cloud_environment, message_utils # add PrintBuffer back once fixed
|
|
41
|
+
from airbyte_cdk.utils import is_cloud_environment, message_utils
|
|
42
|
+
from airbyte_cdk.utils.airbyte_secrets_utils import get_secrets, update_secrets
|
|
43
|
+
from airbyte_cdk.utils.constants import ENV_REQUEST_CACHE_PATH
|
|
44
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
|
45
|
+
|
|
46
|
+
# Module-level logger shared by the entrypoint helpers below
logger = init_logger("airbyte")

# URL schemes accepted by the request filter installed in _init_internal_request_filter; any other scheme is rejected
VALID_URL_SCHEMES = ["https"]
# NOTE(review): not referenced in the visible part of this module - presumably the deployment mode value; confirm usage
CLOUD_DEPLOYMENT_MODE = "cloud"
# Flipped to True by AirbyteEntrypoint.airbyte_message_to_string once the serialization warning has been logged
_HAS_LOGGED_FOR_SERIALIZATION_ERROR = False
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class AirbyteEntrypoint(object):
|
|
54
|
+
def __init__(self, source: Source):
|
|
55
|
+
init_uncaught_exception_handler(logger)
|
|
56
|
+
|
|
57
|
+
# Deployment mode is read when instantiating the entrypoint because it is the common path shared by syncs and connector builder test requests
|
|
58
|
+
if is_cloud_environment():
|
|
59
|
+
_init_internal_request_filter()
|
|
60
|
+
|
|
61
|
+
self.source = source
|
|
62
|
+
self.logger = logging.getLogger(f"airbyte.{getattr(source, 'name', '')}")
|
|
63
|
+
|
|
64
|
+
@staticmethod
|
|
65
|
+
def parse_args(args: List[str]) -> argparse.Namespace:
|
|
66
|
+
# set up parent parsers
|
|
67
|
+
parent_parser = argparse.ArgumentParser(add_help=False)
|
|
68
|
+
parent_parser.add_argument(
|
|
69
|
+
"--debug", action="store_true", help="enables detailed debug logs related to the sync"
|
|
70
|
+
)
|
|
71
|
+
main_parser = argparse.ArgumentParser()
|
|
72
|
+
subparsers = main_parser.add_subparsers(title="commands", dest="command")
|
|
73
|
+
|
|
74
|
+
# spec
|
|
75
|
+
subparsers.add_parser(
|
|
76
|
+
"spec", help="outputs the json configuration specification", parents=[parent_parser]
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
# check
|
|
80
|
+
check_parser = subparsers.add_parser(
|
|
81
|
+
"check", help="checks the config can be used to connect", parents=[parent_parser]
|
|
82
|
+
)
|
|
83
|
+
required_check_parser = check_parser.add_argument_group("required named arguments")
|
|
84
|
+
required_check_parser.add_argument(
|
|
85
|
+
"--config", type=str, required=True, help="path to the json configuration file"
|
|
86
|
+
)
|
|
87
|
+
|
|
88
|
+
# discover
|
|
89
|
+
discover_parser = subparsers.add_parser(
|
|
90
|
+
"discover",
|
|
91
|
+
help="outputs a catalog describing the source's schema",
|
|
92
|
+
parents=[parent_parser],
|
|
93
|
+
)
|
|
94
|
+
required_discover_parser = discover_parser.add_argument_group("required named arguments")
|
|
95
|
+
required_discover_parser.add_argument(
|
|
96
|
+
"--config", type=str, required=True, help="path to the json configuration file"
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
# read
|
|
100
|
+
read_parser = subparsers.add_parser(
|
|
101
|
+
"read", help="reads the source and outputs messages to STDOUT", parents=[parent_parser]
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
read_parser.add_argument(
|
|
105
|
+
"--state", type=str, required=False, help="path to the json-encoded state file"
|
|
106
|
+
)
|
|
107
|
+
required_read_parser = read_parser.add_argument_group("required named arguments")
|
|
108
|
+
required_read_parser.add_argument(
|
|
109
|
+
"--config", type=str, required=True, help="path to the json configuration file"
|
|
110
|
+
)
|
|
111
|
+
required_read_parser.add_argument(
|
|
112
|
+
"--catalog",
|
|
113
|
+
type=str,
|
|
114
|
+
required=True,
|
|
115
|
+
help="path to the catalog used to determine which data to read",
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
return main_parser.parse_args(args)
|
|
119
|
+
|
|
120
|
+
def run(self, parsed_args: argparse.Namespace) -> Iterable[str]:
|
|
121
|
+
cmd = parsed_args.command
|
|
122
|
+
if not cmd:
|
|
123
|
+
raise Exception("No command passed")
|
|
124
|
+
|
|
125
|
+
if hasattr(parsed_args, "debug") and parsed_args.debug:
|
|
126
|
+
self.logger.setLevel(logging.DEBUG)
|
|
127
|
+
logger.setLevel(logging.DEBUG)
|
|
128
|
+
self.logger.debug("Debug logs enabled")
|
|
129
|
+
else:
|
|
130
|
+
self.logger.setLevel(logging.INFO)
|
|
131
|
+
|
|
132
|
+
source_spec: ConnectorSpecification = self.source.spec(self.logger)
|
|
133
|
+
try:
|
|
134
|
+
with tempfile.TemporaryDirectory(
|
|
135
|
+
# Cleanup can fail on Windows due to file locks. Ignore if so,
|
|
136
|
+
# rather than failing the whole process.
|
|
137
|
+
ignore_cleanup_errors=True,
|
|
138
|
+
) as temp_dir:
|
|
139
|
+
os.environ[ENV_REQUEST_CACHE_PATH] = (
|
|
140
|
+
temp_dir # set this as default directory for request_cache to store *.sqlite files
|
|
141
|
+
)
|
|
142
|
+
if cmd == "spec":
|
|
143
|
+
message = AirbyteMessage(type=Type.SPEC, spec=source_spec)
|
|
144
|
+
yield from [
|
|
145
|
+
self.airbyte_message_to_string(queued_message)
|
|
146
|
+
for queued_message in self._emit_queued_messages(self.source)
|
|
147
|
+
]
|
|
148
|
+
yield self.airbyte_message_to_string(message)
|
|
149
|
+
else:
|
|
150
|
+
raw_config = self.source.read_config(parsed_args.config)
|
|
151
|
+
config = self.source.configure(raw_config, temp_dir)
|
|
152
|
+
|
|
153
|
+
yield from [
|
|
154
|
+
self.airbyte_message_to_string(queued_message)
|
|
155
|
+
for queued_message in self._emit_queued_messages(self.source)
|
|
156
|
+
]
|
|
157
|
+
if cmd == "check":
|
|
158
|
+
yield from map(
|
|
159
|
+
AirbyteEntrypoint.airbyte_message_to_string,
|
|
160
|
+
self.check(source_spec, config),
|
|
161
|
+
)
|
|
162
|
+
elif cmd == "discover":
|
|
163
|
+
yield from map(
|
|
164
|
+
AirbyteEntrypoint.airbyte_message_to_string,
|
|
165
|
+
self.discover(source_spec, config),
|
|
166
|
+
)
|
|
167
|
+
elif cmd == "read":
|
|
168
|
+
config_catalog = self.source.read_catalog(parsed_args.catalog)
|
|
169
|
+
state = self.source.read_state(parsed_args.state)
|
|
170
|
+
|
|
171
|
+
yield from map(
|
|
172
|
+
AirbyteEntrypoint.airbyte_message_to_string,
|
|
173
|
+
self.read(source_spec, config, config_catalog, state),
|
|
174
|
+
)
|
|
175
|
+
else:
|
|
176
|
+
raise Exception("Unexpected command " + cmd)
|
|
177
|
+
finally:
|
|
178
|
+
yield from [
|
|
179
|
+
self.airbyte_message_to_string(queued_message)
|
|
180
|
+
for queued_message in self._emit_queued_messages(self.source)
|
|
181
|
+
]
|
|
182
|
+
|
|
183
|
+
def check(
|
|
184
|
+
self, source_spec: ConnectorSpecification, config: TConfig
|
|
185
|
+
) -> Iterable[AirbyteMessage]:
|
|
186
|
+
self.set_up_secret_filter(config, source_spec.connectionSpecification)
|
|
187
|
+
try:
|
|
188
|
+
self.validate_connection(source_spec, config)
|
|
189
|
+
except AirbyteTracedException as traced_exc:
|
|
190
|
+
connection_status = traced_exc.as_connection_status_message()
|
|
191
|
+
# The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type not a config error
|
|
192
|
+
# If the failure is not exceptional, we'll emit a failed connection status message and return
|
|
193
|
+
if traced_exc.failure_type != FailureType.config_error:
|
|
194
|
+
raise traced_exc
|
|
195
|
+
if connection_status:
|
|
196
|
+
yield from self._emit_queued_messages(self.source)
|
|
197
|
+
yield connection_status
|
|
198
|
+
return
|
|
199
|
+
|
|
200
|
+
try:
|
|
201
|
+
check_result = self.source.check(self.logger, config)
|
|
202
|
+
except AirbyteTracedException as traced_exc:
|
|
203
|
+
yield traced_exc.as_airbyte_message()
|
|
204
|
+
# The platform uses the exit code to surface unexpected failures so we raise the exception if the failure type not a config error
|
|
205
|
+
# If the failure is not exceptional, we'll emit a failed connection status message and return
|
|
206
|
+
if traced_exc.failure_type != FailureType.config_error:
|
|
207
|
+
raise traced_exc
|
|
208
|
+
else:
|
|
209
|
+
yield AirbyteMessage(
|
|
210
|
+
type=Type.CONNECTION_STATUS,
|
|
211
|
+
connectionStatus=AirbyteConnectionStatus(
|
|
212
|
+
status=Status.FAILED, message=traced_exc.message
|
|
213
|
+
),
|
|
214
|
+
)
|
|
215
|
+
return
|
|
216
|
+
if check_result.status == Status.SUCCEEDED:
|
|
217
|
+
self.logger.info("Check succeeded")
|
|
218
|
+
else:
|
|
219
|
+
self.logger.error("Check failed")
|
|
220
|
+
|
|
221
|
+
yield from self._emit_queued_messages(self.source)
|
|
222
|
+
yield AirbyteMessage(type=Type.CONNECTION_STATUS, connectionStatus=check_result)
|
|
223
|
+
|
|
224
|
+
def discover(
|
|
225
|
+
self, source_spec: ConnectorSpecification, config: TConfig
|
|
226
|
+
) -> Iterable[AirbyteMessage]:
|
|
227
|
+
self.set_up_secret_filter(config, source_spec.connectionSpecification)
|
|
228
|
+
if self.source.check_config_against_spec:
|
|
229
|
+
self.validate_connection(source_spec, config)
|
|
230
|
+
catalog = self.source.discover(self.logger, config)
|
|
231
|
+
|
|
232
|
+
yield from self._emit_queued_messages(self.source)
|
|
233
|
+
yield AirbyteMessage(type=Type.CATALOG, catalog=catalog)
|
|
234
|
+
|
|
235
|
+
def read(
|
|
236
|
+
self, source_spec: ConnectorSpecification, config: TConfig, catalog: Any, state: list[Any]
|
|
237
|
+
) -> Iterable[AirbyteMessage]:
|
|
238
|
+
self.set_up_secret_filter(config, source_spec.connectionSpecification)
|
|
239
|
+
if self.source.check_config_against_spec:
|
|
240
|
+
self.validate_connection(source_spec, config)
|
|
241
|
+
|
|
242
|
+
# The Airbyte protocol dictates that counts be expressed as float/double to better protect against integer overflows
|
|
243
|
+
stream_message_counter: DefaultDict[HashableStreamDescriptor, float] = defaultdict(float)
|
|
244
|
+
for message in self.source.read(self.logger, config, catalog, state):
|
|
245
|
+
yield self.handle_record_counts(message, stream_message_counter)
|
|
246
|
+
for message in self._emit_queued_messages(self.source):
|
|
247
|
+
yield self.handle_record_counts(message, stream_message_counter)
|
|
248
|
+
|
|
249
|
+
@staticmethod
|
|
250
|
+
def handle_record_counts(
|
|
251
|
+
message: AirbyteMessage, stream_message_count: DefaultDict[HashableStreamDescriptor, float]
|
|
252
|
+
) -> AirbyteMessage:
|
|
253
|
+
match message.type:
|
|
254
|
+
case Type.RECORD:
|
|
255
|
+
if message.record is None:
|
|
256
|
+
raise ValueError("Record message must have a record attribute")
|
|
257
|
+
|
|
258
|
+
stream_message_count[
|
|
259
|
+
HashableStreamDescriptor(
|
|
260
|
+
name=message.record.stream, # type: ignore[union-attr] # record has `stream`
|
|
261
|
+
namespace=message.record.namespace, # type: ignore[union-attr] # record has `namespace`
|
|
262
|
+
)
|
|
263
|
+
] += 1.0
|
|
264
|
+
case Type.STATE:
|
|
265
|
+
if message.state is None:
|
|
266
|
+
raise ValueError("State message must have a state attribute")
|
|
267
|
+
|
|
268
|
+
stream_descriptor = message_utils.get_stream_descriptor(message)
|
|
269
|
+
|
|
270
|
+
# Set record count from the counter onto the state message
|
|
271
|
+
message.state.sourceStats = message.state.sourceStats or AirbyteStateStats() # type: ignore[union-attr] # state has `sourceStats`
|
|
272
|
+
message.state.sourceStats.recordCount = stream_message_count.get( # type: ignore[union-attr] # state has `sourceStats`
|
|
273
|
+
stream_descriptor, 0.0
|
|
274
|
+
)
|
|
275
|
+
|
|
276
|
+
# Reset the counter
|
|
277
|
+
stream_message_count[stream_descriptor] = 0.0
|
|
278
|
+
return message
|
|
279
|
+
|
|
280
|
+
@staticmethod
|
|
281
|
+
def validate_connection(source_spec: ConnectorSpecification, config: TConfig) -> None:
|
|
282
|
+
# Remove internal flags from config before validating so
|
|
283
|
+
# jsonschema's additionalProperties flag won't fail the validation
|
|
284
|
+
connector_config, _ = split_config(config)
|
|
285
|
+
check_config_against_spec_or_exit(connector_config, source_spec)
|
|
286
|
+
|
|
287
|
+
@staticmethod
|
|
288
|
+
def set_up_secret_filter(config: TConfig, connection_specification: Mapping[str, Any]) -> None:
|
|
289
|
+
# Now that we have the config, we can use it to get a list of ai airbyte_secrets
|
|
290
|
+
# that we should filter in logging to avoid leaking secrets
|
|
291
|
+
config_secrets = get_secrets(connection_specification, config)
|
|
292
|
+
update_secrets(config_secrets)
|
|
293
|
+
|
|
294
|
+
@staticmethod
|
|
295
|
+
def airbyte_message_to_string(airbyte_message: AirbyteMessage) -> str:
|
|
296
|
+
global _HAS_LOGGED_FOR_SERIALIZATION_ERROR
|
|
297
|
+
serialized_message = AirbyteMessageSerializer.dump(airbyte_message)
|
|
298
|
+
try:
|
|
299
|
+
return orjson.dumps(serialized_message).decode()
|
|
300
|
+
except Exception as exception:
|
|
301
|
+
if not _HAS_LOGGED_FOR_SERIALIZATION_ERROR:
|
|
302
|
+
logger.warning(
|
|
303
|
+
f"There was an error during the serialization of an AirbyteMessage: `{exception}`. This might impact the sync performances."
|
|
304
|
+
)
|
|
305
|
+
_HAS_LOGGED_FOR_SERIALIZATION_ERROR = True
|
|
306
|
+
return json.dumps(serialized_message)
|
|
307
|
+
|
|
308
|
+
@classmethod
|
|
309
|
+
def extract_state(cls, args: List[str]) -> Optional[Any]:
|
|
310
|
+
parsed_args = cls.parse_args(args)
|
|
311
|
+
if hasattr(parsed_args, "state"):
|
|
312
|
+
return parsed_args.state
|
|
313
|
+
return None
|
|
314
|
+
|
|
315
|
+
@classmethod
|
|
316
|
+
def extract_catalog(cls, args: List[str]) -> Optional[Any]:
|
|
317
|
+
parsed_args = cls.parse_args(args)
|
|
318
|
+
if hasattr(parsed_args, "catalog"):
|
|
319
|
+
return parsed_args.catalog
|
|
320
|
+
return None
|
|
321
|
+
|
|
322
|
+
@classmethod
|
|
323
|
+
def extract_config(cls, args: List[str]) -> Optional[Any]:
|
|
324
|
+
parsed_args = cls.parse_args(args)
|
|
325
|
+
if hasattr(parsed_args, "config"):
|
|
326
|
+
return parsed_args.config
|
|
327
|
+
return None
|
|
328
|
+
|
|
329
|
+
def _emit_queued_messages(self, source: Source) -> Iterable[AirbyteMessage]:
|
|
330
|
+
if hasattr(source, "message_repository") and source.message_repository:
|
|
331
|
+
yield from source.message_repository.consume_queue()
|
|
332
|
+
return
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def launch(source: Source, args: List[str]) -> None:
    """
    Run ``source`` with the given command-line arguments and print every produced message to STDOUT.

    :param source: the source implementation to run
    :param args: command-line arguments, without the program name
    """
    entrypoint = AirbyteEntrypoint(source)
    namespace = entrypoint.parse_args(args)

    # NOTE(review): the previous comment here said the PrintBuffer was temporarily removed because of weird print
    # behavior for concurrent syncs (https://github.com/airbytehq/oncall/issues/6235), yet PRINT_BUFFER is in
    # use - confirm which of the two is intended
    with PRINT_BUFFER:
        for serialized_message in entrypoint.run(namespace):
            # Python prints the message and the line break with two separate instructions, which creates issues
            # for the concurrent CDK. Appending `\n` to the message makes both go out at the same time
            print(f"{serialized_message}\n", end="")
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _init_internal_request_filter() -> None:
    """
    Wraps the Python requests library to prevent sending requests to internal URL endpoints.

    ``Session.send`` is replaced by a wrapper that rejects a request before it is sent when:
      * the URL scheme is not listed in VALID_URL_SCHEMES (raises ``requests.exceptions.InvalidSchema``)
      * the URL has no hostname, or the hostname lookup fails (raises ``requests.exceptions.InvalidURL``)
      * the hostname resolves to a private address (raises ``AirbyteTracedException`` with a config_error)

    The wrapper is installed at most once: calling this function while ``Session.send`` is already filtered
    is a no-op.
    """
    wrapped_fn = Session.send

    # AirbyteEntrypoint calls this on every instantiation. Without this guard each call would stack one more
    # wrapper on Session.send, repeating the validation and the DNS lookup for every single request
    if getattr(wrapped_fn, "_airbyte_internal_request_filter", False):
        return

    @wraps(wrapped_fn)
    def filtered_send(self: Any, request: PreparedRequest, **kwargs: Any) -> Response:
        parsed_url = urlparse(request.url)

        if parsed_url.scheme not in VALID_URL_SCHEMES:
            raise requests.exceptions.InvalidSchema(
                "Invalid Protocol Scheme: The endpoint that data is being requested from is using an invalid or insecure "
                + f"protocol {parsed_url.scheme!r}. Valid protocol schemes: {','.join(VALID_URL_SCHEMES)}"
            )

        if not parsed_url.hostname:
            raise requests.exceptions.InvalidURL(
                "Invalid URL specified: The endpoint that data is being requested from is not a valid URL"
            )

        try:
            is_private = _is_private_url(parsed_url.hostname, parsed_url.port)  # type: ignore [arg-type]
        except socket.gaierror as exception:
            # This is a special case where the developer specifies an IP address string that is not formatted correctly like trailing
            # whitespace which will fail the socket IP lookup. This only happens when using IP addresses and not text hostnames.
            # Knowing that this is a request using the requests library, we will mock the exception without calling the lib
            raise requests.exceptions.InvalidURL(
                f"Invalid URL {parsed_url}: {exception}"
            ) from exception

        if is_private:
            raise AirbyteTracedException(
                internal_message=f"Invalid URL endpoint: `{parsed_url.hostname!r}` belongs to a private network",
                failure_type=FailureType.config_error,
                message="Invalid URL endpoint: The endpoint that data is being requested from belongs to a private network. Source connectors only support requesting data from public API endpoints.",
            )

        return wrapped_fn(self, request, **kwargs)

    # Marker read by the guard above to recognize an already filtered Session.send
    filtered_send._airbyte_internal_request_filter = True  # type: ignore [attr-defined]
    Session.send = filtered_send  # type: ignore [method-assign]
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def _is_private_url(hostname: str, port: int) -> bool:
|
|
388
|
+
"""
|
|
389
|
+
Helper method that checks if any of the IP addresses associated with a hostname belong to a private network.
|
|
390
|
+
"""
|
|
391
|
+
address_info_entries = socket.getaddrinfo(hostname, port)
|
|
392
|
+
for entry in address_info_entries:
|
|
393
|
+
# getaddrinfo() returns entries in the form of a 5-tuple where the IP is stored as the sockaddr. For IPv4 this
|
|
394
|
+
# is a 2-tuple and for IPv6 it is a 4-tuple, but the address is always the first value of the tuple at 0.
|
|
395
|
+
# See https://docs.python.org/3/library/socket.html#socket.getaddrinfo for more details.
|
|
396
|
+
ip_address = entry[4][0]
|
|
397
|
+
if ipaddress.ip_address(ip_address).is_private:
|
|
398
|
+
return True
|
|
399
|
+
return False
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def main() -> None:
    """
    Instantiate the source implementation selected through the environment and launch it with ``sys.argv``.

    The module is taken from AIRBYTE_IMPL_MODULE and the class name from AIRBYTE_IMPL_PATH; both fall back to
    the module and name of ``Source``.

    :raises Exception: if the instantiated object is not a ``Source``
    """
    environment = os.environ
    module_name = environment.get("AIRBYTE_IMPL_MODULE", Source.__module__)
    class_name = environment.get("AIRBYTE_IMPL_PATH", Source.__name__)
    implementation = getattr(importlib.import_module(module_name), class_name)

    # set up and run entrypoint
    source = implementation()
    if not isinstance(source, Source):
        raise Exception("Source implementation provided does not implement Source class!")

    cli_args = sys.argv[1:]
    launch(source, cli_args)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import sys
|
|
7
|
+
from types import TracebackType
|
|
8
|
+
from typing import Any, List, Mapping, Optional
|
|
9
|
+
|
|
10
|
+
from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
|
|
11
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def assemble_uncaught_exception(
    exception_type: type[BaseException], exception_value: BaseException
) -> AirbyteTracedException:
    """
    Return ``exception_value`` as an AirbyteTracedException.

    :param exception_type: class of the exception, used to decide whether a conversion is needed
    :param exception_value: the exception instance
    :return: ``exception_value`` itself when its type is already an AirbyteTracedException subclass, otherwise
        the result of ``AirbyteTracedException.from_exception``
    """
    already_traced = issubclass(exception_type, AirbyteTracedException)
    if not already_traced:
        return AirbyteTracedException.from_exception(exception_value)
    return exception_value  # type: ignore  # validated by the issubclass check above
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def init_uncaught_exception_handler(logger: logging.Logger) -> None:
    """
    Install a ``sys.excepthook`` that logs uncaught exceptions through ``logger`` and emits them as a message.

    KeyboardInterrupt is handed to the interpreter's default hook instead. Any other exception is logged at
    fatal level, converted with ``assemble_uncaught_exception`` and emitted via ``emit_message()``.

    :param logger: logger that receives the uncaught exceptions
    """

    def hook_fn(
        exception_type: type[BaseException],
        exception_value: BaseException,
        traceback_: Optional[TracebackType],
    ) -> Any:
        if not issubclass(exception_type, KeyboardInterrupt):
            logger.fatal(exception_value, exc_info=exception_value)

            # emit an AirbyteTraceMessage for any exception that gets to this spot
            assemble_uncaught_exception(exception_type, exception_value).emit_message()
            return

        # For developer ergonomics, we want to see the stack trace in the logs when we do a ctrl-c
        sys.__excepthook__(exception_type, exception_value, traceback_)

    sys.excepthook = hook_fn
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def generate_failed_streams_error_message(stream_failures: Mapping[str, List[Exception]]) -> str:
    """
    Build the summary message listing every exception of every failed stream.

    :param stream_failures: exceptions raised during the sync, grouped by stream name
    :return: a fixed introduction followed by one ``<stream>: <exception repr>`` line per exception, each repr
        being passed through ``filter_secrets`` first
    """
    failure_lines = []
    for stream, exceptions in stream_failures.items():
        for exception in exceptions:
            failure_lines.append(f"{stream}: {filter_secrets(repr(exception))}")

    failures = "\n".join(failure_lines)
    return f"During the sync, the following streams did not sync successfully: {failures}"
|