airbyte-cdk 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +358 -0
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +104 -0
- airbyte_cdk/connector.py +123 -0
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/__init__.py +3 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
- airbyte_cdk/connector_builder/main.py +107 -0
- airbyte_cdk/connector_builder/models.py +73 -0
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +83 -0
- airbyte_cdk/destinations/__init__.py +8 -0
- airbyte_cdk/destinations/destination.py +154 -0
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
- airbyte_cdk/destinations/vector_db_based/config.py +298 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
- airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
- airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
- airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
- airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
- airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
- airbyte_cdk/entrypoint.py +414 -0
- airbyte_cdk/exception_handler.py +56 -0
- airbyte_cdk/logger.py +109 -0
- airbyte_cdk/models/__init__.py +72 -0
- airbyte_cdk/models/airbyte_protocol.py +88 -0
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/well_known_types.py +5 -0
- airbyte_cdk/py.typed +0 -0
- airbyte_cdk/sources/__init__.py +26 -0
- airbyte_cdk/sources/abstract_source.py +326 -0
- airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
- airbyte_cdk/sources/config.py +27 -0
- airbyte_cdk/sources/connector_state_manager.py +161 -0
- airbyte_cdk/sources/declarative/__init__.py +3 -0
- airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
- airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
- airbyte_cdk/sources/declarative/auth/token.py +267 -0
- airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
- airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
- airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
- airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
- airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
- airbyte_cdk/sources/declarative/declarative_source.py +36 -0
- airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
- airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
- airbyte_cdk/sources/declarative/exceptions.py +9 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
- airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
- airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
- airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
- airbyte_cdk/sources/declarative/requesters/README.md +56 -0
- airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
- airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
- airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
- airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
- airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
- airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
- airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
- airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
- airbyte_cdk/sources/declarative/spec/spec.py +48 -0
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
- airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
- airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
- airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
- airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
- airbyte_cdk/sources/declarative/types.py +25 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
- airbyte_cdk/sources/file_based/config/__init__.py +0 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
- airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
- airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
- airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
- airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
- airbyte_cdk/sources/file_based/exceptions.py +159 -0
- airbyte_cdk/sources/file_based/file_based_source.py +466 -0
- airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
- airbyte_cdk/sources/file_based/file_record_data.py +22 -0
- airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
- airbyte_cdk/sources/file_based/remote_file.py +18 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
- airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
- airbyte_cdk/sources/file_based/types.py +10 -0
- airbyte_cdk/sources/http_config.py +10 -0
- airbyte_cdk/sources/http_logger.py +55 -0
- airbyte_cdk/sources/message/__init__.py +19 -0
- airbyte_cdk/sources/message/repository.py +137 -0
- airbyte_cdk/sources/source.py +95 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/__init__.py +8 -0
- airbyte_cdk/sources/streams/availability_strategy.py +84 -0
- airbyte_cdk/sources/streams/call_rate.py +704 -0
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
- airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
- airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
- airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/core.py +703 -0
- airbyte_cdk/sources/streams/http/__init__.py +10 -0
- airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +61 -0
- airbyte_cdk/sources/streams/http/http.py +673 -0
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/sources/streams/utils/__init__.py +3 -0
- airbyte_cdk/sources/types.py +169 -0
- airbyte_cdk/sources/utils/__init__.py +7 -0
- airbyte_cdk/sources/utils/casing.py +12 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +53 -0
- airbyte_cdk/sources/utils/schema_helpers.py +230 -0
- airbyte_cdk/sources/utils/slice_logger.py +57 -0
- airbyte_cdk/sources/utils/transform.py +277 -0
- airbyte_cdk/sources/utils/types.py +7 -0
- airbyte_cdk/sql/__init__.py +0 -0
- airbyte_cdk/sql/_util/__init__.py +0 -0
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/__init__.py +7 -0
- airbyte_cdk/test/catalog_builder.py +81 -0
- airbyte_cdk/test/entrypoint_wrapper.py +250 -0
- airbyte_cdk/test/mock_http/__init__.py +6 -0
- airbyte_cdk/test/mock_http/matcher.py +41 -0
- airbyte_cdk/test/mock_http/mocker.py +185 -0
- airbyte_cdk/test/mock_http/request.py +103 -0
- airbyte_cdk/test/mock_http/response.py +28 -0
- airbyte_cdk/test/mock_http/response_builder.py +237 -0
- airbyte_cdk/test/state_builder.py +33 -0
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +10 -0
- airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
- airbyte_cdk/utils/analytics_message.py +25 -0
- airbyte_cdk/utils/constants.py +5 -0
- airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/event_timing.py +85 -0
- airbyte_cdk/utils/is_cloud_environment.py +18 -0
- airbyte_cdk/utils/mapping_helpers.py +162 -0
- airbyte_cdk/utils/message_utils.py +26 -0
- airbyte_cdk/utils/oneof_option_config.py +33 -0
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +270 -0
- airbyte_cdk/utils/slice_hasher.py +37 -0
- airbyte_cdk/utils/spec_schema_transformations.py +26 -0
- airbyte_cdk/utils/stream_status_utils.py +43 -0
- airbyte_cdk/utils/traced_exception.py +145 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
- airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
- airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
- airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
- airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from io import IOBase
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
|
|
9
|
+
|
|
10
|
+
import orjson
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from numpy import datetime64, issubdtype
|
|
13
|
+
from numpy import dtype as dtype_
|
|
14
|
+
from pydantic.v1 import BaseModel
|
|
15
|
+
|
|
16
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
|
|
17
|
+
ExcelFormat,
|
|
18
|
+
FileBasedStreamConfig,
|
|
19
|
+
)
|
|
20
|
+
from airbyte_cdk.sources.file_based.exceptions import (
|
|
21
|
+
ConfigValidationError,
|
|
22
|
+
FileBasedSourceError,
|
|
23
|
+
RecordParseError,
|
|
24
|
+
)
|
|
25
|
+
from airbyte_cdk.sources.file_based.file_based_stream_reader import (
|
|
26
|
+
AbstractFileBasedStreamReader,
|
|
27
|
+
FileReadMode,
|
|
28
|
+
)
|
|
29
|
+
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
|
30
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
31
|
+
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ExcelParser(FileTypeParser):
|
|
35
|
+
ENCODING = None
|
|
36
|
+
|
|
37
|
+
def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
|
|
38
|
+
"""
|
|
39
|
+
ExcelParser does not require config checks, implicit pydantic validation is enough.
|
|
40
|
+
"""
|
|
41
|
+
return True, None
|
|
42
|
+
|
|
43
|
+
async def infer_schema(
|
|
44
|
+
self,
|
|
45
|
+
config: FileBasedStreamConfig,
|
|
46
|
+
file: RemoteFile,
|
|
47
|
+
stream_reader: AbstractFileBasedStreamReader,
|
|
48
|
+
logger: logging.Logger,
|
|
49
|
+
) -> SchemaType:
|
|
50
|
+
"""
|
|
51
|
+
Infers the schema of the Excel file by examining its contents.
|
|
52
|
+
|
|
53
|
+
Args:
|
|
54
|
+
config (FileBasedStreamConfig): Configuration for the file-based stream.
|
|
55
|
+
file (RemoteFile): The remote file to be read.
|
|
56
|
+
stream_reader (AbstractFileBasedStreamReader): Reader to read the file.
|
|
57
|
+
logger (logging.Logger): Logger for logging information and errors.
|
|
58
|
+
|
|
59
|
+
Returns:
|
|
60
|
+
SchemaType: Inferred schema of the Excel file.
|
|
61
|
+
"""
|
|
62
|
+
|
|
63
|
+
# Validate the format of the config
|
|
64
|
+
self.validate_format(config.format, logger)
|
|
65
|
+
|
|
66
|
+
fields: Dict[str, str] = {}
|
|
67
|
+
|
|
68
|
+
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
|
|
69
|
+
df = self.open_and_parse_file(fp)
|
|
70
|
+
for column, df_type in df.dtypes.items():
|
|
71
|
+
# Choose the broadest data type if the column's data type differs in dataframes
|
|
72
|
+
prev_frame_column_type = fields.get(column) # type: ignore [call-overload]
|
|
73
|
+
fields[column] = self.dtype_to_json_type( # type: ignore [index]
|
|
74
|
+
prev_frame_column_type,
|
|
75
|
+
df_type,
|
|
76
|
+
)
|
|
77
|
+
|
|
78
|
+
schema = {
|
|
79
|
+
field: (
|
|
80
|
+
{"type": "string", "format": "date-time"}
|
|
81
|
+
if fields[field] == "date-time"
|
|
82
|
+
else {"type": fields[field]}
|
|
83
|
+
)
|
|
84
|
+
for field in fields
|
|
85
|
+
}
|
|
86
|
+
return schema
|
|
87
|
+
|
|
88
|
+
def parse_records(
|
|
89
|
+
self,
|
|
90
|
+
config: FileBasedStreamConfig,
|
|
91
|
+
file: RemoteFile,
|
|
92
|
+
stream_reader: AbstractFileBasedStreamReader,
|
|
93
|
+
logger: logging.Logger,
|
|
94
|
+
discovered_schema: Optional[Mapping[str, SchemaType]] = None,
|
|
95
|
+
) -> Iterable[Dict[str, Any]]:
|
|
96
|
+
"""
|
|
97
|
+
Parses records from an Excel file based on the provided configuration.
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
config (FileBasedStreamConfig): Configuration for the file-based stream.
|
|
101
|
+
file (RemoteFile): The remote file to be read.
|
|
102
|
+
stream_reader (AbstractFileBasedStreamReader): Reader to read the file.
|
|
103
|
+
logger (logging.Logger): Logger for logging information and errors.
|
|
104
|
+
discovered_schema (Optional[Mapping[str, SchemaType]]): Discovered schema for validation.
|
|
105
|
+
|
|
106
|
+
Yields:
|
|
107
|
+
Iterable[Dict[str, Any]]: Parsed records from the Excel file.
|
|
108
|
+
"""
|
|
109
|
+
|
|
110
|
+
# Validate the format of the config
|
|
111
|
+
self.validate_format(config.format, logger)
|
|
112
|
+
|
|
113
|
+
try:
|
|
114
|
+
# Open and parse the file using the stream reader
|
|
115
|
+
with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
|
|
116
|
+
df = self.open_and_parse_file(fp)
|
|
117
|
+
# Yield records as dictionaries
|
|
118
|
+
# DataFrame.to_dict() method returns datetime values in pandas.Timestamp values, which are not serializable by orjson
|
|
119
|
+
# DataFrame.to_json() returns string with datetime values serialized to iso8601 with microseconds to align with pydantic behavior
|
|
120
|
+
# see PR description: https://github.com/airbytehq/airbyte/pull/44444/
|
|
121
|
+
yield from orjson.loads(
|
|
122
|
+
df.to_json(orient="records", date_format="iso", date_unit="us")
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
except Exception as exc:
|
|
126
|
+
# Raise a RecordParseError if any exception occurs during parsing
|
|
127
|
+
raise RecordParseError(
|
|
128
|
+
FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri
|
|
129
|
+
) from exc
|
|
130
|
+
|
|
131
|
+
@property
|
|
132
|
+
def file_read_mode(self) -> FileReadMode:
|
|
133
|
+
"""
|
|
134
|
+
Returns the file read mode for the Excel file.
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
FileReadMode: The file read mode (binary).
|
|
138
|
+
"""
|
|
139
|
+
return FileReadMode.READ_BINARY
|
|
140
|
+
|
|
141
|
+
@staticmethod
|
|
142
|
+
def dtype_to_json_type(
|
|
143
|
+
current_type: Optional[str],
|
|
144
|
+
dtype: dtype_, # type: ignore [type-arg]
|
|
145
|
+
) -> str:
|
|
146
|
+
"""
|
|
147
|
+
Convert Pandas DataFrame types to Airbyte Types.
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
current_type (Optional[str]): One of the previous types based on earlier dataframes.
|
|
151
|
+
dtype: Pandas DataFrame type.
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
str: Corresponding Airbyte Type.
|
|
155
|
+
"""
|
|
156
|
+
number_types = ("int64", "float64")
|
|
157
|
+
if current_type == "string":
|
|
158
|
+
# Previous column values were of the string type, no need to look further.
|
|
159
|
+
return current_type
|
|
160
|
+
if dtype is object:
|
|
161
|
+
return "string"
|
|
162
|
+
if dtype in number_types and (not current_type or current_type == "number"):
|
|
163
|
+
return "number"
|
|
164
|
+
if dtype == "bool" and (not current_type or current_type == "boolean"):
|
|
165
|
+
return "boolean"
|
|
166
|
+
if issubdtype(dtype, datetime64):
|
|
167
|
+
return "date-time"
|
|
168
|
+
return "string"
|
|
169
|
+
|
|
170
|
+
@staticmethod
|
|
171
|
+
def validate_format(excel_format: BaseModel, logger: logging.Logger) -> None:
    """
    Ensure the supplied format configuration is an ExcelFormat.

    Args:
        excel_format (BaseModel): The format object to be validated.
        logger (logging.Logger): Receives an info-level message describing the
            unexpected format just before the error is raised.

    Raises:
        ConfigValidationError: If the format is not an ExcelFormat.
    """
    # Happy path first: a proper ExcelFormat needs no further handling.
    if isinstance(excel_format, ExcelFormat):
        return

    logger.info(f"Expected ExcelFormat, got {excel_format}")
    raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR)
|
|
184
|
+
|
|
185
|
+
@staticmethod
|
|
186
|
+
def open_and_parse_file(fp: Union[IOBase, str, Path]) -> pd.DataFrame:
    """
    Opens the Excel file with the calamine engine and parses it into a DataFrame.

    The workbook is opened through a context manager so that the ``ExcelFile``
    is closed as soon as parsing is done instead of being left open until it
    is garbage collected.

    Args:
        fp: File pointer (or path) to the Excel file.

    Returns:
        pd.DataFrame: Parsed data from the Excel file, as produced by
        ``ExcelFile.parse()`` called with its default arguments.
    """
    with pd.ExcelFile(fp, engine="calamine") as excel_file:  # type: ignore [arg-type, call-overload]
        # parse() returns a fully built DataFrame, so it is safe to close the
        # workbook right after this call returns.
        return excel_file.parse()  # type: ignore [no-any-return]
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
import logging
|
|
5
|
+
from typing import Iterable, Tuple
|
|
6
|
+
|
|
7
|
+
from airbyte_cdk.models import AirbyteRecordMessageFileReference
|
|
8
|
+
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
|
|
9
|
+
from airbyte_cdk.sources.file_based.file_record_data import FileRecordData
|
|
10
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
11
|
+
from airbyte_cdk.sources.utils.files_directory import get_files_directory
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class FileTransfer:
    """
    Transfers a remote file into the local files directory by delegating the
    actual work to a stream reader.
    """

    def __init__(self) -> None:
        # Target directory, resolved once when the instance is created.
        self._local_directory = get_files_directory()

    def upload(
        self,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> Iterable[Tuple[FileRecordData, AirbyteRecordMessageFileReference]]:
        """
        Upload a single file through ``stream_reader`` and yield its result.

        This is a generator: nothing happens until it is iterated, and it
        yields exactly one item, the value returned by ``stream_reader.upload``.

        Args:
            file: The remote file to transfer.
            stream_reader: Reader whose ``upload`` method performs the transfer.
            logger: Passed to the reader; also used to report failures.

        Yields:
            Whatever ``stream_reader.upload`` returns for this file.

        Raises:
            Exception: Any exception raised during the upload is logged at
                error level and then re-raised unchanged.
        """
        try:
            yield stream_reader.upload(
                file=file, local_directory=self._local_directory, logger=logger
            )
        except Exception as ex:
            logger.error("An error has occurred while getting file: %s", str(ex))
            # Bare ``raise`` re-raises the active exception as-is, without
            # appending this frame to its traceback a second time.
            raise
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple
|
|
8
|
+
|
|
9
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
|
10
|
+
from airbyte_cdk.sources.file_based.file_based_stream_reader import (
|
|
11
|
+
AbstractFileBasedStreamReader,
|
|
12
|
+
FileReadMode,
|
|
13
|
+
)
|
|
14
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
15
|
+
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
|
|
16
|
+
|
|
17
|
+
# Type alias for a single parsed record: a dictionary with string keys and arbitrary values.
Record = Dict[str, Any]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class FileTypeParser(ABC):
    """
    Base class for file-type parsers. Every supported file type provides a
    subclass implementing the abstract members declared here.
    """

    @property
    def parser_max_n_files_for_schema_inference(self) -> Optional[int]:
        """
        Parser-specific cap on the number of files loaded for schema inference.

        The discovery policy decides how many files are loaded for schema inference;
        a parser may override that here. If a value is defined, the smaller of the
        two values will be used. The default is None (no override).
        """
        return None

    @property
    def parser_max_n_files_for_parsability(self) -> Optional[int]:
        """
        Parser-specific cap on the number of files loaded for the parsability check.

        The availability policy decides how many files are loaded for checking whether
        parsing works correctly; a parser may override that here. If a value is defined,
        the smaller of the two values will be used. The default is None (no override).
        """
        return None

    def get_parser_defined_primary_key(self, config: FileBasedStreamConfig) -> Optional[str]:
        """
        Primary key defined by the parser itself. It is used when no user-defined
        primary key is provided. The default is None.
        """
        return None

    @abstractmethod
    def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
        """
        Tell whether the config is valid for this file type.

        Returns (True, None) for a valid config, otherwise (False, message) where
        the message explains why the config is invalid. This base implementation
        returns (True, None).
        """
        return (True, None)

    @abstractmethod
    async def infer_schema(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> SchemaType:
        """
        Produce the JSON Schema inferred from this file.
        """
        ...

    @abstractmethod
    def parse_records(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        discovered_schema: Optional[Mapping[str, SchemaType]],
    ) -> Iterable[Record]:
        """
        Parse the file and emit its records one by one.
        """
        ...

    @property
    @abstractmethod
    def file_read_mode(self) -> FileReadMode:
        """
        Mode in which the file has to be opened for reading.
        """
        ...
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
from typing import Any, Dict, Iterable, Mapping, Optional, Tuple, Union
|
|
8
|
+
|
|
9
|
+
import orjson
|
|
10
|
+
|
|
11
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
|
12
|
+
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
|
|
13
|
+
from airbyte_cdk.sources.file_based.file_based_stream_reader import (
|
|
14
|
+
AbstractFileBasedStreamReader,
|
|
15
|
+
FileReadMode,
|
|
16
|
+
)
|
|
17
|
+
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
|
18
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
19
|
+
from airbyte_cdk.sources.file_based.schema_helpers import (
|
|
20
|
+
PYTHON_TYPE_MAPPING,
|
|
21
|
+
SchemaType,
|
|
22
|
+
merge_schemas,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class JsonlParser(FileTypeParser):
    """
    Parser for JSONL files: the file is read line by line and every complete
    JSON document found is emitted as one record.
    """

    # Schema inference stops reading a file once this much input was consumed and
    # at least one record was produced. The amount is measured with ``len(line)``.
    MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000
    # Encoding handed to the stream reader when opening a file.
    ENCODING = "utf8"

    def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
        """
        JsonlParser does not require config checks, implicit pydantic validation is enough.
        """
        return True, None

    async def infer_schema(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
    ) -> SchemaType:
        """
        Infers the schema for the file by inferring the schema for each record and
        merging it with the previously-inferred schema. Reading is limited by
        MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE.
        """
        inferred_schema: Mapping[str, Any] = {}

        for entry in self._parse_jsonl_entries(file, stream_reader, logger, read_limit=True):
            line_schema = self._infer_schema_for_record(entry)
            inferred_schema = merge_schemas(inferred_schema, line_schema)

        return inferred_schema

    def parse_records(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        discovered_schema: Optional[Mapping[str, SchemaType]],
    ) -> Iterable[Dict[str, Any]]:
        """
        This code supports parsing json objects over multiple lines even though this does not align with the JSONL format. This is for
        backward compatibility reasons i.e. the previous source-s3 parser did support this. The drawback is:
        * performance as the way we support json over multiple lines is very brute forced
        * given that we don't have `newlines_in_values` config to scope the possible inputs, we might parse the whole file before knowing if
          the input is improperly formatted or if the json is over multiple lines

        The goal is to run the V4 of source-s3 in production, track the warning log emitted when there are multiline json objects and
        deprecate this feature if it's not a valid use case.
        """
        yield from self._parse_jsonl_entries(file, stream_reader, logger)

    @classmethod
    def _infer_schema_for_record(cls, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Build a flat schema for one record: every key maps to ``{"type": ...}``,
        using "null" for None values and PYTHON_TYPE_MAPPING for everything else.
        """
        return {
            key: {"type": "null" if value is None else PYTHON_TYPE_MAPPING[type(value)]}
            for key, value in record.items()
        }

    @property
    def file_read_mode(self) -> FileReadMode:
        """
        The mode in which the file should be opened for reading (``FileReadMode.READ``).
        """
        return FileReadMode.READ

    def _parse_jsonl_entries(
        self,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        read_limit: bool = False,
    ) -> Iterable[Dict[str, Any]]:
        """
        Read the file line by line and yield every JSON document that can be decoded.

        Lines are appended to an accumulator until its content decodes as JSON; the
        decoded value is then yielded and the accumulator is reset. This is what
        makes JSON documents spanning several lines work.

        Args:
            file: The file to read.
            stream_reader: Reader used to open the file.
            logger: Receives a warning the first time a record decodes after a
                decoding failure, and when the read limit is hit.
            read_limit: When True, stop once MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE
                was consumed and at least one record was yielded.

        Raises:
            RecordParseError: If decoding failed and not a single record could be
                yielded. ``lineno`` is the 1-based number of the last line read.
        """
        with stream_reader.open_file(file, self.file_read_mode, self.ENCODING, logger) as fp:
            read_bytes = 0
            # Stays 0 only when the file has no lines at all.
            line_number = 0

            had_json_parsing_error = False
            has_warned_for_multiline_json_object = False
            yielded_at_least_once = False

            accumulator = None
            for line_number, line in enumerate(fp, start=1):
                if not accumulator:
                    accumulator = self._instantiate_accumulator(line)
                read_bytes += len(line)
                accumulator += line  # type: ignore [operator]  # In reality, it's either bytes or string and we add the same type
                try:
                    record = orjson.loads(accumulator)
                except orjson.JSONDecodeError:
                    # Not a complete document yet (or invalid input): keep accumulating.
                    had_json_parsing_error = True
                else:
                    if had_json_parsing_error and not has_warned_for_multiline_json_object:
                        logger.warning(
                            f"File at {file.uri} is using multiline JSON. Performance could be greatly reduced"
                        )
                        has_warned_for_multiline_json_object = True

                    yield record
                    yielded_at_least_once = True
                    accumulator = self._instantiate_accumulator(line)

                if (
                    read_limit
                    and yielded_at_least_once
                    and read_bytes >= self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE
                ):
                    logger.warning(
                        f"Exceeded the maximum number of bytes per file for schema inference ({self.MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE}). "
                        f"Inferring schema from an incomplete set of records."
                    )
                    break

            if had_json_parsing_error and not yielded_at_least_once:
                # Report the line number, not the raw line content.
                raise RecordParseError(
                    FileBasedSourceError.ERROR_PARSING_RECORD,
                    filename=file.uri,
                    lineno=line_number,
                )

    @staticmethod
    def _instantiate_accumulator(line: Union[bytes, str]) -> Union[bytes, str]:
        """
        Return an empty accumulator of the same kind as ``line``: ``b""`` for
        bytes, ``""`` otherwise, so lines can be appended with ``+=``.
        """
        if isinstance(line, bytes):
            # An empty string encodes to b"" in any encoding, so no detection is needed.
            return b""
        return ""
|