airbyte-cdk 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +358 -0
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +104 -0
- airbyte_cdk/connector.py +123 -0
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/__init__.py +3 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
- airbyte_cdk/connector_builder/main.py +107 -0
- airbyte_cdk/connector_builder/models.py +73 -0
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +83 -0
- airbyte_cdk/destinations/__init__.py +8 -0
- airbyte_cdk/destinations/destination.py +154 -0
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
- airbyte_cdk/destinations/vector_db_based/config.py +298 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
- airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
- airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
- airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
- airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
- airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
- airbyte_cdk/entrypoint.py +414 -0
- airbyte_cdk/exception_handler.py +56 -0
- airbyte_cdk/logger.py +109 -0
- airbyte_cdk/models/__init__.py +72 -0
- airbyte_cdk/models/airbyte_protocol.py +88 -0
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/well_known_types.py +5 -0
- airbyte_cdk/py.typed +0 -0
- airbyte_cdk/sources/__init__.py +26 -0
- airbyte_cdk/sources/abstract_source.py +326 -0
- airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
- airbyte_cdk/sources/config.py +27 -0
- airbyte_cdk/sources/connector_state_manager.py +161 -0
- airbyte_cdk/sources/declarative/__init__.py +3 -0
- airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
- airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
- airbyte_cdk/sources/declarative/auth/token.py +267 -0
- airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
- airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
- airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
- airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
- airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
- airbyte_cdk/sources/declarative/declarative_source.py +36 -0
- airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
- airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
- airbyte_cdk/sources/declarative/exceptions.py +9 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
- airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
- airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
- airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
- airbyte_cdk/sources/declarative/requesters/README.md +56 -0
- airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
- airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
- airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
- airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
- airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
- airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
- airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
- airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
- airbyte_cdk/sources/declarative/spec/spec.py +48 -0
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
- airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
- airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
- airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
- airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
- airbyte_cdk/sources/declarative/types.py +25 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
- airbyte_cdk/sources/file_based/config/__init__.py +0 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
- airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
- airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
- airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
- airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
- airbyte_cdk/sources/file_based/exceptions.py +159 -0
- airbyte_cdk/sources/file_based/file_based_source.py +466 -0
- airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
- airbyte_cdk/sources/file_based/file_record_data.py +22 -0
- airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
- airbyte_cdk/sources/file_based/remote_file.py +18 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
- airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
- airbyte_cdk/sources/file_based/types.py +10 -0
- airbyte_cdk/sources/http_config.py +10 -0
- airbyte_cdk/sources/http_logger.py +55 -0
- airbyte_cdk/sources/message/__init__.py +19 -0
- airbyte_cdk/sources/message/repository.py +137 -0
- airbyte_cdk/sources/source.py +95 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/__init__.py +8 -0
- airbyte_cdk/sources/streams/availability_strategy.py +84 -0
- airbyte_cdk/sources/streams/call_rate.py +704 -0
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
- airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
- airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
- airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/core.py +703 -0
- airbyte_cdk/sources/streams/http/__init__.py +10 -0
- airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +61 -0
- airbyte_cdk/sources/streams/http/http.py +673 -0
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/sources/streams/utils/__init__.py +3 -0
- airbyte_cdk/sources/types.py +169 -0
- airbyte_cdk/sources/utils/__init__.py +7 -0
- airbyte_cdk/sources/utils/casing.py +12 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +53 -0
- airbyte_cdk/sources/utils/schema_helpers.py +230 -0
- airbyte_cdk/sources/utils/slice_logger.py +57 -0
- airbyte_cdk/sources/utils/transform.py +277 -0
- airbyte_cdk/sources/utils/types.py +7 -0
- airbyte_cdk/sql/__init__.py +0 -0
- airbyte_cdk/sql/_util/__init__.py +0 -0
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/__init__.py +7 -0
- airbyte_cdk/test/catalog_builder.py +81 -0
- airbyte_cdk/test/entrypoint_wrapper.py +250 -0
- airbyte_cdk/test/mock_http/__init__.py +6 -0
- airbyte_cdk/test/mock_http/matcher.py +41 -0
- airbyte_cdk/test/mock_http/mocker.py +185 -0
- airbyte_cdk/test/mock_http/request.py +103 -0
- airbyte_cdk/test/mock_http/response.py +28 -0
- airbyte_cdk/test/mock_http/response_builder.py +237 -0
- airbyte_cdk/test/state_builder.py +33 -0
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +10 -0
- airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
- airbyte_cdk/utils/analytics_message.py +25 -0
- airbyte_cdk/utils/constants.py +5 -0
- airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/event_timing.py +85 -0
- airbyte_cdk/utils/is_cloud_environment.py +18 -0
- airbyte_cdk/utils/mapping_helpers.py +162 -0
- airbyte_cdk/utils/message_utils.py +26 -0
- airbyte_cdk/utils/oneof_option_config.py +33 -0
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +270 -0
- airbyte_cdk/utils/slice_hasher.py +37 -0
- airbyte_cdk/utils/spec_schema_transformations.py +26 -0
- airbyte_cdk/utils/stream_status_utils.py +43 -0
- airbyte_cdk/utils/traced_exception.py +145 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
- airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
- airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
- airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
- airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,527 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
from abc import ABC, abstractmethod
|
|
9
|
+
from collections import defaultdict
|
|
10
|
+
from functools import partial
|
|
11
|
+
from io import IOBase
|
|
12
|
+
from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set, Tuple
|
|
13
|
+
from uuid import uuid4
|
|
14
|
+
|
|
15
|
+
import orjson
|
|
16
|
+
|
|
17
|
+
from airbyte_cdk.models import FailureType
|
|
18
|
+
from airbyte_cdk.sources.file_based.config.csv_format import (
|
|
19
|
+
CsvFormat,
|
|
20
|
+
CsvHeaderAutogenerated,
|
|
21
|
+
CsvHeaderUserProvided,
|
|
22
|
+
InferenceType,
|
|
23
|
+
)
|
|
24
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
|
25
|
+
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
|
|
26
|
+
from airbyte_cdk.sources.file_based.file_based_stream_reader import (
|
|
27
|
+
AbstractFileBasedStreamReader,
|
|
28
|
+
FileReadMode,
|
|
29
|
+
)
|
|
30
|
+
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
|
31
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
32
|
+
from airbyte_cdk.sources.file_based.schema_helpers import TYPE_PYTHON_MAPPING, SchemaType
|
|
33
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
|
34
|
+
|
|
35
|
+
# Suffix of the name under which _CsvReader.read_data registers each per-stream csv dialect.
DIALECT_NAME = "_config_dialect"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class _CsvReader:
    """Lazily reads the rows of a single CSV file as dictionaries keyed by column name."""

    def read_data(
        self,
        config: FileBasedStreamConfig,
        file: RemoteFile,
        stream_reader: AbstractFileBasedStreamReader,
        logger: logging.Logger,
        file_read_mode: FileReadMode,
    ) -> Generator[Dict[str, Any], None, None]:
        """
        Yield every data row of `file` as a dict mapping column name to the value parsed by `csv.DictReader`.

        :param config: stream configuration; its CSV format drives the dialect, the encoding and the row skipping
        :param file: the file to read; `file.uri` is only used in log and error messages
        :param stream_reader: used to open `file` with the configured encoding
        :param logger: receives an error entry for each mismatched row when errors on fields mismatch are ignored
        :param file_read_mode: forwarded as-is to `stream_reader.open_file`
        :raises AirbyteTracedException: if reading the headers fails with a `UnicodeError`
        :raises RecordParseError: if a row does not have as many values as there are headers and
            `ignore_errors_on_fields_mismatch` is not set
        """
        config_format = _extract_format(config)
        lineno = 0

        # Formats are configured individually per-stream so a unique dialect should be registered for each stream.
        # Give each stream's dialect a unique name; otherwise, when we are doing a concurrent sync we can end up
        # with a race condition where a thread attempts to use a dialect before a separate thread has finished
        # registering it.
        dialect_name = f"{config.name}_{str(uuid4())}_{DIALECT_NAME}"
        csv.register_dialect(
            dialect_name,
            delimiter=config_format.delimiter,
            quotechar=config_format.quote_char,
            escapechar=config_format.escape_char,
            doublequote=config_format.double_quote,
            quoting=csv.QUOTE_MINIMAL,
        )
        # The dialect must stay registered while rows are parsed lazily. Everything that follows the registration is
        # guarded so the uniquely named dialect is also removed when opening the file or reading the headers fails,
        # not only when the row loop ends.
        try:
            with stream_reader.open_file(
                file, file_read_mode, config_format.encoding, logger
            ) as fp:
                try:
                    headers = self._get_headers(fp, config_format, dialect_name)
                except UnicodeError as err:
                    raise AirbyteTracedException(
                        message=f"{FileBasedSourceError.ENCODING_ERROR.value} Expected encoding: {config_format.encoding}",
                    ) from err

                # _get_headers rewinds the file, so skip again everything that precedes the first data row.
                rows_to_skip = (
                    config_format.skip_rows_before_header
                    + (1 if config_format.header_definition.has_header_row() else 0)
                    + config_format.skip_rows_after_header
                )
                self._skip_rows(fp, rows_to_skip)
                lineno += rows_to_skip

                reader = csv.DictReader(fp, dialect=dialect_name, fieldnames=headers)  # type: ignore
                for row in reader:
                    lineno += 1

                    # The row was not properly parsed if it has a None key or a None value. This will most likely
                    # occur if there are more columns than headers or more headers than columns.
                    # csv.DictReader stores surplus values under the None key (default `restkey`) and fills
                    # missing values with None (default `restval`).
                    # NOTE(review): the messages below say "Skipping record" although the row is still yielded, and
                    # the "missing"/"extra" wording looks swapped relative to the DictReader semantics - confirm.
                    if None in row:
                        if config_format.ignore_errors_on_fields_mismatch:
                            logger.error(
                                f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with missing column."
                            )
                        else:
                            raise RecordParseError(
                                FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_COLUMNS,
                                filename=file.uri,
                                lineno=lineno,
                            )
                    if None in row.values():
                        if config_format.ignore_errors_on_fields_mismatch:
                            logger.error(
                                f"Skipping record in line {lineno} of file {file.uri}; invalid CSV row with extra column."
                            )
                        else:
                            raise RecordParseError(
                                FileBasedSourceError.ERROR_PARSING_RECORD_MISMATCHED_ROWS,
                                filename=file.uri,
                                lineno=lineno,
                            )
                    yield row
        finally:
            # Reached on exhaustion, on any error (header read, encoding, RecordParseError) or on GeneratorExit.
            csv.unregister_dialect(dialect_name)

    def _get_headers(self, fp: IOBase, config_format: CsvFormat, dialect_name: str) -> List[str]:
        """
        Return the column names to use for the file.

        Assumes the fp is pointing to the beginning of the file and resets it to the beginning whenever the file
        had to be read to determine the headers. User-provided column names are returned without touching fp.
        """
        # Note that this method assumes the dialect has already been registered if we're parsing the headers
        if isinstance(config_format.header_definition, CsvHeaderUserProvided):
            return config_format.header_definition.column_names

        if isinstance(config_format.header_definition, CsvHeaderAutogenerated):
            # There is no header row: skip to the first data row and derive the names from its width.
            self._skip_rows(
                fp, config_format.skip_rows_before_header + config_format.skip_rows_after_header
            )
            headers = self._auto_generate_headers(fp, dialect_name)
        else:
            # Skip to the header row, then read it
            self._skip_rows(fp, config_format.skip_rows_before_header)
            reader = csv.reader(fp, dialect=dialect_name)  # type: ignore
            headers = list(next(reader))

        fp.seek(0)
        return headers

    def _auto_generate_headers(self, fp: IOBase, dialect_name: str) -> List[str]:
        """
        Generates field names as [f0, f1, ...] in the same way as pyarrow's csv reader with autogenerate_column_names=True.
        See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ReadOptions.html

        The number of names equals the number of values in the next row read from fp.
        """
        reader = csv.reader(fp, dialect=dialect_name)  # type: ignore
        number_of_columns = len(next(reader))  # type: ignore
        return [f"f{i}" for i in range(number_of_columns)]

    @staticmethod
    def _skip_rows(fp: IOBase, rows_to_skip: int) -> None:
        """
        Skip `rows_to_skip` lines of fp. This has to be done on the file object itself, not the reader
        """
        for _ in range(rows_to_skip):
            fp.readline()
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class CsvParser(FileTypeParser):
|
|
155
|
+
_MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE = 1_000_000
|
|
156
|
+
|
|
157
|
+
def __init__(self, csv_reader: Optional[_CsvReader] = None, csv_field_max_bytes: int = 2**31):
    """
    :param csv_reader: reader used to iterate over the rows of a file; a new `_CsvReader` is
        created when none (or a falsy value) is given
    :param csv_field_max_bytes: maximum size of a single CSV field, applied through
        `csv.field_size_limit`
    """
    # The default limit of the csv module is 128k per field, which is typically sufficient.
    # Airbyte loads a large variety of data though, so a larger maximum field size is allowed
    # to avoid skipping data on load.
    # https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
    csv.field_size_limit(csv_field_max_bytes)
    self._csv_reader = csv_reader or _CsvReader()
|
|
163
|
+
|
|
164
|
+
def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
    """
    Report the config as valid: CsvParser does not require config checks, the implicit
    pydantic validation is enough.

    :param config: stream configuration (not inspected)
    :return: always `(True, None)`
    """
    is_valid, error_message = True, None
    return is_valid, error_message
|
|
169
|
+
|
|
170
|
+
async def infer_schema(
    self,
    config: FileBasedStreamConfig,
    file: RemoteFile,
    stream_reader: AbstractFileBasedStreamReader,
    logger: logging.Logger,
) -> SchemaType:
    """
    Infer the schema of a CSV file from the values found in its rows.

    If the config carries an input schema, it is returned as is and the file is not read.
    Otherwise rows are read until the (approximate) number of bytes seen reaches
    `_MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE`, and one type is inferred per column.

    :param config: stream configuration; its format must be a CsvFormat
    :param file: the file to infer the schema from
    :param stream_reader: reader used to access the file
    :param logger: logger passed down to the csv reader
    :return: mapping of stripped header name to `{"type": <inferred type>}`
    :raises AirbyteTracedException: if no column could be observed (no rows in the file)
    """
    input_schema = config.get_input_schema()
    if input_schema:
        return input_schema

    # todo: the existing InMemoryFilesSource.open_file() test source doesn't currently require an encoding, but actual
    #  sources will likely require one. Rather than modify the interface now we can wait until the real use case
    config_format = _extract_format(config)
    type_inferrer_by_field: Dict[str, _TypeInferrer] = defaultdict(
        lambda: _JsonTypeInferrer(
            config_format.true_values, config_format.false_values, config_format.null_values
        )
        if config_format.inference_type != InferenceType.NONE
        else _DisabledTypeInferrer()
    )
    data_generator = self._csv_reader.read_data(
        config, file, stream_reader, logger, self.file_read_mode
    )
    try:
        read_bytes = 0
        for row in data_generator:
            for header, value in row.items():
                type_inferrer_by_field[header].add_value(value)
                # This is not accurate as a representation of how many bytes were read because csv does some processing on the actual value
                # before returning. Given we would like to be more accurate, we could wrap the IO file using a decorator
                read_bytes += len(value)
            read_bytes += len(row) - 1  # for separators
            if read_bytes >= self._MAX_BYTES_PER_FILE_FOR_SCHEMA_INFERENCE:
                break
    finally:
        # Always close the generator, including when consuming a row raised, so the cleanup
        # performed by the reader is not deferred to garbage collection
        data_generator.close()

    if not type_inferrer_by_field:
        raise AirbyteTracedException(
            message=f"Could not infer schema as there are no rows in {file.uri}. If having an empty CSV file is expected, ignore this. "
            f"Else, please contact Airbyte.",
            failure_type=FailureType.config_error,
        )
    return {
        header.strip(): {"type": type_inferred.infer()}
        for header, type_inferred in type_inferrer_by_field.items()
    }
|
|
217
|
+
|
|
218
|
+
def parse_records(
    self,
    config: FileBasedStreamConfig,
    file: RemoteFile,
    stream_reader: AbstractFileBasedStreamReader,
    logger: logging.Logger,
    discovered_schema: Optional[Mapping[str, SchemaType]],
) -> Iterable[Dict[str, Any]]:
    """
    Yield the rows of a CSV file as records.

    Values are cast according to `discovered_schema` when one is provided and the stream is
    not schemaless; values matching the configured null values are then replaced by None
    (see `_to_nullable`).

    :param config: stream configuration; its format must be a CsvFormat
    :param file: the file to read
    :param stream_reader: reader used to access the file
    :param logger: logger used to report values that could not be cast
    :param discovered_schema: schema of the stream, if any; its "properties" give the column types
    :return: an iterable of records, one per row
    :raises RecordParseError: if a row cannot be parsed; the error carries the file uri and the
        number of rows read so far
    """
    line_no = 0
    # Defined before the `try` so that the `finally` clause does not fail with an
    # UnboundLocalError (hiding the original error) when the setup below raises
    data_generator = None
    try:
        config_format = _extract_format(config)
        if discovered_schema:
            property_types = {
                col: prop["type"] for col, prop in discovered_schema["properties"].items()
            }
            deduped_property_types = CsvParser._pre_propcess_property_types(property_types)
        else:
            deduped_property_types = {}
        cast_fn = CsvParser._get_cast_function(
            deduped_property_types, config_format, logger, config.schemaless
        )
        data_generator = self._csv_reader.read_data(
            config, file, stream_reader, logger, self.file_read_mode
        )
        for row in data_generator:
            line_no += 1
            yield CsvParser._to_nullable(
                cast_fn(row),
                deduped_property_types,
                config_format.null_values,
                config_format.strings_can_be_null,
            )
    except RecordParseError as parse_err:
        raise RecordParseError(
            FileBasedSourceError.ERROR_PARSING_RECORD, filename=file.uri, lineno=line_no
        ) from parse_err
    finally:
        if data_generator is not None:
            data_generator.close()
|
|
256
|
+
|
|
257
|
+
@property
def file_read_mode(self) -> FileReadMode:
    """
    Mode passed to the csv reader when reading files: `FileReadMode.READ`.
    """
    return FileReadMode.READ
|
|
260
|
+
|
|
261
|
+
@staticmethod
def _get_cast_function(
    deduped_property_types: Mapping[str, str],
    config_format: CsvFormat,
    logger: logging.Logger,
    schemaless: bool,
) -> Callable[[Mapping[str, str]], Mapping[str, str]]:
    """
    Select the function applied to every row before it is emitted.

    :param deduped_property_types: mapping of column name to its non-nullable type
    :param config_format: CSV format configuration forwarded to the cast function
    :param logger: logger forwarded to the cast function
    :param schemaless: whether the stream is schemaless
    :return: `_no_cast` when the stream is schemaless or no property types are known,
        otherwise `_cast_types` bound to the given types, format and logger
    """
    # Without a schema, the rows are returned as they are
    if schemaless or not deduped_property_types:
        return _no_cast

    # Values are only cast when a schema is provided
    return partial(
        CsvParser._cast_types,
        deduped_property_types=deduped_property_types,
        config_format=config_format,
        logger=logger,
    )
|
|
279
|
+
|
|
280
|
+
@staticmethod
|
|
281
|
+
def _to_nullable(
|
|
282
|
+
row: Mapping[str, str],
|
|
283
|
+
deduped_property_types: Mapping[str, str],
|
|
284
|
+
null_values: Set[str],
|
|
285
|
+
strings_can_be_null: bool,
|
|
286
|
+
) -> Dict[str, Optional[str]]:
|
|
287
|
+
nullable = {
|
|
288
|
+
k: None
|
|
289
|
+
if CsvParser._value_is_none(
|
|
290
|
+
v, deduped_property_types.get(k), null_values, strings_can_be_null
|
|
291
|
+
)
|
|
292
|
+
else v
|
|
293
|
+
for k, v in row.items()
|
|
294
|
+
}
|
|
295
|
+
return nullable
|
|
296
|
+
|
|
297
|
+
@staticmethod
|
|
298
|
+
def _value_is_none(
|
|
299
|
+
value: Any,
|
|
300
|
+
deduped_property_type: Optional[str],
|
|
301
|
+
null_values: Set[str],
|
|
302
|
+
strings_can_be_null: bool,
|
|
303
|
+
) -> bool:
|
|
304
|
+
return value in null_values and (strings_can_be_null or deduped_property_type != "string")
|
|
305
|
+
|
|
306
|
+
@staticmethod
|
|
307
|
+
def _pre_propcess_property_types(property_types: Dict[str, Any]) -> Mapping[str, str]:
|
|
308
|
+
"""
|
|
309
|
+
Transform the property types to be non-nullable and remove duplicate types if any.
|
|
310
|
+
Sample input:
|
|
311
|
+
{
|
|
312
|
+
"col1": ["string", "null"],
|
|
313
|
+
"col2": ["string", "string", "null"],
|
|
314
|
+
"col3": "integer"
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
Sample output:
|
|
318
|
+
{
|
|
319
|
+
"col1": "string",
|
|
320
|
+
"col2": "string",
|
|
321
|
+
"col3": "integer",
|
|
322
|
+
}
|
|
323
|
+
"""
|
|
324
|
+
output = {}
|
|
325
|
+
for prop, prop_type in property_types.items():
|
|
326
|
+
if isinstance(prop_type, list):
|
|
327
|
+
prop_type_distinct = set(prop_type)
|
|
328
|
+
prop_type_distinct.remove("null")
|
|
329
|
+
if len(prop_type_distinct) != 1:
|
|
330
|
+
raise ValueError(f"Could not get non nullable type from {prop_type}")
|
|
331
|
+
output[prop] = next(iter(prop_type_distinct))
|
|
332
|
+
else:
|
|
333
|
+
output[prop] = prop_type
|
|
334
|
+
return output
|
|
335
|
+
|
|
336
|
+
@staticmethod
def _cast_types(
    row: Dict[str, str],
    deduped_property_types: Mapping[str, str],
    config_format: CsvFormat,
    logger: logging.Logger,
) -> Dict[str, Any]:
    """
    Casts the values in the input 'row' dictionary according to the types defined in the JSON schema.

    Array and object types are only handled if they can be deserialized as JSON.

    If any errors are encountered, the value will be emitted as a string.

    :param row: mapping of column name to raw value
    :param deduped_property_types: mapping of column name to its non-nullable type
    :param config_format: CSV format configuration providing the true and false values
    :param logger: logger receiving a single warning listing the values that could not be cast
    :return: a new dict with the same keys as `row` and the cast values
    """
    cast_row: Dict[str, Any] = {}
    cast_failures = []

    for column, raw_value in row.items():
        declared_type = deduped_property_types.get(column)
        # Unless a cast succeeds below, the value is emitted unchanged
        converted: Any = raw_value
        failed = False

        if declared_type is not None and declared_type in TYPE_PYTHON_MAPPING:
            _, target_type = TYPE_PYTHON_MAPPING[declared_type]

            if target_type is None:
                # Only the empty string can be cast for a type without a python equivalent
                if raw_value == "":
                    converted = None
                else:
                    failed = True

            elif target_type is bool:
                try:
                    converted = _value_to_bool(
                        raw_value, config_format.true_values, config_format.false_values
                    )
                except ValueError:
                    failed = True

            elif target_type is dict:
                try:
                    # we don't re-use _value_to_object here because we type the column as object as long as there is only one object
                    converted = orjson.loads(raw_value)
                except orjson.JSONDecodeError:
                    failed = True

            elif target_type is list:
                try:
                    converted = _value_to_list(raw_value)
                except (ValueError, json.JSONDecodeError):
                    failed = True

            elif target_type:
                try:
                    converted = _value_to_python_type(raw_value, target_type)
                except ValueError:
                    failed = True

        if failed:
            cast_failures.append(_format_warning(column, raw_value, declared_type))
        cast_row[column] = converted

    if cast_failures:
        failure_details = ",".join(cast_failures)
        logger.warning(
            f"{FileBasedSourceError.ERROR_CASTING_VALUE.value}: {failure_details}",
        )
    return cast_row
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
class _TypeInferrer(ABC):
|
|
403
|
+
@abstractmethod
|
|
404
|
+
def add_value(self, value: Any) -> None:
|
|
405
|
+
pass
|
|
406
|
+
|
|
407
|
+
@abstractmethod
|
|
408
|
+
def infer(self) -> str:
|
|
409
|
+
pass
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
class _DisabledTypeInferrer(_TypeInferrer):
    """
    Type inferrer that ignores the values it is given and always infers "string".
    """

    def add_value(self, value: Any) -> None:
        """
        Ignore the value: it has no influence on the inferred type.
        """
        return None

    def infer(self) -> str:
        """
        Return "string" regardless of the values added.
        """
        return "string"
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
class _JsonTypeInferrer(_TypeInferrer):
    """
    Type inferrer choosing between "boolean", "integer", "number" and "string" based on the
    distinct values added to it.
    """

    _NULL_TYPE = "null"
    _BOOLEAN_TYPE = "boolean"
    _INTEGER_TYPE = "integer"
    _NUMBER_TYPE = "number"
    _STRING_TYPE = "string"

    def __init__(
        self, boolean_trues: Set[str], boolean_falses: Set[str], null_values: Set[str]
    ) -> None:
        """
        :param boolean_trues: values to interpret as the boolean True
        :param boolean_falses: values to interpret as the boolean False
        :param null_values: values to interpret as null
        """
        self._boolean_trues = boolean_trues
        self._boolean_falses = boolean_falses
        self._null_values = null_values
        # Only distinct values are kept
        self._values: Set[str] = set()

    def add_value(self, value: Any) -> None:
        """
        Record `value` as one of the values of the column.
        """
        self._values.add(value)

    def infer(self) -> str:
        """
        Return the most specific type compatible with every non-null value added so far.

        Values that can be interpreted as null are ignored. The types are tried in the order
        boolean, integer, number; "string" is returned when none of them fits or when there
        is no non-null value.
        """
        candidates_per_value = []
        for value in self._values:
            candidates = self._infer_type(value)
            if self._NULL_TYPE not in candidates:
                candidates_per_value.append(candidates)

        if not candidates_per_value:
            # this is highly unusual but we will consider the column as a string
            return self._STRING_TYPE

        # Only the types every value is compatible with are eligible
        shared_types = set.intersection(*candidates_per_value)
        for json_type in (self._BOOLEAN_TYPE, self._INTEGER_TYPE, self._NUMBER_TYPE):
            if json_type in shared_types:
                return json_type
        return self._STRING_TYPE

    def _infer_type(self, value: str) -> Set[str]:
        """
        Return every type `value` can be interpreted as; "string" is always one of them.
        """
        # Every value can be represented as a string
        compatible_types = {self._STRING_TYPE}

        if value in self._null_values:
            compatible_types.add(self._NULL_TYPE)
        if self._is_boolean(value):
            compatible_types.add(self._BOOLEAN_TYPE)
        if self._is_integer(value):
            # An integer is a number as well
            compatible_types.update((self._INTEGER_TYPE, self._NUMBER_TYPE))
        elif self._is_number(value):
            compatible_types.add(self._NUMBER_TYPE)

        return compatible_types

    def _is_boolean(self, value: str) -> bool:
        """
        Tell whether `value` is one of the configured true or false values.
        """
        try:
            _value_to_bool(value, self._boolean_trues, self._boolean_falses)
        except ValueError:
            return False
        else:
            return True

    @staticmethod
    def _is_integer(value: str) -> bool:
        """
        Tell whether `value` can be converted with `int` without a ValueError.
        """
        try:
            _value_to_python_type(value, int)
        except ValueError:
            return False
        else:
            return True

    @staticmethod
    def _is_number(value: str) -> bool:
        """
        Tell whether `value` can be converted with `float` without a ValueError.
        """
        try:
            _value_to_python_type(value, float)
        except ValueError:
            return False
        else:
            return True
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def _value_to_bool(value: str, true_values: Set[str], false_values: Set[str]) -> bool:
|
|
497
|
+
if value in true_values:
|
|
498
|
+
return True
|
|
499
|
+
if value in false_values:
|
|
500
|
+
return False
|
|
501
|
+
raise ValueError(f"Value {value} is not a valid boolean value")
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
def _value_to_list(value: str) -> List[Any]:
|
|
505
|
+
parsed_value = json.loads(value)
|
|
506
|
+
if isinstance(parsed_value, list):
|
|
507
|
+
return parsed_value
|
|
508
|
+
raise ValueError(f"Value {parsed_value} is not a valid list value")
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def _value_to_python_type(value: str, python_type: type) -> Any:
|
|
512
|
+
return python_type(value)
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
def _format_warning(key: str, value: str, expected_type: Optional[Any]) -> str:
|
|
516
|
+
return f"{key}: value={value},expected_type={expected_type}"
|
|
517
|
+
|
|
518
|
+
|
|
519
|
+
def _no_cast(row: Mapping[str, str]) -> Mapping[str, str]:
|
|
520
|
+
return row
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def _extract_format(config: FileBasedStreamConfig) -> CsvFormat:
    """
    Return the format of `config` after checking that it is a CsvFormat.

    :raises ValueError: if the format of the config is not a CsvFormat
    """
    csv_format = config.format
    if isinstance(csv_format, CsvFormat):
        return csv_format
    raise ValueError(f"Invalid format config: {csv_format}")
|