airbyte-cdk 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +358 -0
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +104 -0
- airbyte_cdk/connector.py +123 -0
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/__init__.py +3 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
- airbyte_cdk/connector_builder/main.py +107 -0
- airbyte_cdk/connector_builder/models.py +73 -0
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +83 -0
- airbyte_cdk/destinations/__init__.py +8 -0
- airbyte_cdk/destinations/destination.py +154 -0
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
- airbyte_cdk/destinations/vector_db_based/config.py +298 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
- airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
- airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
- airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
- airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
- airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
- airbyte_cdk/entrypoint.py +414 -0
- airbyte_cdk/exception_handler.py +56 -0
- airbyte_cdk/logger.py +109 -0
- airbyte_cdk/models/__init__.py +72 -0
- airbyte_cdk/models/airbyte_protocol.py +88 -0
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/well_known_types.py +5 -0
- airbyte_cdk/py.typed +0 -0
- airbyte_cdk/sources/__init__.py +26 -0
- airbyte_cdk/sources/abstract_source.py +326 -0
- airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
- airbyte_cdk/sources/config.py +27 -0
- airbyte_cdk/sources/connector_state_manager.py +161 -0
- airbyte_cdk/sources/declarative/__init__.py +3 -0
- airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
- airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
- airbyte_cdk/sources/declarative/auth/token.py +267 -0
- airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
- airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
- airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
- airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
- airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
- airbyte_cdk/sources/declarative/declarative_source.py +36 -0
- airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
- airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
- airbyte_cdk/sources/declarative/exceptions.py +9 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
- airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
- airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
- airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
- airbyte_cdk/sources/declarative/requesters/README.md +56 -0
- airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
- airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
- airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
- airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
- airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
- airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
- airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
- airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
- airbyte_cdk/sources/declarative/spec/spec.py +48 -0
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
- airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
- airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
- airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
- airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
- airbyte_cdk/sources/declarative/types.py +25 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
- airbyte_cdk/sources/file_based/config/__init__.py +0 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
- airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
- airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
- airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
- airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
- airbyte_cdk/sources/file_based/exceptions.py +159 -0
- airbyte_cdk/sources/file_based/file_based_source.py +466 -0
- airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
- airbyte_cdk/sources/file_based/file_record_data.py +22 -0
- airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
- airbyte_cdk/sources/file_based/remote_file.py +18 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
- airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
- airbyte_cdk/sources/file_based/types.py +10 -0
- airbyte_cdk/sources/http_config.py +10 -0
- airbyte_cdk/sources/http_logger.py +55 -0
- airbyte_cdk/sources/message/__init__.py +19 -0
- airbyte_cdk/sources/message/repository.py +137 -0
- airbyte_cdk/sources/source.py +95 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/__init__.py +8 -0
- airbyte_cdk/sources/streams/availability_strategy.py +84 -0
- airbyte_cdk/sources/streams/call_rate.py +704 -0
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
- airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
- airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
- airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/core.py +703 -0
- airbyte_cdk/sources/streams/http/__init__.py +10 -0
- airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +61 -0
- airbyte_cdk/sources/streams/http/http.py +673 -0
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/sources/streams/utils/__init__.py +3 -0
- airbyte_cdk/sources/types.py +169 -0
- airbyte_cdk/sources/utils/__init__.py +7 -0
- airbyte_cdk/sources/utils/casing.py +12 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +53 -0
- airbyte_cdk/sources/utils/schema_helpers.py +230 -0
- airbyte_cdk/sources/utils/slice_logger.py +57 -0
- airbyte_cdk/sources/utils/transform.py +277 -0
- airbyte_cdk/sources/utils/types.py +7 -0
- airbyte_cdk/sql/__init__.py +0 -0
- airbyte_cdk/sql/_util/__init__.py +0 -0
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/__init__.py +7 -0
- airbyte_cdk/test/catalog_builder.py +81 -0
- airbyte_cdk/test/entrypoint_wrapper.py +250 -0
- airbyte_cdk/test/mock_http/__init__.py +6 -0
- airbyte_cdk/test/mock_http/matcher.py +41 -0
- airbyte_cdk/test/mock_http/mocker.py +185 -0
- airbyte_cdk/test/mock_http/request.py +103 -0
- airbyte_cdk/test/mock_http/response.py +28 -0
- airbyte_cdk/test/mock_http/response_builder.py +237 -0
- airbyte_cdk/test/state_builder.py +33 -0
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +10 -0
- airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
- airbyte_cdk/utils/analytics_message.py +25 -0
- airbyte_cdk/utils/constants.py +5 -0
- airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/event_timing.py +85 -0
- airbyte_cdk/utils/is_cloud_environment.py +18 -0
- airbyte_cdk/utils/mapping_helpers.py +162 -0
- airbyte_cdk/utils/message_utils.py +26 -0
- airbyte_cdk/utils/oneof_option_config.py +33 -0
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +270 -0
- airbyte_cdk/utils/slice_hasher.py +37 -0
- airbyte_cdk/utils/spec_schema_transformations.py +26 -0
- airbyte_cdk/utils/stream_status_utils.py +43 -0
- airbyte_cdk/utils/traced_exception.py +145 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
- airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
- airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
- airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
- airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import traceback
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from io import BytesIO, IOBase
|
|
9
|
+
from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union
|
|
10
|
+
|
|
11
|
+
import backoff
|
|
12
|
+
import dpath
|
|
13
|
+
import nltk
|
|
14
|
+
import requests
|
|
15
|
+
from unstructured.file_utils.filetype import (
|
|
16
|
+
EXT_TO_FILETYPE,
|
|
17
|
+
FILETYPE_TO_MIMETYPE,
|
|
18
|
+
STR_TO_FILETYPE,
|
|
19
|
+
FileType,
|
|
20
|
+
detect_filetype,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
from airbyte_cdk.models import FailureType
|
|
24
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
|
25
|
+
from airbyte_cdk.sources.file_based.config.unstructured_format import (
|
|
26
|
+
APIParameterConfigModel,
|
|
27
|
+
APIProcessingConfigModel,
|
|
28
|
+
LocalProcessingConfigModel,
|
|
29
|
+
UnstructuredFormat,
|
|
30
|
+
)
|
|
31
|
+
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
|
|
32
|
+
from airbyte_cdk.sources.file_based.file_based_stream_reader import (
|
|
33
|
+
AbstractFileBasedStreamReader,
|
|
34
|
+
FileReadMode,
|
|
35
|
+
)
|
|
36
|
+
from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
|
|
37
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
38
|
+
from airbyte_cdk.sources.file_based.schema_helpers import SchemaType
|
|
39
|
+
from airbyte_cdk.utils import is_cloud_environment
|
|
40
|
+
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
|
41
|
+
|
|
42
|
+
# Placeholders for the `unstructured` partition functions. They stay None until
# _import_unstructured() rebinds them to the real callables.
unstructured_partition_pdf = None
unstructured_partition_docx = None
unstructured_partition_pptx = None

# Directories used for NLTK data: get_nltk_temp_folder() tries the first one
# and falls back to the second when the first cannot be created.
AIRBYTE_NLTK_DATA_DIR = "/airbyte/nltk_data"
TMP_NLTK_DATA_DIR = "/tmp/nltk_data"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def get_nltk_temp_folder() -> str:
    """
    Pick (and create if needed) the directory that holds the NLTK data.

    The /airbyte location is preferred because /tmp is not currently writable
    for non-root connectors. When the preferred directory cannot be created
    (``OSError``), the /tmp location is created and returned instead, which is
    the path taken during local development.

    :return: path of the directory that now exists
    :raises OSError: if the fallback directory cannot be created either
    """
    try:
        os.makedirs(AIRBYTE_NLTK_DATA_DIR, exist_ok=True)
    except OSError:
        # Preferred location unavailable: fall back to the /tmp directory.
        os.makedirs(TMP_NLTK_DATA_DIR, exist_ok=True)
        return TMP_NLTK_DATA_DIR
    return AIRBYTE_NLTK_DATA_DIR
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Register the NLTK data directory, then check that the tokenizer/tagger
# resources are present. If any lookup fails, download all three resources
# into that directory.
nltk_data_dir = get_nltk_temp_folder()
nltk.data.path.append(nltk_data_dir)
try:
    nltk.data.find("tokenizers/punkt.zip")
    nltk.data.find("tokenizers/punkt_tab.zip")
    nltk.data.find("tokenizers/averaged_perceptron_tagger_eng.zip")
except LookupError:
    # At least one resource is missing locally; fetch the full set quietly.
    nltk.download("punkt", download_dir=nltk_data_dir, quiet=True)
    nltk.download("punkt_tab", download_dir=nltk_data_dir, quiet=True)
    nltk.download("averaged_perceptron_tagger_eng", download_dir=nltk_data_dir, quiet=True)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def optional_decode(contents: Union[str, bytes]) -> str:
    """
    Return ``contents`` as text.

    :param contents: either already-decoded text or raw bytes
    :return: the UTF-8 decoding of ``contents`` when it is ``bytes``,
        otherwise ``contents`` unchanged
    :raises UnicodeDecodeError: if the bytes are not valid UTF-8
    """
    return contents.decode("utf-8") if isinstance(contents, bytes) else contents
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _import_unstructured() -> None:
    """
    Load the `unstructured` partition functions and publish them as module globals.

    The imports are done here, on demand, rather than at module import time
    because importing these modules is slow.
    """
    global unstructured_partition_pdf, unstructured_partition_docx, unstructured_partition_pptx

    from unstructured.partition.docx import partition_docx as _docx_partitioner
    from unstructured.partition.pdf import partition_pdf as _pdf_partitioner
    from unstructured.partition.pptx import partition_pptx as _pptx_partitioner

    # One global per partition function (rather than a shared container) so
    # that typing propagates properly.
    unstructured_partition_pdf = _pdf_partitioner
    unstructured_partition_docx = _docx_partitioner
    unstructured_partition_pptx = _pptx_partitioner
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def user_error(e: Exception) -> bool:
    """
    Return True if this exception is caused by user error, False otherwise.

    An exception counts as a user error when it is either:

    * a ``RecordParseError`` (a problem with the file itself), or
    * a ``requests`` exception that carries a response with a 4xx status code.

    The previous implementation required the exception to be a
    ``RecordParseError`` *and* a ``RequestException`` at the same time, and it
    relied on the truthiness of the response object, so it always returned
    False.

    :param e: the exception to classify
    :return: True for user errors as described above, False for anything else
    """
    if isinstance(e, RecordParseError):
        return True
    if not isinstance(e, requests.exceptions.RequestException):
        return False
    response = e.response
    # A requests.Response is falsy for 4xx/5xx statuses (its truth value is
    # `.ok`), so the presence check must be an explicit comparison to None.
    return response is not None and 400 <= response.status_code < 500
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Name of the "cloud" deployment mode.
# NOTE(review): not referenced in the visible part of this file - confirm where it is consumed.
CLOUD_DEPLOYMENT_MODE = "cloud"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class UnstructuredParser(FileTypeParser):
|
|
112
|
+
@property
def parser_max_n_files_for_schema_inference(self) -> Optional[int]:
    """
    Number of files to inspect for schema inference.

    The schema produced by this parser is static, so a single file is enough.
    """
    return 1
|
|
118
|
+
|
|
119
|
+
@property
def parser_max_n_files_for_parsability(self) -> Optional[int]:
    """
    Number of files to check for parsability.

    Zero: checking a file might be an expensive operation and does not give
    much confidence about whether the sync will succeed.
    """
    return 0
|
|
125
|
+
|
|
126
|
+
def get_parser_defined_primary_key(self, config: FileBasedStreamConfig) -> Optional[str]:
    """
    Return the name of the field used as primary key: ``document_key``.

    This pre-selects the document key column as the primary key when a
    connection is set up, making it easier for the user to configure
    normalization in the destination.

    :param config: stream configuration (not consulted; the key is fixed)
    :return: the constant string ``"document_key"``
    """
    return "document_key"
|
|
133
|
+
|
|
134
|
+
async def infer_schema(
    self,
    config: FileBasedStreamConfig,
    file: RemoteFile,
    stream_reader: AbstractFileBasedStreamReader,
    logger: logging.Logger,
) -> SchemaType:
    """
    Return the fixed schema emitted by this parser after validating the file type.

    The file is opened only to detect its type. When the detected type is not
    one of the supported types and ``skip_unprocessable_files`` is not set on
    the format, a parse error is raised; otherwise the same three-field schema
    is returned regardless of the file.

    :param config: stream configuration from which the format is extracted
    :param file: the remote file to inspect
    :param stream_reader: reader used to open ``file``
    :param logger: logger handed to the stream reader
    :return: mapping describing ``content``, ``document_key`` and
        ``_ab_source_file_parse_error`` as string fields
    """
    parser_format = _extract_format(config)
    with stream_reader.open_file(file, self.file_read_mode, None, logger) as handle:
        detected_type = self._get_filetype(handle, file)
        is_unsupported = detected_type not in self._supported_file_types()
        if is_unsupported and not parser_format.skip_unprocessable_files:
            error_message = self._get_file_type_error_message(detected_type)
            raise self._create_parse_error(file, error_message)

        content_field = {
            "type": "string",
            "description": "Content of the file as markdown. Might be null if the file could not be parsed",
        }
        document_key_field = {
            "type": "string",
            "description": "Unique identifier of the document, e.g. the file path",
        }
        parse_error_field = {
            "type": "string",
            "description": "Error message if the file could not be parsed even though the file is supported",
        }
        return {
            "content": content_field,
            "document_key": document_key_field,
            "_ab_source_file_parse_error": parse_error_field,
        }
|
|
164
|
+
|
|
165
|
+
def parse_records(
    self,
    config: FileBasedStreamConfig,
    file: RemoteFile,
    stream_reader: AbstractFileBasedStreamReader,
    logger: logging.Logger,
    discovered_schema: Optional[Mapping[str, SchemaType]],
) -> Iterable[Dict[str, Any]]:
    """
    Yield a single record holding the markdown rendering of the given file.

    The record has the keys "content", "document_key" (the file URI) and "_ab_source_file_parse_error".

    A RecordParseError means the file content is the problem (unsupported or corrupted file). If the skip_unprocessable_files flag is set, a warning is logged and the error is emitted as part of the record with a null content; otherwise the error is re-raised to fail the sync. Any other exception is logged and re-raised.

    discovered_schema is not used by this parser.
    """
    format = _extract_format(config)
    with stream_reader.open_file(file, self.file_read_mode, None, logger) as file_handle:
        try:
            markdown = self._read_file(file_handle, file, format, logger)
            yield {
                "content": markdown,
                "document_key": file.uri,
                "_ab_source_file_parse_error": None,
            }
        except RecordParseError as e:
            if format.skip_unprocessable_files:
                exception_str = str(e)
                # Logger.warn is a deprecated alias; Logger.warning is the supported spelling.
                logger.warning(f"File {file.uri} caused an error during parsing: {exception_str}.")
                yield {
                    "content": None,
                    "document_key": file.uri,
                    "_ab_source_file_parse_error": exception_str,
                }
                logger.warning(f"File {file.uri} cannot be parsed. Skipping it.")
            else:
                # bare raise keeps the original traceback untouched
                raise
        except Exception as e:
            exception_str = str(e)
            logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
            raise
|
|
201
|
+
|
|
202
|
+
def _read_file(
    self,
    file_handle: IOBase,
    remote_file: RemoteFile,
    format: UnstructuredFormat,
    logger: logging.Logger,
) -> str:
    """
    Turn the content of an opened file into a markdown string.

    Markdown and plain text files are decoded and returned as they are. Other supported types are converted either locally or through the API, depending on the processing mode of the format. An undetectable or unsupported file type raises a parse error.
    """
    _import_unstructured()
    partitioners = (
        unstructured_partition_pdf,
        unstructured_partition_docx,
        unstructured_partition_pptx,
    )
    if not all(partitioners):
        # check whether unstructured library is actually available for better error message
        raise Exception("unstructured library is not available")

    detected_type: FileType | None = self._get_filetype(file_handle, remote_file)

    is_supported = detected_type is not None and detected_type in self._supported_file_types()
    if not is_supported:
        error_message = self._get_file_type_error_message(detected_type)
        raise self._create_parse_error(remote_file, error_message)

    if detected_type in {FileType.MD, FileType.TXT}:
        raw_content: bytes = file_handle.read()
        text: str = optional_decode(raw_content)
        return text

    processing_mode = format.processing.mode
    if processing_mode == "local":
        return self._read_file_locally(
            file_handle,
            detected_type,
            format.strategy,
            remote_file,
        )
    if processing_mode == "api":
        try:
            return self._read_file_remotely_with_retries(
                file_handle,
                format.processing,
                detected_type,
                format.strategy,
                remote_file,
            )
        except RecordParseError:
            # If a parser error happens during remotely processing the file, this means the file is corrupted. This case is handled by the parse_records method, so just rethrow.
            raise
        except Exception as e:
            # For other exceptions, re-throw as config error so the sync is stopped as problems with the external API need to be resolved by the user and are not considered part of the SLA.
            # Once this parser leaves experimental stage, we should consider making this a system error instead for issues that might be transient.
            raise AirbyteTracedException.from_exception(
                e, failure_type=FailureType.config_error
            )
    # NOTE(review): any other processing mode falls through and returns None implicitly - presumably config validation rules that out; confirm.
|
|
257
|
+
|
|
258
|
+
def _params_to_dict(
|
|
259
|
+
self, params: Optional[List[APIParameterConfigModel]], strategy: str
|
|
260
|
+
) -> Dict[str, Union[str, List[str]]]:
|
|
261
|
+
result_dict: Dict[str, Union[str, List[str]]] = {"strategy": strategy}
|
|
262
|
+
if params is None:
|
|
263
|
+
return result_dict
|
|
264
|
+
for item in params:
|
|
265
|
+
key = item.name
|
|
266
|
+
value = item.value
|
|
267
|
+
if key in result_dict:
|
|
268
|
+
existing_value = result_dict[key]
|
|
269
|
+
# If the key already exists, append the new value to its list
|
|
270
|
+
if isinstance(existing_value, list):
|
|
271
|
+
existing_value.append(value)
|
|
272
|
+
else:
|
|
273
|
+
result_dict[key] = [existing_value, value]
|
|
274
|
+
else:
|
|
275
|
+
# If the key doesn't exist, add it to the dictionary
|
|
276
|
+
result_dict[key] = value
|
|
277
|
+
|
|
278
|
+
return result_dict
|
|
279
|
+
|
|
280
|
+
def check_config(self, config: FileBasedStreamConfig) -> Tuple[bool, Optional[str]]:
    """
    Perform a connection check for the parser config.

    Local processing only rejects the "hi_res" strategy; apart from that, implicit pydantic validation is enough.

    For API processing:
    - Verify that the API URL starts with https:// when running in a cloud environment.
    - Verify that the API can extract text from a small markdown test document.

    Returns a (success, error message) tuple; the message is None on success and the formatted traceback if the test request fails.
    """
    unstructured_format = _extract_format(config)
    if isinstance(unstructured_format.processing, LocalProcessingConfigModel):
        hi_res_requested = unstructured_format.strategy == "hi_res"
        if hi_res_requested:
            return False, "Hi-res strategy is not supported for local processing"
        return True, None

    if is_cloud_environment() and not unstructured_format.processing.api_url.startswith("https://"):
        return False, "Base URL must start with https://"

    try:
        probe_document = BytesIO(b"# Airbyte source connection test")
        self._read_file_remotely(
            probe_document,
            unstructured_format.processing,
            FileType.MD,
            "auto",
            RemoteFile(uri="test", last_modified=datetime.now()),
        )
    except Exception:
        return False, traceback.format_exc()
    else:
        return True, None
|
|
309
|
+
|
|
310
|
+
@backoff.on_exception(
    backoff.expo, requests.exceptions.RequestException, max_tries=5, giveup=user_error
)
def _read_file_remotely_with_retries(
    self,
    file_handle: IOBase,
    format: APIProcessingConfigModel,
    filetype: FileType,
    strategy: str,
    remote_file: RemoteFile,
) -> str:
    """
    Retrying variant of _read_file_remotely.

    A requests exception triggers up to 5 attempts with exponential backoff, unless the user_error predicate tells backoff to give up. This is useful for transient network errors or the API server being overloaded temporarily.
    """
    markdown = self._read_file_remotely(file_handle, format, filetype, strategy, remote_file)
    return markdown
|
|
325
|
+
|
|
326
|
+
def _read_file_remotely(
    self,
    file_handle: IOBase,
    format: APIProcessingConfigModel,
    filetype: FileType,
    strategy: str,
    remote_file: RemoteFile,
) -> str:
    """
    Send the file to the processing API and render the response as markdown.

    The file is posted as multipart upload to "{api_url}/general/v0/general" together with the configured parameters and the strategy.

    Raises a parse error for a 422 response (the API works but could not process the file) and a requests exception for any other error status.
    """
    headers = {"accept": "application/json", "unstructured-api-key": format.api_key}

    data = self._params_to_dict(format.parameters, strategy)

    # requests reads the upload from the current stream position. Rewind first, otherwise
    # a repeated call on the same handle (e.g. a retry) would upload an empty file.
    file_handle.seek(0)
    file_data = {"files": ("filename", file_handle, FILETYPE_TO_MIMETYPE[filetype])}

    response = requests.post(
        f"{format.api_url}/general/v0/general", headers=headers, data=data, files=file_data
    )

    if response.status_code == 422:
        # 422 means the file couldn't be processed, but the API is working. Treat this as a parsing error (passing an error record to the destination).
        raise self._create_parse_error(remote_file, response.json())

    # Other error statuses are raised as requests exceptions (retry everything except user errors)
    response.raise_for_status()

    json_response = response.json()

    return self._render_markdown(json_response)
|
|
354
|
+
|
|
355
|
+
def _read_file_locally(
    self, file_handle: IOBase, filetype: FileType, strategy: str, remote_file: RemoteFile
) -> str:
    """
    Convert a PDF, DOCX or PPTX file to markdown with the local unstructured library.

    The strategy is only passed on to the PDF partitioner.

    Raises a parse error if the file type has no local partitioner or if partitioning fails, and a plain Exception if the unstructured library is not available.
    """
    _import_unstructured()
    if (
        (not unstructured_partition_pdf)
        or (not unstructured_partition_docx)
        or (not unstructured_partition_pptx)
    ):
        # check whether unstructured library is actually available for better error message and to ensure proper typing (can't be None after this point)
        raise Exception("unstructured library is not available")

    if filetype not in (FileType.PDF, FileType.DOCX, FileType.PPTX):
        # Without this guard no branch below would assign `elements` and the method would die with an UnboundLocalError.
        raise self._create_parse_error(
            remote_file, f"File type {filetype!s} cannot be processed locally"
        )

    file: Any = file_handle

    # before the parsing logic is entered, the file is read completely to make sure it is in local memory
    file_handle.seek(0)
    file_handle.read()
    file_handle.seek(0)

    try:
        if filetype == FileType.PDF:
            # for PDF, read the file into a BytesIO object because some code paths in pdf parsing are doing an instance check on the file object and don't work with file-like objects
            file_handle.seek(0)
            with BytesIO(file_handle.read()) as file:
                file_handle.seek(0)
                elements = unstructured_partition_pdf(file=file, strategy=strategy)
        elif filetype == FileType.DOCX:
            elements = unstructured_partition_docx(file=file)
        else:
            # only PPTX is left after the guard above
            elements = unstructured_partition_pptx(file=file)
    except Exception as e:
        # chain explicitly so the partitioner failure stays visible as the cause
        raise self._create_parse_error(remote_file, str(e)) from e

    return self._render_markdown([element.to_dict() for element in elements])
|
|
389
|
+
|
|
390
|
+
def _create_parse_error(
    self,
    remote_file: RemoteFile,
    message: str,
) -> RecordParseError:
    """
    Build (without raising) a RecordParseError for the given file.

    The error carries the ERROR_PARSING_RECORD kind, the file URI as filename and the given message.
    """
    error_kind = FileBasedSourceError.ERROR_PARSING_RECORD
    parse_error = RecordParseError(error_kind, filename=remote_file.uri, message=message)
    return parse_error
|
|
398
|
+
|
|
399
|
+
def _get_filetype(self, file: IOBase, remote_file: RemoteFile) -> Optional[FileType]:
    """
    Detect the file type based on the mime type, the file name and the file content.

    The strategies are tried in this order and the first conclusive answer wins:
    1. The mime type of the remote file, if it is set and known
    2. detect_filetype on the file URI
    3. detect_filetype on the file content
    4. The extension of the file URI

    Returns None if none of them identifies the type. As a side effect, the "name" attribute of the file object (if any) is set to None.
    """
    mime_type = remote_file.mime_type
    if mime_type and mime_type in STR_TO_FILETYPE:
        return STR_TO_FILETYPE[mime_type]

    # set name to none, otherwise unstructured will try to get the modified date from the local file system
    if hasattr(file, "name"):
        file.name = None

    # detect_filetype works on either the file name or the file content;
    # the file name is tried first, the content only if the name is inconclusive
    type_from_name: FileType | None = None
    try:
        type_from_name = detect_filetype(
            filename=remote_file.uri,
        )
    except Exception:
        # Path doesn't exist locally. Try something else...
        pass

    if type_from_name and type_from_name != FileType.UNK:
        return type_from_name

    type_from_content = detect_filetype(file=file)
    file.seek(0)  # detect_filetype reads the file content, so the position has to be reset

    if type_from_content and type_from_content != FileType.UNK:
        return type_from_content

    # last resort: whatever follows the final dot of the URI
    extension = "." + remote_file.uri.rsplit(".", 1)[-1].lower()
    return EXT_TO_FILETYPE[extension] if extension in EXT_TO_FILETYPE else None
|
|
441
|
+
|
|
442
|
+
def _supported_file_types(self) -> List[Any]:
    """
    Return a new list with the file types this parser accepts: MD, PDF, DOCX, PPTX and TXT.
    """
    supported = [
        FileType.MD,
        FileType.PDF,
        FileType.DOCX,
        FileType.PPTX,
        FileType.TXT,
    ]
    return supported
|
|
444
|
+
|
|
445
|
+
def _get_file_type_error_message(
|
|
446
|
+
self,
|
|
447
|
+
file_type: FileType | None,
|
|
448
|
+
) -> str:
|
|
449
|
+
supported_file_types = ", ".join([str(type) for type in self._supported_file_types()])
|
|
450
|
+
return f"File type {file_type or 'None'!s} is not supported. Supported file types are {supported_file_types}"
|
|
451
|
+
|
|
452
|
+
def _render_markdown(self, elements: List[Any]) -> str:
|
|
453
|
+
return "\n\n".join((self._convert_to_markdown(el) for el in elements))
|
|
454
|
+
|
|
455
|
+
def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
    """
    Render a single element dictionary as markdown, depending on its "type".

    - "Title" becomes a heading whose number of "#" is taken from "metadata/category_depth" (1 if missing, falsy or not convertible)
    - "ListItem" becomes a "- " bullet
    - "Formula" is wrapped in a fenced code block
    - anything else is rendered as its plain text (empty string if there is none)
    """
    element_type = dpath.get(el, "type")

    if element_type == "Title":
        depth = dpath.get(el, "metadata/category_depth", default=1) or 1
        if not isinstance(depth, int):
            # strings and floats are converted, every other type falls back to a top level heading
            depth = int(depth) if isinstance(depth, (str, float)) else 1
        prefix = "#" * depth
        return f"{prefix} {dpath.get(el, 'text')}"

    if element_type == "ListItem":
        return f"- {dpath.get(el, 'text')}"

    if element_type == "Formula":
        return f"```\n{dpath.get(el, 'text')}\n```"

    return str(dpath.get(el, "text", default=""))
|
|
470
|
+
|
|
471
|
+
@property
def file_read_mode(self) -> FileReadMode:
    """
    Mode in which files are opened for this parser: always binary.
    """
    binary_mode = FileReadMode.READ_BINARY
    return binary_mode
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
def _extract_format(config: FileBasedStreamConfig) -> UnstructuredFormat:
    """
    Return the format of the stream config, making sure it is an UnstructuredFormat.

    Raises a ValueError for any other kind of format.
    """
    candidate = config.format
    if isinstance(candidate, UnstructuredFormat):
        return candidate
    raise ValueError(f"Invalid format config: {candidate}")
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from pydantic.v1 import BaseModel
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class RemoteFile(BaseModel):
    """
    A file in a file-based stream.

    Pydantic model describing a file by its location, its modification time and, optionally, its mime type.
    """

    # Identifier / location of the file (required).
    uri: str
    # Modification timestamp of the file (required).
    # NOTE(review): the model does not say whether this is naive or timezone-aware - confirm with the stream readers.
    last_modified: datetime
    # Mime type of the file, None when it is not provided.
    mime_type: Optional[str] = None
|