airbyte-cdk 0.0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +358 -0
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +236 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +104 -0
- airbyte_cdk/connector.py +123 -0
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/__init__.py +3 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +121 -0
- airbyte_cdk/connector_builder/main.py +107 -0
- airbyte_cdk/connector_builder/models.py +73 -0
- airbyte_cdk/connector_builder/test_reader/__init__.py +7 -0
- airbyte_cdk/connector_builder/test_reader/helpers.py +689 -0
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +173 -0
- airbyte_cdk/connector_builder/test_reader/reader.py +441 -0
- airbyte_cdk/connector_builder/test_reader/types.py +83 -0
- airbyte_cdk/destinations/__init__.py +8 -0
- airbyte_cdk/destinations/destination.py +154 -0
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/__init__.py +38 -0
- airbyte_cdk/destinations/vector_db_based/config.py +298 -0
- airbyte_cdk/destinations/vector_db_based/document_processor.py +223 -0
- airbyte_cdk/destinations/vector_db_based/embedder.py +303 -0
- airbyte_cdk/destinations/vector_db_based/indexer.py +78 -0
- airbyte_cdk/destinations/vector_db_based/test_utils.py +63 -0
- airbyte_cdk/destinations/vector_db_based/utils.py +35 -0
- airbyte_cdk/destinations/vector_db_based/writer.py +104 -0
- airbyte_cdk/entrypoint.py +414 -0
- airbyte_cdk/exception_handler.py +56 -0
- airbyte_cdk/logger.py +109 -0
- airbyte_cdk/models/__init__.py +72 -0
- airbyte_cdk/models/airbyte_protocol.py +88 -0
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/well_known_types.py +5 -0
- airbyte_cdk/py.typed +0 -0
- airbyte_cdk/sources/__init__.py +26 -0
- airbyte_cdk/sources/abstract_source.py +326 -0
- airbyte_cdk/sources/concurrent_source/__init__.py +8 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +255 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +165 -0
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +147 -0
- airbyte_cdk/sources/concurrent_source/partition_generation_completed_sentinel.py +24 -0
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +115 -0
- airbyte_cdk/sources/config.py +27 -0
- airbyte_cdk/sources/connector_state_manager.py +161 -0
- airbyte_cdk/sources/declarative/__init__.py +3 -0
- airbyte_cdk/sources/declarative/async_job/__init__.py +0 -0
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +525 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +79 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +8 -0
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +42 -0
- airbyte_cdk/sources/declarative/auth/jwt.py +197 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +293 -0
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +45 -0
- airbyte_cdk/sources/declarative/auth/token.py +267 -0
- airbyte_cdk/sources/declarative/auth/token_provider.py +82 -0
- airbyte_cdk/sources/declarative/checks/__init__.py +24 -0
- airbyte_cdk/sources/declarative/checks/check_dynamic_stream.py +61 -0
- airbyte_cdk/sources/declarative/checks/check_stream.py +56 -0
- airbyte_cdk/sources/declarative/checks/connection_checker.py +35 -0
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +526 -0
- airbyte_cdk/sources/declarative/datetime/__init__.py +3 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +65 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +118 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +3975 -0
- airbyte_cdk/sources/declarative/declarative_source.py +36 -0
- airbyte_cdk/sources/declarative/declarative_stream.py +241 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +33 -0
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +218 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +32 -0
- airbyte_cdk/sources/declarative/decoders/decoder_parser.py +30 -0
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +65 -0
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/decoders/zipfile_decoder.py +56 -0
- airbyte_cdk/sources/declarative/exceptions.py +9 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +21 -0
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +86 -0
- airbyte_cdk/sources/declarative/extractors/http_selector.py +37 -0
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +27 -0
- airbyte_cdk/sources/declarative/extractors/record_filter.py +91 -0
- airbyte_cdk/sources/declarative/extractors/record_selector.py +170 -0
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +176 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +37 -0
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +497 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +459 -0
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +357 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +380 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/__init__.py +9 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +139 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +66 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +56 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +52 -0
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +79 -0
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +34 -0
- airbyte_cdk/sources/declarative/interpolation/jinja.py +161 -0
- airbyte_cdk/sources/declarative/interpolation/macros.py +191 -0
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +421 -0
- airbyte_cdk/sources/declarative/migrations/__init__.py +0 -0
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +2 -0
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +2503 -0
- airbyte_cdk/sources/declarative/parsers/__init__.py +3 -0
- airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py +157 -0
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +21 -0
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +172 -0
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +213 -0
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +3407 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +29 -0
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +121 -0
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +63 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +437 -0
- airbyte_cdk/sources/declarative/requesters/README.md +56 -0
- airbyte_cdk/sources/declarative/requesters/__init__.py +9 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +45 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +41 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +70 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +77 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +101 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +147 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +17 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +179 -0
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +350 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +433 -0
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +21 -0
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +327 -0
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +76 -0
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +65 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +25 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +98 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +102 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +71 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +48 -0
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +66 -0
- airbyte_cdk/sources/declarative/requesters/request_option.py +117 -0
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +23 -0
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +92 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +59 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +68 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +119 -0
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +79 -0
- airbyte_cdk/sources/declarative/requesters/request_path.py +15 -0
- airbyte_cdk/sources/declarative/requesters/requester.py +144 -0
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +19 -0
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +124 -0
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +89 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +54 -0
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +702 -0
- airbyte_cdk/sources/declarative/schema/__init__.py +25 -0
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +47 -0
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +285 -0
- airbyte_cdk/sources/declarative/schema/inline_schema_loader.py +19 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +92 -0
- airbyte_cdk/sources/declarative/schema/schema_loader.py +17 -0
- airbyte_cdk/sources/declarative/spec/__init__.py +7 -0
- airbyte_cdk/sources/declarative/spec/spec.py +48 -0
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +7 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +93 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +25 -0
- airbyte_cdk/sources/declarative/transformations/__init__.py +17 -0
- airbyte_cdk/sources/declarative/transformations/add_fields.py +146 -0
- airbyte_cdk/sources/declarative/transformations/dpath_flatten_fields.py +61 -0
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +75 -0
- airbyte_cdk/sources/declarative/transformations/transformation.py +37 -0
- airbyte_cdk/sources/declarative/types.py +25 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +67 -0
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +11 -0
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +73 -0
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +149 -0
- airbyte_cdk/sources/file_based/config/__init__.py +0 -0
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +153 -0
- airbyte_cdk/sources/file_based/config/avro_format.py +25 -0
- airbyte_cdk/sources/file_based/config/csv_format.py +210 -0
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +99 -0
- airbyte_cdk/sources/file_based/config/jsonl_format.py +18 -0
- airbyte_cdk/sources/file_based/config/parquet_format.py +25 -0
- airbyte_cdk/sources/file_based/config/unstructured_format.py +102 -0
- airbyte_cdk/sources/file_based/config/validate_config_transfer_modes.py +81 -0
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +8 -0
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +21 -0
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +33 -0
- airbyte_cdk/sources/file_based/exceptions.py +159 -0
- airbyte_cdk/sources/file_based/file_based_source.py +466 -0
- airbyte_cdk/sources/file_based/file_based_stream_permissions_reader.py +123 -0
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +209 -0
- airbyte_cdk/sources/file_based/file_record_data.py +22 -0
- airbyte_cdk/sources/file_based/file_types/__init__.py +37 -0
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +233 -0
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +527 -0
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +30 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +86 -0
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +145 -0
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +275 -0
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +480 -0
- airbyte_cdk/sources/file_based/remote_file.py +18 -0
- airbyte_cdk/sources/file_based/schema_helpers.py +281 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +17 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +20 -0
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +52 -0
- airbyte_cdk/sources/file_based/stream/__init__.py +13 -0
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +197 -0
- airbyte_cdk/sources/file_based/stream/concurrent/__init__.py +0 -0
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +343 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +9 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +59 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +313 -0
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +83 -0
- airbyte_cdk/sources/file_based/stream/cursor/__init__.py +4 -0
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +66 -0
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +149 -0
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +396 -0
- airbyte_cdk/sources/file_based/stream/identities_stream.py +49 -0
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +92 -0
- airbyte_cdk/sources/file_based/types.py +10 -0
- airbyte_cdk/sources/http_config.py +10 -0
- airbyte_cdk/sources/http_logger.py +55 -0
- airbyte_cdk/sources/message/__init__.py +19 -0
- airbyte_cdk/sources/message/repository.py +137 -0
- airbyte_cdk/sources/source.py +95 -0
- airbyte_cdk/sources/specs/transfer_modes.py +26 -0
- airbyte_cdk/sources/streams/__init__.py +8 -0
- airbyte_cdk/sources/streams/availability_strategy.py +84 -0
- airbyte_cdk/sources/streams/call_rate.py +704 -0
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/streams/checkpoint/cursor.py +77 -0
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +96 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream_facade.py +37 -0
- airbyte_cdk/sources/streams/concurrent/adapters.py +397 -0
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +94 -0
- airbyte_cdk/sources/streams/concurrent/clamping.py +99 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +481 -0
- airbyte_cdk/sources/streams/concurrent/cursor_types.py +32 -0
- airbyte_cdk/sources/streams/concurrent/default_stream.py +102 -0
- airbyte_cdk/sources/streams/concurrent/exceptions.py +18 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +42 -0
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +64 -0
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +45 -0
- airbyte_cdk/sources/streams/concurrent/partitions/__init__.py +3 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +48 -0
- airbyte_cdk/sources/streams/concurrent/partitions/partition_generator.py +18 -0
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +38 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/__init__.py +0 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +182 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +223 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/incrementing_count_stream_state_converter.py +92 -0
- airbyte_cdk/sources/streams/core.py +703 -0
- airbyte_cdk/sources/streams/http/__init__.py +10 -0
- airbyte_cdk/sources/streams/http/availability_strategy.py +54 -0
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +61 -0
- airbyte_cdk/sources/streams/http/http.py +673 -0
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +158 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/__init__.py +14 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +479 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +34 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +436 -0
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +83 -0
- airbyte_cdk/sources/streams/permissions/identities_stream.py +75 -0
- airbyte_cdk/sources/streams/utils/__init__.py +3 -0
- airbyte_cdk/sources/types.py +169 -0
- airbyte_cdk/sources/utils/__init__.py +7 -0
- airbyte_cdk/sources/utils/casing.py +12 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +53 -0
- airbyte_cdk/sources/utils/schema_helpers.py +230 -0
- airbyte_cdk/sources/utils/slice_logger.py +57 -0
- airbyte_cdk/sources/utils/transform.py +277 -0
- airbyte_cdk/sources/utils/types.py +7 -0
- airbyte_cdk/sql/__init__.py +0 -0
- airbyte_cdk/sql/_util/__init__.py +0 -0
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/__init__.py +7 -0
- airbyte_cdk/test/catalog_builder.py +81 -0
- airbyte_cdk/test/entrypoint_wrapper.py +250 -0
- airbyte_cdk/test/mock_http/__init__.py +6 -0
- airbyte_cdk/test/mock_http/matcher.py +41 -0
- airbyte_cdk/test/mock_http/mocker.py +185 -0
- airbyte_cdk/test/mock_http/request.py +103 -0
- airbyte_cdk/test/mock_http/response.py +28 -0
- airbyte_cdk/test/mock_http/response_builder.py +237 -0
- airbyte_cdk/test/state_builder.py +33 -0
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +59 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +10 -0
- airbyte_cdk/utils/airbyte_secrets_utils.py +80 -0
- airbyte_cdk/utils/analytics_message.py +25 -0
- airbyte_cdk/utils/constants.py +5 -0
- airbyte_cdk/utils/datetime_format_inferrer.py +94 -0
- airbyte_cdk/utils/datetime_helpers.py +499 -0
- airbyte_cdk/utils/event_timing.py +85 -0
- airbyte_cdk/utils/is_cloud_environment.py +18 -0
- airbyte_cdk/utils/mapping_helpers.py +162 -0
- airbyte_cdk/utils/message_utils.py +26 -0
- airbyte_cdk/utils/oneof_option_config.py +33 -0
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +270 -0
- airbyte_cdk/utils/slice_hasher.py +37 -0
- airbyte_cdk/utils/spec_schema_transformations.py +26 -0
- airbyte_cdk/utils/stream_status_utils.py +43 -0
- airbyte_cdk/utils/traced_exception.py +145 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE.txt +19 -0
- airbyte_cdk-0.0.0.dev0.dist-info/LICENSE_SHORT +1 -0
- airbyte_cdk-0.0.0.dev0.dist-info/METADATA +111 -0
- airbyte_cdk-0.0.0.dev0.dist-info/RECORD +368 -0
- airbyte_cdk-0.0.0.dev0.dist-info/WHEEL +4 -0
- airbyte_cdk-0.0.0.dev0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import datetime, timedelta
|
|
7
|
+
from threading import RLock
|
|
8
|
+
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, MutableMapping, Optional, Tuple
|
|
9
|
+
|
|
10
|
+
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type
|
|
11
|
+
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
|
12
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
|
13
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
14
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import (
|
|
15
|
+
AbstractConcurrentFileBasedCursor,
|
|
16
|
+
)
|
|
17
|
+
from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor
|
|
18
|
+
from airbyte_cdk.sources.file_based.types import StreamState
|
|
19
|
+
from airbyte_cdk.sources.message.repository import MessageRepository
|
|
20
|
+
from airbyte_cdk.sources.streams.concurrent.cursor import CursorField
|
|
21
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
|
22
|
+
from airbyte_cdk.sources.types import Record
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
|
|
26
|
+
|
|
27
|
+
_NULL_FILE = ""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
31
|
+
CURSOR_FIELD = "_ab_source_file_last_modified"
|
|
32
|
+
DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = (
|
|
33
|
+
DefaultFileBasedCursor.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
|
|
34
|
+
)
|
|
35
|
+
DEFAULT_MAX_HISTORY_SIZE = 10_000
|
|
36
|
+
DATE_TIME_FORMAT = DefaultFileBasedCursor.DATE_TIME_FORMAT
|
|
37
|
+
zero_value = datetime.min
|
|
38
|
+
zero_cursor_value = f"0001-01-01T00:00:00.000000Z_{_NULL_FILE}"
|
|
39
|
+
|
|
40
|
+
def __init__(
    self,
    stream_config: FileBasedStreamConfig,
    stream_name: str,
    stream_namespace: Optional[str],
    stream_state: MutableMapping[str, Any],
    message_repository: MessageRepository,
    connector_state_manager: ConnectorStateManager,
    cursor_field: CursorField,
) -> None:
    """
    Initialise the cursor from the stream configuration and the saved stream state.

    :param stream_config: stream configuration; only ``days_to_sync_if_history_is_full`` is read here
    :param stream_name: stream name handed to the state manager when state is updated
    :param stream_namespace: stream namespace handed to the state manager when state is updated
    :param stream_state: saved state; its ``"history"`` entry, when present, seeds the file history
    :param message_repository: repository used to emit messages
    :param connector_state_manager: manager that receives the updated stream state
    :param cursor_field: cursor field whose ``cursor_field_key`` is looked up in the saved state
    """
    super().__init__()

    # Identifiers and collaborators.
    self._stream_name = stream_name
    self._stream_namespace = stream_namespace
    self._state = stream_state
    self._message_repository = message_repository
    self._connector_state_manager = connector_state_manager
    self._cursor_field = cursor_field

    # Fall back to the class default when the config value is falsy (None or 0).
    days_to_sync = (
        stream_config.days_to_sync_if_history_is_full
        or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
    )
    self._time_window_if_history_is_full = timedelta(days=days_to_sync)

    # Re-entrant locks guarding the history/state and the pending files.
    self._state_lock = RLock()
    self._pending_files_lock = RLock()
    # Stays None until set_pending_partitions() is called.
    self._pending_files: Optional[Dict[str, RemoteFile]] = None

    if stream_state:
        self._file_to_datetime_history = stream_state.get("history", {})
    else:
        self._file_to_datetime_history = {}

    # Computed last: the previous-cursor computation reads the cursor field, the lock and the history set above.
    self._prev_cursor_value = self._compute_prev_sync_cursor(stream_state)
    self._sync_start = self._compute_start_time()
|
|
67
|
+
|
|
68
|
+
@property
def state(self) -> MutableMapping[str, Any]:
    """Return the stream state mapping held by this cursor (``self._state``)."""
    current_state = self._state
    return current_state
|
|
71
|
+
|
|
72
|
+
def observe(self, record: Record) -> None:
    """No-op: the record is ignored and no cursor state is changed."""
    return None
|
|
74
|
+
|
|
75
|
+
def close_partition(self, partition: Partition) -> None:
    """
    Verify that the pending files were initialised; the partition itself is not inspected.

    :param partition: the partition being closed (unused)
    :raises RuntimeError: if the pending files are still ``None``
    """
    with self._pending_files_lock:
        if self._pending_files is not None:
            return
        raise RuntimeError(
            "Expected pending partitions to be set but it was not. This is unexpected. Please contact Support."
        )
|
|
81
|
+
|
|
82
|
+
def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
    """
    Register every file contained in the given partitions as pending.

    Any previously registered pending files are discarded first. Partitions whose
    slice is ``None`` are skipped.

    :param partitions: partitions whose slices expose their files under the ``"files"`` key
    :raises RuntimeError: if the same file URI is encountered more than once
    """
    with self._pending_files_lock:
        self._pending_files = {}
        for partition in partitions:
            _slice = partition.to_slice()
            if _slice is None:
                continue
            for file in _slice["files"]:
                if file.uri in self._pending_files:
                    # Name the duplicated file rather than dumping the whole slice.
                    raise RuntimeError(
                        f"Already found file {file.uri} in pending files. This is unexpected. Please contact Support."
                    )
                self._pending_files[file.uri] = file
|
|
95
|
+
|
|
96
|
+
def _compute_prev_sync_cursor(self, value: Optional[StreamState]) -> Tuple[datetime, str]:
    """
    Derive the ``(last_modified, uri)`` cursor to resume from out of the saved state.

    :param value: the saved stream state, possibly empty or ``None``
    :return: the zero datetime and an empty uri when there is no state; otherwise the smaller of
        the saved cursor and the earliest history entry, split into its datetime and uri parts
    """
    if not value:
        return self.zero_value, ""

    saved_cursor = value.get(self._cursor_field.cursor_field_key)
    if not saved_cursor:
        saved_cursor = self.zero_cursor_value

    # A saved cursor greater than the earliest file in history suggests everything was likely synced.
    # The earliest history entry is still used as the cursor when deciding which files to sync, in
    # case new files were uploaded in the meantime. That should be rare: it would mean a file with
    # an earlier last_modified was uploaded after one with a later last_modified. Since last_modified
    # represents the time the upload started, earlier files have usually been uploaded already; if
    # so, they are in the history and will not be synced again.
    earliest_in_history = self._compute_earliest_file_in_history()
    earliest_cursor = self._get_cursor_key_from_file(earliest_in_history)

    effective_cursor = min(saved_cursor, earliest_cursor)
    # Split on the first underscore only, so underscores inside the uri are preserved.
    parts = effective_cursor.split("_", 1)
    timestamp_part, uri_part = parts
    parsed_timestamp = datetime.strptime(timestamp_part, self.DATE_TIME_FORMAT)
    return parsed_timestamp, uri_part
|
|
114
|
+
|
|
115
|
+
def _get_cursor_key_from_file(self, file: Optional[RemoteFile]) -> str:
    """
    Build the cursor key ``"<last_modified>_<uri>"`` for a file.

    :param file: the file to build a key for, or a falsy value
    :return: the formatted key, or ``zero_cursor_value`` when no file is given
    """
    if not file:
        return self.zero_cursor_value
    formatted_timestamp = file.last_modified.strftime(self.DATE_TIME_FORMAT)
    return f"{formatted_timestamp}_{file.uri}"
|
|
119
|
+
|
|
120
|
+
def _compute_earliest_file_in_history(self) -> Optional[RemoteFile]:
    """
    Find the history entry with the smallest ``(last_modified, uri)`` pair.

    :return: a ``RemoteFile`` built from that entry, or ``None`` when the history is empty
    """
    with self._state_lock:
        history = self._file_to_datetime_history
        if not history:
            return None
        # Order by the stored timestamp string first, then by uri to break ties.
        earliest_uri = min(history, key=lambda uri: (history[uri], uri))
        earliest_modified = datetime.strptime(history[earliest_uri], self.DATE_TIME_FORMAT)
        return RemoteFile(uri=earliest_uri, last_modified=earliest_modified)
|
|
132
|
+
|
|
133
|
+
def add_file(self, file: RemoteFile) -> None:
    """
    Record a processed file in the cursor and emit the resulting state.

    The file is removed from the pending files (a warning is emitted if it was not pending), its
    last-modified timestamp is stored in the history, and the oldest history entry is evicted once the
    history grows beyond DEFAULT_MAX_HISTORY_SIZE.

    :param file: The file that was processed by the stream
    :raises RuntimeError: If the pending files have not been set
    :raises Exception: If the history is over capacity but no oldest entry can be found
    """
    if self._pending_files is None:
        raise RuntimeError(
            "Expected pending partitions to be set but it was not. This is unexpected. Please contact Support."
        )
    # Both locks are taken in the same order as the original: pending files first, then state.
    with self._pending_files_lock, self._state_lock:
        if file.uri in self._pending_files:
            self._pending_files.pop(file.uri)
        else:
            warning = AirbyteLogMessage(
                level=Level.WARN,
                message=f"The file {file.uri} was not found in the list of pending files. This is unexpected. Please contact Support",
            )
            self._message_repository.emit_message(AirbyteMessage(type=Type.LOG, log=warning))

        # The file is recorded in the history whether or not it was pending.
        history = self._file_to_datetime_history
        history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT)
        if len(history) > self.DEFAULT_MAX_HISTORY_SIZE:
            # Drop the earliest file based on its last modified date and its uri
            oldest_file = self._compute_earliest_file_in_history()
            if not oldest_file:
                raise Exception(
                    "The history is full but there is no files in the history. This should never happen and might be indicative of a bug in the CDK."
                )
            del history[oldest_file.uri]
        self.emit_state_message()
|
|
169
|
+
|
|
170
|
+
def emit_state_message(self) -> None:
    """
    Publish the current state of the cursor.

    The state returned by get_state() is stored in the connector state manager for this stream, and
    the state message the manager creates for the stream is emitted through the message repository.
    """
    with self._state_lock:
        current_state = self.get_state()
        manager = self._connector_state_manager
        manager.update_state_for_stream(self._stream_name, self._stream_namespace, current_state)
        message = manager.create_state_message(self._stream_name, self._stream_namespace)
        self._message_repository.emit_message(message)
|
|
182
|
+
|
|
183
|
+
def _get_new_cursor_value(self) -> str:
    """
    Compute the cursor value to store in the state.

    :return: The cursor key of the earliest pending file while files are pending, the cursor key of the
        latest file in history when nothing is pending, or the formatted zero value followed by "_" when
        there are neither pending files nor history
    """
    with self._pending_files_lock, self._state_lock:
        if self._pending_files:
            # If there are partitions that haven't been synced, we don't know whether the files that
            # have been synced represent a contiguous region. To avoid missing files, the cursor only
            # advances up to the oldest pending file, because all older files have been synced.
            reference_file = self._compute_earliest_pending_file()
        elif self._file_to_datetime_history:
            # If all partitions have been synced, the sync is up-to-date and the cursor can advance
            # to the newest file in history.
            reference_file = self._compute_latest_file_in_history()
        else:
            return f"{self.zero_value.strftime(self.DATE_TIME_FORMAT)}_"
        return self._get_cursor_key_from_file(reference_file)
|
|
198
|
+
|
|
199
|
+
def _compute_earliest_pending_file(self) -> Optional[RemoteFile]:
    """
    Find the pending file with the smallest cursor position.

    Pending files are ordered by last-modified time first and by uri second. The uri tie-break matches
    the ordering used for the history and the "<timestamp>_<uri>" layout of the cursor string, so when
    several pending files share a timestamp the result does not depend on insertion order.

    :return: The earliest pending file, or None when there are no pending files
    """
    if not self._pending_files:
        return None
    return min(
        self._pending_files.values(),
        key=lambda pending: (pending.last_modified, pending.uri),
    )
|
|
204
|
+
|
|
205
|
+
def _compute_latest_file_in_history(self) -> Optional[RemoteFile]:
    """
    Find the newest entry of the history.

    Entries are ordered by their last-modified timestamp string first and by their uri second.

    :return: A RemoteFile rebuilt from the newest history entry, or None when the history is empty
    """
    with self._state_lock:
        history = self._file_to_datetime_history
        if not history:
            return None
        uri, timestamp = max(history.items(), key=lambda entry: (entry[1], entry[0]))
        return RemoteFile(
            uri=uri,
            last_modified=datetime.strptime(timestamp, self.DATE_TIME_FORMAT),
        )
|
|
217
|
+
|
|
218
|
+
def get_files_to_sync(
    self, all_files: Iterable[RemoteFile], logger: logging.Logger
) -> Iterable[RemoteFile]:
    """
    Lazily select the files of the source that need to be synced.

    This is a generator: the state lock is acquired when iteration starts and is held while files are
    being yielded. A warning is logged once, before any file is yielded, if the history is full.

    :param all_files: All files in the source
    :param logger: Logger used for the "history is full" warning and passed on to the per-file check
    :return: The files for which _should_sync_file returns True, in their original order
    """
    with self._state_lock:
        if self._is_history_full():
            history_full_warning = (
                f"The state history is full. "
                f"This sync and future syncs won't be able to use the history to filter out duplicate files. "
                f"It will instead use the time window of {self._time_window_if_history_is_full} to filter out files."
            )
            logger.warning(history_full_warning)
        yield from (
            candidate for candidate in all_files if self._should_sync_file(candidate, logger)
        )
|
|
237
|
+
|
|
238
|
+
def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
    """
    Decide whether a single file needs to be synced.

    :param file: The file to check
    :param logger: Not used by this implementation; warnings go through the message repository
    :return: True if the file should be synced, False otherwise
    """
    with self._state_lock:
        if file.uri in self._file_to_datetime_history:
            # The file was synced before: sync it again only if it has been modified since then.
            synced_at = datetime.strptime(
                self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT
            )
            if file.last_modified < synced_at:
                warning = AirbyteLogMessage(
                    level=Level.WARN,
                    message=f"The file {file.uri}'s last modified date is older than the last time it was synced. This is unexpected. Skipping the file.",
                )
                self._message_repository.emit_message(AirbyteMessage(type=Type.LOG, log=warning))
                return False
            return file.last_modified > synced_at

        cursor_timestamp, cursor_uri = self._prev_cursor_value
        if not self._is_history_full():
            # The file is not in the history and the history is complete. We know we need to sync the file
            return True

        # From here on the history is partial, so the previous cursor decides.
        if file.last_modified > cursor_timestamp:
            # Strictly newer than the cursor: sync it.
            return True
        if file.last_modified == cursor_timestamp:
            # Same timestamp as the cursor: sync it only if its uri is strictly greater than the cursor uri.
            return file.uri > cursor_uri
        # Older than the cursor: sync it only if it is not older than the start of the sync window.
        return file.last_modified >= self._sync_start
|
|
273
|
+
|
|
274
|
+
def _is_history_full(self) -> bool:
    """
    Tell whether the history has reached DEFAULT_MAX_HISTORY_SIZE entries, meaning new entries will
    start to replace old entries.

    :return: True if the history holds at least DEFAULT_MAX_HISTORY_SIZE entries
    :raises RuntimeError: If the history object is None
    """
    with self._state_lock:
        history = self._file_to_datetime_history
        if history is None:
            raise RuntimeError(
                "The history object has not been set. This is unexpected. Please contact Support."
            )
        return len(history) >= self.DEFAULT_MAX_HISTORY_SIZE
|
|
284
|
+
|
|
285
|
+
def _compute_start_time(self) -> datetime:
    """
    Compute the start time of the sync from the history.

    :return: datetime.min when the history is empty; otherwise the earliest timestamp in the history,
        lowered to (now - time window) if the history is full and that moment is earlier
    """
    if not self._file_to_datetime_history:
        return datetime.min

    oldest_timestamp = min(self._file_to_datetime_history.values())
    start = datetime.strptime(oldest_timestamp, self.DATE_TIME_FORMAT)
    if self._is_history_full():
        window_start = datetime.now() - self._time_window_if_history_is_full
        start = min(start, window_start)
    return start
|
|
295
|
+
|
|
296
|
+
def get_start_time(self) -> datetime:
    """
    Return the sync start time stored on the cursor.
    """
    start_time = self._sync_start
    return start_time
|
|
298
|
+
|
|
299
|
+
def get_state(self) -> MutableMapping[str, Any]:
    """
    Get the state of the cursor.

    :return: A mapping holding the history under "history" and the new cursor value under the cursor
        field key
    """
    with self._state_lock:
        state: MutableMapping[str, Any] = {"history": self._file_to_datetime_history}
        state[self._cursor_field.cursor_field_key] = self._get_new_cursor_value()
        return state
|
|
308
|
+
|
|
309
|
+
def set_initial_state(self, value: StreamState) -> None:
    """
    No-op: this implementation ignores the given state.

    :param value: The stream state (unused)
    """
    return None
|
|
311
|
+
|
|
312
|
+
def ensure_at_least_one_state_emitted(self) -> None:
    """
    Emit a state message by delegating to emit_state_message().
    """
    self.emit_state_message()
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from typing import TYPE_CHECKING, Any, Iterable, List, MutableMapping, Optional
|
|
8
|
+
|
|
9
|
+
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
|
10
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
|
11
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
12
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import (
|
|
13
|
+
AbstractConcurrentFileBasedCursor,
|
|
14
|
+
)
|
|
15
|
+
from airbyte_cdk.sources.file_based.types import StreamState
|
|
16
|
+
from airbyte_cdk.sources.message import MessageRepository
|
|
17
|
+
from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY
|
|
18
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
|
19
|
+
from airbyte_cdk.sources.types import Record
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamPartition
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class FileBasedFinalStateCursor(AbstractConcurrentFileBasedCursor):
    """Cursor that is used to guarantee at least one state message is emitted for a concurrent file-based stream."""

    def __init__(
        self,
        stream_config: FileBasedStreamConfig,
        message_repository: MessageRepository,
        stream_namespace: Optional[str],
        **kwargs: Any,
    ):
        """
        :param stream_config: Stream configuration; only its name is read
        :param message_repository: Repository the final state message is emitted through
        :param stream_namespace: Namespace of the stream, if any
        :param kwargs: Accepted and ignored
        """
        self._message_repository = message_repository
        self._stream_name = stream_config.name
        self._stream_namespace = stream_namespace
        # The connector state manager normally operates at the source level. Here it is only needed to
        # write the sentinel state message, not to manage overall source state. This is temporary as we
        # move to the resumable full refresh world where every stream uses a FileBasedConcurrentCursor
        # with incremental state.
        self._connector_state_manager = ConnectorStateManager()

    @property
    def state(self) -> MutableMapping[str, Any]:
        """Sentinel state: a mapping with NO_CURSOR_STATE_KEY set to True."""
        return {NO_CURSOR_STATE_KEY: True}

    def observe(self, record: Record) -> None:
        """No-op: records are not tracked by this cursor."""

    def close_partition(self, partition: Partition) -> None:
        """No-op: closing a partition does not change this cursor."""

    def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
        """No-op: pending partitions are not tracked by this cursor."""

    def add_file(self, file: RemoteFile) -> None:
        """No-op: processed files are not tracked by this cursor."""

    def get_files_to_sync(
        self, all_files: Iterable[RemoteFile], logger: logging.Logger
    ) -> Iterable[RemoteFile]:
        """
        Return every file unfiltered.

        :param all_files: All files in the source
        :param logger: Unused
        :return: The same iterable that was passed in
        """
        return all_files

    def get_state(self) -> MutableMapping[str, Any]:
        """Return an empty mapping."""
        return {}

    def set_initial_state(self, value: StreamState) -> None:
        """No-op: the given state is ignored."""

    def get_start_time(self) -> datetime:
        """Return datetime.min."""
        return datetime.min

    def emit_state_message(self) -> None:
        """No-op: the only state message is emitted by ensure_at_least_one_state_emitted()."""

    def ensure_at_least_one_state_emitted(self) -> None:
        """
        Store the sentinel state for this stream in the connector state manager and emit the state
        message the manager creates for the stream.
        """
        manager = self._connector_state_manager
        manager.update_state_for_stream(self._stream_name, self._stream_namespace, self.state)
        final_state_message = manager.create_state_message(
            self._stream_name, self._stream_namespace
        )
        self._message_repository.emit_message(final_state_message)
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from typing import Any, Iterable, MutableMapping
|
|
9
|
+
|
|
10
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
|
11
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
12
|
+
from airbyte_cdk.sources.file_based.types import StreamState
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AbstractFileBasedCursor(ABC):
    """
    Interface implemented by every cursor used by file-based streams.
    """

    @abstractmethod
    def __init__(self, stream_config: FileBasedStreamConfig, **kwargs: Any):
        """
        Constructor signature shared by all cursors.

        :param stream_config: The stream configuration
        :param kwargs: Additional keyword arguments accepted by concrete cursors
        """

    @abstractmethod
    def add_file(self, file: RemoteFile) -> None:
        """
        Register a file with the cursor. Called when a file is processed by the stream.

        :param file: The file to add
        """

    @abstractmethod
    def set_initial_state(self, value: StreamState) -> None:
        """
        Provide the cursor with its initial state. This cannot happen at construction time because the
        stream doesn't know its state yet.

        :param value: The stream state
        """

    @abstractmethod
    def get_state(self) -> MutableMapping[str, Any]:
        """
        Return the state of the cursor.
        """

    @abstractmethod
    def get_start_time(self) -> datetime:
        """
        Return the start time of the current sync.
        """

    @abstractmethod
    def get_files_to_sync(
        self, all_files: Iterable[RemoteFile], logger: logging.Logger
    ) -> Iterable[RemoteFile]:
        """
        Select, among the files in the source, those that should be synced.

        :param all_files: All files in the source
        :param logger: Logger available to the implementation
        :return: The files that should be synced
        """
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
|
3
|
+
#
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import datetime, timedelta
|
|
7
|
+
from typing import Any, Iterable, MutableMapping, Optional
|
|
8
|
+
|
|
9
|
+
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
|
10
|
+
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
|
11
|
+
from airbyte_cdk.sources.file_based.stream.cursor.abstract_file_based_cursor import (
|
|
12
|
+
AbstractFileBasedCursor,
|
|
13
|
+
)
|
|
14
|
+
from airbyte_cdk.sources.file_based.types import StreamState
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DefaultFileBasedCursor(AbstractFileBasedCursor):
    """
    Cursor that remembers synced files in a bounded history.

    The history maps each file uri to its last-modified timestamp formatted with DATE_TIME_FORMAT. It
    holds at most DEFAULT_MAX_HISTORY_SIZE entries; once that size is exceeded, the oldest entry is evicted.
    """

    DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = 3
    DEFAULT_MAX_HISTORY_SIZE = 10_000
    DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
    CURSOR_FIELD = "_ab_source_file_last_modified"

    def __init__(self, stream_config: FileBasedStreamConfig, **_: Any):
        """
        :param stream_config: Stream configuration; days_to_sync_if_history_is_full is read from it and
            falls back to DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL when falsy
        :raises ValueError: If the resulting time window is not strictly positive
        """
        super().__init__(stream_config)  # type: ignore [safe-super]
        self._file_to_datetime_history: MutableMapping[str, str] = {}
        days_to_sync = (
            stream_config.days_to_sync_if_history_is_full
            or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
        )
        self._time_window_if_history_is_full = timedelta(days=days_to_sync)

        if self._time_window_if_history_is_full <= timedelta():
            raise ValueError(
                f"days_to_sync_if_history_is_full must be a positive timedelta, got {self._time_window_if_history_is_full}"
            )

        self._start_time = self._compute_start_time()
        self._initial_earliest_file_in_history: Optional[RemoteFile] = None

    def set_initial_state(self, value: StreamState) -> None:
        """
        Load the history from the given state and recompute the values derived from it.

        :param value: The stream state; its "history" entry (default: empty) becomes the history
        """
        self._file_to_datetime_history = value.get("history", {})
        self._start_time = self._compute_start_time()
        self._initial_earliest_file_in_history = self._compute_earliest_file_in_history()

    def add_file(self, file: RemoteFile) -> None:
        """
        Record a file in the history, evicting the oldest entry if the history becomes too large.

        :param file: The file to add
        :raises Exception: If the history is over capacity but no oldest entry can be found
        """
        history = self._file_to_datetime_history
        history[file.uri] = file.last_modified.strftime(self.DATE_TIME_FORMAT)
        if len(history) <= self.DEFAULT_MAX_HISTORY_SIZE:
            return
        # Get the earliest file based on its last modified date and its uri
        oldest_file = self._compute_earliest_file_in_history()
        if not oldest_file:
            raise Exception(
                "The history is full but there is no files in the history. This should never happen and might be indicative of a bug in the CDK."
            )
        del history[oldest_file.uri]

    def get_state(self) -> StreamState:
        """
        Return the state of the cursor: the history under "history" and the cursor under CURSOR_FIELD.
        """
        return {"history": self._file_to_datetime_history, self.CURSOR_FIELD: self._get_cursor()}

    def _get_cursor(self) -> Optional[str]:
        """
        Returns the cursor value.

        Files are synced in order of last-modified with secondary sort on filename, so the cursor value is
        a string joining the last-modified timestamp of the last synced file and the name of the file.

        :return: "<timestamp>_<filename>" of the newest history entry, or None when the history is empty
        """
        if not self._file_to_datetime_history.items():
            return None
        filename, timestamp = max(
            self._file_to_datetime_history.items(), key=lambda entry: (entry[1], entry[0])
        )
        return f"{timestamp}_{filename}"

    def _is_history_full(self) -> bool:
        """
        Returns true if the state's history is full, meaning new entries will start to replace old entries.
        """
        current_size = len(self._file_to_datetime_history)
        return current_size >= self.DEFAULT_MAX_HISTORY_SIZE

    def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
        """
        Decide whether a single file needs to be synced.

        :param file: The file to check
        :param logger: Logger used to warn when a file is older than its history entry
        :return: True if the file should be synced, False otherwise
        """
        if file.uri in self._file_to_datetime_history:
            # The file was synced before: sync it again only if it has been modified since then.
            synced_at = datetime.strptime(
                self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT
            )
            if file.last_modified < synced_at:
                logger.warning(
                    f"The file {file.uri}'s last modified date is older than the last time it was synced. This is unexpected. Skipping the file."
                )
            return file.last_modified > synced_at

        if not self._is_history_full():
            # The file is not in the history and the history is complete. We know we need to sync the file
            return True

        earliest = self._initial_earliest_file_in_history
        if earliest is None:
            return True
        if file.last_modified > earliest.last_modified:
            # The history is partial and the file is strictly newer than the earliest file in the history.
            return True
        if file.last_modified == earliest.last_modified:
            # Same timestamp as the earliest file in the history: sync only if the uri is strictly greater.
            return file.uri > earliest.uri
        # Otherwise, only sync the file if it has been modified since the start of the time window
        return file.last_modified >= self.get_start_time()

    def get_files_to_sync(
        self, all_files: Iterable[RemoteFile], logger: logging.Logger
    ) -> Iterable[RemoteFile]:
        """
        Lazily select the files of the source that need to be synced.

        :param all_files: All files in the source
        :param logger: Logger used for warnings
        :return: The files for which _should_sync_file returns True, in their original order
        """
        if self._is_history_full():
            history_full_warning = (
                f"The state history is full. "
                f"This sync and future syncs won't be able to use the history to filter out duplicate files. "
                f"It will instead use the time window of {self._time_window_if_history_is_full} to filter out files."
            )
            logger.warning(history_full_warning)
        yield from (
            candidate for candidate in all_files if self._should_sync_file(candidate, logger)
        )

    def get_start_time(self) -> datetime:
        """
        Return the start time computed from the history.
        """
        return self._start_time

    def _compute_earliest_file_in_history(self) -> Optional[RemoteFile]:
        """
        Find the oldest history entry, ordering by timestamp string first and by uri second.

        :return: A RemoteFile rebuilt from the oldest entry, or None when the history is empty
        """
        if not self._file_to_datetime_history:
            return None
        filename, last_modified = min(
            self._file_to_datetime_history.items(), key=lambda entry: (entry[1], entry[0])
        )
        return RemoteFile(
            uri=filename, last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT)
        )

    def _compute_start_time(self) -> datetime:
        """
        Compute the start time of the sync from the history.

        :return: datetime.min when the history is empty; otherwise the earliest timestamp in the history,
            lowered to (now - time window) if the history is full and that moment is earlier
        """
        if not self._file_to_datetime_history:
            return datetime.min

        oldest_timestamp = min(self._file_to_datetime_history.values())
        start = datetime.strptime(oldest_timestamp, self.DATE_TIME_FORMAT)
        if self._is_history_full():
            window_start = datetime.now() - self._time_window_if_history_is_full
            start = min(start, window_start)
        return start
|