airbyte-cdk 0.72.1__py3-none-any.whl → 6.17.1.dev1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- airbyte_cdk/__init__.py +355 -6
- airbyte_cdk/cli/__init__.py +1 -0
- airbyte_cdk/cli/source_declarative_manifest/__init__.py +5 -0
- airbyte_cdk/cli/source_declarative_manifest/_run.py +230 -0
- airbyte_cdk/cli/source_declarative_manifest/spec.json +17 -0
- airbyte_cdk/config_observation.py +29 -10
- airbyte_cdk/connector.py +24 -24
- airbyte_cdk/connector_builder/README.md +53 -0
- airbyte_cdk/connector_builder/connector_builder_handler.py +37 -11
- airbyte_cdk/connector_builder/main.py +45 -13
- airbyte_cdk/connector_builder/message_grouper.py +189 -50
- airbyte_cdk/connector_builder/models.py +3 -2
- airbyte_cdk/destinations/__init__.py +4 -3
- airbyte_cdk/destinations/destination.py +54 -20
- airbyte_cdk/destinations/vector_db_based/README.md +37 -0
- airbyte_cdk/destinations/vector_db_based/config.py +40 -17
- airbyte_cdk/destinations/vector_db_based/document_processor.py +56 -17
- airbyte_cdk/destinations/vector_db_based/embedder.py +57 -15
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +24 -5
- airbyte_cdk/entrypoint.py +153 -44
- airbyte_cdk/exception_handler.py +21 -3
- airbyte_cdk/logger.py +30 -44
- airbyte_cdk/models/__init__.py +13 -2
- airbyte_cdk/models/airbyte_protocol.py +86 -1
- airbyte_cdk/models/airbyte_protocol_serializers.py +44 -0
- airbyte_cdk/models/file_transfer_record_message.py +13 -0
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/__init__.py +5 -1
- airbyte_cdk/sources/abstract_source.py +125 -79
- airbyte_cdk/sources/concurrent_source/__init__.py +7 -2
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +102 -36
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +29 -36
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +94 -10
- airbyte_cdk/sources/concurrent_source/stream_thread_exception.py +25 -0
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +20 -14
- airbyte_cdk/sources/config.py +3 -2
- airbyte_cdk/sources/connector_state_manager.py +49 -83
- airbyte_cdk/sources/declarative/async_job/job.py +52 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +497 -0
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +75 -0
- airbyte_cdk/sources/declarative/async_job/repository.py +35 -0
- airbyte_cdk/sources/declarative/async_job/status.py +24 -0
- airbyte_cdk/sources/declarative/async_job/timer.py +39 -0
- airbyte_cdk/sources/declarative/auth/__init__.py +2 -3
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +191 -0
- airbyte_cdk/sources/declarative/auth/oauth.py +60 -20
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +10 -2
- airbyte_cdk/sources/declarative/auth/token.py +28 -10
- airbyte_cdk/sources/declarative/auth/token_provider.py +9 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +16 -8
- airbyte_cdk/sources/declarative/checks/connection_checker.py +4 -2
- airbyte_cdk/sources/declarative/concurrency_level/__init__.py +7 -0
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +50 -0
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +490 -0
- airbyte_cdk/sources/declarative/datetime/datetime_parser.py +4 -0
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +26 -6
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +1185 -85
- airbyte_cdk/sources/declarative/declarative_source.py +5 -2
- airbyte_cdk/sources/declarative/declarative_stream.py +95 -9
- airbyte_cdk/sources/declarative/decoders/__init__.py +23 -2
- airbyte_cdk/sources/declarative/decoders/composite_raw_decoder.py +97 -0
- airbyte_cdk/sources/declarative/decoders/decoder.py +11 -4
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +92 -5
- airbyte_cdk/sources/declarative/decoders/noop_decoder.py +21 -0
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +39 -0
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +98 -0
- airbyte_cdk/sources/declarative/extractors/__init__.py +12 -1
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +29 -24
- airbyte_cdk/sources/declarative/extractors/http_selector.py +4 -5
- airbyte_cdk/sources/declarative/extractors/record_extractor.py +2 -3
- airbyte_cdk/sources/declarative/extractors/record_filter.py +63 -8
- airbyte_cdk/sources/declarative/extractors/record_selector.py +85 -26
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +177 -0
- airbyte_cdk/sources/declarative/extractors/type_transformer.py +55 -0
- airbyte_cdk/sources/declarative/incremental/__init__.py +31 -3
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +340 -0
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +156 -48
- airbyte_cdk/sources/declarative/incremental/declarative_cursor.py +13 -0
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +350 -0
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +174 -74
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +200 -0
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +122 -0
- airbyte_cdk/sources/declarative/interpolation/filters.py +27 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +23 -5
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +12 -8
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +13 -6
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +21 -6
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +9 -3
- airbyte_cdk/sources/declarative/interpolation/jinja.py +72 -37
- airbyte_cdk/sources/declarative/interpolation/macros.py +72 -17
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +193 -52
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +98 -0
- airbyte_cdk/sources/declarative/migrations/state_migration.py +24 -0
- airbyte_cdk/sources/declarative/models/__init__.py +1 -1
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +1319 -603
- airbyte_cdk/sources/declarative/parsers/custom_exceptions.py +2 -2
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +26 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +26 -15
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +1759 -225
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +24 -4
- airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py +65 -0
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +176 -0
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +39 -9
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +62 -0
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +15 -3
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +222 -39
- airbyte_cdk/sources/declarative/requesters/error_handlers/__init__.py +19 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +19 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/exponential_backoff_strategy.py +19 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +4 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +41 -9
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +29 -14
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategy.py +5 -13
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +32 -16
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +46 -56
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +40 -0
- airbyte_cdk/sources/declarative/requesters/error_handlers/error_handler.py +6 -32
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +119 -41
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +228 -0
- airbyte_cdk/sources/declarative/requesters/http_requester.py +98 -344
- airbyte_cdk/sources/declarative/requesters/paginators/__init__.py +14 -3
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +105 -46
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +14 -8
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +19 -8
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/__init__.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +53 -21
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +42 -19
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +25 -12
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +13 -10
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +26 -13
- airbyte_cdk/sources/declarative/requesters/request_options/__init__.py +15 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +91 -0
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +60 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +31 -14
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +27 -15
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +63 -10
- airbyte_cdk/sources/declarative/requesters/request_options/request_options_provider.py +1 -1
- airbyte_cdk/sources/declarative/requesters/requester.py +9 -17
- airbyte_cdk/sources/declarative/resolvers/__init__.py +41 -0
- airbyte_cdk/sources/declarative/resolvers/components_resolver.py +55 -0
- airbyte_cdk/sources/declarative/resolvers/config_components_resolver.py +136 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +112 -0
- airbyte_cdk/sources/declarative/retrievers/__init__.py +6 -2
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +100 -0
- airbyte_cdk/sources/declarative/retrievers/retriever.py +1 -3
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +229 -73
- airbyte_cdk/sources/declarative/schema/__init__.py +14 -1
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +5 -3
- airbyte_cdk/sources/declarative/schema/dynamic_schema_loader.py +236 -0
- airbyte_cdk/sources/declarative/schema/json_file_schema_loader.py +8 -8
- airbyte_cdk/sources/declarative/spec/spec.py +12 -5
- airbyte_cdk/sources/declarative/stream_slicers/__init__.py +1 -2
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +88 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +9 -14
- airbyte_cdk/sources/declarative/transformations/add_fields.py +19 -11
- airbyte_cdk/sources/declarative/transformations/flatten_fields.py +52 -0
- airbyte_cdk/sources/declarative/transformations/keys_replace_transformation.py +61 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_lower_transformation.py +22 -0
- airbyte_cdk/sources/declarative/transformations/keys_to_snake_transformation.py +68 -0
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +13 -10
- airbyte_cdk/sources/declarative/transformations/transformation.py +5 -5
- airbyte_cdk/sources/declarative/types.py +19 -110
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +31 -10
- airbyte_cdk/sources/embedded/base_integration.py +16 -5
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +5 -2
- airbyte_cdk/sources/file_based/README.md +152 -0
- airbyte_cdk/sources/file_based/__init__.py +24 -0
- airbyte_cdk/sources/file_based/availability_strategy/__init__.py +9 -2
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +22 -6
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +46 -10
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +47 -10
- airbyte_cdk/sources/file_based/config/avro_format.py +2 -1
- airbyte_cdk/sources/file_based/config/csv_format.py +29 -10
- airbyte_cdk/sources/file_based/config/excel_format.py +18 -0
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +16 -4
- airbyte_cdk/sources/file_based/config/jsonl_format.py +2 -1
- airbyte_cdk/sources/file_based/config/parquet_format.py +2 -1
- airbyte_cdk/sources/file_based/config/unstructured_format.py +13 -5
- airbyte_cdk/sources/file_based/discovery_policy/__init__.py +6 -2
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +18 -15
- airbyte_cdk/sources/file_based/file_based_source.py +140 -33
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +69 -5
- airbyte_cdk/sources/file_based/file_types/__init__.py +14 -1
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +75 -24
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +116 -34
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +196 -0
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +37 -0
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +24 -8
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +60 -18
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +141 -41
- airbyte_cdk/sources/file_based/remote_file.py +1 -1
- airbyte_cdk/sources/file_based/schema_helpers.py +38 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/__init__.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +50 -13
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +67 -27
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/__init__.py +5 -1
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +14 -23
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +54 -18
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +21 -9
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +27 -10
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +147 -45
- airbyte_cdk/sources/http_logger.py +8 -3
- airbyte_cdk/sources/message/__init__.py +7 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +42 -38
- airbyte_cdk/sources/streams/__init__.py +2 -2
- airbyte_cdk/sources/streams/availability_strategy.py +54 -3
- airbyte_cdk/sources/streams/call_rate.py +64 -21
- airbyte_cdk/sources/streams/checkpoint/__init__.py +26 -0
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +335 -0
- airbyte_cdk/sources/{declarative/incremental → streams/checkpoint}/cursor.py +17 -14
- airbyte_cdk/sources/streams/checkpoint/per_partition_key_serializer.py +22 -0
- airbyte_cdk/sources/streams/checkpoint/resumable_full_refresh_cursor.py +51 -0
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +110 -0
- airbyte_cdk/sources/streams/concurrent/README.md +7 -0
- airbyte_cdk/sources/streams/concurrent/abstract_stream.py +7 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +84 -75
- airbyte_cdk/sources/streams/concurrent/availability_strategy.py +30 -2
- airbyte_cdk/sources/streams/concurrent/cursor.py +313 -48
- airbyte_cdk/sources/streams/concurrent/default_stream.py +12 -3
- airbyte_cdk/sources/streams/concurrent/exceptions.py +3 -0
- airbyte_cdk/sources/streams/concurrent/helpers.py +14 -3
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +12 -3
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +10 -3
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +1 -16
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +15 -5
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +109 -17
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +90 -72
- airbyte_cdk/sources/streams/core.py +412 -87
- airbyte_cdk/sources/streams/http/__init__.py +2 -1
- airbyte_cdk/sources/streams/http/availability_strategy.py +12 -101
- airbyte_cdk/sources/streams/http/error_handlers/__init__.py +22 -0
- airbyte_cdk/sources/streams/http/error_handlers/backoff_strategy.py +28 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_backoff_strategy.py +17 -0
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +86 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +42 -0
- airbyte_cdk/sources/streams/http/error_handlers/error_message_parser.py +19 -0
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +110 -0
- airbyte_cdk/sources/streams/http/error_handlers/json_error_message_parser.py +52 -0
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +65 -0
- airbyte_cdk/sources/streams/http/exceptions.py +27 -7
- airbyte_cdk/sources/streams/http/http.py +369 -246
- airbyte_cdk/sources/streams/http/http_client.py +531 -0
- airbyte_cdk/sources/streams/http/rate_limiting.py +76 -12
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +28 -9
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_token.py +2 -1
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +90 -35
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +154 -0
- airbyte_cdk/sources/utils/record_helper.py +36 -21
- airbyte_cdk/sources/utils/schema_helpers.py +13 -6
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +54 -20
- airbyte_cdk/sql/_util/hashing.py +34 -0
- airbyte_cdk/sql/_util/name_normalizers.py +92 -0
- airbyte_cdk/sql/constants.py +32 -0
- airbyte_cdk/sql/exceptions.py +235 -0
- airbyte_cdk/sql/secrets.py +123 -0
- airbyte_cdk/sql/shared/__init__.py +15 -0
- airbyte_cdk/sql/shared/catalog_providers.py +145 -0
- airbyte_cdk/sql/shared/sql_processor.py +786 -0
- airbyte_cdk/sql/types.py +160 -0
- airbyte_cdk/test/catalog_builder.py +70 -18
- airbyte_cdk/test/entrypoint_wrapper.py +117 -42
- airbyte_cdk/test/mock_http/__init__.py +1 -1
- airbyte_cdk/test/mock_http/matcher.py +6 -0
- airbyte_cdk/test/mock_http/mocker.py +57 -10
- airbyte_cdk/test/mock_http/request.py +19 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +32 -16
- airbyte_cdk/test/state_builder.py +18 -10
- airbyte_cdk/test/utils/__init__.py +1 -0
- airbyte_cdk/test/utils/data.py +24 -0
- airbyte_cdk/test/utils/http_mocking.py +16 -0
- airbyte_cdk/test/utils/manifest_only_fixtures.py +60 -0
- airbyte_cdk/test/utils/reading.py +26 -0
- airbyte_cdk/utils/__init__.py +2 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +5 -3
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/event_timing.py +10 -10
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +20 -11
- airbyte_cdk/utils/print_buffer.py +75 -0
- airbyte_cdk/utils/schema_inferrer.py +198 -28
- airbyte_cdk/utils/slice_hasher.py +30 -0
- airbyte_cdk/utils/spec_schema_transformations.py +6 -3
- airbyte_cdk/utils/stream_status_utils.py +8 -1
- airbyte_cdk/utils/traced_exception.py +61 -21
- airbyte_cdk-6.17.1.dev1.dist-info/METADATA +109 -0
- airbyte_cdk-6.17.1.dev1.dist-info/RECORD +350 -0
- {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.17.1.dev1.dist-info}/WHEEL +1 -2
- airbyte_cdk-6.17.1.dev1.dist-info/entry_points.txt +3 -0
- airbyte_cdk/sources/declarative/create_partial.py +0 -92
- airbyte_cdk/sources/declarative/parsers/class_types_registry.py +0 -102
- airbyte_cdk/sources/declarative/parsers/default_implementation_registry.py +0 -64
- airbyte_cdk/sources/declarative/requesters/error_handlers/response_action.py +0 -16
- airbyte_cdk/sources/declarative/requesters/error_handlers/response_status.py +0 -68
- airbyte_cdk/sources/declarative/stream_slicers/cartesian_product_stream_slicer.py +0 -114
- airbyte_cdk/sources/deprecated/base_source.py +0 -94
- airbyte_cdk/sources/deprecated/client.py +0 -99
- airbyte_cdk/sources/singer/__init__.py +0 -8
- airbyte_cdk/sources/singer/singer_helpers.py +0 -304
- airbyte_cdk/sources/singer/source.py +0 -186
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +0 -23
- airbyte_cdk/sources/streams/http/auth/__init__.py +0 -17
- airbyte_cdk/sources/streams/http/auth/core.py +0 -29
- airbyte_cdk/sources/streams/http/auth/oauth.py +0 -113
- airbyte_cdk/sources/streams/http/auth/token.py +0 -47
- airbyte_cdk/sources/streams/utils/stream_helper.py +0 -40
- airbyte_cdk/sources/utils/catalog_helpers.py +0 -22
- airbyte_cdk/sources/utils/schema_models.py +0 -84
- airbyte_cdk-0.72.1.dist-info/METADATA +0 -243
- airbyte_cdk-0.72.1.dist-info/RECORD +0 -466
- airbyte_cdk-0.72.1.dist-info/top_level.txt +0 -3
- source_declarative_manifest/main.py +0 -29
- unit_tests/connector_builder/__init__.py +0 -3
- unit_tests/connector_builder/test_connector_builder_handler.py +0 -871
- unit_tests/connector_builder/test_message_grouper.py +0 -713
- unit_tests/connector_builder/utils.py +0 -27
- unit_tests/destinations/test_destination.py +0 -243
- unit_tests/singer/test_singer_helpers.py +0 -56
- unit_tests/singer/test_singer_source.py +0 -112
- unit_tests/sources/__init__.py +0 -0
- unit_tests/sources/concurrent_source/__init__.py +0 -3
- unit_tests/sources/concurrent_source/test_concurrent_source_adapter.py +0 -106
- unit_tests/sources/declarative/__init__.py +0 -3
- unit_tests/sources/declarative/auth/__init__.py +0 -3
- unit_tests/sources/declarative/auth/test_oauth.py +0 -331
- unit_tests/sources/declarative/auth/test_selective_authenticator.py +0 -39
- unit_tests/sources/declarative/auth/test_session_token_auth.py +0 -182
- unit_tests/sources/declarative/auth/test_token_auth.py +0 -200
- unit_tests/sources/declarative/auth/test_token_provider.py +0 -73
- unit_tests/sources/declarative/checks/__init__.py +0 -3
- unit_tests/sources/declarative/checks/test_check_stream.py +0 -146
- unit_tests/sources/declarative/decoders/__init__.py +0 -0
- unit_tests/sources/declarative/decoders/test_json_decoder.py +0 -16
- unit_tests/sources/declarative/external_component.py +0 -13
- unit_tests/sources/declarative/extractors/__init__.py +0 -3
- unit_tests/sources/declarative/extractors/test_dpath_extractor.py +0 -55
- unit_tests/sources/declarative/extractors/test_record_filter.py +0 -55
- unit_tests/sources/declarative/extractors/test_record_selector.py +0 -179
- unit_tests/sources/declarative/incremental/__init__.py +0 -0
- unit_tests/sources/declarative/incremental/test_datetime_based_cursor.py +0 -860
- unit_tests/sources/declarative/incremental/test_per_partition_cursor.py +0 -406
- unit_tests/sources/declarative/incremental/test_per_partition_cursor_integration.py +0 -332
- unit_tests/sources/declarative/interpolation/__init__.py +0 -3
- unit_tests/sources/declarative/interpolation/test_filters.py +0 -80
- unit_tests/sources/declarative/interpolation/test_interpolated_boolean.py +0 -40
- unit_tests/sources/declarative/interpolation/test_interpolated_mapping.py +0 -35
- unit_tests/sources/declarative/interpolation/test_interpolated_nested_mapping.py +0 -45
- unit_tests/sources/declarative/interpolation/test_interpolated_string.py +0 -25
- unit_tests/sources/declarative/interpolation/test_jinja.py +0 -240
- unit_tests/sources/declarative/interpolation/test_macros.py +0 -73
- unit_tests/sources/declarative/parsers/__init__.py +0 -3
- unit_tests/sources/declarative/parsers/test_manifest_component_transformer.py +0 -406
- unit_tests/sources/declarative/parsers/test_manifest_reference_resolver.py +0 -139
- unit_tests/sources/declarative/parsers/test_model_to_component_factory.py +0 -1847
- unit_tests/sources/declarative/parsers/testing_components.py +0 -36
- unit_tests/sources/declarative/partition_routers/__init__.py +0 -3
- unit_tests/sources/declarative/partition_routers/test_list_partition_router.py +0 -155
- unit_tests/sources/declarative/partition_routers/test_single_partition_router.py +0 -14
- unit_tests/sources/declarative/partition_routers/test_substream_partition_router.py +0 -404
- unit_tests/sources/declarative/requesters/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/error_handlers/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_constant_backoff.py +0 -34
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_exponential_backoff.py +0 -36
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_header_helper.py +0 -38
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_time_from_header.py +0 -35
- unit_tests/sources/declarative/requesters/error_handlers/backoff_strategies/test_wait_until_time_from_header.py +0 -64
- unit_tests/sources/declarative/requesters/error_handlers/test_composite_error_handler.py +0 -213
- unit_tests/sources/declarative/requesters/error_handlers/test_default_error_handler.py +0 -178
- unit_tests/sources/declarative/requesters/error_handlers/test_http_response_filter.py +0 -121
- unit_tests/sources/declarative/requesters/error_handlers/test_response_status.py +0 -44
- unit_tests/sources/declarative/requesters/paginators/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/paginators/test_cursor_pagination_strategy.py +0 -64
- unit_tests/sources/declarative/requesters/paginators/test_default_paginator.py +0 -313
- unit_tests/sources/declarative/requesters/paginators/test_no_paginator.py +0 -12
- unit_tests/sources/declarative/requesters/paginators/test_offset_increment.py +0 -58
- unit_tests/sources/declarative/requesters/paginators/test_page_increment.py +0 -70
- unit_tests/sources/declarative/requesters/paginators/test_request_option.py +0 -43
- unit_tests/sources/declarative/requesters/paginators/test_stop_condition.py +0 -105
- unit_tests/sources/declarative/requesters/request_options/__init__.py +0 -3
- unit_tests/sources/declarative/requesters/request_options/test_interpolated_request_options_provider.py +0 -101
- unit_tests/sources/declarative/requesters/test_http_requester.py +0 -974
- unit_tests/sources/declarative/requesters/test_interpolated_request_input_provider.py +0 -32
- unit_tests/sources/declarative/retrievers/__init__.py +0 -3
- unit_tests/sources/declarative/retrievers/test_simple_retriever.py +0 -542
- unit_tests/sources/declarative/schema/__init__.py +0 -6
- unit_tests/sources/declarative/schema/source_test/SourceTest.py +0 -8
- unit_tests/sources/declarative/schema/source_test/__init__.py +0 -3
- unit_tests/sources/declarative/schema/test_default_schema_loader.py +0 -32
- unit_tests/sources/declarative/schema/test_inline_schema_loader.py +0 -19
- unit_tests/sources/declarative/schema/test_json_file_schema_loader.py +0 -26
- unit_tests/sources/declarative/states/__init__.py +0 -3
- unit_tests/sources/declarative/stream_slicers/__init__.py +0 -3
- unit_tests/sources/declarative/stream_slicers/test_cartesian_product_stream_slicer.py +0 -225
- unit_tests/sources/declarative/test_create_partial.py +0 -83
- unit_tests/sources/declarative/test_declarative_stream.py +0 -103
- unit_tests/sources/declarative/test_manifest_declarative_source.py +0 -1260
- unit_tests/sources/declarative/test_types.py +0 -39
- unit_tests/sources/declarative/test_yaml_declarative_source.py +0 -148
- unit_tests/sources/file_based/__init__.py +0 -0
- unit_tests/sources/file_based/availability_strategy/__init__.py +0 -0
- unit_tests/sources/file_based/availability_strategy/test_default_file_based_availability_strategy.py +0 -100
- unit_tests/sources/file_based/config/__init__.py +0 -0
- unit_tests/sources/file_based/config/test_abstract_file_based_spec.py +0 -28
- unit_tests/sources/file_based/config/test_csv_format.py +0 -34
- unit_tests/sources/file_based/config/test_file_based_stream_config.py +0 -84
- unit_tests/sources/file_based/discovery_policy/__init__.py +0 -0
- unit_tests/sources/file_based/discovery_policy/test_default_discovery_policy.py +0 -31
- unit_tests/sources/file_based/file_types/__init__.py +0 -0
- unit_tests/sources/file_based/file_types/test_avro_parser.py +0 -243
- unit_tests/sources/file_based/file_types/test_csv_parser.py +0 -546
- unit_tests/sources/file_based/file_types/test_jsonl_parser.py +0 -158
- unit_tests/sources/file_based/file_types/test_parquet_parser.py +0 -274
- unit_tests/sources/file_based/file_types/test_unstructured_parser.py +0 -593
- unit_tests/sources/file_based/helpers.py +0 -70
- unit_tests/sources/file_based/in_memory_files_source.py +0 -211
- unit_tests/sources/file_based/scenarios/__init__.py +0 -0
- unit_tests/sources/file_based/scenarios/avro_scenarios.py +0 -744
- unit_tests/sources/file_based/scenarios/check_scenarios.py +0 -220
- unit_tests/sources/file_based/scenarios/concurrent_incremental_scenarios.py +0 -2844
- unit_tests/sources/file_based/scenarios/csv_scenarios.py +0 -3105
- unit_tests/sources/file_based/scenarios/file_based_source_builder.py +0 -91
- unit_tests/sources/file_based/scenarios/incremental_scenarios.py +0 -1926
- unit_tests/sources/file_based/scenarios/jsonl_scenarios.py +0 -930
- unit_tests/sources/file_based/scenarios/parquet_scenarios.py +0 -754
- unit_tests/sources/file_based/scenarios/scenario_builder.py +0 -234
- unit_tests/sources/file_based/scenarios/unstructured_scenarios.py +0 -608
- unit_tests/sources/file_based/scenarios/user_input_schema_scenarios.py +0 -746
- unit_tests/sources/file_based/scenarios/validation_policy_scenarios.py +0 -726
- unit_tests/sources/file_based/stream/__init__.py +0 -0
- unit_tests/sources/file_based/stream/concurrent/__init__.py +0 -0
- unit_tests/sources/file_based/stream/concurrent/test_adapters.py +0 -362
- unit_tests/sources/file_based/stream/concurrent/test_file_based_concurrent_cursor.py +0 -458
- unit_tests/sources/file_based/stream/test_default_file_based_cursor.py +0 -310
- unit_tests/sources/file_based/stream/test_default_file_based_stream.py +0 -244
- unit_tests/sources/file_based/test_file_based_scenarios.py +0 -320
- unit_tests/sources/file_based/test_file_based_stream_reader.py +0 -272
- unit_tests/sources/file_based/test_scenarios.py +0 -253
- unit_tests/sources/file_based/test_schema_helpers.py +0 -346
- unit_tests/sources/fixtures/__init__.py +0 -3
- unit_tests/sources/fixtures/source_test_fixture.py +0 -153
- unit_tests/sources/message/__init__.py +0 -0
- unit_tests/sources/message/test_repository.py +0 -153
- unit_tests/sources/streams/__init__.py +0 -0
- unit_tests/sources/streams/concurrent/__init__.py +0 -3
- unit_tests/sources/streams/concurrent/scenarios/__init__.py +0 -3
- unit_tests/sources/streams/concurrent/scenarios/incremental_scenarios.py +0 -250
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_builder.py +0 -140
- unit_tests/sources/streams/concurrent/scenarios/stream_facade_scenarios.py +0 -452
- unit_tests/sources/streams/concurrent/scenarios/test_concurrent_scenarios.py +0 -76
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_scenarios.py +0 -418
- unit_tests/sources/streams/concurrent/scenarios/thread_based_concurrent_stream_source_builder.py +0 -142
- unit_tests/sources/streams/concurrent/scenarios/utils.py +0 -55
- unit_tests/sources/streams/concurrent/test_adapters.py +0 -380
- unit_tests/sources/streams/concurrent/test_concurrent_read_processor.py +0 -684
- unit_tests/sources/streams/concurrent/test_cursor.py +0 -139
- unit_tests/sources/streams/concurrent/test_datetime_state_converter.py +0 -369
- unit_tests/sources/streams/concurrent/test_default_stream.py +0 -197
- unit_tests/sources/streams/concurrent/test_partition_enqueuer.py +0 -90
- unit_tests/sources/streams/concurrent/test_partition_reader.py +0 -67
- unit_tests/sources/streams/concurrent/test_thread_pool_manager.py +0 -106
- unit_tests/sources/streams/http/__init__.py +0 -0
- unit_tests/sources/streams/http/auth/__init__.py +0 -0
- unit_tests/sources/streams/http/auth/test_auth.py +0 -173
- unit_tests/sources/streams/http/requests_native_auth/__init__.py +0 -0
- unit_tests/sources/streams/http/requests_native_auth/test_requests_native_auth.py +0 -423
- unit_tests/sources/streams/http/test_availability_strategy.py +0 -180
- unit_tests/sources/streams/http/test_http.py +0 -635
- unit_tests/sources/streams/test_availability_strategy.py +0 -70
- unit_tests/sources/streams/test_call_rate.py +0 -300
- unit_tests/sources/streams/test_stream_read.py +0 -405
- unit_tests/sources/streams/test_streams_core.py +0 -184
- unit_tests/sources/test_abstract_source.py +0 -1442
- unit_tests/sources/test_concurrent_source.py +0 -112
- unit_tests/sources/test_config.py +0 -92
- unit_tests/sources/test_connector_state_manager.py +0 -482
- unit_tests/sources/test_http_logger.py +0 -252
- unit_tests/sources/test_integration_source.py +0 -86
- unit_tests/sources/test_source.py +0 -684
- unit_tests/sources/test_source_read.py +0 -460
- unit_tests/test/__init__.py +0 -0
- unit_tests/test/mock_http/__init__.py +0 -0
- unit_tests/test/mock_http/test_matcher.py +0 -53
- unit_tests/test/mock_http/test_mocker.py +0 -214
- unit_tests/test/mock_http/test_request.py +0 -117
- unit_tests/test/mock_http/test_response_builder.py +0 -177
- unit_tests/test/test_entrypoint_wrapper.py +0 -240
- unit_tests/utils/__init__.py +0 -0
- unit_tests/utils/test_datetime_format_inferrer.py +0 -60
- unit_tests/utils/test_mapping_helpers.py +0 -54
- unit_tests/utils/test_message_utils.py +0 -91
- unit_tests/utils/test_rate_limiting.py +0 -26
- unit_tests/utils/test_schema_inferrer.py +0 -202
- unit_tests/utils/test_secret_utils.py +0 -135
- unit_tests/utils/test_stream_status_utils.py +0 -61
- unit_tests/utils/test_traced_exception.py +0 -107
- airbyte_cdk/sources/{deprecated → declarative/async_job}/__init__.py +0 -0
- {source_declarative_manifest → airbyte_cdk/sources/declarative/migrations}/__init__.py +0 -0
- {unit_tests/destinations → airbyte_cdk/sql}/__init__.py +0 -0
- {unit_tests/singer → airbyte_cdk/sql/_util}/__init__.py +0 -0
- {airbyte_cdk-0.72.1.dist-info → airbyte_cdk-6.17.1.dev1.dist-info}/LICENSE.txt +0 -0
@@ -0,0 +1,68 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
import re
|
6
|
+
from dataclasses import dataclass
|
7
|
+
from typing import Any, Dict, List, Optional
|
8
|
+
|
9
|
+
import unidecode
|
10
|
+
|
11
|
+
from airbyte_cdk.sources.declarative.transformations import RecordTransformation
|
12
|
+
from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
|
13
|
+
|
14
|
+
|
15
|
+
@dataclass
class KeysToSnakeCaseTransformation(RecordTransformation):
    """
    Record transformation that rewrites every key of a record to snake_case.

    Keys of nested dictionaries are rewritten as well; values that are not dictionaries
    (including lists) are kept untouched. For example, the key ``fooBar`` becomes ``foo_bar``.
    """

    # Splits a key into tokens: an uppercase run optionally followed by lowercase letters,
    # a lowercase run, or a digit run. Any run of other characters is captured by the named
    # group "NoToken" so that it can be recognised as a separator rather than a word.
    token_pattern: re.Pattern[str] = re.compile(
        r"[A-Z]+[a-z]*|[a-z]+|\d+|(?P<NoToken>[^a-zA-Z\d]+)"
    )

    def transform(
        self,
        record: Dict[str, Any],
        config: Optional[Config] = None,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
    ) -> None:
        """
        Replace the content of `record` in place with its snake_case-keyed equivalent.

        :param record: The record to be transformed; it is emptied and refilled, nothing is returned
        :param config: Not used by this transformation
        :param stream_state: Not used by this transformation
        :param stream_slice: Not used by this transformation
        """
        snake_cased = self._transform_record(record)
        # Mutate the caller's dictionary instead of rebinding it so the change is visible to the caller.
        record.clear()
        record.update(snake_cased)

    def _transform_record(self, record: Dict[str, Any]) -> Dict[str, Any]:
        """
        Build a new dictionary whose keys are the snake_case form of the keys of `record`.

        :param record: The dictionary to convert; it is not modified
        :return: a new dictionary, with nested dictionaries converted recursively
        """
        return {
            self.process_key(name): (
                self._transform_record(content) if isinstance(content, dict) else content
            )
            for name, content in record.items()
        }

    def process_key(self, key: str) -> str:
        """
        Run the full pipeline on a single key: normalize, tokenize, filter, then join.

        :param key: The original key
        :return: the snake_case version of the key
        """
        normalized = self.normalize_key(key)
        raw_tokens = self.tokenize_key(normalized)
        kept_tokens = self.filter_tokens(raw_tokens)
        return self.tokens_to_snake_case(kept_tokens)

    def normalize_key(self, key: str) -> str:
        """
        :param key: The original key
        :return: the key as returned by `unidecode.unidecode`
        """
        return unidecode.unidecode(key)

    def tokenize_key(self, key: str) -> List[str]:
        """
        Split `key` into tokens using `token_pattern`.

        :param key: The normalized key
        :return: the matched tokens in order; each separator run (the "NoToken" group) is represented by an empty string
        """
        return [
            "" if found.group("NoToken") is not None else found.group(0)
            for found in self.token_pattern.finditer(key)
        ]

    def filter_tokens(self, tokens: List[str]) -> List[str]:
        """
        Drop the empty tokens located strictly between the first and the last token.

        The first and last tokens are always kept, even when empty, so a key that starts or ends
        with a separator keeps an underscore at that position once joined. When the first token
        is made of digits, an empty token is inserted in front of it.

        :param tokens: The tokens produced by `tokenize_key`
        :return: the filtered list of tokens
        """
        if len(tokens) >= 3:
            first, *middle, last = tokens
            tokens = [first, *(token for token in middle if token), last]
        if tokens and tokens[0].isdigit():
            tokens.insert(0, "")
        return tokens

    def tokens_to_snake_case(self, tokens: List[str]) -> str:
        """
        :param tokens: The filtered tokens
        :return: the lowercased tokens joined with underscores
        """
        return "_".join(map(str.lower, tokens))
|
@@ -3,13 +3,14 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
from dataclasses import InitVar, dataclass
|
6
|
-
from typing import Any, List, Mapping, Optional
|
6
|
+
from typing import Any, Dict, List, Mapping, Optional
|
7
7
|
|
8
|
+
import dpath
|
8
9
|
import dpath.exceptions
|
9
|
-
|
10
|
+
|
10
11
|
from airbyte_cdk.sources.declarative.interpolation.interpolated_boolean import InterpolatedBoolean
|
11
12
|
from airbyte_cdk.sources.declarative.transformations import RecordTransformation
|
12
|
-
from airbyte_cdk.sources.
|
13
|
+
from airbyte_cdk.sources.types import Config, FieldPointer, StreamSlice, StreamState
|
13
14
|
|
14
15
|
|
15
16
|
@dataclass
|
@@ -44,15 +45,17 @@ class RemoveFields(RecordTransformation):
|
|
44
45
|
condition: str = ""
|
45
46
|
|
46
47
|
def __post_init__(self, parameters: Mapping[str, Any]) -> None:
|
47
|
-
self._filter_interpolator = InterpolatedBoolean(
|
48
|
+
self._filter_interpolator = InterpolatedBoolean(
|
49
|
+
condition=self.condition, parameters=parameters
|
50
|
+
)
|
48
51
|
|
49
52
|
def transform(
|
50
53
|
self,
|
51
|
-
record:
|
54
|
+
record: Dict[str, Any],
|
52
55
|
config: Optional[Config] = None,
|
53
56
|
stream_state: Optional[StreamState] = None,
|
54
57
|
stream_slice: Optional[StreamSlice] = None,
|
55
|
-
) ->
|
58
|
+
) -> None:
|
56
59
|
"""
|
57
60
|
:param record: The record to be transformed
|
58
61
|
:return: the input record with the requested fields removed
|
@@ -60,13 +63,13 @@ class RemoveFields(RecordTransformation):
|
|
60
63
|
for pointer in self.field_pointers:
|
61
64
|
# the dpath library by default doesn't delete fields from arrays
|
62
65
|
try:
|
63
|
-
dpath.
|
66
|
+
dpath.delete(
|
64
67
|
record,
|
65
68
|
pointer,
|
66
|
-
afilter=(lambda x: self._filter_interpolator.eval(config or {}, property=x))
|
69
|
+
afilter=(lambda x: self._filter_interpolator.eval(config or {}, property=x))
|
70
|
+
if self.condition
|
71
|
+
else None,
|
67
72
|
)
|
68
73
|
except dpath.exceptions.PathNotFound:
|
69
74
|
# if the (potentially nested) property does not exist, silently skip
|
70
75
|
pass
|
71
|
-
|
72
|
-
return record
|
@@ -4,9 +4,9 @@
|
|
4
4
|
|
5
5
|
from abc import abstractmethod
|
6
6
|
from dataclasses import dataclass
|
7
|
-
from typing import Any,
|
7
|
+
from typing import Any, Dict, Optional
|
8
8
|
|
9
|
-
from airbyte_cdk.sources.
|
9
|
+
from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
|
10
10
|
|
11
11
|
|
12
12
|
@dataclass
|
@@ -18,13 +18,13 @@ class RecordTransformation:
|
|
18
18
|
@abstractmethod
|
19
19
|
def transform(
|
20
20
|
self,
|
21
|
-
record:
|
21
|
+
record: Dict[str, Any],
|
22
22
|
config: Optional[Config] = None,
|
23
23
|
stream_state: Optional[StreamState] = None,
|
24
24
|
stream_slice: Optional[StreamSlice] = None,
|
25
|
-
) ->
|
25
|
+
) -> None:
|
26
26
|
"""
|
27
|
-
Transform a record by adding, deleting, or mutating fields.
|
27
|
+
Transform a record by adding, deleting, or mutating fields directly from the record reference passed in argument.
|
28
28
|
|
29
29
|
:param record: The input record to be transformed
|
30
30
|
:param config: The user-provided configuration as specified by the source's spec
|
@@ -4,113 +4,22 @@
|
|
4
4
|
|
5
5
|
from __future__ import annotations
|
6
6
|
|
7
|
-
from
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
@property
|
27
|
-
def associated_slice(self) -> Optional[StreamSlice]:
|
28
|
-
return self._associated_slice
|
29
|
-
|
30
|
-
def __repr__(self) -> str:
|
31
|
-
return repr(self._data)
|
32
|
-
|
33
|
-
def __getitem__(self, key: str) -> Any:
|
34
|
-
return self._data[key]
|
35
|
-
|
36
|
-
def __len__(self) -> int:
|
37
|
-
return len(self._data)
|
38
|
-
|
39
|
-
def __iter__(self) -> Any:
|
40
|
-
return iter(self._data)
|
41
|
-
|
42
|
-
def __contains__(self, item: object) -> bool:
|
43
|
-
return item in self._data
|
44
|
-
|
45
|
-
def __eq__(self, other: object) -> bool:
|
46
|
-
if isinstance(other, Record):
|
47
|
-
# noinspection PyProtectedMember
|
48
|
-
return self._data == other._data
|
49
|
-
return False
|
50
|
-
|
51
|
-
def __ne__(self, other: object) -> bool:
|
52
|
-
return not self.__eq__(other)
|
53
|
-
|
54
|
-
|
55
|
-
class StreamSlice(Mapping[str, Any]):
|
56
|
-
def __init__(self, *, partition: Mapping[str, Any], cursor_slice: Mapping[str, Any]) -> None:
|
57
|
-
self._partition = partition
|
58
|
-
self._cursor_slice = cursor_slice
|
59
|
-
if partition.keys() & cursor_slice.keys():
|
60
|
-
raise ValueError("Keys for partition and incremental sync cursor should not overlap")
|
61
|
-
self._stream_slice = dict(partition) | dict(cursor_slice)
|
62
|
-
|
63
|
-
@property
|
64
|
-
def partition(self) -> Mapping[str, Any]:
|
65
|
-
p = self._partition
|
66
|
-
while isinstance(p, StreamSlice):
|
67
|
-
p = p.partition
|
68
|
-
return p
|
69
|
-
|
70
|
-
@property
|
71
|
-
def cursor_slice(self) -> Mapping[str, Any]:
|
72
|
-
c = self._cursor_slice
|
73
|
-
while isinstance(c, StreamSlice):
|
74
|
-
c = c.cursor_slice
|
75
|
-
return c
|
76
|
-
|
77
|
-
def __repr__(self) -> str:
|
78
|
-
return repr(self._stream_slice)
|
79
|
-
|
80
|
-
def __setitem__(self, key: str, value: Any) -> None:
|
81
|
-
raise ValueError("StreamSlice is immutable")
|
82
|
-
|
83
|
-
def __getitem__(self, key: str) -> Any:
|
84
|
-
return self._stream_slice[key]
|
85
|
-
|
86
|
-
def __len__(self) -> int:
|
87
|
-
return len(self._stream_slice)
|
88
|
-
|
89
|
-
def __iter__(self) -> Iterator[str]:
|
90
|
-
return iter(self._stream_slice)
|
91
|
-
|
92
|
-
def __contains__(self, item: Any) -> bool:
|
93
|
-
return item in self._stream_slice
|
94
|
-
|
95
|
-
def keys(self) -> KeysView[str]:
|
96
|
-
return self._stream_slice.keys()
|
97
|
-
|
98
|
-
def items(self) -> ItemsView[str, Any]:
|
99
|
-
return self._stream_slice.items()
|
100
|
-
|
101
|
-
def values(self) -> ValuesView[Any]:
|
102
|
-
return self._stream_slice.values()
|
103
|
-
|
104
|
-
def get(self, key: str, default: Any = None) -> Optional[Any]:
|
105
|
-
return self._stream_slice.get(key, default)
|
106
|
-
|
107
|
-
def __eq__(self, other: Any) -> bool:
|
108
|
-
if isinstance(other, dict):
|
109
|
-
return self._stream_slice == other
|
110
|
-
if isinstance(other, StreamSlice):
|
111
|
-
# noinspection PyProtectedMember
|
112
|
-
return self._partition == other._partition and self._cursor_slice == other._cursor_slice
|
113
|
-
return False
|
114
|
-
|
115
|
-
def __ne__(self, other: Any) -> bool:
|
116
|
-
return not self.__eq__(other)
|
7
|
+
from airbyte_cdk.sources.types import (
|
8
|
+
Config,
|
9
|
+
ConnectionDefinition,
|
10
|
+
FieldPointer,
|
11
|
+
Record,
|
12
|
+
StreamSlice,
|
13
|
+
StreamState,
|
14
|
+
)
|
15
|
+
|
16
|
+
# Note: This package originally contained class definitions for low-code CDK types, but we promoted them into the Python CDK.
|
17
|
+
# We've migrated connectors in the repository to reference the new location, but these assignments are used to retain backwards
|
18
|
+
# compatibility for sources created by OSS customers or on forks. This can be removed when we start bumping major versions.
|
19
|
+
|
20
|
+
FieldPointer = FieldPointer
|
21
|
+
Config = Config
|
22
|
+
ConnectionDefinition = ConnectionDefinition
|
23
|
+
StreamState = StreamState
|
24
|
+
Record = Record
|
25
|
+
StreamSlice = StreamSlice
|
@@ -3,31 +3,52 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
import pkgutil
|
6
|
+
from typing import Any, List, Mapping, Optional
|
6
7
|
|
7
8
|
import yaml
|
8
|
-
from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
|
9
|
-
from airbyte_cdk.sources.declarative.types import ConnectionDefinition
|
10
9
|
|
10
|
+
from airbyte_cdk.models import AirbyteStateMessage, ConfiguredAirbyteCatalog
|
11
|
+
from airbyte_cdk.sources.declarative.concurrent_declarative_source import (
|
12
|
+
ConcurrentDeclarativeSource,
|
13
|
+
)
|
14
|
+
from airbyte_cdk.sources.types import ConnectionDefinition
|
11
15
|
|
12
|
-
|
16
|
+
|
17
|
+
class YamlDeclarativeSource(ConcurrentDeclarativeSource[List[AirbyteStateMessage]]):
|
13
18
|
"""Declarative source defined by a yaml file"""
|
14
19
|
|
15
|
-
def __init__(
|
20
|
+
def __init__(
|
21
|
+
self,
|
22
|
+
path_to_yaml: str,
|
23
|
+
debug: bool = False,
|
24
|
+
catalog: Optional[ConfiguredAirbyteCatalog] = None,
|
25
|
+
config: Optional[Mapping[str, Any]] = None,
|
26
|
+
state: Optional[List[AirbyteStateMessage]] = None,
|
27
|
+
) -> None:
|
16
28
|
"""
|
17
29
|
:param path_to_yaml: Path to the yaml file describing the source
|
18
30
|
"""
|
19
31
|
self._path_to_yaml = path_to_yaml
|
20
32
|
source_config = self._read_and_parse_yaml_file(path_to_yaml)
|
21
|
-
super().__init__(source_config, debug)
|
22
33
|
|
23
|
-
|
34
|
+
super().__init__(
|
35
|
+
catalog=catalog or ConfiguredAirbyteCatalog(streams=[]),
|
36
|
+
config=config or {},
|
37
|
+
state=state or [],
|
38
|
+
source_config=source_config,
|
39
|
+
)
|
40
|
+
|
41
|
+
def _read_and_parse_yaml_file(self, path_to_yaml_file: str) -> ConnectionDefinition:
|
24
42
|
package = self.__class__.__module__.split(".")[0]
|
25
43
|
|
26
44
|
yaml_config = pkgutil.get_data(package, path_to_yaml_file)
|
27
|
-
|
28
|
-
|
45
|
+
if yaml_config:
|
46
|
+
decoded_yaml = yaml_config.decode()
|
47
|
+
return self._parse(decoded_yaml)
|
48
|
+
else:
|
49
|
+
return {}
|
29
50
|
|
30
|
-
def _emit_manifest_debug_message(self, extra_args: dict):
|
51
|
+
def _emit_manifest_debug_message(self, extra_args: dict[str, Any]) -> None:
|
31
52
|
extra_args["path_to_yaml"] = self._path_to_yaml
|
32
53
|
self.logger.debug("declarative source created from parsed YAML manifest", extra=extra_args)
|
33
54
|
|
@@ -39,4 +60,4 @@ class YamlDeclarativeSource(ManifestDeclarativeSource):
|
|
39
60
|
:param connection_definition_str: yaml string to parse
|
40
61
|
:return: The ConnectionDefinition parsed from connection_definition_str
|
41
62
|
"""
|
42
|
-
return yaml.safe_load(connection_definition_str)
|
63
|
+
return yaml.safe_load(connection_definition_str) # type: ignore # yaml.safe_load doesn't return a type but know it is a Mapping
|
@@ -6,11 +6,15 @@ from abc import ABC, abstractmethod
|
|
6
6
|
from typing import Generic, Iterable, Optional, TypeVar
|
7
7
|
|
8
8
|
from airbyte_cdk.connector import TConfig
|
9
|
-
from airbyte_cdk.
|
9
|
+
from airbyte_cdk.models import AirbyteRecordMessage, AirbyteStateMessage, SyncMode, Type
|
10
|
+
from airbyte_cdk.sources.embedded.catalog import (
|
11
|
+
create_configured_catalog,
|
12
|
+
get_stream,
|
13
|
+
get_stream_names,
|
14
|
+
)
|
10
15
|
from airbyte_cdk.sources.embedded.runner import SourceRunner
|
11
16
|
from airbyte_cdk.sources.embedded.tools import get_defined_id
|
12
17
|
from airbyte_cdk.sources.utils.schema_helpers import check_config_against_spec_or_exit
|
13
|
-
from airbyte_protocol.models import AirbyteRecordMessage, AirbyteStateMessage, SyncMode, Type
|
14
18
|
|
15
19
|
TOutput = TypeVar("TOutput")
|
16
20
|
|
@@ -31,11 +35,15 @@ class BaseEmbeddedIntegration(ABC, Generic[TConfig, TOutput]):
|
|
31
35
|
"""
|
32
36
|
pass
|
33
37
|
|
34
|
-
def _load_data(
|
38
|
+
def _load_data(
|
39
|
+
self, stream_name: str, state: Optional[AirbyteStateMessage] = None
|
40
|
+
) -> Iterable[TOutput]:
|
35
41
|
catalog = self.source.discover(self.config)
|
36
42
|
stream = get_stream(catalog, stream_name)
|
37
43
|
if not stream:
|
38
|
-
raise ValueError(
|
44
|
+
raise ValueError(
|
45
|
+
f"Stream {stream_name} not found, the following streams are available: {', '.join(get_stream_names(catalog))}"
|
46
|
+
)
|
39
47
|
if SyncMode.incremental not in stream.supported_sync_modes:
|
40
48
|
configured_catalog = create_configured_catalog(stream, sync_mode=SyncMode.full_refresh)
|
41
49
|
else:
|
@@ -43,7 +51,10 @@ class BaseEmbeddedIntegration(ABC, Generic[TConfig, TOutput]):
|
|
43
51
|
|
44
52
|
for message in self.source.read(self.config, configured_catalog, state):
|
45
53
|
if message.type == Type.RECORD:
|
46
|
-
output = self._handle_record(
|
54
|
+
output = self._handle_record(
|
55
|
+
message.record,
|
56
|
+
get_defined_id(stream, message.record.data), # type: ignore[union-attr, arg-type]
|
57
|
+
)
|
47
58
|
if output:
|
48
59
|
yield output
|
49
60
|
elif message.type is Type.STATE and message.state:
|
@@ -31,15 +31,27 @@ def to_configured_stream(
|
|
31
31
|
primary_key: Optional[List[List[str]]] = None,
|
32
32
|
) -> ConfiguredAirbyteStream:
|
33
33
|
return ConfiguredAirbyteStream(
|
34
|
-
stream=stream,
|
34
|
+
stream=stream,
|
35
|
+
sync_mode=sync_mode,
|
36
|
+
destination_sync_mode=destination_sync_mode,
|
37
|
+
cursor_field=cursor_field,
|
38
|
+
primary_key=primary_key,
|
35
39
|
)
|
36
40
|
|
37
41
|
|
38
|
-
def to_configured_catalog(
|
42
|
+
def to_configured_catalog(
|
43
|
+
configured_streams: List[ConfiguredAirbyteStream],
|
44
|
+
) -> ConfiguredAirbyteCatalog:
|
39
45
|
return ConfiguredAirbyteCatalog(streams=configured_streams)
|
40
46
|
|
41
47
|
|
42
|
-
def create_configured_catalog(
|
43
|
-
|
48
|
+
def create_configured_catalog(
|
49
|
+
stream: AirbyteStream, sync_mode: SyncMode = SyncMode.full_refresh
|
50
|
+
) -> ConfiguredAirbyteCatalog:
|
51
|
+
configured_streams = [
|
52
|
+
to_configured_stream(
|
53
|
+
stream, sync_mode=sync_mode, primary_key=stream.source_defined_primary_key
|
54
|
+
)
|
55
|
+
]
|
44
56
|
|
45
57
|
return to_configured_catalog(configured_streams)
|
@@ -8,7 +8,13 @@ from abc import ABC, abstractmethod
|
|
8
8
|
from typing import Generic, Iterable, Optional
|
9
9
|
|
10
10
|
from airbyte_cdk.connector import TConfig
|
11
|
-
from airbyte_cdk.models import
|
11
|
+
from airbyte_cdk.models import (
|
12
|
+
AirbyteCatalog,
|
13
|
+
AirbyteMessage,
|
14
|
+
AirbyteStateMessage,
|
15
|
+
ConfiguredAirbyteCatalog,
|
16
|
+
ConnectorSpecification,
|
17
|
+
)
|
12
18
|
from airbyte_cdk.sources.source import Source
|
13
19
|
|
14
20
|
|
@@ -22,7 +28,12 @@ class SourceRunner(ABC, Generic[TConfig]):
|
|
22
28
|
pass
|
23
29
|
|
24
30
|
@abstractmethod
|
25
|
-
def read(
|
31
|
+
def read(
|
32
|
+
self,
|
33
|
+
config: TConfig,
|
34
|
+
catalog: ConfiguredAirbyteCatalog,
|
35
|
+
state: Optional[AirbyteStateMessage],
|
36
|
+
) -> Iterable[AirbyteMessage]:
|
26
37
|
pass
|
27
38
|
|
28
39
|
|
@@ -37,5 +48,10 @@ class CDKRunner(SourceRunner[TConfig]):
|
|
37
48
|
def discover(self, config: TConfig) -> AirbyteCatalog:
|
38
49
|
return self._source.discover(self._logger, config)
|
39
50
|
|
40
|
-
def read(
|
51
|
+
def read(
|
52
|
+
self,
|
53
|
+
config: TConfig,
|
54
|
+
catalog: ConfiguredAirbyteCatalog,
|
55
|
+
state: Optional[AirbyteStateMessage],
|
56
|
+
) -> Iterable[AirbyteMessage]:
|
41
57
|
return self._source.read(self._logger, config, catalog, state=[state] if state else [])
|
@@ -5,10 +5,13 @@
|
|
5
5
|
from typing import Any, Callable, Dict, Iterable, Optional
|
6
6
|
|
7
7
|
import dpath
|
8
|
+
|
8
9
|
from airbyte_cdk.models import AirbyteStream
|
9
10
|
|
10
11
|
|
11
|
-
def get_first(
|
12
|
+
def get_first(
|
13
|
+
iterable: Iterable[Any], predicate: Callable[[Any], bool] = lambda m: True
|
14
|
+
) -> Optional[Any]:
|
12
15
|
return next(filter(predicate, iterable), None)
|
13
16
|
|
14
17
|
|
@@ -18,7 +21,7 @@ def get_defined_id(stream: AirbyteStream, data: Dict[str, Any]) -> Optional[str]
|
|
18
21
|
primary_key = []
|
19
22
|
for key in stream.source_defined_primary_key:
|
20
23
|
try:
|
21
|
-
primary_key.append(str(dpath.
|
24
|
+
primary_key.append(str(dpath.get(data, key)))
|
22
25
|
except KeyError:
|
23
26
|
primary_key.append("__not_found__")
|
24
27
|
return "_".join(primary_key)
|
@@ -0,0 +1,152 @@
|
|
1
|
+
## Behavior
|
2
|
+
|
3
|
+
The Airbyte protocol defines the actions `spec`, `discover`, `check` and `read` for a source to be compliant. Here is the high-level description of the flow for a file-based source:
|
4
|
+
|
5
|
+
- spec: calls AbstractFileBasedSpec.documentation_url and AbstractFileBasedSpec.schema to return a ConnectorSpecification.
|
6
|
+
- discover: calls Source.streams, and subsequently Stream.get_json_schema; this uses Source.open_file to open files during schema discovery.
|
7
|
+
- check: Source.check_connection is called from the entrypoint code (in the main CDK).
|
8
|
+
- read: Stream.read_records calls Stream.list_files which calls Source.list_matching_files, and then also uses Source.open_file to parse records from the file handle.
|
9
|
+
|
10
|
+
## How to Implement Your Own
|
11
|
+
|
12
|
+
To create a file-based source a user must extend three classes – AbstractFileBasedSource, AbstractFileBasedSpec, and AbstractStreamReader – to create an implementation for the connector’s specific storage system. They then initialize a FileBasedSource with the instance of AbstractStreamReader specific to their storage system.
|
13
|
+
|
14
|
+
The abstract classes house the vast majority of the logic required by file-based sources. For example, when extending AbstractStreamReader, users only have to implement three methods:
|
15
|
+
|
16
|
+
- list_matching_files: lists files matching the glob pattern(s) provided in the config.
|
17
|
+
- open_file: returns a file handle for reading.
|
18
|
+
- config property setter: concrete implementations of AbstractFileBasedStreamReader's config setter should assert that `value` is of the correct config type for that type of StreamReader.
|
19
|
+
|
20
|
+
The result is that an implementation of a source might look like this:
|
21
|
+
|
22
|
+
```
|
23
|
+
class CustomStreamReader(AbstractStreamReader):
|
24
|
+
def open_file(self, remote_file: RemoteFile) -> FileHandler:
|
25
|
+
<...>
|
26
|
+
|
27
|
+
def get_matching_files(
|
28
|
+
self,
|
29
|
+
globs: List[str],
|
30
|
+
logger: logging.Logger,
|
31
|
+
) -> Iterable[RemoteFile]:
|
32
|
+
<...>
|
33
|
+
|
34
|
+
@config.setter
|
35
|
+
def config(self, value: Config):
|
36
|
+
assert isinstance(value, CustomConfig)
|
37
|
+
self._config = value
|
38
|
+
|
39
|
+
|
40
|
+
class CustomConfig(AbstractFileBasedSpec):
|
41
|
+
@classmethod
|
42
|
+
def documentation_url(cls) -> AnyUrl:
|
43
|
+
return AnyUrl("https://docs.airbyte.com/integrations/sources/s3", scheme="https")
|
44
|
+
|
45
|
+
a_spec_field: str = Field(title="A Spec Field", description="This is where you describe the fields of the spec", order=0)
|
46
|
+
<...>
|
47
|
+
```
|
48
|
+
|
49
|
+
For more information, check the docstrings of each class or look at specific implementations (like source-s3).
|
50
|
+
|
51
|
+
## Supported File Types
|
52
|
+
|
53
|
+
### Avro
|
54
|
+
|
55
|
+
Avro is a serialization format developed by [Apache](https://avro.apache.org/docs/). Avro configuration options for the file-based CDK:
|
56
|
+
|
57
|
+
- `double_as_string`: Whether to convert double fields to strings. This is recommended if you have decimal numbers with a high degree of precision because there can be a loss of precision when handling floating point numbers.
|
58
|
+
|
59
|
+
### CSV
|
60
|
+
|
61
|
+
CSV is a format loosely described by [RFC 4180](https://www.rfc-editor.org/rfc/rfc4180). The format is quite flexible which leads to a ton of options to consider:
|
62
|
+
|
63
|
+
- `delimiter`: The character delimiting individual cells in the CSV data. By name, CSV is comma separated so the default value is `,`
|
64
|
+
- `quote_char`: When quoted fields are used, it is possible for a field to span multiple lines, even when line breaks appear within such field. The default quote character is `"`.
|
65
|
+
- `escape_char`: The character used for escaping special characters.
|
66
|
+
- `encoding`: The character encoding of the file. By default, `UTF-8`
|
67
|
+
- `double_quote`: Whether two quotes in a quoted CSV value denote a single quote in the data.
|
68
|
+
- `quoting_behavior`: The quoting behavior determines when a value in a row should have quote marks added around it.
|
69
|
+
- `skip_rows_before_header`: The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.
|
70
|
+
- `skip_rows_after_header`: The number of rows to skip after the header row.
|
71
|
+
- `autogenerate_column_names`: If your CSV does not have a header row, this option must be enabled so that the file-based CDK generates column names.
|
72
|
+
- `null_values`: As CSV does not explicitly define a value for null values, the user can specify a set of case-sensitive strings that should be interpreted as null values.
|
73
|
+
- `true_values`: As CSV does not explicitly define a value for positive boolean, the user can specify a set of case-sensitive strings that should be interpreted as true values.
|
74
|
+
- `false_values`: As CSV does not explicitly define a value for negative boolean, the user can specify a set of case-sensitive strings that should be interpreted as false values.
|
75
|
+
|
76
|
+
### JSONL
|
77
|
+
|
78
|
+
[JSONL](https://jsonlines.org/) (or JSON Lines) is a format where each row is a JSON object. There are no configuration options for this format. For backward compatibility reasons, the JSONL parser currently supports multiline objects even though this is not part of the JSONL standard. Following some data gathering, we reserve the right to remove support for this. If files contain multiline JSON objects, performance will be slower.
|
79
|
+
|
80
|
+
### Parquet
|
81
|
+
|
82
|
+
Parquet is a file format defined by [Apache](https://parquet.apache.org/). Configuration options are:
|
83
|
+
|
84
|
+
- `decimal_as_float`: Whether to convert decimal fields to floats. There is a loss of precision when converting decimals to floats, so this is not recommended.
|
85
|
+
|
86
|
+
### Document file types (PDF, DOCX, Markdown)
|
87
|
+
|
88
|
+
For file share source connectors, the `unstructured` parser can be used to parse document file types. The textual content of the whole file will be parsed as a single record with a `content` field containing the text encoded as markdown.
|
89
|
+
|
90
|
+
To use the unstructured parser, the libraries `poppler` and `tesseract` need to be installed on the system running the connector. For example, on Ubuntu, you can install them with the following command:
|
91
|
+
|
92
|
+
```
|
93
|
+
apt-get install -y tesseract-ocr poppler-utils
|
94
|
+
```
|
95
|
+
|
96
|
+
on Mac, you can install these via brew:
|
97
|
+
|
98
|
+
```
|
99
|
+
brew install poppler
|
100
|
+
brew install tesseract
|
101
|
+
```
|
102
|
+
|
103
|
+
## Schema
|
104
|
+
|
105
|
+
Having a schema allows the file-based CDK to take action when there is a discrepancy between a record and the expected types of its fields.
|
106
|
+
|
107
|
+
Schema can be either inferred or user provided.
|
108
|
+
|
109
|
+
- If the user defines it using JSON types, inference will not apply. Input schemas are key/value pairs of strings describing the column name and data type. Supported types are `["string", "number", "integer", "object", "array", "boolean", "null"]`. For example, `{"col1": "string", "col2": "boolean"}`.
|
110
|
+
- If the user enables schemaless sync, the schema will be `{"data": "object"}` and therefore emitted records will look like `{"data": {"col1": val1, …}}`. This is recommended if the contents between files in the stream vary significantly, and/or if data is very nested.
|
111
|
+
- Otherwise, the file-based CDK will infer the schema depending on the file type. Some file formats define the schema as part of their metadata (like Parquet), some do so at the record level (like Avro) and some don't have any explicit typing (like JSON or CSV). Note that all CSV values are inferred as strings except where we are supporting legacy configurations. Any file format that does not define its schema at the metadata level will require the file-based CDK to iterate over a number of records. There is a limit on the number of bytes that will be consumed in order to infer the schema.
|
112
|
+
|
113
|
+
### Validation Policies
|
114
|
+
|
115
|
+
Users will be required to select one of 3 different options, in the event that records are encountered that don’t conform to the schema.
|
116
|
+
|
117
|
+
- Skip nonconforming records: check each record to see if it conforms to the user-input or inferred schema; skip the record if it doesn't conform. We keep a count of the number of records in each file that do and do not conform and emit a log message with these counts once we’re done reading the file.
|
118
|
+
- Emit all records: emit all records, even if they do not conform to the user-provided or inferred schema. Columns that don't exist in the configured catalog probably won't be available in the destination's table since that's the current behavior.
|
119
|
+
Only error if there are conflicting field types or malformed rows.
|
120
|
+
- Stop the sync and wait for schema re-discovery: if a record is encountered that does not conform to the configured catalog’s schema, we log a message and stop the whole sync. Note: this option is not recommended if the files have very different columns or datatypes, because the inferred schema may vary significantly at discover time.
|
121
|
+
|
122
|
+
When `schemaless` is enabled, validation will be skipped.
|
123
|
+
|
124
|
+
## Breaking Changes (compared to previous S3 implementation)
|
125
|
+
|
126
|
+
- [CSV] Mapping of type `array` and `object`: before, they were mapped as `large_string` and hence cast to strings. Given the new changes, if `array` or `object` is specified, the value will be cast to `array` or `object` respectively.
|
127
|
+
- [CSV] Before, a string value would not be considered as `null_values` if the column type was a string. We will now start to cast string columns with values matching `null_values` to null.
|
128
|
+
- [CSV] `decimal_point` option is deprecated: It is no longer possible to use a character other than `.` to separate the integer part from the fractional part. If a float is formatted with a different separator, it will be treated as a string.
|
129
|
+
- [Parquet] `columns` option is deprecated: You can use Airbyte column selection in order to have the same behavior. We don't expect it, but this could have impact on the performance as payload could be bigger.
|
130
|
+
|
131
|
+
## Incremental syncs
|
132
|
+
|
133
|
+
The file-based connectors support the following [sync modes](https://docs.airbyte.com/cloud/core-concepts#connection-sync-modes):
|
134
|
+
|
135
|
+
| Feature | Supported? |
|
136
|
+
| :--------------------------------------------- | :--------- |
|
137
|
+
| Full Refresh Sync | Yes |
|
138
|
+
| Incremental Sync | Yes |
|
139
|
+
| Replicate Incremental Deletes | No |
|
140
|
+
| Replicate Multiple Files \(pattern matching\) | Yes |
|
141
|
+
| Replicate Multiple Streams \(distinct tables\) | Yes |
|
142
|
+
| Namespaces | No |
|
143
|
+
|
144
|
+
We recommend you do not manually modify files that are already synced. The connector has file-level granularity, which means adding or modifying a row in a CSV file will trigger a re-sync of the content of that file.
|
145
|
+
|
146
|
+
### Incremental sync
|
147
|
+
|
148
|
+
After the initial sync, the connector only pulls files that were modified since the last sync.
|
149
|
+
|
150
|
+
The connector checkpoints the connection state when it is done syncing all files for a given timestamp. The connection's state only keeps track of the last 10 000 files synced. If more than 10 000 files are synced, the connector won't be able to rely on the connection state to deduplicate files. In this case, the connector will initialize its cursor to the minimum of the earliest file in the history and 3 days ago.
|
151
|
+
|
152
|
+
Both the maximum number of files, and the time buffer can be configured by connector developers.
|