airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.5.5__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
- airbyte_cdk/__init__.py +17 -2
- airbyte_cdk/config_observation.py +10 -3
- airbyte_cdk/connector.py +19 -9
- airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
- airbyte_cdk/connector_builder/main.py +26 -6
- airbyte_cdk/connector_builder/message_grouper.py +95 -25
- airbyte_cdk/destinations/destination.py +47 -14
- airbyte_cdk/destinations/vector_db_based/config.py +36 -14
- airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
- airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
- airbyte_cdk/entrypoint.py +82 -26
- airbyte_cdk/exception_handler.py +13 -3
- airbyte_cdk/logger.py +10 -2
- airbyte_cdk/models/airbyte_protocol.py +11 -5
- airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/abstract_source.py +63 -17
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
- airbyte_cdk/sources/connector_state_manager.py +32 -10
- airbyte_cdk/sources/declarative/async_job/job.py +3 -1
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
- airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
- airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/token.py +25 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
- airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
- airbyte_cdk/sources/declarative/declarative_source.py +3 -1
- airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
- airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
- airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
- airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
- airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +656 -678
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +782 -232
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
- airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
- airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
- airbyte_cdk/sources/declarative/spec/spec.py +8 -2
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
- airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
- airbyte_cdk/sources/declarative/types.py +8 -1
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
- airbyte_cdk/sources/embedded/base_integration.py +14 -4
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +3 -1
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
- airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
- airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +13 -15
- airbyte_cdk/sources/file_based/file_based_source.py +82 -24
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
- airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
- airbyte_cdk/sources/http_logger.py +5 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +17 -7
- airbyte_cdk/sources/streams/availability_strategy.py +9 -3
- airbyte_cdk/sources/streams/call_rate.py +63 -19
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
- airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
- airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
- airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
- airbyte_cdk/sources/streams/core.py +77 -22
- airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
- airbyte_cdk/sources/streams/http/exceptions.py +2 -2
- airbyte_cdk/sources/streams/http/http.py +133 -33
- airbyte_cdk/sources/streams/http/http_client.py +91 -29
- airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +5 -1
- airbyte_cdk/sources/utils/record_helper.py +12 -3
- airbyte_cdk/sources/utils/schema_helpers.py +9 -3
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +24 -9
- airbyte_cdk/sql/exceptions.py +19 -6
- airbyte_cdk/sql/secrets.py +3 -1
- airbyte_cdk/sql/shared/catalog_providers.py +13 -4
- airbyte_cdk/sql/shared/sql_processor.py +44 -14
- airbyte_cdk/test/catalog_builder.py +19 -8
- airbyte_cdk/test/entrypoint_wrapper.py +27 -8
- airbyte_cdk/test/mock_http/mocker.py +41 -11
- airbyte_cdk/test/mock_http/request.py +9 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +29 -7
- airbyte_cdk/test/state_builder.py +10 -2
- airbyte_cdk/test/utils/data.py +6 -2
- airbyte_cdk/test/utils/http_mocking.py +3 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +11 -4
- airbyte_cdk/utils/print_buffer.py +6 -1
- airbyte_cdk/utils/schema_inferrer.py +30 -9
- airbyte_cdk/utils/spec_schema_transformations.py +3 -1
- airbyte_cdk/utils/traced_exception.py +35 -9
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/METADATA +7 -6
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/RECORD +198 -198
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.5.5.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/embedded/catalog.py
CHANGED
@@ -31,15 +31,27 @@ def to_configured_stream(
     primary_key: Optional[List[List[str]]] = None,
 ) -> ConfiguredAirbyteStream:
     return ConfiguredAirbyteStream(
-        stream=stream, sync_mode=sync_mode, destination_sync_mode=destination_sync_mode, cursor_field=cursor_field, primary_key=primary_key
+        stream=stream,
+        sync_mode=sync_mode,
+        destination_sync_mode=destination_sync_mode,
+        cursor_field=cursor_field,
+        primary_key=primary_key,
     )


-def to_configured_catalog(configured_streams: List[ConfiguredAirbyteStream]) -> ConfiguredAirbyteCatalog:
+def to_configured_catalog(
+    configured_streams: List[ConfiguredAirbyteStream],
+) -> ConfiguredAirbyteCatalog:
     return ConfiguredAirbyteCatalog(streams=configured_streams)


-def create_configured_catalog(stream: AirbyteStream, sync_mode: SyncMode = SyncMode.full_refresh) -> ConfiguredAirbyteCatalog:
-    configured_streams = [to_configured_stream(stream, sync_mode=sync_mode, primary_key=stream.source_defined_primary_key)]
+def create_configured_catalog(
+    stream: AirbyteStream, sync_mode: SyncMode = SyncMode.full_refresh
+) -> ConfiguredAirbyteCatalog:
+    configured_streams = [
+        to_configured_stream(
+            stream, sync_mode=sync_mode, primary_key=stream.source_defined_primary_key
+        )
+    ]

     return to_configured_catalog(configured_streams)
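For orientation, a minimal usage sketch of the helpers reformatted above, assuming they live in airbyte_cdk/sources/embedded/catalog.py as listed in the file summary. The "users" stream and its schema are made up, and the destination sync mode is assumed to default in the unchanged part of `to_configured_stream` that the hunk does not show.

```python
# Illustrative sketch only, not library documentation.
from airbyte_cdk.models import AirbyteStream, SyncMode
from airbyte_cdk.sources.embedded.catalog import create_configured_catalog

stream = AirbyteStream(
    name="users",
    json_schema={"type": "object", "properties": {"id": {"type": "integer"}}},
    supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
)

# Wraps the stream in a ConfiguredAirbyteCatalog, defaulting to full refresh and
# the stream's source-defined primary key (per create_configured_catalog above).
catalog = create_configured_catalog(stream, sync_mode=SyncMode.full_refresh)
print(catalog.streams[0].stream.name)  # "users"
```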
airbyte_cdk/sources/embedded/runner.py
CHANGED
@@ -8,7 +8,13 @@ from abc import ABC, abstractmethod
 from typing import Generic, Iterable, Optional

 from airbyte_cdk.connector import TConfig
-from airbyte_cdk.models import AirbyteCatalog, AirbyteMessage, AirbyteStateMessage, ConfiguredAirbyteCatalog, ConnectorSpecification
+from airbyte_cdk.models import (
+    AirbyteCatalog,
+    AirbyteMessage,
+    AirbyteStateMessage,
+    ConfiguredAirbyteCatalog,
+    ConnectorSpecification,
+)
 from airbyte_cdk.sources.source import Source


@@ -22,7 +28,12 @@ class SourceRunner(ABC, Generic[TConfig]):
         pass

     @abstractmethod
-    def read(self, config: TConfig, catalog: ConfiguredAirbyteCatalog, state: Optional[AirbyteStateMessage]) -> Iterable[AirbyteMessage]:
+    def read(
+        self,
+        config: TConfig,
+        catalog: ConfiguredAirbyteCatalog,
+        state: Optional[AirbyteStateMessage],
+    ) -> Iterable[AirbyteMessage]:
         pass


@@ -37,5 +48,10 @@ class CDKRunner(SourceRunner[TConfig]):
     def discover(self, config: TConfig) -> AirbyteCatalog:
         return self._source.discover(self._logger, config)

-    def read(self, config: TConfig, catalog: ConfiguredAirbyteCatalog, state: Optional[AirbyteStateMessage]) -> Iterable[AirbyteMessage]:
+    def read(
+        self,
+        config: TConfig,
+        catalog: ConfiguredAirbyteCatalog,
+        state: Optional[AirbyteStateMessage],
+    ) -> Iterable[AirbyteMessage]:
         return self._source.read(self._logger, config, catalog, state=[state] if state else [])
airbyte_cdk/sources/embedded/tools.py
CHANGED
@@ -8,7 +8,9 @@ import dpath
 from airbyte_cdk.models import AirbyteStream


-def get_first(iterable: Iterable[Any], predicate: Callable[[Any], bool] = lambda m: True) -> Optional[Any]:
+def get_first(
+    iterable: Iterable[Any], predicate: Callable[[Any], bool] = lambda m: True
+) -> Optional[Any]:
     return next(filter(predicate, iterable), None)

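The helper above only wraps `next(filter(...), None)`. A tiny illustrative call, with made-up data and assuming the helper lives in airbyte_cdk/sources/embedded/tools.py as listed in the file summary:

```python
# Illustrative calls of get_first as reformatted above; the list is made up.
from airbyte_cdk.sources.embedded.tools import get_first

names = ["users", "orders", "invoices"]
assert get_first(names, lambda n: n.startswith("o")) == "orders"
assert get_first(names) == "users"            # default predicate accepts everything
assert get_first([], lambda n: True) is None  # empty iterable -> None
```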
airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py
CHANGED
@@ -22,7 +22,9 @@ if TYPE_CHECKING:

 class AbstractFileBasedAvailabilityStrategy(AvailabilityStrategy):
     @abstractmethod
-    def check_availability(self, stream: Stream, logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]:
+    def check_availability(
+        self, stream: Stream, logger: logging.Logger, _: Optional[Source]
+    ) -> Tuple[bool, Optional[str]]:
         """
         Perform a connection check for the stream.

@@ -48,10 +50,16 @@ class AbstractFileBasedAvailabilityStrategyWrapper(AbstractAvailabilityStrategy)
         self.stream = stream

     def check_availability(self, logger: logging.Logger) -> StreamAvailability:
-        is_available, reason = self.stream.availability_strategy.check_availability(self.stream, logger, None)
+        is_available, reason = self.stream.availability_strategy.check_availability(
+            self.stream, logger, None
+        )
         if is_available:
             return StreamAvailable()
         return StreamUnavailable(reason or "")

-    def check_availability_and_parsability(self, logger: logging.Logger) -> Tuple[bool, Optional[str]]:
-        return self.stream.availability_strategy.check_availability_and_parsability(self.stream, logger, None)
+    def check_availability_and_parsability(
+        self, logger: logging.Logger
+    ) -> Tuple[bool, Optional[str]]:
+        return self.stream.availability_strategy.check_availability_and_parsability(
+            self.stream, logger, None
+        )
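A hedged sketch of a custom strategy implementing the two methods whose signatures are re-wrapped above. It only defines the class; a real strategy would inspect the stream's files, and the stream/source parameters are typed loosely here rather than with the library's exact types.

```python
# Sketch only: a trivial AbstractFileBasedAvailabilityStrategy subclass that
# reports every stream as available.
import logging
from typing import Any, Optional, Tuple

from airbyte_cdk.sources.file_based.availability_strategy import (
    AbstractFileBasedAvailabilityStrategy,
)


class AlwaysAvailableStrategy(AbstractFileBasedAvailabilityStrategy):
    def check_availability(
        self, stream: Any, logger: logging.Logger, _: Optional[Any]
    ) -> Tuple[bool, Optional[str]]:
        # A real implementation would try to list files for the stream here.
        return True, None

    def check_availability_and_parsability(
        self, stream: Any, logger: logging.Logger, _: Optional[Any]
    ) -> Tuple[bool, Optional[str]]:
        # A real implementation would also parse one record from one file.
        return True, None
```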
airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py
CHANGED
@@ -8,8 +8,14 @@ from typing import TYPE_CHECKING, Optional, Tuple

 from airbyte_cdk import AirbyteTracedException
 from airbyte_cdk.sources import Source
-from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy
-from airbyte_cdk.sources.file_based.exceptions import CheckAvailabilityError, CustomFileBasedException, FileBasedSourceError
+from airbyte_cdk.sources.file_based.availability_strategy import (
+    AbstractFileBasedAvailabilityStrategy,
+)
+from airbyte_cdk.sources.file_based.exceptions import (
+    CheckAvailabilityError,
+    CustomFileBasedException,
+    FileBasedSourceError,
+)
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
 from airbyte_cdk.sources.file_based.schema_helpers import conforms_to_schema
@@ -22,7 +28,9 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
     def __init__(self, stream_reader: AbstractFileBasedStreamReader):
         self.stream_reader = stream_reader

-    def check_availability(self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]) -> Tuple[bool, Optional[str]]:  # type: ignore[override]
+    def check_availability(
+        self, stream: "AbstractFileBasedStream", logger: logging.Logger, _: Optional[Source]
+    ) -> Tuple[bool, Optional[str]]:  # type: ignore[override]
         """
         Perform a connection check for the stream (verify that we can list files from the stream).

@@ -87,15 +95,25 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
         except CustomFileBasedException as exc:
             raise CheckAvailabilityError(str(exc), stream=stream.name) from exc
         except Exception as exc:
-            raise CheckAvailabilityError(FileBasedSourceError.ERROR_LISTING_FILES, stream=stream.name) from exc
+            raise CheckAvailabilityError(
+                FileBasedSourceError.ERROR_LISTING_FILES, stream=stream.name
+            ) from exc

         return file

-    def _check_parse_record(self, stream: "AbstractFileBasedStream", file: RemoteFile, logger: logging.Logger) -> None:
+    def _check_parse_record(
+        self, stream: "AbstractFileBasedStream", file: RemoteFile, logger: logging.Logger
+    ) -> None:
         parser = stream.get_parser()

         try:
-            record = next(iter(parser.parse_records(stream.config, file, self.stream_reader, logger, discovered_schema=None)))
+            record = next(
+                iter(
+                    parser.parse_records(
+                        stream.config, file, self.stream_reader, logger, discovered_schema=None
+                    )
+                )
+            )
         except StopIteration:
             # The file is empty. We've verified that we can open it, so will
             # consider the connection check successful even though it means
@@ -104,7 +122,9 @@ class DefaultFileBasedAvailabilityStrategy(AbstractFileBasedAvailabilityStrategy
         except AirbyteTracedException as ate:
             raise ate
         except Exception as exc:
-            raise CheckAvailabilityError(FileBasedSourceError.ERROR_READING_FILE, stream=stream.name, file=file.uri) from exc
+            raise CheckAvailabilityError(
+                FileBasedSourceError.ERROR_READING_FILE, stream=stream.name, file=file.uri
+            ) from exc

         schema = stream.catalog_schema or stream.config.input_schema
         if schema and stream.validation_policy.validate_schema_before_sync:
airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py
CHANGED
@@ -107,10 +107,16 @@ class AbstractFileBasedSpec(BaseModel):

         properties_to_change = ["validation_policy"]
         for property_to_change in properties_to_change:
-            property_object = schema["properties"]["streams"]["items"]["properties"][property_to_change]
+            property_object = schema["properties"]["streams"]["items"]["properties"][
+                property_to_change
+            ]
             if "anyOf" in property_object:
-                schema["properties"]["streams"]["items"]["properties"][property_to_change]["type"] = "object"
-                schema["properties"]["streams"]["items"]["properties"][property_to_change]["oneOf"] = property_object.pop("anyOf")
+                schema["properties"]["streams"]["items"]["properties"][property_to_change][
+                    "type"
+                ] = "object"
+                schema["properties"]["streams"]["items"]["properties"][property_to_change][
+                    "oneOf"
+                ] = property_object.pop("anyOf")
             AbstractFileBasedSpec.move_enum_to_root(property_object)

         csv_format_schemas = list(
@@ -121,9 +127,9 @@ class AbstractFileBasedSpec(BaseModel):
         )
         if len(csv_format_schemas) != 1:
             raise ValueError(f"Expecting only one CSV format but got {csv_format_schemas}")
-        csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0]["properties"]["header_definition"].pop(
-            "anyOf", []
-        )
+        csv_format_schemas[0]["properties"]["header_definition"]["oneOf"] = csv_format_schemas[0][
+            "properties"
+        ]["header_definition"].pop("anyOf", [])
         csv_format_schemas[0]["properties"]["header_definition"]["type"] = "object"
         return schema

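The hunks above only re-wrap long subscript chains; the underlying schema rewrite is unchanged. As a toy illustration (stand-in dictionary, not library code), the rewrite moves a property's `anyOf` list to `oneOf` and forces its type to `object`:

```python
# Toy dictionary standing in for one property of the generated spec schema.
property_object = {"anyOf": [{"title": "Emit Record"}, {"title": "Skip Record"}]}

if "anyOf" in property_object:
    property_object["type"] = "object"
    property_object["oneOf"] = property_object.pop("anyOf")

assert property_object == {
    "type": "object",
    "oneOf": [{"title": "Emit Record"}, {"title": "Skip Record"}],
}
```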
airbyte_cdk/sources/file_based/config/csv_format.py
CHANGED
@@ -70,7 +70,9 @@ class CsvHeaderUserProvided(BaseModel):
     @validator("column_names")
     def validate_column_names(cls, v: List[str]) -> List[str]:
         if not v:
-            raise ValueError("At least one column name needs to be provided when using user provided headers")
+            raise ValueError(
+                "At least one column name needs to be provided when using user provided headers"
+            )
         return v


@@ -107,7 +109,9 @@ class CsvFormat(BaseModel):
         description='The character encoding of the CSV data. Leave blank to default to <strong>UTF8</strong>. See <a href="https://docs.python.org/3/library/codecs.html#standard-encodings" target="_blank">list of python encodings</a> for allowable options.',
     )
     double_quote: bool = Field(
-        title="Double Quote", default=True, description="Whether two quotes in a quoted CSV value denote a single quote in the data.",
+        title="Double Quote",
+        default=True,
+        description="Whether two quotes in a quoted CSV value denote a single quote in the data.",
     )
     null_values: Set[str] = Field(
         title="Null Values",
@@ -125,12 +129,16 @@ class CsvFormat(BaseModel):
         description="The number of rows to skip before the header row. For example, if the header row is on the 3rd row, enter 2 in this field.",
     )
     skip_rows_after_header: int = Field(
-        title="Skip Rows After Header", default=0, description="The number of rows to skip after the header row.",
+        title="Skip Rows After Header",
+        default=0,
+        description="The number of rows to skip after the header row.",
     )
-    header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
-        title="CSV Header Definition",
-        default=CsvHeaderFromCsv(header_definition_type=CsvHeaderDefinitionType.FROM_CSV.value),
-        description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
+    header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = (
+        Field(
+            title="CSV Header Definition",
+            default=CsvHeaderFromCsv(header_definition_type=CsvHeaderDefinitionType.FROM_CSV.value),
+            description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
+        )
     )
     true_values: Set[str] = Field(
         title="True Values",
@@ -189,9 +197,13 @@ class CsvFormat(BaseModel):
         definition_type = values.get("header_definition_type")
         column_names = values.get("user_provided_column_names")
         if definition_type == CsvHeaderDefinitionType.USER_PROVIDED and not column_names:
-            raise ValidationError("`user_provided_column_names` should be defined if the definition 'User Provided'.", model=CsvFormat)
+            raise ValidationError(
+                "`user_provided_column_names` should be defined if the definition 'User Provided'.",
+                model=CsvFormat,
+            )
         if definition_type != CsvHeaderDefinitionType.USER_PROVIDED and column_names:
             raise ValidationError(
-                "`user_provided_column_names` should not be defined if the definition is not 'User Provided'.", model=CsvFormat
+                "`user_provided_column_names` should not be defined if the definition is not 'User Provided'.",
+                model=CsvFormat,
             )
         return values
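A hedged construction sketch using only the fields visible in this file's hunks. Defaults for the CSV fields not shown here (delimiter, quote character, discriminator values, and so on) are assumed to exist, and the import path follows the file listing above.

```python
# Sketch only: a CsvFormat whose header names are supplied by the user, per the
# CsvHeaderUserProvided model and header_definition field shown above.
from airbyte_cdk.sources.file_based.config.csv_format import (
    CsvFormat,
    CsvHeaderUserProvided,
)

csv_format = CsvFormat(
    header_definition=CsvHeaderUserProvided(column_names=["id", "name", "updated_at"]),
    skip_rows_after_header=0,
    double_quote=True,
)
print(csv_format.header_definition.column_names)
```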
airbyte_cdk/sources/file_based/config/file_based_stream_config.py
CHANGED
@@ -56,7 +56,9 @@ class FileBasedStreamConfig(BaseModel):
         description="When the state history of the file store is full, syncs will only read files that were last modified in the provided day range.",
         default=3,
     )
-    format: Union[AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat, ExcelFormat] = Field(
+    format: Union[
+        AvroFormat, CsvFormat, JsonlFormat, ParquetFormat, UnstructuredFormat, ExcelFormat
+    ] = Field(
         title="Format",
         description="The configuration options that are used to alter how to read incoming files that deviate from the standard formatting.",
     )
@@ -89,6 +91,8 @@ class FileBasedStreamConfig(BaseModel):
         if self.input_schema:
             schema = type_mapping_to_jsonschema(self.input_schema)
             if not schema:
-                raise ValueError(f"Unable to create JSON schema from input schema {self.input_schema}")
+                raise ValueError(
+                    f"Unable to create JSON schema from input schema {self.input_schema}"
+                )
             return schema
         return None
airbyte_cdk/sources/file_based/config/unstructured_format.py
CHANGED
@@ -13,7 +13,9 @@ class LocalProcessingConfigModel(BaseModel):

     class Config(OneOfOptionConfig):
         title = "Local"
-        description = "Process files locally, supporting `fast` and `ocr` modes. This is the default option."
+        description = (
+            "Process files locally, supporting `fast` and `ocr` modes. This is the default option."
+        )
         discriminator = "mode"


@@ -23,7 +25,9 @@ class APIParameterConfigModel(BaseModel):
         description="The name of the unstructured API parameter to use",
         examples=["combine_under_n_chars", "languages"],
     )
-    value: str = Field(title="Value", description="The value of the parameter", examples=["true", "hi_res"])
+    value: str = Field(
+        title="Value", description="The value of the parameter", examples=["true", "hi_res"]
+    )


 class APIProcessingConfigModel(BaseModel):
@@ -85,7 +89,10 @@ class UnstructuredFormat(BaseModel):
         description="The strategy used to parse documents. `fast` extracts text directly from the document which doesn't work for all files. `ocr_only` is more reliable, but slower. `hi_res` is the most reliable, but requires an API key and a hosted instance of unstructured and can't be used with local mode. See the unstructured.io documentation for more details: https://unstructured-io.github.io/unstructured/core/partition.html#partition-pdf",
     )

-    processing: Union[LocalProcessingConfigModel, APIProcessingConfigModel] = Field(
+    processing: Union[
+        LocalProcessingConfigModel,
+        APIProcessingConfigModel,
+    ] = Field(
         default=LocalProcessingConfigModel(mode="local"),
         title="Processing",
         description="Processing configuration",
airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py
CHANGED
@@ -15,9 +15,7 @@ class AbstractDiscoveryPolicy(ABC):

     @property
     @abstractmethod
-    def n_concurrent_requests(self) -> int:
-        ...
+    def n_concurrent_requests(self) -> int: ...

     @abstractmethod
-    def get_max_n_files_for_schema_inference(self, parser: FileTypeParser) -> int:
-        ...
+    def get_max_n_files_for_schema_inference(self, parser: FileTypeParser) -> int: ...
airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py
CHANGED
@@ -2,7 +2,9 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #

-from airbyte_cdk.sources.file_based.discovery_policy.abstract_discovery_policy import AbstractDiscoveryPolicy
+from airbyte_cdk.sources.file_based.discovery_policy.abstract_discovery_policy import (
+    AbstractDiscoveryPolicy,
+)
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser

 DEFAULT_N_CONCURRENT_REQUESTS = 10
@@ -23,6 +25,9 @@ class DefaultDiscoveryPolicy(AbstractDiscoveryPolicy):
         return min(
             filter(
                 None,
-                (DEFAULT_MAX_N_FILES_FOR_STREAM_SCHEMA_INFERENCE, parser.parser_max_n_files_for_schema_inference),
+                (
+                    DEFAULT_MAX_N_FILES_FOR_STREAM_SCHEMA_INFERENCE,
+                    parser.parser_max_n_files_for_schema_inference,
+                ),
             )
         )
airbyte_cdk/sources/file_based/exceptions.py
CHANGED
@@ -11,27 +11,21 @@ from airbyte_cdk.utils import AirbyteTracedException

 class FileBasedSourceError(Enum):
     EMPTY_STREAM = "No files were identified in the stream. This may be because there are no files in the specified container, or because your glob patterns did not match any files. Please verify that your source contains files last modified after the start_date and that your glob patterns are not overly strict."
-    GLOB_PARSE_ERROR = (
-        "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
-    )
+    GLOB_PARSE_ERROR = "Error parsing glob pattern. Please refer to the glob pattern rules at https://facelessuser.github.io/wcmatch/glob/#split."
     ENCODING_ERROR = "File encoding error. The configured encoding must match file encoding."
     ERROR_CASTING_VALUE = "Could not cast the value to the expected type."
     ERROR_CASTING_VALUE_UNRECOGNIZED_TYPE = "Could not cast the value to the expected type because the type is not recognized. Valid types are null, array, boolean, integer, number, object, and string."
     ERROR_DECODING_VALUE = "Expected a JSON-decodeable value but could not decode record."
-    ERROR_LISTING_FILES = (
-        "Error listing files. Please check the credentials provided in the config and verify that they provide permission to list files."
-    )
-    ERROR_READING_FILE = (
-        "Error opening file. Please check the credentials provided in the config and verify that they provide permission to read files."
-    )
+    ERROR_LISTING_FILES = "Error listing files. Please check the credentials provided in the config and verify that they provide permission to list files."
+    ERROR_READING_FILE = "Error opening file. Please check the credentials provided in the config and verify that they provide permission to read files."
     ERROR_PARSING_RECORD = "Error parsing record. This could be due to a mismatch between the config's file type and the actual file type, or because the file or record is not parseable."
-    ERROR_PARSING_USER_PROVIDED_SCHEMA = "The provided schema could not be transformed into valid JSON Schema."
+    ERROR_PARSING_USER_PROVIDED_SCHEMA = (
+        "The provided schema could not be transformed into valid JSON Schema."
+    )
     ERROR_VALIDATING_RECORD = "One or more records do not pass the schema validation policy. Please modify your input schema, or select a more lenient validation policy."
     ERROR_PARSING_RECORD_MISMATCHED_COLUMNS = "A header field has resolved to `None`. This indicates that the CSV has more rows than the number of header fields. If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows."
     ERROR_PARSING_RECORD_MISMATCHED_ROWS = "A row's value has resolved to `None`. This indicates that the CSV has more columns in the header field than the number of columns in the row(s). If you input your schema or headers, please verify that the number of columns corresponds to the number of columns in your CSV's rows."
-    STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = (
-        "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema."
-    )
+    STOP_SYNC_PER_SCHEMA_VALIDATION_POLICY = "Stopping sync in accordance with the configured validation policy. Records in file did not conform to the schema."
     NULL_VALUE_IN_SCHEMA = "Error during schema inference: no type was detected for key."
     UNRECOGNIZED_TYPE = "Error during schema inference: unrecognized type."
     SCHEMA_INFERENCE_ERROR = "Error inferring schema from files. Are the files valid?"
@@ -39,7 +33,9 @@ class FileBasedSourceError(Enum):
     CONFIG_VALIDATION_ERROR = "Error creating stream config object."
     MISSING_SCHEMA = "Expected `json_schema` in the configured catalog but it is missing."
     UNDEFINED_PARSER = "No parser is defined for this file type."
-    UNDEFINED_VALIDATION_POLICY = "The validation policy defined in the config does not exist for the source."
+    UNDEFINED_VALIDATION_POLICY = (
+        "The validation policy defined in the config does not exist for the source."
+    )


 class FileBasedErrorsCollector:
@@ -70,7 +66,9 @@ class BaseFileBasedSourceError(Exception):
     def __init__(self, error: Union[FileBasedSourceError, str], **kwargs):  # type: ignore # noqa
         if isinstance(error, FileBasedSourceError):
             error = FileBasedSourceError(error).value
-        super().__init__(f"{error} Contact Support if you need assistance.\n{' '.join([f'{k}={v}' for k, v in kwargs.items()])}")
+        super().__init__(
+            f"{error} Contact Support if you need assistance.\n{' '.join([f'{k}={v}' for k, v in kwargs.items()])}"
+        )


 class ConfigValidationError(BaseFileBasedSourceError):
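A short hedged sketch of how the enum members and the exception base above combine when raised, assuming `CheckAvailabilityError` (imported alongside these names in the availability-strategy hunk) inherits the `__init__` shown for `BaseFileBasedSourceError`. The stream name is illustrative.

```python
# Sketch only: the message ends up as the enum value, the "Contact Support..."
# suffix, and the keyword context joined on a new line.
from airbyte_cdk.sources.file_based.exceptions import (
    CheckAvailabilityError,
    FileBasedSourceError,
)

try:
    raise CheckAvailabilityError(FileBasedSourceError.ERROR_LISTING_FILES, stream="users")
except CheckAvailabilityError as err:
    print(err)
```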
airbyte_cdk/sources/file_based/file_based_source.py
CHANGED
@@ -22,15 +22,31 @@ from airbyte_cdk.models import (
 from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
 from airbyte_cdk.sources.concurrent_source.concurrent_source_adapter import ConcurrentSourceAdapter
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
-from airbyte_cdk.sources.file_based.availability_strategy import AbstractFileBasedAvailabilityStrategy, DefaultFileBasedAvailabilityStrategy
+from airbyte_cdk.sources.file_based.availability_strategy import (
+    AbstractFileBasedAvailabilityStrategy,
+    DefaultFileBasedAvailabilityStrategy,
+)
 from airbyte_cdk.sources.file_based.config.abstract_file_based_spec import AbstractFileBasedSpec
-from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig, ValidationPolicy
-from airbyte_cdk.sources.file_based.discovery_policy import AbstractDiscoveryPolicy, DefaultDiscoveryPolicy
-from airbyte_cdk.sources.file_based.exceptions import ConfigValidationError, FileBasedErrorsCollector, FileBasedSourceError
+from airbyte_cdk.sources.file_based.config.file_based_stream_config import (
+    FileBasedStreamConfig,
+    ValidationPolicy,
+)
+from airbyte_cdk.sources.file_based.discovery_policy import (
+    AbstractDiscoveryPolicy,
+    DefaultDiscoveryPolicy,
+)
+from airbyte_cdk.sources.file_based.exceptions import (
+    ConfigValidationError,
+    FileBasedErrorsCollector,
+    FileBasedSourceError,
+)
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.file_types import default_parsers
 from airbyte_cdk.sources.file_based.file_types.file_type_parser import FileTypeParser
-from airbyte_cdk.sources.file_based.schema_validation_policies import DEFAULT_SCHEMA_VALIDATION_POLICIES, AbstractSchemaValidationPolicy
+from airbyte_cdk.sources.file_based.schema_validation_policies import (
+    DEFAULT_SCHEMA_VALIDATION_POLICIES,
+    AbstractSchemaValidationPolicy,
+)
 from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream, DefaultFileBasedStream
 from airbyte_cdk.sources.file_based.stream.concurrent.adapters import FileBasedStreamFacade
 from airbyte_cdk.sources.file_based.stream.concurrent.cursor import (
@@ -65,25 +81,37 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
         availability_strategy: Optional[AbstractFileBasedAvailabilityStrategy] = None,
         discovery_policy: AbstractDiscoveryPolicy = DefaultDiscoveryPolicy(),
         parsers: Mapping[Type[Any], FileTypeParser] = default_parsers,
-        validation_policies: Mapping[ValidationPolicy, AbstractSchemaValidationPolicy] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
-        cursor_cls: Type[Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]] = FileBasedConcurrentCursor,
+        validation_policies: Mapping[
+            ValidationPolicy, AbstractSchemaValidationPolicy
+        ] = DEFAULT_SCHEMA_VALIDATION_POLICIES,
+        cursor_cls: Type[
+            Union[AbstractConcurrentFileBasedCursor, AbstractFileBasedCursor]
+        ] = FileBasedConcurrentCursor,
     ):
         self.stream_reader = stream_reader
         self.spec_class = spec_class
         self.config = config
         self.catalog = catalog
         self.state = state
-        self.availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(stream_reader)
+        self.availability_strategy = availability_strategy or DefaultFileBasedAvailabilityStrategy(
+            stream_reader
+        )
         self.discovery_policy = discovery_policy
         self.parsers = parsers
         self.validation_policies = validation_policies
-        self.stream_schemas = {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
+        self.stream_schemas = (
+            {s.stream.name: s.stream.json_schema for s in catalog.streams} if catalog else {}
+        )
         self.cursor_cls = cursor_cls
         self.logger = init_logger(f"airbyte.{self.name}")
         self.errors_collector: FileBasedErrorsCollector = FileBasedErrorsCollector()
         self._message_repository: Optional[MessageRepository] = None
         concurrent_source = ConcurrentSource.create(
-            MAX_CONCURRENCY, INITIAL_N_PARTITIONS, self.logger, self._slice_logger, self.message_repository
+            MAX_CONCURRENCY,
+            INITIAL_N_PARTITIONS,
+            self.logger,
+            self._slice_logger,
+            self.message_repository,
         )
         self._state = None
         super().__init__(concurrent_source)
@@ -91,10 +119,14 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
     @property
     def message_repository(self) -> MessageRepository:
         if self._message_repository is None:
-            self._message_repository = InMemoryMessageRepository(Level(AirbyteLogFormatter.level_mapping[self.logger.level]))
+            self._message_repository = InMemoryMessageRepository(
+                Level(AirbyteLogFormatter.level_mapping[self.logger.level])
+            )
         return self._message_repository

-    def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
+    def check_connection(
+        self, logger: logging.Logger, config: Mapping[str, Any]
+    ) -> Tuple[bool, Optional[Any]]:
         """
         Check that the source can be accessed using the user-provided configuration.

@@ -195,13 +227,21 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):

                 sync_mode = self._get_sync_mode_from_catalog(stream_config.name)

-                if sync_mode == SyncMode.full_refresh and hasattr(self, "_concurrency_level") and self._concurrency_level is not None:
+                if (
+                    sync_mode == SyncMode.full_refresh
+                    and hasattr(self, "_concurrency_level")
+                    and self._concurrency_level is not None
+                ):
                     cursor = FileBasedFinalStateCursor(
-                        stream_config=stream_config, stream_namespace=None, message_repository=self.message_repository
+                        stream_config=stream_config,
+                        stream_namespace=None,
+                        message_repository=self.message_repository,
                     )
                     stream = FileBasedStreamFacade.create_from_stream(
                         stream=self._make_default_stream(
-                            stream_config=stream_config, cursor=cursor, use_file_transfer=self._use_file_transfer(parsed_config)
+                            stream_config=stream_config,
+                            cursor=cursor,
+                            use_file_transfer=self._use_file_transfer(parsed_config),
                         ),
                         source=self,
                         logger=self.logger,
@@ -230,7 +270,9 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                     )
                     stream = FileBasedStreamFacade.create_from_stream(
                         stream=self._make_default_stream(
-                            stream_config=stream_config, cursor=cursor, use_file_transfer=self._use_file_transfer(parsed_config)
+                            stream_config=stream_config,
+                            cursor=cursor,
+                            use_file_transfer=self._use_file_transfer(parsed_config),
                         ),
                         source=self,
                         logger=self.logger,
@@ -240,7 +282,9 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
                 else:
                     cursor = self.cursor_cls(stream_config)
                     stream = self._make_default_stream(
-                        stream_config=stream_config, cursor=cursor, use_file_transfer=self._use_file_transfer(parsed_config)
+                        stream_config=stream_config,
+                        cursor=cursor,
+                        use_file_transfer=self._use_file_transfer(parsed_config),
                     )

                 streams.append(stream)
@@ -250,7 +294,10 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             raise ConfigValidationError(FileBasedSourceError.CONFIG_VALIDATION_ERROR) from exc

     def _make_default_stream(
-        self, stream_config: FileBasedStreamConfig, cursor: Optional[AbstractFileBasedCursor], use_file_transfer: bool = False
+        self,
+        stream_config: FileBasedStreamConfig,
+        cursor: Optional[AbstractFileBasedCursor],
+        use_file_transfer: bool = False,
     ) -> AbstractFileBasedStream:
         return DefaultFileBasedStream(
             config=stream_config,
@@ -265,7 +312,9 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
             use_file_transfer=use_file_transfer,
         )

-    def _get_stream_from_catalog(self, stream_config: FileBasedStreamConfig) -> Optional[AirbyteStream]:
+    def _get_stream_from_catalog(
+        self, stream_config: FileBasedStreamConfig
+    ) -> Optional[AirbyteStream]:
         if self.catalog:
             for stream in self.catalog.streams or []:
                 if stream.stream.name == stream_config.name:
@@ -292,7 +341,9 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
         yield from self.errors_collector.yield_and_raise_collected()
         # count streams using a certain parser
         parsed_config = self._get_parsed_config(config)
-        for parser, count in Counter(stream.format.filetype for stream in parsed_config.streams).items():
+        for parser, count in Counter(
+            stream.format.filetype for stream in parsed_config.streams
+        ).items():
             yield create_analytics_message(f"file-cdk-{parser}-stream-count", count)

     def spec(self, *args: Any, **kwargs: Any) -> ConnectorSpecification:
@@ -308,21 +359,28 @@ class FileBasedSource(ConcurrentSourceAdapter, ABC):
     def _get_parsed_config(self, config: Mapping[str, Any]) -> AbstractFileBasedSpec:
         return self.spec_class(**config)

-    def _validate_and_get_validation_policy(self, stream_config: FileBasedStreamConfig) -> AbstractSchemaValidationPolicy:
+    def _validate_and_get_validation_policy(
+        self, stream_config: FileBasedStreamConfig
+    ) -> AbstractSchemaValidationPolicy:
         if stream_config.validation_policy not in self.validation_policies:
             # This should never happen because we validate the config against the schema's validation_policy enum
             raise ValidationError(
-                f"`validation_policy` must be one of {list(self.validation_policies.keys())}", model=FileBasedStreamConfig
+                f"`validation_policy` must be one of {list(self.validation_policies.keys())}",
+                model=FileBasedStreamConfig,
             )
         return self.validation_policies[stream_config.validation_policy]

     def _validate_input_schema(self, stream_config: FileBasedStreamConfig) -> None:
         if stream_config.schemaless and stream_config.input_schema:
-            raise ValidationError("`input_schema` and `schemaless` options cannot both be set", model=FileBasedStreamConfig)
+            raise ValidationError(
+                "`input_schema` and `schemaless` options cannot both be set",
+                model=FileBasedStreamConfig,
+            )

     @staticmethod
     def _use_file_transfer(parsed_config: AbstractFileBasedSpec) -> bool:
         use_file_transfer = (
-            hasattr(parsed_config.delivery_method, "delivery_type") and parsed_config.delivery_method.delivery_type == "use_file_transfer"
+            hasattr(parsed_config.delivery_method, "delivery_type")
+            and parsed_config.delivery_method.delivery_type == "use_file_transfer"
        )
         return use_file_transfer