airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +17 -2
- airbyte_cdk/config_observation.py +10 -3
- airbyte_cdk/connector.py +19 -9
- airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
- airbyte_cdk/connector_builder/main.py +26 -6
- airbyte_cdk/connector_builder/message_grouper.py +95 -25
- airbyte_cdk/destinations/destination.py +47 -14
- airbyte_cdk/destinations/vector_db_based/config.py +36 -14
- airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
- airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
- airbyte_cdk/entrypoint.py +82 -26
- airbyte_cdk/exception_handler.py +13 -3
- airbyte_cdk/logger.py +10 -2
- airbyte_cdk/models/airbyte_protocol.py +11 -5
- airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/abstract_source.py +63 -17
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
- airbyte_cdk/sources/connector_state_manager.py +32 -10
- airbyte_cdk/sources/declarative/async_job/job.py +3 -1
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
- airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
- airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/token.py +25 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
- airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +43 -0
- airbyte_cdk/sources/declarative/declarative_source.py +3 -1
- airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
- airbyte_cdk/sources/declarative/decoders/__init__.py +2 -2
- airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +48 -13
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
- airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
- airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
- airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +14 -5
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +697 -678
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +802 -232
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
- airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
- airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
- airbyte_cdk/sources/declarative/spec/spec.py +8 -2
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
- airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
- airbyte_cdk/sources/declarative/types.py +8 -1
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
- airbyte_cdk/sources/embedded/base_integration.py +14 -4
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +3 -1
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
- airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
- airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +13 -15
- airbyte_cdk/sources/file_based/file_based_source.py +82 -24
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
- airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
- airbyte_cdk/sources/http_logger.py +5 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +17 -7
- airbyte_cdk/sources/streams/availability_strategy.py +9 -3
- airbyte_cdk/sources/streams/call_rate.py +63 -19
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
- airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
- airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
- airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
- airbyte_cdk/sources/streams/core.py +77 -22
- airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
- airbyte_cdk/sources/streams/http/exceptions.py +2 -2
- airbyte_cdk/sources/streams/http/http.py +133 -33
- airbyte_cdk/sources/streams/http/http_client.py +91 -29
- airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +5 -1
- airbyte_cdk/sources/utils/record_helper.py +12 -3
- airbyte_cdk/sources/utils/schema_helpers.py +9 -3
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +24 -9
- airbyte_cdk/sql/exceptions.py +19 -6
- airbyte_cdk/sql/secrets.py +3 -1
- airbyte_cdk/sql/shared/catalog_providers.py +13 -4
- airbyte_cdk/sql/shared/sql_processor.py +44 -14
- airbyte_cdk/test/catalog_builder.py +19 -8
- airbyte_cdk/test/entrypoint_wrapper.py +27 -8
- airbyte_cdk/test/mock_http/mocker.py +41 -11
- airbyte_cdk/test/mock_http/request.py +9 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +29 -7
- airbyte_cdk/test/state_builder.py +10 -2
- airbyte_cdk/test/utils/data.py +6 -2
- airbyte_cdk/test/utils/http_mocking.py +3 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +11 -4
- airbyte_cdk/utils/print_buffer.py +6 -1
- airbyte_cdk/utils/schema_inferrer.py +30 -9
- airbyte_cdk/utils/spec_schema_transformations.py +3 -1
- airbyte_cdk/utils/traced_exception.py +35 -9
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/METADATA +8 -7
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/RECORD +200 -200
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/WHEEL +0 -0
airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py
CHANGED
@@ -24,45 +24,36 @@ class AbstractConcurrentFileBasedCursor(Cursor, AbstractFileBasedCursor, ABC):
|
|
24
24
|
|
25
25
|
@property
|
26
26
|
@abstractmethod
|
27
|
-
def state(self) -> MutableMapping[str, Any]:
|
28
|
-
...
|
27
|
+
def state(self) -> MutableMapping[str, Any]: ...
|
29
28
|
|
30
29
|
@abstractmethod
|
31
|
-
def observe(self, record: Record) -> None:
|
32
|
-
...
|
30
|
+
def observe(self, record: Record) -> None: ...
|
33
31
|
|
34
32
|
@abstractmethod
|
35
|
-
def close_partition(self, partition: Partition) -> None:
|
36
|
-
...
|
33
|
+
def close_partition(self, partition: Partition) -> None: ...
|
37
34
|
|
38
35
|
@abstractmethod
|
39
|
-
def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
|
40
|
-
...
|
36
|
+
def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None: ...
|
41
37
|
|
42
38
|
@abstractmethod
|
43
|
-
def add_file(self, file: RemoteFile) -> None:
|
44
|
-
...
|
39
|
+
def add_file(self, file: RemoteFile) -> None: ...
|
45
40
|
|
46
41
|
@abstractmethod
|
47
|
-
def get_files_to_sync(
|
48
|
-
|
42
|
+
def get_files_to_sync(
|
43
|
+
self, all_files: Iterable[RemoteFile], logger: logging.Logger
|
44
|
+
) -> Iterable[RemoteFile]: ...
|
49
45
|
|
50
46
|
@abstractmethod
|
51
|
-
def get_state(self) -> MutableMapping[str, Any]:
|
52
|
-
...
|
47
|
+
def get_state(self) -> MutableMapping[str, Any]: ...
|
53
48
|
|
54
49
|
@abstractmethod
|
55
|
-
def set_initial_state(self, value: StreamState) -> None:
|
56
|
-
...
|
50
|
+
def set_initial_state(self, value: StreamState) -> None: ...
|
57
51
|
|
58
52
|
@abstractmethod
|
59
|
-
def get_start_time(self) -> datetime:
|
60
|
-
...
|
53
|
+
def get_start_time(self) -> datetime: ...
|
61
54
|
|
62
55
|
@abstractmethod
|
63
|
-
def emit_state_message(self) -> None:
|
64
|
-
...
|
56
|
+
def emit_state_message(self) -> None: ...
|
65
57
|
|
66
58
|
@abstractmethod
|
67
|
-
def ensure_at_least_one_state_emitted(self) -> None:
|
68
|
-
...
|
59
|
+
def ensure_at_least_one_state_emitted(self) -> None: ...
|
@@ -11,7 +11,9 @@ from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type
|
|
11
11
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
12
12
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
13
13
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
14
|
-
from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import
|
14
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import (
|
15
|
+
AbstractConcurrentFileBasedCursor,
|
16
|
+
)
|
15
17
|
from airbyte_cdk.sources.file_based.stream.cursor import DefaultFileBasedCursor
|
16
18
|
from airbyte_cdk.sources.file_based.types import StreamState
|
17
19
|
from airbyte_cdk.sources.message.repository import MessageRepository
|
@@ -27,7 +29,9 @@ _NULL_FILE = ""
|
|
27
29
|
|
28
30
|
class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
29
31
|
CURSOR_FIELD = "_ab_source_file_last_modified"
|
30
|
-
DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL =
|
32
|
+
DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL = (
|
33
|
+
DefaultFileBasedCursor.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
|
34
|
+
)
|
31
35
|
DEFAULT_MAX_HISTORY_SIZE = 10_000
|
32
36
|
DATE_TIME_FORMAT = DefaultFileBasedCursor.DATE_TIME_FORMAT
|
33
37
|
zero_value = datetime.min
|
@@ -51,7 +55,8 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
51
55
|
self._connector_state_manager = connector_state_manager
|
52
56
|
self._cursor_field = cursor_field
|
53
57
|
self._time_window_if_history_is_full = timedelta(
|
54
|
-
days=stream_config.days_to_sync_if_history_is_full
|
58
|
+
days=stream_config.days_to_sync_if_history_is_full
|
59
|
+
or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
|
55
60
|
)
|
56
61
|
self._state_lock = RLock()
|
57
62
|
self._pending_files_lock = RLock()
|
@@ -70,7 +75,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
70
75
|
def close_partition(self, partition: Partition) -> None:
|
71
76
|
with self._pending_files_lock:
|
72
77
|
if self._pending_files is None:
|
73
|
-
raise RuntimeError(
|
78
|
+
raise RuntimeError(
|
79
|
+
"Expected pending partitions to be set but it was not. This is unexpected. Please contact Support."
|
80
|
+
)
|
74
81
|
|
75
82
|
def set_pending_partitions(self, partitions: List["FileBasedStreamPartition"]) -> None:
|
76
83
|
with self._pending_files_lock:
|
@@ -81,7 +88,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
81
88
|
continue
|
82
89
|
for file in _slice["files"]:
|
83
90
|
if file.uri in self._pending_files.keys():
|
84
|
-
raise RuntimeError(
|
91
|
+
raise RuntimeError(
|
92
|
+
f"Already found file {_slice} in pending files. This is unexpected. Please contact Support."
|
93
|
+
)
|
85
94
|
self._pending_files.update({file.uri: file})
|
86
95
|
|
87
96
|
def _compute_prev_sync_cursor(self, value: Optional[StreamState]) -> Tuple[datetime, str]:
|
@@ -96,7 +105,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
96
105
|
# represents the start time that the file was uploaded, we can usually expect that all previous
|
97
106
|
# files have already been uploaded. If that's the case, they'll be in history and we'll skip
|
98
107
|
# re-uploading them.
|
99
|
-
earliest_file_cursor_value = self._get_cursor_key_from_file(
|
108
|
+
earliest_file_cursor_value = self._get_cursor_key_from_file(
|
109
|
+
self._compute_earliest_file_in_history()
|
110
|
+
)
|
100
111
|
cursor_str = min(prev_cursor_str, earliest_file_cursor_value)
|
101
112
|
cursor_dt, cursor_uri = cursor_str.split("_", 1)
|
102
113
|
return datetime.strptime(cursor_dt, self.DATE_TIME_FORMAT), cursor_uri
|
@@ -109,8 +120,13 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
109
120
|
def _compute_earliest_file_in_history(self) -> Optional[RemoteFile]:
|
110
121
|
with self._state_lock:
|
111
122
|
if self._file_to_datetime_history:
|
112
|
-
filename, last_modified = min(
|
113
|
-
|
123
|
+
filename, last_modified = min(
|
124
|
+
self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0])
|
125
|
+
)
|
126
|
+
return RemoteFile(
|
127
|
+
uri=filename,
|
128
|
+
last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT),
|
129
|
+
)
|
114
130
|
else:
|
115
131
|
return None
|
116
132
|
|
@@ -120,7 +136,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
120
136
|
:param file: The file to add
|
121
137
|
"""
|
122
138
|
if self._pending_files is None:
|
123
|
-
raise RuntimeError(
|
139
|
+
raise RuntimeError(
|
140
|
+
"Expected pending partitions to be set but it was not. This is unexpected. Please contact Support."
|
141
|
+
)
|
124
142
|
with self._pending_files_lock:
|
125
143
|
with self._state_lock:
|
126
144
|
if file.uri not in self._pending_files:
|
@@ -135,7 +153,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
135
153
|
)
|
136
154
|
else:
|
137
155
|
self._pending_files.pop(file.uri)
|
138
|
-
self._file_to_datetime_history[file.uri] = file.last_modified.strftime(
|
156
|
+
self._file_to_datetime_history[file.uri] = file.last_modified.strftime(
|
157
|
+
self.DATE_TIME_FORMAT
|
158
|
+
)
|
139
159
|
if len(self._file_to_datetime_history) > self.DEFAULT_MAX_HISTORY_SIZE:
|
140
160
|
# Get the earliest file based on its last modified date and its uri
|
141
161
|
oldest_file = self._compute_earliest_file_in_history()
|
@@ -155,7 +175,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
155
175
|
self._stream_namespace,
|
156
176
|
new_state,
|
157
177
|
)
|
158
|
-
state_message = self._connector_state_manager.create_state_message(
|
178
|
+
state_message = self._connector_state_manager.create_state_message(
|
179
|
+
self._stream_name, self._stream_namespace
|
180
|
+
)
|
159
181
|
self._message_repository.emit_message(state_message)
|
160
182
|
|
161
183
|
def _get_new_cursor_value(self) -> str:
|
@@ -183,12 +205,19 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
183
205
|
def _compute_latest_file_in_history(self) -> Optional[RemoteFile]:
|
184
206
|
with self._state_lock:
|
185
207
|
if self._file_to_datetime_history:
|
186
|
-
filename, last_modified = max(
|
187
|
-
|
208
|
+
filename, last_modified = max(
|
209
|
+
self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0])
|
210
|
+
)
|
211
|
+
return RemoteFile(
|
212
|
+
uri=filename,
|
213
|
+
last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT),
|
214
|
+
)
|
188
215
|
else:
|
189
216
|
return None
|
190
217
|
|
191
|
-
def get_files_to_sync(
|
218
|
+
def get_files_to_sync(
|
219
|
+
self, all_files: Iterable[RemoteFile], logger: logging.Logger
|
220
|
+
) -> Iterable[RemoteFile]:
|
192
221
|
"""
|
193
222
|
Given the list of files in the source, return the files that should be synced.
|
194
223
|
:param all_files: All files in the source
|
@@ -210,7 +239,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
210
239
|
with self._state_lock:
|
211
240
|
if file.uri in self._file_to_datetime_history:
|
212
241
|
# If the file's uri is in the history, we should sync the file if it has been modified since it was synced
|
213
|
-
updated_at_from_history = datetime.strptime(
|
242
|
+
updated_at_from_history = datetime.strptime(
|
243
|
+
self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT
|
244
|
+
)
|
214
245
|
if file.last_modified < updated_at_from_history:
|
215
246
|
self._message_repository.emit_message(
|
216
247
|
AirbyteMessage(
|
@@ -246,7 +277,9 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
246
277
|
"""
|
247
278
|
with self._state_lock:
|
248
279
|
if self._file_to_datetime_history is None:
|
249
|
-
raise RuntimeError(
|
280
|
+
raise RuntimeError(
|
281
|
+
"The history object has not been set. This is unexpected. Please contact Support."
|
282
|
+
)
|
250
283
|
return len(self._file_to_datetime_history) >= self.DEFAULT_MAX_HISTORY_SIZE
|
251
284
|
|
252
285
|
def _compute_start_time(self) -> datetime:
|
@@ -268,7 +301,10 @@ class FileBasedConcurrentCursor(AbstractConcurrentFileBasedCursor):
|
|
268
301
|
Get the state of the cursor.
|
269
302
|
"""
|
270
303
|
with self._state_lock:
|
271
|
-
return {
|
304
|
+
return {
|
305
|
+
"history": self._file_to_datetime_history,
|
306
|
+
self._cursor_field.cursor_field_key: self._get_new_cursor_value(),
|
307
|
+
}
|
272
308
|
|
273
309
|
def set_initial_state(self, value: StreamState) -> None:
|
274
310
|
pass
|
@@ -9,7 +9,9 @@ from typing import TYPE_CHECKING, Any, Iterable, List, MutableMapping, Optional
|
|
9
9
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
10
10
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
11
11
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
12
|
-
from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import
|
12
|
+
from airbyte_cdk.sources.file_based.stream.concurrent.cursor.abstract_concurrent_file_based_cursor import (
|
13
|
+
AbstractConcurrentFileBasedCursor,
|
14
|
+
)
|
13
15
|
from airbyte_cdk.sources.file_based.types import StreamState
|
14
16
|
from airbyte_cdk.sources.message import MessageRepository
|
15
17
|
from airbyte_cdk.sources.streams import NO_CURSOR_STATE_KEY
|
@@ -24,7 +26,11 @@ class FileBasedFinalStateCursor(AbstractConcurrentFileBasedCursor):
|
|
24
26
|
"""Cursor that is used to guarantee at least one state message is emitted for a concurrent file-based stream."""
|
25
27
|
|
26
28
|
def __init__(
|
27
|
-
self,
|
29
|
+
self,
|
30
|
+
stream_config: FileBasedStreamConfig,
|
31
|
+
message_repository: MessageRepository,
|
32
|
+
stream_namespace: Optional[str],
|
33
|
+
**kwargs: Any,
|
28
34
|
):
|
29
35
|
self._stream_name = stream_config.name
|
30
36
|
self._stream_namespace = stream_namespace
|
@@ -50,7 +56,9 @@ class FileBasedFinalStateCursor(AbstractConcurrentFileBasedCursor):
|
|
50
56
|
def add_file(self, file: RemoteFile) -> None:
|
51
57
|
pass
|
52
58
|
|
53
|
-
def get_files_to_sync(
|
59
|
+
def get_files_to_sync(
|
60
|
+
self, all_files: Iterable[RemoteFile], logger: logging.Logger
|
61
|
+
) -> Iterable[RemoteFile]:
|
54
62
|
return all_files
|
55
63
|
|
56
64
|
def get_state(self) -> MutableMapping[str, Any]:
|
@@ -66,6 +74,10 @@ class FileBasedFinalStateCursor(AbstractConcurrentFileBasedCursor):
|
|
66
74
|
pass
|
67
75
|
|
68
76
|
def ensure_at_least_one_state_emitted(self) -> None:
|
69
|
-
self._connector_state_manager.update_state_for_stream(
|
70
|
-
|
77
|
+
self._connector_state_manager.update_state_for_stream(
|
78
|
+
self._stream_name, self._stream_namespace, self.state
|
79
|
+
)
|
80
|
+
state_message = self._connector_state_manager.create_state_message(
|
81
|
+
self._stream_name, self._stream_namespace
|
82
|
+
)
|
71
83
|
self._message_repository.emit_message(state_message)
|
@@ -54,7 +54,9 @@ class AbstractFileBasedCursor(ABC):
|
|
54
54
|
...
|
55
55
|
|
56
56
|
@abstractmethod
|
57
|
-
def get_files_to_sync(
|
57
|
+
def get_files_to_sync(
|
58
|
+
self, all_files: Iterable[RemoteFile], logger: logging.Logger
|
59
|
+
) -> Iterable[RemoteFile]:
|
58
60
|
"""
|
59
61
|
Given the list of files in the source, return the files that should be synced.
|
60
62
|
:param all_files: All files in the source
|
@@ -8,7 +8,9 @@ from typing import Any, Iterable, MutableMapping, Optional
|
|
8
8
|
|
9
9
|
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
|
10
10
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
11
|
-
from airbyte_cdk.sources.file_based.stream.cursor.abstract_file_based_cursor import
|
11
|
+
from airbyte_cdk.sources.file_based.stream.cursor.abstract_file_based_cursor import (
|
12
|
+
AbstractFileBasedCursor,
|
13
|
+
)
|
12
14
|
from airbyte_cdk.sources.file_based.types import StreamState
|
13
15
|
|
14
16
|
|
@@ -22,11 +24,14 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
|
|
22
24
|
super().__init__(stream_config)
|
23
25
|
self._file_to_datetime_history: MutableMapping[str, str] = {}
|
24
26
|
self._time_window_if_history_is_full = timedelta(
|
25
|
-
days=stream_config.days_to_sync_if_history_is_full
|
27
|
+
days=stream_config.days_to_sync_if_history_is_full
|
28
|
+
or self.DEFAULT_DAYS_TO_SYNC_IF_HISTORY_IS_FULL
|
26
29
|
)
|
27
30
|
|
28
31
|
if self._time_window_if_history_is_full <= timedelta():
|
29
|
-
raise ValueError(
|
32
|
+
raise ValueError(
|
33
|
+
f"days_to_sync_if_history_is_full must be a positive timedelta, got {self._time_window_if_history_is_full}"
|
34
|
+
)
|
30
35
|
|
31
36
|
self._start_time = self._compute_start_time()
|
32
37
|
self._initial_earliest_file_in_history: Optional[RemoteFile] = None
|
@@ -37,7 +42,9 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
|
|
37
42
|
self._initial_earliest_file_in_history = self._compute_earliest_file_in_history()
|
38
43
|
|
39
44
|
def add_file(self, file: RemoteFile) -> None:
|
40
|
-
self._file_to_datetime_history[file.uri] = file.last_modified.strftime(
|
45
|
+
self._file_to_datetime_history[file.uri] = file.last_modified.strftime(
|
46
|
+
self.DATE_TIME_FORMAT
|
47
|
+
)
|
41
48
|
if len(self._file_to_datetime_history) > self.DEFAULT_MAX_HISTORY_SIZE:
|
42
49
|
# Get the earliest file based on its last modified date and its uri
|
43
50
|
oldest_file = self._compute_earliest_file_in_history()
|
@@ -60,7 +67,9 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
|
|
60
67
|
a string joining the last-modified timestamp of the last synced file and the name of the file.
|
61
68
|
"""
|
62
69
|
if self._file_to_datetime_history.items():
|
63
|
-
filename, timestamp = max(
|
70
|
+
filename, timestamp = max(
|
71
|
+
self._file_to_datetime_history.items(), key=lambda x: (x[1], x[0])
|
72
|
+
)
|
64
73
|
return f"{timestamp}_{filename}"
|
65
74
|
return None
|
66
75
|
|
@@ -73,7 +82,9 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
|
|
73
82
|
def _should_sync_file(self, file: RemoteFile, logger: logging.Logger) -> bool:
|
74
83
|
if file.uri in self._file_to_datetime_history:
|
75
84
|
# If the file's uri is in the history, we should sync the file if it has been modified since it was synced
|
76
|
-
updated_at_from_history = datetime.strptime(
|
85
|
+
updated_at_from_history = datetime.strptime(
|
86
|
+
self._file_to_datetime_history[file.uri], self.DATE_TIME_FORMAT
|
87
|
+
)
|
77
88
|
if file.last_modified < updated_at_from_history:
|
78
89
|
logger.warning(
|
79
90
|
f"The file {file.uri}'s last modified date is older than the last time it was synced. This is unexpected. Skipping the file."
|
@@ -99,7 +110,9 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
|
|
99
110
|
# The file is not in the history and the history is complete. We know we need to sync the file
|
100
111
|
return True
|
101
112
|
|
102
|
-
def get_files_to_sync(
|
113
|
+
def get_files_to_sync(
|
114
|
+
self, all_files: Iterable[RemoteFile], logger: logging.Logger
|
115
|
+
) -> Iterable[RemoteFile]:
|
103
116
|
if self._is_history_full():
|
104
117
|
logger.warning(
|
105
118
|
f"The state history is full. "
|
@@ -115,8 +128,12 @@ class DefaultFileBasedCursor(AbstractFileBasedCursor):
|
|
115
128
|
|
116
129
|
def _compute_earliest_file_in_history(self) -> Optional[RemoteFile]:
|
117
130
|
if self._file_to_datetime_history:
|
118
|
-
filename, last_modified = min(
|
119
|
-
|
131
|
+
filename, last_modified = min(
|
132
|
+
self._file_to_datetime_history.items(), key=lambda f: (f[1], f[0])
|
133
|
+
)
|
134
|
+
return RemoteFile(
|
135
|
+
uri=filename, last_modified=datetime.strptime(last_modified, self.DATE_TIME_FORMAT)
|
136
|
+
)
|
120
137
|
else:
|
121
138
|
return None
|
122
139
|
|
@@ -22,7 +22,12 @@ from airbyte_cdk.sources.file_based.exceptions import (
|
|
22
22
|
)
|
23
23
|
from airbyte_cdk.sources.file_based.file_types import FileTransfer
|
24
24
|
from airbyte_cdk.sources.file_based.remote_file import RemoteFile
|
25
|
-
from airbyte_cdk.sources.file_based.schema_helpers import
|
25
|
+
from airbyte_cdk.sources.file_based.schema_helpers import (
|
26
|
+
SchemaType,
|
27
|
+
file_transfer_schema,
|
28
|
+
merge_schemas,
|
29
|
+
schemaless_schema,
|
30
|
+
)
|
26
31
|
from airbyte_cdk.sources.file_based.stream import AbstractFileBasedStream
|
27
32
|
from airbyte_cdk.sources.file_based.stream.cursor import AbstractFileBasedCursor
|
28
33
|
from airbyte_cdk.sources.file_based.types import StreamSlice
|
@@ -33,7 +38,6 @@ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
|
33
38
|
|
34
39
|
|
35
40
|
class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
36
|
-
|
37
41
|
"""
|
38
42
|
The default file-based stream.
|
39
43
|
"""
|
@@ -68,18 +72,28 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
68
72
|
@cursor.setter
|
69
73
|
def cursor(self, value: AbstractFileBasedCursor) -> None:
|
70
74
|
if self._cursor is not None:
|
71
|
-
raise RuntimeError(
|
75
|
+
raise RuntimeError(
|
76
|
+
f"Cursor for stream {self.name} is already set. This is unexpected. Please contact Support."
|
77
|
+
)
|
72
78
|
self._cursor = value
|
73
79
|
|
74
80
|
@property
|
75
81
|
def primary_key(self) -> PrimaryKeyType:
|
76
|
-
return self.config.primary_key or self.get_parser().get_parser_defined_primary_key(
|
82
|
+
return self.config.primary_key or self.get_parser().get_parser_defined_primary_key(
|
83
|
+
self.config
|
84
|
+
)
|
77
85
|
|
78
|
-
def _filter_schema_invalid_properties(
|
86
|
+
def _filter_schema_invalid_properties(
|
87
|
+
self, configured_catalog_json_schema: Dict[str, Any]
|
88
|
+
) -> Dict[str, Any]:
|
79
89
|
if self.use_file_transfer:
|
80
90
|
return {
|
81
91
|
"type": "object",
|
82
|
-
"properties": {
|
92
|
+
"properties": {
|
93
|
+
"file_path": {"type": "string"},
|
94
|
+
"file_size": {"type": "string"},
|
95
|
+
self.ab_file_name_col: {"type": "string"},
|
96
|
+
},
|
83
97
|
}
|
84
98
|
else:
|
85
99
|
return super()._filter_schema_invalid_properties(configured_catalog_json_schema)
|
@@ -89,16 +103,23 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
89
103
|
all_files = self.list_files()
|
90
104
|
files_to_read = self._cursor.get_files_to_sync(all_files, self.logger)
|
91
105
|
sorted_files_to_read = sorted(files_to_read, key=lambda f: (f.last_modified, f.uri))
|
92
|
-
slices = [
|
106
|
+
slices = [
|
107
|
+
{"files": list(group[1])}
|
108
|
+
for group in itertools.groupby(sorted_files_to_read, lambda f: f.last_modified)
|
109
|
+
]
|
93
110
|
return slices
|
94
111
|
|
95
|
-
def transform_record(
|
112
|
+
def transform_record(
|
113
|
+
self, record: dict[str, Any], file: RemoteFile, last_updated: str
|
114
|
+
) -> dict[str, Any]:
|
96
115
|
# adds _ab_source_file_last_modified and _ab_source_file_url to the record
|
97
116
|
record[self.ab_last_mod_col] = last_updated
|
98
117
|
record[self.ab_file_name_col] = file.uri
|
99
118
|
return record
|
100
119
|
|
101
|
-
def transform_record_for_file_transfer(
|
120
|
+
def transform_record_for_file_transfer(
|
121
|
+
self, record: dict[str, Any], file: RemoteFile
|
122
|
+
) -> dict[str, Any]:
|
102
123
|
# timstamp() returns a float representing the number of seconds since the unix epoch
|
103
124
|
record[self.modified] = int(file.last_modified.timestamp()) * 1000
|
104
125
|
record[self.source_file_url] = file.uri
|
@@ -127,15 +148,21 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
127
148
|
self.logger.info(f"{self.name}: {file} file-based syncing")
|
128
149
|
# todo: complete here the code to not rely on local parser
|
129
150
|
file_transfer = FileTransfer()
|
130
|
-
for record in file_transfer.get_file(
|
151
|
+
for record in file_transfer.get_file(
|
152
|
+
self.config, file, self.stream_reader, self.logger
|
153
|
+
):
|
131
154
|
line_no += 1
|
132
155
|
if not self.record_passes_validation_policy(record):
|
133
156
|
n_skipped += 1
|
134
157
|
continue
|
135
158
|
record = self.transform_record_for_file_transfer(record, file)
|
136
|
-
yield stream_data_to_airbyte_message(
|
159
|
+
yield stream_data_to_airbyte_message(
|
160
|
+
self.name, record, is_file_transfer_message=True
|
161
|
+
)
|
137
162
|
else:
|
138
|
-
for record in parser.parse_records(
|
163
|
+
for record in parser.parse_records(
|
164
|
+
self.config, file, self.stream_reader, self.logger, schema
|
165
|
+
):
|
139
166
|
line_no += 1
|
140
167
|
if self.config.schemaless:
|
141
168
|
record = {"data": record}
|
@@ -220,7 +247,9 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
220
247
|
except AirbyteTracedException as ate:
|
221
248
|
raise ate
|
222
249
|
except Exception as exc:
|
223
|
-
raise SchemaInferenceError(
|
250
|
+
raise SchemaInferenceError(
|
251
|
+
FileBasedSourceError.SCHEMA_INFERENCE_ERROR, stream=self.name
|
252
|
+
) from exc
|
224
253
|
else:
|
225
254
|
return {"type": "object", "properties": {**extra_fields, **schema["properties"]}}
|
226
255
|
|
@@ -245,14 +274,20 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
245
274
|
first_n_files = self.config.recent_n_files_to_read_for_schema_discovery
|
246
275
|
|
247
276
|
if first_n_files == 0:
|
248
|
-
self.logger.warning(
|
277
|
+
self.logger.warning(
|
278
|
+
msg=f"No files were identified in the stream {self.name}. Setting default schema for the stream."
|
279
|
+
)
|
249
280
|
return schemaless_schema
|
250
281
|
|
251
|
-
max_n_files_for_schema_inference =
|
282
|
+
max_n_files_for_schema_inference = (
|
283
|
+
self._discovery_policy.get_max_n_files_for_schema_inference(self.get_parser())
|
284
|
+
)
|
252
285
|
|
253
286
|
if first_n_files > max_n_files_for_schema_inference:
|
254
287
|
# Use the most recent files for schema inference, so we pick up schema changes during discovery.
|
255
|
-
self.logger.warning(
|
288
|
+
self.logger.warning(
|
289
|
+
msg=f"Refusing to infer schema for {first_n_files} files; using {max_n_files_for_schema_inference} files."
|
290
|
+
)
|
256
291
|
first_n_files = max_n_files_for_schema_inference
|
257
292
|
|
258
293
|
files = sorted(files, key=lambda x: x.last_modified, reverse=True)[:first_n_files]
|
@@ -274,7 +309,9 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
274
309
|
"""
|
275
310
|
Return all files that belong to the stream as defined by the stream's globs.
|
276
311
|
"""
|
277
|
-
return self.stream_reader.get_matching_files(
|
312
|
+
return self.stream_reader.get_matching_files(
|
313
|
+
self.config.globs or [], self.config.legacy_prefix, self.logger
|
314
|
+
)
|
278
315
|
|
279
316
|
def infer_schema(self, files: List[RemoteFile]) -> Mapping[str, Any]:
|
280
317
|
loop = asyncio.get_event_loop()
|
@@ -312,25 +349,34 @@ class DefaultFileBasedStream(AbstractFileBasedStream, IncrementalMixin):
|
|
312
349
|
n_started, n_files = 0, len(files)
|
313
350
|
files_iterator = iter(files)
|
314
351
|
while pending_tasks or n_started < n_files:
|
315
|
-
while len(pending_tasks) <= self._discovery_policy.n_concurrent_requests and (
|
352
|
+
while len(pending_tasks) <= self._discovery_policy.n_concurrent_requests and (
|
353
|
+
file := next(files_iterator, None)
|
354
|
+
):
|
316
355
|
pending_tasks.add(asyncio.create_task(self._infer_file_schema(file)))
|
317
356
|
n_started += 1
|
318
357
|
# Return when the first task is completed so that we can enqueue a new task as soon as the
|
319
358
|
# number of concurrent tasks drops below the number allowed.
|
320
|
-
done, pending_tasks = await asyncio.wait(
|
359
|
+
done, pending_tasks = await asyncio.wait(
|
360
|
+
pending_tasks, return_when=asyncio.FIRST_COMPLETED
|
361
|
+
)
|
321
362
|
for task in done:
|
322
363
|
try:
|
323
364
|
base_schema = merge_schemas(base_schema, task.result())
|
324
365
|
except AirbyteTracedException as ate:
|
325
366
|
raise ate
|
326
367
|
except Exception as exc:
|
327
|
-
self.logger.error(
|
368
|
+
self.logger.error(
|
369
|
+
f"An error occurred inferring the schema. \n {traceback.format_exc()}",
|
370
|
+
exc_info=exc,
|
371
|
+
)
|
328
372
|
|
329
373
|
return base_schema
|
330
374
|
|
331
375
|
async def _infer_file_schema(self, file: RemoteFile) -> SchemaType:
|
332
376
|
try:
|
333
|
-
return await self.get_parser().infer_schema(
|
377
|
+
return await self.get_parser().infer_schema(
|
378
|
+
self.config, file, self.stream_reader, self.logger
|
379
|
+
)
|
334
380
|
except AirbyteTracedException as ate:
|
335
381
|
raise ate
|
336
382
|
except Exception as exc:
|
@@ -9,7 +9,11 @@ from airbyte_cdk.sources.message import LogMessage
|
|
9
9
|
|
10
10
|
|
11
11
|
def format_http_message(
|
12
|
-
response: requests.Response,
|
12
|
+
response: requests.Response,
|
13
|
+
title: str,
|
14
|
+
description: str,
|
15
|
+
stream_name: Optional[str],
|
16
|
+
is_auxiliary: bool = None,
|
13
17
|
) -> LogMessage:
|
14
18
|
request = response.request
|
15
19
|
log_message = {
|
@@ -28,7 +28,9 @@ _SEVERITY_BY_LOG_LEVEL = {
|
|
28
28
|
|
29
29
|
def _is_severe_enough(threshold: Level, level: Level) -> bool:
|
30
30
|
if threshold not in _SEVERITY_BY_LOG_LEVEL:
|
31
|
-
_LOGGER.warning(
|
31
|
+
_LOGGER.warning(
|
32
|
+
f"Log level {threshold} for threshold is not supported. This is probably a CDK bug. Please contact Airbyte."
|
33
|
+
)
|
32
34
|
return True
|
33
35
|
|
34
36
|
if level not in _SEVERITY_BY_LOG_LEVEL:
|
@@ -80,7 +82,12 @@ class InMemoryMessageRepository(MessageRepository):
|
|
80
82
|
def log_message(self, level: Level, message_provider: Callable[[], LogMessage]) -> None:
|
81
83
|
if _is_severe_enough(self._log_level, level):
|
82
84
|
self.emit_message(
|
83
|
-
AirbyteMessage(
|
85
|
+
AirbyteMessage(
|
86
|
+
type=Type.LOG,
|
87
|
+
log=AirbyteLogMessage(
|
88
|
+
level=level, message=filter_secrets(json.dumps(message_provider()))
|
89
|
+
),
|
90
|
+
)
|
84
91
|
)
|
85
92
|
|
86
93
|
def consume_queue(self) -> Iterable[AirbyteMessage]:
|
@@ -89,7 +96,12 @@ class InMemoryMessageRepository(MessageRepository):
|
|
89
96
|
|
90
97
|
|
91
98
|
class LogAppenderMessageRepositoryDecorator(MessageRepository):
|
92
|
-
def __init__(
|
99
|
+
def __init__(
|
100
|
+
self,
|
101
|
+
dict_to_append: LogMessage,
|
102
|
+
decorated: MessageRepository,
|
103
|
+
log_level: Level = Level.INFO,
|
104
|
+
):
|
93
105
|
self._dict_to_append = dict_to_append
|
94
106
|
self._decorated = decorated
|
95
107
|
self._log_level = log_level
|
@@ -106,7 +118,9 @@ class LogAppenderMessageRepositoryDecorator(MessageRepository):
|
|
106
118
|
def consume_queue(self) -> Iterable[AirbyteMessage]:
|
107
119
|
return self._decorated.consume_queue()
|
108
120
|
|
109
|
-
def _append_second_to_first(
|
121
|
+
def _append_second_to_first(
|
122
|
+
self, first: LogMessage, second: LogMessage, path: Optional[List[str]] = None
|
123
|
+
) -> LogMessage:
|
110
124
|
if path is None:
|
111
125
|
path = []
|
112
126
|
|