airbyte-cdk 6.5.3rc2__py3-none-any.whl → 6.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +17 -2
- airbyte_cdk/config_observation.py +10 -3
- airbyte_cdk/connector.py +19 -9
- airbyte_cdk/connector_builder/connector_builder_handler.py +28 -8
- airbyte_cdk/connector_builder/main.py +26 -6
- airbyte_cdk/connector_builder/message_grouper.py +95 -25
- airbyte_cdk/destinations/destination.py +47 -14
- airbyte_cdk/destinations/vector_db_based/config.py +36 -14
- airbyte_cdk/destinations/vector_db_based/document_processor.py +49 -11
- airbyte_cdk/destinations/vector_db_based/embedder.py +52 -11
- airbyte_cdk/destinations/vector_db_based/test_utils.py +14 -4
- airbyte_cdk/destinations/vector_db_based/utils.py +8 -2
- airbyte_cdk/destinations/vector_db_based/writer.py +15 -4
- airbyte_cdk/entrypoint.py +82 -26
- airbyte_cdk/exception_handler.py +13 -3
- airbyte_cdk/logger.py +10 -2
- airbyte_cdk/models/airbyte_protocol.py +11 -5
- airbyte_cdk/models/airbyte_protocol_serializers.py +9 -3
- airbyte_cdk/models/well_known_types.py +1 -1
- airbyte_cdk/sources/abstract_source.py +63 -17
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +47 -14
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +25 -7
- airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py +27 -6
- airbyte_cdk/sources/concurrent_source/thread_pool_manager.py +9 -3
- airbyte_cdk/sources/connector_state_manager.py +32 -10
- airbyte_cdk/sources/declarative/async_job/job.py +3 -1
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +68 -14
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +24 -6
- airbyte_cdk/sources/declarative/async_job/repository.py +3 -1
- airbyte_cdk/sources/declarative/auth/declarative_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/jwt.py +27 -7
- airbyte_cdk/sources/declarative/auth/oauth.py +35 -11
- airbyte_cdk/sources/declarative/auth/selective_authenticator.py +3 -1
- airbyte_cdk/sources/declarative/auth/token.py +25 -8
- airbyte_cdk/sources/declarative/checks/check_stream.py +12 -4
- airbyte_cdk/sources/declarative/checks/connection_checker.py +3 -1
- airbyte_cdk/sources/declarative/concurrency_level/concurrency_level.py +11 -3
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +106 -50
- airbyte_cdk/sources/declarative/datetime/min_max_datetime.py +20 -6
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +43 -0
- airbyte_cdk/sources/declarative/declarative_source.py +3 -1
- airbyte_cdk/sources/declarative/declarative_stream.py +27 -6
- airbyte_cdk/sources/declarative/decoders/__init__.py +2 -2
- airbyte_cdk/sources/declarative/decoders/decoder.py +3 -1
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +48 -13
- airbyte_cdk/sources/declarative/decoders/pagination_decoder_decorator.py +3 -1
- airbyte_cdk/sources/declarative/decoders/xml_decoder.py +6 -2
- airbyte_cdk/sources/declarative/extractors/dpath_extractor.py +6 -2
- airbyte_cdk/sources/declarative/extractors/record_filter.py +24 -7
- airbyte_cdk/sources/declarative/extractors/record_selector.py +10 -3
- airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py +15 -5
- airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py +96 -31
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +22 -8
- airbyte_cdk/sources/declarative/incremental/per_partition_cursor.py +46 -15
- airbyte_cdk/sources/declarative/incremental/per_partition_with_global.py +19 -5
- airbyte_cdk/sources/declarative/incremental/resumable_full_refresh_cursor.py +3 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_boolean.py +20 -2
- airbyte_cdk/sources/declarative/interpolation/interpolated_mapping.py +5 -1
- airbyte_cdk/sources/declarative/interpolation/interpolated_nested_mapping.py +10 -3
- airbyte_cdk/sources/declarative/interpolation/interpolated_string.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/interpolation.py +7 -1
- airbyte_cdk/sources/declarative/interpolation/jinja.py +6 -2
- airbyte_cdk/sources/declarative/interpolation/macros.py +19 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +106 -24
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +14 -5
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +697 -678
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +13 -4
- airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py +9 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +802 -232
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +29 -7
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +25 -7
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +54 -15
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/constant_backoff_strategy.py +6 -2
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/header_helper.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_time_from_header_backoff_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/backoff_strategies/wait_until_time_from_header_backoff_strategy.py +15 -5
- airbyte_cdk/sources/declarative/requesters/error_handlers/composite_error_handler.py +3 -1
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_error_handler.py +18 -8
- airbyte_cdk/sources/declarative/requesters/error_handlers/default_http_response_filter.py +16 -7
- airbyte_cdk/sources/declarative/requesters/error_handlers/http_response_filter.py +51 -14
- airbyte_cdk/sources/declarative/requesters/http_job_repository.py +29 -8
- airbyte_cdk/sources/declarative/requesters/http_requester.py +58 -16
- airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py +49 -14
- airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/paginator.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py +17 -5
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/offset_increment.py +24 -7
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/page_increment.py +9 -3
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/pagination_strategy.py +3 -1
- airbyte_cdk/sources/declarative/requesters/paginators/strategies/stop_condition.py +6 -2
- airbyte_cdk/sources/declarative/requesters/request_options/datetime_based_request_options_provider.py +19 -6
- airbyte_cdk/sources/declarative/requesters/request_options/default_request_options_provider.py +3 -1
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_nested_request_input_provider.py +21 -7
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_input_provider.py +18 -6
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +27 -8
- airbyte_cdk/sources/declarative/requesters/requester.py +3 -1
- airbyte_cdk/sources/declarative/retrievers/async_retriever.py +12 -5
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +105 -24
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +3 -1
- airbyte_cdk/sources/declarative/spec/spec.py +8 -2
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +3 -1
- airbyte_cdk/sources/declarative/transformations/add_fields.py +12 -3
- airbyte_cdk/sources/declarative/transformations/remove_fields.py +6 -2
- airbyte_cdk/sources/declarative/types.py +8 -1
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +3 -1
- airbyte_cdk/sources/embedded/base_integration.py +14 -4
- airbyte_cdk/sources/embedded/catalog.py +16 -4
- airbyte_cdk/sources/embedded/runner.py +19 -3
- airbyte_cdk/sources/embedded/tools.py +3 -1
- airbyte_cdk/sources/file_based/availability_strategy/abstract_file_based_availability_strategy.py +12 -4
- airbyte_cdk/sources/file_based/availability_strategy/default_file_based_availability_strategy.py +27 -7
- airbyte_cdk/sources/file_based/config/abstract_file_based_spec.py +12 -6
- airbyte_cdk/sources/file_based/config/csv_format.py +21 -9
- airbyte_cdk/sources/file_based/config/file_based_stream_config.py +6 -2
- airbyte_cdk/sources/file_based/config/unstructured_format.py +10 -3
- airbyte_cdk/sources/file_based/discovery_policy/abstract_discovery_policy.py +2 -4
- airbyte_cdk/sources/file_based/discovery_policy/default_discovery_policy.py +7 -2
- airbyte_cdk/sources/file_based/exceptions.py +13 -15
- airbyte_cdk/sources/file_based/file_based_source.py +82 -24
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +16 -5
- airbyte_cdk/sources/file_based/file_types/avro_parser.py +58 -17
- airbyte_cdk/sources/file_based/file_types/csv_parser.py +89 -26
- airbyte_cdk/sources/file_based/file_types/excel_parser.py +25 -7
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
- airbyte_cdk/sources/file_based/file_types/file_type_parser.py +4 -1
- airbyte_cdk/sources/file_based/file_types/jsonl_parser.py +20 -6
- airbyte_cdk/sources/file_based/file_types/parquet_parser.py +57 -16
- airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +64 -15
- airbyte_cdk/sources/file_based/schema_helpers.py +33 -10
- airbyte_cdk/sources/file_based/schema_validation_policies/abstract_schema_validation_policy.py +3 -1
- airbyte_cdk/sources/file_based/schema_validation_policies/default_schema_validation_policies.py +16 -5
- airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py +33 -10
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +47 -11
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/abstract_concurrent_file_based_cursor.py +13 -22
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_concurrent_cursor.py +53 -17
- airbyte_cdk/sources/file_based/stream/concurrent/cursor/file_based_final_state_cursor.py +17 -5
- airbyte_cdk/sources/file_based/stream/cursor/abstract_file_based_cursor.py +3 -1
- airbyte_cdk/sources/file_based/stream/cursor/default_file_based_cursor.py +26 -9
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +67 -21
- airbyte_cdk/sources/http_logger.py +5 -1
- airbyte_cdk/sources/message/repository.py +18 -4
- airbyte_cdk/sources/source.py +17 -7
- airbyte_cdk/sources/streams/availability_strategy.py +9 -3
- airbyte_cdk/sources/streams/call_rate.py +63 -19
- airbyte_cdk/sources/streams/checkpoint/checkpoint_reader.py +31 -7
- airbyte_cdk/sources/streams/checkpoint/substream_resumable_full_refresh_cursor.py +6 -2
- airbyte_cdk/sources/streams/concurrent/adapters.py +77 -22
- airbyte_cdk/sources/streams/concurrent/cursor.py +56 -20
- airbyte_cdk/sources/streams/concurrent/default_stream.py +9 -2
- airbyte_cdk/sources/streams/concurrent/helpers.py +6 -2
- airbyte_cdk/sources/streams/concurrent/partition_enqueuer.py +9 -2
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +4 -1
- airbyte_cdk/sources/streams/concurrent/partitions/record.py +10 -2
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +6 -2
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +25 -10
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +32 -16
- airbyte_cdk/sources/streams/core.py +77 -22
- airbyte_cdk/sources/streams/http/availability_strategy.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/default_error_mapping.py +4 -1
- airbyte_cdk/sources/streams/http/error_handlers/error_handler.py +3 -1
- airbyte_cdk/sources/streams/http/error_handlers/http_status_error_handler.py +16 -5
- airbyte_cdk/sources/streams/http/error_handlers/response_models.py +9 -3
- airbyte_cdk/sources/streams/http/exceptions.py +2 -2
- airbyte_cdk/sources/streams/http/http.py +133 -33
- airbyte_cdk/sources/streams/http/http_client.py +91 -29
- airbyte_cdk/sources/streams/http/rate_limiting.py +23 -7
- airbyte_cdk/sources/streams/http/requests_native_auth/abstract_oauth.py +19 -6
- airbyte_cdk/sources/streams/http/requests_native_auth/oauth.py +38 -11
- airbyte_cdk/sources/streams/http/requests_native_auth/token.py +13 -3
- airbyte_cdk/sources/types.py +5 -1
- airbyte_cdk/sources/utils/record_helper.py +12 -3
- airbyte_cdk/sources/utils/schema_helpers.py +9 -3
- airbyte_cdk/sources/utils/slice_logger.py +4 -1
- airbyte_cdk/sources/utils/transform.py +24 -9
- airbyte_cdk/sql/exceptions.py +19 -6
- airbyte_cdk/sql/secrets.py +3 -1
- airbyte_cdk/sql/shared/catalog_providers.py +13 -4
- airbyte_cdk/sql/shared/sql_processor.py +44 -14
- airbyte_cdk/test/catalog_builder.py +19 -8
- airbyte_cdk/test/entrypoint_wrapper.py +27 -8
- airbyte_cdk/test/mock_http/mocker.py +41 -11
- airbyte_cdk/test/mock_http/request.py +9 -3
- airbyte_cdk/test/mock_http/response.py +3 -1
- airbyte_cdk/test/mock_http/response_builder.py +29 -7
- airbyte_cdk/test/state_builder.py +10 -2
- airbyte_cdk/test/utils/data.py +6 -2
- airbyte_cdk/test/utils/http_mocking.py +3 -1
- airbyte_cdk/utils/airbyte_secrets_utils.py +3 -1
- airbyte_cdk/utils/analytics_message.py +10 -2
- airbyte_cdk/utils/datetime_format_inferrer.py +4 -1
- airbyte_cdk/utils/mapping_helpers.py +3 -1
- airbyte_cdk/utils/message_utils.py +11 -4
- airbyte_cdk/utils/print_buffer.py +6 -1
- airbyte_cdk/utils/schema_inferrer.py +30 -9
- airbyte_cdk/utils/spec_schema_transformations.py +3 -1
- airbyte_cdk/utils/traced_exception.py +35 -9
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/METADATA +8 -7
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/RECORD +200 -200
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.5.3rc2.dist-info → airbyte_cdk-6.6.0.dist-info}/WHEEL +0 -0
airbyte_cdk/exception_handler.py
CHANGED

```diff
@@ -11,7 +11,9 @@ from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 
 
-def assemble_uncaught_exception(exception_type: type[BaseException], exception_value: BaseException) -> AirbyteTracedException:
+def assemble_uncaught_exception(
+    exception_type: type[BaseException], exception_value: BaseException
+) -> AirbyteTracedException:
     if issubclass(exception_type, AirbyteTracedException):
         return exception_value  # type: ignore  # validated as part of the previous line
     return AirbyteTracedException.from_exception(exception_value)
@@ -23,7 +25,11 @@ def init_uncaught_exception_handler(logger: logging.Logger) -> None:
     printed to the console without having secrets removed.
     """
 
-    def hook_fn(exception_type: type[BaseException], exception_value: BaseException, traceback_: Optional[TracebackType]) -> Any:
+    def hook_fn(
+        exception_type: type[BaseException],
+        exception_value: BaseException,
+        traceback_: Optional[TracebackType],
+    ) -> Any:
         # For developer ergonomics, we want to see the stack trace in the logs when we do a ctrl-c
         if issubclass(exception_type, KeyboardInterrupt):
             sys.__excepthook__(exception_type, exception_value, traceback_)
@@ -41,6 +47,10 @@ def init_uncaught_exception_handler(logger: logging.Logger) -> None:
 
 def generate_failed_streams_error_message(stream_failures: Mapping[str, List[Exception]]) -> str:
     failures = "\n".join(
-        [f"{stream}: {filter_secrets(exception.__repr__())}" for stream, exceptions in stream_failures.items() for exception in exceptions]
+        [
+            f"{stream}: {filter_secrets(exception.__repr__())}"
+            for stream, exceptions in stream_failures.items()
+            for exception in exceptions
+        ]
    )
    return f"During the sync, the following streams did not sync successfully: {failures}"
```
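As a usage note (not part of the diff): a minimal sketch of the two helpers rewrapped above; the stream name and exception are made up for illustration.

```python
import logging

from airbyte_cdk.exception_handler import (
    generate_failed_streams_error_message,
    init_uncaught_exception_handler,
)

logger = logging.getLogger("airbyte")
init_uncaught_exception_handler(logger)  # uncaught errors are logged with secrets filtered

# Summarize per-stream failures the same way the CDK's read path does:
summary = generate_failed_streams_error_message({"users": [RuntimeError("HTTP 500")]})
print(summary)
```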
airbyte_cdk/logger.py
CHANGED

```diff
@@ -7,7 +7,13 @@ import logging
 import logging.config
 from typing import Any, Callable, Mapping, Optional, Tuple
 
-from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, AirbyteMessageSerializer, Level, Type
+from airbyte_cdk.models import (
+    AirbyteLogMessage,
+    AirbyteMessage,
+    AirbyteMessageSerializer,
+    Level,
+    Type,
+)
 from airbyte_cdk.utils.airbyte_secrets_utils import filter_secrets
 from orjson import orjson
 
@@ -68,7 +74,9 @@ class AirbyteLogFormatter(logging.Formatter):
         else:
             message = super().format(record)
             message = filter_secrets(message)
-            log_message = AirbyteMessage(type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message))
+            log_message = AirbyteMessage(
+                type=Type.LOG, log=AirbyteLogMessage(level=airbyte_level, message=message)
+            )
         return orjson.dumps(AirbyteMessageSerializer.dump(log_message)).decode()  # type: ignore[no-any-return]  # orjson.dumps(message).decode() always returns string
 
     @staticmethod
```
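A short sketch of how the formatter above is typically attached (assumed stdlib wiring; the output comment is approximate):

```python
import logging

from airbyte_cdk.logger import AirbyteLogFormatter

handler = logging.StreamHandler()
handler.setFormatter(AirbyteLogFormatter())
logger = logging.getLogger("airbyte")
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Emits one JSON line shaped like {"type": "LOG", "log": {"level": "INFO", "message": "..."}}
logger.info("sync started")
```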
airbyte_cdk/models/airbyte_protocol.py
CHANGED

```diff
@@ -6,9 +6,11 @@ from dataclasses import InitVar, dataclass
 from typing import Annotated, Any, Dict, List, Mapping, Optional, Union
 
 from airbyte_cdk.models.file_transfer_record_message import AirbyteFileTransferRecordMessage
-from airbyte_protocol_dataclasses.models import *
+from airbyte_protocol_dataclasses.models import *  # noqa: F403  # Allow '*'
 from serpyco_rs.metadata import Alias
 
+# ruff: noqa: F405  # ignore fuzzy import issues with 'import *'
+
 
 @dataclass
 class AirbyteStateBlob:
@@ -42,7 +44,11 @@ class AirbyteStateBlob:
         setattr(self, key, value)
 
     def __eq__(self, other: object) -> bool:
-        return False if not isinstance(other, AirbyteStateBlob) else bool(self.__dict__ == other.__dict__)
+        return (
+            False
+            if not isinstance(other, AirbyteStateBlob)
+            else bool(self.__dict__ == other.__dict__)
+        )
 
 
 # The following dataclasses have been redeclared to include the new version of AirbyteStateBlob
@@ -62,9 +68,9 @@ class AirbyteGlobalState:
 class AirbyteStateMessage:
     type: Optional[AirbyteStateType] = None  # type: ignore [name-defined]
     stream: Optional[AirbyteStreamState] = None
-    global_: Annotated[
-        AirbyteGlobalState | None, Alias("global")
-    ] = None  # "global" is a reserved keyword in python ⇒ Alias is used for (de-)serialization
+    global_: Annotated[AirbyteGlobalState | None, Alias("global")] = (
+        None  # "global" is a reserved keyword in python ⇒ Alias is used for (de-)serialization
+    )
     data: Optional[Dict[str, Any]] = None
     sourceStats: Optional[AirbyteStateStats] = None  # type: ignore [name-defined]
     destinationStats: Optional[AirbyteStateStats] = None  # type: ignore [name-defined]
```
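To illustrate the `__eq__` shown above, a small sketch; it assumes `AirbyteStateBlob` accepts arbitrary keyword attributes, which matches the `setattr` logic in this class:

```python
from airbyte_cdk.models.airbyte_protocol import AirbyteStateBlob

a = AirbyteStateBlob(cursor="2024-01-01")
b = AirbyteStateBlob(cursor="2024-01-01")

assert a == b  # equal attribute dicts compare equal
assert (a == {"cursor": "2024-01-01"}) is False  # not an AirbyteStateBlob -> False
```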
airbyte_cdk/models/airbyte_protocol_serializers.py
CHANGED

```diff
@@ -30,9 +30,15 @@ def custom_type_resolver(t: type) -> CustomType[AirbyteStateBlob, Dict[str, Any]]:
     return AirbyteStateBlobType() if t is AirbyteStateBlob else None
 
 
-AirbyteStreamStateSerializer = Serializer(AirbyteStreamState, omit_none=True, custom_type_resolver=custom_type_resolver)
-AirbyteStateMessageSerializer = Serializer(AirbyteStateMessage, omit_none=True, custom_type_resolver=custom_type_resolver)
-AirbyteMessageSerializer = Serializer(AirbyteMessage, omit_none=True, custom_type_resolver=custom_type_resolver)
+AirbyteStreamStateSerializer = Serializer(
+    AirbyteStreamState, omit_none=True, custom_type_resolver=custom_type_resolver
+)
+AirbyteStateMessageSerializer = Serializer(
+    AirbyteStateMessage, omit_none=True, custom_type_resolver=custom_type_resolver
+)
+AirbyteMessageSerializer = Serializer(
+    AirbyteMessage, omit_none=True, custom_type_resolver=custom_type_resolver
+)
 ConfiguredAirbyteCatalogSerializer = Serializer(ConfiguredAirbyteCatalog, omit_none=True)
 ConfiguredAirbyteStreamSerializer = Serializer(ConfiguredAirbyteStream, omit_none=True)
 ConnectorSpecificationSerializer = Serializer(ConnectorSpecification, omit_none=True)
```
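A hedged sketch of using one of the serializers declared above; the message shape is illustrative, and the model classes come from the star import noted in the previous file:

```python
from airbyte_cdk.models.airbyte_protocol import (
    AirbyteStateBlob,
    AirbyteStateMessage,
    AirbyteStateType,
    AirbyteStreamState,
    StreamDescriptor,
)
from airbyte_cdk.models.airbyte_protocol_serializers import AirbyteStateMessageSerializer

state = AirbyteStateMessage(
    type=AirbyteStateType.STREAM,
    stream=AirbyteStreamState(
        stream_descriptor=StreamDescriptor(name="users"),
        stream_state=AirbyteStateBlob(cursor="2024-01-01"),
    ),
)
# omit_none=True drops unset fields; Alias("global") renames global_ -> "global"
print(AirbyteStateMessageSerializer.dump(state))
```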
airbyte_cdk/sources/abstract_source.py
CHANGED

```diff
@@ -4,7 +4,18 @@
 
 import logging
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Iterable, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union
+from typing import (
+    Any,
+    Dict,
+    Iterable,
+    Iterator,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Tuple,
+    Union,
+)
 
 from airbyte_cdk.exception_handler import generate_failed_streams_error_message
 from airbyte_cdk.models import (
@@ -30,7 +41,9 @@ from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
 from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, split_config
 from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
 from airbyte_cdk.utils.event_timing import create_timer
-from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message
+from airbyte_cdk.utils.stream_status_utils import (
+    as_airbyte_message as stream_status_as_airbyte_message,
+)
 from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 
 _default_message_repository = InMemoryMessageRepository()
@@ -43,7 +56,9 @@ class AbstractSource(Source, ABC):
     """
 
     @abstractmethod
-    def check_connection(self, logger: logging.Logger, config: Mapping[str, Any]) -> Tuple[bool, Optional[Any]]:
+    def check_connection(
+        self, logger: logging.Logger, config: Mapping[str, Any]
+    ) -> Tuple[bool, Optional[Any]]:
         """
         :param logger: source logger
         :param config: The user-provided configuration as specified by the source's spec.
@@ -109,7 +124,9 @@ class AbstractSource(Source, ABC):
                 # Used direct reference to `stream_instance` instead of `is_stream_exist` to avoid mypy type checking errors
                 if not stream_instance:
                     if not self.raise_exception_on_missing_stream:
-                        yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE)
+                        yield stream_status_as_airbyte_message(
+                            configured_stream.stream, AirbyteStreamStatus.INCOMPLETE
+                        )
                         continue
 
                     error_message = (
@@ -129,7 +146,9 @@ class AbstractSource(Source, ABC):
 
                 timer.start_event(f"Syncing stream {configured_stream.stream.name}")
                 logger.info(f"Marking stream {configured_stream.stream.name} as STARTED")
-                yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.STARTED)
+                yield stream_status_as_airbyte_message(
+                    configured_stream.stream, AirbyteStreamStatus.STARTED
+                )
                 yield from self._read_stream(
                     logger=logger,
                     stream_instance=stream_instance,
@@ -138,13 +157,19 @@ class AbstractSource(Source, ABC):
                     internal_config=internal_config,
                 )
                 logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
-                yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.COMPLETE)
+                yield stream_status_as_airbyte_message(
+                    configured_stream.stream, AirbyteStreamStatus.COMPLETE
+                )
 
             except Exception as e:
                 yield from self._emit_queued_messages()
-                logger.exception(f"Encountered an exception while reading stream {configured_stream.stream.name}")
+                logger.exception(
+                    f"Encountered an exception while reading stream {configured_stream.stream.name}"
+                )
                 logger.info(f"Marking stream {configured_stream.stream.name} as STOPPED")
-                yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.INCOMPLETE)
+                yield stream_status_as_airbyte_message(
+                    configured_stream.stream, AirbyteStreamStatus.INCOMPLETE
+                )
 
                 stream_descriptor = StreamDescriptor(name=configured_stream.stream.name)
 
@@ -152,10 +177,14 @@ class AbstractSource(Source, ABC):
                     traced_exception = e
                     info_message = f"Stopping sync on error from stream {configured_stream.stream.name} because {self.name} does not support continuing syncs on error."
                 else:
-                    traced_exception = self._serialize_exception(stream_descriptor, e, stream_instance=stream_instance)
+                    traced_exception = self._serialize_exception(
+                        stream_descriptor, e, stream_instance=stream_instance
+                    )
                     info_message = f"{self.name} does not support continuing syncs on error from stream {configured_stream.stream.name}"
 
-                yield traced_exception.as_sanitized_airbyte_message(stream_descriptor=stream_descriptor)
+                yield traced_exception.as_sanitized_airbyte_message(
+                    stream_descriptor=stream_descriptor
+                )
                 stream_name_to_exception[stream_instance.name] = traced_exception  # type: ignore # use configured_stream if stream_instance is None
                 if self.stop_sync_on_stream_failure:
                     logger.info(info_message)
@@ -169,12 +198,16 @@ class AbstractSource(Source, ABC):
         logger.info(timer.report())
 
         if len(stream_name_to_exception) > 0:
-            error_message = generate_failed_streams_error_message({key: [value] for key, value in stream_name_to_exception.items()})  # type: ignore # for some reason, mypy can't figure out the types for key and value
+            error_message = generate_failed_streams_error_message(
+                {key: [value] for key, value in stream_name_to_exception.items()}
+            )  # type: ignore # for some reason, mypy can't figure out the types for key and value
             logger.info(error_message)
             # We still raise at least one exception when a stream raises an exception because the platform currently relies
             # on a non-zero exit code to determine if a sync attempt has failed. We also raise the exception as a config_error
             # type because this combined error isn't actionable, but rather the previously emitted individual errors.
-            raise AirbyteTracedException(message=error_message, failure_type=FailureType.config_error)
+            raise AirbyteTracedException(
+                message=error_message, failure_type=FailureType.config_error
+            )
         logger.info(f"Finished syncing {self.name}")
 
     @staticmethod
@@ -183,7 +216,9 @@ class AbstractSource(Source, ABC):
     ) -> AirbyteTracedException:
         display_message = stream_instance.get_error_display_message(e) if stream_instance else None
         if display_message:
-            return AirbyteTracedException.from_exception(e, message=display_message, stream_descriptor=stream_descriptor)
+            return AirbyteTracedException.from_exception(
+                e, message=display_message, stream_descriptor=stream_descriptor
+            )
         return AirbyteTracedException.from_exception(e, stream_descriptor=stream_descriptor)
 
     @property
@@ -199,7 +234,9 @@ class AbstractSource(Source, ABC):
         internal_config: InternalConfig,
     ) -> Iterator[AirbyteMessage]:
         if internal_config.page_size and isinstance(stream_instance, HttpStream):
-            logger.info(f"Setting page size for {stream_instance.name} to {internal_config.page_size}")
+            logger.info(
+                f"Setting page size for {stream_instance.name} to {internal_config.page_size}"
+            )
             stream_instance.page_size = internal_config.page_size
         logger.debug(
             f"Syncing configured stream: {configured_stream.stream.name}",
@@ -243,7 +280,9 @@ class AbstractSource(Source, ABC):
                 if record_counter == 1:
                     logger.info(f"Marking stream {stream_name} as RUNNING")
                     # If we just read the first record of the stream, emit the transition to the RUNNING state
-                    yield stream_status_as_airbyte_message(configured_stream.stream, AirbyteStreamStatus.RUNNING)
+                    yield stream_status_as_airbyte_message(
+                        configured_stream.stream, AirbyteStreamStatus.RUNNING
+                    )
             yield from self._emit_queued_messages()
             yield record
 
@@ -254,7 +293,9 @@ class AbstractSource(Source, ABC):
             yield from self.message_repository.consume_queue()
         return
 
-    def _get_message(self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream) -> AirbyteMessage:
+    def _get_message(
+        self, record_data_or_message: Union[StreamData, AirbyteMessage], stream: Stream
+    ) -> AirbyteMessage:
         """
         Converts the input to an AirbyteMessage if it is a StreamData. Returns the input as is if it is already an AirbyteMessage
         """
@@ -262,7 +303,12 @@ class AbstractSource(Source, ABC):
             case AirbyteMessage():
                 return record_data_or_message
             case _:
-                return stream_data_to_airbyte_message(stream.name, record_data_or_message, stream.transformer, stream.get_json_schema())
+                return stream_data_to_airbyte_message(
+                    stream.name,
+                    record_data_or_message,
+                    stream.transformer,
+                    stream.get_json_schema(),
+                )
 
     @property
     def message_repository(self) -> Union[None, MessageRepository]:
```
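For orientation, a hypothetical connector skeleton using the `check_connection` signature that was rewrapped above (`SourceExample`, the `api_key` field, and the empty stream list are illustrative, not from the diff):

```python
import logging
from typing import Any, List, Mapping, Optional, Tuple

from airbyte_cdk.sources import AbstractSource
from airbyte_cdk.sources.streams import Stream


class SourceExample(AbstractSource):  # hypothetical connector
    def check_connection(
        self, logger: logging.Logger, config: Mapping[str, Any]
    ) -> Tuple[bool, Optional[Any]]:
        if not config.get("api_key"):  # assumed config field
            return False, "api_key is required"
        return True, None

    def streams(self, config: Mapping[str, Any]) -> List[Stream]:
        return []  # real connectors return their Stream instances here
```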
airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py
CHANGED

```diff
@@ -7,7 +7,9 @@ from typing import Dict, Iterable, List, Optional, Set
 from airbyte_cdk.exception_handler import generate_failed_streams_error_message
 from airbyte_cdk.models import AirbyteMessage, AirbyteStreamStatus, FailureType, StreamDescriptor
 from airbyte_cdk.models import Type as MessageType
-from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel
+from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import (
+    PartitionGenerationCompletedSentinel,
+)
 from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException
 from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
 from airbyte_cdk.sources.message import MessageRepository
@@ -20,7 +22,9 @@ from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel
 from airbyte_cdk.sources.utils.record_helper import stream_data_to_airbyte_message
 from airbyte_cdk.sources.utils.slice_logger import SliceLogger
 from airbyte_cdk.utils import AirbyteTracedException
-from airbyte_cdk.utils.stream_status_utils import as_airbyte_message as stream_status_as_airbyte_message
+from airbyte_cdk.utils.stream_status_utils import (
+    as_airbyte_message as stream_status_as_airbyte_message,
+)
 
 
 class ConcurrentReadProcessor:
@@ -61,7 +65,9 @@ class ConcurrentReadProcessor:
         self._streams_done: Set[str] = set()
         self._exceptions_per_stream_name: dict[str, List[Exception]] = {}
 
-    def on_partition_generation_completed(self, sentinel: PartitionGenerationCompletedSentinel) -> Iterable[AirbyteMessage]:
+    def on_partition_generation_completed(
+        self, sentinel: PartitionGenerationCompletedSentinel
+    ) -> Iterable[AirbyteMessage]:
         """
         This method is called when a partition generation is completed.
         1. Remove the stream from the list of streams currently generating partitions
@@ -72,7 +78,10 @@ class ConcurrentReadProcessor:
         self._streams_currently_generating_partitions.remove(sentinel.stream.name)
         # It is possible for the stream to already be done if no partitions were generated
         # If the partition generation process was completed and there are no partitions left to process, the stream is done
-        if self._is_stream_done(stream_name) or len(self._streams_to_running_partitions[stream_name]) == 0:
+        if (
+            self._is_stream_done(stream_name)
+            or len(self._streams_to_running_partitions[stream_name]) == 0
+        ):
             yield from self._on_stream_is_done(stream_name)
         if self._stream_instances_to_start_partition_generation:
             yield self.start_next_partition_generator()  # type:ignore # None may be yielded
@@ -87,10 +96,14 @@ class ConcurrentReadProcessor:
         stream_name = partition.stream_name()
         self._streams_to_running_partitions[stream_name].add(partition)
         if self._slice_logger.should_log_slice_message(self._logger):
-            self._message_repository.emit_message(self._slice_logger.create_slice_log_message(partition.to_slice()))
+            self._message_repository.emit_message(
+                self._slice_logger.create_slice_log_message(partition.to_slice())
+            )
         self._thread_pool_manager.submit(self._partition_reader.process_partition, partition)
 
-    def on_partition_complete_sentinel(self, sentinel: PartitionCompleteSentinel) -> Iterable[AirbyteMessage]:
+    def on_partition_complete_sentinel(
+        self, sentinel: PartitionCompleteSentinel
+    ) -> Iterable[AirbyteMessage]:
         """
         This method is called when a partition is completed.
         1. Close the partition
@@ -112,7 +125,10 @@ class ConcurrentReadProcessor:
         if partition in partitions_running:
             partitions_running.remove(partition)
             # If all partitions were generated and this was the last one, the stream is done
-            if partition.stream_name() not in self._streams_currently_generating_partitions and len(partitions_running) == 0:
+            if (
+                partition.stream_name() not in self._streams_currently_generating_partitions
+                and len(partitions_running) == 0
+            ):
                 yield from self._on_stream_is_done(partition.stream_name())
         yield from self._message_repository.consume_queue()
 
@@ -139,7 +155,9 @@ class ConcurrentReadProcessor:
         if message.type == MessageType.RECORD:
             if self._record_counter[stream.name] == 0:
                 self._logger.info(f"Marking stream {stream.name} as RUNNING")
-                yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING)
+                yield stream_status_as_airbyte_message(
+                    stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING
+                )
             self._record_counter[stream.name] += 1
             stream.cursor.observe(record)
         yield message
@@ -152,13 +170,17 @@ class ConcurrentReadProcessor:
         2. Raise the exception
         """
         self._flag_exception(exception.stream_name, exception.exception)
-        self._logger.exception(f"Exception while syncing stream {exception.stream_name}", exc_info=exception.exception)
+        self._logger.exception(
+            f"Exception while syncing stream {exception.stream_name}", exc_info=exception.exception
+        )
 
         stream_descriptor = StreamDescriptor(name=exception.stream_name)
         if isinstance(exception.exception, AirbyteTracedException):
             yield exception.exception.as_airbyte_message(stream_descriptor=stream_descriptor)
         else:
-            yield AirbyteTracedException.from_exception(exception, stream_descriptor=stream_descriptor).as_airbyte_message()
+            yield AirbyteTracedException.from_exception(
+                exception, stream_descriptor=stream_descriptor
+            ).as_airbyte_message()
 
     def _flag_exception(self, stream_name: str, exception: Exception) -> None:
         self._exceptions_per_stream_name.setdefault(stream_name, []).append(exception)
@@ -192,7 +214,12 @@ class ConcurrentReadProcessor:
         2. There are no more streams to read from
         3. All partitions for all streams are closed
         """
-        is_done = all([self._is_stream_done(stream_name) for stream_name in self._stream_name_to_instance.keys()])
+        is_done = all(
+            [
+                self._is_stream_done(stream_name)
+                for stream_name in self._stream_name_to_instance.keys()
+            ]
+        )
         if is_done and self._exceptions_per_stream_name:
             error_message = generate_failed_streams_error_message(self._exceptions_per_stream_name)
             self._logger.info(error_message)
@@ -200,7 +227,9 @@ class ConcurrentReadProcessor:
             # on a non-zero exit code to determine if a sync attempt has failed. We also raise the exception as a config_error
             # type because this combined error isn't actionable, but rather the previously emitted individual errors.
             raise AirbyteTracedException(
-                message=error_message, internal_message="Concurrent read failure", failure_type=FailureType.config_error
+                message=error_message,
+                internal_message="Concurrent read failure",
+                failure_type=FailureType.config_error,
             )
         return is_done
 
@@ -208,7 +237,9 @@ class ConcurrentReadProcessor:
         return stream_name in self._streams_done
 
     def _on_stream_is_done(self, stream_name: str) -> Iterable[AirbyteMessage]:
-        self._logger.info(f"Read {self._record_counter[stream_name]} records from {stream_name} stream")
+        self._logger.info(
+            f"Read {self._record_counter[stream_name]} records from {stream_name} stream"
+        )
         self._logger.info(f"Marking stream {stream_name} as STOPPED")
         stream = self._stream_name_to_instance[stream_name]
         stream.cursor.ensure_at_least_one_state_emitted()
@@ -216,6 +247,8 @@ class ConcurrentReadProcessor:
         self._logger.info(f"Finished syncing {stream.name}")
         self._streams_done.add(stream_name)
         stream_status = (
-            AirbyteStreamStatus.INCOMPLETE if self._exceptions_per_stream_name.get(stream_name, []) else AirbyteStreamStatus.COMPLETE
+            AirbyteStreamStatus.INCOMPLETE
+            if self._exceptions_per_stream_name.get(stream_name, [])
+            else AirbyteStreamStatus.COMPLETE
        )
        yield stream_status_as_airbyte_message(stream.as_airbyte_stream(), stream_status)
```
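A standalone restatement (not the CDK's internals) of the bookkeeping above: per-stream exceptions decide whether a finished stream reports COMPLETE or INCOMPLETE.

```python
from typing import Dict, List

from airbyte_cdk.models import AirbyteStreamStatus

exceptions_per_stream_name: Dict[str, List[Exception]] = {}

def flag_exception(stream_name: str, exception: Exception) -> None:
    # Mirror of _flag_exception: accumulate exceptions per stream
    exceptions_per_stream_name.setdefault(stream_name, []).append(exception)

flag_exception("users", RuntimeError("boom"))

for name in ("users", "orders"):
    status = (
        AirbyteStreamStatus.INCOMPLETE
        if exceptions_per_stream_name.get(name, [])
        else AirbyteStreamStatus.COMPLETE
    )
    print(name, status)  # users -> INCOMPLETE, orders -> COMPLETE
```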
airbyte_cdk/sources/concurrent_source/concurrent_source.py
CHANGED

```diff
@@ -8,7 +8,9 @@ from typing import Iterable, Iterator, List
 
 from airbyte_cdk.models import AirbyteMessage
 from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor
-from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import PartitionGenerationCompletedSentinel
+from airbyte_cdk.sources.concurrent_source.partition_generation_completed_sentinel import (
+    PartitionGenerationCompletedSentinel,
+)
 from airbyte_cdk.sources.concurrent_source.stream_thread_exception import StreamThreadException
 from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
 from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
@@ -17,7 +19,10 @@ from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
 from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
-from airbyte_cdk.sources.streams.concurrent.partitions.types import PartitionCompleteSentinel, QueueItem
+from airbyte_cdk.sources.streams.concurrent.partitions.types import (
+    PartitionCompleteSentinel,
+    QueueItem,
+)
 from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
 
 
@@ -41,14 +46,25 @@ class ConcurrentSource:
         timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
     ) -> "ConcurrentSource":
         is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1
-        too_many_generator = not is_single_threaded and initial_number_of_partitions_to_generate >= num_workers
-        assert not too_many_generator, "It is required to have more workers than threads generating partitions"
+        too_many_generator = (
+            not is_single_threaded and initial_number_of_partitions_to_generate >= num_workers
+        )
+        assert (
+            not too_many_generator
+        ), "It is required to have more workers than threads generating partitions"
         threadpool = ThreadPoolManager(
-            concurrent.futures.ThreadPoolExecutor(max_workers=num_workers, thread_name_prefix="workerpool"),
+            concurrent.futures.ThreadPoolExecutor(
+                max_workers=num_workers, thread_name_prefix="workerpool"
+            ),
             logger,
         )
         return ConcurrentSource(
-            threadpool, logger, slice_logger, message_repository, initial_number_of_partitions_to_generate, timeout_seconds
+            threadpool,
+            logger,
+            slice_logger,
+            message_repository,
+            initial_number_of_partitions_to_generate,
+            timeout_seconds,
         )
 
     def __init__(
@@ -107,7 +123,9 @@ class ConcurrentSource:
         self._threadpool.check_for_errors_and_shutdown()
         self._logger.info("Finished syncing")
 
-    def _submit_initial_partition_generators(self, concurrent_stream_processor: ConcurrentReadProcessor) -> Iterable[AirbyteMessage]:
+    def _submit_initial_partition_generators(
+        self, concurrent_stream_processor: ConcurrentReadProcessor
+    ) -> Iterable[AirbyteMessage]:
         for _ in range(self._initial_number_partitions_to_generate):
             status_message = concurrent_stream_processor.start_next_partition_generator()
             if status_message:
```
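The `too_many_generator` guard above can be restated in plain Python; this sketch mirrors the diff's logic without touching the CDK API:

```python
def validate_worker_split(num_workers: int, initial_partition_generators: int) -> None:
    # Partition generators must not consume every worker, or no worker is left
    # to read the partitions they enqueue (single-threaded mode is exempt).
    is_single_threaded = initial_partition_generators == 1 and num_workers == 1
    too_many_generators = (
        not is_single_threaded and initial_partition_generators >= num_workers
    )
    assert (
        not too_many_generators
    ), "It is required to have more workers than threads generating partitions"

validate_worker_split(num_workers=10, initial_partition_generators=1)  # ok
# validate_worker_split(num_workers=2, initial_partition_generators=2)  # AssertionError
```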
airbyte_cdk/sources/concurrent_source/concurrent_source_adapter.py
CHANGED

```diff
@@ -15,8 +15,17 @@ from airbyte_cdk.sources.streams import Stream
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
 from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
 from airbyte_cdk.sources.streams.concurrent.adapters import StreamFacade
-from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, Cursor, CursorField, CursorValueType, FinalStateCursor, GapType
-from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import AbstractStreamStateConverter
+from airbyte_cdk.sources.streams.concurrent.cursor import (
+    ConcurrentCursor,
+    Cursor,
+    CursorField,
+    CursorValueType,
+    FinalStateCursor,
+    GapType,
+)
+from airbyte_cdk.sources.streams.concurrent.state_converters.abstract_stream_state_converter import (
+    AbstractStreamStateConverter,
+)
 
 DEFAULT_LOOKBACK_SECONDS = 0
 
@@ -43,14 +52,20 @@ class ConcurrentSourceAdapter(AbstractSource, ABC):
         abstract_streams = self._select_abstract_streams(config, catalog)
         concurrent_stream_names = {stream.name for stream in abstract_streams}
         configured_catalog_for_regular_streams = ConfiguredAirbyteCatalog(
-            streams=[stream for stream in catalog.streams if stream.stream.name not in concurrent_stream_names]
+            streams=[
+                stream
+                for stream in catalog.streams
+                if stream.stream.name not in concurrent_stream_names
+            ]
         )
         if abstract_streams:
             yield from self._concurrent_source.read(abstract_streams)
         if configured_catalog_for_regular_streams.streams:
             yield from super().read(logger, config, configured_catalog_for_regular_streams, state)
 
-    def _select_abstract_streams(self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog) -> List[AbstractStream]:
+    def _select_abstract_streams(
+        self, config: Mapping[str, Any], configured_catalog: ConfiguredAirbyteCatalog
+    ) -> List[AbstractStream]:
         """
         Selects streams that can be processed concurrently and returns their abstract representations.
         """
@@ -67,7 +82,11 @@ class ConcurrentSourceAdapter(AbstractSource, ABC):
         return abstract_streams
 
     def convert_to_concurrent_stream(
-        self, logger: logging.Logger, stream: Stream, state_manager: ConnectorStateManager, cursor: Optional[Cursor] = None
+        self,
+        logger: logging.Logger,
+        stream: Stream,
+        state_manager: ConnectorStateManager,
+        cursor: Optional[Cursor] = None,
     ) -> Stream:
         """
         Prepares a stream for concurrent processing by initializing or assigning a cursor,
@@ -106,7 +125,9 @@ class ConcurrentSourceAdapter(AbstractSource, ABC):
 
         if cursor_field_name:
             if not isinstance(cursor_field_name, str):
-                raise ValueError(f"Cursor field type must be a string, but received {type(cursor_field_name).__name__}.")
+                raise ValueError(
+                    f"Cursor field type must be a string, but received {type(cursor_field_name).__name__}."
+                )
 
         return ConcurrentCursor(
             stream.name,
```
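A self-contained sketch of the catalog split performed in `read` above; `split_catalog` is an illustrative helper name, not a CDK function:

```python
from airbyte_cdk.models import ConfiguredAirbyteCatalog

def split_catalog(
    catalog: ConfiguredAirbyteCatalog, concurrent_stream_names: set[str]
) -> ConfiguredAirbyteCatalog:
    # Keep only the streams the regular (non-concurrent) read path should handle;
    # streams in concurrent_stream_names go through ConcurrentSource instead.
    return ConfiguredAirbyteCatalog(
        streams=[
            stream
            for stream in catalog.streams
            if stream.stream.name not in concurrent_stream_names
        ]
    )
```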
airbyte_cdk/sources/concurrent_source/thread_pool_manager.py
CHANGED

```diff
@@ -37,7 +37,9 @@ class ThreadPoolManager:
     def prune_to_validate_has_reached_futures_limit(self) -> bool:
         self._prune_futures(self._futures)
         if len(self._futures) > self._logging_threshold:
-            self._logger.warning(f"ThreadPoolManager: The list of futures is getting bigger than expected ({len(self._futures)})")
+            self._logger.warning(
+                f"ThreadPoolManager: The list of futures is getting bigger than expected ({len(self._futures)})"
+            )
         return len(self._futures) >= self._max_concurrent_tasks
 
     def submit(self, function: Callable[..., Any], *args: Any) -> None:
@@ -92,14 +94,18 @@ class ThreadPoolManager:
             )
             self._stop_and_raise_exception(self._most_recently_seen_exception)
 
-        exceptions_from_futures = [f for f in [future.exception() for future in self._futures] if f is not None]
+        exceptions_from_futures = [
+            f for f in [future.exception() for future in self._futures] if f is not None
+        ]
         if exceptions_from_futures:
             exception = RuntimeError(f"Failed reading with errors: {exceptions_from_futures}")
             self._stop_and_raise_exception(exception)
         else:
             futures_not_done = [f for f in self._futures if not f.done()]
             if futures_not_done:
-                exception = RuntimeError(f"Failed reading with futures not done: {futures_not_done}")
+                exception = RuntimeError(
+                    f"Failed reading with futures not done: {futures_not_done}"
+                )
                 self._stop_and_raise_exception(exception)
             else:
                 self._shutdown()
```
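The future-exception collection above uses only stdlib `concurrent.futures`; a standalone sketch of the same pattern (`may_fail` is illustrative):

```python
import concurrent.futures

def may_fail(n: int) -> int:  # illustrative task
    if n % 2:
        raise RuntimeError(f"task {n} failed")
    return n

with concurrent.futures.ThreadPoolExecutor(
    max_workers=2, thread_name_prefix="workerpool"
) as pool:
    futures = [pool.submit(may_fail, n) for n in range(4)]
    concurrent.futures.wait(futures)
    # future.exception() returns None for successful tasks, so filter those out
    exceptions_from_futures = [
        f for f in [future.exception() for future in futures] if f is not None
    ]

print(f"Collected: {exceptions_from_futures}")  # two RuntimeErrors; successes ignored
```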