airbyte-cdk 6.45.0.dev4107__py3-none-any.whl → 6.45.0.post6.dev14369631849__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +45 -6
- airbyte_cdk/connector_builder/main.py +5 -2
- airbyte_cdk/models/__init__.py +0 -1
- airbyte_cdk/models/airbyte_protocol.py +3 -1
- airbyte_cdk/models/file_transfer_record_message.py +13 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +1 -1
- airbyte_cdk/sources/declarative/async_job/job.py +6 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +18 -18
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +22 -6
- airbyte_cdk/sources/declarative/checks/__init__.py +5 -2
- airbyte_cdk/sources/declarative/checks/check_stream.py +113 -11
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +0 -8
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +210 -50
- airbyte_cdk/sources/declarative/extractors/record_selector.py +1 -6
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +2 -1
- airbyte_cdk/sources/declarative/interpolation/macros.py +10 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +23 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +142 -43
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +16 -4
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +263 -50
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +4 -0
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +150 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +5 -1
- airbyte_cdk/sources/declarative/requesters/query_properties/__init__.py +13 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/properties_from_endpoint.py +40 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/property_chunking.py +69 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/query_properties.py +58 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/strategies/__init__.py +10 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/strategies/group_by_key.py +33 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/strategies/merge_strategy.py +19 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +25 -2
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +101 -30
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +1 -1
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +4 -9
- airbyte_cdk/sources/declarative/transformations/add_fields.py +3 -1
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +15 -38
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +15 -8
- airbyte_cdk/sources/file_based/schema_helpers.py +1 -9
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +12 -3
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +31 -16
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +3 -1
- airbyte_cdk/sources/streams/concurrent/default_stream.py +0 -3
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +4 -0
- airbyte_cdk/sources/types.py +2 -11
- airbyte_cdk/sources/utils/record_helper.py +8 -8
- airbyte_cdk/test/mock_http/response_builder.py +0 -8
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/METADATA +2 -2
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/RECORD +52 -46
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +0 -89
- airbyte_cdk/sources/file_based/file_record_data.py +0 -22
- airbyte_cdk/sources/utils/files_directory.py +0 -15
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/entry_points.txt +0 -0
airbyte_cdk/connector_builder/connector_builder_handler.py
CHANGED
@@ -4,7 +4,7 @@
 
 
 from dataclasses import asdict, dataclass, field
-from typing import Any, List, Mapping
+from typing import Any, Dict, List, Mapping
 
 from airbyte_cdk.connector_builder.test_reader import TestReader
 from airbyte_cdk.models import (
@@ -27,30 +27,34 @@ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
 DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE = 5
 DEFAULT_MAXIMUM_NUMBER_OF_SLICES = 5
 DEFAULT_MAXIMUM_RECORDS = 100
+DEFAULT_MAXIMUM_STREAMS = 100
 
 MAX_PAGES_PER_SLICE_KEY = "max_pages_per_slice"
 MAX_SLICES_KEY = "max_slices"
 MAX_RECORDS_KEY = "max_records"
+MAX_STREAMS_KEY = "max_streams"
 
 
 @dataclass
-class
+class TestLimits:
     max_records: int = field(default=DEFAULT_MAXIMUM_RECORDS)
     max_pages_per_slice: int = field(default=DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE)
     max_slices: int = field(default=DEFAULT_MAXIMUM_NUMBER_OF_SLICES)
+    max_streams: int = field(default=DEFAULT_MAXIMUM_STREAMS)
 
 
-def get_limits(config: Mapping[str, Any]) ->
+def get_limits(config: Mapping[str, Any]) -> TestLimits:
     command_config = config.get("__test_read_config", {})
     max_pages_per_slice = (
        command_config.get(MAX_PAGES_PER_SLICE_KEY) or DEFAULT_MAXIMUM_NUMBER_OF_PAGES_PER_SLICE
     )
     max_slices = command_config.get(MAX_SLICES_KEY) or DEFAULT_MAXIMUM_NUMBER_OF_SLICES
     max_records = command_config.get(MAX_RECORDS_KEY) or DEFAULT_MAXIMUM_RECORDS
-
+    max_streams = command_config.get(MAX_STREAMS_KEY) or DEFAULT_MAXIMUM_STREAMS
+    return TestLimits(max_records, max_pages_per_slice, max_slices, max_streams)
 
 
-def create_source(config: Mapping[str, Any], limits:
+def create_source(config: Mapping[str, Any], limits: TestLimits) -> ManifestDeclarativeSource:
     manifest = config["__injected_declarative_manifest"]
     return ManifestDeclarativeSource(
         config=config,
@@ -71,7 +75,7 @@ def read_stream(
     config: Mapping[str, Any],
     configured_catalog: ConfiguredAirbyteCatalog,
     state: List[AirbyteStateMessage],
-    limits:
+    limits: TestLimits,
 ) -> AirbyteMessage:
     try:
         test_read_handler = TestReader(
@@ -117,5 +121,40 @@ def resolve_manifest(source: ManifestDeclarativeSource) -> AirbyteMessage:
         return error.as_airbyte_message()
 
 
+def full_resolve_manifest(source: ManifestDeclarativeSource, limits: TestLimits) -> AirbyteMessage:
+    try:
+        manifest = {**source.resolved_manifest}
+        streams = manifest.get("streams", [])
+        for stream in streams:
+            stream["dynamic_stream_name"] = None
+
+        mapped_streams: Dict[str, List[Dict[str, Any]]] = {}
+        for stream in source.dynamic_streams:
+            generated_streams = mapped_streams.setdefault(stream["dynamic_stream_name"], [])
+
+            if len(generated_streams) < limits.max_streams:
+                generated_streams += [stream]
+
+        for generated_streams_list in mapped_streams.values():
+            streams.extend(generated_streams_list)
+
+        manifest["streams"] = streams
+        return AirbyteMessage(
+            type=Type.RECORD,
+            record=AirbyteRecordMessage(
+                data={"manifest": manifest},
+                emitted_at=_emitted_at(),
+                stream="full_resolve_manifest",
+            ),
+        )
+    except AirbyteTracedException as exc:
+        return exc.as_airbyte_message()
+    except Exception as exc:
+        error = AirbyteTracedException.from_exception(
+            exc, message=f"Error full resolving manifest: {str(exc)}"
+        )
+        return error.as_airbyte_message()
+
+
 def _emitted_at() -> int:
     return ab_datetime_now().to_epoch_millis()
airbyte_cdk/connector_builder/main.py
CHANGED
@@ -10,8 +10,9 @@ import orjson
 
 from airbyte_cdk.connector import BaseConnector
 from airbyte_cdk.connector_builder.connector_builder_handler import (
-
+    TestLimits,
     create_source,
+    full_resolve_manifest,
     get_limits,
     read_stream,
     resolve_manifest,
@@ -72,7 +73,7 @@ def handle_connector_builder_request(
     config: Mapping[str, Any],
     catalog: Optional[ConfiguredAirbyteCatalog],
     state: List[AirbyteStateMessage],
-    limits:
+    limits: TestLimits,
 ) -> AirbyteMessage:
     if command == "resolve_manifest":
         return resolve_manifest(source)
@@ -81,6 +82,8 @@ def handle_connector_builder_request(
             catalog is not None
         ), "`test_read` requires a valid `ConfiguredAirbyteCatalog`, got None."
         return read_stream(source, config, catalog, state, limits)
+    elif command == "full_resolve_manifest":
+        return full_resolve_manifest(source, limits)
     else:
         raise ValueError(f"Unrecognized command {command}.")
 
airbyte_cdk/models/__init__.py
CHANGED
@@ -8,6 +8,8 @@ from typing import Annotated, Any, Dict, List, Mapping, Optional, Union
 from airbyte_protocol_dataclasses.models import *  # noqa: F403 # Allow '*'
 from serpyco_rs.metadata import Alias
 
+from airbyte_cdk.models.file_transfer_record_message import AirbyteFileTransferRecordMessage
+
 # ruff: noqa: F405 # ignore fuzzy import issues with 'import *'
 
 
@@ -82,7 +84,7 @@ class AirbyteMessage:
     spec: Optional[ConnectorSpecification] = None  # type: ignore [name-defined]
     connectionStatus: Optional[AirbyteConnectionStatus] = None  # type: ignore [name-defined]
     catalog: Optional[AirbyteCatalog] = None  # type: ignore [name-defined]
-    record: Optional[AirbyteRecordMessage] = None  # type: ignore [name-defined]
+    record: Optional[Union[AirbyteFileTransferRecordMessage, AirbyteRecordMessage]] = None  # type: ignore [name-defined]
     state: Optional[AirbyteStateMessage] = None
     trace: Optional[AirbyteTraceMessage] = None  # type: ignore [name-defined]
     control: Optional[AirbyteControlMessage] = None  # type: ignore [name-defined]
airbyte_cdk/models/file_transfer_record_message.py
ADDED
@@ -0,0 +1,13 @@
+# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
+
+from dataclasses import dataclass
+from typing import Any, Dict, Optional
+
+
+@dataclass
+class AirbyteFileTransferRecordMessage:
+    stream: str
+    file: Dict[str, Any]
+    emitted_at: int
+    namespace: Optional[str] = None
+    data: Optional[Dict[str, Any]] = None
airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py
CHANGED
@@ -149,7 +149,7 @@ class ConcurrentReadProcessor:
         message = stream_data_to_airbyte_message(
             stream_name=record.stream_name,
             data_or_message=record.data,
-
+            is_file_transfer_message=record.is_file_transfer_message,
         )
         stream = self._stream_name_to_instance[record.stream_name]
 
airbyte_cdk/sources/declarative/async_job/job.py
CHANGED
@@ -34,6 +34,12 @@ class AsyncJob:
 
     def status(self) -> AsyncJobStatus:
         if self._timer.has_timed_out():
+            # TODO: we should account the fact that,
+            #  certain APIs could send the `Timeout` status,
+            #  thus we should not return `Timeout` in that case,
+            #  but act based on the scenario.
+
+            # the default behavior is to return `Timeout` status and retry.
             return AsyncJobStatus.TIMED_OUT
         return self._status
 
airbyte_cdk/sources/declarative/async_job/job_orchestrator.py
CHANGED
@@ -44,16 +44,21 @@ class AsyncPartition:
     This bucket of api_jobs is a bit useless for this iteration but should become interesting when we will be able to split jobs
     """
 
-
+    _DEFAULT_MAX_JOB_RETRY = 3
 
-    def __init__(
+    def __init__(
+        self, jobs: List[AsyncJob], stream_slice: StreamSlice, job_max_retry: Optional[int] = None
+    ) -> None:
         self._attempts_per_job = {job: 1 for job in jobs}
         self._stream_slice = stream_slice
+        self._job_max_retry = (
+            job_max_retry if job_max_retry is not None else self._DEFAULT_MAX_JOB_RETRY
+        )
 
     def has_reached_max_attempt(self) -> bool:
         return any(
             map(
-                lambda attempt_count: attempt_count >= self.
+                lambda attempt_count: attempt_count >= self._job_max_retry,
                 self._attempts_per_job.values(),
             )
         )
@@ -62,7 +67,7 @@ class AsyncPartition:
         current_attempt_count = self._attempts_per_job.pop(job_to_replace, None)
         if current_attempt_count is None:
             raise ValueError("Could not find job to replace")
-        elif current_attempt_count >= self.
+        elif current_attempt_count >= self._job_max_retry:
             raise ValueError(f"Max attempt reached for job in partition {self._stream_slice}")
 
         new_attempt_count = current_attempt_count + 1
@@ -155,6 +160,7 @@ class AsyncJobOrchestrator:
         message_repository: MessageRepository,
         exceptions_to_break_on: Iterable[Type[Exception]] = tuple(),
         has_bulk_parent: bool = False,
+        job_max_retry: Optional[int] = None,
     ) -> None:
         """
         If the stream slices provided as a parameters relies on a async job streams that relies on the same JobTracker, `has_bulk_parent`
@@ -175,11 +181,12 @@ class AsyncJobOrchestrator:
         self._message_repository = message_repository
         self._exceptions_to_break_on: Tuple[Type[Exception], ...] = tuple(exceptions_to_break_on)
         self._has_bulk_parent = has_bulk_parent
+        self._job_max_retry = job_max_retry
 
         self._non_breaking_exceptions: List[Exception] = []
 
     def _replace_failed_jobs(self, partition: AsyncPartition) -> None:
-        failed_status_jobs = (AsyncJobStatus.FAILED,)
+        failed_status_jobs = (AsyncJobStatus.FAILED, AsyncJobStatus.TIMED_OUT)
         jobs_to_replace = [job for job in partition.jobs if job.status() in failed_status_jobs]
         for job in jobs_to_replace:
             new_job = self._start_job(job.job_parameters(), job.api_job_id())
@@ -214,7 +221,7 @@ class AsyncJobOrchestrator:
             for _slice in self._slice_iterator:
                 at_least_one_slice_consumed_from_slice_iterator_during_current_iteration = True
                 job = self._start_job(_slice)
-                self._running_partitions.append(AsyncPartition([job], _slice))
+                self._running_partitions.append(AsyncPartition([job], _slice, self._job_max_retry))
                 if self._has_bulk_parent and self._slice_iterator.has_next():
                     break
         except ConcurrentJobLimitReached:
@@ -363,7 +370,7 @@ class AsyncJobOrchestrator:
                 self._reallocate_partition(current_running_partitions, partition)
 
             # We only remove completed / timeout jobs jobs as we want failed jobs to be re-allocated in priority
-            self.
+            self._remove_completed_jobs(partition)
 
         # update the referenced list with running partitions
         self._running_partitions = current_running_partitions
@@ -378,11 +385,7 @@ class AsyncJobOrchestrator:
     def _stop_timed_out_jobs(self, partition: AsyncPartition) -> None:
         for job in partition.jobs:
             if job.status() == AsyncJobStatus.TIMED_OUT:
-                self._abort_job(job, free_job_allocation=
-                raise AirbyteTracedException(
-                    internal_message=f"Job {job.api_job_id()} has timed out. Try increasing the `polling job timeout`.",
-                    failure_type=FailureType.config_error,
-                )
+                self._abort_job(job, free_job_allocation=False)
 
     def _abort_job(self, job: AsyncJob, free_job_allocation: bool = True) -> None:
         try:
@@ -392,7 +395,7 @@ class AsyncJobOrchestrator:
         except Exception as exception:
             LOGGER.warning(f"Could not free budget for job {job.api_job_id()}: {exception}")
 
-    def
+    def _remove_completed_jobs(self, partition: AsyncPartition) -> None:
         """
         Remove completed or timed out jobs from the partition.
 
@@ -400,7 +403,7 @@ class AsyncJobOrchestrator:
             partition (AsyncPartition): The partition to process.
         """
         for job in partition.jobs:
-            if job.status()
+            if job.status() == AsyncJobStatus.COMPLETED:
                 self._job_tracker.remove_job(job.api_job_id())
 
     def _reallocate_partition(
@@ -415,10 +418,7 @@ class AsyncJobOrchestrator:
            current_running_partitions (list): The list of currently running partitions.
            partition (AsyncPartition): The partition to reallocate.
        """
-
-        if job.status() != AsyncJobStatus.TIMED_OUT:
-            # allow the FAILED jobs to be re-allocated for partition
-            current_running_partitions.insert(0, partition)
+        current_running_partitions.insert(0, partition)
 
     def _process_partitions_with_errors(self, partition: AsyncPartition) -> None:
         """
airbyte_cdk/sources/declarative/async_job/job_tracker.py
CHANGED
@@ -3,9 +3,11 @@
 import logging
 import threading
 import uuid
-from
+from dataclasses import dataclass, field
+from typing import Any, Mapping, Set, Union
 
 from airbyte_cdk.logger import lazy_log
+from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 
 LOGGER = logging.getLogger("airbyte")
 
@@ -14,15 +16,29 @@ class ConcurrentJobLimitReached(Exception):
     pass
 
 
+@dataclass
 class JobTracker:
-
+    limit: Union[int, str]
+    config: Mapping[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
         self._jobs: Set[str] = set()
-
+        self._lock = threading.Lock()
+        if isinstance(self.limit, str):
+            try:
+                self.limit = int(
+                    InterpolatedString(self.limit, parameters={}).eval(config=self.config)
+                )
+            except Exception as e:
+                LOGGER.warning(
+                    f"Error interpolating max job count: {self.limit}. Setting to 1. {e}"
+                )
+                self.limit = 1
+        if self.limit < 1:
            LOGGER.warning(
-                f"The `max_concurrent_async_job_count` property is less than 1: {limit}. Setting to 1. Please update the source manifest to set a valid value."
+                f"The `max_concurrent_async_job_count` property is less than 1: {self.limit}. Setting to 1. Please update the source manifest to set a valid value."
            )
-        self._limit =
-        self._lock = threading.Lock()
+        self._limit = self.limit if self.limit >= 1 else 1
 
     def try_to_get_intent(self) -> str:
         lazy_log(
airbyte_cdk/sources/declarative/checks/__init__.py
CHANGED
@@ -7,7 +7,10 @@ from typing import Mapping
 from pydantic.v1 import BaseModel
 
 from airbyte_cdk.sources.declarative.checks.check_dynamic_stream import CheckDynamicStream
-from airbyte_cdk.sources.declarative.checks.check_stream import
+from airbyte_cdk.sources.declarative.checks.check_stream import (
+    CheckStream,
+    DynamicStreamCheckConfig,
+)
 from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker
 from airbyte_cdk.sources.declarative.models import (
     CheckDynamicStream as CheckDynamicStreamModel,
@@ -21,4 +24,4 @@ COMPONENTS_CHECKER_TYPE_MAPPING: Mapping[str, type[BaseModel]] = {
     "CheckDynamicStream": CheckDynamicStreamModel,
 }
 
-__all__ = ["CheckStream", "CheckDynamicStream", "ConnectionChecker"]
+__all__ = ["CheckStream", "CheckDynamicStream", "ConnectionChecker", "DynamicStreamCheckConfig"]
airbyte_cdk/sources/declarative/checks/check_stream.py
CHANGED
@@ -5,13 +5,23 @@
 import logging
 import traceback
 from dataclasses import InitVar, dataclass
-from typing import Any, List, Mapping, Tuple
+from typing import Any, Dict, List, Mapping, Optional, Tuple
 
 from airbyte_cdk import AbstractSource
 from airbyte_cdk.sources.declarative.checks.connection_checker import ConnectionChecker
 from airbyte_cdk.sources.streams.http.availability_strategy import HttpAvailabilityStrategy
 
 
+@dataclass(frozen=True)
+class DynamicStreamCheckConfig:
+    """Defines the configuration for dynamic stream during connection checking. This class specifies
+    what dynamic streams in the stream template should be updated with value, supporting dynamic interpolation
+    and type enforcement."""
+
+    dynamic_stream_name: str
+    stream_count: int = 0
+
+
 @dataclass
 class CheckStream(ConnectionChecker):
     """
@@ -23,34 +33,126 @@ class CheckStream(ConnectionChecker):
 
     stream_names: List[str]
     parameters: InitVar[Mapping[str, Any]]
+    dynamic_streams_check_configs: Optional[List[DynamicStreamCheckConfig]] = None
 
     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
         self._parameters = parameters
+        if self.dynamic_streams_check_configs is None:
+            self.dynamic_streams_check_configs = []
+
+    def _log_error(self, logger: logging.Logger, action: str, error: Exception) -> Tuple[bool, str]:
+        """Logs an error and returns a formatted error message."""
+        error_message = f"Encountered an error while {action}. Error: {error}"
+        logger.error(error_message + f"Error traceback: \n {traceback.format_exc()}", exc_info=True)
+        return False, error_message
 
     def check_connection(
         self, source: AbstractSource, logger: logging.Logger, config: Mapping[str, Any]
     ) -> Tuple[bool, Any]:
-
+        """Checks the connection to the source and its streams."""
+        try:
+            streams = source.streams(config=config)
+            if not streams:
+                return False, f"No streams to connect to from source {source}"
+        except Exception as error:
+            return self._log_error(logger, "discovering streams", error)
+
         stream_name_to_stream = {s.name: s for s in streams}
-        if len(streams) == 0:
-            return False, f"No streams to connect to from source {source}"
         for stream_name in self.stream_names:
-            if stream_name not in stream_name_to_stream
+            if stream_name not in stream_name_to_stream:
                 raise ValueError(
-                    f"{stream_name} is not part of the catalog. Expected one of {stream_name_to_stream.keys()}."
+                    f"{stream_name} is not part of the catalog. Expected one of {list(stream_name_to_stream.keys())}."
                 )
 
+            stream_availability, message = self._check_stream_availability(
+                stream_name_to_stream, stream_name, logger
+            )
+            if not stream_availability:
+                return stream_availability, message
+
+        should_check_dynamic_streams = (
+            hasattr(source, "resolved_manifest")
+            and hasattr(source, "dynamic_streams")
+            and self.dynamic_streams_check_configs
+        )
+
+        if should_check_dynamic_streams:
+            return self._check_dynamic_streams_availability(source, stream_name_to_stream, logger)
+
+        return True, None
+
+    def _check_stream_availability(
+        self, stream_name_to_stream: Dict[str, Any], stream_name: str, logger: logging.Logger
+    ) -> Tuple[bool, Any]:
+        """Checks if streams are available."""
+        availability_strategy = HttpAvailabilityStrategy()
+        try:
             stream = stream_name_to_stream[stream_name]
-
+            stream_is_available, reason = availability_strategy.check_availability(stream, logger)
+            if not stream_is_available:
+                message = f"Stream {stream_name} is not available: {reason}"
+                logger.warning(message)
+                return stream_is_available, message
+        except Exception as error:
+            return self._log_error(logger, f"checking availability of stream {stream_name}", error)
+        return True, None
+
+    def _check_dynamic_streams_availability(
+        self, source: AbstractSource, stream_name_to_stream: Dict[str, Any], logger: logging.Logger
+    ) -> Tuple[bool, Any]:
+        """Checks the availability of dynamic streams."""
+        dynamic_streams = source.resolved_manifest.get("dynamic_streams", [])  # type: ignore[attr-defined] # The source's resolved_manifest manifest is checked before calling this method
+        dynamic_stream_name_to_dynamic_stream = {
+            ds.get("name", f"dynamic_stream_{i}"): ds for i, ds in enumerate(dynamic_streams)
+        }
+        generated_streams = self._map_generated_streams(source.dynamic_streams)  # type: ignore[attr-defined] # The source's dynamic_streams manifest is checked before calling this method
+
+        for check_config in self.dynamic_streams_check_configs:  # type: ignore[union-attr] # None value for self.dynamic_streams_check_configs handled in __post_init__
+            if check_config.dynamic_stream_name not in dynamic_stream_name_to_dynamic_stream:
+                return (
+                    False,
+                    f"Dynamic stream {check_config.dynamic_stream_name} is not found in manifest.",
+                )
+
+            generated = generated_streams.get(check_config.dynamic_stream_name, [])
+            stream_availability, message = self._check_generated_streams_availability(
+                generated, stream_name_to_stream, logger, check_config.stream_count
+            )
+            if not stream_availability:
+                return stream_availability, message
+
+        return True, None
+
+    def _map_generated_streams(
+        self, dynamic_streams: List[Dict[str, Any]]
+    ) -> Dict[str, List[Dict[str, Any]]]:
+        """Maps dynamic stream names to their corresponding generated streams."""
+        mapped_streams: Dict[str, List[Dict[str, Any]]] = {}
+        for stream in dynamic_streams:
+            mapped_streams.setdefault(stream["dynamic_stream_name"], []).append(stream)
+        return mapped_streams
+
+    def _check_generated_streams_availability(
+        self,
+        generated_streams: List[Dict[str, Any]],
+        stream_name_to_stream: Dict[str, Any],
+        logger: logging.Logger,
+        max_count: int,
+    ) -> Tuple[bool, Any]:
+        """Checks availability of generated dynamic streams."""
+        availability_strategy = HttpAvailabilityStrategy()
+        for declarative_stream in generated_streams[: min(max_count, len(generated_streams))]:
+            stream = stream_name_to_stream[declarative_stream["name"]]
             try:
                 stream_is_available, reason = availability_strategy.check_availability(
                     stream, logger
                 )
                 if not stream_is_available:
-
+                    message = f"Dynamic Stream {stream.name} is not available: {reason}"
+                    logger.warning(message)
+                    return False, message
             except Exception as error:
-
-                    f"
+                return self._log_error(
+                    logger, f"checking availability of dynamic stream {stream.name}", error
                 )
-                return False, f"Unable to connect to stream {stream_name} - {error}"
         return True, None
airbyte_cdk/sources/declarative/concurrent_declarative_source.py
CHANGED
@@ -25,7 +25,6 @@ from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
     PerPartitionWithGlobalCursor,
 )
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
-from airbyte_cdk.sources.declarative.models import FileUploader
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     ConcurrencyLevel as ConcurrencyLevelModel,
 )
@@ -207,10 +206,6 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             # these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible,
             # so we need to treat them as synchronous
 
-            supports_file_transfer = (
-                "file_uploader" in name_to_stream_mapping[declarative_stream.name]
-            )
-
             if (
                 isinstance(declarative_stream, DeclarativeStream)
                 and name_to_stream_mapping[declarative_stream.name]["type"]
@@ -327,7 +322,6 @@
                         else None,
                         logger=self.logger,
                         cursor=cursor,
-                        supports_file_transfer=supports_file_transfer,
                     )
                 )
             elif (
@@ -359,7 +353,6 @@
                         cursor_field=None,
                         logger=self.logger,
                         cursor=final_state_cursor,
-                        supports_file_transfer=supports_file_transfer,
                     )
                 )
             elif (
@@ -413,7 +406,6 @@
                         cursor_field=perpartition_cursor.cursor_field.cursor_field_key,
                         logger=self.logger,
                         cursor=perpartition_cursor,
-                        supports_file_transfer=supports_file_transfer,
                     )
                 )
             else: