airbyte-cdk 6.41.8__py3-none-any.whl → 6.41.9.dev4101__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/models/__init__.py +1 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +1 -0
- airbyte_cdk/sources/declarative/async_job/job.py +0 -6
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +18 -18
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +6 -22
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +22 -0
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +39 -64
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +1 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +25 -45
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +24 -45
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +0 -4
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +61 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +17 -4
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +2 -8
- airbyte_cdk/sources/streams/concurrent/default_stream.py +3 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +0 -4
- airbyte_cdk/sources/types.py +11 -0
- airbyte_cdk/sources/utils/files_directory.py +15 -0
- airbyte_cdk/sources/utils/record_helper.py +8 -1
- {airbyte_cdk-6.41.8.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/METADATA +2 -2
- {airbyte_cdk-6.41.8.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/RECORD +25 -24
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -150
- {airbyte_cdk-6.41.8.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.41.8.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.41.8.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.41.8.dist-info → airbyte_cdk-6.41.9.dev4101.dist-info}/entry_points.txt +0 -0
airbyte_cdk/models/__init__.py
CHANGED
airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py
CHANGED
@@ -150,6 +150,7 @@ class ConcurrentReadProcessor:
             stream_name=record.stream_name,
             data_or_message=record.data,
             is_file_transfer_message=record.is_file_transfer_message,
+            file_reference=record.file_reference,
         )
         stream = self._stream_name_to_instance[record.stream_name]
 
airbyte_cdk/sources/declarative/async_job/job.py
CHANGED
@@ -34,12 +34,6 @@ class AsyncJob:
 
     def status(self) -> AsyncJobStatus:
         if self._timer.has_timed_out():
-            # TODO: we should account the fact that,
-            # certain APIs could send the `Timeout` status,
-            # thus we should not return `Timeout` in that case,
-            # but act based on the scenario.
-
-            # the default behavior is to return `Timeout` status and retry.
             return AsyncJobStatus.TIMED_OUT
         return self._status
 
airbyte_cdk/sources/declarative/async_job/job_orchestrator.py
CHANGED
@@ -44,21 +44,16 @@ class AsyncPartition:
     This bucket of api_jobs is a bit useless for this iteration but should become interesting when we will be able to split jobs
     """
 
-    _DEFAULT_MAX_JOB_RETRY = 3
+    _MAX_NUMBER_OF_ATTEMPTS = 3
 
-    def __init__(
-        self, jobs: List[AsyncJob], stream_slice: StreamSlice, job_max_retry: Optional[int] = None
-    ) -> None:
+    def __init__(self, jobs: List[AsyncJob], stream_slice: StreamSlice) -> None:
         self._attempts_per_job = {job: 1 for job in jobs}
         self._stream_slice = stream_slice
-        self._job_max_retry = (
-            job_max_retry if job_max_retry is not None else self._DEFAULT_MAX_JOB_RETRY
-        )
 
     def has_reached_max_attempt(self) -> bool:
         return any(
             map(
-                lambda attempt_count: attempt_count >= self._job_max_retry,
+                lambda attempt_count: attempt_count >= self._MAX_NUMBER_OF_ATTEMPTS,
                 self._attempts_per_job.values(),
             )
         )
@@ -67,7 +62,7 @@ class AsyncPartition:
         current_attempt_count = self._attempts_per_job.pop(job_to_replace, None)
         if current_attempt_count is None:
             raise ValueError("Could not find job to replace")
-        elif current_attempt_count >= self._job_max_retry:
+        elif current_attempt_count >= self._MAX_NUMBER_OF_ATTEMPTS:
             raise ValueError(f"Max attempt reached for job in partition {self._stream_slice}")
 
         new_attempt_count = current_attempt_count + 1
@@ -160,7 +155,6 @@ class AsyncJobOrchestrator:
         message_repository: MessageRepository,
         exceptions_to_break_on: Iterable[Type[Exception]] = tuple(),
         has_bulk_parent: bool = False,
-        job_max_retry: Optional[int] = None,
     ) -> None:
         """
         If the stream slices provided as a parameters relies on a async job streams that relies on the same JobTracker, `has_bulk_parent`
@@ -181,12 +175,11 @@ class AsyncJobOrchestrator:
         self._message_repository = message_repository
         self._exceptions_to_break_on: Tuple[Type[Exception], ...] = tuple(exceptions_to_break_on)
         self._has_bulk_parent = has_bulk_parent
-        self._job_max_retry = job_max_retry
 
         self._non_breaking_exceptions: List[Exception] = []
 
     def _replace_failed_jobs(self, partition: AsyncPartition) -> None:
-        failed_status_jobs = (AsyncJobStatus.FAILED, AsyncJobStatus.TIMED_OUT)
+        failed_status_jobs = (AsyncJobStatus.FAILED,)
         jobs_to_replace = [job for job in partition.jobs if job.status() in failed_status_jobs]
         for job in jobs_to_replace:
             new_job = self._start_job(job.job_parameters(), job.api_job_id())
@@ -221,7 +214,7 @@ class AsyncJobOrchestrator:
             for _slice in self._slice_iterator:
                 at_least_one_slice_consumed_from_slice_iterator_during_current_iteration = True
                 job = self._start_job(_slice)
-                self._running_partitions.append(AsyncPartition([job], _slice, self._job_max_retry))
+                self._running_partitions.append(AsyncPartition([job], _slice))
                 if self._has_bulk_parent and self._slice_iterator.has_next():
                     break
         except ConcurrentJobLimitReached:
@@ -370,7 +363,7 @@ class AsyncJobOrchestrator:
                 self._reallocate_partition(current_running_partitions, partition)
 
             # We only remove completed / timeout jobs jobs as we want failed jobs to be re-allocated in priority
-            self._remove_completed_jobs(partition)
+            self._remove_completed_or_timed_out_jobs(partition)
 
         # update the referenced list with running partitions
         self._running_partitions = current_running_partitions
@@ -385,7 +378,11 @@ class AsyncJobOrchestrator:
     def _stop_timed_out_jobs(self, partition: AsyncPartition) -> None:
         for job in partition.jobs:
             if job.status() == AsyncJobStatus.TIMED_OUT:
-                self._abort_job(job, free_job_allocation=False)
+                self._abort_job(job, free_job_allocation=True)
+                raise AirbyteTracedException(
+                    internal_message=f"Job {job.api_job_id()} has timed out. Try increasing the `polling job timeout`.",
+                    failure_type=FailureType.config_error,
+                )
 
     def _abort_job(self, job: AsyncJob, free_job_allocation: bool = True) -> None:
         try:
@@ -395,7 +392,7 @@ class AsyncJobOrchestrator:
         except Exception as exception:
             LOGGER.warning(f"Could not free budget for job {job.api_job_id()}: {exception}")
 
-    def _remove_completed_jobs(self, partition: AsyncPartition) -> None:
+    def _remove_completed_or_timed_out_jobs(self, partition: AsyncPartition) -> None:
         """
         Remove completed or timed out jobs from the partition.
 
@@ -403,7 +400,7 @@ class AsyncJobOrchestrator:
             partition (AsyncPartition): The partition to process.
         """
         for job in partition.jobs:
-            if job.status() == AsyncJobStatus.COMPLETED:
+            if job.status() in [AsyncJobStatus.COMPLETED, AsyncJobStatus.TIMED_OUT]:
                 self._job_tracker.remove_job(job.api_job_id())
 
     def _reallocate_partition(
@@ -418,7 +415,10 @@ class AsyncJobOrchestrator:
             current_running_partitions (list): The list of currently running partitions.
            partition (AsyncPartition): The partition to reallocate.
         """
-        current_running_partitions.insert(0, partition)
+        for job in partition.jobs:
+            if job.status() != AsyncJobStatus.TIMED_OUT:
+                # allow the FAILED jobs to be re-allocated for partition
+                current_running_partitions.insert(0, partition)
 
     def _process_partitions_with_errors(self, partition: AsyncPartition) -> None:
        """
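
The `_stop_timed_out_jobs` change is the user-visible part of these hunks: a timed-out job now frees its budget and fails the sync with a config error instead of being retried. A minimal sketch of the new failure mode, reusing only names that appear in the hunks (this is not the orchestrator itself):

```python
from airbyte_cdk.models import FailureType
from airbyte_cdk.utils.traced_exception import AirbyteTracedException


def fail_on_timeout(job_id: str) -> None:
    # Mirrors the raise added to _stop_timed_out_jobs above.
    raise AirbyteTracedException(
        internal_message=f"Job {job_id} has timed out. Try increasing the `polling job timeout`.",
        failure_type=FailureType.config_error,
    )


try:
    fail_on_timeout("job-123")
except AirbyteTracedException as error:
    assert error.failure_type == FailureType.config_error
```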
airbyte_cdk/sources/declarative/async_job/job_tracker.py
CHANGED
@@ -3,11 +3,9 @@
 import logging
 import threading
 import uuid
-from dataclasses import dataclass, field
-from typing import Any, Mapping, Set, Union
+from typing import Set
 
 from airbyte_cdk.logger import lazy_log
-from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
 
 LOGGER = logging.getLogger("airbyte")
 
@@ -16,29 +14,15 @@ class ConcurrentJobLimitReached(Exception):
     pass
 
 
-@dataclass
 class JobTracker:
-    limit: Union[int, str]
-    config: Mapping[str, Any] = field(default_factory=dict)
-
-    def __post_init__(self) -> None:
+    def __init__(self, limit: int):
         self._jobs: Set[str] = set()
-
-        if isinstance(self.limit, str):
-            try:
-                self.limit = int(
-                    InterpolatedString(self.limit, parameters={}).eval(config=self.config)
-                )
-            except Exception as e:
-                LOGGER.warning(
-                    f"Error interpolating max job count: {self.limit}. Setting to 1. {e}"
-                )
-                self.limit = 1
-        if self.limit < 1:
+        if limit < 1:
             LOGGER.warning(
-                f"The `max_concurrent_async_job_count` property is less than 1: {self.limit}. Setting to 1. Please update the source manifest to set a valid value."
+                f"The `max_concurrent_async_job_count` property is less than 1: {limit}. Setting to 1. Please update the source manifest to set a valid value."
            )
-        self._limit = self.limit if self.limit >= 1 else 1
+        self._limit = 1 if limit < 1 else limit
+        self._lock = threading.Lock()
 
     def try_to_get_intent(self) -> str:
         lazy_log(
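
The call-site consequence of the `JobTracker` rewrite: the limit must now be resolved to an integer before construction, since the dataclass fields and the `InterpolatedString` fallback are gone. A short usage sketch based on the signatures in this hunk:

```python
from airbyte_cdk.sources.declarative.async_job.job_tracker import JobTracker

# 6.41.8 also accepted an interpolation string evaluated against the config:
#   JobTracker(limit="{{ config['max_concurrent_async_job_count'] }}", config=config)
# 6.41.9.dev4101 requires a resolved integer:
tracker = JobTracker(limit=3)

# A value below 1 is clamped to 1 (with a warning) rather than rejected:
clamped = JobTracker(limit=0)
```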
airbyte_cdk/sources/declarative/concurrent_declarative_source.py
CHANGED
@@ -25,6 +25,7 @@ from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
     PerPartitionWithGlobalCursor,
 )
 from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
+from airbyte_cdk.sources.declarative.models import FileUploader
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     ConcurrencyLevel as ConcurrencyLevelModel,
 )
@@ -206,6 +207,20 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             # these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible,
             # so we need to treat them as synchronous
 
+            file_uploader = None
+            if isinstance(declarative_stream, DeclarativeStream):
+                file_uploader = (
+                    self._constructor.create_component(
+                        model_type=FileUploader,
+                        component_definition=name_to_stream_mapping[declarative_stream.name][
+                            "file_uploader"
+                        ],
+                        config=config,
+                    )
+                    if "file_uploader" in name_to_stream_mapping[declarative_stream.name]
+                    else None
+                )
+
             if (
                 isinstance(declarative_stream, DeclarativeStream)
                 and name_to_stream_mapping[declarative_stream.name]["type"]
@@ -273,6 +288,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         declarative_stream.get_json_schema(),
                         retriever,
                         self.message_repository,
+                        file_uploader,
                     ),
                     stream_slicer=declarative_stream.retriever.stream_slicer,
                 )
@@ -303,6 +319,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         declarative_stream.get_json_schema(),
                         retriever,
                         self.message_repository,
+                        file_uploader,
                     ),
                     stream_slicer=cursor,
                 )
@@ -322,6 +339,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         else None,
                         logger=self.logger,
                         cursor=cursor,
+                        supports_file_transfer=bool(file_uploader),
                     )
                 )
             elif (
@@ -333,6 +351,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         declarative_stream.get_json_schema(),
                         declarative_stream.retriever,
                         self.message_repository,
+                        file_uploader,
                     ),
                     declarative_stream.retriever.stream_slicer,
                 )
@@ -353,6 +372,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         cursor_field=None,
                         logger=self.logger,
                         cursor=final_state_cursor,
+                        supports_file_transfer=bool(file_uploader),
                     )
                 )
             elif (
@@ -392,6 +412,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         declarative_stream.get_json_schema(),
                         retriever,
                         self.message_repository,
+                        file_uploader,
                     ),
                     perpartition_cursor,
                 )
@@ -406,6 +427,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         cursor_field=perpartition_cursor.cursor_field.cursor_field_key,
                         logger=self.logger,
                         cursor=perpartition_cursor,
+                        supports_file_transfer=bool(file_uploader),
                     )
                 )
             else:
airbyte_cdk/sources/declarative/declarative_component_schema.yaml
CHANGED
@@ -47,12 +47,7 @@ properties:
   max_concurrent_async_job_count:
     title: Maximum Concurrent Asynchronous Jobs
     description: Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.
-    type:
-      - integer
-      - string
-    examples:
-      - 3
-      - "{{ config['max_concurrent_async_job_count'] }}"
+    type: integer
   metadata:
     type: object
     description: For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata.
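
For existing manifests this is a breaking narrowing: the interpolated-string form no longer validates. A hedged migration sketch, with the manifest fragment expressed as a Python dict and everything else elided:

```python
# Accepted by the 6.41.8 schema (type: integer | string):
before = {"max_concurrent_async_job_count": "{{ config['max_concurrent_async_job_count'] }}"}

# Required by the 6.41.9.dev4101 schema (type: integer):
after = {"max_concurrent_async_job_count": 3}
```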
@@ -1427,6 +1422,33 @@ definitions:
             - "$ref": "#/definitions/LegacyToPerPartitionStateMigration"
             - "$ref": "#/definitions/CustomStateMigration"
         default: []
+      file_uploader:
+        title: File Uploader
+        description: (experimental) Describes how to fetch a file
+        type: object
+        required:
+          - type
+          - requester
+          - download_target_extractor
+        properties:
+          type:
+            type: string
+            enum: [ FileUploader ]
+          requester:
+            description: Requester component that describes how to prepare HTTP requests to send to the source API.
+            anyOf:
+              - "$ref": "#/definitions/CustomRequester"
+              - "$ref": "#/definitions/HttpRequester"
+          download_target_extractor:
+            description: Responsible for fetching the url where the file is located. This is applied on each records and not on the HTTP response
+            anyOf:
+              - "$ref": "#/definitions/CustomRecordExtractor"
+              - "$ref": "#/definitions/DpathExtractor"
+          file_extractor:
+            description: Responsible for fetching the content of the file. If not defined, the assumption is that the whole response body is the file content
+            anyOf:
+              - "$ref": "#/definitions/CustomRecordExtractor"
+              - "$ref": "#/definitions/DpathExtractor"
       $parameters:
         type: object
         additional_properties: true
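
A hedged example of a stream-level `file_uploader` block that satisfies the schema above (required keys: `type`, `requester`, `download_target_extractor`), expressed as a Python dict; the URL, path, and field names are illustrative, not taken from this diff:

```python
file_uploader_definition = {
    "type": "FileUploader",
    "requester": {
        "type": "HttpRequester",
        "url_base": "https://api.example.com",  # illustrative
        "path": "/files/{{ record['id'] }}",  # illustrative
    },
    "download_target_extractor": {
        "type": "DpathExtractor",
        "field_path": ["download_url"],  # illustrative
    },
    # "file_extractor" omitted: the whole response body is treated as the file content.
}
```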
@@ -2197,8 +2219,7 @@ definitions:
       type: object
       additionalProperties: true
   JsonDecoder:
-    title: JSON
-    description: Select 'JSON' if the response is formatted as a JSON object.
+    title: Json Decoder
     type: object
     required:
       - type
@@ -2207,8 +2228,8 @@ definitions:
       type: string
       enum: [JsonDecoder]
   JsonlDecoder:
-    title: JSON Lines
-    description: Select 'JSON Lines' if the response consists of JSON objects separated by new lines (`\n`) in JSONL format.
+    title: JSONL Decoder
+    description: Use this if the response consists of JSON objects separated by new lines (`\n`) in JSONL format.
     type: object
     required:
       - type
@@ -2333,8 +2354,8 @@ definitions:
       type: object
       additionalProperties: true
   IterableDecoder:
-    title: Iterable
-    description: Select 'Iterable' if the response consists of strings separated by new lines (`\n`). Each row will be wrapped into a JSON object with the `record` key.
+    title: Iterable Decoder
+    description: Use this if the response consists of strings separated by new lines (`\n`). The Decoder will wrap each row into a JSON object with the `record` key.
     type: object
     required:
       - type
@@ -2343,8 +2364,8 @@ definitions:
       type: string
       enum: [IterableDecoder]
   XmlDecoder:
-    title: XML
-    description: Select 'XML' if the response consists of XML-formatted data.
+    title: XML Decoder
+    description: Use this if the response is XML.
     type: object
     required:
       - type
@@ -2375,8 +2396,8 @@ definitions:
       type: object
       additionalProperties: true
   ZipfileDecoder:
-    title: ZIP file
-    description: Select 'ZIP file' for response data that is returned as zipfile(s).
+    title: Zipfile Decoder
+    description: Decoder for response data that is returned as zipfile(s).
     type: object
     additionalProperties: true
     required:
@@ -2900,7 +2921,7 @@ definitions:
       title: Lazy Read Pointer
       description: If set, this will enable lazy reading, using the initial read of parent records to extract child records.
       type: array
-      default: []
+      default: [ ]
       items:
         - type: string
       interpolation_context:
@@ -3205,7 +3226,7 @@ definitions:
     properties:
       type:
         type: string
-        enum: [StateDelegatingStream]
+        enum: [ StateDelegatingStream ]
       name:
         title: Name
         description: The stream name.
@@ -3260,14 +3281,12 @@ definitions:
           - "$ref": "#/definitions/CustomPartitionRouter"
           - "$ref": "#/definitions/ListPartitionRouter"
           - "$ref": "#/definitions/SubstreamPartitionRouter"
-          - "$ref": "#/definitions/GroupingPartitionRouter"
           - type: array
             items:
               anyOf:
                 - "$ref": "#/definitions/CustomPartitionRouter"
                 - "$ref": "#/definitions/ListPartitionRouter"
                 - "$ref": "#/definitions/SubstreamPartitionRouter"
-                - "$ref": "#/definitions/GroupingPartitionRouter"
       decoder:
         title: Decoder
         description: Component decoding the response so records can be extracted.
@@ -3284,8 +3303,6 @@ definitions:
       type: object
       additionalProperties: true
   GzipDecoder:
-    title: gzip
-    description: Select 'gzip' for response data that is compressed with gzip. Requires specifying an inner data type/decoder to parse the decompressed data.
     type: object
     required:
       - type
@@ -3301,8 +3318,6 @@ definitions:
       - "$ref": "#/definitions/JsonDecoder"
       - "$ref": "#/definitions/JsonlDecoder"
   CsvDecoder:
-    title: CSV
-    description: "Select 'CSV' for response data that is formatted as CSV (comma-separated values). Can specify an encoding (default: 'utf-8') and a delimiter (default: ',')."
     type: object
     required:
       - type
@@ -3433,14 +3448,12 @@ definitions:
           - "$ref": "#/definitions/CustomPartitionRouter"
           - "$ref": "#/definitions/ListPartitionRouter"
           - "$ref": "#/definitions/SubstreamPartitionRouter"
-          - "$ref": "#/definitions/GroupingPartitionRouter"
           - type: array
             items:
               anyOf:
                 - "$ref": "#/definitions/CustomPartitionRouter"
                 - "$ref": "#/definitions/ListPartitionRouter"
                 - "$ref": "#/definitions/SubstreamPartitionRouter"
-                - "$ref": "#/definitions/GroupingPartitionRouter"
       decoder:
         title: Decoder
         description: Component decoding the response so records can be extracted.
@@ -3557,44 +3570,6 @@ definitions:
     $parameters:
       type: object
       additionalProperties: true
-  GroupingPartitionRouter:
-    title: Grouping Partition Router
-    description: >
-      A decorator on top of a partition router that groups partitions into batches of a specified size.
-      This is useful for APIs that support filtering by multiple partition keys in a single request.
-      Note that per-partition incremental syncs may not work as expected because the grouping
-      of partitions might change between syncs, potentially leading to inconsistent state tracking.
-    type: object
-    required:
-      - type
-      - group_size
-      - underlying_partition_router
-    properties:
-      type:
-        type: string
-        enum: [GroupingPartitionRouter]
-      group_size:
-        title: Group Size
-        description: The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.
-        type: integer
-        examples:
-          - 10
-          - 50
-      underlying_partition_router:
-        title: Underlying Partition Router
-        description: The partition router whose output will be grouped. This can be any valid partition router component.
-        anyOf:
-          - "$ref": "#/definitions/CustomPartitionRouter"
-          - "$ref": "#/definitions/ListPartitionRouter"
-          - "$ref": "#/definitions/SubstreamPartitionRouter"
-      deduplicate:
-        title: Deduplicate Partitions
-        description: If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.
-        type: boolean
-        default: true
-      $parameters:
-        type: object
-        additionalProperties: true
   WaitUntilTimeFromHeader:
     title: Wait Until Time Defined In Response Header
     description: Extract time at which we can retry the request from response header and wait for the difference between now and that time.
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py
CHANGED
@@ -79,7 +79,6 @@ class ConcurrentPerPartitionCursor(Cursor):
         connector_state_manager: ConnectorStateManager,
         connector_state_converter: AbstractStreamStateConverter,
         cursor_field: CursorField,
-        use_global_cursor: bool = False,
     ) -> None:
         self._global_cursor: Optional[StreamState] = {}
         self._stream_name = stream_name
@@ -107,7 +106,7 @@ class ConcurrentPerPartitionCursor(Cursor):
         self._lookback_window: int = 0
         self._parent_state: Optional[StreamState] = None
         self._number_of_partitions: int = 0
-        self._use_global_cursor: bool = use_global_cursor
+        self._use_global_cursor: bool = False
         self._partition_serializer = PerPartitionKeySerializer()
         # Track the last time a state message was emitted
         self._last_emission_time: float = 0.0
airbyte_cdk/sources/declarative/models/declarative_component_schema.py
CHANGED
@@ -1890,10 +1890,9 @@ class DeclarativeSource1(BaseModel):
     spec: Optional[Spec] = None
     concurrency_level: Optional[ConcurrencyLevel] = None
     api_budget: Optional[HTTPAPIBudget] = None
-    max_concurrent_async_job_count: Optional[Union[int, str]] = Field(
+    max_concurrent_async_job_count: Optional[int] = Field(
         None,
         description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.",
-        examples=[3, "{{ config['max_concurrent_async_job_count'] }}"],
         title="Maximum Concurrent Asynchronous Jobs",
     )
     metadata: Optional[Dict[str, Any]] = Field(
@@ -1923,10 +1922,9 @@ class DeclarativeSource2(BaseModel):
     spec: Optional[Spec] = None
     concurrency_level: Optional[ConcurrencyLevel] = None
     api_budget: Optional[HTTPAPIBudget] = None
-    max_concurrent_async_job_count: Optional[Union[int, str]] = Field(
+    max_concurrent_async_job_count: Optional[int] = Field(
         None,
         description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.",
-        examples=[3, "{{ config['max_concurrent_async_job_count'] }}"],
         title="Maximum Concurrent Asynchronous Jobs",
     )
     metadata: Optional[Dict[str, Any]] = Field(
@@ -2280,6 +2278,22 @@ class StateDelegatingStream(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
+class FileUploader(BaseModel):
+    type: Literal["FileUploader"]
+    requester: Union[CustomRequester, HttpRequester] = Field(
+        ...,
+        description="Requester component that describes how to prepare HTTP requests to send to the source API.",
+    )
+    download_target_extractor: Union[CustomRecordExtractor, DpathExtractor] = Field(
+        ...,
+        description="Responsible for fetching the url where the file is located. This is applied on each records and not on the HTTP response",
+    )
+    file_extractor: Optional[Union[CustomRecordExtractor, DpathExtractor]] = Field(
+        None,
+        description="Responsible for fetching the content of the file. If not defined, the assumption is that the whole response body is the file content",
+    )
+
+
 class SimpleRetriever(BaseModel):
     type: Literal["SimpleRetriever"]
     record_selector: RecordSelector = Field(
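
A hedged sketch of constructing the new `FileUploader` model directly. It assumes the `HttpRequester` and `DpathExtractor` models from the same module accept the fields used below (`url_base`/`path` and `field_path`), which this diff does not show:

```python
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
    DpathExtractor,
    FileUploader,
    HttpRequester,
)

uploader = FileUploader(
    type="FileUploader",
    requester=HttpRequester(
        type="HttpRequester",
        url_base="https://api.example.com",  # illustrative
        path="/files/{{ record['id'] }}",  # illustrative
    ),
    download_target_extractor=DpathExtractor(
        type="DpathExtractor",
        field_path=["download_url"],  # illustrative
    ),
)
```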
@@ -2303,21 +2317,18 @@ class SimpleRetriever(BaseModel):
             CustomPartitionRouter,
             ListPartitionRouter,
             SubstreamPartitionRouter,
-            GroupingPartitionRouter,
-            List[
-                Union[
-                    CustomPartitionRouter,
-                    ListPartitionRouter,
-                    SubstreamPartitionRouter,
-                    GroupingPartitionRouter,
-                ]
-            ],
+            List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]],
         ]
     ] = Field(
         [],
         description="PartitionRouter component that describes how to partition the stream, enabling incremental syncs and checkpointing.",
         title="Partition Router",
     )
+    file_uploader: Optional[FileUploader] = Field(
+        None,
+        description="(experimental) Describes how to fetch a file",
+        title="File Uploader",
+    )
     decoder: Optional[
         Union[
             CustomDecoder,
@@ -2393,15 +2404,7 @@ class AsyncRetriever(BaseModel):
             CustomPartitionRouter,
             ListPartitionRouter,
             SubstreamPartitionRouter,
-            GroupingPartitionRouter,
-            List[
-                Union[
-                    CustomPartitionRouter,
-                    ListPartitionRouter,
-                    SubstreamPartitionRouter,
-                    GroupingPartitionRouter,
-                ]
-            ],
+            List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]],
         ]
     ] = Field(
         [],
@@ -2453,29 +2456,6 @@ class SubstreamPartitionRouter(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
-class GroupingPartitionRouter(BaseModel):
-    type: Literal["GroupingPartitionRouter"]
-    group_size: int = Field(
-        ...,
-        description="The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.",
-        examples=[10, 50],
-        title="Group Size",
-    )
-    underlying_partition_router: Union[
-        CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter
-    ] = Field(
-        ...,
-        description="The partition router whose output will be grouped. This can be any valid partition router component.",
-        title="Underlying Partition Router",
-    )
-    deduplicate: Optional[bool] = Field(
-        True,
-        description="If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.",
-        title="Deduplicate Partitions",
-    )
-    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
-
-
 class HttpComponentsResolver(BaseModel):
     type: Literal["HttpComponentsResolver"]
     retriever: Union[AsyncRetriever, CustomRetriever, SimpleRetriever] = Field(
|