airbyte-cdk 6.41.5__py3-none-any.whl → 6.41.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +22 -6
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +64 -12
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +2 -1
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +45 -4
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +40 -0
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +4 -0
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +150 -0
- {airbyte_cdk-6.41.5.dist-info → airbyte_cdk-6.41.7.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.41.5.dist-info → airbyte_cdk-6.41.7.dist-info}/RECORD +13 -12
- {airbyte_cdk-6.41.5.dist-info → airbyte_cdk-6.41.7.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.41.5.dist-info → airbyte_cdk-6.41.7.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.41.5.dist-info → airbyte_cdk-6.41.7.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.41.5.dist-info → airbyte_cdk-6.41.7.dist-info}/entry_points.txt +0 -0
@@ -3,9 +3,11 @@
|
|
3
3
|
import logging
|
4
4
|
import threading
|
5
5
|
import uuid
|
6
|
-
from
|
6
|
+
from dataclasses import dataclass, field
|
7
|
+
from typing import Any, Mapping, Set, Union
|
7
8
|
|
8
9
|
from airbyte_cdk.logger import lazy_log
|
10
|
+
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
|
9
11
|
|
10
12
|
LOGGER = logging.getLogger("airbyte")
|
11
13
|
|
@@ -14,15 +16,29 @@ class ConcurrentJobLimitReached(Exception):
|
|
14
16
|
pass
|
15
17
|
|
16
18
|
|
19
|
+
@dataclass
|
17
20
|
class JobTracker:
|
18
|
-
|
21
|
+
limit: Union[int, str]
|
22
|
+
config: Mapping[str, Any] = field(default_factory=dict)
|
23
|
+
|
24
|
+
def __post_init__(self) -> None:
|
19
25
|
self._jobs: Set[str] = set()
|
20
|
-
|
26
|
+
self._lock = threading.Lock()
|
27
|
+
if isinstance(self.limit, str):
|
28
|
+
try:
|
29
|
+
self.limit = int(
|
30
|
+
InterpolatedString(self.limit, parameters={}).eval(config=self.config)
|
31
|
+
)
|
32
|
+
except Exception as e:
|
33
|
+
LOGGER.warning(
|
34
|
+
f"Error interpolating max job count: {self.limit}. Setting to 1. {e}"
|
35
|
+
)
|
36
|
+
self.limit = 1
|
37
|
+
if self.limit < 1:
|
21
38
|
LOGGER.warning(
|
22
|
-
f"The `max_concurrent_async_job_count` property is less than 1: {limit}. Setting to 1. Please update the source manifest to set a valid value."
|
39
|
+
f"The `max_concurrent_async_job_count` property is less than 1: {self.limit}. Setting to 1. Please update the source manifest to set a valid value."
|
23
40
|
)
|
24
|
-
self._limit =
|
25
|
-
self._lock = threading.Lock()
|
41
|
+
self._limit = self.limit if self.limit >= 1 else 1
|
26
42
|
|
27
43
|
def try_to_get_intent(self) -> str:
|
28
44
|
lazy_log(
|
@@ -47,7 +47,12 @@ properties:
|
|
47
47
|
max_concurrent_async_job_count:
|
48
48
|
title: Maximum Concurrent Asynchronous Jobs
|
49
49
|
description: Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.
|
50
|
-
type:
|
50
|
+
type:
|
51
|
+
- integer
|
52
|
+
- string
|
53
|
+
examples:
|
54
|
+
- 3
|
55
|
+
- "{{ config['max_concurrent_async_job_count'] }}"
|
51
56
|
metadata:
|
52
57
|
type: object
|
53
58
|
description: For internal Airbyte use only - DO NOT modify manually. Used by consumers of declarative manifests for storing related metadata.
|
@@ -2192,7 +2197,8 @@ definitions:
|
|
2192
2197
|
type: object
|
2193
2198
|
additionalProperties: true
|
2194
2199
|
JsonDecoder:
|
2195
|
-
title:
|
2200
|
+
title: JSON
|
2201
|
+
description: Select 'JSON' if the response is formatted as a JSON object.
|
2196
2202
|
type: object
|
2197
2203
|
required:
|
2198
2204
|
- type
|
@@ -2201,8 +2207,8 @@ definitions:
|
|
2201
2207
|
type: string
|
2202
2208
|
enum: [JsonDecoder]
|
2203
2209
|
JsonlDecoder:
|
2204
|
-
title:
|
2205
|
-
description:
|
2210
|
+
title: JSON Lines
|
2211
|
+
description: Select 'JSON Lines' if the response consists of JSON objects separated by new lines ('\n') in JSONL format.
|
2206
2212
|
type: object
|
2207
2213
|
required:
|
2208
2214
|
- type
|
@@ -2327,8 +2333,8 @@ definitions:
|
|
2327
2333
|
type: object
|
2328
2334
|
additionalProperties: true
|
2329
2335
|
IterableDecoder:
|
2330
|
-
title: Iterable
|
2331
|
-
description:
|
2336
|
+
title: Iterable
|
2337
|
+
description: Select 'Iterable' if the response consists of strings separated by new lines (`\n`). The string will then be wrapped into a JSON object with the `record` key.
|
2332
2338
|
type: object
|
2333
2339
|
required:
|
2334
2340
|
- type
|
@@ -2337,8 +2343,8 @@ definitions:
|
|
2337
2343
|
type: string
|
2338
2344
|
enum: [IterableDecoder]
|
2339
2345
|
XmlDecoder:
|
2340
|
-
title: XML
|
2341
|
-
description:
|
2346
|
+
title: XML
|
2347
|
+
description: Select 'XML' if the response consists of XML-formatted data.
|
2342
2348
|
type: object
|
2343
2349
|
required:
|
2344
2350
|
- type
|
@@ -2369,8 +2375,8 @@ definitions:
|
|
2369
2375
|
type: object
|
2370
2376
|
additionalProperties: true
|
2371
2377
|
ZipfileDecoder:
|
2372
|
-
title:
|
2373
|
-
description:
|
2378
|
+
title: ZIP File
|
2379
|
+
description: Select 'ZIP file' for response data that is returned as a zipfile. Requires specifying an inner data type/decoder to parse the unzipped data.
|
2374
2380
|
type: object
|
2375
2381
|
additionalProperties: true
|
2376
2382
|
required:
|
@@ -2894,7 +2900,7 @@ definitions:
|
|
2894
2900
|
title: Lazy Read Pointer
|
2895
2901
|
description: If set, this will enable lazy reading, using the initial read of parent records to extract child records.
|
2896
2902
|
type: array
|
2897
|
-
default: [
|
2903
|
+
default: []
|
2898
2904
|
items:
|
2899
2905
|
- type: string
|
2900
2906
|
interpolation_context:
|
@@ -3199,7 +3205,7 @@ definitions:
|
|
3199
3205
|
properties:
|
3200
3206
|
type:
|
3201
3207
|
type: string
|
3202
|
-
enum: [
|
3208
|
+
enum: [StateDelegatingStream]
|
3203
3209
|
name:
|
3204
3210
|
title: Name
|
3205
3211
|
description: The stream name.
|
@@ -3254,12 +3260,14 @@ definitions:
|
|
3254
3260
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3255
3261
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3256
3262
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3263
|
+
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3257
3264
|
- type: array
|
3258
3265
|
items:
|
3259
3266
|
anyOf:
|
3260
3267
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3261
3268
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3262
3269
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3270
|
+
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3263
3271
|
decoder:
|
3264
3272
|
title: Decoder
|
3265
3273
|
description: Component decoding the response so records can be extracted.
|
@@ -3276,6 +3284,8 @@ definitions:
|
|
3276
3284
|
type: object
|
3277
3285
|
additionalProperties: true
|
3278
3286
|
GzipDecoder:
|
3287
|
+
title: gzip
|
3288
|
+
description: Select 'gzip' for response data that is compressed with gzip. Requires specifying an inner data type/decoder to parse the decompressed data.
|
3279
3289
|
type: object
|
3280
3290
|
required:
|
3281
3291
|
- type
|
@@ -3291,6 +3301,8 @@ definitions:
|
|
3291
3301
|
- "$ref": "#/definitions/JsonDecoder"
|
3292
3302
|
- "$ref": "#/definitions/JsonlDecoder"
|
3293
3303
|
CsvDecoder:
|
3304
|
+
title: CSV
|
3305
|
+
description: "Select 'CSV' for response data that is formatted as CSV (comma-separated values). Can specify an encoding (default: 'utf-8') and a delimiter (default: ',')."
|
3294
3306
|
type: object
|
3295
3307
|
required:
|
3296
3308
|
- type
|
@@ -3421,12 +3433,14 @@ definitions:
|
|
3421
3433
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3422
3434
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3423
3435
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3436
|
+
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3424
3437
|
- type: array
|
3425
3438
|
items:
|
3426
3439
|
anyOf:
|
3427
3440
|
- "$ref": "#/definitions/CustomPartitionRouter"
|
3428
3441
|
- "$ref": "#/definitions/ListPartitionRouter"
|
3429
3442
|
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3443
|
+
- "$ref": "#/definitions/GroupingPartitionRouter"
|
3430
3444
|
decoder:
|
3431
3445
|
title: Decoder
|
3432
3446
|
description: Component decoding the response so records can be extracted.
|
@@ -3543,6 +3557,44 @@ definitions:
|
|
3543
3557
|
$parameters:
|
3544
3558
|
type: object
|
3545
3559
|
additionalProperties: true
|
3560
|
+
GroupingPartitionRouter:
|
3561
|
+
title: Grouping Partition Router
|
3562
|
+
description: >
|
3563
|
+
A decorator on top of a partition router that groups partitions into batches of a specified size.
|
3564
|
+
This is useful for APIs that support filtering by multiple partition keys in a single request.
|
3565
|
+
Note that per-partition incremental syncs may not work as expected because the grouping
|
3566
|
+
of partitions might change between syncs, potentially leading to inconsistent state tracking.
|
3567
|
+
type: object
|
3568
|
+
required:
|
3569
|
+
- type
|
3570
|
+
- group_size
|
3571
|
+
- underlying_partition_router
|
3572
|
+
properties:
|
3573
|
+
type:
|
3574
|
+
type: string
|
3575
|
+
enum: [GroupingPartitionRouter]
|
3576
|
+
group_size:
|
3577
|
+
title: Group Size
|
3578
|
+
description: The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.
|
3579
|
+
type: integer
|
3580
|
+
examples:
|
3581
|
+
- 10
|
3582
|
+
- 50
|
3583
|
+
underlying_partition_router:
|
3584
|
+
title: Underlying Partition Router
|
3585
|
+
description: The partition router whose output will be grouped. This can be any valid partition router component.
|
3586
|
+
anyOf:
|
3587
|
+
- "$ref": "#/definitions/CustomPartitionRouter"
|
3588
|
+
- "$ref": "#/definitions/ListPartitionRouter"
|
3589
|
+
- "$ref": "#/definitions/SubstreamPartitionRouter"
|
3590
|
+
deduplicate:
|
3591
|
+
title: Deduplicate Partitions
|
3592
|
+
description: If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.
|
3593
|
+
type: boolean
|
3594
|
+
default: true
|
3595
|
+
$parameters:
|
3596
|
+
type: object
|
3597
|
+
additionalProperties: true
|
3546
3598
|
WaitUntilTimeFromHeader:
|
3547
3599
|
title: Wait Until Time Defined In Response Header
|
3548
3600
|
description: Extract time at which we can retry the request from response header and wait for the difference between now and that time.
|
@@ -79,6 +79,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
79
79
|
connector_state_manager: ConnectorStateManager,
|
80
80
|
connector_state_converter: AbstractStreamStateConverter,
|
81
81
|
cursor_field: CursorField,
|
82
|
+
use_global_cursor: bool = False,
|
82
83
|
) -> None:
|
83
84
|
self._global_cursor: Optional[StreamState] = {}
|
84
85
|
self._stream_name = stream_name
|
@@ -106,7 +107,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
106
107
|
self._lookback_window: int = 0
|
107
108
|
self._parent_state: Optional[StreamState] = None
|
108
109
|
self._number_of_partitions: int = 0
|
109
|
-
self._use_global_cursor: bool =
|
110
|
+
self._use_global_cursor: bool = use_global_cursor
|
110
111
|
self._partition_serializer = PerPartitionKeySerializer()
|
111
112
|
# Track the last time a state message was emitted
|
112
113
|
self._last_emission_time: float = 0.0
|
@@ -1890,9 +1890,10 @@ class DeclarativeSource1(BaseModel):
|
|
1890
1890
|
spec: Optional[Spec] = None
|
1891
1891
|
concurrency_level: Optional[ConcurrencyLevel] = None
|
1892
1892
|
api_budget: Optional[HTTPAPIBudget] = None
|
1893
|
-
max_concurrent_async_job_count: Optional[int] = Field(
|
1893
|
+
max_concurrent_async_job_count: Optional[Union[int, str]] = Field(
|
1894
1894
|
None,
|
1895
1895
|
description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.",
|
1896
|
+
examples=[3, "{{ config['max_concurrent_async_job_count'] }}"],
|
1896
1897
|
title="Maximum Concurrent Asynchronous Jobs",
|
1897
1898
|
)
|
1898
1899
|
metadata: Optional[Dict[str, Any]] = Field(
|
@@ -1922,9 +1923,10 @@ class DeclarativeSource2(BaseModel):
|
|
1922
1923
|
spec: Optional[Spec] = None
|
1923
1924
|
concurrency_level: Optional[ConcurrencyLevel] = None
|
1924
1925
|
api_budget: Optional[HTTPAPIBudget] = None
|
1925
|
-
max_concurrent_async_job_count: Optional[int] = Field(
|
1926
|
+
max_concurrent_async_job_count: Optional[Union[int, str]] = Field(
|
1926
1927
|
None,
|
1927
1928
|
description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.",
|
1929
|
+
examples=[3, "{{ config['max_concurrent_async_job_count'] }}"],
|
1928
1930
|
title="Maximum Concurrent Asynchronous Jobs",
|
1929
1931
|
)
|
1930
1932
|
metadata: Optional[Dict[str, Any]] = Field(
|
@@ -2301,7 +2303,15 @@ class SimpleRetriever(BaseModel):
|
|
2301
2303
|
CustomPartitionRouter,
|
2302
2304
|
ListPartitionRouter,
|
2303
2305
|
SubstreamPartitionRouter,
|
2304
|
-
|
2306
|
+
GroupingPartitionRouter,
|
2307
|
+
List[
|
2308
|
+
Union[
|
2309
|
+
CustomPartitionRouter,
|
2310
|
+
ListPartitionRouter,
|
2311
|
+
SubstreamPartitionRouter,
|
2312
|
+
GroupingPartitionRouter,
|
2313
|
+
]
|
2314
|
+
],
|
2305
2315
|
]
|
2306
2316
|
] = Field(
|
2307
2317
|
[],
|
@@ -2383,7 +2393,15 @@ class AsyncRetriever(BaseModel):
|
|
2383
2393
|
CustomPartitionRouter,
|
2384
2394
|
ListPartitionRouter,
|
2385
2395
|
SubstreamPartitionRouter,
|
2386
|
-
|
2396
|
+
GroupingPartitionRouter,
|
2397
|
+
List[
|
2398
|
+
Union[
|
2399
|
+
CustomPartitionRouter,
|
2400
|
+
ListPartitionRouter,
|
2401
|
+
SubstreamPartitionRouter,
|
2402
|
+
GroupingPartitionRouter,
|
2403
|
+
]
|
2404
|
+
],
|
2387
2405
|
]
|
2388
2406
|
] = Field(
|
2389
2407
|
[],
|
@@ -2435,6 +2453,29 @@ class SubstreamPartitionRouter(BaseModel):
|
|
2435
2453
|
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
2436
2454
|
|
2437
2455
|
|
2456
|
+
class GroupingPartitionRouter(BaseModel):
|
2457
|
+
type: Literal["GroupingPartitionRouter"]
|
2458
|
+
group_size: int = Field(
|
2459
|
+
...,
|
2460
|
+
description="The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.",
|
2461
|
+
examples=[10, 50],
|
2462
|
+
title="Group Size",
|
2463
|
+
)
|
2464
|
+
underlying_partition_router: Union[
|
2465
|
+
CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter
|
2466
|
+
] = Field(
|
2467
|
+
...,
|
2468
|
+
description="The partition router whose output will be grouped. This can be any valid partition router component.",
|
2469
|
+
title="Underlying Partition Router",
|
2470
|
+
)
|
2471
|
+
deduplicate: Optional[bool] = Field(
|
2472
|
+
True,
|
2473
|
+
description="If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.",
|
2474
|
+
title="Deduplicate Partitions",
|
2475
|
+
)
|
2476
|
+
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
2477
|
+
|
2478
|
+
|
2438
2479
|
class HttpComponentsResolver(BaseModel):
|
2439
2480
|
type: Literal["HttpComponentsResolver"]
|
2440
2481
|
retriever: Union[AsyncRetriever, CustomRetriever, SimpleRetriever] = Field(
|
@@ -227,6 +227,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
|
|
227
227
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
228
228
|
FlattenFields as FlattenFieldsModel,
|
229
229
|
)
|
230
|
+
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
231
|
+
GroupingPartitionRouter as GroupingPartitionRouterModel,
|
232
|
+
)
|
230
233
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
231
234
|
GzipDecoder as GzipDecoderModel,
|
232
235
|
)
|
@@ -385,6 +388,7 @@ from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
|
|
385
388
|
)
|
386
389
|
from airbyte_cdk.sources.declarative.partition_routers import (
|
387
390
|
CartesianProductStreamSlicer,
|
391
|
+
GroupingPartitionRouter,
|
388
392
|
ListPartitionRouter,
|
389
393
|
PartitionRouter,
|
390
394
|
SinglePartitionRouter,
|
@@ -638,6 +642,7 @@ class ModelToComponentFactory:
|
|
638
642
|
UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
|
639
643
|
RateModel: self.create_rate,
|
640
644
|
HttpRequestRegexMatcherModel: self.create_http_request_matcher,
|
645
|
+
GroupingPartitionRouterModel: self.create_grouping_partition_router,
|
641
646
|
}
|
642
647
|
|
643
648
|
# Needed for the case where we need to perform a second parse on the fields of a custom component
|
@@ -1355,6 +1360,9 @@ class ModelToComponentFactory:
|
|
1355
1360
|
)
|
1356
1361
|
stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state)
|
1357
1362
|
|
1363
|
+
# Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
|
1364
|
+
use_global_cursor = isinstance(partition_router, GroupingPartitionRouter)
|
1365
|
+
|
1358
1366
|
# Return the concurrent cursor and state converter
|
1359
1367
|
return ConcurrentPerPartitionCursor(
|
1360
1368
|
cursor_factory=cursor_factory,
|
@@ -1366,6 +1374,7 @@ class ModelToComponentFactory:
|
|
1366
1374
|
connector_state_manager=state_manager,
|
1367
1375
|
connector_state_converter=connector_state_converter,
|
1368
1376
|
cursor_field=cursor_field,
|
1377
|
+
use_global_cursor=use_global_cursor,
|
1369
1378
|
)
|
1370
1379
|
|
1371
1380
|
@staticmethod
|
@@ -3370,3 +3379,34 @@ class ModelToComponentFactory:
|
|
3370
3379
|
self._api_budget = self.create_component(
|
3371
3380
|
model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config
|
3372
3381
|
)
|
3382
|
+
|
3383
|
+
def create_grouping_partition_router(
|
3384
|
+
self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any
|
3385
|
+
) -> GroupingPartitionRouter:
|
3386
|
+
underlying_router = self._create_component_from_model(
|
3387
|
+
model=model.underlying_partition_router, config=config
|
3388
|
+
)
|
3389
|
+
if model.group_size < 1:
|
3390
|
+
raise ValueError(f"Group size must be greater than 0, got {model.group_size}")
|
3391
|
+
|
3392
|
+
# Request options in underlying partition routers are not supported for GroupingPartitionRouter
|
3393
|
+
# because they are specific to individual partitions and cannot be aggregated or handled
|
3394
|
+
# when grouping, potentially leading to incorrect API calls. Any request customization
|
3395
|
+
# should be managed at the stream level through the requester's configuration.
|
3396
|
+
if isinstance(underlying_router, SubstreamPartitionRouter):
|
3397
|
+
if any(
|
3398
|
+
parent_config.request_option
|
3399
|
+
for parent_config in underlying_router.parent_stream_configs
|
3400
|
+
):
|
3401
|
+
raise ValueError("Request options are not supported for GroupingPartitionRouter.")
|
3402
|
+
|
3403
|
+
if isinstance(underlying_router, ListPartitionRouter):
|
3404
|
+
if underlying_router.request_option:
|
3405
|
+
raise ValueError("Request options are not supported for GroupingPartitionRouter.")
|
3406
|
+
|
3407
|
+
return GroupingPartitionRouter(
|
3408
|
+
group_size=model.group_size,
|
3409
|
+
underlying_partition_router=underlying_router,
|
3410
|
+
deduplicate=model.deduplicate if model.deduplicate is not None else True,
|
3411
|
+
config=config,
|
3412
|
+
)
|
@@ -8,6 +8,9 @@ from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_route
|
|
8
8
|
from airbyte_cdk.sources.declarative.partition_routers.cartesian_product_stream_slicer import (
|
9
9
|
CartesianProductStreamSlicer,
|
10
10
|
)
|
11
|
+
from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import (
|
12
|
+
GroupingPartitionRouter,
|
13
|
+
)
|
11
14
|
from airbyte_cdk.sources.declarative.partition_routers.list_partition_router import (
|
12
15
|
ListPartitionRouter,
|
13
16
|
)
|
@@ -22,6 +25,7 @@ from airbyte_cdk.sources.declarative.partition_routers.substream_partition_route
|
|
22
25
|
__all__ = [
|
23
26
|
"AsyncJobPartitionRouter",
|
24
27
|
"CartesianProductStreamSlicer",
|
28
|
+
"GroupingPartitionRouter",
|
25
29
|
"ListPartitionRouter",
|
26
30
|
"SinglePartitionRouter",
|
27
31
|
"SubstreamPartitionRouter",
|
@@ -0,0 +1,150 @@
|
|
1
|
+
#
|
2
|
+
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
|
+
#
|
4
|
+
|
5
|
+
from dataclasses import dataclass
|
6
|
+
from typing import Any, Iterable, Mapping, Optional
|
7
|
+
|
8
|
+
from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
|
9
|
+
from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
|
10
|
+
|
11
|
+
|
12
|
+
@dataclass
|
13
|
+
class GroupingPartitionRouter(PartitionRouter):
|
14
|
+
"""
|
15
|
+
A partition router that groups partitions from an underlying partition router into batches of a specified size.
|
16
|
+
This is useful for APIs that support filtering by multiple partition keys in a single request.
|
17
|
+
|
18
|
+
Attributes:
|
19
|
+
group_size (int): The number of partitions to include in each group.
|
20
|
+
underlying_partition_router (PartitionRouter): The partition router whose output will be grouped.
|
21
|
+
deduplicate (bool): If True, ensures unique partitions within each group by removing duplicates based on the partition key.
|
22
|
+
config (Config): The connector configuration.
|
23
|
+
parameters (Mapping[str, Any]): Additional parameters for interpolation and configuration.
|
24
|
+
"""
|
25
|
+
|
26
|
+
group_size: int
|
27
|
+
underlying_partition_router: PartitionRouter
|
28
|
+
config: Config
|
29
|
+
deduplicate: bool = True
|
30
|
+
|
31
|
+
def __post_init__(self) -> None:
|
32
|
+
self._state: Optional[Mapping[str, StreamState]] = {}
|
33
|
+
|
34
|
+
def stream_slices(self) -> Iterable[StreamSlice]:
|
35
|
+
"""
|
36
|
+
Lazily groups partitions from the underlying partition router into batches of size `group_size`.
|
37
|
+
|
38
|
+
This method processes partitions one at a time from the underlying router, maintaining a batch buffer.
|
39
|
+
When the buffer reaches `group_size` or the underlying router is exhausted, it yields a grouped slice.
|
40
|
+
If deduplication is enabled, it tracks seen partition keys to ensure uniqueness within the current batch.
|
41
|
+
|
42
|
+
Yields:
|
43
|
+
Iterable[StreamSlice]: An iterable of StreamSlice objects, where each slice contains a batch of partition values.
|
44
|
+
"""
|
45
|
+
batch = []
|
46
|
+
seen_keys = set()
|
47
|
+
|
48
|
+
# Iterate over partitions lazily from the underlying router
|
49
|
+
for partition in self.underlying_partition_router.stream_slices():
|
50
|
+
# Extract the partition key (assuming single key-value pair, e.g., {"board_ids": value})
|
51
|
+
partition_keys = list(partition.partition.keys())
|
52
|
+
# skip parent_slice as it is part of SubstreamPartitionRouter partition
|
53
|
+
if "parent_slice" in partition_keys:
|
54
|
+
partition_keys.remove("parent_slice")
|
55
|
+
if len(partition_keys) != 1:
|
56
|
+
raise ValueError(
|
57
|
+
f"GroupingPartitionRouter expects a single partition key-value pair. Got {partition.partition}"
|
58
|
+
)
|
59
|
+
key = partition.partition[partition_keys[0]]
|
60
|
+
|
61
|
+
# Skip duplicates if deduplication is enabled
|
62
|
+
if self.deduplicate and key in seen_keys:
|
63
|
+
continue
|
64
|
+
|
65
|
+
# Add partition to the batch
|
66
|
+
batch.append(partition)
|
67
|
+
if self.deduplicate:
|
68
|
+
seen_keys.add(key)
|
69
|
+
|
70
|
+
# Yield the batch when it reaches the group_size
|
71
|
+
if len(batch) == self.group_size:
|
72
|
+
self._state = self.underlying_partition_router.get_stream_state()
|
73
|
+
yield self._create_grouped_slice(batch)
|
74
|
+
batch = [] # Reset the batch
|
75
|
+
|
76
|
+
self._state = self.underlying_partition_router.get_stream_state()
|
77
|
+
# Yield any remaining partitions if the batch isn't empty
|
78
|
+
if batch:
|
79
|
+
yield self._create_grouped_slice(batch)
|
80
|
+
|
81
|
+
def _create_grouped_slice(self, batch: list[StreamSlice]) -> StreamSlice:
|
82
|
+
"""
|
83
|
+
Creates a grouped StreamSlice from a batch of partitions, aggregating extra fields into a dictionary with list values.
|
84
|
+
|
85
|
+
Args:
|
86
|
+
batch (list[StreamSlice]): A list of StreamSlice objects to group.
|
87
|
+
|
88
|
+
Returns:
|
89
|
+
StreamSlice: A single StreamSlice with combined partition and extra field values.
|
90
|
+
"""
|
91
|
+
# Combine partition values into a single dict with lists
|
92
|
+
grouped_partition = {
|
93
|
+
key: [p.partition.get(key) for p in batch] for key in batch[0].partition.keys()
|
94
|
+
}
|
95
|
+
|
96
|
+
# Aggregate extra fields into a dict with list values
|
97
|
+
extra_fields_dict = (
|
98
|
+
{
|
99
|
+
key: [p.extra_fields.get(key) for p in batch]
|
100
|
+
for key in set().union(*(p.extra_fields.keys() for p in batch if p.extra_fields))
|
101
|
+
}
|
102
|
+
if any(p.extra_fields for p in batch)
|
103
|
+
else {}
|
104
|
+
)
|
105
|
+
return StreamSlice(
|
106
|
+
partition=grouped_partition,
|
107
|
+
cursor_slice={}, # Cursor is managed by the underlying router or incremental sync
|
108
|
+
extra_fields=extra_fields_dict,
|
109
|
+
)
|
110
|
+
|
111
|
+
def get_request_params(
|
112
|
+
self,
|
113
|
+
stream_state: Optional[StreamState] = None,
|
114
|
+
stream_slice: Optional[StreamSlice] = None,
|
115
|
+
next_page_token: Optional[Mapping[str, Any]] = None,
|
116
|
+
) -> Mapping[str, Any]:
|
117
|
+
return {}
|
118
|
+
|
119
|
+
def get_request_headers(
|
120
|
+
self,
|
121
|
+
stream_state: Optional[StreamState] = None,
|
122
|
+
stream_slice: Optional[StreamSlice] = None,
|
123
|
+
next_page_token: Optional[Mapping[str, Any]] = None,
|
124
|
+
) -> Mapping[str, Any]:
|
125
|
+
return {}
|
126
|
+
|
127
|
+
def get_request_body_data(
|
128
|
+
self,
|
129
|
+
stream_state: Optional[StreamState] = None,
|
130
|
+
stream_slice: Optional[StreamSlice] = None,
|
131
|
+
next_page_token: Optional[Mapping[str, Any]] = None,
|
132
|
+
) -> Mapping[str, Any]:
|
133
|
+
return {}
|
134
|
+
|
135
|
+
def get_request_body_json(
|
136
|
+
self,
|
137
|
+
stream_state: Optional[StreamState] = None,
|
138
|
+
stream_slice: Optional[StreamSlice] = None,
|
139
|
+
next_page_token: Optional[Mapping[str, Any]] = None,
|
140
|
+
) -> Mapping[str, Any]:
|
141
|
+
return {}
|
142
|
+
|
143
|
+
def set_initial_state(self, stream_state: StreamState) -> None:
|
144
|
+
"""Delegate state initialization to the underlying partition router."""
|
145
|
+
self.underlying_partition_router.set_initial_state(stream_state)
|
146
|
+
self._state = self.underlying_partition_router.get_stream_state()
|
147
|
+
|
148
|
+
def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
|
149
|
+
"""Delegate state retrieval to the underlying partition router."""
|
150
|
+
return self._state
|
@@ -50,7 +50,7 @@ airbyte_cdk/sources/declarative/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4G
|
|
50
50
|
airbyte_cdk/sources/declarative/async_job/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
51
51
|
airbyte_cdk/sources/declarative/async_job/job.py,sha256=aR5UZAkNUYA1I1zjUMAcvdzCFL3lXXOllkFmlhEKgkc,2001
|
52
52
|
airbyte_cdk/sources/declarative/async_job/job_orchestrator.py,sha256=tcHvB5QdBnx4XQmFvr4Swdq2DLRPst5w5M-OIJHnp5c,22034
|
53
|
-
airbyte_cdk/sources/declarative/async_job/job_tracker.py,sha256=
|
53
|
+
airbyte_cdk/sources/declarative/async_job/job_tracker.py,sha256=JowKzdT4E6IeE1cYIf4mOtB6sVEJoCeSsfzaFi9ghQ8,3231
|
54
54
|
airbyte_cdk/sources/declarative/async_job/repository.py,sha256=2OkWiZp5IKTOi_SIpP1U-Rw3gH36LBy_a8CgXoENTtg,1044
|
55
55
|
airbyte_cdk/sources/declarative/async_job/status.py,sha256=mkExR-uOAO1ckUnclaUOa74l2N9CdhLbVFM6KDoBgBM,715
|
56
56
|
airbyte_cdk/sources/declarative/async_job/timer.py,sha256=Fb8P72CQ7jIzJyzMSSNuBf2vt8bmrg9SrfmNxKwph2A,1242
|
@@ -71,7 +71,7 @@ airbyte_cdk/sources/declarative/concurrent_declarative_source.py,sha256=uhy0dRkA
|
|
71
71
|
airbyte_cdk/sources/declarative/datetime/__init__.py,sha256=4Hw-PX1-VgESLF16cDdvuYCzGJtHntThLF4qIiULWeo,61
|
72
72
|
airbyte_cdk/sources/declarative/datetime/datetime_parser.py,sha256=_zGNGq31RNy_0QBLt_EcTvgPyhj7urPdx6oA3M5-r3o,3150
|
73
73
|
airbyte_cdk/sources/declarative/datetime/min_max_datetime.py,sha256=0BHBtDNQZfvwM45-tY5pNlTcKAFSGGNxemoi0Jic-0E,5785
|
74
|
-
airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=
|
74
|
+
airbyte_cdk/sources/declarative/declarative_component_schema.yaml,sha256=Nvtm2TXNpzJ3sUEgqvnp7GnGUwM4yWgyh2D1F863cUU,152972
|
75
75
|
airbyte_cdk/sources/declarative/declarative_source.py,sha256=nF7wBqFd3AQmEKAm4CnIo29CJoQL562cJGSCeL8U8bA,1531
|
76
76
|
airbyte_cdk/sources/declarative/declarative_stream.py,sha256=dCRlddBUSaJmBNBz1pSO1r2rTw8AP5d2_vlmIeGs2gg,10767
|
77
77
|
airbyte_cdk/sources/declarative/decoders/__init__.py,sha256=JHb_0d3SE6kNY10mxA5YBEKPeSbsWYjByq1gUQxepoE,953
|
@@ -93,7 +93,7 @@ airbyte_cdk/sources/declarative/extractors/record_selector.py,sha256=HCqx7IyENM_
|
|
93
93
|
airbyte_cdk/sources/declarative/extractors/response_to_file_extractor.py,sha256=WJyA2OYIEgFpVP5Y3o0tIj69AV6IKkn9B16MeXaEItI,6513
|
94
94
|
airbyte_cdk/sources/declarative/extractors/type_transformer.py,sha256=d6Y2Rfg8pMVEEnHllfVksWZdNVOU55yk34O03dP9muY,1626
|
95
95
|
airbyte_cdk/sources/declarative/incremental/__init__.py,sha256=U1oZKtBaEC6IACmvziY9Wzg7Z8EgF4ZuR7NwvjlB_Sk,1255
|
96
|
-
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py,sha256=
|
96
|
+
airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py,sha256=Mrx5XY6G8ZT-imsjUggpjzWo-Po_Wvi1WpylEW1ohIQ,22263
|
97
97
|
airbyte_cdk/sources/declarative/incremental/datetime_based_cursor.py,sha256=Rbe6lJLTtZ5en33MwZiB9-H9-AwDMNHgwBZs8EqhYqk,22172
|
98
98
|
airbyte_cdk/sources/declarative/incremental/declarative_cursor.py,sha256=5Bhw9VRPyIuCaD0wmmq_L3DZsa-rJgtKSEUzSd8YYD0,536
|
99
99
|
airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py,sha256=2tsE6FgXzemf4fZZ4uGtd8QpRBl9GJ2CRqSNJE5p0EI,16077
|
@@ -114,16 +114,17 @@ airbyte_cdk/sources/declarative/migrations/__init__.py,sha256=47DEQpj8HBSa-_TImW
|
|
114
114
|
airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py,sha256=iemy3fKLczcU0-Aor7tx5jcT6DRedKMqyK7kCOp01hg,3924
|
115
115
|
airbyte_cdk/sources/declarative/migrations/state_migration.py,sha256=KWPjealMLKSMtajXgkdGgKg7EmTLR-CqqD7UIh0-eDU,794
|
116
116
|
airbyte_cdk/sources/declarative/models/__init__.py,sha256=nUFxNCiKeYRVXuZEKA7GD-lTHxsiKcQ8FitZjKhPIvE,100
|
117
|
-
airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=
|
117
|
+
airbyte_cdk/sources/declarative/models/declarative_component_schema.py,sha256=8ljLhODbBlMxacuKZzlWwmNeGkb6oqfKBVZA70Cg7gE,108248
|
118
118
|
airbyte_cdk/sources/declarative/parsers/__init__.py,sha256=ZnqYNxHsKCgO38IwB34RQyRMXTs4GTvlRi3ImKnIioo,61
|
119
119
|
airbyte_cdk/sources/declarative/parsers/custom_code_compiler.py,sha256=nlVvHC511NUyDEEIRBkoeDTAvLqKNp-hRy8D19z8tdk,5941
|
120
120
|
airbyte_cdk/sources/declarative/parsers/custom_exceptions.py,sha256=Rir9_z3Kcd5Es0-LChrzk-0qubAsiK_RSEnLmK2OXm8,553
|
121
121
|
airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py,sha256=CXwTfD3wSQq3okcqwigpprbHhSURUokh4GK2OmOyKC8,9132
|
122
122
|
airbyte_cdk/sources/declarative/parsers/manifest_reference_resolver.py,sha256=IWUOdF03o-aQn0Occo1BJCxU0Pz-QILk5L67nzw2thw,6803
|
123
|
-
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=
|
124
|
-
airbyte_cdk/sources/declarative/partition_routers/__init__.py,sha256=
|
123
|
+
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py,sha256=APJkP5dmDU4aIaj7w3quGjrP1cV3MMp2gxbTckhOVRA,149720
|
124
|
+
airbyte_cdk/sources/declarative/partition_routers/__init__.py,sha256=TBC9AkGaUqHm2IKHMPN6punBIcY5tWGULowcLoAVkfw,1109
|
125
125
|
airbyte_cdk/sources/declarative/partition_routers/async_job_partition_router.py,sha256=VelO7zKqKtzMJ35jyFeg0ypJLQC0plqqIBNXoBW1G2E,3001
|
126
126
|
airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py,sha256=c5cuVFM6NFkuQqG8Z5IwkBuwDrvXZN1CunUOM_L0ezg,6892
|
127
|
+
airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py,sha256=-W1CAg2NayCMDNj7QLWn7Nqipaz7av9sLjbMnyMGUek,6271
|
127
128
|
airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py,sha256=tmGGpMoOBmaMfhVZq53AEWxoHm2lmNVi6hA2_IVEnAA,4882
|
128
129
|
airbyte_cdk/sources/declarative/partition_routers/partition_router.py,sha256=YyEIzdmLd1FjbVP3QbQ2VFCLW_P-OGbVh6VpZShp54k,2218
|
129
130
|
airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py,sha256=SKzKjSyfccq4dxGIh-J6ejrgkCHzaiTIazmbmeQiRD4,1942
|
@@ -358,9 +359,9 @@ airbyte_cdk/utils/slice_hasher.py,sha256=EDxgROHDbfG-QKQb59m7h_7crN1tRiawdf5uU7G
|
|
358
359
|
airbyte_cdk/utils/spec_schema_transformations.py,sha256=-5HTuNsnDBAhj-oLeQXwpTGA0HdcjFOf2zTEMUTTg_Y,816
|
359
360
|
airbyte_cdk/utils/stream_status_utils.py,sha256=ZmBoiy5HVbUEHAMrUONxZvxnvfV9CesmQJLDTAIWnWw,1171
|
360
361
|
airbyte_cdk/utils/traced_exception.py,sha256=C8uIBuCL_E4WnBAOPSxBicD06JAldoN9fGsQDp463OY,6292
|
361
|
-
airbyte_cdk-6.41.
|
362
|
-
airbyte_cdk-6.41.
|
363
|
-
airbyte_cdk-6.41.
|
364
|
-
airbyte_cdk-6.41.
|
365
|
-
airbyte_cdk-6.41.
|
366
|
-
airbyte_cdk-6.41.
|
362
|
+
airbyte_cdk-6.41.7.dist-info/LICENSE.txt,sha256=Wfe61S4BaGPj404v8lrAbvhjYR68SHlkzeYrg3_bbuM,1051
|
363
|
+
airbyte_cdk-6.41.7.dist-info/LICENSE_SHORT,sha256=aqF6D1NcESmpn-cqsxBtszTEnHKnlsp8L4x9wAh3Nxg,55
|
364
|
+
airbyte_cdk-6.41.7.dist-info/METADATA,sha256=4Td-yOXScntR4BKB6Nw94j_CWYB0BgALwx_qvFfBIHs,6071
|
365
|
+
airbyte_cdk-6.41.7.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
366
|
+
airbyte_cdk-6.41.7.dist-info/entry_points.txt,sha256=fj-e3PAQvsxsQzyyq8UkG1k8spunWnD4BAH2AwlR6NM,95
|
367
|
+
airbyte_cdk-6.41.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|