airbyte-cdk 6.41.9.dev4101__py3-none-any.whl → 6.42.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +25 -0
- airbyte_cdk/connector_builder/main.py +3 -0
- airbyte_cdk/models/__init__.py +0 -1
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +0 -1
- airbyte_cdk/sources/declarative/async_job/job.py +6 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +18 -18
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +22 -6
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +0 -22
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +71 -39
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +2 -1
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +17 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +48 -25
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +45 -24
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +4 -0
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +150 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +5 -1
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +4 -17
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +8 -2
- airbyte_cdk/sources/streams/concurrent/default_stream.py +0 -3
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +4 -0
- airbyte_cdk/sources/types.py +0 -11
- airbyte_cdk/sources/utils/record_helper.py +1 -8
- {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/METADATA +2 -2
- {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/RECORD +28 -29
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +0 -61
- airbyte_cdk/sources/utils/files_directory.py +0 -15
- {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.41.9.dev4101.dist-info → airbyte_cdk-6.42.0.dist-info}/entry_points.txt +0 -0
@@ -1890,9 +1890,10 @@ class DeclarativeSource1(BaseModel):
     spec: Optional[Spec] = None
     concurrency_level: Optional[ConcurrencyLevel] = None
     api_budget: Optional[HTTPAPIBudget] = None
-    max_concurrent_async_job_count: Optional[int] = Field(
+    max_concurrent_async_job_count: Optional[Union[int, str]] = Field(
         None,
         description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.",
+        examples=[3, "{{ config['max_concurrent_async_job_count'] }}"],
         title="Maximum Concurrent Asynchronous Jobs",
     )
     metadata: Optional[Dict[str, Any]] = Field(
@@ -1922,9 +1923,10 @@ class DeclarativeSource2(BaseModel):
     spec: Optional[Spec] = None
     concurrency_level: Optional[ConcurrencyLevel] = None
     api_budget: Optional[HTTPAPIBudget] = None
-    max_concurrent_async_job_count: Optional[int] = Field(
+    max_concurrent_async_job_count: Optional[Union[int, str]] = Field(
         None,
         description="Maximum number of concurrent asynchronous jobs to run. This property is only relevant for sources/streams that support asynchronous job execution through the AsyncRetriever (e.g. a report-based stream that initiates a job, polls the job status, and then fetches the job results). This is often set by the API's maximum number of concurrent jobs on the account level. Refer to the API's documentation for this information.",
+        examples=[3, "{{ config['max_concurrent_async_job_count'] }}"],
         title="Maximum Concurrent Asynchronous Jobs",
     )
     metadata: Optional[Dict[str, Any]] = Field(
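Since `max_concurrent_async_job_count` now accepts either an integer or an interpolated string, the cap on concurrent async jobs can be sourced from the connector configuration at runtime. A minimal sketch of such a manifest fragment, written as the Python dict the declarative parser consumes (the config key name is illustrative only):

    # Hypothetical fragment: the async job cap may be a literal int or an
    # interpolated string resolved against the user's config at runtime.
    source_fragment = {
        "type": "DeclarativeSource",
        "max_concurrent_async_job_count": "{{ config['max_concurrent_async_job_count'] }}",
        # streams, spec, check, etc. omitted
    }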
@@ -2278,22 +2280,6 @@ class StateDelegatingStream(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
-class FileUploader(BaseModel):
-    type: Literal["FileUploader"]
-    requester: Union[CustomRequester, HttpRequester] = Field(
-        ...,
-        description="Requester component that describes how to prepare HTTP requests to send to the source API.",
-    )
-    download_target_extractor: Union[CustomRecordExtractor, DpathExtractor] = Field(
-        ...,
-        description="Responsible for fetching the url where the file is located. This is applied on each records and not on the HTTP response",
-    )
-    file_extractor: Optional[Union[CustomRecordExtractor, DpathExtractor]] = Field(
-        None,
-        description="Responsible for fetching the content of the file. If not defined, the assumption is that the whole response body is the file content",
-    )
-
-
 class SimpleRetriever(BaseModel):
     type: Literal["SimpleRetriever"]
     record_selector: RecordSelector = Field(
@@ -2317,18 +2303,21 @@ class SimpleRetriever(BaseModel):
             CustomPartitionRouter,
             ListPartitionRouter,
             SubstreamPartitionRouter,
-            List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]],
+            GroupingPartitionRouter,
+            List[
+                Union[
+                    CustomPartitionRouter,
+                    ListPartitionRouter,
+                    SubstreamPartitionRouter,
+                    GroupingPartitionRouter,
+                ]
+            ],
         ]
     ] = Field(
         [],
         description="PartitionRouter component that describes how to partition the stream, enabling incremental syncs and checkpointing.",
         title="Partition Router",
     )
-    file_uploader: Optional[FileUploader] = Field(
-        None,
-        description="(experimental) Describes how to fetch a file",
-        title="File Uploader",
-    )
     decoder: Optional[
         Union[
             CustomDecoder,
@@ -2404,7 +2393,15 @@ class AsyncRetriever(BaseModel):
             CustomPartitionRouter,
             ListPartitionRouter,
             SubstreamPartitionRouter,
-            List[Union[CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter]],
+            GroupingPartitionRouter,
+            List[
+                Union[
+                    CustomPartitionRouter,
+                    ListPartitionRouter,
+                    SubstreamPartitionRouter,
+                    GroupingPartitionRouter,
+                ]
+            ],
         ]
     ] = Field(
         [],
@@ -2456,6 +2453,29 @@ class SubstreamPartitionRouter(BaseModel):
     parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
 
 
+class GroupingPartitionRouter(BaseModel):
+    type: Literal["GroupingPartitionRouter"]
+    group_size: int = Field(
+        ...,
+        description="The number of partitions to include in each group. This determines how many partition values are batched together in a single slice.",
+        examples=[10, 50],
+        title="Group Size",
+    )
+    underlying_partition_router: Union[
+        CustomPartitionRouter, ListPartitionRouter, SubstreamPartitionRouter
+    ] = Field(
+        ...,
+        description="The partition router whose output will be grouped. This can be any valid partition router component.",
+        title="Underlying Partition Router",
+    )
+    deduplicate: Optional[bool] = Field(
+        True,
+        description="If true, ensures that partitions are unique within each group by removing duplicates based on the partition key.",
+        title="Deduplicate Partitions",
+    )
+    parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
+
+
 class HttpComponentsResolver(BaseModel):
     type: Literal["HttpComponentsResolver"]
     retriever: Union[AsyncRetriever, CustomRetriever, SimpleRetriever] = Field(
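Based on the fields above, a GroupingPartitionRouter wraps one of the existing routers and batches its output. A minimal sketch of a valid component definition, expressed as a Python dict (the list values and cursor_field are illustrative only):

    # Hypothetical component definition: batch the partitions produced by a
    # ListPartitionRouter into groups of 10, de-duplicating within each group.
    grouping_router_definition = {
        "type": "GroupingPartitionRouter",
        "group_size": 10,
        "deduplicate": True,
        "underlying_partition_router": {
            "type": "ListPartitionRouter",
            "values": ["1", "2", "3"],
            "cursor_field": "board_ids",
        },
    }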
@@ -2469,6 +2489,9 @@ class HttpComponentsResolver(BaseModel):
 
 class DynamicDeclarativeStream(BaseModel):
     type: Literal["DynamicDeclarativeStream"]
+    name: Optional[str] = Field(
+        "", description="The dynamic stream name.", example=["Tables"], title="Name"
+    )
     stream_template: DeclarativeStream = Field(
         ..., description="Reference to the stream template.", title="Stream Template"
     )
@@ -102,6 +102,7 @@ from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_mi
 )
 from airbyte_cdk.sources.declarative.models import (
     CustomStateMigration,
+    GzipDecoder,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     AddedFieldDefinition as AddedFieldDefinitionModel,
@@ -220,15 +221,15 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
 )
-from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
-    FileUploader as FileUploaderModel,
-)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     FixedWindowCallRatePolicy as FixedWindowCallRatePolicyModel,
 )
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     FlattenFields as FlattenFieldsModel,
 )
+from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
+    GroupingPartitionRouter as GroupingPartitionRouterModel,
+)
 from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
     GzipDecoder as GzipDecoderModel,
 )
@@ -387,6 +388,7 @@ from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
 )
 from airbyte_cdk.sources.declarative.partition_routers import (
     CartesianProductStreamSlicer,
+    GroupingPartitionRouter,
     ListPartitionRouter,
     PartitionRouter,
     SinglePartitionRouter,
@@ -444,7 +446,6 @@ from airbyte_cdk.sources.declarative.retrievers import (
     SimpleRetriever,
     SimpleRetrieverTestReadDecorator,
 )
-from airbyte_cdk.sources.declarative.retrievers.file_uploader import FileUploader
 from airbyte_cdk.sources.declarative.schema import (
     ComplexFieldType,
     DefaultSchemaLoader,
@@ -636,12 +637,12 @@ class ModelToComponentFactory:
             ComponentMappingDefinitionModel: self.create_components_mapping_definition,
             ZipfileDecoderModel: self.create_zipfile_decoder,
             HTTPAPIBudgetModel: self.create_http_api_budget,
-            FileUploaderModel: self.create_file_uploader,
             FixedWindowCallRatePolicyModel: self.create_fixed_window_call_rate_policy,
             MovingWindowCallRatePolicyModel: self.create_moving_window_call_rate_policy,
             UnlimitedCallRatePolicyModel: self.create_unlimited_call_rate_policy,
             RateModel: self.create_rate,
             HttpRequestRegexMatcherModel: self.create_http_request_matcher,
+            GroupingPartitionRouterModel: self.create_grouping_partition_router,
         }
 
         # Needed for the case where we need to perform a second parse on the fields of a custom component
@@ -1359,6 +1360,9 @@ class ModelToComponentFactory:
         )
         stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state)
 
+        # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
+        use_global_cursor = isinstance(partition_router, GroupingPartitionRouter)
+
         # Return the concurrent cursor and state converter
         return ConcurrentPerPartitionCursor(
             cursor_factory=cursor_factory,
@@ -1370,6 +1374,7 @@ class ModelToComponentFactory:
             connector_state_manager=state_manager,
             connector_state_converter=connector_state_converter,
             cursor_field=cursor_field,
+            use_global_cursor=use_global_cursor,
         )
 
     @staticmethod
@@ -3077,8 +3082,11 @@ class ModelToComponentFactory:
                 stream_slices,
                 self._job_tracker,
                 self._message_repository,
-                has_bulk_parent=False,
                 # FIXME work would need to be done here in order to detect if a stream as a parent stream that is bulk
+                has_bulk_parent=False,
+                # set the `job_max_retry` to 1 for the `Connector Builder`` use-case.
+                # `None` == default retry is set to 3 attempts, under the hood.
+                job_max_retry=1 if self._emit_connector_builder_messages else None,
             ),
             stream_slicer=stream_slicer,
             config=config,
@@ -3322,24 +3330,6 @@ class ModelToComponentFactory:
             matchers=matchers,
         )
 
-    def create_file_uploader(
-        self, model: FileUploaderModel, config: Config, **kwargs: Any
-    ) -> FileUploader:
-        name = "File Uploader"
-        requester = self._create_component_from_model(
-            model=model.requester,
-            config=config,
-            name=name,
-            **kwargs,
-        )
-        download_target_extractor = self._create_component_from_model(
-            model=model.download_target_extractor,
-            config=config,
-            name=name,
-            **kwargs,
-        )
-        return FileUploader(requester, download_target_extractor)
-
     def create_moving_window_call_rate_policy(
         self, model: MovingWindowCallRatePolicyModel, config: Config, **kwargs: Any
     ) -> MovingWindowCallRatePolicy:
@@ -3389,3 +3379,34 @@ class ModelToComponentFactory:
         self._api_budget = self.create_component(
             model_type=HTTPAPIBudgetModel, component_definition=component_definition, config=config
         )
+
+    def create_grouping_partition_router(
+        self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any
+    ) -> GroupingPartitionRouter:
+        underlying_router = self._create_component_from_model(
+            model=model.underlying_partition_router, config=config
+        )
+        if model.group_size < 1:
+            raise ValueError(f"Group size must be greater than 0, got {model.group_size}")
+
+        # Request options in underlying partition routers are not supported for GroupingPartitionRouter
+        # because they are specific to individual partitions and cannot be aggregated or handled
+        # when grouping, potentially leading to incorrect API calls. Any request customization
+        # should be managed at the stream level through the requester's configuration.
+        if isinstance(underlying_router, SubstreamPartitionRouter):
+            if any(
+                parent_config.request_option
+                for parent_config in underlying_router.parent_stream_configs
+            ):
+                raise ValueError("Request options are not supported for GroupingPartitionRouter.")
+
+        if isinstance(underlying_router, ListPartitionRouter):
+            if underlying_router.request_option:
+                raise ValueError("Request options are not supported for GroupingPartitionRouter.")
+
+        return GroupingPartitionRouter(
+            group_size=model.group_size,
+            underlying_partition_router=underlying_router,
+            deduplicate=model.deduplicate if model.deduplicate is not None else True,
+            config=config,
+        )
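A consequence of the validation above, sketched with a hypothetical (invalid) definition: attaching a request_option to the wrapped router is rejected at build time, since per-partition request injection cannot be aggregated across a group.

    # Hypothetical invalid definition: the wrapped ListPartitionRouter injects each
    # partition value into requests, which cannot be aggregated across a grouped slice.
    invalid_definition = {
        "type": "GroupingPartitionRouter",
        "group_size": 10,
        "underlying_partition_router": {
            "type": "ListPartitionRouter",
            "values": ["1", "2", "3"],
            "cursor_field": "board_ids",
            "request_option": {
                "type": "RequestOption",
                "inject_into": "request_parameter",
                "field_name": "board_ids",
            },
        },
    }
    # Building this with the factory raises:
    # ValueError("Request options are not supported for GroupingPartitionRouter.")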
@@ -8,6 +8,9 @@ from airbyte_cdk.sources.declarative.partition_routers.async_job_partition_route
 from airbyte_cdk.sources.declarative.partition_routers.cartesian_product_stream_slicer import (
     CartesianProductStreamSlicer,
 )
+from airbyte_cdk.sources.declarative.partition_routers.grouping_partition_router import (
+    GroupingPartitionRouter,
+)
 from airbyte_cdk.sources.declarative.partition_routers.list_partition_router import (
     ListPartitionRouter,
 )
@@ -22,6 +25,7 @@ from airbyte_cdk.sources.declarative.partition_routers.substream_partition_route
 __all__ = [
     "AsyncJobPartitionRouter",
     "CartesianProductStreamSlicer",
+    "GroupingPartitionRouter",
     "ListPartitionRouter",
     "SinglePartitionRouter",
     "SubstreamPartitionRouter",
@@ -0,0 +1,150 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+from dataclasses import dataclass
+from typing import Any, Iterable, Mapping, Optional
+
+from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
+from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
+
+
+@dataclass
+class GroupingPartitionRouter(PartitionRouter):
+    """
+    A partition router that groups partitions from an underlying partition router into batches of a specified size.
+    This is useful for APIs that support filtering by multiple partition keys in a single request.
+
+    Attributes:
+        group_size (int): The number of partitions to include in each group.
+        underlying_partition_router (PartitionRouter): The partition router whose output will be grouped.
+        deduplicate (bool): If True, ensures unique partitions within each group by removing duplicates based on the partition key.
+        config (Config): The connector configuration.
+        parameters (Mapping[str, Any]): Additional parameters for interpolation and configuration.
+    """
+
+    group_size: int
+    underlying_partition_router: PartitionRouter
+    config: Config
+    deduplicate: bool = True
+
+    def __post_init__(self) -> None:
+        self._state: Optional[Mapping[str, StreamState]] = {}
+
+    def stream_slices(self) -> Iterable[StreamSlice]:
+        """
+        Lazily groups partitions from the underlying partition router into batches of size `group_size`.
+
+        This method processes partitions one at a time from the underlying router, maintaining a batch buffer.
+        When the buffer reaches `group_size` or the underlying router is exhausted, it yields a grouped slice.
+        If deduplication is enabled, it tracks seen partition keys to ensure uniqueness within the current batch.
+
+        Yields:
+            Iterable[StreamSlice]: An iterable of StreamSlice objects, where each slice contains a batch of partition values.
+        """
+        batch = []
+        seen_keys = set()
+
+        # Iterate over partitions lazily from the underlying router
+        for partition in self.underlying_partition_router.stream_slices():
+            # Extract the partition key (assuming single key-value pair, e.g., {"board_ids": value})
+            partition_keys = list(partition.partition.keys())
+            # skip parent_slice as it is part of SubstreamPartitionRouter partition
+            if "parent_slice" in partition_keys:
+                partition_keys.remove("parent_slice")
+            if len(partition_keys) != 1:
+                raise ValueError(
+                    f"GroupingPartitionRouter expects a single partition key-value pair. Got {partition.partition}"
+                )
+            key = partition.partition[partition_keys[0]]
+
+            # Skip duplicates if deduplication is enabled
+            if self.deduplicate and key in seen_keys:
+                continue
+
+            # Add partition to the batch
+            batch.append(partition)
+            if self.deduplicate:
+                seen_keys.add(key)
+
+            # Yield the batch when it reaches the group_size
+            if len(batch) == self.group_size:
+                self._state = self.underlying_partition_router.get_stream_state()
+                yield self._create_grouped_slice(batch)
+                batch = []  # Reset the batch
+
+        self._state = self.underlying_partition_router.get_stream_state()
+        # Yield any remaining partitions if the batch isn't empty
+        if batch:
+            yield self._create_grouped_slice(batch)
+
+    def _create_grouped_slice(self, batch: list[StreamSlice]) -> StreamSlice:
+        """
+        Creates a grouped StreamSlice from a batch of partitions, aggregating extra fields into a dictionary with list values.
+
+        Args:
+            batch (list[StreamSlice]): A list of StreamSlice objects to group.
+
+        Returns:
+            StreamSlice: A single StreamSlice with combined partition and extra field values.
+        """
+        # Combine partition values into a single dict with lists
+        grouped_partition = {
+            key: [p.partition.get(key) for p in batch] for key in batch[0].partition.keys()
+        }
+
+        # Aggregate extra fields into a dict with list values
+        extra_fields_dict = (
+            {
+                key: [p.extra_fields.get(key) for p in batch]
+                for key in set().union(*(p.extra_fields.keys() for p in batch if p.extra_fields))
+            }
+            if any(p.extra_fields for p in batch)
+            else {}
+        )
+        return StreamSlice(
+            partition=grouped_partition,
+            cursor_slice={},  # Cursor is managed by the underlying router or incremental sync
+            extra_fields=extra_fields_dict,
+        )
+
+    def get_request_params(
+        self,
+        stream_state: Optional[StreamState] = None,
+        stream_slice: Optional[StreamSlice] = None,
+        next_page_token: Optional[Mapping[str, Any]] = None,
+    ) -> Mapping[str, Any]:
+        return {}
+
+    def get_request_headers(
+        self,
+        stream_state: Optional[StreamState] = None,
+        stream_slice: Optional[StreamSlice] = None,
+        next_page_token: Optional[Mapping[str, Any]] = None,
+    ) -> Mapping[str, Any]:
+        return {}
+
+    def get_request_body_data(
+        self,
+        stream_state: Optional[StreamState] = None,
+        stream_slice: Optional[StreamSlice] = None,
+        next_page_token: Optional[Mapping[str, Any]] = None,
+    ) -> Mapping[str, Any]:
+        return {}
+
+    def get_request_body_json(
+        self,
+        stream_state: Optional[StreamState] = None,
+        stream_slice: Optional[StreamSlice] = None,
+        next_page_token: Optional[Mapping[str, Any]] = None,
+    ) -> Mapping[str, Any]:
+        return {}
+
+    def set_initial_state(self, stream_state: StreamState) -> None:
+        """Delegate state initialization to the underlying partition router."""
+        self.underlying_partition_router.set_initial_state(stream_state)
+        self._state = self.underlying_partition_router.get_stream_state()
+
+    def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
+        """Delegate state retrieval to the underlying partition router."""
+        return self._state
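A minimal sketch of how the new router batches slices at runtime, assuming ListPartitionRouter's usual dataclass constructor; the values and partition key below are illustrative only:

    from airbyte_cdk.sources.declarative.partition_routers import (
        GroupingPartitionRouter,
        ListPartitionRouter,
    )

    # Three single-value partitions grouped into batches of two.
    underlying = ListPartitionRouter(
        values=["1", "2", "3"], cursor_field="board_ids", config={}, parameters={}
    )
    router = GroupingPartitionRouter(
        group_size=2, underlying_partition_router=underlying, config={}
    )

    for grouped_slice in router.stream_slices():
        # Expected partitions: {"board_ids": ["1", "2"]}, then {"board_ids": ["3"]}
        print(grouped_slice.partition)

Deduplication is on by default, so a repeated partition value would contribute only one entry to a group.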
@@ -374,7 +374,11 @@ class SubstreamPartitionRouter(PartitionRouter):
         # Ignore per-partition states or invalid formats.
         if isinstance(substream_state, (list, dict)) or len(substream_state_values) != 1:
             # If a global state is present under the key "state", use its first value.
-            if "state" in stream_state and isinstance(stream_state["state"], dict):
+            if (
+                "state" in stream_state
+                and isinstance(stream_state["state"], dict)
+                and stream_state["state"] != {}
+            ):
                 substream_state = list(stream_state["state"].values())[0]
             else:
                 return {}
@@ -3,7 +3,6 @@
 from typing import Any, Iterable, Mapping, Optional
 
 from airbyte_cdk.sources.declarative.retrievers import Retriever
-from airbyte_cdk.sources.declarative.retrievers.file_uploader import FileUploader
 from airbyte_cdk.sources.message import MessageRepository
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
@@ -19,7 +18,6 @@ class DeclarativePartitionFactory:
         json_schema: Mapping[str, Any],
         retriever: Retriever,
         message_repository: MessageRepository,
-        file_uploader: Optional[FileUploader] = None,
     ) -> None:
         """
         The DeclarativePartitionFactory takes a retriever_factory and not a retriever directly. The reason is that our components are not
@@ -30,7 +28,6 @@ class DeclarativePartitionFactory:
         self._json_schema = json_schema
         self._retriever = retriever
         self._message_repository = message_repository
-        self._file_uploader = file_uploader
 
     def create(self, stream_slice: StreamSlice) -> Partition:
         return DeclarativePartition(
@@ -38,7 +35,6 @@ class DeclarativePartitionFactory:
             self._json_schema,
             self._retriever,
             self._message_repository,
-            self._file_uploader,
             stream_slice,
         )
 
@@ -50,32 +46,23 @@ class DeclarativePartition(Partition):
         json_schema: Mapping[str, Any],
         retriever: Retriever,
         message_repository: MessageRepository,
-        file_uploader: Optional[FileUploader],
         stream_slice: StreamSlice,
     ):
         self._stream_name = stream_name
         self._json_schema = json_schema
         self._retriever = retriever
         self._message_repository = message_repository
-        self._file_uploader = file_uploader
         self._stream_slice = stream_slice
         self._hash = SliceHasher.hash(self._stream_name, self._stream_slice)
 
     def read(self) -> Iterable[Record]:
         for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice):
             if isinstance(stream_data, Mapping):
-                record = (
-                    stream_data
-                    if isinstance(stream_data, Record)
-                    else Record(
-                        data=stream_data,
-                        stream_name=self.stream_name(),
-                        associated_slice=self._stream_slice,
-                    )
+                yield Record(
+                    data=stream_data,
+                    stream_name=self.stream_name(),
+                    associated_slice=self._stream_slice,
                 )
-                if self._file_uploader:
-                    self._file_uploader.upload(record)
-                yield record
             else:
                 self._message_repository.emit_message(stream_data)
 
@@ -8,12 +8,18 @@ from typing import Any, Dict, Iterable
 from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
 from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader
 from airbyte_cdk.sources.file_based.remote_file import RemoteFile
-
+
+AIRBYTE_STAGING_DIRECTORY = os.getenv("AIRBYTE_STAGING_DIRECTORY", "/staging/files")
+DEFAULT_LOCAL_DIRECTORY = "/tmp/airbyte-file-transfer"
 
 
 class FileTransfer:
     def __init__(self) -> None:
-        self._local_directory =
+        self._local_directory = (
+            AIRBYTE_STAGING_DIRECTORY
+            if os.path.exists(AIRBYTE_STAGING_DIRECTORY)
+            else DEFAULT_LOCAL_DIRECTORY
+        )
 
     def get_file(
         self,
@@ -29,7 +29,6 @@ class DefaultStream(AbstractStream):
         logger: Logger,
         cursor: Cursor,
         namespace: Optional[str] = None,
-        supports_file_transfer: bool = False,
     ) -> None:
         self._stream_partition_generator = partition_generator
         self._name = name
@@ -40,7 +39,6 @@ class DefaultStream(AbstractStream):
         self._logger = logger
         self._cursor = cursor
         self._namespace = namespace
-        self._supports_file_transfer = supports_file_transfer
 
     def generate_partitions(self) -> Iterable[Partition]:
         yield from self._stream_partition_generator.generate()
@@ -70,7 +68,6 @@ class DefaultStream(AbstractStream):
             json_schema=dict(self._json_schema),
             supported_sync_modes=[SyncMode.full_refresh],
             is_resumable=False,
-            is_file_based=self._supports_file_transfer,
         )
 
         if self._namespace:
@@ -71,6 +71,10 @@ class AbstractStreamStateConverter(ABC):
         for stream_slice in state.get("slices", []):
             stream_slice[self.START_KEY] = self._from_state_message(stream_slice[self.START_KEY])
             stream_slice[self.END_KEY] = self._from_state_message(stream_slice[self.END_KEY])
+            if self.MOST_RECENT_RECORD_KEY in stream_slice:
+                stream_slice[self.MOST_RECENT_RECORD_KEY] = self._from_state_message(
+                    stream_slice[self.MOST_RECENT_RECORD_KEY]
+                )
         return state
 
     def serialize(
airbyte_cdk/sources/types.py CHANGED
@@ -6,7 +6,6 @@ from __future__ import annotations
 
 from typing import Any, ItemsView, Iterator, KeysView, List, Mapping, Optional, ValuesView
 
-from airbyte_cdk.models import AirbyteRecordMessageFileReference
 from airbyte_cdk.utils.slice_hasher import SliceHasher
 
 # A FieldPointer designates a path to a field inside a mapping. For example, retrieving ["k1", "k1.2"] in the object {"k1" :{"k1.2":
@@ -25,13 +24,11 @@ class Record(Mapping[str, Any]):
         stream_name: str,
         associated_slice: Optional[StreamSlice] = None,
         is_file_transfer_message: bool = False,
-        file_reference: Optional[AirbyteRecordMessageFileReference] = None,
     ):
         self._data = data
         self._associated_slice = associated_slice
         self.stream_name = stream_name
         self.is_file_transfer_message = is_file_transfer_message
-        self._file_reference = file_reference
 
     @property
     def data(self) -> Mapping[str, Any]:
@@ -41,14 +38,6 @@ class Record(Mapping[str, Any]):
     def associated_slice(self) -> Optional[StreamSlice]:
         return self._associated_slice
 
-    @property
-    def file_reference(self) -> AirbyteRecordMessageFileReference:
-        return self._file_reference
-
-    @file_reference.setter
-    def file_reference(self, value: AirbyteRecordMessageFileReference):
-        self._file_reference = value
-
     def __repr__(self) -> str:
         return repr(self._data)
 
@@ -9,7 +9,6 @@ from airbyte_cdk.models import (
     AirbyteLogMessage,
     AirbyteMessage,
     AirbyteRecordMessage,
-    AirbyteRecordMessageFileReference,
     AirbyteTraceMessage,
 )
 from airbyte_cdk.models import Type as MessageType
@@ -24,7 +23,6 @@ def stream_data_to_airbyte_message(
     transformer: TypeTransformer = TypeTransformer(TransformConfig.NoTransform),
     schema: Optional[Mapping[str, Any]] = None,
     is_file_transfer_message: bool = False,
-    file_reference: Optional[AirbyteRecordMessageFileReference] = None,
 ) -> AirbyteMessage:
     if schema is None:
         schema = {}
|
                     stream=stream_name, file=data, emitted_at=now_millis, data={}
                 )
             else:
-                message = AirbyteRecordMessage(
-                    stream=stream_name,
-                    data=data,
-                    emitted_at=now_millis,
-                    file_reference=file_reference,
-                )
+                message = AirbyteRecordMessage(stream=stream_name, data=data, emitted_at=now_millis)
             return AirbyteMessage(type=MessageType.RECORD, record=message)
         case AirbyteTraceMessage():
             return AirbyteMessage(type=MessageType.TRACE, trace=data_or_message)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: airbyte-cdk
-Version: 6.41.9.dev4101
+Version: 6.42.0
 Summary: A framework for writing Airbyte Connectors.
 Home-page: https://airbyte.com
 License: MIT
@@ -22,7 +22,7 @@ Provides-Extra: sql
 Provides-Extra: vector-db-based
 Requires-Dist: Jinja2 (>=3.1.2,<3.2.0)
 Requires-Dist: PyYAML (>=6.0.1,<7.0.0)
-Requires-Dist: airbyte-protocol-models-dataclasses (
+Requires-Dist: airbyte-protocol-models-dataclasses (>=0.14,<0.15)
 Requires-Dist: anyascii (>=0.3.2,<0.4.0)
 Requires-Dist: avro (>=1.11.2,<1.13.0) ; extra == "file-based"
 Requires-Dist: backoff