airbyte-cdk 6.45.0.dev4107__py3-none-any.whl → 6.45.0.post6.dev14369631849__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +45 -6
- airbyte_cdk/connector_builder/main.py +5 -2
- airbyte_cdk/models/__init__.py +0 -1
- airbyte_cdk/models/airbyte_protocol.py +3 -1
- airbyte_cdk/models/file_transfer_record_message.py +13 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +1 -1
- airbyte_cdk/sources/declarative/async_job/job.py +6 -0
- airbyte_cdk/sources/declarative/async_job/job_orchestrator.py +18 -18
- airbyte_cdk/sources/declarative/async_job/job_tracker.py +22 -6
- airbyte_cdk/sources/declarative/checks/__init__.py +5 -2
- airbyte_cdk/sources/declarative/checks/check_stream.py +113 -11
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +0 -8
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +210 -50
- airbyte_cdk/sources/declarative/extractors/record_selector.py +1 -6
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +2 -1
- airbyte_cdk/sources/declarative/interpolation/macros.py +10 -4
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +23 -2
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +142 -43
- airbyte_cdk/sources/declarative/parsers/manifest_component_transformer.py +16 -4
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +263 -50
- airbyte_cdk/sources/declarative/partition_routers/__init__.py +4 -0
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +150 -0
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +5 -1
- airbyte_cdk/sources/declarative/requesters/query_properties/__init__.py +13 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/properties_from_endpoint.py +40 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/property_chunking.py +69 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/query_properties.py +58 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/strategies/__init__.py +10 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/strategies/group_by_key.py +33 -0
- airbyte_cdk/sources/declarative/requesters/query_properties/strategies/merge_strategy.py +19 -0
- airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py +25 -2
- airbyte_cdk/sources/declarative/retrievers/simple_retriever.py +101 -30
- airbyte_cdk/sources/declarative/schema/default_schema_loader.py +1 -1
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +4 -9
- airbyte_cdk/sources/declarative/transformations/add_fields.py +3 -1
- airbyte_cdk/sources/file_based/file_based_stream_reader.py +15 -38
- airbyte_cdk/sources/file_based/file_types/file_transfer.py +15 -8
- airbyte_cdk/sources/file_based/schema_helpers.py +1 -9
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +12 -3
- airbyte_cdk/sources/file_based/stream/default_file_based_stream.py +31 -16
- airbyte_cdk/sources/file_based/stream/permissions_file_based_stream.py +3 -1
- airbyte_cdk/sources/streams/concurrent/default_stream.py +0 -3
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +4 -0
- airbyte_cdk/sources/types.py +2 -11
- airbyte_cdk/sources/utils/record_helper.py +8 -8
- airbyte_cdk/test/mock_http/response_builder.py +0 -8
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/METADATA +2 -2
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/RECORD +52 -46
- airbyte_cdk/sources/declarative/retrievers/file_uploader.py +0 -89
- airbyte_cdk/sources/file_based/file_record_data.py +0 -22
- airbyte_cdk/sources/utils/files_directory.py +0 -15
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.45.0.dev4107.dist-info → airbyte_cdk-6.45.0.post6.dev14369631849.dist-info}/entry_points.txt +0 -0
airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py ADDED

@@ -0,0 +1,150 @@
+#
+# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
+#
+
+from dataclasses import dataclass
+from typing import Any, Iterable, Mapping, Optional
+
+from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
+from airbyte_cdk.sources.types import Config, StreamSlice, StreamState
+
+
+@dataclass
+class GroupingPartitionRouter(PartitionRouter):
+    """
+    A partition router that groups partitions from an underlying partition router into batches of a specified size.
+    This is useful for APIs that support filtering by multiple partition keys in a single request.
+
+    Attributes:
+        group_size (int): The number of partitions to include in each group.
+        underlying_partition_router (PartitionRouter): The partition router whose output will be grouped.
+        deduplicate (bool): If True, ensures unique partitions within each group by removing duplicates based on the partition key.
+        config (Config): The connector configuration.
+        parameters (Mapping[str, Any]): Additional parameters for interpolation and configuration.
+    """
+
+    group_size: int
+    underlying_partition_router: PartitionRouter
+    config: Config
+    deduplicate: bool = True
+
+    def __post_init__(self) -> None:
+        self._state: Optional[Mapping[str, StreamState]] = {}
+
+    def stream_slices(self) -> Iterable[StreamSlice]:
+        """
+        Lazily groups partitions from the underlying partition router into batches of size `group_size`.
+
+        This method processes partitions one at a time from the underlying router, maintaining a batch buffer.
+        When the buffer reaches `group_size` or the underlying router is exhausted, it yields a grouped slice.
+        If deduplication is enabled, it tracks seen partition keys to ensure uniqueness within the current batch.
+
+        Yields:
+            Iterable[StreamSlice]: An iterable of StreamSlice objects, where each slice contains a batch of partition values.
+        """
+        batch = []
+        seen_keys = set()
+
+        # Iterate over partitions lazily from the underlying router
+        for partition in self.underlying_partition_router.stream_slices():
+            # Extract the partition key (assuming single key-value pair, e.g., {"board_ids": value})
+            partition_keys = list(partition.partition.keys())
+            # skip parent_slice as it is part of SubstreamPartitionRouter partition
+            if "parent_slice" in partition_keys:
+                partition_keys.remove("parent_slice")
+            if len(partition_keys) != 1:
+                raise ValueError(
+                    f"GroupingPartitionRouter expects a single partition key-value pair. Got {partition.partition}"
+                )
+            key = partition.partition[partition_keys[0]]
+
+            # Skip duplicates if deduplication is enabled
+            if self.deduplicate and key in seen_keys:
+                continue
+
+            # Add partition to the batch
+            batch.append(partition)
+            if self.deduplicate:
+                seen_keys.add(key)
+
+            # Yield the batch when it reaches the group_size
+            if len(batch) == self.group_size:
+                self._state = self.underlying_partition_router.get_stream_state()
+                yield self._create_grouped_slice(batch)
+                batch = []  # Reset the batch
+
+        self._state = self.underlying_partition_router.get_stream_state()
+        # Yield any remaining partitions if the batch isn't empty
+        if batch:
+            yield self._create_grouped_slice(batch)
+
+    def _create_grouped_slice(self, batch: list[StreamSlice]) -> StreamSlice:
+        """
+        Creates a grouped StreamSlice from a batch of partitions, aggregating extra fields into a dictionary with list values.
+
+        Args:
+            batch (list[StreamSlice]): A list of StreamSlice objects to group.
+
+        Returns:
+            StreamSlice: A single StreamSlice with combined partition and extra field values.
+        """
+        # Combine partition values into a single dict with lists
+        grouped_partition = {
+            key: [p.partition.get(key) for p in batch] for key in batch[0].partition.keys()
+        }
+
+        # Aggregate extra fields into a dict with list values
+        extra_fields_dict = (
+            {
+                key: [p.extra_fields.get(key) for p in batch]
+                for key in set().union(*(p.extra_fields.keys() for p in batch if p.extra_fields))
+            }
+            if any(p.extra_fields for p in batch)
+            else {}
+        )
+        return StreamSlice(
+            partition=grouped_partition,
+            cursor_slice={},  # Cursor is managed by the underlying router or incremental sync
+            extra_fields=extra_fields_dict,
+        )
+
+    def get_request_params(
+        self,
+        stream_state: Optional[StreamState] = None,
+        stream_slice: Optional[StreamSlice] = None,
+        next_page_token: Optional[Mapping[str, Any]] = None,
+    ) -> Mapping[str, Any]:
+        return {}
+
+    def get_request_headers(
+        self,
+        stream_state: Optional[StreamState] = None,
+        stream_slice: Optional[StreamSlice] = None,
+        next_page_token: Optional[Mapping[str, Any]] = None,
+    ) -> Mapping[str, Any]:
+        return {}
+
+    def get_request_body_data(
+        self,
+        stream_state: Optional[StreamState] = None,
+        stream_slice: Optional[StreamSlice] = None,
+        next_page_token: Optional[Mapping[str, Any]] = None,
+    ) -> Mapping[str, Any]:
+        return {}
+
+    def get_request_body_json(
+        self,
+        stream_state: Optional[StreamState] = None,
+        stream_slice: Optional[StreamSlice] = None,
+        next_page_token: Optional[Mapping[str, Any]] = None,
+    ) -> Mapping[str, Any]:
+        return {}
+
+    def set_initial_state(self, stream_state: StreamState) -> None:
+        """Delegate state initialization to the underlying partition router."""
+        self.underlying_partition_router.set_initial_state(stream_state)
+        self._state = self.underlying_partition_router.get_stream_state()
+
+    def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
+        """Delegate state retrieval to the underlying partition router."""
+        return self._state
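For context, a self-contained sketch of the batching behavior above, using plain dicts in place of StreamSlice objects; the "board_ids" key is only an illustrative partition key echoing the comment in the code, and this is not part of the package:

# Standalone sketch of the grouping/deduplication logic.
def group_partitions(partitions, group_size, deduplicate=True):
    batch, seen_keys = [], set()
    for partition in partitions:
        key = partition["board_ids"]
        if deduplicate and key in seen_keys:
            continue  # drop duplicate partition values
        batch.append(partition)
        if deduplicate:
            seen_keys.add(key)
        if len(batch) == group_size:
            # one grouped "slice" whose value is the list of batched keys
            yield {"board_ids": [p["board_ids"] for p in batch]}
            batch = []
    if batch:
        yield {"board_ids": [p["board_ids"] for p in batch]}

slices = [{"board_ids": 1}, {"board_ids": 2}, {"board_ids": 2}, {"board_ids": 3}]
print(list(group_partitions(slices, group_size=2)))
# [{'board_ids': [1, 2]}, {'board_ids': [3]}]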
airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py CHANGED

@@ -374,7 +374,11 @@ class SubstreamPartitionRouter(PartitionRouter):
             # Ignore per-partition states or invalid formats.
             if isinstance(substream_state, (list, dict)) or len(substream_state_values) != 1:
                 # If a global state is present under the key "state", use its first value.
-                if
+                if (
+                    "state" in stream_state
+                    and isinstance(stream_state["state"], dict)
+                    and stream_state["state"] != {}
+                ):
                     substream_state = list(stream_state["state"].values())[0]
                 else:
                     return {}
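The reworked guard only consults the global state when a non-empty dict is stored under "state"; anything else falls back to an empty mapping. A minimal illustration of the shapes it accepts (the state values are made up):

def first_global_state_value(stream_state):
    # Mirrors the new condition: require a non-empty dict under "state".
    if (
        "state" in stream_state
        and isinstance(stream_state["state"], dict)
        and stream_state["state"] != {}
    ):
        return list(stream_state["state"].values())[0]
    return {}

print(first_global_state_value({"state": {"updated_at": "2024-01-01"}}))  # 2024-01-01
print(first_global_state_value({"state": {}}))   # {} -- empty global state is ignored
print(first_global_state_value({"states": []}))  # {} -- per-partition layout is ignored here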
airbyte_cdk/sources/declarative/requesters/query_properties/__init__.py ADDED

@@ -0,0 +1,13 @@
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+
+from airbyte_cdk.sources.declarative.requesters.query_properties.properties_from_endpoint import (
+    PropertiesFromEndpoint,
+)
+from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import (
+    PropertyChunking,
+)
+from airbyte_cdk.sources.declarative.requesters.query_properties.query_properties import (
+    QueryProperties,
+)
+
+__all__ = ["PropertiesFromEndpoint", "PropertyChunking", "QueryProperties"]
airbyte_cdk/sources/declarative/requesters/query_properties/properties_from_endpoint.py ADDED

@@ -0,0 +1,40 @@
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+
+from dataclasses import InitVar, dataclass
+from typing import Any, Iterable, List, Mapping, Optional
+
+import dpath
+
+from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
+from airbyte_cdk.sources.declarative.retrievers import Retriever
+from airbyte_cdk.sources.types import Config, StreamSlice
+
+
+@dataclass
+class PropertiesFromEndpoint:
+    """
+    Component that defines the behavior around how to dynamically retrieve a set of request properties from an
+    API endpoint. The set retrieved can then be injected into the requests to extract records from an API source.
+    """
+
+    property_field_path: List[str]
+    retriever: Retriever
+    config: Config
+    parameters: InitVar[Mapping[str, Any]]
+
+    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+        self._property_field_path = [
+            InterpolatedString(string=property_field, parameters=parameters)
+            for property_field in self.property_field_path
+        ]
+
+    def get_properties_from_endpoint(self, stream_slice: Optional[StreamSlice]) -> Iterable[str]:
+        response_properties = self.retriever.read_records(
+            records_schema={}, stream_slice=stream_slice
+        )
+        for property_obj in response_properties:
+            path = [
+                node.eval(self.config) if not isinstance(node, str) else node
+                for node in self._property_field_path
+            ]
+            yield dpath.get(property_obj, path, default=[])  # type: ignore # extracted will be a MutableMapping, given input data structure
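Each record returned by the nested retriever is reduced to a single value via a dpath lookup along property_field_path. A small sketch of that step; the response shape and the "name" path are illustrative, not taken from a real connector:

import dpath

# Hypothetical records returned by a properties endpoint
response_properties = [
    {"name": "firstname", "type": "string"},
    {"name": "hs_object_id", "type": "number"},
]

property_field_path = ["name"]  # already-interpolated path segments
for property_obj in response_properties:
    print(dpath.get(property_obj, property_field_path, default=[]))
# firstname
# hs_object_id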
airbyte_cdk/sources/declarative/requesters/query_properties/property_chunking.py ADDED

@@ -0,0 +1,69 @@
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+
+from dataclasses import InitVar, dataclass
+from enum import Enum
+from typing import Any, Iterable, List, Mapping, Optional
+
+from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import GroupByKey
+from airbyte_cdk.sources.declarative.requesters.query_properties.strategies.merge_strategy import (
+    RecordMergeStrategy,
+)
+from airbyte_cdk.sources.types import Config, Record
+
+
+class PropertyLimitType(Enum):
+    """
+    The heuristic that determines when the maximum size of the current chunk of properties and when a new
+    one should be started.
+    """
+
+    characters = "characters"
+    property_count = "property_count"
+
+
+@dataclass
+class PropertyChunking:
+    """
+    Defines the behavior for how the complete list of properties to query for are broken down into smaller groups
+    that will be used for multiple requests to the target API.
+    """
+
+    property_limit_type: PropertyLimitType
+    property_limit: Optional[int]
+    record_merge_strategy: Optional[RecordMergeStrategy]
+    parameters: InitVar[Mapping[str, Any]]
+    config: Config
+
+    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+        self._record_merge_strategy = self.record_merge_strategy or GroupByKey(
+            key="id", config=self.config, parameters=parameters
+        )
+
+    def get_request_property_chunks(
+        self, property_fields: Iterable[str], always_include_properties: Optional[List[str]]
+    ) -> Iterable[List[str]]:
+        if not self.property_limit:
+            single_property_chunk = list(property_fields)
+            if always_include_properties:
+                single_property_chunk.extend(always_include_properties)
+            yield single_property_chunk
+            return
+        current_chunk = list(always_include_properties) if always_include_properties else []
+        chunk_size = 0
+        for property_field in property_fields:
+            # If property_limit_type is not defined, we default to property_count which is just an incrementing count
+            property_field_size = (
+                len(property_field)
+                if self.property_limit_type == PropertyLimitType.characters
+                else 1
+            )
+            if chunk_size + property_field_size > self.property_limit:
+                yield current_chunk
+                current_chunk = list(always_include_properties) if always_include_properties else []
+                chunk_size = 0
+            current_chunk.append(property_field)
+            chunk_size += property_field_size
+        yield current_chunk
+
+    def get_merge_key(self, record: Record) -> Optional[str]:
+        return self._record_merge_strategy.get_group_key(record=record)
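Assuming this dev build is installed, the chunking can be exercised directly; with PropertyLimitType.characters the budget counts characters in each property name rather than the number of properties (the field names and limit below are illustrative):

from airbyte_cdk.sources.declarative.requesters.query_properties import PropertyChunking
from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import (
    PropertyLimitType,
)

chunking = PropertyChunking(
    property_limit_type=PropertyLimitType.characters,
    property_limit=10,
    record_merge_strategy=None,  # falls back to GroupByKey(key="id")
    parameters={},
    config={},
)
fields = ["created_at", "status", "id", "amount"]
print(list(chunking.get_request_property_chunks(fields, always_include_properties=None)))
# [['created_at'], ['status', 'id'], ['amount']]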
airbyte_cdk/sources/declarative/requesters/query_properties/query_properties.py ADDED

@@ -0,0 +1,58 @@
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+
+from dataclasses import InitVar, dataclass
+from typing import Any, Iterable, List, Mapping, Optional, Union
+
+from airbyte_cdk.sources.declarative.requesters.query_properties import (
+    PropertiesFromEndpoint,
+    PropertyChunking,
+)
+from airbyte_cdk.sources.types import Config, StreamSlice
+
+
+@dataclass
+class QueryProperties:
+    """
+    Low-code component that encompasses the behavior to inject additional property values into the outbound API
+    requests. Property values can be defined statically within the manifest or dynamically by making requests
+    to a partner API to retrieve the properties. Query properties also allow for splitting of the total set of
+    properties into smaller chunks to satisfy API restrictions around the total amount of data retrieved
+    """
+
+    property_list: Optional[Union[List[str], PropertiesFromEndpoint]]
+    always_include_properties: Optional[List[str]]
+    property_chunking: Optional[PropertyChunking]
+    config: Config
+    parameters: InitVar[Mapping[str, Any]]
+
+    def get_request_property_chunks(
+        self, stream_slice: Optional[StreamSlice] = None
+    ) -> Iterable[List[str]]:
+        """
+        Uses the defined property_list to fetch the total set of properties dynamically or from a static list
+        and based on the resulting properties, performs property chunking if applicable.
+        :param stream_slice: The StreamSlice of the current partition being processed during the sync. This is included
+        because subcomponents of QueryProperties can make use of interpolation of the top-level StreamSlice object
+        """
+        fields: Union[Iterable[str], List[str]]
+        if isinstance(self.property_list, PropertiesFromEndpoint):
+            fields = self.property_list.get_properties_from_endpoint(stream_slice=stream_slice)
+        else:
+            fields = self.property_list if self.property_list else []
+
+        if self.property_chunking:
+            yield from self.property_chunking.get_request_property_chunks(
+                property_fields=fields, always_include_properties=self.always_include_properties
+            )
+        else:
+            yield list(fields)
+
+    # delete later, but leaving this to keep the discussion thread on the PR from getting hidden
+    def has_multiple_chunks(self, stream_slice: Optional[StreamSlice]) -> bool:
+        property_chunks = iter(self.get_request_property_chunks(stream_slice=stream_slice))
+        try:
+            next(property_chunks)
+            next(property_chunks)
+            return True
+        except StopIteration:
+            return False
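A sketch of the component with a static property_list and count-based chunking, again assuming this dev build is installed; a PropertiesFromEndpoint instance could be passed in place of the list to resolve the properties dynamically:

from airbyte_cdk.sources.declarative.requesters.query_properties import (
    PropertyChunking,
    QueryProperties,
)
from airbyte_cdk.sources.declarative.requesters.query_properties.property_chunking import (
    PropertyLimitType,
)

query_properties = QueryProperties(
    property_list=["firstname", "lastname", "email"],
    always_include_properties=None,
    property_chunking=PropertyChunking(
        property_limit_type=PropertyLimitType.property_count,
        property_limit=2,
        record_merge_strategy=None,
        parameters={},
        config={},
    ),
    config={},
    parameters={},
)
print(list(query_properties.get_request_property_chunks()))
# [['firstname', 'lastname'], ['email']]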
airbyte_cdk/sources/declarative/requesters/query_properties/strategies/__init__.py ADDED

@@ -0,0 +1,10 @@
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+
+from airbyte_cdk.sources.declarative.requesters.query_properties.strategies.group_by_key import (
+    GroupByKey,
+)
+from airbyte_cdk.sources.declarative.requesters.query_properties.strategies.merge_strategy import (
+    RecordMergeStrategy,
+)
+
+__all__ = ["GroupByKey", "RecordMergeStrategy"]
airbyte_cdk/sources/declarative/requesters/query_properties/strategies/group_by_key.py ADDED

@@ -0,0 +1,33 @@
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+
+from dataclasses import InitVar, dataclass
+from typing import Any, List, Mapping, Optional, Union
+
+from airbyte_cdk.sources.declarative.requesters.query_properties.strategies.merge_strategy import (
+    RecordMergeStrategy,
+)
+from airbyte_cdk.sources.types import Config, Record
+
+
+@dataclass
+class GroupByKey(RecordMergeStrategy):
+    """
+    Record merge strategy that combines records together according to values on the record for one or many keys.
+    """
+
+    key: Union[str, List[str]]
+    parameters: InitVar[Mapping[str, Any]]
+    config: Config
+
+    def __post_init__(self, parameters: Mapping[str, Any]) -> None:
+        self._keys = [self.key] if isinstance(self.key, str) else self.key
+
+    def get_group_key(self, record: Record) -> Optional[str]:
+        resolved_keys = []
+        for key in self._keys:
+            key_value = record.data.get(key)
+            if key_value:
+                resolved_keys.append(key_value)
+            else:
+                return None
+        return ",".join(resolved_keys)
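A short sketch of how the merge key is resolved; the stream name, field names, and values are made up. A record missing any of the configured keys yields None and is later emitted without merging:

from airbyte_cdk.sources.declarative.requesters.query_properties.strategies import GroupByKey
from airbyte_cdk.sources.types import Record

strategy = GroupByKey(key=["portal_id", "id"], parameters={}, config={})
complete = Record(
    data={"portal_id": "7", "id": "42", "email": "ada@example.com"},
    stream_name="contacts",
    associated_slice=None,
)
partial = Record(data={"email": "ada@example.com"}, stream_name="contacts", associated_slice=None)
print(strategy.get_group_key(complete))  # 7,42
print(strategy.get_group_key(partial))   # None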
airbyte_cdk/sources/declarative/requesters/query_properties/strategies/merge_strategy.py ADDED

@@ -0,0 +1,19 @@
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional
+
+from airbyte_cdk.sources.types import Record
+
+
+@dataclass
+class RecordMergeStrategy(ABC):
+    """
+    Describe the interface for how records that required multiple requests to get the complete set of fields
+    should be merged back into a single record.
+    """
+
+    @abstractmethod
+    def get_group_key(self, record: Record) -> Optional[str]:
+        pass
airbyte_cdk/sources/declarative/requesters/request_options/interpolated_request_options_provider.py CHANGED

@@ -1,9 +1,9 @@
 #
-# Copyright (c)
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
 #
 
 from dataclasses import InitVar, dataclass, field
-from typing import Any, Mapping, MutableMapping, Optional, Union
+from typing import Any, List, Mapping, MutableMapping, Optional, Union
 
 from airbyte_cdk.sources.declarative.interpolation.interpolated_nested_mapping import NestedMapping
 from airbyte_cdk.sources.declarative.requesters.request_options.interpolated_nested_request_input_provider import (

@@ -40,6 +40,7 @@ class InterpolatedRequestOptionsProvider(RequestOptionsProvider):
     request_headers: Optional[RequestInput] = None
     request_body_data: Optional[RequestInput] = None
     request_body_json: Optional[NestedMapping] = None
+    query_properties_key: Optional[str] = None
 
     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
         if self.request_parameters is None:

@@ -83,6 +84,28 @@ class InterpolatedRequestOptionsProvider(RequestOptionsProvider):
             valid_value_types=ValidRequestTypes,
         )
         if isinstance(interpolated_value, dict):
+            if self.query_properties_key:
+                if not stream_slice:
+                    raise ValueError(
+                        "stream_slice should not be None if query properties in requests is enabled. Please contact Airbyte Support"
+                    )
+                elif (
+                    "query_properties" not in stream_slice.extra_fields
+                    or stream_slice.extra_fields.get("query_properties") is None
+                ):
+                    raise ValueError(
+                        "QueryProperties component is defined but stream_partition does not contain query_properties. Please contact Airbyte Support"
+                    )
+                elif not isinstance(stream_slice.extra_fields.get("query_properties"), List):
+                    raise ValueError(
+                        "QueryProperties component is defined but stream_slice.extra_fields.query_properties is not a List. Please contact Airbyte Support"
+                    )
+                interpolated_value = {
+                    **interpolated_value,
+                    self.query_properties_key: ",".join(
+                        stream_slice.extra_fields.get("query_properties")  # type: ignore # Earlier type checks validate query_properties type
+                    ),
+                }
             return interpolated_value
         return {}
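The net effect, when query_properties_key is set, is that the current property chunk is folded into the interpolated request parameters as a single comma-joined value. A sketch of just that step; the key name "properties" and the values are illustrative:

query_properties_key = "properties"
interpolated_value = {"archived": "false"}     # params interpolated from the manifest
chunk = ["firstname", "lastname", "email"]     # stream_slice.extra_fields["query_properties"]

request_params = {**interpolated_value, query_properties_key: ",".join(chunk)}
print(request_params)
# {'archived': 'false', 'properties': 'firstname,lastname,email'}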
airbyte_cdk/sources/declarative/retrievers/simple_retriever.py CHANGED

@@ -1,8 +1,9 @@
 #
-# Copyright (c)
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
 #
 
 import json
+from collections import defaultdict
 from dataclasses import InitVar, dataclass, field
 from functools import partial
 from itertools import islice

@@ -12,6 +13,7 @@ from typing import (
     Iterable,
     List,
     Mapping,
+    MutableMapping,
     Optional,
     Set,
     Tuple,

@@ -31,6 +33,7 @@ from airbyte_cdk.sources.declarative.partition_routers.single_partition_router i
 )
 from airbyte_cdk.sources.declarative.requesters.paginators.no_pagination import NoPagination
 from airbyte_cdk.sources.declarative.requesters.paginators.paginator import Paginator
+from airbyte_cdk.sources.declarative.requesters.query_properties import QueryProperties
 from airbyte_cdk.sources.declarative.requesters.request_options import (
     DefaultRequestOptionsProvider,
     RequestOptionsProvider,

@@ -88,6 +91,7 @@ class SimpleRetriever(Retriever):
     )
     cursor: Optional[DeclarativeCursor] = None
     ignore_stream_slicer_parameters_on_paginated_requests: bool = False
+    additional_query_properties: Optional[QueryProperties] = None

     def __post_init__(self, parameters: Mapping[str, Any]) -> None:
         self._paginator = self.paginator or NoPagination(parameters=parameters)

@@ -445,43 +449,110 @@
         :param stream_slice: The stream slice to read data for
         :return: The records read from the API source
         """
-        _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check

-
-
-
-
-
-
+        property_chunks = (
+            list(
+                self.additional_query_properties.get_request_property_chunks(
+                    stream_slice=stream_slice
+                )
+            )
+            if self.additional_query_properties
+            else []
         )
+        records_without_merge_key = []
+        merged_records: MutableMapping[str, Any] = defaultdict(dict)

-
-
-
-        # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
-        # fetch more records. The platform deletes stream state for full refresh streams before starting a
-        # new job, so we don't need to worry about this value existing for the initial attempt
-        if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
-            return
+        _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
+        most_recent_record_from_slice = None

-
-
-
-
-
-
-
-
-
-
-
-
+        if self.additional_query_properties:
+            for properties in property_chunks:
+                _slice = StreamSlice(
+                    partition=_slice.partition or {},
+                    cursor_slice=_slice.cursor_slice or {},
+                    extra_fields={"query_properties": properties},
+                )  # None-check
+
+                record_generator = partial(
+                    self._parse_records,
+                    stream_slice=_slice,
+                    stream_state=self.state or {},
+                    records_schema=records_schema,
                 )
-                yield stream_data

+                for stream_data in self._read_pages(record_generator, self.state, _slice):
+                    current_record = self._extract_record(stream_data, _slice)
+                    if self.cursor and current_record:
+                        self.cursor.observe(_slice, current_record)
+
+                    # Latest record read, not necessarily within slice boundaries.
+                    # TODO Remove once all custom components implement `observe` method.
+                    # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
+                    most_recent_record_from_slice = self._get_most_recent_record(
+                        most_recent_record_from_slice, current_record, _slice
+                    )
+
+                    if current_record and self.additional_query_properties.property_chunking:
+                        merge_key = (
+                            self.additional_query_properties.property_chunking.get_merge_key(
+                                current_record
+                            )
+                        )
+                        if merge_key:
+                            merged_records[merge_key].update(current_record)
+                        else:
+                            # We should still emit records even if the record did not have a merge key
+                            records_without_merge_key.append(current_record)
+                    else:
+                        yield stream_data
                 if self.cursor:
                     self.cursor.close_slice(_slice, most_recent_record_from_slice)
-
+
+            if len(merged_records) > 0:
+                yield from [
+                    Record(data=merged_record, stream_name=self.name, associated_slice=stream_slice)
+                    for merged_record in merged_records.values()
+                ]
+            if len(records_without_merge_key) > 0:
+                yield from records_without_merge_key
+        else:
+            _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
+
+            most_recent_record_from_slice = None
+            record_generator = partial(
+                self._parse_records,
+                stream_slice=stream_slice,
+                stream_state=self.state or {},
+                records_schema=records_schema,
+            )
+
+            if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
+                stream_state = self.state
+
+                # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
+                # fetch more records. The platform deletes stream state for full refresh streams before starting a
+                # new job, so we don't need to worry about this value existing for the initial attempt
+                if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
+                    return
+
+                yield from self._read_single_page(record_generator, stream_state, _slice)
+            else:
+                for stream_data in self._read_pages(record_generator, self.state, _slice):
+                    current_record = self._extract_record(stream_data, _slice)
+                    if self.cursor and current_record:
+                        self.cursor.observe(_slice, current_record)
+
+                    # Latest record read, not necessarily within slice boundaries.
+                    # TODO Remove once all custom components implement `observe` method.
+                    # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
+                    most_recent_record_from_slice = self._get_most_recent_record(
+                        most_recent_record_from_slice, current_record, _slice
+                    )
+                    yield stream_data
+
+                if self.cursor:
+                    self.cursor.close_slice(_slice, most_recent_record_from_slice)
+            return

     def _get_most_recent_record(
         self,
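The merge step accumulates the partial records produced by each property chunk under their merge key, exactly like merged_records[merge_key].update(current_record) above. A minimal sketch with made-up data:

from collections import defaultdict

merged_records = defaultdict(dict)
partial_records = [
    ({"id": "42", "firstname": "Ada"}, "42"),          # from the first property chunk
    ({"id": "42", "email": "ada@example.com"}, "42"),  # from the second property chunk
]
for record, merge_key in partial_records:
    merged_records[merge_key].update(record)
print(dict(merged_records))
# {'42': {'id': '42', 'firstname': 'Ada', 'email': 'ada@example.com'}}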
airbyte_cdk/sources/declarative/schema/default_schema_loader.py CHANGED

@@ -37,7 +37,7 @@ class DefaultSchemaLoader(SchemaLoader):
 
         try:
             return self.default_loader.get_json_schema()
-        except OSError:
+        except (OSError, ValueError):
             # A slight hack since we don't directly have the stream name. However, when building the default filepath we assume the
             # runtime options stores stream name 'name' so we'll do the same here
             stream_name = self._parameters.get("name", "")
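Catching ValueError in addition to OSError matters because a schema file that exists but contains invalid JSON raises json.JSONDecodeError, a ValueError subclass, presumably from the default loader's JSON parsing; both failure modes now fall back to the empty default schema. A tiny illustration:

import json

try:
    json.loads("{not valid json")
except ValueError as err:  # json.JSONDecodeError subclasses ValueError
    print(type(err).__name__)  # JSONDecodeError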
airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py CHANGED

@@ -58,16 +58,11 @@ class DeclarativePartition(Partition):
     def read(self) -> Iterable[Record]:
         for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice):
             if isinstance(stream_data, Mapping):
-
-                    stream_data
-
-
-                        data=stream_data,
-                        stream_name=self.stream_name(),
-                        associated_slice=self._stream_slice,
-                    )
+                yield Record(
+                    data=stream_data,
+                    stream_name=self.stream_name(),
+                    associated_slice=self._stream_slice,
                 )
-                yield record
             else:
                 self._message_repository.emit_message(stream_data)
 