airbyte-cdk 6.6.0rc1__py3-none-any.whl → 6.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/__init__.py +10 -4
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +2 -1
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +58 -28
- airbyte_cdk/sources/declarative/declarative_component_schema.yaml +43 -0
- airbyte_cdk/sources/declarative/decoders/__init__.py +2 -2
- airbyte_cdk/sources/declarative/decoders/json_decoder.py +45 -12
- airbyte_cdk/sources/declarative/manifest_declarative_source.py +6 -3
- airbyte_cdk/sources/declarative/migrations/legacy_to_per_partition_state_migration.py +7 -3
- airbyte_cdk/sources/declarative/models/declarative_component_schema.py +45 -4
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +23 -3
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +85 -0
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer.py +6 -13
- airbyte_cdk/sources/file_based/stream/concurrent/adapters.py +0 -11
- airbyte_cdk/sources/streams/concurrent/adapters.py +4 -102
- airbyte_cdk/sources/streams/concurrent/cursor.py +50 -17
- airbyte_cdk/sources/streams/concurrent/partitions/partition.py +0 -15
- airbyte_cdk/sources/streams/concurrent/partitions/stream_slicer.py +21 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/abstract_stream_state_converter.py +7 -0
- airbyte_cdk/sources/streams/concurrent/state_converters/datetime_stream_state_converter.py +5 -1
- airbyte_cdk/utils/slice_hasher.py +30 -0
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.1.dist-info}/METADATA +5 -5
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.1.dist-info}/RECORD +24 -21
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.1.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.6.0rc1.dist-info → airbyte_cdk-6.6.1.dist-info}/WHEEL +0 -0
airbyte_cdk/__init__.py
CHANGED
@@ -282,12 +282,18 @@ __all__ = [
|
|
282
282
|
"StreamSlice",
|
283
283
|
]
|
284
284
|
|
285
|
-
__version__
|
286
|
-
"airbyte-cdk",
|
287
|
-
third_choice=_dunamai.Version.from_any_vcs,
|
288
|
-
).serialize()
|
285
|
+
__version__: str
|
289
286
|
"""Version generated by poetry dynamic versioning during publish.
|
290
287
|
|
291
288
|
When running in development, dunamai will calculate a new prerelease version
|
292
289
|
from existing git release tag info.
|
293
290
|
"""
|
291
|
+
|
292
|
+
try:
|
293
|
+
__version__ = _dunamai.get_version(
|
294
|
+
"airbyte-cdk",
|
295
|
+
third_choice=_dunamai.Version.from_any_vcs,
|
296
|
+
fallback=_dunamai.Version("0.0.0+dev"),
|
297
|
+
).serialize()
|
298
|
+
except:
|
299
|
+
__version__ = "0.0.0+dev"
|
@@ -114,7 +114,8 @@ class ConcurrentReadProcessor:
|
|
114
114
|
|
115
115
|
try:
|
116
116
|
if sentinel.is_successful:
|
117
|
-
partition.
|
117
|
+
stream = self._stream_name_to_instance[partition.stream_name()]
|
118
|
+
stream.cursor.close_partition(partition)
|
118
119
|
except Exception as exception:
|
119
120
|
self._flag_exception(partition.stream_name(), exception)
|
120
121
|
yield AirbyteTracedException.from_exception(
|
@@ -3,7 +3,7 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
import logging
|
6
|
-
from typing import Any, Generic, Iterator, List, Mapping, Optional, Tuple, Union
|
6
|
+
from typing import Any, Generic, Iterator, List, Mapping, Optional, Tuple, Union, Callable
|
7
7
|
|
8
8
|
from airbyte_cdk.models import (
|
9
9
|
AirbyteCatalog,
|
@@ -16,6 +16,9 @@ from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
|
16
16
|
from airbyte_cdk.sources.declarative.concurrency_level import ConcurrencyLevel
|
17
17
|
from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream
|
18
18
|
from airbyte_cdk.sources.declarative.extractors import RecordSelector
|
19
|
+
from airbyte_cdk.sources.declarative.extractors.record_filter import (
|
20
|
+
ClientSideIncrementalRecordFilterDecorator,
|
21
|
+
)
|
19
22
|
from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
|
20
23
|
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
|
21
24
|
from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
|
@@ -24,18 +27,24 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
|
|
24
27
|
)
|
25
28
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
26
29
|
DatetimeBasedCursor as DatetimeBasedCursorModel,
|
30
|
+
DeclarativeStream as DeclarativeStreamModel,
|
27
31
|
)
|
28
32
|
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
|
29
33
|
ModelToComponentFactory,
|
34
|
+
ComponentDefinition,
|
30
35
|
)
|
31
36
|
from airbyte_cdk.sources.declarative.requesters import HttpRequester
|
32
|
-
from airbyte_cdk.sources.declarative.retrievers import SimpleRetriever
|
37
|
+
from airbyte_cdk.sources.declarative.retrievers import SimpleRetriever, Retriever
|
38
|
+
from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
|
39
|
+
DeclarativePartitionFactory,
|
40
|
+
StreamSlicerPartitionGenerator,
|
41
|
+
)
|
33
42
|
from airbyte_cdk.sources.declarative.transformations.add_fields import AddFields
|
34
43
|
from airbyte_cdk.sources.declarative.types import ConnectionDefinition
|
35
44
|
from airbyte_cdk.sources.source import TState
|
45
|
+
from airbyte_cdk.sources.types import Config, StreamState
|
36
46
|
from airbyte_cdk.sources.streams import Stream
|
37
47
|
from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
|
38
|
-
from airbyte_cdk.sources.streams.concurrent.adapters import CursorPartitionGenerator
|
39
48
|
from airbyte_cdk.sources.streams.concurrent.availability_strategy import (
|
40
49
|
AlwaysAvailableAvailabilityStrategy,
|
41
50
|
)
|
@@ -210,31 +219,18 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
|
|
210
219
|
)
|
211
220
|
)
|
212
221
|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
if declarative_stream.retriever.cursor:
|
226
|
-
declarative_stream.retriever.cursor.set_initial_state(
|
227
|
-
stream_state=stream_state
|
228
|
-
)
|
229
|
-
declarative_stream.retriever.cursor = None
|
230
|
-
|
231
|
-
partition_generator = CursorPartitionGenerator(
|
232
|
-
stream=declarative_stream,
|
233
|
-
message_repository=self.message_repository, # type: ignore # message_repository is always instantiated with a value by factory
|
234
|
-
cursor=cursor,
|
235
|
-
connector_state_converter=connector_state_converter,
|
236
|
-
cursor_field=[cursor.cursor_field.cursor_field_key],
|
237
|
-
slice_boundary_fields=cursor.slice_boundary_fields,
|
222
|
+
partition_generator = StreamSlicerPartitionGenerator(
|
223
|
+
DeclarativePartitionFactory(
|
224
|
+
declarative_stream.name,
|
225
|
+
declarative_stream.get_json_schema(),
|
226
|
+
self._retriever_factory(
|
227
|
+
name_to_stream_mapping[declarative_stream.name],
|
228
|
+
config,
|
229
|
+
stream_state,
|
230
|
+
),
|
231
|
+
self.message_repository,
|
232
|
+
),
|
233
|
+
cursor,
|
238
234
|
)
|
239
235
|
|
240
236
|
concurrent_streams.append(
|
@@ -291,6 +287,9 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
|
|
291
287
|
if isinstance(record_selector, RecordSelector):
|
292
288
|
if (
|
293
289
|
record_selector.record_filter
|
290
|
+
and not isinstance(
|
291
|
+
record_selector.record_filter, ClientSideIncrementalRecordFilterDecorator
|
292
|
+
)
|
294
293
|
and "stream_state" in record_selector.record_filter.condition
|
295
294
|
):
|
296
295
|
self.logger.warning(
|
@@ -344,3 +343,34 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
|
|
344
343
|
if stream.stream.name not in concurrent_stream_names
|
345
344
|
]
|
346
345
|
)
|
346
|
+
|
347
|
+
def _retriever_factory(
|
348
|
+
self, stream_config: ComponentDefinition, source_config: Config, stream_state: StreamState
|
349
|
+
) -> Callable[[], Retriever]:
|
350
|
+
def _factory_method() -> Retriever:
|
351
|
+
declarative_stream: DeclarativeStream = self._constructor.create_component(
|
352
|
+
DeclarativeStreamModel,
|
353
|
+
stream_config,
|
354
|
+
source_config,
|
355
|
+
emit_connector_builder_messages=self._emit_connector_builder_messages,
|
356
|
+
)
|
357
|
+
|
358
|
+
# This is an optimization so that we don't invoke any cursor or state management flows within the
|
359
|
+
# low-code framework because state management is handled through the ConcurrentCursor.
|
360
|
+
if (
|
361
|
+
declarative_stream
|
362
|
+
and declarative_stream.retriever
|
363
|
+
and isinstance(declarative_stream.retriever, SimpleRetriever)
|
364
|
+
):
|
365
|
+
# Also a temporary hack. In the legacy Stream implementation, as part of the read, set_initial_state() is
|
366
|
+
# called to instantiate incoming state on the cursor. Although we no longer rely on the legacy low-code cursor
|
367
|
+
# for concurrent checkpointing, low-code components like StopConditionPaginationStrategyDecorator and
|
368
|
+
# ClientSideIncrementalRecordFilterDecorator still rely on a DatetimeBasedCursor that is properly initialized
|
369
|
+
# with state.
|
370
|
+
if declarative_stream.retriever.cursor:
|
371
|
+
declarative_stream.retriever.cursor.set_initial_state(stream_state=stream_state)
|
372
|
+
declarative_stream.retriever.cursor = None
|
373
|
+
|
374
|
+
return declarative_stream.retriever
|
375
|
+
|
376
|
+
return _factory_method
|
@@ -1750,6 +1750,45 @@ definitions:
|
|
1750
1750
|
type:
|
1751
1751
|
type: string
|
1752
1752
|
enum: [XmlDecoder]
|
1753
|
+
CustomDecoder:
|
1754
|
+
title: Custom Decoder
|
1755
|
+
description: Use this to implement custom decoder logic.
|
1756
|
+
type: object
|
1757
|
+
additionalProperties: true
|
1758
|
+
required:
|
1759
|
+
- type
|
1760
|
+
- class_name
|
1761
|
+
properties:
|
1762
|
+
type:
|
1763
|
+
type: string
|
1764
|
+
enum: [CustomDecoder]
|
1765
|
+
class_name:
|
1766
|
+
title: Class Name
|
1767
|
+
description: Fully-qualified name of the class that will be implementing the custom decoding. Has to be a sub class of Decoder. The format is `source_<name>.<package>.<class_name>`.
|
1768
|
+
type: string
|
1769
|
+
additionalProperties: true
|
1770
|
+
examples:
|
1771
|
+
- "source_amazon_ads.components.GzipJsonlDecoder"
|
1772
|
+
$parameters:
|
1773
|
+
type: object
|
1774
|
+
additionalProperties: true
|
1775
|
+
GzipJsonDecoder:
|
1776
|
+
title: GzipJson Decoder
|
1777
|
+
description: Use this if the response is Gzip compressed Json.
|
1778
|
+
type: object
|
1779
|
+
additionalProperties: true
|
1780
|
+
required:
|
1781
|
+
- type
|
1782
|
+
properties:
|
1783
|
+
type:
|
1784
|
+
type: string
|
1785
|
+
enum: [GzipJsonDecoder]
|
1786
|
+
encoding:
|
1787
|
+
type: string
|
1788
|
+
default: utf-8
|
1789
|
+
$parameters:
|
1790
|
+
type: object
|
1791
|
+
additionalProperties: true
|
1753
1792
|
ListPartitionRouter:
|
1754
1793
|
title: List Partition Router
|
1755
1794
|
description: A Partition router that specifies a list of attributes where each attribute describes a portion of the complete data set for a stream. During a sync, each value is iterated over and can be used as input to outbound API requests.
|
@@ -2404,10 +2443,12 @@ definitions:
|
|
2404
2443
|
title: Decoder
|
2405
2444
|
description: Component decoding the response so records can be extracted.
|
2406
2445
|
anyOf:
|
2446
|
+
- "$ref": "#/definitions/CustomDecoder"
|
2407
2447
|
- "$ref": "#/definitions/JsonDecoder"
|
2408
2448
|
- "$ref": "#/definitions/JsonlDecoder"
|
2409
2449
|
- "$ref": "#/definitions/IterableDecoder"
|
2410
2450
|
- "$ref": "#/definitions/XmlDecoder"
|
2451
|
+
- "$ref": "#/definitions/GzipJsonDecoder"
|
2411
2452
|
$parameters:
|
2412
2453
|
type: object
|
2413
2454
|
additionalProperties: true
|
@@ -2520,10 +2561,12 @@ definitions:
|
|
2520
2561
|
title: Decoder
|
2521
2562
|
description: Component decoding the response so records can be extracted.
|
2522
2563
|
anyOf:
|
2564
|
+
- "$ref": "#/definitions/CustomDecoder"
|
2523
2565
|
- "$ref": "#/definitions/JsonDecoder"
|
2524
2566
|
- "$ref": "#/definitions/JsonlDecoder"
|
2525
2567
|
- "$ref": "#/definitions/IterableDecoder"
|
2526
2568
|
- "$ref": "#/definitions/XmlDecoder"
|
2569
|
+
- "$ref": "#/definitions/GzipJsonDecoder"
|
2527
2570
|
$parameters:
|
2528
2571
|
type: object
|
2529
2572
|
additionalProperties: true
|
@@ -3,9 +3,9 @@
|
|
3
3
|
#
|
4
4
|
|
5
5
|
from airbyte_cdk.sources.declarative.decoders.decoder import Decoder
|
6
|
-
from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder, JsonlDecoder, IterableDecoder
|
6
|
+
from airbyte_cdk.sources.declarative.decoders.json_decoder import JsonDecoder, JsonlDecoder, IterableDecoder, GzipJsonDecoder
|
7
7
|
from airbyte_cdk.sources.declarative.decoders.noop_decoder import NoopDecoder
|
8
8
|
from airbyte_cdk.sources.declarative.decoders.pagination_decoder_decorator import PaginationDecoderDecorator
|
9
9
|
from airbyte_cdk.sources.declarative.decoders.xml_decoder import XmlDecoder
|
10
10
|
|
11
|
-
__all__ = ["Decoder", "JsonDecoder", "JsonlDecoder", "IterableDecoder", "NoopDecoder", "PaginationDecoderDecorator", "XmlDecoder"]
|
11
|
+
__all__ = ["Decoder", "JsonDecoder", "JsonlDecoder", "IterableDecoder", "GzipJsonDecoder", "NoopDecoder", "PaginationDecoderDecorator", "XmlDecoder"]
|
@@ -1,14 +1,15 @@
|
|
1
1
|
#
|
2
2
|
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
|
3
3
|
#
|
4
|
-
|
4
|
+
import codecs
|
5
5
|
import logging
|
6
6
|
from dataclasses import InitVar, dataclass
|
7
|
-
from
|
7
|
+
from gzip import decompress
|
8
|
+
from typing import Any, Generator, Mapping, MutableMapping, List, Optional
|
8
9
|
|
9
10
|
import requests
|
10
11
|
from airbyte_cdk.sources.declarative.decoders.decoder import Decoder
|
11
|
-
|
12
|
+
import orjson
|
12
13
|
|
13
14
|
logger = logging.getLogger("airbyte")
|
14
15
|
|
@@ -24,24 +25,32 @@ class JsonDecoder(Decoder):
|
|
24
25
|
def is_stream_response(self) -> bool:
|
25
26
|
return False
|
26
27
|
|
27
|
-
def decode(
|
28
|
+
def decode(
|
29
|
+
self, response: requests.Response
|
30
|
+
) -> Generator[MutableMapping[str, Any], None, None]:
|
28
31
|
"""
|
29
32
|
Given the response is an empty string or an emtpy list, the function will return a generator with an empty mapping.
|
30
33
|
"""
|
31
34
|
try:
|
32
35
|
body_json = response.json()
|
33
|
-
|
34
|
-
body_json = [body_json]
|
35
|
-
if len(body_json) == 0:
|
36
|
-
yield {}
|
37
|
-
else:
|
38
|
-
yield from body_json
|
36
|
+
yield from self.parse_body_json(body_json)
|
39
37
|
except requests.exceptions.JSONDecodeError:
|
40
38
|
logger.warning(
|
41
39
|
f"Response cannot be parsed into json: {response.status_code=}, {response.text=}"
|
42
40
|
)
|
43
41
|
yield {}
|
44
42
|
|
43
|
+
@staticmethod
|
44
|
+
def parse_body_json(
|
45
|
+
body_json: MutableMapping[str, Any] | List[MutableMapping[str, Any]],
|
46
|
+
) -> Generator[MutableMapping[str, Any], None, None]:
|
47
|
+
if not isinstance(body_json, list):
|
48
|
+
body_json = [body_json]
|
49
|
+
if len(body_json) == 0:
|
50
|
+
yield {}
|
51
|
+
else:
|
52
|
+
yield from body_json
|
53
|
+
|
45
54
|
|
46
55
|
@dataclass
|
47
56
|
class IterableDecoder(Decoder):
|
@@ -54,7 +63,9 @@ class IterableDecoder(Decoder):
|
|
54
63
|
def is_stream_response(self) -> bool:
|
55
64
|
return True
|
56
65
|
|
57
|
-
def decode(
|
66
|
+
def decode(
|
67
|
+
self, response: requests.Response
|
68
|
+
) -> Generator[MutableMapping[str, Any], None, None]:
|
58
69
|
for line in response.iter_lines():
|
59
70
|
yield {"record": line.decode()}
|
60
71
|
|
@@ -70,8 +81,30 @@ class JsonlDecoder(Decoder):
|
|
70
81
|
def is_stream_response(self) -> bool:
|
71
82
|
return True
|
72
83
|
|
73
|
-
def decode(
|
84
|
+
def decode(
|
85
|
+
self, response: requests.Response
|
86
|
+
) -> Generator[MutableMapping[str, Any], None, None]:
|
74
87
|
# TODO???: set delimiter? usually it is `\n` but maybe it would be useful to set optional?
|
75
88
|
# https://github.com/airbytehq/airbyte-internal-issues/issues/8436
|
76
89
|
for record in response.iter_lines():
|
77
90
|
yield orjson.loads(record)
|
91
|
+
|
92
|
+
|
93
|
+
@dataclass
|
94
|
+
class GzipJsonDecoder(JsonDecoder):
|
95
|
+
encoding: Optional[str]
|
96
|
+
|
97
|
+
def __post_init__(self, parameters: Mapping[str, Any]) -> None:
|
98
|
+
if self.encoding:
|
99
|
+
try:
|
100
|
+
codecs.lookup(self.encoding)
|
101
|
+
except LookupError:
|
102
|
+
raise ValueError(
|
103
|
+
f"Invalid encoding '{self.encoding}'. Please check provided encoding"
|
104
|
+
)
|
105
|
+
|
106
|
+
def decode(
|
107
|
+
self, response: requests.Response
|
108
|
+
) -> Generator[MutableMapping[str, Any], None, None]:
|
109
|
+
raw_string = decompress(response.content).decode(encoding=self.encoding or "utf-8")
|
110
|
+
yield from self.parse_body_json(orjson.loads(raw_string))
|
@@ -8,7 +8,7 @@ import pkgutil
|
|
8
8
|
import re
|
9
9
|
from copy import deepcopy
|
10
10
|
from importlib import metadata
|
11
|
-
from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple
|
11
|
+
from typing import Any, Dict, Iterator, List, Mapping, Optional, Tuple
|
12
12
|
|
13
13
|
import yaml
|
14
14
|
from airbyte_cdk.models import (
|
@@ -94,7 +94,7 @@ class ManifestDeclarativeSource(DeclarativeSource):
|
|
94
94
|
return self._source_config
|
95
95
|
|
96
96
|
@property
|
97
|
-
def message_repository(self) ->
|
97
|
+
def message_repository(self) -> MessageRepository:
|
98
98
|
return self._message_repository
|
99
99
|
|
100
100
|
@property
|
@@ -256,7 +256,10 @@ class ManifestDeclarativeSource(DeclarativeSource):
|
|
256
256
|
manifest_version, "manifest"
|
257
257
|
)
|
258
258
|
|
259
|
-
if
|
259
|
+
if cdk_version.startswith("0.0.0"):
|
260
|
+
# Skipping version compatibility check on unreleased dev branch
|
261
|
+
pass
|
262
|
+
elif cdk_major < manifest_major or (
|
260
263
|
cdk_major == manifest_major and cdk_minor < manifest_minor
|
261
264
|
):
|
262
265
|
raise ValidationError(
|
@@ -4,7 +4,11 @@ from typing import Any, Mapping
|
|
4
4
|
|
5
5
|
from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString
|
6
6
|
from airbyte_cdk.sources.declarative.migrations.state_migration import StateMigration
|
7
|
-
from airbyte_cdk.sources.declarative.models import
|
7
|
+
from airbyte_cdk.sources.declarative.models import (
|
8
|
+
DatetimeBasedCursor,
|
9
|
+
SubstreamPartitionRouter,
|
10
|
+
CustomIncrementalSync,
|
11
|
+
)
|
8
12
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import ParentStreamConfig
|
9
13
|
|
10
14
|
|
@@ -32,7 +36,7 @@ class LegacyToPerPartitionStateMigration(StateMigration):
|
|
32
36
|
def __init__(
|
33
37
|
self,
|
34
38
|
partition_router: SubstreamPartitionRouter,
|
35
|
-
cursor: DatetimeBasedCursor,
|
39
|
+
cursor: CustomIncrementalSync | DatetimeBasedCursor,
|
36
40
|
config: Mapping[str, Any],
|
37
41
|
parameters: Mapping[str, Any],
|
38
42
|
):
|
@@ -64,7 +68,7 @@ class LegacyToPerPartitionStateMigration(StateMigration):
|
|
64
68
|
return False
|
65
69
|
|
66
70
|
# There is exactly one parent stream
|
67
|
-
number_of_parent_streams = len(self._partition_router.parent_stream_configs)
|
71
|
+
number_of_parent_streams = len(self._partition_router.parent_stream_configs) # type: ignore # custom partition will introduce this attribute if needed
|
68
72
|
if number_of_parent_streams != 1:
|
69
73
|
# There should be exactly one parent stream
|
70
74
|
return False
|
@@ -4,10 +4,9 @@
|
|
4
4
|
from __future__ import annotations
|
5
5
|
|
6
6
|
from enum import Enum
|
7
|
-
from typing import Any, Dict, List, Optional, Union
|
7
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
8
8
|
|
9
9
|
from pydantic.v1 import BaseModel, Extra, Field
|
10
|
-
from typing_extensions import Literal
|
11
10
|
|
12
11
|
|
13
12
|
class AuthFlowType(Enum):
|
@@ -632,6 +631,7 @@ class HttpResponseFilter(BaseModel):
|
|
632
631
|
description="Match the response if its HTTP code is included in this list.",
|
633
632
|
examples=[[420, 429], [500]],
|
634
633
|
title="HTTP Codes",
|
634
|
+
unique_items=True,
|
635
635
|
)
|
636
636
|
predicate: Optional[str] = Field(
|
637
637
|
None,
|
@@ -687,6 +687,29 @@ class XmlDecoder(BaseModel):
|
|
687
687
|
type: Literal["XmlDecoder"]
|
688
688
|
|
689
689
|
|
690
|
+
class CustomDecoder(BaseModel):
|
691
|
+
class Config:
|
692
|
+
extra = Extra.allow
|
693
|
+
|
694
|
+
type: Literal["CustomDecoder"]
|
695
|
+
class_name: str = Field(
|
696
|
+
...,
|
697
|
+
description="Fully-qualified name of the class that will be implementing the custom decoding. Has to be a sub class of Decoder. The format is `source_<name>.<package>.<class_name>`.",
|
698
|
+
examples=["source_amazon_ads.components.GzipJsonlDecoder"],
|
699
|
+
title="Class Name",
|
700
|
+
)
|
701
|
+
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
702
|
+
|
703
|
+
|
704
|
+
class GzipJsonDecoder(BaseModel):
|
705
|
+
class Config:
|
706
|
+
extra = Extra.allow
|
707
|
+
|
708
|
+
type: Literal["GzipJsonDecoder"]
|
709
|
+
encoding: Optional[str] = "utf-8"
|
710
|
+
parameters: Optional[Dict[str, Any]] = Field(None, alias="$parameters")
|
711
|
+
|
712
|
+
|
690
713
|
class MinMaxDatetime(BaseModel):
|
691
714
|
type: Literal["MinMaxDatetime"]
|
692
715
|
datetime: str = Field(
|
@@ -1620,7 +1643,16 @@ class SimpleRetriever(BaseModel):
|
|
1620
1643
|
description="PartitionRouter component that describes how to partition the stream, enabling incremental syncs and checkpointing.",
|
1621
1644
|
title="Partition Router",
|
1622
1645
|
)
|
1623
|
-
decoder: Optional[
|
1646
|
+
decoder: Optional[
|
1647
|
+
Union[
|
1648
|
+
CustomDecoder,
|
1649
|
+
JsonDecoder,
|
1650
|
+
JsonlDecoder,
|
1651
|
+
IterableDecoder,
|
1652
|
+
XmlDecoder,
|
1653
|
+
GzipJsonDecoder,
|
1654
|
+
]
|
1655
|
+
] = Field(
|
1624
1656
|
None,
|
1625
1657
|
description="Component decoding the response so records can be extracted.",
|
1626
1658
|
title="Decoder",
|
@@ -1680,7 +1712,16 @@ class AsyncRetriever(BaseModel):
|
|
1680
1712
|
description="PartitionRouter component that describes how to partition the stream, enabling incremental syncs and checkpointing.",
|
1681
1713
|
title="Partition Router",
|
1682
1714
|
)
|
1683
|
-
decoder: Optional[
|
1715
|
+
decoder: Optional[
|
1716
|
+
Union[
|
1717
|
+
CustomDecoder,
|
1718
|
+
JsonDecoder,
|
1719
|
+
JsonlDecoder,
|
1720
|
+
IterableDecoder,
|
1721
|
+
XmlDecoder,
|
1722
|
+
GzipJsonDecoder,
|
1723
|
+
]
|
1724
|
+
] = Field(
|
1684
1725
|
None,
|
1685
1726
|
description="Component decoding the response so records can be extracted.",
|
1686
1727
|
title="Decoder",
|
@@ -58,6 +58,7 @@ from airbyte_cdk.sources.declarative.datetime import MinMaxDatetime
|
|
58
58
|
from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream
|
59
59
|
from airbyte_cdk.sources.declarative.decoders import (
|
60
60
|
Decoder,
|
61
|
+
GzipJsonDecoder,
|
61
62
|
IterableDecoder,
|
62
63
|
JsonDecoder,
|
63
64
|
JsonlDecoder,
|
@@ -134,6 +135,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
|
|
134
135
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
135
136
|
CustomBackoffStrategy as CustomBackoffStrategyModel,
|
136
137
|
)
|
138
|
+
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
139
|
+
CustomDecoder as CustomDecoderModel,
|
140
|
+
)
|
137
141
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
138
142
|
CustomErrorHandler as CustomErrorHandlerModel,
|
139
143
|
)
|
@@ -182,6 +186,9 @@ from airbyte_cdk.sources.declarative.models.declarative_component_schema import
|
|
182
186
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
183
187
|
ExponentialBackoffStrategy as ExponentialBackoffStrategyModel,
|
184
188
|
)
|
189
|
+
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
190
|
+
GzipJsonDecoder as GzipJsonDecoderModel,
|
191
|
+
)
|
185
192
|
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
|
186
193
|
HttpRequester as HttpRequesterModel,
|
187
194
|
)
|
@@ -402,6 +409,7 @@ class ModelToComponentFactory:
|
|
402
409
|
CursorPaginationModel: self.create_cursor_pagination,
|
403
410
|
CustomAuthenticatorModel: self.create_custom_component,
|
404
411
|
CustomBackoffStrategyModel: self.create_custom_component,
|
412
|
+
CustomDecoderModel: self.create_custom_component,
|
405
413
|
CustomErrorHandlerModel: self.create_custom_component,
|
406
414
|
CustomIncrementalSyncModel: self.create_custom_component,
|
407
415
|
CustomRecordExtractorModel: self.create_custom_component,
|
@@ -425,6 +433,7 @@ class ModelToComponentFactory:
|
|
425
433
|
InlineSchemaLoaderModel: self.create_inline_schema_loader,
|
426
434
|
JsonDecoderModel: self.create_json_decoder,
|
427
435
|
JsonlDecoderModel: self.create_jsonl_decoder,
|
436
|
+
GzipJsonDecoderModel: self.create_gzipjson_decoder,
|
428
437
|
KeysToLowerModel: self.create_keys_to_lower_transformation,
|
429
438
|
IterableDecoderModel: self.create_iterable_decoder,
|
430
439
|
XmlDecoderModel: self.create_xml_decoder,
|
@@ -619,11 +628,16 @@ class ModelToComponentFactory:
|
|
619
628
|
"LegacyToPerPartitionStateMigrations can only be applied with a parent stream configuration."
|
620
629
|
)
|
621
630
|
|
631
|
+
if not hasattr(declarative_stream, "incremental_sync"):
|
632
|
+
raise ValueError(
|
633
|
+
"LegacyToPerPartitionStateMigrations can only be applied with an incremental_sync configuration."
|
634
|
+
)
|
635
|
+
|
622
636
|
return LegacyToPerPartitionStateMigration(
|
623
|
-
|
624
|
-
declarative_stream.incremental_sync,
|
637
|
+
partition_router, # type: ignore # was already checked above
|
638
|
+
declarative_stream.incremental_sync, # type: ignore # was already checked. Migration can be applied only to incremental streams.
|
625
639
|
config,
|
626
|
-
declarative_stream.parameters,
|
640
|
+
declarative_stream.parameters, # type: ignore # different type is expected here Mapping[str, Any], got Dict[str, Any]
|
627
641
|
) # type: ignore # The retriever type was already checked
|
628
642
|
|
629
643
|
def create_session_token_authenticator(
|
@@ -1548,6 +1562,12 @@ class ModelToComponentFactory:
|
|
1548
1562
|
def create_xml_decoder(model: XmlDecoderModel, config: Config, **kwargs: Any) -> XmlDecoder:
|
1549
1563
|
return XmlDecoder(parameters={})
|
1550
1564
|
|
1565
|
+
@staticmethod
|
1566
|
+
def create_gzipjson_decoder(
|
1567
|
+
model: GzipJsonDecoderModel, config: Config, **kwargs: Any
|
1568
|
+
) -> GzipJsonDecoder:
|
1569
|
+
return GzipJsonDecoder(parameters={}, encoding=model.encoding)
|
1570
|
+
|
1551
1571
|
@staticmethod
|
1552
1572
|
def create_json_file_schema_loader(
|
1553
1573
|
model: JsonFileSchemaLoaderModel, config: Config, **kwargs: Any
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
|
2
|
+
|
3
|
+
from typing import Iterable, Optional, Mapping, Any, Callable
|
4
|
+
|
5
|
+
from airbyte_cdk.sources.declarative.retrievers import Retriever
|
6
|
+
from airbyte_cdk.sources.message import MessageRepository
|
7
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
|
8
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
|
9
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.record import Record
|
10
|
+
from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
|
11
|
+
from airbyte_cdk.sources.types import StreamSlice
|
12
|
+
from airbyte_cdk.utils.slice_hasher import SliceHasher
|
13
|
+
|
14
|
+
|
15
|
+
class DeclarativePartitionFactory:
|
16
|
+
def __init__(
|
17
|
+
self,
|
18
|
+
stream_name: str,
|
19
|
+
json_schema: Mapping[str, Any],
|
20
|
+
retriever_factory: Callable[[], Retriever],
|
21
|
+
message_repository: MessageRepository,
|
22
|
+
) -> None:
|
23
|
+
"""
|
24
|
+
The DeclarativePartitionFactory takes a retriever_factory and not a retriever directly. The reason is that our components are not
|
25
|
+
thread safe and classes like `DefaultPaginator` may not work because multiple threads can access and modify a shared field across each other.
|
26
|
+
In order to avoid these problems, we will create one retriever per thread which should make the processing thread-safe.
|
27
|
+
"""
|
28
|
+
self._stream_name = stream_name
|
29
|
+
self._json_schema = json_schema
|
30
|
+
self._retriever_factory = retriever_factory
|
31
|
+
self._message_repository = message_repository
|
32
|
+
|
33
|
+
def create(self, stream_slice: StreamSlice) -> Partition:
|
34
|
+
return DeclarativePartition(
|
35
|
+
self._stream_name,
|
36
|
+
self._json_schema,
|
37
|
+
self._retriever_factory(),
|
38
|
+
self._message_repository,
|
39
|
+
stream_slice,
|
40
|
+
)
|
41
|
+
|
42
|
+
|
43
|
+
class DeclarativePartition(Partition):
|
44
|
+
def __init__(
|
45
|
+
self,
|
46
|
+
stream_name: str,
|
47
|
+
json_schema: Mapping[str, Any],
|
48
|
+
retriever: Retriever,
|
49
|
+
message_repository: MessageRepository,
|
50
|
+
stream_slice: StreamSlice,
|
51
|
+
):
|
52
|
+
self._stream_name = stream_name
|
53
|
+
self._json_schema = json_schema
|
54
|
+
self._retriever = retriever
|
55
|
+
self._message_repository = message_repository
|
56
|
+
self._stream_slice = stream_slice
|
57
|
+
self._hash = SliceHasher.hash(self._stream_name, self._stream_slice)
|
58
|
+
|
59
|
+
def read(self) -> Iterable[Record]:
|
60
|
+
for stream_data in self._retriever.read_records(self._json_schema, self._stream_slice):
|
61
|
+
if isinstance(stream_data, Mapping):
|
62
|
+
yield Record(stream_data, self)
|
63
|
+
else:
|
64
|
+
self._message_repository.emit_message(stream_data)
|
65
|
+
|
66
|
+
def to_slice(self) -> Optional[Mapping[str, Any]]:
|
67
|
+
return self._stream_slice
|
68
|
+
|
69
|
+
def stream_name(self) -> str:
|
70
|
+
return self._stream_name
|
71
|
+
|
72
|
+
def __hash__(self) -> int:
|
73
|
+
return self._hash
|
74
|
+
|
75
|
+
|
76
|
+
class StreamSlicerPartitionGenerator(PartitionGenerator):
|
77
|
+
def __init__(
|
78
|
+
self, partition_factory: DeclarativePartitionFactory, stream_slicer: StreamSlicer
|
79
|
+
) -> None:
|
80
|
+
self._partition_factory = partition_factory
|
81
|
+
self._stream_slicer = stream_slicer
|
82
|
+
|
83
|
+
def generate(self) -> Iterable[Partition]:
|
84
|
+
for stream_slice in self._stream_slicer.stream_slices():
|
85
|
+
yield self._partition_factory.create(stream_slice)
|