airbyte-cdk 6.60.16__py3-none-any.whl → 6.60.16.post40.dev17219503797__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +32 -36
- airbyte_cdk/connector_builder/main.py +3 -3
- airbyte_cdk/connector_builder/test_reader/helpers.py +24 -2
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +1 -1
- airbyte_cdk/manifest_server/Dockerfile +45 -0
- airbyte_cdk/manifest_server/README.md +142 -0
- airbyte_cdk/manifest_server/__init__.py +3 -0
- airbyte_cdk/manifest_server/api_models/__init__.py +41 -0
- airbyte_cdk/manifest_server/api_models/capabilities.py +7 -0
- airbyte_cdk/manifest_server/api_models/dicts.py +17 -0
- airbyte_cdk/manifest_server/api_models/manifest.py +73 -0
- airbyte_cdk/manifest_server/api_models/stream.py +76 -0
- airbyte_cdk/manifest_server/app.py +17 -0
- airbyte_cdk/manifest_server/auth.py +43 -0
- airbyte_cdk/manifest_server/cli/__init__.py +5 -0
- airbyte_cdk/manifest_server/cli/_common.py +28 -0
- airbyte_cdk/manifest_server/cli/_info.py +30 -0
- airbyte_cdk/manifest_server/cli/_openapi.py +43 -0
- airbyte_cdk/manifest_server/cli/_start.py +38 -0
- airbyte_cdk/manifest_server/cli/run.py +59 -0
- airbyte_cdk/manifest_server/command_processor/__init__.py +0 -0
- airbyte_cdk/manifest_server/command_processor/processor.py +151 -0
- airbyte_cdk/manifest_server/command_processor/utils.py +76 -0
- airbyte_cdk/manifest_server/main.py +24 -0
- airbyte_cdk/manifest_server/openapi.yaml +641 -0
- airbyte_cdk/manifest_server/routers/__init__.py +0 -0
- airbyte_cdk/manifest_server/routers/capabilities.py +25 -0
- airbyte_cdk/manifest_server/routers/health.py +13 -0
- airbyte_cdk/manifest_server/routers/manifest.py +137 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +15 -22
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +30 -18
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +73 -3
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +4 -0
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +42 -4
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py +2 -2
- airbyte_cdk/sources/message/concurrent_repository.py +47 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +23 -7
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +46 -5
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +7 -1
- airbyte_cdk/sources/streams/http/http_client.py +4 -1
- airbyte_cdk/sources/utils/slice_logger.py +4 -0
- {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/METADATA +4 -1
- {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/RECORD +47 -21
- {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/entry_points.txt +1 -0
- {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.60.16.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/WHEEL +0 -0
airbyte_cdk/manifest_server/routers/manifest.py

@@ -0,0 +1,137 @@
+import hashlib
+from dataclasses import asdict
+from typing import Any, Dict, List, Mapping, Optional
+
+import jsonschema
+from fastapi import APIRouter, Depends, HTTPException
+
+from airbyte_cdk.manifest_server.api_models.manifest import (
+    CheckRequest,
+    CheckResponse,
+    DiscoverRequest,
+    DiscoverResponse,
+)
+from airbyte_cdk.models import AirbyteStateMessageSerializer
+from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
+from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
+    INJECTED_COMPONENTS_PY,
+    INJECTED_COMPONENTS_PY_CHECKSUMS,
+)
+
+from ..api_models import (
+    FullResolveRequest,
+    Manifest,
+    ManifestResponse,
+    ResolveRequest,
+    StreamRead,
+    StreamTestReadRequest,
+)
+from ..auth import verify_jwt_token
+from ..command_processor.processor import ManifestCommandProcessor
+from ..command_processor.utils import build_catalog, build_source
+
+
+def safe_build_source(
+    manifest_dict: Mapping[str, Any],
+    config_dict: Mapping[str, Any],
+    page_limit: Optional[int] = None,
+    slice_limit: Optional[int] = None,
+) -> ManifestDeclarativeSource:
+    """Wrapper around build_source that converts ValidationError to HTTPException."""
+    try:
+        return build_source(manifest_dict, config_dict, page_limit, slice_limit)
+    except jsonschema.exceptions.ValidationError as e:
+        raise HTTPException(status_code=400, detail=f"Invalid manifest: {e.message}")
+
+
+router = APIRouter(
+    prefix="/manifest",
+    tags=["manifest"],
+    dependencies=[Depends(verify_jwt_token)],
+)
+
+
+@router.post("/test_read", operation_id="testRead")
+def test_read(request: StreamTestReadRequest) -> StreamRead:
+    """
+    Test reading from a specific stream in the manifest.
+    """
+    config_dict = request.config.model_dump()
+
+    source = safe_build_source(
+        request.manifest.model_dump(), config_dict, request.page_limit, request.slice_limit
+    )
+    catalog = build_catalog(request.stream_name)
+    state = [AirbyteStateMessageSerializer.load(state) for state in request.state]
+
+    if request.custom_components_code:
+        config_dict[INJECTED_COMPONENTS_PY] = request.custom_components_code
+        config_dict[INJECTED_COMPONENTS_PY_CHECKSUMS] = {
+            "md5": hashlib.md5(request.custom_components_code.encode()).hexdigest()
+        }
+
+    runner = ManifestCommandProcessor(source)
+    cdk_result = runner.test_read(
+        config_dict,
+        catalog,
+        state,
+        request.record_limit,
+        request.page_limit,
+        request.slice_limit,
+    )
+    return StreamRead.model_validate(asdict(cdk_result))
+
+
+@router.post("/check", operation_id="check")
+def check(request: CheckRequest) -> CheckResponse:
+    """Check configuration against a manifest"""
+    source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
+    runner = ManifestCommandProcessor(source)
+    success, message = runner.check_connection(request.config.model_dump())
+    return CheckResponse(success=success, message=message)
+
+
+@router.post("/discover", operation_id="discover")
+def discover(request: DiscoverRequest) -> DiscoverResponse:
+    """Discover streams from a manifest"""
+    source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
+    runner = ManifestCommandProcessor(source)
+    catalog = runner.discover(request.config.model_dump())
+    if catalog is None:
+        raise HTTPException(status_code=422, detail="Connector did not return a discovered catalog")
+    return DiscoverResponse(catalog=catalog)
+
+
+@router.post("/resolve", operation_id="resolve")
+def resolve(request: ResolveRequest) -> ManifestResponse:
+    """Resolve a manifest to its final configuration."""
+    source = safe_build_source(request.manifest.model_dump(), {})
+    return ManifestResponse(manifest=Manifest(**source.resolved_manifest))
+
+
+@router.post("/full_resolve", operation_id="fullResolve")
+def full_resolve(request: FullResolveRequest) -> ManifestResponse:
+    """
+    Fully resolve a manifest including dynamic streams.
+
+    Generates dynamic streams up to the specified limit and includes
+    them in the resolved manifest.
+    """
+    source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
+    manifest = {**source.resolved_manifest}
+    streams = manifest.get("streams", [])
+    for stream in streams:
+        stream["dynamic_stream_name"] = None
+
+    mapped_streams: Dict[str, List[Dict[str, Any]]] = {}
+    for stream in source.dynamic_streams:
+        generated_streams = mapped_streams.setdefault(stream["dynamic_stream_name"], [])
+
+        if len(generated_streams) < request.stream_limit:
+            generated_streams += [stream]
+
+    for generated_streams_list in mapped_streams.values():
+        streams.extend(generated_streams_list)
+
+    manifest["streams"] = streams
+    return ManifestResponse(manifest=Manifest(**manifest))
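The new router can be exercised end to end with FastAPI's test client. A minimal sketch, using only the module paths added in this diff: the JWT guard is overridden for local use, and the empty manifest/config payloads are placeholders that a real call would replace with a valid declarative manifest and connector config (an empty manifest simply comes back as a 400 from safe_build_source).

```python
# A minimal sketch of calling the new /manifest router locally. Module paths
# are the ones added in this diff; the payloads are placeholders only.
from fastapi import FastAPI
from fastapi.testclient import TestClient

from airbyte_cdk.manifest_server.auth import verify_jwt_token
from airbyte_cdk.manifest_server.routers.manifest import router

app = FastAPI()
app.include_router(router)
# Every /manifest route is guarded by Depends(verify_jwt_token); bypass it here.
app.dependency_overrides[verify_jwt_token] = lambda: None

client = TestClient(app)
manifest: dict = {}  # placeholder: a declarative source manifest goes here
config: dict = {}    # placeholder: the connector config to validate
response = client.post("/manifest/check", json={"manifest": manifest, "config": config})
print(response.status_code, response.json())
```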
airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py

@@ -2,6 +2,7 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 import logging
+import os
 from typing import Dict, Iterable, List, Optional, Set
 
 from airbyte_cdk.exception_handler import generate_failed_streams_error_message
@@ -95,11 +96,14 @@ class ConcurrentReadProcessor:
         """
         stream_name = partition.stream_name()
         self._streams_to_running_partitions[stream_name].add(partition)
+        cursor = self._stream_name_to_instance[stream_name].cursor
         if self._slice_logger.should_log_slice_message(self._logger):
             self._message_repository.emit_message(
                 self._slice_logger.create_slice_log_message(partition.to_slice())
             )
-        self._thread_pool_manager.submit(self._partition_reader.process_partition, partition)
+        self._thread_pool_manager.submit(
+            self._partition_reader.process_partition, partition, cursor
+        )
 
     def on_partition_complete_sentinel(
         self, sentinel: PartitionCompleteSentinel
@@ -112,26 +116,16 @@
         """
         partition = sentinel.partition
 
-        try:
-            if sentinel.is_successful:
-                stream = self._stream_name_to_instance[partition.stream_name()]
-                stream.cursor.close_partition(partition)
-        except Exception as exception:
-            self._flag_exception(partition.stream_name(), exception)
-            yield AirbyteTracedException.from_exception(
-                exception, stream_descriptor=StreamDescriptor(name=partition.stream_name())
-            ).as_sanitized_airbyte_message()
-        finally:
-            partitions_running = self._streams_to_running_partitions[partition.stream_name()]
-            if partition in partitions_running:
-                partitions_running.remove(partition)
-                # If all partitions were generated and this was the last one, the stream is done
-                if (
-                    partition.stream_name() not in self._streams_currently_generating_partitions
-                    and len(partitions_running) == 0
-                ):
-                    yield from self._on_stream_is_done(partition.stream_name())
-            yield from self._message_repository.consume_queue()
+        partitions_running = self._streams_to_running_partitions[partition.stream_name()]
+        if partition in partitions_running:
+            partitions_running.remove(partition)
+            # If all partitions were generated and this was the last one, the stream is done
+            if (
+                partition.stream_name() not in self._streams_currently_generating_partitions
+                and len(partitions_running) == 0
+            ):
+                yield from self._on_stream_is_done(partition.stream_name())
+        yield from self._message_repository.consume_queue()
 
     def on_record(self, record: Record) -> Iterable[AirbyteMessage]:
         """
@@ -160,7 +154,6 @@
                 stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING
             )
         self._record_counter[stream.name] += 1
-        stream.cursor.observe(record)
         yield message
         yield from self._message_repository.consume_queue()
 
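The net effect of these hunks is that cursor bookkeeping moves off the main thread: `on_record` no longer calls `stream.cursor.observe(record)`, the close logic leaves `on_partition_complete_sentinel`, and the stream's cursor is instead handed to `PartitionReader.process_partition` alongside the partition. A schematic sketch of that division of labor, with illustrative names rather than the CDK's exact signatures:

```python
# Illustrative only: a simplified worker-side loop showing where cursor calls
# now happen. The real implementation is PartitionReader.process_partition in
# airbyte_cdk/sources/streams/concurrent/partition_reader.py; these names and
# signatures are schematic, not the CDK API.
from queue import Queue


def process_partition(queue: Queue, partition, cursor) -> None:
    """Read one partition on a worker thread; the cursor observes each record
    and closes the partition as part of the same unit of work."""
    try:
        for record in partition.read():
            cursor.observe(record)            # previously done in on_record()
            queue.put(record)
        cursor.close_partition(partition)     # previously done on the sentinel
        queue.put(("partition-complete", partition))
    except Exception as exception:
        queue.put(exception)                  # surfaced to the main thread
```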
airbyte_cdk/sources/concurrent_source/concurrent_source.py

@@ -1,10 +1,11 @@
 #
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
+
 import concurrent
 import logging
 from queue import Queue
-from typing import Iterable, Iterator, List
+from typing import Iterable, Iterator, List, Optional
 
 from airbyte_cdk.models import AirbyteMessage
 from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor
@@ -16,7 +17,7 @@ from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
 from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
 from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
-from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
+from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionLogger, PartitionReader
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.types import (
     PartitionCompleteSentinel,
@@ -43,6 +44,7 @@ class ConcurrentSource:
         logger: logging.Logger,
         slice_logger: SliceLogger,
         message_repository: MessageRepository,
+        queue: Optional[Queue[QueueItem]] = None,
         timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
     ) -> "ConcurrentSource":
         is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1
@@ -59,12 +61,13 @@
             logger,
         )
         return ConcurrentSource(
-            threadpool,
-            logger,
-            slice_logger,
-            message_repository,
-            initial_number_of_partitions_to_generate,
-            timeout_seconds,
+            threadpool=threadpool,
+            logger=logger,
+            slice_logger=slice_logger,
+            queue=queue,
+            message_repository=message_repository,
+            initial_number_partitions_to_generate=initial_number_of_partitions_to_generate,
+            timeout_seconds=timeout_seconds,
         )
 
     def __init__(
@@ -72,6 +75,7 @@
         threadpool: ThreadPoolManager,
         logger: logging.Logger,
         slice_logger: SliceLogger = DebugSliceLogger(),
+        queue: Optional[Queue[QueueItem]] = None,
        message_repository: MessageRepository = InMemoryMessageRepository(),
         initial_number_partitions_to_generate: int = 1,
         timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
@@ -91,25 +95,28 @@
         self._initial_number_partitions_to_generate = initial_number_partitions_to_generate
         self._timeout_seconds = timeout_seconds
 
+        # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less
+        # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
+        # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
+        # information and might even need to be configurable depending on the source
+        self._queue = queue or Queue(maxsize=10_000)
+
     def read(
         self,
         streams: List[AbstractStream],
     ) -> Iterator[AirbyteMessage]:
         self._logger.info("Starting syncing")
-
-        # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less
-        # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
-        # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
-        # information and might even need to be configurable depending on the source
-        queue: Queue[QueueItem] = Queue(maxsize=10_000)
         concurrent_stream_processor = ConcurrentReadProcessor(
             streams,
-            PartitionEnqueuer(queue, self._threadpool),
+            PartitionEnqueuer(self._queue, self._threadpool),
             self._threadpool,
             self._logger,
             self._slice_logger,
             self._message_repository,
-            PartitionReader(queue),
+            PartitionReader(
+                self._queue,
+                PartitionLogger(self._slice_logger, self._logger, self._message_repository),
+            ),
         )
 
         # Enqueue initial partition generation tasks
@@ -117,7 +124,7 @@
 
         # Read from the queue until all partitions were generated and read
         yield from self._consume_from_queue(
-            queue,
+            self._queue,
             concurrent_stream_processor,
         )
         self._threadpool.check_for_errors_and_shutdown()
@@ -141,7 +148,10 @@
                 airbyte_message_or_record_or_exception,
                 concurrent_stream_processor,
             )
 
+            # In the event that a partition raises an exception, anything remaining in
+            # the queue will be missed because is_done() can raise an exception and exit
+            # out of this loop before remaining items are consumed
             if queue.empty() and concurrent_stream_processor.is_done():
                 # all partitions were generated and processed. we're done here
                 break
@@ -161,5 +171,7 @@
             yield from concurrent_stream_processor.on_partition_complete_sentinel(queue_item)
         elif isinstance(queue_item, Record):
             yield from concurrent_stream_processor.on_record(queue_item)
+        elif isinstance(queue_item, AirbyteMessage):
+            yield queue_item
         else:
             raise ValueError(f"Unknown queue item type: {type(queue_item)}")
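The queue is now created once (or injected) and kept on `self`, so other components, notably the new `ConcurrentMessageRepository`, can publish `AirbyteMessage` items into the same bounded queue the read loop drains, which is why the item handler gains an `AirbyteMessage` branch. A generic sketch of the shared bounded-queue pattern, not the CDK API:

```python
# A generic sketch of the pattern this change enables: several producers share
# one bounded queue with the consuming main thread, so messages emitted from
# worker threads are yielded in arrival order instead of being buffered.
import threading
from queue import Queue

SENTINEL = object()


def producer(queue: Queue, items: range) -> None:
    for item in items:
        queue.put(item)      # blocks when the queue is full (maxsize)
    queue.put(SENTINEL)      # signals this producer is done


queue: Queue = Queue(maxsize=10_000)
workers = [threading.Thread(target=producer, args=(queue, range(3))) for _ in range(2)]
for worker in workers:
    worker.start()

finished = 0
while finished < len(workers):
    item = queue.get()
    if item is SENTINEL:
        finished += 1
    else:
        print("consumed", item)
```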
airbyte_cdk/sources/declarative/concurrent_declarative_source.py

@@ -3,7 +3,22 @@
 #
 
 import logging
-from typing import Any, Generic, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union
+from dataclasses import dataclass, field
+from queue import Queue
+from typing import (
+    Any,
+    ClassVar,
+    Generic,
+    Iterator,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from airbyte_protocol_dataclasses.models import Level
 
 from airbyte_cdk.models import (
     AirbyteCatalog,
@@ -43,6 +58,8 @@ from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
     StreamSlicerPartitionGenerator,
 )
 from airbyte_cdk.sources.declarative.types import ConnectionDefinition
+from airbyte_cdk.sources.message.concurrent_repository import ConcurrentMessageRepository
+from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository
 from airbyte_cdk.sources.source import TState
 from airbyte_cdk.sources.streams import Stream
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
@@ -50,6 +67,22 @@ from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
 from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, FinalStateCursor
 from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
 from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
+from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
+
+
+@dataclass
+class TestLimits:
+    __test__: ClassVar[bool] = False  # Tell Pytest this is not a Pytest class, despite its name
+
+    DEFAULT_MAX_PAGES_PER_SLICE: ClassVar[int] = 5
+    DEFAULT_MAX_SLICES: ClassVar[int] = 5
+    DEFAULT_MAX_RECORDS: ClassVar[int] = 100
+    DEFAULT_MAX_STREAMS: ClassVar[int] = 100
+
+    max_records: int = field(default=DEFAULT_MAX_RECORDS)
+    max_pages_per_slice: int = field(default=DEFAULT_MAX_PAGES_PER_SLICE)
+    max_slices: int = field(default=DEFAULT_MAX_SLICES)
+    max_streams: int = field(default=DEFAULT_MAX_STREAMS)
 
 
 class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
@@ -65,7 +98,9 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         source_config: ConnectionDefinition,
         debug: bool = False,
         emit_connector_builder_messages: bool = False,
-        component_factory: Optional[ModelToComponentFactory] = None,
+        migrate_manifest: bool = False,
+        normalize_manifest: bool = False,
+        limits: Optional[TestLimits] = None,
         config_path: Optional[str] = None,
         **kwargs: Any,
     ) -> None:
@@ -73,21 +108,39 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         # no longer needs to store the original incoming state. But maybe there's an edge case?
         self._connector_state_manager = ConnectorStateManager(state=state)  # type: ignore # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later
 
+        # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less
+        # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
+        # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
+        # information and might even need to be configurable depending on the source
+        queue: Queue[QueueItem] = Queue(maxsize=10_000)
+        message_repository = InMemoryMessageRepository(
+            Level.DEBUG if emit_connector_builder_messages else Level.INFO
+        )
+
         # To reduce the complexity of the concurrent framework, we are not enabling RFR with synthetic
         # cursors. We do this by no longer automatically instantiating RFR cursors when converting
         # the declarative models into runtime components. Concurrent sources will continue to checkpoint
         # incremental streams running in full refresh.
-        component_factory = component_factory or ModelToComponentFactory(
+        component_factory = ModelToComponentFactory(
             emit_connector_builder_messages=emit_connector_builder_messages,
+            message_repository=ConcurrentMessageRepository(queue, message_repository),
             connector_state_manager=self._connector_state_manager,
             max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"),
+            limit_pages_fetched_per_slice=limits.max_pages_per_slice if limits else None,
+            limit_slices_fetched=limits.max_slices if limits else None,
+            disable_retries=True if limits else False,
+            disable_cache=True if limits else False,
         )
 
+        self._limits = limits
+
         super().__init__(
             source_config=source_config,
             config=config,
             debug=debug,
             emit_connector_builder_messages=emit_connector_builder_messages,
+            migrate_manifest=migrate_manifest,
+            normalize_manifest=normalize_manifest,
             component_factory=component_factory,
             config_path=config_path,
         )
@@ -117,6 +170,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             initial_number_of_partitions_to_generate=initial_number_of_partitions_to_generate,
             logger=self.logger,
             slice_logger=self._slice_logger,
+            queue=queue,
             message_repository=self.message_repository,
         )
 
@@ -280,8 +334,14 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                         retriever=retriever,
                         message_repository=self.message_repository,
+                        max_records_limit=self._limits.max_records
+                        if self._limits
+                        else None,
                     ),
                     stream_slicer=declarative_stream.retriever.stream_slicer,
+                    slice_limit=self._limits.max_slices
+                    if self._limits
+                    else None,  # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
                 )
             else:
                 if (
@@ -311,8 +371,12 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                         retriever=retriever,
                         message_repository=self.message_repository,
+                        max_records_limit=self._limits.max_records
+                        if self._limits
+                        else None,
                     ),
                     stream_slicer=cursor,
+                    slice_limit=self._limits.max_slices if self._limits else None,
                 )
 
                 concurrent_streams.append(
@@ -341,8 +405,12 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                         retriever=declarative_stream.retriever,
                         message_repository=self.message_repository,
+                        max_records_limit=self._limits.max_records if self._limits else None,
                     ),
                     declarative_stream.retriever.stream_slicer,
+                    slice_limit=self._limits.max_slices
+                    if self._limits
+                    else None,  # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
                 )
 
                 final_state_cursor = FinalStateCursor(
@@ -401,8 +469,10 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                         retriever=retriever,
                         message_repository=self.message_repository,
+                        max_records_limit=self._limits.max_records if self._limits else None,
                     ),
                     perpartition_cursor,
+                    slice_limit=self._limits.max_slices if self._limits else None,
                 )
 
                 concurrent_streams.append(
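`TestLimits` is the single switch that puts the source into test-read mode: it caps pages, slices, and records, and the factory above disables retries and caching whenever it is set. A sketch of constructing it; the `catalog`, `config`, and `state` arguments are assumed from the pre-existing constructor, which these hunks do not show, and all payload values are placeholders.

```python
# A sketch only: TestLimits fields are exactly those added in this diff; the
# catalog/config/state parameters are assumed from the prior release's
# constructor and the payloads are placeholders, not a working manifest.
from airbyte_cdk.sources.declarative.concurrent_declarative_source import (
    ConcurrentDeclarativeSource,
    TestLimits,
)

limits = TestLimits(max_records=50, max_slices=2)  # other fields keep their defaults

source = ConcurrentDeclarativeSource(
    catalog=None,        # assumed parameter, as in the prior release
    config={},           # placeholder connector config
    state=None,          # placeholder state
    source_config={},    # placeholder declarative manifest
    limits=limits,       # caps the read and disables retries/caching
)
```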
airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

@@ -631,6 +631,10 @@ SCHEMA_TRANSFORMER_TYPE_MAPPING = {
     SchemaNormalizationModel.Default: TransformConfig.DefaultSchemaNormalization,
 }
 
+# Ideally this should use the value defined in ConcurrentDeclarativeSource, but
+# this would be a circular import
+MAX_SLICES = 5
+
 
 class ModelToComponentFactory:
     EPOCH_DATETIME_FORMAT = "%s"
airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py

@@ -1,9 +1,12 @@
-# Copyright (c)
+# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
 
-from typing import Any, Iterable, Mapping, Optional
+from typing import Any, Iterable, Mapping, Optional, cast
 
 from airbyte_cdk.sources.declarative.retrievers import Retriever
 from airbyte_cdk.sources.declarative.schema import SchemaLoader
+from airbyte_cdk.sources.declarative.stream_slicers.stream_slicer_test_read_decorator import (
+    StreamSlicerTestReadDecorator,
+)
 from airbyte_cdk.sources.message import MessageRepository
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.partition_generator import PartitionGenerator
@@ -11,6 +14,11 @@ from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
 from airbyte_cdk.sources.types import Record, StreamSlice
 from airbyte_cdk.utils.slice_hasher import SliceHasher
 
+# For Connector Builder test read operations, we track the total number of records
+# read for the stream at the global level so that we can stop reading early if we
+# exceed the record limit
+total_record_counter = 0
+
 
 class SchemaLoaderCachingDecorator(SchemaLoader):
     def __init__(self, schema_loader: SchemaLoader):
@@ -31,6 +39,7 @@ class DeclarativePartitionFactory:
         schema_loader: SchemaLoader,
         retriever: Retriever,
         message_repository: MessageRepository,
+        max_records_limit: Optional[int] = None,
     ) -> None:
         """
         The DeclarativePartitionFactory takes a retriever_factory and not a retriever directly. The reason is that our components are not
@@ -41,6 +50,7 @@ class DeclarativePartitionFactory:
         self._schema_loader = SchemaLoaderCachingDecorator(schema_loader)
         self._retriever = retriever
         self._message_repository = message_repository
+        self._max_records_limit = max_records_limit
 
     def create(self, stream_slice: StreamSlice) -> Partition:
         return DeclarativePartition(
@@ -48,6 +58,7 @@ class DeclarativePartitionFactory:
             schema_loader=self._schema_loader,
             retriever=self._retriever,
             message_repository=self._message_repository,
+            max_records_limit=self._max_records_limit,
             stream_slice=stream_slice,
         )
 
@@ -59,19 +70,29 @@ class DeclarativePartition(Partition):
         schema_loader: SchemaLoader,
         retriever: Retriever,
         message_repository: MessageRepository,
+        max_records_limit: Optional[int],
         stream_slice: StreamSlice,
     ):
         self._stream_name = stream_name
         self._schema_loader = schema_loader
         self._retriever = retriever
         self._message_repository = message_repository
+        self._max_records_limit = max_records_limit
         self._stream_slice = stream_slice
         self._hash = SliceHasher.hash(self._stream_name, self._stream_slice)
 
     def read(self) -> Iterable[Record]:
+        if self._max_records_limit is not None:
+            global total_record_counter
+            if total_record_counter >= self._max_records_limit:
+                return
         for stream_data in self._retriever.read_records(
             self._schema_loader.get_json_schema(), self._stream_slice
         ):
+            if self._max_records_limit is not None:
+                if total_record_counter >= self._max_records_limit:
+                    break
+
             if isinstance(stream_data, Mapping):
                 record = (
                     stream_data
@@ -86,6 +107,9 @@ class DeclarativePartition(Partition):
             else:
                 self._message_repository.emit_message(stream_data)
 
+            if self._max_records_limit is not None:
+                total_record_counter += 1
+
     def to_slice(self) -> Optional[Mapping[str, Any]]:
         return self._stream_slice
 
@@ -98,10 +122,24 @@
 
 class StreamSlicerPartitionGenerator(PartitionGenerator):
     def __init__(
-        self, partition_factory: DeclarativePartitionFactory, stream_slicer: StreamSlicer
+        self,
+        partition_factory: DeclarativePartitionFactory,
+        stream_slicer: StreamSlicer,
+        slice_limit: Optional[int] = None,
+        max_records_limit: Optional[int] = None,
     ) -> None:
         self._partition_factory = partition_factory
-        self._stream_slicer = stream_slicer
+
+        if slice_limit:
+            self._stream_slicer = cast(
+                StreamSlicer,
+                StreamSlicerTestReadDecorator(
+                    wrapped_slicer=stream_slicer,
+                    maximum_number_of_slices=slice_limit,
+                ),
+            )
+        else:
+            self._stream_slicer = stream_slicer
 
     def generate(self) -> Iterable[Partition]:
         for stream_slice in self._stream_slicer.stream_slices():
airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py

@@ -4,10 +4,10 @@
 
 from dataclasses import dataclass
 from itertools import islice
-from typing import Any, Iterable
+from typing import Any, Iterable
 
 from airbyte_cdk.sources.streams.concurrent.partitions.stream_slicer import StreamSlicer
-from airbyte_cdk.sources.types import StreamSlice
+from airbyte_cdk.sources.types import StreamSlice
 
 
 @dataclass
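For reference, the decorator defined in this file caps slices by running the wrapped slicer's iterator through `itertools.islice`. A self-contained illustration of the same pattern, not the CDK class itself:

```python
# A minimal illustration of enforcing a slice limit by wrapping a slicer, in
# the spirit of StreamSlicerTestReadDecorator: islice stops the underlying
# generator after N slices without materializing the rest.
from dataclasses import dataclass
from itertools import islice
from typing import Any, Iterable


@dataclass
class CappedSlicer:
    wrapped_slicer: Any           # anything exposing a stream_slices() iterator
    maximum_number_of_slices: int

    def stream_slices(self) -> Iterable[Any]:
        return islice(self.wrapped_slicer.stream_slices(), self.maximum_number_of_slices)


class RangeSlicer:
    def stream_slices(self) -> Iterable[Any]:
        yield from ({"page": i} for i in range(100))


capped = CappedSlicer(wrapped_slicer=RangeSlicer(), maximum_number_of_slices=5)
print(list(capped.stream_slices()))  # only the first 5 slices are produced
```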