airbyte-cdk 6.60.15__py3-none-any.whl → 6.60.16.post40.dev17219503797__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +32 -36
- airbyte_cdk/connector_builder/main.py +3 -3
- airbyte_cdk/connector_builder/test_reader/helpers.py +24 -2
- airbyte_cdk/connector_builder/test_reader/message_grouper.py +1 -1
- airbyte_cdk/manifest_server/Dockerfile +45 -0
- airbyte_cdk/manifest_server/README.md +142 -0
- airbyte_cdk/manifest_server/__init__.py +3 -0
- airbyte_cdk/manifest_server/api_models/__init__.py +41 -0
- airbyte_cdk/manifest_server/api_models/capabilities.py +7 -0
- airbyte_cdk/manifest_server/api_models/dicts.py +17 -0
- airbyte_cdk/manifest_server/api_models/manifest.py +73 -0
- airbyte_cdk/manifest_server/api_models/stream.py +76 -0
- airbyte_cdk/manifest_server/app.py +17 -0
- airbyte_cdk/manifest_server/auth.py +43 -0
- airbyte_cdk/manifest_server/cli/__init__.py +5 -0
- airbyte_cdk/manifest_server/cli/_common.py +28 -0
- airbyte_cdk/manifest_server/cli/_info.py +30 -0
- airbyte_cdk/manifest_server/cli/_openapi.py +43 -0
- airbyte_cdk/manifest_server/cli/_start.py +38 -0
- airbyte_cdk/manifest_server/cli/run.py +59 -0
- airbyte_cdk/manifest_server/command_processor/__init__.py +0 -0
- airbyte_cdk/manifest_server/command_processor/processor.py +151 -0
- airbyte_cdk/manifest_server/command_processor/utils.py +76 -0
- airbyte_cdk/manifest_server/main.py +24 -0
- airbyte_cdk/manifest_server/openapi.yaml +641 -0
- airbyte_cdk/manifest_server/routers/__init__.py +0 -0
- airbyte_cdk/manifest_server/routers/capabilities.py +25 -0
- airbyte_cdk/manifest_server/routers/health.py +13 -0
- airbyte_cdk/manifest_server/routers/manifest.py +137 -0
- airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py +15 -22
- airbyte_cdk/sources/concurrent_source/concurrent_source.py +30 -18
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +73 -3
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +66 -39
- airbyte_cdk/sources/declarative/stream_slicers/declarative_partition_generator.py +42 -4
- airbyte_cdk/sources/declarative/stream_slicers/stream_slicer_test_read_decorator.py +2 -2
- airbyte_cdk/sources/message/concurrent_repository.py +47 -0
- airbyte_cdk/sources/streams/concurrent/cursor.py +23 -7
- airbyte_cdk/sources/streams/concurrent/partition_reader.py +46 -5
- airbyte_cdk/sources/streams/concurrent/partitions/types.py +7 -1
- airbyte_cdk/sources/streams/http/http_client.py +4 -1
- airbyte_cdk/sources/utils/slice_logger.py +4 -0
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/METADATA +4 -1
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/RECORD +47 -21
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/entry_points.txt +1 -0
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.60.15.dist-info → airbyte_cdk-6.60.16.post40.dev17219503797.dist-info}/WHEEL +0 -0
airbyte_cdk/manifest_server/routers/manifest.py (new file)
@@ -0,0 +1,137 @@
+import hashlib
+from dataclasses import asdict
+from typing import Any, Dict, List, Mapping, Optional
+
+import jsonschema
+from fastapi import APIRouter, Depends, HTTPException
+
+from airbyte_cdk.manifest_server.api_models.manifest import (
+    CheckRequest,
+    CheckResponse,
+    DiscoverRequest,
+    DiscoverResponse,
+)
+from airbyte_cdk.models import AirbyteStateMessageSerializer
+from airbyte_cdk.sources.declarative.manifest_declarative_source import ManifestDeclarativeSource
+from airbyte_cdk.sources.declarative.parsers.custom_code_compiler import (
+    INJECTED_COMPONENTS_PY,
+    INJECTED_COMPONENTS_PY_CHECKSUMS,
+)
+
+from ..api_models import (
+    FullResolveRequest,
+    Manifest,
+    ManifestResponse,
+    ResolveRequest,
+    StreamRead,
+    StreamTestReadRequest,
+)
+from ..auth import verify_jwt_token
+from ..command_processor.processor import ManifestCommandProcessor
+from ..command_processor.utils import build_catalog, build_source
+
+
+def safe_build_source(
+    manifest_dict: Mapping[str, Any],
+    config_dict: Mapping[str, Any],
+    page_limit: Optional[int] = None,
+    slice_limit: Optional[int] = None,
+) -> ManifestDeclarativeSource:
+    """Wrapper around build_source that converts ValidationError to HTTPException."""
+    try:
+        return build_source(manifest_dict, config_dict, page_limit, slice_limit)
+    except jsonschema.exceptions.ValidationError as e:
+        raise HTTPException(status_code=400, detail=f"Invalid manifest: {e.message}")
+
+
+router = APIRouter(
+    prefix="/manifest",
+    tags=["manifest"],
+    dependencies=[Depends(verify_jwt_token)],
+)
+
+
+@router.post("/test_read", operation_id="testRead")
+def test_read(request: StreamTestReadRequest) -> StreamRead:
+    """
+    Test reading from a specific stream in the manifest.
+    """
+    config_dict = request.config.model_dump()
+
+    source = safe_build_source(
+        request.manifest.model_dump(), config_dict, request.page_limit, request.slice_limit
+    )
+    catalog = build_catalog(request.stream_name)
+    state = [AirbyteStateMessageSerializer.load(state) for state in request.state]
+
+    if request.custom_components_code:
+        config_dict[INJECTED_COMPONENTS_PY] = request.custom_components_code
+        config_dict[INJECTED_COMPONENTS_PY_CHECKSUMS] = {
+            "md5": hashlib.md5(request.custom_components_code.encode()).hexdigest()
+        }
+
+    runner = ManifestCommandProcessor(source)
+    cdk_result = runner.test_read(
+        config_dict,
+        catalog,
+        state,
+        request.record_limit,
+        request.page_limit,
+        request.slice_limit,
+    )
+    return StreamRead.model_validate(asdict(cdk_result))
+
+
+@router.post("/check", operation_id="check")
+def check(request: CheckRequest) -> CheckResponse:
+    """Check configuration against a manifest"""
+    source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
+    runner = ManifestCommandProcessor(source)
+    success, message = runner.check_connection(request.config.model_dump())
+    return CheckResponse(success=success, message=message)
+
+
+@router.post("/discover", operation_id="discover")
+def discover(request: DiscoverRequest) -> DiscoverResponse:
+    """Discover streams from a manifest"""
+    source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
+    runner = ManifestCommandProcessor(source)
+    catalog = runner.discover(request.config.model_dump())
+    if catalog is None:
+        raise HTTPException(status_code=422, detail="Connector did not return a discovered catalog")
+    return DiscoverResponse(catalog=catalog)
+
+
+@router.post("/resolve", operation_id="resolve")
+def resolve(request: ResolveRequest) -> ManifestResponse:
+    """Resolve a manifest to its final configuration."""
+    source = safe_build_source(request.manifest.model_dump(), {})
+    return ManifestResponse(manifest=Manifest(**source.resolved_manifest))
+
+
+@router.post("/full_resolve", operation_id="fullResolve")
+def full_resolve(request: FullResolveRequest) -> ManifestResponse:
+    """
+    Fully resolve a manifest including dynamic streams.
+
+    Generates dynamic streams up to the specified limit and includes
+    them in the resolved manifest.
+    """
+    source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
+    manifest = {**source.resolved_manifest}
+    streams = manifest.get("streams", [])
+    for stream in streams:
+        stream["dynamic_stream_name"] = None
+
+    mapped_streams: Dict[str, List[Dict[str, Any]]] = {}
+    for stream in source.dynamic_streams:
+        generated_streams = mapped_streams.setdefault(stream["dynamic_stream_name"], [])
+
+        if len(generated_streams) < request.stream_limit:
+            generated_streams += [stream]
+
+    for generated_streams_list in mapped_streams.values():
+        streams.extend(generated_streams_list)
+
+    manifest["streams"] = streams
+    return ManifestResponse(manifest=Manifest(**manifest))
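For orientation, the router above can be exercised over plain HTTP once the manifest server is running. The sketch below is illustrative and not part of the diff: the host, port, JWT, and payload bodies are assumptions; only the /manifest/check route, the verify_jwt_token dependency, and the CheckResponse shape come from the code above.

    import requests

    # Hypothetical client call against the /manifest/check endpoint defined above.
    # Host, port, token, and payload contents are placeholders, not values from this diff.
    payload = {
        "manifest": {},  # a full declarative manifest body goes here
        "config": {},    # the connector config to check
    }
    response = requests.post(
        "http://localhost:8000/manifest/check",
        json=payload,
        headers={"Authorization": "Bearer <jwt>"},  # validated by verify_jwt_token
        timeout=30,
    )
    response.raise_for_status()
    print(response.json())  # CheckResponse, e.g. {"success": true, "message": null}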
airbyte_cdk/sources/concurrent_source/concurrent_read_processor.py
@@ -2,6 +2,7 @@
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
 import logging
+import os
 from typing import Dict, Iterable, List, Optional, Set
 
 from airbyte_cdk.exception_handler import generate_failed_streams_error_message
@@ -95,11 +96,14 @@ class ConcurrentReadProcessor:
         """
         stream_name = partition.stream_name()
         self._streams_to_running_partitions[stream_name].add(partition)
+        cursor = self._stream_name_to_instance[stream_name].cursor
         if self._slice_logger.should_log_slice_message(self._logger):
             self._message_repository.emit_message(
                 self._slice_logger.create_slice_log_message(partition.to_slice())
             )
-        self._thread_pool_manager.submit(self._partition_reader.process_partition, partition)
+        self._thread_pool_manager.submit(
+            self._partition_reader.process_partition, partition, cursor
+        )
 
     def on_partition_complete_sentinel(
         self, sentinel: PartitionCompleteSentinel
@@ -112,26 +116,16 @@ class ConcurrentReadProcessor:
         """
         partition = sentinel.partition
 
-        try:
-            if sentinel.is_successful:
-                stream = self._stream_name_to_instance[partition.stream_name()]
-                stream.cursor.close_partition(partition)
-        except Exception as exception:
-            self._flag_exception(partition.stream_name(), exception)
-            yield AirbyteTracedException.from_exception(
-                exception, stream_descriptor=StreamDescriptor(name=partition.stream_name())
-            ).as_sanitized_airbyte_message()
-        finally:
-            partitions_running = self._streams_to_running_partitions[partition.stream_name()]
-            if partition in partitions_running:
-                partitions_running.remove(partition)
-                # If all partitions were generated and this was the last one, the stream is done
-                if (
-                    partition.stream_name() not in self._streams_currently_generating_partitions
-                    and len(partitions_running) == 0
-                ):
-                    yield from self._on_stream_is_done(partition.stream_name())
-            yield from self._message_repository.consume_queue()
+        partitions_running = self._streams_to_running_partitions[partition.stream_name()]
+        if partition in partitions_running:
+            partitions_running.remove(partition)
+            # If all partitions were generated and this was the last one, the stream is done
+            if (
+                partition.stream_name() not in self._streams_currently_generating_partitions
+                and len(partitions_running) == 0
+            ):
+                yield from self._on_stream_is_done(partition.stream_name())
+        yield from self._message_repository.consume_queue()
 
     def on_record(self, record: Record) -> Iterable[AirbyteMessage]:
         """
@@ -160,7 +154,6 @@ class ConcurrentReadProcessor:
                     stream.as_airbyte_stream(), AirbyteStreamStatus.RUNNING
                 )
             self._record_counter[stream.name] += 1
-            stream.cursor.observe(record)
         yield message
         yield from self._message_repository.consume_queue()
 
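Read together, these concurrent_read_processor.py hunks move cursor bookkeeping off the main thread: the stream's cursor is looked up when a partition starts and handed to PartitionReader.process_partition, while the per-record stream.cursor.observe(record) call is dropped from on_record. Below is a minimal, hedged sketch of what the worker-side counterpart can then look like; the released implementation lives in partition_reader.py (+46 -5 in the file list above) and is not reproduced here, so treat every detail as an assumption.

    # Hedged sketch only; the real code is in
    # airbyte_cdk/sources/streams/concurrent/partition_reader.py and may differ.
    from queue import Queue

    from airbyte_cdk.sources.streams.concurrent.partitions.types import (
        PartitionCompleteSentinel,
    )

    class PartitionReaderSketch:  # hypothetical stand-in for PartitionReader
        def __init__(self, queue: Queue) -> None:
            self._queue = queue

        def process_partition(self, partition, cursor) -> None:
            # Runs on a worker thread: records are observed on the cursor here
            # instead of on the main thread in ConcurrentReadProcessor.on_record.
            try:
                for record in partition.read():
                    self._queue.put(record)
                    cursor.observe(record)
                cursor.close_partition(partition)
                self._queue.put(PartitionCompleteSentinel(partition))
            except Exception as exception:
                # The main loop's queue consumer surfaces exceptions to the caller.
                self._queue.put(exception)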
airbyte_cdk/sources/concurrent_source/concurrent_source.py
@@ -1,10 +1,11 @@
 #
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
+
 import concurrent
 import logging
 from queue import Queue
-from typing import Iterable, Iterator, List
+from typing import Iterable, Iterator, List, Optional
 
 from airbyte_cdk.models import AirbyteMessage
 from airbyte_cdk.sources.concurrent_source.concurrent_read_processor import ConcurrentReadProcessor
@@ -16,7 +17,7 @@ from airbyte_cdk.sources.concurrent_source.thread_pool_manager import ThreadPoolManager
 from airbyte_cdk.sources.message import InMemoryMessageRepository, MessageRepository
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
 from airbyte_cdk.sources.streams.concurrent.partition_enqueuer import PartitionEnqueuer
-from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionReader
+from airbyte_cdk.sources.streams.concurrent.partition_reader import PartitionLogger, PartitionReader
 from airbyte_cdk.sources.streams.concurrent.partitions.partition import Partition
 from airbyte_cdk.sources.streams.concurrent.partitions.types import (
     PartitionCompleteSentinel,
@@ -43,6 +44,7 @@ class ConcurrentSource:
         logger: logging.Logger,
         slice_logger: SliceLogger,
         message_repository: MessageRepository,
+        queue: Optional[Queue[QueueItem]] = None,
         timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
     ) -> "ConcurrentSource":
         is_single_threaded = initial_number_of_partitions_to_generate == 1 and num_workers == 1
@@ -59,12 +61,13 @@ class ConcurrentSource:
             logger,
         )
         return ConcurrentSource(
-            threadpool,
-            logger,
-            slice_logger,
-            message_repository,
-            initial_number_of_partitions_to_generate,
-            timeout_seconds,
+            threadpool=threadpool,
+            logger=logger,
+            slice_logger=slice_logger,
+            queue=queue,
+            message_repository=message_repository,
+            initial_number_partitions_to_generate=initial_number_of_partitions_to_generate,
+            timeout_seconds=timeout_seconds,
         )
 
     def __init__(
@@ -72,6 +75,7 @@ class ConcurrentSource:
         threadpool: ThreadPoolManager,
         logger: logging.Logger,
         slice_logger: SliceLogger = DebugSliceLogger(),
+        queue: Optional[Queue[QueueItem]] = None,
         message_repository: MessageRepository = InMemoryMessageRepository(),
         initial_number_partitions_to_generate: int = 1,
         timeout_seconds: int = DEFAULT_TIMEOUT_SECONDS,
@@ -91,25 +95,28 @@ class ConcurrentSource:
         self._initial_number_partitions_to_generate = initial_number_partitions_to_generate
         self._timeout_seconds = timeout_seconds
 
+        # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less
+        # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
+        # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
+        # information and might even need to be configurable depending on the source
+        self._queue = queue or Queue(maxsize=10_000)
+
     def read(
         self,
         streams: List[AbstractStream],
     ) -> Iterator[AirbyteMessage]:
         self._logger.info("Starting syncing")
-
-        # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less
-        # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
-        # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
-        # information and might even need to be configurable depending on the source
-        queue: Queue[QueueItem] = Queue(maxsize=10_000)
         concurrent_stream_processor = ConcurrentReadProcessor(
             streams,
-            PartitionEnqueuer(queue, self._threadpool),
+            PartitionEnqueuer(self._queue, self._threadpool),
             self._threadpool,
             self._logger,
             self._slice_logger,
             self._message_repository,
-            PartitionReader(queue),
+            PartitionReader(
+                self._queue,
+                PartitionLogger(self._slice_logger, self._logger, self._message_repository),
+            ),
         )
 
         # Enqueue initial partition generation tasks
@@ -117,7 +124,7 @@ class ConcurrentSource:
 
         # Read from the queue until all partitions were generated and read
         yield from self._consume_from_queue(
-            queue,
+            self._queue,
            concurrent_stream_processor,
        )
        self._threadpool.check_for_errors_and_shutdown()
@@ -141,7 +148,10 @@ class ConcurrentSource:
                 airbyte_message_or_record_or_exception,
                 concurrent_stream_processor,
             )
-            if concurrent_stream_processor.is_done() and queue.empty():
+            # In the event that a partition raises an exception, anything remaining in
+            # the queue will be missed because is_done() can raise an exception and exit
+            # out of this loop before remaining items are consumed
+            if queue.empty() and concurrent_stream_processor.is_done():
                 # all partitions were generated and processed. we're done here
                 break
 
@@ -161,5 +171,7 @@ class ConcurrentSource:
             yield from concurrent_stream_processor.on_partition_complete_sentinel(queue_item)
         elif isinstance(queue_item, Record):
             yield from concurrent_stream_processor.on_record(queue_item)
+        elif isinstance(queue_item, AirbyteMessage):
+            yield queue_item
         else:
             raise ValueError(f"Unknown queue item type: {type(queue_item)}")
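The net effect of the concurrent_source.py hunks is that the work queue becomes injectable and long-lived instead of being created inside each read() call, and AirbyteMessage items can now travel through it directly. A hedged sketch of wiring a shared queue through ConcurrentSource.create, using only names that appear in this diff (worker counts and logger name are placeholder choices):

    import logging
    from queue import Queue

    from airbyte_cdk.sources.concurrent_source.concurrent_source import ConcurrentSource
    from airbyte_cdk.sources.message import InMemoryMessageRepository
    from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
    from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger

    # One queue shared by the source and, per the declarative-source hunks below,
    # a ConcurrentMessageRepository that emits messages through it.
    queue: Queue[QueueItem] = Queue(maxsize=10_000)

    source = ConcurrentSource.create(
        num_workers=2,
        initial_number_of_partitions_to_generate=1,
        logger=logging.getLogger("airbyte"),
        slice_logger=DebugSliceLogger(),
        message_repository=InMemoryMessageRepository(),
        queue=queue,  # new parameter; when omitted it defaults to Queue(maxsize=10_000)
    )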
airbyte_cdk/sources/declarative/concurrent_declarative_source.py
@@ -3,7 +3,22 @@
 #
 
 import logging
-from typing import Any, Generic, Iterator, List, Mapping, MutableMapping, Optional, Tuple, Union
+from dataclasses import dataclass, field
+from queue import Queue
+from typing import (
+    Any,
+    ClassVar,
+    Generic,
+    Iterator,
+    List,
+    Mapping,
+    MutableMapping,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from airbyte_protocol_dataclasses.models import Level
 
 from airbyte_cdk.models import (
     AirbyteCatalog,
@@ -43,6 +58,8 @@ from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
     StreamSlicerPartitionGenerator,
 )
 from airbyte_cdk.sources.declarative.types import ConnectionDefinition
+from airbyte_cdk.sources.message.concurrent_repository import ConcurrentMessageRepository
+from airbyte_cdk.sources.message.repository import InMemoryMessageRepository, MessageRepository
 from airbyte_cdk.sources.source import TState
 from airbyte_cdk.sources.streams import Stream
 from airbyte_cdk.sources.streams.concurrent.abstract_stream import AbstractStream
@@ -50,6 +67,22 @@ from airbyte_cdk.sources.streams.concurrent.abstract_stream_facade import AbstractStreamFacade
 from airbyte_cdk.sources.streams.concurrent.cursor import ConcurrentCursor, FinalStateCursor
 from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
 from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
+from airbyte_cdk.sources.streams.concurrent.partitions.types import QueueItem
+
+
+@dataclass
+class TestLimits:
+    __test__: ClassVar[bool] = False  # Tell Pytest this is not a Pytest class, despite its name
+
+    DEFAULT_MAX_PAGES_PER_SLICE: ClassVar[int] = 5
+    DEFAULT_MAX_SLICES: ClassVar[int] = 5
+    DEFAULT_MAX_RECORDS: ClassVar[int] = 100
+    DEFAULT_MAX_STREAMS: ClassVar[int] = 100
+
+    max_records: int = field(default=DEFAULT_MAX_RECORDS)
+    max_pages_per_slice: int = field(default=DEFAULT_MAX_PAGES_PER_SLICE)
+    max_slices: int = field(default=DEFAULT_MAX_SLICES)
+    max_streams: int = field(default=DEFAULT_MAX_STREAMS)
 
 
 class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
@@ -65,7 +98,9 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         source_config: ConnectionDefinition,
         debug: bool = False,
         emit_connector_builder_messages: bool = False,
-        component_factory: Optional[ModelToComponentFactory] = None,
+        migrate_manifest: bool = False,
+        normalize_manifest: bool = False,
+        limits: Optional[TestLimits] = None,
         config_path: Optional[str] = None,
         **kwargs: Any,
     ) -> None:
@@ -73,21 +108,39 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
         # no longer needs to store the original incoming state. But maybe there's an edge case?
         self._connector_state_manager = ConnectorStateManager(state=state)  # type: ignore # state is always in the form of List[AirbyteStateMessage]. The ConnectorStateManager should use generics, but this can be done later
 
+        # We set a maxsize to for the main thread to process record items when the queue size grows. This assumes that there are less
+        # threads generating partitions that than are max number of workers. If it weren't the case, we could have threads only generating
+        # partitions which would fill the queue. This number is arbitrarily set to 10_000 but will probably need to be changed given more
+        # information and might even need to be configurable depending on the source
+        queue: Queue[QueueItem] = Queue(maxsize=10_000)
+        message_repository = InMemoryMessageRepository(
+            Level.DEBUG if emit_connector_builder_messages else Level.INFO
+        )
+
         # To reduce the complexity of the concurrent framework, we are not enabling RFR with synthetic
         # cursors. We do this by no longer automatically instantiating RFR cursors when converting
         # the declarative models into runtime components. Concurrent sources will continue to checkpoint
         # incremental streams running in full refresh.
-        component_factory = component_factory or ModelToComponentFactory(
+        component_factory = ModelToComponentFactory(
             emit_connector_builder_messages=emit_connector_builder_messages,
+            message_repository=ConcurrentMessageRepository(queue, message_repository),
             connector_state_manager=self._connector_state_manager,
             max_concurrent_async_job_count=source_config.get("max_concurrent_async_job_count"),
+            limit_pages_fetched_per_slice=limits.max_pages_per_slice if limits else None,
+            limit_slices_fetched=limits.max_slices if limits else None,
+            disable_retries=True if limits else False,
+            disable_cache=True if limits else False,
         )
 
+        self._limits = limits
+
         super().__init__(
             source_config=source_config,
             config=config,
             debug=debug,
             emit_connector_builder_messages=emit_connector_builder_messages,
+            migrate_manifest=migrate_manifest,
+            normalize_manifest=normalize_manifest,
             component_factory=component_factory,
             config_path=config_path,
         )
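The new limits parameter is how capped test reads reach the factory: when limits is set, page and slice fetching are bounded and retries and caching are disabled. A hedged sketch of constructing the source with limits, assuming the leading catalog/config/state parameters keep their 6.60.15 meaning (the manifest and config bodies below are placeholders, not values from this diff):

    from airbyte_cdk.sources.declarative.concurrent_declarative_source import (
        ConcurrentDeclarativeSource,
        TestLimits,
    )

    manifest = {"version": "...", "type": "DeclarativeSource", "streams": []}  # placeholder
    config = {"api_key": "..."}  # placeholder

    source = ConcurrentDeclarativeSource(
        catalog=None,
        config=config,
        state=None,
        source_config=manifest,
        emit_connector_builder_messages=True,
        # Caps are wired into ModelToComponentFactory and the partition generators.
        limits=TestLimits(max_records=50, max_slices=2),
    )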
@@ -117,6 +170,7 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
             initial_number_of_partitions_to_generate=initial_number_of_partitions_to_generate,
             logger=self.logger,
             slice_logger=self._slice_logger,
+            queue=queue,
             message_repository=self.message_repository,
         )
@@ -280,8 +334,14 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                         retriever=retriever,
                         message_repository=self.message_repository,
+                        max_records_limit=self._limits.max_records
+                        if self._limits
+                        else None,
                     ),
                     stream_slicer=declarative_stream.retriever.stream_slicer,
+                    slice_limit=self._limits.max_slices
+                    if self._limits
+                    else None,  # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
                 )
             else:
                 if (
@@ -311,8 +371,12 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                         retriever=retriever,
                         message_repository=self.message_repository,
+                        max_records_limit=self._limits.max_records
+                        if self._limits
+                        else None,
                     ),
                     stream_slicer=cursor,
+                    slice_limit=self._limits.max_slices if self._limits else None,
                 )
 
                 concurrent_streams.append(
@@ -341,8 +405,12 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                     schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                     retriever=declarative_stream.retriever,
                     message_repository=self.message_repository,
+                    max_records_limit=self._limits.max_records if self._limits else None,
                 ),
                 declarative_stream.retriever.stream_slicer,
+                slice_limit=self._limits.max_slices
+                if self._limits
+                else None,  # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
             )
 
             final_state_cursor = FinalStateCursor(
@@ -401,8 +469,10 @@ class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
                         schema_loader=declarative_stream._schema_loader,  # type: ignore # We are accessing the private property but the public one is optional and we will remove this code soonish
                         retriever=retriever,
                         message_repository=self.message_repository,
+                        max_records_limit=self._limits.max_records if self._limits else None,
                     ),
                     perpartition_cursor,
+                    slice_limit=self._limits.max_slices if self._limits else None,
                 )
 
                 concurrent_streams.append(