airbyte-cdk 6.61.5__py3-none-any.whl → 6.61.6.post3.dev17473738577__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/connector_builder/connector_builder_handler.py +7 -7
- airbyte_cdk/connector_builder/main.py +2 -2
- airbyte_cdk/connector_builder/test_reader/reader.py +2 -2
- airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py +4 -2
- airbyte_cdk/manifest_server/Dockerfile +2 -2
- airbyte_cdk/manifest_server/api_models/__init__.py +2 -0
- airbyte_cdk/manifest_server/api_models/manifest.py +12 -0
- airbyte_cdk/manifest_server/api_models/stream.py +2 -2
- airbyte_cdk/manifest_server/command_processor/processor.py +2 -4
- airbyte_cdk/manifest_server/command_processor/utils.py +1 -1
- airbyte_cdk/manifest_server/helpers/__init__.py +0 -0
- airbyte_cdk/manifest_server/helpers/tracing.py +36 -0
- airbyte_cdk/manifest_server/routers/manifest.py +38 -2
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +7 -6
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +57 -7
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +4 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +229 -281
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +0 -6
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -5
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +0 -6
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +1 -23
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +0 -6
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +88 -107
- airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py +95 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +4 -1
- airbyte_cdk/sources/declarative/retrievers/retriever.py +5 -0
- airbyte_cdk/sources/declarative/yaml_declarative_source.py +1 -1
- airbyte_cdk/sources/message/repository.py +20 -0
- airbyte_cdk/sources/utils/schema_helpers.py +29 -9
- airbyte_cdk/sources/utils/transform.py +25 -13
- airbyte_cdk/utils/spec_schema_transformations.py +7 -5
- {airbyte_cdk-6.61.5.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/METADATA +3 -2
- {airbyte_cdk-6.61.5.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/RECORD +38 -35
- /airbyte_cdk/manifest_server/{auth.py → helpers/auth.py} +0 -0
- {airbyte_cdk-6.61.5.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.61.5.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.61.5.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.61.5.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/entry_points.txt +0 -0
@@ -62,10 +62,10 @@ def should_normalize_manifest(config: Mapping[str, Any]) -> bool:
|
|
62
62
|
|
63
63
|
def create_source(
|
64
64
|
config: Mapping[str, Any],
|
65
|
-
limits: TestLimits,
|
66
|
-
catalog:
|
67
|
-
state:
|
68
|
-
) -> ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]:
|
65
|
+
limits: TestLimits | None = None,
|
66
|
+
catalog: ConfiguredAirbyteCatalog | None = None,
|
67
|
+
state: List[AirbyteStateMessage] | None = None,
|
68
|
+
) -> ConcurrentDeclarativeSource:
|
69
69
|
manifest = config["__injected_declarative_manifest"]
|
70
70
|
|
71
71
|
# We enforce a concurrency level of 1 so that the stream is processed on a single thread
|
@@ -88,7 +88,7 @@ def create_source(
|
|
88
88
|
|
89
89
|
|
90
90
|
def read_stream(
|
91
|
-
source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]],
|
91
|
+
source: ConcurrentDeclarativeSource,
|
92
92
|
config: Mapping[str, Any],
|
93
93
|
configured_catalog: ConfiguredAirbyteCatalog,
|
94
94
|
state: List[AirbyteStateMessage],
|
@@ -127,7 +127,7 @@ def read_stream(
|
|
127
127
|
|
128
128
|
|
129
129
|
def resolve_manifest(
|
130
|
-
source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]],
|
130
|
+
source: ConcurrentDeclarativeSource,
|
131
131
|
) -> AirbyteMessage:
|
132
132
|
try:
|
133
133
|
return AirbyteMessage(
|
@@ -146,7 +146,7 @@ def resolve_manifest(
|
|
146
146
|
|
147
147
|
|
148
148
|
def full_resolve_manifest(
|
149
|
-
source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]], limits: TestLimits
|
149
|
+
source: ConcurrentDeclarativeSource, limits: TestLimits
|
150
150
|
) -> AirbyteMessage:
|
151
151
|
try:
|
152
152
|
manifest = {**source.resolved_manifest}
|
@@ -34,7 +34,7 @@ from airbyte_cdk.utils.traced_exception import AirbyteTracedException
|
|
34
34
|
|
35
35
|
def get_config_and_catalog_from_args(
|
36
36
|
args: List[str],
|
37
|
-
) -> Tuple[str, Mapping[str, Any], Optional[ConfiguredAirbyteCatalog],
|
37
|
+
) -> Tuple[str, Mapping[str, Any], Optional[ConfiguredAirbyteCatalog], List[AirbyteStateMessage]]:
|
38
38
|
# TODO: Add functionality for the `debug` logger.
|
39
39
|
# Currently, no one `debug` level log will be displayed during `read` a stream for a connector created through `connector-builder`.
|
40
40
|
parsed_args = AirbyteEntrypoint.parse_args(args)
|
@@ -70,7 +70,7 @@ def get_config_and_catalog_from_args(
|
|
70
70
|
|
71
71
|
|
72
72
|
def handle_connector_builder_request(
|
73
|
-
source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]],
|
73
|
+
source: ConcurrentDeclarativeSource,
|
74
74
|
command: str,
|
75
75
|
config: Mapping[str, Any],
|
76
76
|
catalog: Optional[ConfiguredAirbyteCatalog],
|
@@ -85,7 +85,7 @@ class TestReader:
|
|
85
85
|
|
86
86
|
def run_test_read(
|
87
87
|
self,
|
88
|
-
source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]],
|
88
|
+
source: ConcurrentDeclarativeSource,
|
89
89
|
config: Mapping[str, Any],
|
90
90
|
configured_catalog: ConfiguredAirbyteCatalog,
|
91
91
|
stream_name: str,
|
@@ -383,7 +383,7 @@ class TestReader:
|
|
383
383
|
|
384
384
|
def _read_stream(
|
385
385
|
self,
|
386
|
-
source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]],
|
386
|
+
source: ConcurrentDeclarativeSource,
|
387
387
|
config: Mapping[str, Any],
|
388
388
|
configured_catalog: ConfiguredAirbyteCatalog,
|
389
389
|
state: List[AirbyteStateMessage],
|
@@ -146,8 +146,10 @@ class PerPartitionCursor(DeclarativeCursor):
|
|
146
146
|
if "state" in stream_state:
|
147
147
|
self._state_to_migrate_from = stream_state["state"]
|
148
148
|
|
149
|
-
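# Set parent state for partition routers based on parent streams
|
150
|
-
self._partition_router.set_initial_state(stream_state)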
|
149
|
+
# We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
|
150
|
+
# Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
|
151
|
+
# We are still keeping this line as a comment to be explicit about the past behavior.
|
152
|
+
# self._partition_router.set_initial_state(stream_state)
|
151
153
|
|
152
154
|
def observe(self, stream_slice: StreamSlice, record: Record) -> None:
|
153
155
|
self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe(
|
@@ -11,7 +11,7 @@ FROM python:3.12-slim-bookworm
|
|
11
11
|
RUN apt-get update && \
|
12
12
|
apt-get install -y git && \
|
13
13
|
rm -rf /var/lib/apt/lists/* && \
|
14
|
-
pip install poetry==
|
14
|
+
pip install poetry==2.0.1
|
15
15
|
|
16
16
|
# Configure poetry to not create virtual environments and disable interactive mode
|
17
17
|
ENV POETRY_NO_INTERACTION=1 \
|
@@ -42,4 +42,4 @@ USER airbyte:airbyte
|
|
42
42
|
|
43
43
|
EXPOSE 8080
|
44
44
|
|
45
|
-
CMD ["uvicorn", "airbyte_cdk.manifest_server.app:app", "--host", "0.0.0.0", "--port", "8080"]
|
45
|
+
CMD ["uvicorn", "airbyte_cdk.manifest_server.app:app", "--host", "0.0.0.0", "--port", "8080"]
|
@@ -12,6 +12,7 @@ from .manifest import (
|
|
12
12
|
DiscoverResponse,
|
13
13
|
FullResolveRequest,
|
14
14
|
ManifestResponse,
|
15
|
+
RequestContext,
|
15
16
|
ResolveRequest,
|
16
17
|
StreamTestReadRequest,
|
17
18
|
)
|
@@ -30,6 +31,7 @@ __all__ = [
|
|
30
31
|
"ConnectorConfig",
|
31
32
|
"Manifest",
|
32
33
|
# Manifest request/response models
|
34
|
+
"RequestContext",
|
33
35
|
"FullResolveRequest",
|
34
36
|
"ManifestResponse",
|
35
37
|
"StreamTestReadRequest",
|
@@ -13,6 +13,13 @@ from pydantic import BaseModel, Field
|
|
13
13
|
from .dicts import ConnectorConfig, Manifest
|
14
14
|
|
15
15
|
|
16
|
+
class RequestContext(BaseModel):
|
17
|
+
"""Optional context information for tracing and observability."""
|
18
|
+
|
19
|
+
workspace_id: Optional[str] = None
|
20
|
+
project_id: Optional[str] = None
|
21
|
+
|
22
|
+
|
16
23
|
class StreamTestReadRequest(BaseModel):
|
17
24
|
"""Request to test read from a specific stream."""
|
18
25
|
|
@@ -24,6 +31,7 @@ class StreamTestReadRequest(BaseModel):
|
|
24
31
|
record_limit: int = Field(default=100, ge=1, le=5000)
|
25
32
|
page_limit: int = Field(default=5, ge=1, le=20)
|
26
33
|
slice_limit: int = Field(default=5, ge=1, le=20)
|
34
|
+
context: Optional[RequestContext] = None
|
27
35
|
|
28
36
|
|
29
37
|
class CheckRequest(BaseModel):
|
@@ -31,6 +39,7 @@ class CheckRequest(BaseModel):
|
|
31
39
|
|
32
40
|
manifest: Manifest
|
33
41
|
config: ConnectorConfig
|
42
|
+
context: Optional[RequestContext] = None
|
34
43
|
|
35
44
|
|
36
45
|
class CheckResponse(BaseModel):
|
@@ -45,6 +54,7 @@ class DiscoverRequest(BaseModel):
|
|
45
54
|
|
46
55
|
manifest: Manifest
|
47
56
|
config: ConnectorConfig
|
57
|
+
context: Optional[RequestContext] = None
|
48
58
|
|
49
59
|
|
50
60
|
class DiscoverResponse(BaseModel):
|
@@ -57,6 +67,7 @@ class ResolveRequest(BaseModel):
|
|
57
67
|
"""Request to resolve a manifest."""
|
58
68
|
|
59
69
|
manifest: Manifest
|
70
|
+
context: Optional[RequestContext] = None
|
60
71
|
|
61
72
|
|
62
73
|
class ManifestResponse(BaseModel):
|
@@ -71,3 +82,4 @@ class FullResolveRequest(BaseModel):
|
|
71
82
|
manifest: Manifest
|
72
83
|
config: ConnectorConfig
|
73
84
|
stream_limit: int = Field(default=100, ge=1, le=100)
|
85
|
+
context: Optional[RequestContext] = None
|
@@ -6,7 +6,7 @@ They accurately reflect the runtime types returned by the CDK, particularly
|
|
6
6
|
fixing type mismatches like slice_descriptor being a string rather than an object.
|
7
7
|
"""
|
8
8
|
|
9
|
-
from typing import Any, Dict, List, Optional
|
9
|
+
from typing import Any, Dict, List, Optional, Union
|
10
10
|
|
11
11
|
from pydantic import BaseModel
|
12
12
|
|
@@ -59,7 +59,7 @@ class StreamReadSlices(BaseModel):
|
|
59
59
|
"""Slices of data read from a stream."""
|
60
60
|
|
61
61
|
pages: List[StreamReadPages]
|
62
|
-
slice_descriptor: Optional[str] #
|
62
|
+
slice_descriptor: Optional[Union[Dict[str, Any], str]] # We're seeing strings at runtime
|
63
63
|
state: Optional[List[Dict[str, Any]]] = None
|
64
64
|
auxiliary_requests: Optional[List[AuxiliaryRequest]] = None
|
65
65
|
|
@@ -21,12 +21,10 @@ from airbyte_cdk.test.entrypoint_wrapper import AirbyteEntrypointException, Entr
|
|
21
21
|
|
22
22
|
|
23
23
|
class ManifestCommandProcessor:
|
24
|
-
_source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]
|
24
|
+
_source: ConcurrentDeclarativeSource
|
25
25
|
_logger = logging.getLogger("airbyte.manifest-server")
|
26
26
|
|
27
|
-
def __init__(
|
28
|
-
self, source: ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]
|
29
|
-
) -> None:
|
27
|
+
def __init__(self, source: ConcurrentDeclarativeSource) -> None:
|
30
28
|
self._source = source
|
31
29
|
|
32
30
|
def test_read(
|
@@ -63,7 +63,7 @@ def build_source(
|
|
63
63
|
record_limit: Optional[int] = None,
|
64
64
|
page_limit: Optional[int] = None,
|
65
65
|
slice_limit: Optional[int] = None,
|
66
|
-
) -> ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]:
|
66
|
+
) -> ConcurrentDeclarativeSource:
|
67
67
|
# We enforce a concurrency level of 1 so that the stream is processed on a single thread
|
68
68
|
# to retain ordering for the grouping of the builder message responses.
|
69
69
|
definition = copy.deepcopy(manifest)
|
File without changes
|
@@ -0,0 +1,36 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
import ddtrace
|
5
|
+
|
6
|
+
logger = logging.getLogger(__name__)
|
7
|
+
|
8
|
+
|
9
|
+
def apply_trace_tags_from_context(
|
10
|
+
workspace_id: Optional[str] = None,
|
11
|
+
project_id: Optional[str] = None,
|
12
|
+
) -> None:
|
13
|
+
"""Apply trace tags from context to the current span."""
|
14
|
+
if not workspace_id and not project_id:
|
15
|
+
return
|
16
|
+
|
17
|
+
# Log the trace IDs for observability
|
18
|
+
log_parts = []
|
19
|
+
if workspace_id:
|
20
|
+
log_parts.append(f"workspace_id={workspace_id}")
|
21
|
+
if project_id:
|
22
|
+
log_parts.append(f"project_id={project_id}")
|
23
|
+
|
24
|
+
if log_parts:
|
25
|
+
logger.info(f"Processing request with trace tags: {', '.join(log_parts)}")
|
26
|
+
|
27
|
+
try:
|
28
|
+
span = ddtrace.tracer.current_span()
|
29
|
+
if span:
|
30
|
+
if workspace_id:
|
31
|
+
span.set_tag("workspace_id", workspace_id)
|
32
|
+
if project_id:
|
33
|
+
span.set_tag("project_id", project_id)
|
34
|
+
except Exception:
|
35
|
+
# Silently ignore any ddtrace-related errors (e.g. if ddtrace.auto wasn't run)
|
36
|
+
pass
|
@@ -27,9 +27,10 @@ from ..api_models import (
|
|
27
27
|
StreamReadResponse,
|
28
28
|
StreamTestReadRequest,
|
29
29
|
)
|
30
|
-
from ..auth import verify_jwt_token
|
31
30
|
from ..command_processor.processor import ManifestCommandProcessor
|
32
31
|
from ..command_processor.utils import build_catalog, build_source
|
32
|
+
from ..helpers.auth import verify_jwt_token
|
33
|
+
from ..helpers.tracing import apply_trace_tags_from_context
|
33
34
|
|
34
35
|
|
35
36
|
def safe_build_source(
|
@@ -40,7 +41,7 @@ def safe_build_source(
|
|
40
41
|
page_limit: Optional[int] = None,
|
41
42
|
slice_limit: Optional[int] = None,
|
42
43
|
record_limit: Optional[int] = None,
|
43
|
-
) -> ConcurrentDeclarativeSource[Optional[List[AirbyteStateMessage]]]:
|
44
|
+
) -> ConcurrentDeclarativeSource:
|
44
45
|
"""Wrapper around build_source that converts ValidationError to HTTPException."""
|
45
46
|
try:
|
46
47
|
return build_source(
|
@@ -68,6 +69,13 @@ def test_read(request: StreamTestReadRequest) -> StreamReadResponse:
|
|
68
69
|
"""
|
69
70
|
Test reading from a specific stream in the manifest.
|
70
71
|
"""
|
72
|
+
# Apply trace tags from context if provided
|
73
|
+
if request.context:
|
74
|
+
apply_trace_tags_from_context(
|
75
|
+
workspace_id=request.context.workspace_id,
|
76
|
+
project_id=request.context.project_id,
|
77
|
+
)
|
78
|
+
|
71
79
|
config_dict = request.config.model_dump()
|
72
80
|
|
73
81
|
catalog = build_catalog(request.stream_name)
|
@@ -104,6 +112,13 @@ def test_read(request: StreamTestReadRequest) -> StreamReadResponse:
|
|
104
112
|
@router.post("/check", operation_id="check")
|
105
113
|
def check(request: CheckRequest) -> CheckResponse:
|
106
114
|
"""Check configuration against a manifest"""
|
115
|
+
# Apply trace tags from context if provided
|
116
|
+
if request.context:
|
117
|
+
apply_trace_tags_from_context(
|
118
|
+
workspace_id=request.context.workspace_id,
|
119
|
+
project_id=request.context.project_id,
|
120
|
+
)
|
121
|
+
|
107
122
|
source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
|
108
123
|
runner = ManifestCommandProcessor(source)
|
109
124
|
success, message = runner.check_connection(request.config.model_dump())
|
@@ -113,6 +128,13 @@ def check(request: CheckRequest) -> CheckResponse:
|
|
113
128
|
@router.post("/discover", operation_id="discover")
|
114
129
|
def discover(request: DiscoverRequest) -> DiscoverResponse:
|
115
130
|
"""Discover streams from a manifest"""
|
131
|
+
# Apply trace tags from context if provided
|
132
|
+
if request.context:
|
133
|
+
apply_trace_tags_from_context(
|
134
|
+
workspace_id=request.context.workspace_id,
|
135
|
+
project_id=request.context.project_id,
|
136
|
+
)
|
137
|
+
|
116
138
|
source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
|
117
139
|
runner = ManifestCommandProcessor(source)
|
118
140
|
catalog = runner.discover(request.config.model_dump())
|
@@ -124,6 +146,13 @@ def discover(request: DiscoverRequest) -> DiscoverResponse:
|
|
124
146
|
@router.post("/resolve", operation_id="resolve")
|
125
147
|
def resolve(request: ResolveRequest) -> ManifestResponse:
|
126
148
|
"""Resolve a manifest to its final configuration."""
|
149
|
+
# Apply trace tags from context if provided
|
150
|
+
if request.context:
|
151
|
+
apply_trace_tags_from_context(
|
152
|
+
workspace_id=request.context.workspace_id,
|
153
|
+
project_id=request.context.project_id,
|
154
|
+
)
|
155
|
+
|
127
156
|
source = safe_build_source(request.manifest.model_dump(), {})
|
128
157
|
return ManifestResponse(manifest=Manifest(**source.resolved_manifest))
|
129
158
|
|
@@ -135,6 +164,13 @@ def full_resolve(request: FullResolveRequest) -> ManifestResponse:
|
|
135
164
|
|
136
165
|
This is a similar operation to resolve, but has an extra step which generates streams from dynamic stream templates if the manifest contains any. This is used when a user clicks the generate streams button on a stream template in the Builder UI
|
137
166
|
"""
|
167
|
+
# Apply trace tags from context if provided
|
168
|
+
if request.context:
|
169
|
+
apply_trace_tags_from_context(
|
170
|
+
workspace_id=request.context.workspace_id,
|
171
|
+
project_id=request.context.project_id,
|
172
|
+
)
|
173
|
+
|
138
174
|
source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
|
139
175
|
manifest = {**source.resolved_manifest}
|
140
176
|
streams = manifest.get("streams", [])
|
@@ -162,16 +162,17 @@ def _get_declarative_component_schema() -> Dict[str, Any]:
|
|
162
162
|
# is no longer inherited from since the only external dependency is from that class.
|
163
163
|
#
|
164
164
|
# todo: It is worth investigating removal of the Generic[TState] since it will always be Optional[List[AirbyteStateMessage]]
|
165
|
-
class ConcurrentDeclarativeSource(AbstractSource, Generic[TState]):
|
165
|
+
class ConcurrentDeclarativeSource(AbstractSource):
|
166
166
|
# By default, we defer to a value of 2. A value lower than could cause a PartitionEnqueuer to be stuck in a state of deadlock
|
167
167
|
# because it has hit the limit of futures but not partition reader is consuming them.
|
168
168
|
_LOWEST_SAFE_CONCURRENCY_LEVEL = 2
|
169
169
|
|
170
170
|
def __init__(
|
171
171
|
self,
|
172
|
-
catalog: Optional[ConfiguredAirbyteCatalog],
|
173
|
-
config: Optional[Mapping[str, Any]],
|
174
|
-
state: TState,
|
172
|
+
catalog: Optional[ConfiguredAirbyteCatalog] = None,
|
173
|
+
config: Optional[Mapping[str, Any]] = None,
|
174
|
+
state: Optional[List[AirbyteStateMessage]] = None,
|
175
|
+
*,
|
175
176
|
source_config: ConnectionDefinition,
|
176
177
|
debug: bool = False,
|
177
178
|
emit_connector_builder_messages: bool = False,
|
@@ -703,7 +704,7 @@ class ConcurrentDeclarativeSource(AbstractSource, Generic[TState]):
|
|
703
704
|
stream_slicer=declarative_stream.retriever.stream_slicer,
|
704
705
|
slice_limit=self._limits.max_slices
|
705
706
|
if self._limits
|
706
|
-
else None, # technically not needed because
|
707
|
+
else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
|
707
708
|
)
|
708
709
|
else:
|
709
710
|
if (
|
@@ -772,7 +773,7 @@ class ConcurrentDeclarativeSource(AbstractSource, Generic[TState]):
|
|
772
773
|
declarative_stream.retriever.stream_slicer,
|
773
774
|
slice_limit=self._limits.max_slices
|
774
775
|
if self._limits
|
775
|
-
else None, # technically not needed because
|
776
|
+
else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
|
776
777
|
)
|
777
778
|
|
778
779
|
final_state_cursor = FinalStateCursor(
|
@@ -11,6 +11,13 @@ from copy import deepcopy
|
|
11
11
|
from datetime import timedelta
|
12
12
|
from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional
|
13
13
|
|
14
|
+
from airbyte_cdk.models import (
|
15
|
+
AirbyteStateBlob,
|
16
|
+
AirbyteStateMessage,
|
17
|
+
AirbyteStateType,
|
18
|
+
AirbyteStreamState,
|
19
|
+
StreamDescriptor,
|
20
|
+
)
|
14
21
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
15
22
|
from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
|
16
23
|
Timer,
|
@@ -48,7 +55,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
48
55
|
Manages state per partition when a stream has many partitions, preventing data loss or duplication.
|
49
56
|
|
50
57
|
Attributes:
|
51
|
-
DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000).
|
58
|
+
DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000). This limit needs to be higher than the number of threads we might enqueue (which is represented by ThreadPoolManager.DEFAULT_MAX_QUEUE_SIZE). If not, we could have partitions that have been generated and submitted to the ThreadPool but got deleted from the ConcurrentPerPartitionCursor and when closing them, it will generate KeyError.
|
52
59
|
|
53
60
|
- **Partition Limitation Logic**
|
54
61
|
Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. Oldest partitions are removed when the limit is reached.
|
@@ -128,6 +135,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
128
135
|
|
129
136
|
# FIXME this is a temporary field the time of the migration from declarative cursors to concurrent ones
|
130
137
|
self._attempt_to_create_cursor_if_not_provided = attempt_to_create_cursor_if_not_provided
|
138
|
+
self._synced_some_data = False
|
131
139
|
|
132
140
|
@property
|
133
141
|
def cursor_field(self) -> CursorField:
|
@@ -168,8 +176,8 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
168
176
|
with self._lock:
|
169
177
|
self._semaphore_per_partition[partition_key].acquire()
|
170
178
|
if not self._use_global_cursor:
|
171
|
-
self._cursor_per_partition[partition_key].close_partition(partition=partition)
|
172
179
|
cursor = self._cursor_per_partition[partition_key]
|
180
|
+
cursor.close_partition(partition=partition)
|
173
181
|
if (
|
174
182
|
partition_key in self._partitions_done_generating_stream_slices
|
175
183
|
and self._semaphore_per_partition[partition_key]._value == 0
|
@@ -213,8 +221,10 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
213
221
|
if not any(
|
214
222
|
semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
|
215
223
|
):
|
216
|
-
self._global_cursor = self._new_global_cursor
|
217
|
-
self._lookback_window = self._timer.finish()
|
224
|
+
if self._synced_some_data:
|
225
|
+
# we only update those if we actually synced some data
|
226
|
+
self._global_cursor = self._new_global_cursor
|
227
|
+
self._lookback_window = self._timer.finish()
|
218
228
|
self._parent_state = self._partition_router.get_stream_state()
|
219
229
|
self._emit_state_message(throttle=False)
|
220
230
|
|
@@ -422,9 +432,6 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
422
432
|
if stream_state.get("parent_state"):
|
423
433
|
self._parent_state = stream_state["parent_state"]
|
424
434
|
|
425
|
-
# Set parent state for partition routers based on parent streams
|
426
|
-
self._partition_router.set_initial_state(stream_state)
|
427
|
-
|
428
435
|
def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
|
429
436
|
"""
|
430
437
|
Initializes the global cursor state from the provided stream state.
|
@@ -458,6 +465,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
458
465
|
except ValueError:
|
459
466
|
return
|
460
467
|
|
468
|
+
self._synced_some_data = True
|
461
469
|
record_cursor = self._connector_state_converter.output_format(
|
462
470
|
self._connector_state_converter.parse_value(record_cursor_value)
|
463
471
|
)
|
@@ -541,3 +549,45 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
541
549
|
|
542
550
|
def limit_reached(self) -> bool:
|
543
551
|
return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
|
552
|
+
|
553
|
+
@staticmethod
|
554
|
+
def get_parent_state(
|
555
|
+
stream_state: Optional[StreamState], parent_stream_name: str
|
556
|
+
) -> Optional[AirbyteStateMessage]:
|
557
|
+
if not stream_state:
|
558
|
+
return None
|
559
|
+
|
560
|
+
if "parent_state" not in stream_state:
|
561
|
+
logger.warning(
|
562
|
+
f"Trying to get_parent_state for stream `{parent_stream_name}` when there are not parent state in the state"
|
563
|
+
)
|
564
|
+
return None
|
565
|
+
elif parent_stream_name not in stream_state["parent_state"]:
|
566
|
+
logger.info(
|
567
|
+
f"Could not find parent state for stream `{parent_stream_name}`. On parents available are {list(stream_state['parent_state'].keys())}"
|
568
|
+
)
|
569
|
+
return None
|
570
|
+
|
571
|
+
return AirbyteStateMessage(
|
572
|
+
type=AirbyteStateType.STREAM,
|
573
|
+
stream=AirbyteStreamState(
|
574
|
+
stream_descriptor=StreamDescriptor(parent_stream_name, None),
|
575
|
+
stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]),
|
576
|
+
),
|
577
|
+
)
|
578
|
+
|
579
|
+
@staticmethod
|
580
|
+
def get_global_state(
|
581
|
+
stream_state: Optional[StreamState], parent_stream_name: str
|
582
|
+
) -> Optional[AirbyteStateMessage]:
|
583
|
+
return (
|
584
|
+
AirbyteStateMessage(
|
585
|
+
type=AirbyteStateType.STREAM,
|
586
|
+
stream=AirbyteStreamState(
|
587
|
+
stream_descriptor=StreamDescriptor(parent_stream_name, None),
|
588
|
+
stream_state=AirbyteStateBlob(stream_state["state"]),
|
589
|
+
),
|
590
|
+
)
|
591
|
+
if stream_state and "state" in stream_state
|
592
|
+
else None
|
593
|
+
)
|
@@ -192,8 +192,10 @@ class GlobalSubstreamCursor(DeclarativeCursor):
|
|
192
192
|
# Example: {"global_state_format_key": "global_state_format_value"}
|
193
193
|
self._stream_cursor.set_initial_state(stream_state)
|
194
194
|
|
195
|
-
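# Set parent state for partition routers based on parent streams
|
196
|
-
self._partition_router.set_initial_state(stream_state)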
|
195
|
+
# We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
|
196
|
+
# Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
|
197
|
+
# We are still keeping this line as a comment to be explicit about the past behavior.
|
198
|
+
# self._partition_router.set_initial_state(stream_state)
|
197
199
|
|
198
200
|
def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None:
|
199
201
|
"""
|