airbyte-cdk 6.61.6__py3-none-any.whl → 6.61.6.post3.dev17473738577__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py +4 -2
- airbyte_cdk/manifest_server/api_models/__init__.py +2 -0
- airbyte_cdk/manifest_server/api_models/manifest.py +12 -0
- airbyte_cdk/manifest_server/api_models/stream.py +2 -2
- airbyte_cdk/manifest_server/helpers/__init__.py +0 -0
- airbyte_cdk/manifest_server/helpers/tracing.py +36 -0
- airbyte_cdk/manifest_server/routers/manifest.py +37 -1
- airbyte_cdk/sources/declarative/concurrent_declarative_source.py +2 -2
- airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +57 -7
- airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +4 -2
- airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +229 -281
- airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +0 -6
- airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -5
- airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +0 -6
- airbyte_cdk/sources/declarative/partition_routers/partition_router.py +1 -23
- airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +0 -6
- airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +88 -107
- airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py +95 -0
- airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +4 -1
- airbyte_cdk/sources/declarative/retrievers/retriever.py +5 -0
- airbyte_cdk/sources/message/repository.py +20 -0
- {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/METADATA +1 -1
- {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/RECORD +28 -25
- /airbyte_cdk/manifest_server/{auth.py → helpers/auth.py} +0 -0
- {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/LICENSE.txt +0 -0
- {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/LICENSE_SHORT +0 -0
- {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/WHEEL +0 -0
- {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/entry_points.txt +0 -0
@@ -146,8 +146,10 @@ class PerPartitionCursor(DeclarativeCursor):
|
|
146
146
|
if "state" in stream_state:
|
147
147
|
self._state_to_migrate_from = stream_state["state"]
|
148
148
|
|
149
|
-
#
|
150
|
-
|
149
|
+
# We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
|
150
|
+
# Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
|
151
|
+
# We are still keeping this line as a comment to be explicit about the past behavior.
|
152
|
+
# self._partition_router.set_initial_state(stream_state)
|
151
153
|
|
152
154
|
def observe(self, stream_slice: StreamSlice, record: Record) -> None:
|
153
155
|
self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe(
|
@@ -12,6 +12,7 @@ from .manifest import (
|
|
12
12
|
DiscoverResponse,
|
13
13
|
FullResolveRequest,
|
14
14
|
ManifestResponse,
|
15
|
+
RequestContext,
|
15
16
|
ResolveRequest,
|
16
17
|
StreamTestReadRequest,
|
17
18
|
)
|
@@ -30,6 +31,7 @@ __all__ = [
|
|
30
31
|
"ConnectorConfig",
|
31
32
|
"Manifest",
|
32
33
|
# Manifest request/response models
|
34
|
+
"RequestContext",
|
33
35
|
"FullResolveRequest",
|
34
36
|
"ManifestResponse",
|
35
37
|
"StreamTestReadRequest",
|
@@ -13,6 +13,13 @@ from pydantic import BaseModel, Field
|
|
13
13
|
from .dicts import ConnectorConfig, Manifest
|
14
14
|
|
15
15
|
|
16
|
+
class RequestContext(BaseModel):
|
17
|
+
"""Optional context information for tracing and observability."""
|
18
|
+
|
19
|
+
workspace_id: Optional[str] = None
|
20
|
+
project_id: Optional[str] = None
|
21
|
+
|
22
|
+
|
16
23
|
class StreamTestReadRequest(BaseModel):
|
17
24
|
"""Request to test read from a specific stream."""
|
18
25
|
|
@@ -24,6 +31,7 @@ class StreamTestReadRequest(BaseModel):
|
|
24
31
|
record_limit: int = Field(default=100, ge=1, le=5000)
|
25
32
|
page_limit: int = Field(default=5, ge=1, le=20)
|
26
33
|
slice_limit: int = Field(default=5, ge=1, le=20)
|
34
|
+
context: Optional[RequestContext] = None
|
27
35
|
|
28
36
|
|
29
37
|
class CheckRequest(BaseModel):
|
@@ -31,6 +39,7 @@ class CheckRequest(BaseModel):
|
|
31
39
|
|
32
40
|
manifest: Manifest
|
33
41
|
config: ConnectorConfig
|
42
|
+
context: Optional[RequestContext] = None
|
34
43
|
|
35
44
|
|
36
45
|
class CheckResponse(BaseModel):
|
@@ -45,6 +54,7 @@ class DiscoverRequest(BaseModel):
|
|
45
54
|
|
46
55
|
manifest: Manifest
|
47
56
|
config: ConnectorConfig
|
57
|
+
context: Optional[RequestContext] = None
|
48
58
|
|
49
59
|
|
50
60
|
class DiscoverResponse(BaseModel):
|
@@ -57,6 +67,7 @@ class ResolveRequest(BaseModel):
|
|
57
67
|
"""Request to resolve a manifest."""
|
58
68
|
|
59
69
|
manifest: Manifest
|
70
|
+
context: Optional[RequestContext] = None
|
60
71
|
|
61
72
|
|
62
73
|
class ManifestResponse(BaseModel):
|
@@ -71,3 +82,4 @@ class FullResolveRequest(BaseModel):
|
|
71
82
|
manifest: Manifest
|
72
83
|
config: ConnectorConfig
|
73
84
|
stream_limit: int = Field(default=100, ge=1, le=100)
|
85
|
+
context: Optional[RequestContext] = None
|
@@ -6,7 +6,7 @@ They accurately reflect the runtime types returned by the CDK, particularly
|
|
6
6
|
fixing type mismatches like slice_descriptor being a string rather than an object.
|
7
7
|
"""
|
8
8
|
|
9
|
-
from typing import Any, Dict, List, Optional
|
9
|
+
from typing import Any, Dict, List, Optional, Union
|
10
10
|
|
11
11
|
from pydantic import BaseModel
|
12
12
|
|
@@ -59,7 +59,7 @@ class StreamReadSlices(BaseModel):
|
|
59
59
|
"""Slices of data read from a stream."""
|
60
60
|
|
61
61
|
pages: List[StreamReadPages]
|
62
|
-
slice_descriptor: Optional[str] #
|
62
|
+
slice_descriptor: Optional[Union[Dict[str, Any], str]] # We're seeing strings at runtime
|
63
63
|
state: Optional[List[Dict[str, Any]]] = None
|
64
64
|
auxiliary_requests: Optional[List[AuxiliaryRequest]] = None
|
65
65
|
|
File without changes
|
@@ -0,0 +1,36 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Optional
|
3
|
+
|
4
|
+
import ddtrace
|
5
|
+
|
6
|
+
logger = logging.getLogger(__name__)
|
7
|
+
|
8
|
+
|
9
|
+
def apply_trace_tags_from_context(
|
10
|
+
workspace_id: Optional[str] = None,
|
11
|
+
project_id: Optional[str] = None,
|
12
|
+
) -> None:
|
13
|
+
"""Apply trace tags from context to the current span."""
|
14
|
+
if not workspace_id and not project_id:
|
15
|
+
return
|
16
|
+
|
17
|
+
# Log the trace IDs for observability
|
18
|
+
log_parts = []
|
19
|
+
if workspace_id:
|
20
|
+
log_parts.append(f"workspace_id={workspace_id}")
|
21
|
+
if project_id:
|
22
|
+
log_parts.append(f"project_id={project_id}")
|
23
|
+
|
24
|
+
if log_parts:
|
25
|
+
logger.info(f"Processing request with trace tags: {', '.join(log_parts)}")
|
26
|
+
|
27
|
+
try:
|
28
|
+
span = ddtrace.tracer.current_span()
|
29
|
+
if span:
|
30
|
+
if workspace_id:
|
31
|
+
span.set_tag("workspace_id", workspace_id)
|
32
|
+
if project_id:
|
33
|
+
span.set_tag("project_id", project_id)
|
34
|
+
except Exception:
|
35
|
+
# Silently ignore any ddtrace-related errors (e.g. if ddtrace.auto wasn't run)
|
36
|
+
pass
|
@@ -27,9 +27,10 @@ from ..api_models import (
|
|
27
27
|
StreamReadResponse,
|
28
28
|
StreamTestReadRequest,
|
29
29
|
)
|
30
|
-
from ..auth import verify_jwt_token
|
31
30
|
from ..command_processor.processor import ManifestCommandProcessor
|
32
31
|
from ..command_processor.utils import build_catalog, build_source
|
32
|
+
from ..helpers.auth import verify_jwt_token
|
33
|
+
from ..helpers.tracing import apply_trace_tags_from_context
|
33
34
|
|
34
35
|
|
35
36
|
def safe_build_source(
|
@@ -68,6 +69,13 @@ def test_read(request: StreamTestReadRequest) -> StreamReadResponse:
|
|
68
69
|
"""
|
69
70
|
Test reading from a specific stream in the manifest.
|
70
71
|
"""
|
72
|
+
# Apply trace tags from context if provided
|
73
|
+
if request.context:
|
74
|
+
apply_trace_tags_from_context(
|
75
|
+
workspace_id=request.context.workspace_id,
|
76
|
+
project_id=request.context.project_id,
|
77
|
+
)
|
78
|
+
|
71
79
|
config_dict = request.config.model_dump()
|
72
80
|
|
73
81
|
catalog = build_catalog(request.stream_name)
|
@@ -104,6 +112,13 @@ def test_read(request: StreamTestReadRequest) -> StreamReadResponse:
|
|
104
112
|
@router.post("/check", operation_id="check")
|
105
113
|
def check(request: CheckRequest) -> CheckResponse:
|
106
114
|
"""Check configuration against a manifest"""
|
115
|
+
# Apply trace tags from context if provided
|
116
|
+
if request.context:
|
117
|
+
apply_trace_tags_from_context(
|
118
|
+
workspace_id=request.context.workspace_id,
|
119
|
+
project_id=request.context.project_id,
|
120
|
+
)
|
121
|
+
|
107
122
|
source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
|
108
123
|
runner = ManifestCommandProcessor(source)
|
109
124
|
success, message = runner.check_connection(request.config.model_dump())
|
@@ -113,6 +128,13 @@ def check(request: CheckRequest) -> CheckResponse:
|
|
113
128
|
@router.post("/discover", operation_id="discover")
|
114
129
|
def discover(request: DiscoverRequest) -> DiscoverResponse:
|
115
130
|
"""Discover streams from a manifest"""
|
131
|
+
# Apply trace tags from context if provided
|
132
|
+
if request.context:
|
133
|
+
apply_trace_tags_from_context(
|
134
|
+
workspace_id=request.context.workspace_id,
|
135
|
+
project_id=request.context.project_id,
|
136
|
+
)
|
137
|
+
|
116
138
|
source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
|
117
139
|
runner = ManifestCommandProcessor(source)
|
118
140
|
catalog = runner.discover(request.config.model_dump())
|
@@ -124,6 +146,13 @@ def discover(request: DiscoverRequest) -> DiscoverResponse:
|
|
124
146
|
@router.post("/resolve", operation_id="resolve")
|
125
147
|
def resolve(request: ResolveRequest) -> ManifestResponse:
|
126
148
|
"""Resolve a manifest to its final configuration."""
|
149
|
+
# Apply trace tags from context if provided
|
150
|
+
if request.context:
|
151
|
+
apply_trace_tags_from_context(
|
152
|
+
workspace_id=request.context.workspace_id,
|
153
|
+
project_id=request.context.project_id,
|
154
|
+
)
|
155
|
+
|
127
156
|
source = safe_build_source(request.manifest.model_dump(), {})
|
128
157
|
return ManifestResponse(manifest=Manifest(**source.resolved_manifest))
|
129
158
|
|
@@ -135,6 +164,13 @@ def full_resolve(request: FullResolveRequest) -> ManifestResponse:
|
|
135
164
|
|
136
165
|
This is a similar operation to resolve, but has an extra step which generates streams from dynamic stream templates if the manifest contains any. This is used when a user clicks the generate streams button on a stream template in the Builder UI.
|
137
166
|
"""
|
167
|
+
# Apply trace tags from context if provided
|
168
|
+
if request.context:
|
169
|
+
apply_trace_tags_from_context(
|
170
|
+
workspace_id=request.context.workspace_id,
|
171
|
+
project_id=request.context.project_id,
|
172
|
+
)
|
173
|
+
|
138
174
|
source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
|
139
175
|
manifest = {**source.resolved_manifest}
|
140
176
|
streams = manifest.get("streams", [])
|
@@ -704,7 +704,7 @@ class ConcurrentDeclarativeSource(AbstractSource):
|
|
704
704
|
stream_slicer=declarative_stream.retriever.stream_slicer,
|
705
705
|
slice_limit=self._limits.max_slices
|
706
706
|
if self._limits
|
707
|
-
else None, # technically not needed because
|
707
|
+
else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency, and depending on how we build create_default_stream, this may be needed later
|
708
708
|
)
|
709
709
|
else:
|
710
710
|
if (
|
@@ -773,7 +773,7 @@ class ConcurrentDeclarativeSource(AbstractSource):
|
|
773
773
|
declarative_stream.retriever.stream_slicer,
|
774
774
|
slice_limit=self._limits.max_slices
|
775
775
|
if self._limits
|
776
|
-
else None, # technically not needed because
|
776
|
+
else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency, and depending on how we build create_default_stream, this may be needed later
|
777
777
|
)
|
778
778
|
|
779
779
|
final_state_cursor = FinalStateCursor(
|
@@ -11,6 +11,13 @@ from copy import deepcopy
|
|
11
11
|
from datetime import timedelta
|
12
12
|
from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional
|
13
13
|
|
14
|
+
from airbyte_cdk.models import (
|
15
|
+
AirbyteStateBlob,
|
16
|
+
AirbyteStateMessage,
|
17
|
+
AirbyteStateType,
|
18
|
+
AirbyteStreamState,
|
19
|
+
StreamDescriptor,
|
20
|
+
)
|
14
21
|
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
|
15
22
|
from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
|
16
23
|
Timer,
|
@@ -48,7 +55,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
48
55
|
Manages state per partition when a stream has many partitions, preventing data loss or duplication.
|
49
56
|
|
50
57
|
Attributes:
|
51
|
-
DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000).
|
58
|
+
DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000). This limit needs to be higher than the number of threads we might enqueue (which is represented by ThreadPoolManager.DEFAULT_MAX_QUEUE_SIZE). Otherwise, partitions that have been generated and submitted to the ThreadPool could be deleted from the ConcurrentPerPartitionCursor before they are closed, and closing them would then raise a KeyError.
|
52
59
|
|
53
60
|
- **Partition Limitation Logic**
|
54
61
|
Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. Oldest partitions are removed when the limit is reached.
|
@@ -128,6 +135,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
128
135
|
|
129
136
|
# FIXME this is a temporary field for the duration of the migration from declarative cursors to concurrent ones
|
130
137
|
self._attempt_to_create_cursor_if_not_provided = attempt_to_create_cursor_if_not_provided
|
138
|
+
self._synced_some_data = False
|
131
139
|
|
132
140
|
@property
|
133
141
|
def cursor_field(self) -> CursorField:
|
@@ -168,8 +176,8 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
168
176
|
with self._lock:
|
169
177
|
self._semaphore_per_partition[partition_key].acquire()
|
170
178
|
if not self._use_global_cursor:
|
171
|
-
self._cursor_per_partition[partition_key].close_partition(partition=partition)
|
172
179
|
cursor = self._cursor_per_partition[partition_key]
|
180
|
+
cursor.close_partition(partition=partition)
|
173
181
|
if (
|
174
182
|
partition_key in self._partitions_done_generating_stream_slices
|
175
183
|
and self._semaphore_per_partition[partition_key]._value == 0
|
@@ -213,8 +221,10 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
213
221
|
if not any(
|
214
222
|
semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
|
215
223
|
):
|
216
|
-
|
217
|
-
|
224
|
+
if self._synced_some_data:
|
225
|
+
# we only update those if we actually synced some data
|
226
|
+
self._global_cursor = self._new_global_cursor
|
227
|
+
self._lookback_window = self._timer.finish()
|
218
228
|
self._parent_state = self._partition_router.get_stream_state()
|
219
229
|
self._emit_state_message(throttle=False)
|
220
230
|
|
@@ -422,9 +432,6 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
422
432
|
if stream_state.get("parent_state"):
|
423
433
|
self._parent_state = stream_state["parent_state"]
|
424
434
|
|
425
|
-
# Set parent state for partition routers based on parent streams
|
426
|
-
self._partition_router.set_initial_state(stream_state)
|
427
|
-
|
428
435
|
def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
|
429
436
|
"""
|
430
437
|
Initializes the global cursor state from the provided stream state.
|
@@ -458,6 +465,7 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
458
465
|
except ValueError:
|
459
466
|
return
|
460
467
|
|
468
|
+
self._synced_some_data = True
|
461
469
|
record_cursor = self._connector_state_converter.output_format(
|
462
470
|
self._connector_state_converter.parse_value(record_cursor_value)
|
463
471
|
)
|
@@ -541,3 +549,45 @@ class ConcurrentPerPartitionCursor(Cursor):
|
|
541
549
|
|
542
550
|
def limit_reached(self) -> bool:
|
543
551
|
return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
|
552
|
+
|
553
|
+
@staticmethod
|
554
|
+
def get_parent_state(
|
555
|
+
stream_state: Optional[StreamState], parent_stream_name: str
|
556
|
+
) -> Optional[AirbyteStateMessage]:
|
557
|
+
if not stream_state:
|
558
|
+
return None
|
559
|
+
|
560
|
+
if "parent_state" not in stream_state:
|
561
|
+
logger.warning(
|
562
|
+
f"Trying to get_parent_state for stream `{parent_stream_name}` when there are not parent state in the state"
|
563
|
+
)
|
564
|
+
return None
|
565
|
+
elif parent_stream_name not in stream_state["parent_state"]:
|
566
|
+
logger.info(
|
567
|
+
f"Could not find parent state for stream `{parent_stream_name}`. On parents available are {list(stream_state['parent_state'].keys())}"
|
568
|
+
)
|
569
|
+
return None
|
570
|
+
|
571
|
+
return AirbyteStateMessage(
|
572
|
+
type=AirbyteStateType.STREAM,
|
573
|
+
stream=AirbyteStreamState(
|
574
|
+
stream_descriptor=StreamDescriptor(parent_stream_name, None),
|
575
|
+
stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]),
|
576
|
+
),
|
577
|
+
)
|
578
|
+
|
579
|
+
@staticmethod
|
580
|
+
def get_global_state(
|
581
|
+
stream_state: Optional[StreamState], parent_stream_name: str
|
582
|
+
) -> Optional[AirbyteStateMessage]:
|
583
|
+
return (
|
584
|
+
AirbyteStateMessage(
|
585
|
+
type=AirbyteStateType.STREAM,
|
586
|
+
stream=AirbyteStreamState(
|
587
|
+
stream_descriptor=StreamDescriptor(parent_stream_name, None),
|
588
|
+
stream_state=AirbyteStateBlob(stream_state["state"]),
|
589
|
+
),
|
590
|
+
)
|
591
|
+
if stream_state and "state" in stream_state
|
592
|
+
else None
|
593
|
+
)
|
@@ -192,8 +192,10 @@ class GlobalSubstreamCursor(DeclarativeCursor):
|
|
192
192
|
# Example: {"global_state_format_key": "global_state_format_value"}
|
193
193
|
self._stream_cursor.set_initial_state(stream_state)
|
194
194
|
|
195
|
-
#
|
196
|
-
|
195
|
+
# We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
|
196
|
+
# Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
|
197
|
+
# We are still keeping this line as a comment to be explicit about the past behavior.
|
198
|
+
# self._partition_router.set_initial_state(stream_state)
|
197
199
|
|
198
200
|
def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None:
|
199
201
|
"""
|