airbyte-cdk 6.61.6__py3-none-any.whl → 6.61.6.post3.dev17473738577__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py +4 -2
  2. airbyte_cdk/manifest_server/api_models/__init__.py +2 -0
  3. airbyte_cdk/manifest_server/api_models/manifest.py +12 -0
  4. airbyte_cdk/manifest_server/api_models/stream.py +2 -2
  5. airbyte_cdk/manifest_server/helpers/__init__.py +0 -0
  6. airbyte_cdk/manifest_server/helpers/tracing.py +36 -0
  7. airbyte_cdk/manifest_server/routers/manifest.py +37 -1
  8. airbyte_cdk/sources/declarative/concurrent_declarative_source.py +2 -2
  9. airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py +57 -7
  10. airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py +4 -2
  11. airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py +229 -281
  12. airbyte_cdk/sources/declarative/partition_routers/cartesian_product_stream_slicer.py +0 -6
  13. airbyte_cdk/sources/declarative/partition_routers/grouping_partition_router.py +0 -5
  14. airbyte_cdk/sources/declarative/partition_routers/list_partition_router.py +0 -6
  15. airbyte_cdk/sources/declarative/partition_routers/partition_router.py +1 -23
  16. airbyte_cdk/sources/declarative/partition_routers/single_partition_router.py +0 -6
  17. airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py +88 -107
  18. airbyte_cdk/sources/declarative/requesters/request_options/per_partition_request_option_provider.py +95 -0
  19. airbyte_cdk/sources/declarative/resolvers/http_components_resolver.py +4 -1
  20. airbyte_cdk/sources/declarative/retrievers/retriever.py +5 -0
  21. airbyte_cdk/sources/message/repository.py +20 -0
  22. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/METADATA +1 -1
  23. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/RECORD +28 -25
  24. /airbyte_cdk/manifest_server/{auth.py → helpers/auth.py} +0 -0
  25. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/LICENSE.txt +0 -0
  26. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/LICENSE_SHORT +0 -0
  27. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/WHEEL +0 -0
  28. {airbyte_cdk-6.61.6.dist-info → airbyte_cdk-6.61.6.post3.dev17473738577.dist-info}/entry_points.txt +0 -0
@@ -146,8 +146,10 @@ class PerPartitionCursor(DeclarativeCursor):
146
146
  if "state" in stream_state:
147
147
  self._state_to_migrate_from = stream_state["state"]
148
148
 
149
- # Set parent state for partition routers based on parent streams
150
- self._partition_router.set_initial_state(stream_state)
149
+ # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
150
+ # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
151
+ # We are still keeping this line as a comment to be explicit about the past behavior.
152
+ # self._partition_router.set_initial_state(stream_state)
151
153
 
152
154
  def observe(self, stream_slice: StreamSlice, record: Record) -> None:
153
155
  self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe(
@@ -12,6 +12,7 @@ from .manifest import (
12
12
  DiscoverResponse,
13
13
  FullResolveRequest,
14
14
  ManifestResponse,
15
+ RequestContext,
15
16
  ResolveRequest,
16
17
  StreamTestReadRequest,
17
18
  )
@@ -30,6 +31,7 @@ __all__ = [
30
31
  "ConnectorConfig",
31
32
  "Manifest",
32
33
  # Manifest request/response models
34
+ "RequestContext",
33
35
  "FullResolveRequest",
34
36
  "ManifestResponse",
35
37
  "StreamTestReadRequest",
@@ -13,6 +13,13 @@ from pydantic import BaseModel, Field
13
13
  from .dicts import ConnectorConfig, Manifest
14
14
 
15
15
 
16
+ class RequestContext(BaseModel):
17
+ """Optional context information for tracing and observability."""
18
+
19
+ workspace_id: Optional[str] = None
20
+ project_id: Optional[str] = None
21
+
22
+
16
23
  class StreamTestReadRequest(BaseModel):
17
24
  """Request to test read from a specific stream."""
18
25
 
@@ -24,6 +31,7 @@ class StreamTestReadRequest(BaseModel):
24
31
  record_limit: int = Field(default=100, ge=1, le=5000)
25
32
  page_limit: int = Field(default=5, ge=1, le=20)
26
33
  slice_limit: int = Field(default=5, ge=1, le=20)
34
+ context: Optional[RequestContext] = None
27
35
 
28
36
 
29
37
  class CheckRequest(BaseModel):
@@ -31,6 +39,7 @@ class CheckRequest(BaseModel):
31
39
 
32
40
  manifest: Manifest
33
41
  config: ConnectorConfig
42
+ context: Optional[RequestContext] = None
34
43
 
35
44
 
36
45
  class CheckResponse(BaseModel):
@@ -45,6 +54,7 @@ class DiscoverRequest(BaseModel):
45
54
 
46
55
  manifest: Manifest
47
56
  config: ConnectorConfig
57
+ context: Optional[RequestContext] = None
48
58
 
49
59
 
50
60
  class DiscoverResponse(BaseModel):
@@ -57,6 +67,7 @@ class ResolveRequest(BaseModel):
57
67
  """Request to resolve a manifest."""
58
68
 
59
69
  manifest: Manifest
70
+ context: Optional[RequestContext] = None
60
71
 
61
72
 
62
73
  class ManifestResponse(BaseModel):
@@ -71,3 +82,4 @@ class FullResolveRequest(BaseModel):
71
82
  manifest: Manifest
72
83
  config: ConnectorConfig
73
84
  stream_limit: int = Field(default=100, ge=1, le=100)
85
+ context: Optional[RequestContext] = None
@@ -6,7 +6,7 @@ They accurately reflect the runtime types returned by the CDK, particularly
6
6
  fixing type mismatches like slice_descriptor being a string rather than an object.
7
7
  """
8
8
 
9
- from typing import Any, Dict, List, Optional
9
+ from typing import Any, Dict, List, Optional, Union
10
10
 
11
11
  from pydantic import BaseModel
12
12
 
@@ -59,7 +59,7 @@ class StreamReadSlices(BaseModel):
59
59
  """Slices of data read from a stream."""
60
60
 
61
61
  pages: List[StreamReadPages]
62
- slice_descriptor: Optional[str] # This is actually a string at runtime, not Dict[str, Any]
62
+ slice_descriptor: Optional[Union[Dict[str, Any], str]] # We're seeing strings at runtime
63
63
  state: Optional[List[Dict[str, Any]]] = None
64
64
  auxiliary_requests: Optional[List[AuxiliaryRequest]] = None
65
65
 
File without changes
@@ -0,0 +1,36 @@
1
+ import logging
2
+ from typing import Optional
3
+
4
+ import ddtrace
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ def apply_trace_tags_from_context(
10
+ workspace_id: Optional[str] = None,
11
+ project_id: Optional[str] = None,
12
+ ) -> None:
13
+ """Apply trace tags from context to the current span."""
14
+ if not workspace_id and not project_id:
15
+ return
16
+
17
+ # Log the trace IDs for observability
18
+ log_parts = []
19
+ if workspace_id:
20
+ log_parts.append(f"workspace_id={workspace_id}")
21
+ if project_id:
22
+ log_parts.append(f"project_id={project_id}")
23
+
24
+ if log_parts:
25
+ logger.info(f"Processing request with trace tags: {', '.join(log_parts)}")
26
+
27
+ try:
28
+ span = ddtrace.tracer.current_span()
29
+ if span:
30
+ if workspace_id:
31
+ span.set_tag("workspace_id", workspace_id)
32
+ if project_id:
33
+ span.set_tag("project_id", project_id)
34
+ except Exception:
35
+ # Silently ignore any ddtrace-related errors (e.g. if ddtrace.auto wasn't run)
36
+ pass
@@ -27,9 +27,10 @@ from ..api_models import (
27
27
  StreamReadResponse,
28
28
  StreamTestReadRequest,
29
29
  )
30
- from ..auth import verify_jwt_token
31
30
  from ..command_processor.processor import ManifestCommandProcessor
32
31
  from ..command_processor.utils import build_catalog, build_source
32
+ from ..helpers.auth import verify_jwt_token
33
+ from ..helpers.tracing import apply_trace_tags_from_context
33
34
 
34
35
 
35
36
  def safe_build_source(
@@ -68,6 +69,13 @@ def test_read(request: StreamTestReadRequest) -> StreamReadResponse:
68
69
  """
69
70
  Test reading from a specific stream in the manifest.
70
71
  """
72
+ # Apply trace tags from context if provided
73
+ if request.context:
74
+ apply_trace_tags_from_context(
75
+ workspace_id=request.context.workspace_id,
76
+ project_id=request.context.project_id,
77
+ )
78
+
71
79
  config_dict = request.config.model_dump()
72
80
 
73
81
  catalog = build_catalog(request.stream_name)
@@ -104,6 +112,13 @@ def test_read(request: StreamTestReadRequest) -> StreamReadResponse:
104
112
  @router.post("/check", operation_id="check")
105
113
  def check(request: CheckRequest) -> CheckResponse:
106
114
  """Check configuration against a manifest"""
115
+ # Apply trace tags from context if provided
116
+ if request.context:
117
+ apply_trace_tags_from_context(
118
+ workspace_id=request.context.workspace_id,
119
+ project_id=request.context.project_id,
120
+ )
121
+
107
122
  source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
108
123
  runner = ManifestCommandProcessor(source)
109
124
  success, message = runner.check_connection(request.config.model_dump())
@@ -113,6 +128,13 @@ def check(request: CheckRequest) -> CheckResponse:
113
128
  @router.post("/discover", operation_id="discover")
114
129
  def discover(request: DiscoverRequest) -> DiscoverResponse:
115
130
  """Discover streams from a manifest"""
131
+ # Apply trace tags from context if provided
132
+ if request.context:
133
+ apply_trace_tags_from_context(
134
+ workspace_id=request.context.workspace_id,
135
+ project_id=request.context.project_id,
136
+ )
137
+
116
138
  source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
117
139
  runner = ManifestCommandProcessor(source)
118
140
  catalog = runner.discover(request.config.model_dump())
@@ -124,6 +146,13 @@ def discover(request: DiscoverRequest) -> DiscoverResponse:
124
146
  @router.post("/resolve", operation_id="resolve")
125
147
  def resolve(request: ResolveRequest) -> ManifestResponse:
126
148
  """Resolve a manifest to its final configuration."""
149
+ # Apply trace tags from context if provided
150
+ if request.context:
151
+ apply_trace_tags_from_context(
152
+ workspace_id=request.context.workspace_id,
153
+ project_id=request.context.project_id,
154
+ )
155
+
127
156
  source = safe_build_source(request.manifest.model_dump(), {})
128
157
  return ManifestResponse(manifest=Manifest(**source.resolved_manifest))
129
158
 
@@ -135,6 +164,13 @@ def full_resolve(request: FullResolveRequest) -> ManifestResponse:
135
164
 
136
165
  This operation is similar to resolve, but has an extra step that generates streams from dynamic stream templates if the manifest contains any. It is used when a user clicks the generate streams button on a stream template in the Builder UI.
137
166
  """
167
+ # Apply trace tags from context if provided
168
+ if request.context:
169
+ apply_trace_tags_from_context(
170
+ workspace_id=request.context.workspace_id,
171
+ project_id=request.context.project_id,
172
+ )
173
+
138
174
  source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
139
175
  manifest = {**source.resolved_manifest}
140
176
  streams = manifest.get("streams", [])
@@ -704,7 +704,7 @@ class ConcurrentDeclarativeSource(AbstractSource):
704
704
  stream_slicer=declarative_stream.retriever.stream_slicer,
705
705
  slice_limit=self._limits.max_slices
706
706
  if self._limits
707
- else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
707
+ else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
708
708
  )
709
709
  else:
710
710
  if (
@@ -773,7 +773,7 @@ class ConcurrentDeclarativeSource(AbstractSource):
773
773
  declarative_stream.retriever.stream_slicer,
774
774
  slice_limit=self._limits.max_slices
775
775
  if self._limits
776
- else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
776
+ else None, # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
777
777
  )
778
778
 
779
779
  final_state_cursor = FinalStateCursor(
@@ -11,6 +11,13 @@ from copy import deepcopy
11
11
  from datetime import timedelta
12
12
  from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional
13
13
 
14
+ from airbyte_cdk.models import (
15
+ AirbyteStateBlob,
16
+ AirbyteStateMessage,
17
+ AirbyteStateType,
18
+ AirbyteStreamState,
19
+ StreamDescriptor,
20
+ )
14
21
  from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
15
22
  from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
16
23
  Timer,
@@ -48,7 +55,7 @@ class ConcurrentPerPartitionCursor(Cursor):
48
55
  Manages state per partition when a stream has many partitions, preventing data loss or duplication.
49
56
 
50
57
  Attributes:
51
- DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000).
58
+ DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000). This limit needs to be higher than the number of threads we might enqueue (which is represented by ThreadPoolManager.DEFAULT_MAX_QUEUE_SIZE). Otherwise, partitions that have been generated and submitted to the ThreadPool could be deleted from the ConcurrentPerPartitionCursor, and closing them would then raise a KeyError.
52
59
 
53
60
  - **Partition Limitation Logic**
54
61
  Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. Oldest partitions are removed when the limit is reached.
@@ -128,6 +135,7 @@ class ConcurrentPerPartitionCursor(Cursor):
128
135
 
129
136
  # FIXME this is a temporary field for the duration of the migration from declarative cursors to concurrent ones
130
137
  self._attempt_to_create_cursor_if_not_provided = attempt_to_create_cursor_if_not_provided
138
+ self._synced_some_data = False
131
139
 
132
140
  @property
133
141
  def cursor_field(self) -> CursorField:
@@ -168,8 +176,8 @@ class ConcurrentPerPartitionCursor(Cursor):
168
176
  with self._lock:
169
177
  self._semaphore_per_partition[partition_key].acquire()
170
178
  if not self._use_global_cursor:
171
- self._cursor_per_partition[partition_key].close_partition(partition=partition)
172
179
  cursor = self._cursor_per_partition[partition_key]
180
+ cursor.close_partition(partition=partition)
173
181
  if (
174
182
  partition_key in self._partitions_done_generating_stream_slices
175
183
  and self._semaphore_per_partition[partition_key]._value == 0
@@ -213,8 +221,10 @@ class ConcurrentPerPartitionCursor(Cursor):
213
221
  if not any(
214
222
  semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
215
223
  ):
216
- self._global_cursor = self._new_global_cursor
217
- self._lookback_window = self._timer.finish()
224
+ if self._synced_some_data:
225
+ # we only update those if we actually synced some data
226
+ self._global_cursor = self._new_global_cursor
227
+ self._lookback_window = self._timer.finish()
218
228
  self._parent_state = self._partition_router.get_stream_state()
219
229
  self._emit_state_message(throttle=False)
220
230
 
@@ -422,9 +432,6 @@ class ConcurrentPerPartitionCursor(Cursor):
422
432
  if stream_state.get("parent_state"):
423
433
  self._parent_state = stream_state["parent_state"]
424
434
 
425
- # Set parent state for partition routers based on parent streams
426
- self._partition_router.set_initial_state(stream_state)
427
-
428
435
  def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
429
436
  """
430
437
  Initializes the global cursor state from the provided stream state.
@@ -458,6 +465,7 @@ class ConcurrentPerPartitionCursor(Cursor):
458
465
  except ValueError:
459
466
  return
460
467
 
468
+ self._synced_some_data = True
461
469
  record_cursor = self._connector_state_converter.output_format(
462
470
  self._connector_state_converter.parse_value(record_cursor_value)
463
471
  )
@@ -541,3 +549,45 @@ class ConcurrentPerPartitionCursor(Cursor):
541
549
 
542
550
  def limit_reached(self) -> bool:
543
551
  return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
552
+
553
+ @staticmethod
554
+ def get_parent_state(
555
+ stream_state: Optional[StreamState], parent_stream_name: str
556
+ ) -> Optional[AirbyteStateMessage]:
557
+ if not stream_state:
558
+ return None
559
+
560
+ if "parent_state" not in stream_state:
561
+ logger.warning(
562
+ f"Trying to get_parent_state for stream `{parent_stream_name}` when there are not parent state in the state"
563
+ )
564
+ return None
565
+ elif parent_stream_name not in stream_state["parent_state"]:
566
+ logger.info(
567
+ f"Could not find parent state for stream `{parent_stream_name}`. On parents available are {list(stream_state['parent_state'].keys())}"
568
+ )
569
+ return None
570
+
571
+ return AirbyteStateMessage(
572
+ type=AirbyteStateType.STREAM,
573
+ stream=AirbyteStreamState(
574
+ stream_descriptor=StreamDescriptor(parent_stream_name, None),
575
+ stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]),
576
+ ),
577
+ )
578
+
579
+ @staticmethod
580
+ def get_global_state(
581
+ stream_state: Optional[StreamState], parent_stream_name: str
582
+ ) -> Optional[AirbyteStateMessage]:
583
+ return (
584
+ AirbyteStateMessage(
585
+ type=AirbyteStateType.STREAM,
586
+ stream=AirbyteStreamState(
587
+ stream_descriptor=StreamDescriptor(parent_stream_name, None),
588
+ stream_state=AirbyteStateBlob(stream_state["state"]),
589
+ ),
590
+ )
591
+ if stream_state and "state" in stream_state
592
+ else None
593
+ )
@@ -192,8 +192,10 @@ class GlobalSubstreamCursor(DeclarativeCursor):
192
192
  # Example: {"global_state_format_key": "global_state_format_value"}
193
193
  self._stream_cursor.set_initial_state(stream_state)
194
194
 
195
- # Set parent state for partition routers based on parent streams
196
- self._partition_router.set_initial_state(stream_state)
195
+ # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
196
+ # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
197
+ # We are still keeping this line as a comment to be explicit about the past behavior.
198
+ # self._partition_router.set_initial_state(stream_state)
197
199
 
198
200
  def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None:
199
201
  """