PyPI - airbyte-cdk - Versions diffs - 6.61.6__py3-none-any.whl → 6.61.6.post3.dev17473738577__py3-none-any.whl - Mend

airbyte-cdk 6.61.6__py3-none-any.whl → 6.61.6.post3.dev17473738577__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py CHANGED Viewed

@@ -146,8 +146,10 @@ class PerPartitionCursor(DeclarativeCursor):
             if "state" in stream_state:
                 self._state_to_migrate_from = stream_state["state"]
-        # Set parent state for partition routers based on parent streams
-        self._partition_router.set_initial_state(stream_state)
+        # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
+        # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
+        # We are still keeping this line as a comment to be explicit about the past behavior.
+        # self._partition_router.set_initial_state(stream_state)
     def observe(self, stream_slice: StreamSlice, record: Record) -> None:
         self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe(

airbyte_cdk/manifest_server/api_models/__init__.py CHANGED Viewed

@@ -12,6 +12,7 @@ from .manifest import (
     DiscoverResponse,
     FullResolveRequest,
     ManifestResponse,
+    RequestContext,
     ResolveRequest,
     StreamTestReadRequest,
 )
@@ -30,6 +31,7 @@ __all__ = [
     "ConnectorConfig",
     "Manifest",
     # Manifest request/response models
+    "RequestContext",
     "FullResolveRequest",
     "ManifestResponse",
     "StreamTestReadRequest",

airbyte_cdk/manifest_server/api_models/manifest.py CHANGED Viewed

@@ -13,6 +13,13 @@ from pydantic import BaseModel, Field
 from .dicts import ConnectorConfig, Manifest
+class RequestContext(BaseModel):
+    """Optional context information for tracing and observability."""
+    workspace_id: Optional[str] = None
+    project_id: Optional[str] = None
 class StreamTestReadRequest(BaseModel):
     """Request to test read from a specific stream."""
@@ -24,6 +31,7 @@ class StreamTestReadRequest(BaseModel):
     record_limit: int = Field(default=100, ge=1, le=5000)
     page_limit: int = Field(default=5, ge=1, le=20)
     slice_limit: int = Field(default=5, ge=1, le=20)
+    context: Optional[RequestContext] = None
 class CheckRequest(BaseModel):
@@ -31,6 +39,7 @@ class CheckRequest(BaseModel):
     manifest: Manifest
     config: ConnectorConfig
+    context: Optional[RequestContext] = None
 class CheckResponse(BaseModel):
@@ -45,6 +54,7 @@ class DiscoverRequest(BaseModel):
     manifest: Manifest
     config: ConnectorConfig
+    context: Optional[RequestContext] = None
 class DiscoverResponse(BaseModel):
@@ -57,6 +67,7 @@ class ResolveRequest(BaseModel):
     """Request to resolve a manifest."""
     manifest: Manifest
+    context: Optional[RequestContext] = None
 class ManifestResponse(BaseModel):
@@ -71,3 +82,4 @@ class FullResolveRequest(BaseModel):
     manifest: Manifest
     config: ConnectorConfig
     stream_limit: int = Field(default=100, ge=1, le=100)
+    context: Optional[RequestContext] = None

airbyte_cdk/manifest_server/api_models/stream.py CHANGED Viewed

@@ -6,7 +6,7 @@ They accurately reflect the runtime types returned by the CDK, particularly
 fixing type mismatches like slice_descriptor being a string rather than an object.
 """
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Union
 from pydantic import BaseModel
@@ -59,7 +59,7 @@ class StreamReadSlices(BaseModel):
     """Slices of data read from a stream."""
     pages: List[StreamReadPages]
-    slice_descriptor: Optional[str]  # This is actually a string at runtime, not Dict[str, Any]
+    slice_descriptor: Optional[Union[Dict[str, Any], str]]  # We're seeing strings at runtime
     state: Optional[List[Dict[str, Any]]] = None
     auxiliary_requests: Optional[List[AuxiliaryRequest]] = None

airbyte_cdk/manifest_server/helpers/__init__.py ADDED Viewed

File without changes

airbyte_cdk/manifest_server/helpers/tracing.py ADDED Viewed

@@ -0,0 +1,36 @@
+import logging
+from typing import Optional
+import ddtrace
+logger = logging.getLogger(__name__)
+def apply_trace_tags_from_context(
+    workspace_id: Optional[str] = None,
+    project_id: Optional[str] = None,
+) -> None:
+    """Apply trace tags from context to the current span."""
+    if not workspace_id and not project_id:
+        return
+    # Log the trace IDs for observability
+    log_parts = []
+    if workspace_id:
+        log_parts.append(f"workspace_id={workspace_id}")
+    if project_id:
+        log_parts.append(f"project_id={project_id}")
+    if log_parts:
+        logger.info(f"Processing request with trace tags: {', '.join(log_parts)}")
+    try:
+        span = ddtrace.tracer.current_span()
+        if span:
+            if workspace_id:
+                span.set_tag("workspace_id", workspace_id)
+            if project_id:
+                span.set_tag("project_id", project_id)
+    except Exception:
+        # Silently ignore any ddtrace-related errors (e.g. if ddtrace.auto wasn't run)
+        pass

airbyte_cdk/manifest_server/routers/manifest.py CHANGED Viewed

@@ -27,9 +27,10 @@ from ..api_models import (
     StreamReadResponse,
     StreamTestReadRequest,
 )
-from ..auth import verify_jwt_token
 from ..command_processor.processor import ManifestCommandProcessor
 from ..command_processor.utils import build_catalog, build_source
+from ..helpers.auth import verify_jwt_token
+from ..helpers.tracing import apply_trace_tags_from_context
 def safe_build_source(
@@ -68,6 +69,13 @@ def test_read(request: StreamTestReadRequest) -> StreamReadResponse:
     """
     Test reading from a specific stream in the manifest.
     """
+    # Apply trace tags from context if provided
+    if request.context:
+        apply_trace_tags_from_context(
+            workspace_id=request.context.workspace_id,
+            project_id=request.context.project_id,
+        )
     config_dict = request.config.model_dump()
     catalog = build_catalog(request.stream_name)
@@ -104,6 +112,13 @@ def test_read(request: StreamTestReadRequest) -> StreamReadResponse:
 @router.post("/check", operation_id="check")
 def check(request: CheckRequest) -> CheckResponse:
     """Check configuration against a manifest"""
+    # Apply trace tags from context if provided
+    if request.context:
+        apply_trace_tags_from_context(
+            workspace_id=request.context.workspace_id,
+            project_id=request.context.project_id,
+        )
     source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
     runner = ManifestCommandProcessor(source)
     success, message = runner.check_connection(request.config.model_dump())
@@ -113,6 +128,13 @@ def check(request: CheckRequest) -> CheckResponse:
 @router.post("/discover", operation_id="discover")
 def discover(request: DiscoverRequest) -> DiscoverResponse:
     """Discover streams from a manifest"""
+    # Apply trace tags from context if provided
+    if request.context:
+        apply_trace_tags_from_context(
+            workspace_id=request.context.workspace_id,
+            project_id=request.context.project_id,
+        )
     source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
     runner = ManifestCommandProcessor(source)
     catalog = runner.discover(request.config.model_dump())
@@ -124,6 +146,13 @@ def discover(request: DiscoverRequest) -> DiscoverResponse:
 @router.post("/resolve", operation_id="resolve")
 def resolve(request: ResolveRequest) -> ManifestResponse:
     """Resolve a manifest to its final configuration."""
+    # Apply trace tags from context if provided
+    if request.context:
+        apply_trace_tags_from_context(
+            workspace_id=request.context.workspace_id,
+            project_id=request.context.project_id,
+        )
     source = safe_build_source(request.manifest.model_dump(), {})
     return ManifestResponse(manifest=Manifest(**source.resolved_manifest))
@@ -135,6 +164,13 @@ def full_resolve(request: FullResolveRequest) -> ManifestResponse:
     This is a similar operation to resolve, but has an extra step which generates streams from dynamic stream templates if the manifest contains any. This is used when a user clicks the generate streams button on a stream template in the Builder UI
     """
+    # Apply trace tags from context if provided
+    if request.context:
+        apply_trace_tags_from_context(
+            workspace_id=request.context.workspace_id,
+            project_id=request.context.project_id,
+        )
     source = safe_build_source(request.manifest.model_dump(), request.config.model_dump())
     manifest = {**source.resolved_manifest}
     streams = manifest.get("streams", [])

airbyte_cdk/sources/declarative/concurrent_declarative_source.py CHANGED Viewed

@@ -704,7 +704,7 @@ class ConcurrentDeclarativeSource(AbstractSource):
                             stream_slicer=declarative_stream.retriever.stream_slicer,
                             slice_limit=self._limits.max_slices
                             if self._limits
-                            else None,  # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
+                            else None,  # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
                         )
                     else:
                         if (
@@ -773,7 +773,7 @@ class ConcurrentDeclarativeSource(AbstractSource):
                         declarative_stream.retriever.stream_slicer,
                         slice_limit=self._limits.max_slices
                         if self._limits
-                        else None,  # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
+                        else None,  # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
                     )
                     final_state_cursor = FinalStateCursor(

airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py CHANGED Viewed

@@ -11,6 +11,13 @@ from copy import deepcopy
 from datetime import timedelta
 from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional
+from airbyte_cdk.models import (
+    AirbyteStateBlob,
+    AirbyteStateMessage,
+    AirbyteStateType,
+    AirbyteStreamState,
+    StreamDescriptor,
+)
 from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
 from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
     Timer,
@@ -48,7 +55,7 @@ class ConcurrentPerPartitionCursor(Cursor):
     Manages state per partition when a stream has many partitions, preventing data loss or duplication.
     Attributes:
-        DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000).
+        DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000). This limit needs to be higher than the number of threads we might enqueue (which is represented by ThreadPoolManager.DEFAULT_MAX_QUEUE_SIZE). If not, we could have partitions that have been generated and submitted to the ThreadPool but got deleted from the ConcurrentPerPartitionCursor and when closing them, it will generate KeyError.
     - **Partition Limitation Logic**
       Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. Oldest partitions are removed when the limit is reached.
@@ -128,6 +135,7 @@ class ConcurrentPerPartitionCursor(Cursor):
         # FIXME this is a temporary field the time of the migration from declarative cursors to concurrent ones
         self._attempt_to_create_cursor_if_not_provided = attempt_to_create_cursor_if_not_provided
+        self._synced_some_data = False
     @property
     def cursor_field(self) -> CursorField:
@@ -168,8 +176,8 @@ class ConcurrentPerPartitionCursor(Cursor):
         with self._lock:
             self._semaphore_per_partition[partition_key].acquire()
             if not self._use_global_cursor:
-                self._cursor_per_partition[partition_key].close_partition(partition=partition)
                 cursor = self._cursor_per_partition[partition_key]
+                cursor.close_partition(partition=partition)
                 if (
                     partition_key in self._partitions_done_generating_stream_slices
                     and self._semaphore_per_partition[partition_key]._value == 0
@@ -213,8 +221,10 @@ class ConcurrentPerPartitionCursor(Cursor):
         if not any(
             semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
         ):
-            self._global_cursor = self._new_global_cursor
-            self._lookback_window = self._timer.finish()
+            if self._synced_some_data:
+                # we only update those if we actually synced some data
+                self._global_cursor = self._new_global_cursor
+                self._lookback_window = self._timer.finish()
             self._parent_state = self._partition_router.get_stream_state()
         self._emit_state_message(throttle=False)
@@ -422,9 +432,6 @@ class ConcurrentPerPartitionCursor(Cursor):
         if stream_state.get("parent_state"):
             self._parent_state = stream_state["parent_state"]
-        # Set parent state for partition routers based on parent streams
-        self._partition_router.set_initial_state(stream_state)
     def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
         """
         Initializes the global cursor state from the provided stream state.
@@ -458,6 +465,7 @@ class ConcurrentPerPartitionCursor(Cursor):
         except ValueError:
             return
+        self._synced_some_data = True
         record_cursor = self._connector_state_converter.output_format(
             self._connector_state_converter.parse_value(record_cursor_value)
         )
@@ -541,3 +549,45 @@ class ConcurrentPerPartitionCursor(Cursor):
     def limit_reached(self) -> bool:
         return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
+    @staticmethod
+    def get_parent_state(
+        stream_state: Optional[StreamState], parent_stream_name: str
+    ) -> Optional[AirbyteStateMessage]:
+        if not stream_state:
+            return None
+        if "parent_state" not in stream_state:
+            logger.warning(
+                f"Trying to get_parent_state for stream `{parent_stream_name}` when there are not parent state in the state"
+            )
+            return None
+        elif parent_stream_name not in stream_state["parent_state"]:
+            logger.info(
+                f"Could not find parent state for stream `{parent_stream_name}`. On parents available are {list(stream_state['parent_state'].keys())}"
+            )
+            return None
+        return AirbyteStateMessage(
+            type=AirbyteStateType.STREAM,
+            stream=AirbyteStreamState(
+                stream_descriptor=StreamDescriptor(parent_stream_name, None),
+                stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]),
+            ),
+        )
+    @staticmethod
+    def get_global_state(
+        stream_state: Optional[StreamState], parent_stream_name: str
+    ) -> Optional[AirbyteStateMessage]:
+        return (
+            AirbyteStateMessage(
+                type=AirbyteStateType.STREAM,
+                stream=AirbyteStreamState(
+                    stream_descriptor=StreamDescriptor(parent_stream_name, None),
+                    stream_state=AirbyteStateBlob(stream_state["state"]),
+                ),
+            )
+            if stream_state and "state" in stream_state
+            else None
+        )

airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py CHANGED Viewed

@@ -192,8 +192,10 @@ class GlobalSubstreamCursor(DeclarativeCursor):
             # Example: {"global_state_format_key": "global_state_format_value"}
             self._stream_cursor.set_initial_state(stream_state)
-        # Set parent state for partition routers based on parent streams
-        self._partition_router.set_initial_state(stream_state)
+        # We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
+        # Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
+        # We are still keeping this line as a comment to be explicit about the past behavior.
+        # self._partition_router.set_initial_state(stream_state)
     def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None:
         """

airbyte-cdk 6.61.6__py3-none-any.whl → 6.61.6.post3.dev17473738577__py3-none-any.whl

airbyte-cdk 6.61.6__py3-none-any.whl → 6.61.6.post3.dev17473738577__py3-none-any.whl