data-designer-engine 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. data_designer/engine/__init__.py +2 -0
  2. data_designer/engine/_version.py +34 -0
  3. data_designer/engine/analysis/column_profilers/base.py +49 -0
  4. data_designer/engine/analysis/column_profilers/judge_score_profiler.py +153 -0
  5. data_designer/engine/analysis/column_profilers/registry.py +22 -0
  6. data_designer/engine/analysis/column_statistics.py +145 -0
  7. data_designer/engine/analysis/dataset_profiler.py +149 -0
  8. data_designer/engine/analysis/errors.py +9 -0
  9. data_designer/engine/analysis/utils/column_statistics_calculations.py +234 -0
  10. data_designer/engine/analysis/utils/judge_score_processing.py +132 -0
  11. data_designer/engine/column_generators/__init__.py +2 -0
  12. data_designer/engine/column_generators/generators/__init__.py +2 -0
  13. data_designer/engine/column_generators/generators/base.py +122 -0
  14. data_designer/engine/column_generators/generators/embedding.py +35 -0
  15. data_designer/engine/column_generators/generators/expression.py +55 -0
  16. data_designer/engine/column_generators/generators/llm_completion.py +116 -0
  17. data_designer/engine/column_generators/generators/samplers.py +69 -0
  18. data_designer/engine/column_generators/generators/seed_dataset.py +144 -0
  19. data_designer/engine/column_generators/generators/validation.py +140 -0
  20. data_designer/engine/column_generators/registry.py +60 -0
  21. data_designer/engine/column_generators/utils/errors.py +15 -0
  22. data_designer/engine/column_generators/utils/generator_classification.py +43 -0
  23. data_designer/engine/column_generators/utils/judge_score_factory.py +58 -0
  24. data_designer/engine/column_generators/utils/prompt_renderer.py +100 -0
  25. data_designer/engine/compiler.py +97 -0
  26. data_designer/engine/configurable_task.py +71 -0
  27. data_designer/engine/dataset_builders/artifact_storage.py +283 -0
  28. data_designer/engine/dataset_builders/column_wise_builder.py +354 -0
  29. data_designer/engine/dataset_builders/errors.py +15 -0
  30. data_designer/engine/dataset_builders/multi_column_configs.py +46 -0
  31. data_designer/engine/dataset_builders/utils/__init__.py +2 -0
  32. data_designer/engine/dataset_builders/utils/concurrency.py +212 -0
  33. data_designer/engine/dataset_builders/utils/config_compiler.py +62 -0
  34. data_designer/engine/dataset_builders/utils/dag.py +62 -0
  35. data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +200 -0
  36. data_designer/engine/dataset_builders/utils/errors.py +15 -0
  37. data_designer/engine/dataset_builders/utils/progress_tracker.py +122 -0
  38. data_designer/engine/errors.py +51 -0
  39. data_designer/engine/model_provider.py +77 -0
  40. data_designer/engine/models/__init__.py +2 -0
  41. data_designer/engine/models/errors.py +300 -0
  42. data_designer/engine/models/facade.py +284 -0
  43. data_designer/engine/models/factory.py +42 -0
  44. data_designer/engine/models/litellm_overrides.py +179 -0
  45. data_designer/engine/models/parsers/__init__.py +2 -0
  46. data_designer/engine/models/parsers/errors.py +34 -0
  47. data_designer/engine/models/parsers/parser.py +235 -0
  48. data_designer/engine/models/parsers/postprocessors.py +93 -0
  49. data_designer/engine/models/parsers/tag_parsers.py +62 -0
  50. data_designer/engine/models/parsers/types.py +84 -0
  51. data_designer/engine/models/recipes/base.py +81 -0
  52. data_designer/engine/models/recipes/response_recipes.py +293 -0
  53. data_designer/engine/models/registry.py +151 -0
  54. data_designer/engine/models/telemetry.py +362 -0
  55. data_designer/engine/models/usage.py +73 -0
  56. data_designer/engine/models/utils.py +101 -0
  57. data_designer/engine/processing/ginja/__init__.py +2 -0
  58. data_designer/engine/processing/ginja/ast.py +65 -0
  59. data_designer/engine/processing/ginja/environment.py +463 -0
  60. data_designer/engine/processing/ginja/exceptions.py +56 -0
  61. data_designer/engine/processing/ginja/record.py +32 -0
  62. data_designer/engine/processing/gsonschema/__init__.py +2 -0
  63. data_designer/engine/processing/gsonschema/exceptions.py +15 -0
  64. data_designer/engine/processing/gsonschema/schema_transformers.py +83 -0
  65. data_designer/engine/processing/gsonschema/types.py +10 -0
  66. data_designer/engine/processing/gsonschema/validators.py +202 -0
  67. data_designer/engine/processing/processors/base.py +13 -0
  68. data_designer/engine/processing/processors/drop_columns.py +42 -0
  69. data_designer/engine/processing/processors/registry.py +25 -0
  70. data_designer/engine/processing/processors/schema_transform.py +71 -0
  71. data_designer/engine/processing/utils.py +169 -0
  72. data_designer/engine/registry/base.py +99 -0
  73. data_designer/engine/registry/data_designer_registry.py +39 -0
  74. data_designer/engine/registry/errors.py +12 -0
  75. data_designer/engine/resources/managed_dataset_generator.py +39 -0
  76. data_designer/engine/resources/managed_dataset_repository.py +197 -0
  77. data_designer/engine/resources/managed_storage.py +65 -0
  78. data_designer/engine/resources/resource_provider.py +77 -0
  79. data_designer/engine/resources/seed_reader.py +154 -0
  80. data_designer/engine/sampling_gen/column.py +91 -0
  81. data_designer/engine/sampling_gen/constraints.py +100 -0
  82. data_designer/engine/sampling_gen/data_sources/base.py +217 -0
  83. data_designer/engine/sampling_gen/data_sources/errors.py +12 -0
  84. data_designer/engine/sampling_gen/data_sources/sources.py +347 -0
  85. data_designer/engine/sampling_gen/entities/__init__.py +2 -0
  86. data_designer/engine/sampling_gen/entities/assets/zip_area_code_map.parquet +0 -0
  87. data_designer/engine/sampling_gen/entities/dataset_based_person_fields.py +90 -0
  88. data_designer/engine/sampling_gen/entities/email_address_utils.py +171 -0
  89. data_designer/engine/sampling_gen/entities/errors.py +10 -0
  90. data_designer/engine/sampling_gen/entities/national_id_utils.py +102 -0
  91. data_designer/engine/sampling_gen/entities/person.py +144 -0
  92. data_designer/engine/sampling_gen/entities/phone_number.py +128 -0
  93. data_designer/engine/sampling_gen/errors.py +26 -0
  94. data_designer/engine/sampling_gen/generator.py +122 -0
  95. data_designer/engine/sampling_gen/jinja_utils.py +64 -0
  96. data_designer/engine/sampling_gen/people_gen.py +199 -0
  97. data_designer/engine/sampling_gen/person_constants.py +56 -0
  98. data_designer/engine/sampling_gen/schema.py +147 -0
  99. data_designer/engine/sampling_gen/schema_builder.py +61 -0
  100. data_designer/engine/sampling_gen/utils.py +46 -0
  101. data_designer/engine/secret_resolver.py +82 -0
  102. data_designer/engine/testing/__init__.py +12 -0
  103. data_designer/engine/testing/stubs.py +133 -0
  104. data_designer/engine/testing/utils.py +20 -0
  105. data_designer/engine/validation.py +367 -0
  106. data_designer/engine/validators/__init__.py +19 -0
  107. data_designer/engine/validators/base.py +38 -0
  108. data_designer/engine/validators/local_callable.py +39 -0
  109. data_designer/engine/validators/python.py +254 -0
  110. data_designer/engine/validators/remote.py +89 -0
  111. data_designer/engine/validators/sql.py +65 -0
  112. data_designer_engine-0.4.0.dist-info/METADATA +50 -0
  113. data_designer_engine-0.4.0.dist-info/RECORD +114 -0
  114. data_designer_engine-0.4.0.dist-info/WHEEL +4 -0
@@ -0,0 +1,362 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ """
5
+ Telemetry handler for NeMo products.
6
+
7
+ Environment variables:
8
+ - NEMO_TELEMETRY_ENABLED: Whether telemetry is enabled.
9
+ - NEMO_DEPLOYMENT_TYPE: The deployment type the event came from.
10
+ - NEMO_TELEMETRY_ENDPOINT: The endpoint to send the telemetry events to.
11
+ - NEMO_SESSION_PREFIX: Optional prefix to add to session IDs.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import asyncio
17
+ import os
18
+ import platform
19
+ from dataclasses import dataclass
20
+ from datetime import datetime, timezone
21
+ from enum import Enum
22
+ from typing import Any, ClassVar
23
+
24
+ from pydantic import BaseModel, Field
25
+
26
+ from data_designer.lazy_heavy_imports import httpx
27
+
28
# Master switch for telemetry; any of "1"/"true"/"yes" (case-insensitive) enables it.
TELEMETRY_ENABLED = os.getenv("NEMO_TELEMETRY_ENABLED", "true").lower() in ("1", "true", "yes")

# Fixed client identifier registered with the NVIDIA telemetry service.
CLIENT_ID = "184482118588404"

# Identifies the telemetry protocol implementation in outgoing payloads.
NEMO_TELEMETRY_VERSION = "nemo-telemetry/1.0"

# Maximum number of resend attempts for a queued event (see TelemetryHandler).
MAX_RETRIES = 3

# Endpoint to POST event batches to. NOTE: the value must NOT be lower-cased —
# per RFC 3986 only the URI scheme and host are case-insensitive, while the
# path is case-sensitive, so the previous `.lower()` call would corrupt a
# user-supplied endpoint containing a mixed-case path.
NEMO_TELEMETRY_ENDPOINT = os.getenv(
    "NEMO_TELEMETRY_ENDPOINT", "https://events.telemetry.data.nvidia.com/v1.1/events/json"
)

# Machine architecture (e.g. "x86_64", "arm64"), reported as-is in payloads.
CPU_ARCHITECTURE = platform.uname().machine

# Optional prefix prepended to session IDs (see TelemetryHandler.__init__).
SESSION_PREFIX = os.getenv("NEMO_SESSION_PREFIX")
37
+
38
+
39
class NemoSourceEnum(str, Enum):
    """The NeMo product that emitted a telemetry event (see InferenceEvent.nemo_source)."""

    INFERENCE = "inference"
    AUDITOR = "auditor"
    DATADESIGNER = "datadesigner"
    EVALUATOR = "evaluator"
    GUARDRAILS = "guardrails"
    UNDEFINED = "undefined"
46
+
47
+
48
class DeploymentTypeEnum(str, Enum):
    """How the emitting product is deployed; resolved from NEMO_DEPLOYMENT_TYPE at import time."""

    LIBRARY = "library"
    API = "api"
    UNDEFINED = "undefined"
52
+
53
+
54
# Resolve the deployment type once at import time from NEMO_DEPLOYMENT_TYPE
# (defaults to "library"); the comparison is case-insensitive.
_deployment_type_raw = os.getenv("NEMO_DEPLOYMENT_TYPE", "library").lower()
try:
    DEPLOYMENT_TYPE = DeploymentTypeEnum(_deployment_type_raw)
except ValueError:
    # Fail fast with the list of accepted values; `from None` suppresses the
    # less informative enum-lookup traceback.
    valid_values = [e.value for e in DeploymentTypeEnum]
    raise ValueError(
        f"Invalid NEMO_DEPLOYMENT_TYPE: {_deployment_type_raw!r}. Must be one of: {valid_values}"
    ) from None
62
+
63
+
64
class TaskStatusEnum(str, Enum):
    """Terminal status of the task that produced an event (see InferenceEvent.task_status)."""

    SUCCESS = "success"
    FAILURE = "failure"
    UNDEFINED = "undefined"
68
+
69
+
70
class TelemetryEvent(BaseModel):
    """Base class for all telemetry events.

    Every concrete subclass must declare its own ``_event_name`` class
    variable; this is enforced at class-creation time so that a misconfigured
    event type fails immediately rather than at send time.
    """

    _event_name: ClassVar[str]  # Subclasses must define this
    _schema_version: ClassVar[str] = "1.3"

    def __init_subclass__(cls, **kwargs: Any) -> None:
        super().__init_subclass__(**kwargs)
        # Inspect the subclass's own namespace (not inherited attributes) so
        # each subclass is forced to declare its own event name.
        if "_event_name" not in vars(cls):
            raise TypeError(f"{cls.__name__} must define '_event_name' class variable")
78
+
79
+
80
class InferenceEvent(TelemetryEvent):
    """Telemetry event describing a single model-inference task.

    Field aliases are the camelCase names required by the telemetry endpoint;
    ``populate_by_name`` additionally allows construction via the snake_case
    Python names.
    """

    _event_name: ClassVar[str] = "inference_event"

    nemo_source: NemoSourceEnum = Field(
        ...,
        alias="nemoSource",
        description="The NeMo product that created the event (i.e. data-designer).",
    )
    task: str = Field(
        ...,
        description="The type of task that was performed that generated the inference event (i.e. preview-job, batch-job).",
    )
    task_status: TaskStatusEnum = Field(
        ...,
        alias="taskStatus",
        description="The status of the task.",
    )
    deployment_type: DeploymentTypeEnum = Field(
        default=DEPLOYMENT_TYPE,
        alias="deploymentType",
        description="The deployment type the event came from.",
    )
    model: str = Field(
        ...,
        description="The name of the model that was used.",
    )
    model_group: str = Field(
        default="undefined",
        alias="modelGroup",
        description="An optional identifier to group models together.",
    )
    # The byte/token counters below default to -1 ("not available") and are
    # bounded to the signed 64-bit integer range expected by the endpoint.
    input_bytes: int = Field(
        default=-1,
        alias="inputBytes",
        description="Number of bytes provided as input to the model. -1 if not available.",
        ge=-9223372036854775808,
        le=9223372036854775807,
    )
    input_tokens: int = Field(
        default=-1,
        alias="inputTokens",
        description="Number of tokens provided as input to the model. -1 if not available.",
        ge=-9223372036854775808,
        le=9223372036854775807,
    )
    output_bytes: int = Field(
        default=-1,
        alias="outputBytes",
        description="Number of bytes returned by the model. -1 if not available.",
        ge=-9223372036854775808,
        le=9223372036854775807,
    )
    output_tokens: int = Field(
        default=-1,
        alias="outputTokens",
        description="Number of tokens returned by the model. -1 if not available.",
        ge=-9223372036854775808,
        le=9223372036854775807,
    )

    # Allow constructing by Python field name as well as by wire alias.
    model_config = {"populate_by_name": True}
141
+
142
+
143
@dataclass
class QueuedEvent:
    """A telemetry event waiting to be sent, with its enqueue time and retry bookkeeping."""

    event: TelemetryEvent
    timestamp: datetime  # when the event was enqueued (UTC); serialized as the event "ts"
    retry_count: int = 0  # number of failed send attempts so far (see _add_to_dlq)
148
+
149
+
150
def _get_iso_timestamp(dt: datetime | None = None) -> str:
    """Format *dt* (default: current UTC time) as ISO-8601 with millisecond precision and a 'Z' suffix."""
    moment = datetime.now(timezone.utc) if dt is None else dt
    # Truncate microseconds to milliseconds, zero-padded to three digits.
    millis = moment.microsecond // 1000
    return f"{moment:%Y-%m-%dT%H:%M:%S}.{millis:03d}Z"
154
+
155
+
156
def build_payload(
    events: list[QueuedEvent], *, source_client_version: str, session_id: str = "undefined"
) -> dict[str, Any]:
    """Assemble the JSON payload expected by the NVIDIA telemetry endpoint.

    Args:
        events: Queued events to serialize. Must be non-empty — the schema
            version is read from the first event.
        source_client_version: Version of the NeMo product sending the events.
        session_id: Identifier used to group events into a session.

    Returns:
        A JSON-serializable dict with the fixed envelope fields plus one
        entry per event under "events".
    """
    serialized_events = []
    for queued in events:
        serialized_events.append(
            {
                "ts": _get_iso_timestamp(queued.timestamp),
                "parameters": queued.event.model_dump(by_alias=True),
                "name": queued.event._event_name,
            }
        )
    payload: dict[str, Any] = {
        "browserType": "undefined",  # do not change
        "clientId": CLIENT_ID,
        "clientType": "Native",  # do not change
        "clientVariant": "Release",  # do not change
        "clientVer": source_client_version,
        "cpuArchitecture": CPU_ARCHITECTURE,
        "deviceGdprBehOptIn": "None",  # do not change
        "deviceGdprFuncOptIn": "None",  # do not change
        "deviceGdprTechOptIn": "None",  # do not change
        "deviceId": "undefined",  # do not change
        "deviceMake": "undefined",  # do not change
        "deviceModel": "undefined",  # do not change
        "deviceOS": "undefined",  # do not change
        "deviceOSVersion": "undefined",  # do not change
        "deviceType": "undefined",  # do not change
        "eventProtocol": "1.6",  # do not change
        "eventSchemaVer": events[0].event._schema_version,
        "eventSysVer": NEMO_TELEMETRY_VERSION,
        "externalUserId": "undefined",  # do not change
        "gdprBehOptIn": "None",  # do not change
        "gdprFuncOptIn": "None",  # do not change
        "gdprTechOptIn": "None",  # do not change
        "idpId": "undefined",  # do not change
        "integrationId": "undefined",  # do not change
        "productName": "undefined",  # do not change
        "productVersion": "undefined",  # do not change
        "sentTs": _get_iso_timestamp(),
        "sessionId": session_id,
        "userId": "undefined",  # do not change
        "events": serialized_events,
    }
    return payload
198
+
199
+
200
class TelemetryHandler:
    """
    Handles telemetry event batching, flushing, and retry logic for NeMo products.

    Events are accumulated via enqueue() and sent in batches, either when the
    background timer fires, when the queue reaches max_queue_size, or when a
    flush is explicitly requested. Failed sends go to a dead-letter queue and
    are retried on the next flush, up to max_retries times per event.

    Args:
        flush_interval_seconds (float): The interval in seconds to flush the events.
        max_queue_size (int): The maximum number of events to queue before flushing.
        max_retries (int): The maximum number of times to retry sending an event.
        source_client_version (str): The version of the source client. This should be the version of
            the actual NeMo product that is sending the events, typically the same as the version of
            a PyPi package that a user would install.
        session_id (str): An optional session ID to associate with the events.
            This should be a unique identifier for the session, such as a UUID.
            It is used to group events together.
    """

    def __init__(
        self,
        flush_interval_seconds: float = 120.0,
        max_queue_size: int = 50,
        max_retries: int = MAX_RETRIES,
        source_client_version: str = "undefined",
        session_id: str = "undefined",
    ):
        self._flush_interval = flush_interval_seconds
        self._max_queue_size = max_queue_size
        self._max_retries = max_retries
        self._events: list[QueuedEvent] = []
        self._dlq: list[QueuedEvent] = []  # Dead letter queue for retry
        self._flush_signal = asyncio.Event()
        self._timer_task: asyncio.Task | None = None
        self._running = False
        self._source_client_version = source_client_version
        # Apply session prefix if environment variable is set
        if SESSION_PREFIX:
            self._session_id = f"{SESSION_PREFIX}{session_id}"
        else:
            self._session_id = session_id

    async def astart(self) -> None:
        """Start the background flush timer. Idempotent if already running."""
        if self._running:
            return
        self._running = True
        self._timer_task = asyncio.create_task(self._timer_loop())

    async def astop(self) -> None:
        """Stop the timer task and perform a final flush of pending events."""
        self._running = False
        self._flush_signal.set()
        if self._timer_task:
            self._timer_task.cancel()
            try:
                await self._timer_task
            except asyncio.CancelledError:
                pass
            self._timer_task = None
        # Final flush so events enqueued after the last timer tick are not lost.
        await self._flush_events()

    async def aflush(self) -> None:
        """Request a flush; the actual send happens in the timer loop (not awaited here)."""
        self._flush_signal.set()

    def start(self) -> None:
        """Synchronous wrapper around astart()."""
        self._run_sync(self.astart())

    def stop(self) -> None:
        """Synchronous wrapper around astop()."""
        self._run_sync(self.astop())

    def flush(self) -> None:
        """Request a flush from synchronous code; the send happens in the timer loop."""
        self._flush_signal.set()

    def enqueue(self, event: TelemetryEvent) -> None:
        """Queue an event for sending; triggers a flush once the queue is full.

        No-op when telemetry is disabled or when given a non-TelemetryEvent.
        """
        if not TELEMETRY_ENABLED:
            return
        if not isinstance(event, TelemetryEvent):
            # Silently fail as we prioritize not disrupting upstream call sites and telemetry is best effort
            return
        queued = QueuedEvent(event=event, timestamp=datetime.now(timezone.utc))
        self._events.append(queued)
        if len(self._events) >= self._max_queue_size:
            self._flush_signal.set()

    def _run_sync(self, coro: Any) -> Any:
        """Run *coro* to completion from synchronous code.

        If called while an event loop is already running, the coroutine is
        executed via asyncio.run() on a worker thread to avoid re-entering
        the running loop; otherwise asyncio.run() is used directly.
        """
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = None

        if loop and loop.is_running():
            import concurrent.futures

            with concurrent.futures.ThreadPoolExecutor() as pool:
                future = pool.submit(asyncio.run, coro)
                return future.result()
        else:
            return asyncio.run(coro)

    def __enter__(self) -> TelemetryHandler:
        self.start()
        return self

    def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        self.stop()

    async def __aenter__(self) -> TelemetryHandler:
        await self.astart()
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        await self.astop()

    async def _timer_loop(self) -> None:
        """Flush on each interval tick or as soon as the flush signal is set."""
        while self._running:
            try:
                await asyncio.wait_for(
                    self._flush_signal.wait(),
                    timeout=self._flush_interval,
                )
            except asyncio.TimeoutError:
                # Interval elapsed without an explicit flush request; flush anyway.
                pass
            self._flush_signal.clear()
            await self._flush_events()

    async def _flush_events(self) -> None:
        """Send everything pending: previously failed (DLQ) events first, then new ones."""
        dlq_events, self._dlq = self._dlq, []
        new_events, self._events = self._events, []
        events_to_send = dlq_events + new_events
        if events_to_send:
            await self._send_events(events_to_send)

    async def _send_events(self, events: list[QueuedEvent]) -> None:
        """Send *events* using a short-lived HTTP client."""
        async with httpx.AsyncClient() as client:
            await self._send_events_with_client(client, events)

    async def _send_events_with_client(self, client: httpx.AsyncClient, events: list[QueuedEvent]) -> None:
        """POST one batch; on oversized payloads, bisect and retry recursively.

        Retryable failures (408, 5xx, transport errors) are routed to the DLQ.
        """
        if not events:
            return

        payload = build_payload(events, source_client_version=self._source_client_version, session_id=self._session_id)
        try:
            response = await client.post(NEMO_TELEMETRY_ENDPOINT, json=payload)
            # 2xx, 400, 422 are all considered complete (no retry)
            # 400/422 indicate bad payload which retrying won't fix
            if response.status_code in (400, 422) or response.is_success:
                return
            # 413 (payload too large) - split and retry
            if response.status_code == 413:
                if len(events) == 1:
                    # Can't split further, drop the event
                    return
                mid = len(events) // 2
                await self._send_events_with_client(client, events[:mid])
                await self._send_events_with_client(client, events[mid:])
                return
            if response.status_code == 408 or response.status_code >= 500:
                self._add_to_dlq(events)
        except httpx.HTTPError:
            self._add_to_dlq(events)

    def _add_to_dlq(self, events: list[QueuedEvent]) -> None:
        """Record a failed attempt for each event; drop events that exhausted max_retries."""
        for queued in events:
            queued.retry_count += 1
            if queued.retry_count > self._max_retries:
                continue
            self._dlq.append(queued)
@@ -0,0 +1,73 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+
8
+ from pydantic import BaseModel, computed_field
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class TokenUsageStats(BaseModel):
    """Running totals of prompt (input) and completion (output) tokens."""

    input_tokens: int = 0
    output_tokens: int = 0

    @computed_field
    def total_tokens(self) -> int:
        """Combined input and output token count (included in serialized output)."""
        return self.output_tokens + self.input_tokens

    @property
    def has_usage(self) -> bool:
        """Whether any tokens have been recorded so far."""
        return self.total_tokens > 0

    def extend(self, *, input_tokens: int, output_tokens: int) -> None:
        """Add the given token counts to the running totals."""
        self.input_tokens = self.input_tokens + input_tokens
        self.output_tokens = self.output_tokens + output_tokens
28
+
29
+
30
class RequestUsageStats(BaseModel):
    """Running counts of successful and failed model requests."""

    successful_requests: int = 0
    failed_requests: int = 0

    @computed_field
    def total_requests(self) -> int:
        """Combined successful and failed request count (included in serialized output)."""
        return self.failed_requests + self.successful_requests

    @property
    def has_usage(self) -> bool:
        """Whether any requests have been recorded so far."""
        return self.total_requests > 0

    def extend(self, *, successful_requests: int, failed_requests: int) -> None:
        """Add the given request counts to the running totals."""
        self.successful_requests = self.successful_requests + successful_requests
        self.failed_requests = self.failed_requests + failed_requests
45
+
46
+
47
class ModelUsageStats(BaseModel):
    """Aggregated token and request usage for a single model."""

    token_usage: TokenUsageStats = TokenUsageStats()
    request_usage: RequestUsageStats = RequestUsageStats()

    @property
    def has_usage(self) -> bool:
        """True only when both token usage AND request usage have been recorded."""
        return all((self.token_usage.has_usage, self.request_usage.has_usage))

    def extend(
        self, *, token_usage: TokenUsageStats | None = None, request_usage: RequestUsageStats | None = None
    ) -> None:
        """Merge the given usage deltas into this object; None arguments are skipped."""
        if token_usage is not None:
            self.token_usage.extend(
                input_tokens=token_usage.input_tokens,
                output_tokens=token_usage.output_tokens,
            )
        if request_usage is not None:
            self.request_usage.extend(
                successful_requests=request_usage.successful_requests,
                failed_requests=request_usage.failed_requests,
            )

    def get_usage_stats(self, *, total_time_elapsed: float) -> dict:
        """Return the serialized stats plus throughput rates over *total_time_elapsed* seconds.

        Rates are reported as 0 when no time has elapsed, avoiding division by zero.
        """
        if total_time_elapsed > 0:
            tokens_per_second = int(self.token_usage.total_tokens / total_time_elapsed)
            requests_per_minute = int(self.request_usage.total_requests / total_time_elapsed * 60)
        else:
            tokens_per_second = 0
            requests_per_minute = 0
        return self.model_dump() | {
            "tokens_per_second": tokens_per_second,
            "requests_per_minute": requests_per_minute,
        }
@@ -0,0 +1,101 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from dataclasses import dataclass, field
7
+ from typing import Any, Literal
8
+
9
+
10
@dataclass
class ChatMessage:
    """A single message in an LLM chat conversation.

    Covers user prompts, assistant responses, system instructions, and tool
    interactions.

    Attributes:
        role: Sender role — one of 'user', 'assistant', 'system', or 'tool'.
        content: Message body; either plain text or a list of content blocks
            for multimodal messages (e.g. text plus images).
        reasoning_content: Optional reasoning/thinking text from the assistant,
            typically from extended thinking or chain-of-thought models.
        tool_calls: Tool invocations requested by the assistant; each entry
            carries 'id', 'type', and 'function' keys.
        tool_call_id: Links a role='tool' message back to the tool call it
            answers; required for tool messages.
    """

    role: Literal["user", "assistant", "system", "tool"]
    content: str | list[dict[str, Any]] = ""
    reasoning_content: str | None = None
    tool_calls: list[dict[str, Any]] = field(default_factory=list)
    tool_call_id: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to the dict shape expected by chat-completion APIs.

        Optional fields are included only when truthy, keeping the payload
        minimal.
        """
        payload: dict[str, Any] = {"role": self.role, "content": self.content}
        optional_fields = (
            ("reasoning_content", self.reasoning_content),
            ("tool_calls", self.tool_calls),
            ("tool_call_id", self.tool_call_id),
        )
        for key, value in optional_fields:
            if value:
                payload[key] = value
        return payload

    @classmethod
    def as_user(cls, content: str | list[dict[str, Any]]) -> ChatMessage:
        """Build a user message."""
        return cls(role="user", content=content)

    @classmethod
    def as_assistant(
        cls,
        content: str = "",
        reasoning_content: str | None = None,
        tool_calls: list[dict[str, Any]] | None = None,
    ) -> ChatMessage:
        """Build an assistant message; a missing/empty tool_calls becomes a fresh empty list."""
        calls = tool_calls if tool_calls else []
        return cls(
            role="assistant",
            content=content,
            reasoning_content=reasoning_content,
            tool_calls=calls,
        )

    @classmethod
    def as_system(cls, content: str) -> ChatMessage:
        """Build a system message."""
        return cls(role="system", content=content)

    @classmethod
    def as_tool(cls, content: str, tool_call_id: str) -> ChatMessage:
        """Build a tool-response message tied to *tool_call_id*."""
        return cls(role="tool", content=content, tool_call_id=tool_call_id)
81
+
82
+
83
def prompt_to_messages(
    *,
    user_prompt: str,
    system_prompt: str | None = None,
    multi_modal_context: list[dict[str, Any]] | None = None,
) -> list[ChatMessage]:
    """Build the ChatMessage list for a single-turn prompt.

    Args:
        user_prompt (str): A user prompt.
        system_prompt (str, optional): An optional system prompt, prepended
            as a system message when provided.
        multi_modal_context (list, optional): Content blocks (e.g. images)
            placed before the text block inside the user message.

    Returns:
        One or two messages: an optional system message followed by the
        user message.
    """
    if multi_modal_context:
        content: str | list[dict[str, Any]] = [*multi_modal_context, {"type": "text", "text": user_prompt}]
    else:
        content = user_prompt

    messages = [ChatMessage.as_user(content)]
    if system_prompt:
        messages.insert(0, ChatMessage.as_system(system_prompt))
    return messages
@@ -0,0 +1,2 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,65 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from __future__ import annotations
5
+
6
+ from collections import deque
7
+
8
+ from jinja2 import nodes as j_nodes
9
+
10
+
11
def ast_max_depth(node: j_nodes.Node) -> int:
    """Calculate the depth of a Jinja AST from a given node.

    Args:
        node (jinja2.nodes.Node): The starting Jinja2 AST node

    Returns:
        int: The maximum depth of the tree (the starting node alone is depth 1)
    """
    deepest = 0
    # Depth-first traversal with an explicit stack of (node, depth) pairs;
    # only the maximum depth matters, so visit order is irrelevant.
    stack = [(node, 1)]
    while stack:
        current, depth = stack.pop()
        if depth > deepest:
            deepest = depth
        stack.extend((child, depth + 1) for child in current.iter_child_nodes())
    return deepest
35
+
36
+
37
def ast_descendant_count(ast: j_nodes.Node, only_type: type[j_nodes.Node] | None = None) -> int:
    """Count the number of nodes which descend from the given node.

    Args:
        ast (jinja2.nodes.Node): The starting Jinja2 AST node
        only_type (type[jinja2.nodes.Node] | None): If specified, then only
            nodes of this type will be counted.

    Returns:
        int: The number of nodes descended from the given node.
    """
    if only_type is None:
        # Counting every node type is the same as counting Node instances.
        only_type = j_nodes.Node
    # find_all yields matching descendants lazily; count them without
    # materializing an intermediate list.
    return sum(1 for _ in ast.find_all(only_type))
52
+
53
+
54
def ast_count_name_references(ast: j_nodes.Node, name: str) -> int:
    """Count the nodes descended from the given node that refer to *name*.

    Args:
        ast (jinja2.nodes.Node): The starting Jinja2 AST node
        name (str): The identifier to look for.

    Returns:
        int: The number of jinja2.nodes.Name descendants whose name
            field matches the given name.
    """
    # Count lazily instead of building a throwaway list of matching names.
    return sum(1 for node in ast.find_all(j_nodes.Name) if node.name == name)