PyPI - synth-ai - Versions diffs - 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl - Mend

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (291)

synth_ai/task/config.py ADDED Viewed

@@ -0,0 +1,257 @@
+"""Configuration dataclasses for task app CLI commands (eval, filter)."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Literal
+@dataclass(slots=True)
+class EvalConfig:
+    """Configuration for 'synth-ai eval' command.
+    Validates and provides defaults for evaluation runs against task apps.
+    """
+    # Required: Task app identifier
+    app_id: str
+    # Required: Model to evaluate
+    model: str
+    # Required: Seeds to run
+    seeds: list[int]
+    # Optional: Task app URL (None = spawn in-process)
+    task_app_url: str | None = None
+    # Optional: Data split to use
+    split: str = "train"
+    # Optional: Maximum turns/steps per episode
+    max_turns: int | None = None
+    # Optional: Maximum LLM calls per episode
+    max_llm_calls: int = 10
+    # Optional: Concurrency for parallel rollouts
+    concurrency: int = 1
+    # Optional: Environment name
+    env_name: str | None = None
+    # Optional: Policy name
+    policy_name: str | None = None
+    # Optional: Trace format ("compact", "full", "structured")
+    trace_format: Literal["compact", "full", "structured"] = "compact"
+    # Optional: Whether to return traces in response
+    return_trace: bool = False
+    # Optional: Operations sequence (if not provided, generates default)
+    ops: list[str] | None = None
+    # Optional: Environment config overrides
+    env_config: dict[str, Any] = field(default_factory=dict)
+    # Optional: Policy config overrides
+    policy_config: dict[str, Any] = field(default_factory=dict)
+    # Optional: Metadata for traces
+    metadata: dict[str, str] = field(default_factory=dict)
+    # Optional: SQL query for metadata filtering
+    metadata_sql: str | None = None
+    def __post_init__(self):
+        """Validate configuration after initialization."""
+        if not self.app_id:
+            raise ValueError("app_id is required")
+        if not self.model:
+            raise ValueError("model is required")
+        if not self.seeds:
+            raise ValueError("seeds list cannot be empty")
+        if not isinstance(self.seeds, list):
+            raise ValueError("seeds must be a list of integers")
+        if self.concurrency < 1:
+            raise ValueError("concurrency must be >= 1")
+        if self.max_llm_calls < 1:
+            raise ValueError("max_llm_calls must be >= 1")
+        if self.max_turns is not None and self.max_turns < 1:
+            raise ValueError("max_turns must be >= 1")
+        if self.trace_format not in ("compact", "full", "structured"):
+            raise ValueError(f"trace_format must be 'compact', 'full', or 'structured', got: {self.trace_format}")
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> EvalConfig:
+        """Create EvalConfig from a dictionary (e.g. from TOML).
+        Args:
+            data: Dictionary with eval configuration
+        Returns:
+            Validated EvalConfig instance
+        """
+        # Extract known fields
+        config_dict = {
+            "app_id": data.get("app_id"),
+            "model": data.get("model"),
+            "seeds": data.get("seeds", []),
+            "task_app_url": data.get("task_app_url"),
+            "split": data.get("split", "train"),
+            "max_turns": data.get("max_turns"),
+            "max_llm_calls": data.get("max_llm_calls", 10),
+            "concurrency": data.get("concurrency", 1),
+            "env_name": data.get("env_name"),
+            "policy_name": data.get("policy_name"),
+            "trace_format": data.get("trace_format", "compact"),
+            "return_trace": data.get("return_trace", False),
+            "ops": data.get("ops"),
+            "env_config": data.get("env_config", {}),
+            "policy_config": data.get("policy_config", {}),
+            "metadata": data.get("metadata", {}),
+            "metadata_sql": data.get("metadata_sql"),
+        }
+        return cls(**config_dict)
+@dataclass(slots=True)
+class FilterConfig:
+    """Configuration for 'synth-ai filter' command.
+    Validates and provides defaults for filtering traces into SFT datasets.
+    """
+    # Required: Database path or URL
+    db: str
+    # Required: Output JSONL path
+    output: str
+    # Optional: Filter by data splits
+    splits: list[str] = field(default_factory=list)
+    # Optional: Filter by task IDs
+    task_ids: list[str] = field(default_factory=list)
+    # Optional: Filter by models
+    models: list[str] = field(default_factory=list)
+    # Optional: Minimum official score threshold
+    min_official_score: float | None = None
+    # Optional: Maximum official score threshold
+    max_official_score: float | None = None
+    # Optional: Minimum judge scores (judge_name -> min_score)
+    min_judge_scores: dict[str, float] = field(default_factory=dict)
+    # Optional: Maximum judge scores (judge_name -> max_score)
+    max_judge_scores: dict[str, float] = field(default_factory=dict)
+    # Optional: Limit number of examples
+    limit: int | None = None
+    # Optional: Offset for pagination
+    offset: int | None = None
+    # Optional: Whether to shuffle results
+    shuffle: bool = False
+    # Optional: Random seed for shuffling
+    shuffle_seed: int | None = None
+    def __post_init__(self):
+        """Validate configuration after initialization."""
+        if not self.db:
+            raise ValueError("db (database path or URL) is required")
+        if not self.output:
+            raise ValueError("output (JSONL file path) is required")
+        # Validate output has .jsonl extension
+        output_path = Path(self.output)
+        if output_path.suffix.lower() not in (".jsonl", ".json"):
+            raise ValueError(f"output must be a .jsonl or .json file, got: {self.output}")
+        # Validate score thresholds
+        if self.min_official_score is not None and self.max_official_score is not None:
+            if self.min_official_score > self.max_official_score:
+                raise ValueError("min_official_score cannot be greater than max_official_score")
+        # Validate limit/offset
+        if self.limit is not None and self.limit < 1:
+            raise ValueError("limit must be >= 1")
+        if self.offset is not None and self.offset < 0:
+            raise ValueError("offset must be >= 0")
+        # Validate shuffle seed requires shuffle
+        if self.shuffle_seed is not None and not self.shuffle:
+            raise ValueError("shuffle_seed requires shuffle=true")
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> FilterConfig:
+        """Create FilterConfig from a dictionary (e.g. from TOML).
+        Args:
+            data: Dictionary with filter configuration
+        Returns:
+            Validated FilterConfig instance
+        """
+        # Extract known fields
+        config_dict = {
+            "db": data.get("db"),
+            "output": data.get("output"),
+            "splits": data.get("splits", []),
+            "task_ids": data.get("task_ids", []),
+            "models": data.get("models", []),
+            "min_official_score": data.get("min_official_score"),
+            "max_official_score": data.get("max_official_score"),
+            "min_judge_scores": data.get("min_judge_scores", {}),
+            "max_judge_scores": data.get("max_judge_scores", {}),
+            "limit": data.get("limit"),
+            "offset": data.get("offset"),
+            "shuffle": data.get("shuffle", False),
+            "shuffle_seed": data.get("shuffle_seed"),
+        }
+        return cls(**config_dict)
+    def get_db_url(self) -> str:
+        """Convert db path to proper SQLite URL if needed.
+        Returns:
+            Database URL suitable for SQLAlchemy/aiosqlite
+        """
+        db_value = self.db.strip()
+        if "://" in db_value:
+            return db_value
+        else:
+            db_path = Path(db_value).expanduser().resolve()
+            # Ensure parent directory exists
+            db_path.parent.mkdir(parents=True, exist_ok=True)
+            return f"sqlite+aiosqlite:///{db_path}"
+    def get_output_path(self) -> Path:
+        """Get resolved output path with parent directory created.
+        Returns:
+            Resolved Path object with parent directory created
+        """
+        output_path = Path(self.output).expanduser().resolve()
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        return output_path

synth_ai/task/contracts.py CHANGED Viewed

@@ -1,19 +1,25 @@
 from __future__ import annotations
 from dataclasses import dataclass
+from enum import Enum
 from typing import Any, Literal
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
+class RolloutMode(str, Enum):
+    """Mode controls how rollout infrastructure processes inference URLs."""
+    RL = "rl"
+    EVAL = "eval"
 @dataclass(frozen=True)
 class TaskAppEndpoints:
-    """Canonical Task App endpoint shapes used by RL trainers.
+    """Required Task App endpoints used by RL trainers and clients.
-    Task Apps run as lightweight HTTP services (often on Modal) that expose a
-    consistent set of endpoints for health, metadata, environment lifecycle,
-    rollouts, and optional proxy access to vendor models. The endpoint strings
-    defined here act as defaults and documentation for clients.
+    Task Apps run as lightweight HTTP services (often on Modal) that expose these
+    standard endpoints. Additional endpoints (proxies, debug routes) may be added
+    by individual task apps as needed.
     """
     root: str = "/"
@@ -21,28 +27,6 @@ class TaskAppEndpoints:
     info: str = "/info"
     task_info: str = "/task_info"
     rollout: str = "/rollout"
-    proxy_chat_completions: str = "/proxy/v1/chat/completions"
-    proxy_groq_chat_completions: str = "/proxy/groq/v1/chat/completions"
-    env_initialize: str = "/env/{env_name}/initialize"
-    env_step: str = "/env/{env_name}/step"
-    env_terminate: str = "/env/{env_name}/terminate"
-@dataclass(frozen=True)
-class TaskAppContract:
-    """Requirements and expectations for a Task App used by RL trainers.
-    - Auth: ENVIRONMENT_API_KEY must be set in the Task App environment; requests include X-API-Key.
-    - Health: /health returns 200 and JSON; may verify X-API-Key header.
-    - Env API: initialize/step/terminate are present for the target env (e.g., CrafterClassic).
-    - Rollout API: optional; provides a single-call rollout for convenience/testing.
-    - Inference routing: policy config passes an inference_url (Synth backend or OpenAI proxy).
-    - URL: base must be reachable via HTTPS and should be under .modal.run in production.
-    """
-    base_url: str
-    env_name: str | None = None
-    requires_api_key_header: bool = True
 # --- Unified rollout schema used by Task App services and SDK utilities ---
@@ -66,7 +50,7 @@ class RolloutRecordConfig(BaseModel):
     logprobs: bool = False
     value: bool = False
     return_trace: bool = False
-    trace_format: Literal["compact", "full"] = "compact"
+    trace_format: Literal["compact", "full", "structured"] = "compact"
 class RolloutSafetyConfig(BaseModel):
@@ -84,9 +68,16 @@ class RolloutRequest(BaseModel):
     safety: RolloutSafetyConfig = RolloutSafetyConfig()
     training_session_id: str | None = None
     synth_base_url: str | None = None
+    mode: RolloutMode  # Required: explicit RL vs EVAL mode
 class RolloutStep(BaseModel):
+    """Single step in a rollout trajectory.
+    DEPRECATED: This is part of the legacy trajectory format. New code should
+    consume v3 traces (RolloutResponse.trace) instead. See monorepo/trace_single_source.txt
+    for migration plan.
+    """
     obs: dict[str, Any]
     tool_calls: list[dict[str, Any]]
     reward: float | None = None
@@ -96,11 +87,40 @@ class RolloutStep(BaseModel):
 class RolloutTrajectory(BaseModel):
+    """Legacy trajectory format for rollout results.
+    DEPRECATED: This format duplicates data already present in v3 traces and will
+    be removed once training code migrates to consuming RolloutResponse.trace.
+    Current state:
+    - Task apps emit BOTH this format AND v3 traces (dual serialization)
+    - Training code (GSPO) reads from this format
+    - Eval/filter tools read from v3 traces
+    Migration plan:
+    - Phase 1: Training code learns to read from v3 traces (with fallback to this)
+    - Phase 2: Make this field optional once training is migrated
+    - Phase 3: Remove this field entirely and delete this class
+    See: monorepo/trace_single_source.txt for full migration plan and timeline.
+    Why v3 traces are better:
+    - Single source of truth (no duplication/drift)
+    - Richer data: token IDs, logprobs, reasoning, timing, images
+    - Built-in audit trail and replay capability
+    - Standard schema across all Synth AI tooling
+    """
     env_id: str
     policy_id: str
     steps: list[RolloutStep]
     final: dict[str, Any] | None = None
     length: int
+    # Required for trace correlation with inference mesh (optional initially for backward compat)
+    # See: monorepo/INFERENCE_URL_REQUIREMENT_PLAN.md and trace_creation_and_judgement.txt
+    inference_url: str
+    decision_samples: list[dict[str, Any]] | None = None
 class RolloutMetrics(BaseModel):
@@ -114,24 +134,103 @@ class RolloutMetrics(BaseModel):
 class RolloutResponse(BaseModel):
+    """Response from a rollout execution.
+    Contains both legacy trajectory format (for backward compatibility) and
+    modern v3 trace format (preferred going forward).
+    """
     run_id: str
+    # DEPRECATED: Legacy format maintained for training code compatibility.
+    # Will be removed once training migrates to reading from `trace` field.
+    # See: monorepo/trace_single_source.txt for migration plan.
     trajectories: list[RolloutTrajectory]
     branches: dict[str, list[str]] = Field(default_factory=dict)
     metrics: RolloutMetrics
     aborted: bool = False
     ops_executed: int = 0
+    # OPTIONAL: correlation ID for linking rollout to inference traces
+    # If not provided, trainer will infer it from trajectory.inference_url ?cid=... parameter
+    trace_correlation_id: str | None = None
+    # PREFERRED: v3 trace format (SessionTrace). This is the single source of truth
+    # for rollout data and should be used by all new code. Contains richer data than
+    # trajectories including token IDs, logprobs, timing, and multimodal content.
     trace: dict[str, Any] | None = None
+    pipeline_metadata: dict[str, Any] = Field(default_factory=dict)
+class _ExtraAllowModel(BaseModel):
+    """Base helper that preserves unknown keys while still exposing typed attributes."""
+    model_config = ConfigDict(extra="allow")
+class TaskDescriptor(_ExtraAllowModel):
+    """Human-readable task identifiers shown in UIs and logs."""
+    id: str
+    name: str
+    description: str | None = None
+    version: str | None = None
+class DatasetInfo(_ExtraAllowModel):
+    """Metadata about the prompt/task dataset powering the environment."""
+    id: str | None = None
+    name: str | None = None
+    version: str | None = None
+    splits: list[str] | None = None
+    default_split: str | None = None
+    description: str | None = None
+class RubricCriterion(_ExtraAllowModel):
+    id: str
+    description: str
+    weight: float | None = None
+class RubricSection(_ExtraAllowModel):
+    name: str
+    criteria: list[RubricCriterion] = Field(default_factory=list)
+class RubricInfo(_ExtraAllowModel):
+    """Outcome and event scoring definitions used by judges."""
+    outcome: RubricSection | None = None
+    events: RubricSection | None = None
+class InferenceInfo(_ExtraAllowModel):
+    """Recommended defaults for policy model routing."""
+    model: str | None = None
+    inference_url: str | None = None
+class LimitsInfo(_ExtraAllowModel):
+    """Operational limits the environment enforces."""
+    max_turns: int | None = None
+    max_response_tokens: int | None = None
+    timeout_seconds: int | None = None
-class TaskInfo(BaseModel):
+class TaskInfo(_ExtraAllowModel):
     """Static metadata describing the capabilities of a Task App task."""
-    task: dict[str, Any]
-    environments: list[str]
-    action_space: dict[str, Any]
-    observation: dict[str, Any]
-    dataset: dict[str, Any]
-    rubric: dict[str, Any]
-    inference: dict[str, Any]
-    capabilities: dict[str, Any]
-    limits: dict[str, Any]
+    task: TaskDescriptor
+    environment: str
+    dataset: DatasetInfo
+    rubric: RubricInfo
+    inference: InferenceInfo
+    limits: LimitsInfo
+    task_metadata: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Task-specific extras (e.g. prompt version info, documentation links).",
+    )

synth_ai/task/proxy.py CHANGED Viewed

@@ -1,39 +1,15 @@
-"""Shared helpers for Task App proxy endpoints (OpenAI, Groq, etc.)."""
+"""Shared helpers for Task App proxy endpoints (OpenAI, Groq, etc.).
+The proxy is tool-agnostic - each task app provides its own tools schema.
+"""
 from __future__ import annotations
 import copy
 import json
 import re
-from collections.abc import Iterable
 from typing import Any
-INTERACT_TOOL_SCHEMA: list[dict[str, Any]] = [
-    {
-        "type": "function",
-        "function": {
-            "name": "interact",
-            "description": "Perform one or more environment actions.",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "actions": {
-                        "type": "array",
-                        "items": {"type": "string"},
-                        "description": "List of environment actions to execute in order.",
-                    },
-                    "reasoning": {
-                        "type": "string",
-                        "description": "Optional reasoning for the chosen actions.",
-                    },
-                },
-                "required": ["actions"],
-                "additionalProperties": False,
-            },
-        },
-    }
-]
 _REMOVE_FIELDS = {
     "stop_after_tool_calls",
     "thinking_mode",
@@ -44,14 +20,12 @@ _REMOVE_SAMPLING_FIELDS = {"temperature", "top_p"}
 _GPT5_MIN_COMPLETION_TOKENS = 16000
-def _ensure_tools(payload: dict[str, Any]) -> None:
-    tools = payload.get("tools")
-    if not isinstance(tools, list) or not tools:
-        payload["tools"] = copy.deepcopy(INTERACT_TOOL_SCHEMA)
 def prepare_for_openai(model: str | None, payload: dict[str, Any]) -> dict[str, Any]:
-    """Sanitise an OpenAI chat completions payload for Task App usage."""
+    """Sanitise an OpenAI chat completions payload for Task App usage.
+    The task app is responsible for providing tools in the payload.
+    This function only handles model-specific parameter normalization.
+    """
     sanitized = copy.deepcopy(payload)
     for field in _REMOVE_FIELDS:
@@ -68,10 +42,18 @@ def prepare_for_openai(model: str | None, payload: dict[str, Any]) -> dict[str,
         mct = sanitized.get("max_completion_tokens")
         if not isinstance(mct, int) or mct < _GPT5_MIN_COMPLETION_TOKENS:
             sanitized["max_completion_tokens"] = _GPT5_MIN_COMPLETION_TOKENS
-        sanitized["tool_choice"] = {"type": "function", "function": {"name": "interact"}}
+        # Set tool_choice to first provided tool (task app must provide tools)
+        # If tool_choice not already set and tools are provided, use the first one
+        if "tool_choice" not in sanitized:
+            tools = sanitized.get("tools", [])
+            if isinstance(tools, list) and tools:
+                first_func = tools[0].get("function", {})
+                if isinstance(first_func, dict) and "name" in first_func:
+                    sanitized["tool_choice"] = {"type": "function", "function": {"name": first_func["name"]}}
         sanitized["parallel_tool_calls"] = False
-    _ensure_tools(sanitized)
     return sanitized
@@ -206,24 +188,18 @@ def parse_tool_call_from_text(text: str) -> tuple[list[str], str]:
     return [], text
-def _build_tool_call(actions: Iterable[str], reasoning: str) -> dict[str, Any]:
-    payload = {
-        "actions": [str(a).strip() for a in actions if str(a).strip()],
-    }
-    if reasoning.strip():
-        payload["reasoning"] = reasoning.strip()
-    return {
-        "id": "tool_interact_fallback",
-        "type": "function",
-        "function": {
-            "name": INTERACT_TOOL_SCHEMA[0]["function"]["name"],
-            "arguments": json.dumps(payload, ensure_ascii=False),
-        },
-    }
-def synthesize_tool_call_if_missing(openai_response: dict[str, Any]) -> dict[str, Any]:
-    """Ensure the first choice carries a tool_call derived from text if absent."""
+def synthesize_tool_call_if_missing(
+    openai_response: dict[str, Any],
+    fallback_tool_name: str = "interact"
+) -> dict[str, Any]:
+    """Ensure the first choice carries a tool_call derived from text if absent.
+    This is a fallback for models that don't properly support tool calling.
+    Task apps can specify their preferred fallback tool name (e.g., "interact", "execute_sequence").
+    DEPRECATED: Task apps should prefer models with native tool calling support.
+    This function will be removed in a future version.
+    """
     if not isinstance(openai_response, dict):
         return openai_response
@@ -245,8 +221,24 @@ def synthesize_tool_call_if_missing(openai_response: dict[str, Any]) -> dict[str
     if not actions:
         return openai_response
+    # Build a fallback tool call using the provided tool name
+    payload = {
+        "actions": [str(a).strip() for a in actions if str(a).strip()],
+    }
+    if reasoning.strip():
+        payload["reasoning"] = reasoning.strip()
+    tool_call = {
+        "id": f"tool_{fallback_tool_name}_fallback",
+        "type": "function",
+        "function": {
+            "name": fallback_tool_name,
+            "arguments": json.dumps(payload, ensure_ascii=False),
+        },
+    }
     new_message = copy.deepcopy(message)
-    new_message["tool_calls"] = [_build_tool_call(actions, reasoning)]
+    new_message["tool_calls"] = [tool_call]
     if "content" not in new_message:
         new_message["content"] = None

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl

Potentially problematic release.

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl