PyPI - synth-ai - Versions diffs - 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl - Mend

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (226) hide show

synth_ai/task/contracts.py CHANGED Viewed

@@ -3,17 +3,16 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import Any, Literal
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field
 @dataclass(frozen=True)
 class TaskAppEndpoints:
-    """Canonical Task App endpoint shapes used by RL trainers.
+    """Required Task App endpoints used by RL trainers and clients.
-    Task Apps run as lightweight HTTP services (often on Modal) that expose a
-    consistent set of endpoints for health, metadata, environment lifecycle,
-    rollouts, and optional proxy access to vendor models. The endpoint strings
-    defined here act as defaults and documentation for clients.
+    Task Apps run as lightweight HTTP services (often on Modal) that expose these
+    standard endpoints. Additional endpoints (proxies, debug routes) may be added
+    by individual task apps as needed.
     """
     root: str = "/"
@@ -21,28 +20,6 @@ class TaskAppEndpoints:
     info: str = "/info"
     task_info: str = "/task_info"
     rollout: str = "/rollout"
-    proxy_chat_completions: str = "/proxy/v1/chat/completions"
-    proxy_groq_chat_completions: str = "/proxy/groq/v1/chat/completions"
-    env_initialize: str = "/env/{env_name}/initialize"
-    env_step: str = "/env/{env_name}/step"
-    env_terminate: str = "/env/{env_name}/terminate"
-@dataclass(frozen=True)
-class TaskAppContract:
-    """Requirements and expectations for a Task App used by RL trainers.
-    - Auth: ENVIRONMENT_API_KEY must be set in the Task App environment; requests include X-API-Key.
-    - Health: /health returns 200 and JSON; may verify X-API-Key header.
-    - Env API: initialize/step/terminate are present for the target env (e.g., CrafterClassic).
-    - Rollout API: optional; provides a single-call rollout for convenience/testing.
-    - Inference routing: policy config passes an inference_url (Synth backend or OpenAI proxy).
-    - URL: base must be reachable via HTTPS and should be under .modal.run in production.
-    """
-    base_url: str
-    env_name: str | None = None
-    requires_api_key_header: bool = True
 # --- Unified rollout schema used by Task App services and SDK utilities ---
@@ -87,6 +64,12 @@ class RolloutRequest(BaseModel):
 class RolloutStep(BaseModel):
+    """Single step in a rollout trajectory.
+    DEPRECATED: This is part of the legacy trajectory format. New code should
+    consume v3 traces (RolloutResponse.trace) instead. See monorepo/trace_single_source.txt
+    for migration plan.
+    """
     obs: dict[str, Any]
     tool_calls: list[dict[str, Any]]
     reward: float | None = None
@@ -96,11 +79,40 @@ class RolloutStep(BaseModel):
 class RolloutTrajectory(BaseModel):
+    """Legacy trajectory format for rollout results.
+    DEPRECATED: This format duplicates data already present in v3 traces and will
+    be removed once training code migrates to consuming RolloutResponse.trace.
+    Current state:
+    - Task apps emit BOTH this format AND v3 traces (dual serialization)
+    - Training code (GSPO) reads from this format
+    - Eval/filter tools read from v3 traces
+    Migration plan:
+    - Phase 1: Training code learns to read from v3 traces (with fallback to this)
+    - Phase 2: Make this field optional once training is migrated
+    - Phase 3: Remove this field entirely and delete this class
+    See: monorepo/trace_single_source.txt for full migration plan and timeline.
+    Why v3 traces are better:
+    - Single source of truth (no duplication/drift)
+    - Richer data: token IDs, logprobs, reasoning, timing, images
+    - Built-in audit trail and replay capability
+    - Standard schema across all Synth AI tooling
+    """
     env_id: str
     policy_id: str
     steps: list[RolloutStep]
     final: dict[str, Any] | None = None
     length: int
+    # Required for trace correlation with inference mesh (optional initially for backward compat)
+    # See: monorepo/INFERENCE_URL_REQUIREMENT_PLAN.md and trace_creation_and_judgement.txt
+    inference_url: str | None = None
+    decision_samples: list[dict[str, Any]] | None = None
 class RolloutMetrics(BaseModel):
@@ -114,24 +126,98 @@ class RolloutMetrics(BaseModel):
 class RolloutResponse(BaseModel):
+    """Response from a rollout execution.
+    Contains both legacy trajectory format (for backward compatibility) and
+    modern v3 trace format (preferred going forward).
+    """
     run_id: str
+    # DEPRECATED: Legacy format maintained for training code compatibility.
+    # Will be removed once training migrates to reading from `trace` field.
+    # See: monorepo/trace_single_source.txt for migration plan.
     trajectories: list[RolloutTrajectory]
     branches: dict[str, list[str]] = Field(default_factory=dict)
     metrics: RolloutMetrics
     aborted: bool = False
     ops_executed: int = 0
+    # PREFERRED: v3 trace format (SessionTrace). This is the single source of truth
+    # for rollout data and should be used by all new code. Contains richer data than
+    # trajectories including token IDs, logprobs, timing, and multimodal content.
     trace: dict[str, Any] | None = None
-class TaskInfo(BaseModel):
+class _ExtraAllowModel(BaseModel):
+    """Base helper that preserves unknown keys while still exposing typed attributes."""
+    model_config = ConfigDict(extra="allow")
+class TaskDescriptor(_ExtraAllowModel):
+    """Human-readable task identifiers shown in UIs and logs."""
+    id: str
+    name: str
+    description: str | None = None
+    version: str | None = None
+class DatasetInfo(_ExtraAllowModel):
+    """Metadata about the prompt/task dataset powering the environment."""
+    id: str | None = None
+    name: str | None = None
+    version: str | None = None
+    splits: list[str] | None = None
+    default_split: str | None = None
+    description: str | None = None
+class RubricCriterion(_ExtraAllowModel):
+    id: str
+    description: str
+    weight: float | None = None
+class RubricSection(_ExtraAllowModel):
+    name: str
+    criteria: list[RubricCriterion] = Field(default_factory=list)
+class RubricInfo(_ExtraAllowModel):
+    """Outcome and event scoring definitions used by judges."""
+    outcome: RubricSection | None = None
+    events: RubricSection | None = None
+class InferenceInfo(_ExtraAllowModel):
+    """Recommended defaults for policy model routing."""
+    model: str | None = None
+    inference_url: str | None = None
+class LimitsInfo(_ExtraAllowModel):
+    """Operational limits the environment enforces."""
+    max_turns: int | None = None
+    max_response_tokens: int | None = None
+    timeout_seconds: int | None = None
+class TaskInfo(_ExtraAllowModel):
     """Static metadata describing the capabilities of a Task App task."""
-    task: dict[str, Any]
-    environments: list[str]
-    action_space: dict[str, Any]
-    observation: dict[str, Any]
-    dataset: dict[str, Any]
-    rubric: dict[str, Any]
-    inference: dict[str, Any]
-    capabilities: dict[str, Any]
-    limits: dict[str, Any]
+    task: TaskDescriptor
+    environment: str
+    dataset: DatasetInfo
+    rubric: RubricInfo
+    inference: InferenceInfo
+    limits: LimitsInfo
+    task_metadata: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Task-specific extras (e.g. prompt version info, documentation links).",
+    )

synth_ai/task/proxy.py CHANGED Viewed

@@ -1,39 +1,15 @@
-"""Shared helpers for Task App proxy endpoints (OpenAI, Groq, etc.)."""
+"""Shared helpers for Task App proxy endpoints (OpenAI, Groq, etc.).
+The proxy is tool-agnostic - each task app provides its own tools schema.
+"""
 from __future__ import annotations
 import copy
 import json
 import re
-from collections.abc import Iterable
 from typing import Any
-INTERACT_TOOL_SCHEMA: list[dict[str, Any]] = [
-    {
-        "type": "function",
-        "function": {
-            "name": "interact",
-            "description": "Perform one or more environment actions.",
-            "parameters": {
-                "type": "object",
-                "properties": {
-                    "actions": {
-                        "type": "array",
-                        "items": {"type": "string"},
-                        "description": "List of environment actions to execute in order.",
-                    },
-                    "reasoning": {
-                        "type": "string",
-                        "description": "Optional reasoning for the chosen actions.",
-                    },
-                },
-                "required": ["actions"],
-                "additionalProperties": False,
-            },
-        },
-    }
-]
 _REMOVE_FIELDS = {
     "stop_after_tool_calls",
     "thinking_mode",
@@ -44,14 +20,12 @@ _REMOVE_SAMPLING_FIELDS = {"temperature", "top_p"}
 _GPT5_MIN_COMPLETION_TOKENS = 16000
-def _ensure_tools(payload: dict[str, Any]) -> None:
-    tools = payload.get("tools")
-    if not isinstance(tools, list) or not tools:
-        payload["tools"] = copy.deepcopy(INTERACT_TOOL_SCHEMA)
 def prepare_for_openai(model: str | None, payload: dict[str, Any]) -> dict[str, Any]:
-    """Sanitise an OpenAI chat completions payload for Task App usage."""
+    """Sanitise an OpenAI chat completions payload for Task App usage.
+    The task app is responsible for providing tools in the payload.
+    This function only handles model-specific parameter normalization.
+    """
     sanitized = copy.deepcopy(payload)
     for field in _REMOVE_FIELDS:
@@ -68,10 +42,18 @@ def prepare_for_openai(model: str | None, payload: dict[str, Any]) -> dict[str,
         mct = sanitized.get("max_completion_tokens")
         if not isinstance(mct, int) or mct < _GPT5_MIN_COMPLETION_TOKENS:
             sanitized["max_completion_tokens"] = _GPT5_MIN_COMPLETION_TOKENS
-        sanitized["tool_choice"] = {"type": "function", "function": {"name": "interact"}}
+        # Set tool_choice to first provided tool (task app must provide tools)
+        # If tool_choice not already set and tools are provided, use the first one
+        if "tool_choice" not in sanitized:
+            tools = sanitized.get("tools", [])
+            if isinstance(tools, list) and tools:
+                first_func = tools[0].get("function", {})
+                if isinstance(first_func, dict) and "name" in first_func:
+                    sanitized["tool_choice"] = {"type": "function", "function": {"name": first_func["name"]}}
         sanitized["parallel_tool_calls"] = False
-    _ensure_tools(sanitized)
     return sanitized
@@ -206,24 +188,18 @@ def parse_tool_call_from_text(text: str) -> tuple[list[str], str]:
     return [], text
-def _build_tool_call(actions: Iterable[str], reasoning: str) -> dict[str, Any]:
-    payload = {
-        "actions": [str(a).strip() for a in actions if str(a).strip()],
-    }
-    if reasoning.strip():
-        payload["reasoning"] = reasoning.strip()
-    return {
-        "id": "tool_interact_fallback",
-        "type": "function",
-        "function": {
-            "name": INTERACT_TOOL_SCHEMA[0]["function"]["name"],
-            "arguments": json.dumps(payload, ensure_ascii=False),
-        },
-    }
-def synthesize_tool_call_if_missing(openai_response: dict[str, Any]) -> dict[str, Any]:
-    """Ensure the first choice carries a tool_call derived from text if absent."""
+def synthesize_tool_call_if_missing(
+    openai_response: dict[str, Any],
+    fallback_tool_name: str = "interact"
+) -> dict[str, Any]:
+    """Ensure the first choice carries a tool_call derived from text if absent.
+    This is a fallback for models that don't properly support tool calling.
+    Task apps can specify their preferred fallback tool name (e.g., "interact", "execute_sequence").
+    DEPRECATED: Task apps should prefer models with native tool calling support.
+    This function will be removed in a future version.
+    """
     if not isinstance(openai_response, dict):
         return openai_response
@@ -245,8 +221,24 @@ def synthesize_tool_call_if_missing(openai_response: dict[str, Any]) -> dict[str
     if not actions:
         return openai_response
+    # Build a fallback tool call using the provided tool name
+    payload = {
+        "actions": [str(a).strip() for a in actions if str(a).strip()],
+    }
+    if reasoning.strip():
+        payload["reasoning"] = reasoning.strip()
+    tool_call = {
+        "id": f"tool_{fallback_tool_name}_fallback",
+        "type": "function",
+        "function": {
+            "name": fallback_tool_name,
+            "arguments": json.dumps(payload, ensure_ascii=False),
+        },
+    }
     new_message = copy.deepcopy(message)
-    new_message["tool_calls"] = [_build_tool_call(actions, reasoning)]
+    new_message["tool_calls"] = [tool_call]
     if "content" not in new_message:
         new_message["content"] = None

synth_ai/task/rubrics/__init__.py ADDED Viewed

@@ -0,0 +1,53 @@
+"""Rubric schema, loading, and scoring helpers for Task Apps.
+This module provides:
+- Flexible rubric models (Criterion, Rubric) for general task app use
+- Strict validators (StrictCriterion, StrictRubric) for step-wise judges
+- Loading utilities supporting JSON, YAML, and HTTP sources
+- Blending utilities for composing rubrics
+- Scoring utilities for events and outcomes
+"""
+# Core models (flexible validation)
+from .models import Criterion, Rubric
+# Loading and blending
+from .loaders import blend_rubrics, load_rubric
+# Scoring
+from .scoring import score_events_against_rubric, score_outcome_against_rubric
+# Strict validators (for judge configs)
+from .strict import (
+    StrictCriterion,
+    StrictRubric,
+    ValidationError,
+    validate_rubric_dict,
+    validate_rubric_file,
+    validate_rubric_files,
+)
+__all__ = [
+    # Flexible models
+    "Criterion",
+    "Rubric",
+    # Loaders
+    "load_rubric",
+    "blend_rubrics",
+    # Scoring
+    "score_events_against_rubric",
+    "score_outcome_against_rubric",
+    # Strict validators
+    "StrictCriterion",
+    "StrictRubric",
+    "ValidationError",
+    "validate_rubric_dict",
+    "validate_rubric_file",
+    "validate_rubric_files",
+]
+# Maintain backwards compatibility
+# Old code may import these names expecting the flexible variants
+RubricCriterion = StrictCriterion
+RubricSpec = StrictRubric

synth_ai/task/rubrics/loaders.py ADDED Viewed

@@ -0,0 +1,133 @@
+"""Rubric loading and blending utilities."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Any
+from .models import Criterion, Rubric
+def _load_text(source: str) -> tuple[str, str | None]:
+    """Load text from file path or return as-is."""
+    path = Path(source)
+    if path.exists():
+        return path.read_text(encoding="utf-8"), path.suffix.lower()
+    return source, None
+def _parse_structured(text: str, suffix: str | None) -> dict[str, Any]:
+    """Parse JSON or YAML text into a dictionary."""
+    text = text.strip()
+    if not text:
+        raise ValueError("Rubric source is empty")
+    if suffix in (".yaml", ".yml"):
+        try:
+            import yaml  # type: ignore
+        except Exception as exc:  # pragma: no cover - optional dependency
+            raise RuntimeError("PyYAML is required to load YAML rubrics") from exc
+        data = yaml.safe_load(text)
+        if not isinstance(data, dict):
+            raise ValueError("Rubric YAML must produce a mapping") from None
+        return data
+    if text.startswith("{"):
+        return json.loads(text)
+    if text.startswith("http://") or text.startswith("https://"):
+        import requests  # type: ignore
+        response = requests.get(text, timeout=15)
+        response.raise_for_status()
+        return _parse_structured(response.text, suffix)
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        try:
+            import yaml  # type: ignore
+        except Exception as exc:  # pragma: no cover - optional dependency
+            raise RuntimeError("PyYAML is required to load rubric text") from exc
+        data = yaml.safe_load(text)
+        if not isinstance(data, dict):
+            raise ValueError("Rubric text must decode to a mapping") from None
+        return data
+def load_rubric(source: str | dict[str, Any] | Rubric | None) -> Rubric | None:
+    """Load rubric from file path, dict, or return existing Rubric.
+    Args:
+        source: File path (JSON/YAML), dict, existing Rubric, or None
+    Returns:
+        Parsed Rubric instance or None if source is None
+    """
+    if source is None:
+        return None
+    if isinstance(source, Rubric):
+        return source
+    if isinstance(source, dict):
+        return Rubric.model_validate(source)
+    text, suffix = _load_text(str(source))
+    data = _parse_structured(text, suffix)
+    return Rubric.model_validate(data)
+def _merge_weights(base: Criterion, override: Criterion) -> float:
+    """Merge criterion weights from base and override rubrics."""
+    if override.weight != 1.0 and base.weight != 1.0:
+        return base.weight * override.weight
+    if override.weight != 1.0:
+        return override.weight
+    return base.weight
+def blend_rubrics(base: Rubric | None, override: Rubric | None) -> Rubric | None:
+    """Blend two rubrics by merging criteria and inheriting properties.
+    Override rubric takes precedence for descriptions and settings.
+    Weights are merged multiplicatively when both are non-default.
+    Args:
+        base: Base rubric providing defaults
+        override: Override rubric with specific customizations
+    Returns:
+        Blended rubric or None if both inputs are None
+    """
+    if override is None and base is None:
+        return None
+    if base is None:
+        return override
+    if override is None:
+        return base
+    base_map = {criterion.id: criterion for criterion in base.criteria}
+    merged: list[Criterion] = []
+    for ov in override.criteria:
+        if ov.id in base_map:
+            existing = base_map.pop(ov.id)
+            merged.append(
+                Criterion(
+                    id=ov.id,
+                    description=ov.description or existing.description,
+                    weight=_merge_weights(existing, ov),
+                    required=ov.required if ov.required is not None else existing.required,
+                )
+            )
+        else:
+            merged.append(ov)
+    merged.extend(base_map.values())
+    aggregation = override.aggregation
+    if aggregation == "inherit":
+        aggregation = base.aggregation
+    return Rubric(
+        version=override.version or base.version,
+        goal_text=override.goal_text or base.goal_text,
+        criteria=merged,
+        aggregation=aggregation,
+    )

synth_ai/task/rubrics/models.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""Rubric and Criterion data models."""
+from __future__ import annotations
+from pydantic import BaseModel, Field, field_validator
+class Criterion(BaseModel):
+    """Single scoring criterion within a rubric.
+    Flexible variant allowing weights > 1.0 and no normalization requirement.
+    Used by task apps for general rubric scoring.
+    """
+    id: str
+    description: str
+    weight: float = 1.0
+    required: bool = False
+    @field_validator("weight")
+    @classmethod
+    def _validate_weight(cls, value: float) -> float:
+        if value <= 0:
+            raise ValueError("criterion weight must be positive")
+        return value
+class Rubric(BaseModel):
+    """Rubric definition for scoring task app outcomes.
+    Supports flexible aggregation and blending. Criteria weights do not need
+    to sum to 1.0, making this suitable for general task app usage.
+    """
+    version: str
+    goal_text: str | None = None
+    criteria: list[Criterion] = Field(default_factory=list)
+    aggregation: str = "weighted_sum"
+    @field_validator("aggregation")
+    @classmethod
+    def _validate_aggregation(cls, value: str) -> str:
+        allowed = {"sum", "weighted_sum", "custom", "inherit"}
+        if value not in allowed:
+            raise ValueError(f"aggregation must be one of {sorted(allowed)}")
+        return value
+    @field_validator("criteria")
+    @classmethod
+    def _validate_criteria(cls, criteria: list[Criterion]) -> list[Criterion]:
+        seen = set()
+        for criterion in criteria:
+            if criterion.id in seen:
+                raise ValueError(f"duplicate criterion id: {criterion.id}")
+            seen.add(criterion.id)
+        return criteria

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

Potentially problematic release.

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl