PyPI - synth-ai - Versions diffs - 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl - Mend

synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (229)

synth_ai/task/rubrics/scoring.py ADDED

@@ -0,0 +1,113 @@
+"""Rubric scoring utilities for events and outcomes."""
+from __future__ import annotations
+from collections.abc import Iterable
+from typing import Any
+from .models import Criterion, Rubric
+def _as_float(value: Any) -> float | None:
+    """Safely convert value to float, returning None on failure."""
+    try:
+        return float(value)
+    except Exception:
+        return None
+def _score(
+    criteria: Iterable[Criterion], values: dict[str, float], aggregation: str
+) -> dict[str, Any]:
+    """Compute aggregate score from criterion values.
+    Args:
+        criteria: List of criteria defining scoring dimensions
+        values: Map of criterion IDs to scores
+        aggregation: How to aggregate ("sum", "weighted_sum", "custom")
+    Returns:
+        Dict with aggregation method, total score, and per-criterion breakdown
+    """
+    if aggregation == "inherit":
+        aggregation = "weighted_sum"
+    per_criterion: dict[str, dict[str, Any]] = {}
+    total = 0.0
+    total_weight = 0.0
+    for criterion in criteria:
+        score = values.get(criterion.id, 0.0)
+        per_criterion[criterion.id] = {
+            "score": score,
+            "weight": criterion.weight,
+            "required": criterion.required,
+        }
+        if aggregation == "sum":
+            total += score
+        elif aggregation == "weighted_sum":
+            total += score * criterion.weight
+            total_weight += criterion.weight
+    if aggregation == "weighted_sum" and total_weight > 0:
+        total = total / total_weight
+    if aggregation == "custom":
+        total = None  # type: ignore[assignment]
+    return {
+        "aggregation": aggregation,
+        "score": total,
+        "per_criterion": per_criterion,
+    }
+def score_events_against_rubric(
+    events: list[dict[str, Any]], rubric: Rubric | None
+) -> dict[str, Any]:
+    """Score a list of evaluation events against a rubric.
+    Events should contain criterion_id/id/criterion and score fields.
+    Args:
+        events: List of event dicts with scoring info
+        rubric: Rubric defining criteria and aggregation
+    Returns:
+        Scoring result with total and per-criterion scores
+    """
+    if rubric is None:
+        return {"aggregation": "none", "score": None, "per_criterion": {}}
+    values: dict[str, float] = {}
+    for event in events or []:
+        if not isinstance(event, dict):
+            continue
+        cid = event.get("criterion_id") or event.get("id") or event.get("criterion")
+        score = _as_float(event.get("score"))
+        if cid and score is not None:
+            values[str(cid)] = score
+    return _score(rubric.criteria, values, rubric.aggregation)
+def score_outcome_against_rubric(outcome: dict[str, Any], rubric: Rubric | None) -> dict[str, Any]:
+    """Score a rollout outcome against a rubric.
+    Outcome should be a dict mapping criterion IDs to scores, optionally
+    nested under a "criteria" key.
+    Args:
+        outcome: Outcome dict with criterion scores
+        rubric: Rubric defining criteria and aggregation
+    Returns:
+        Scoring result with total and per-criterion scores
+    """
+    if rubric is None:
+        return {"aggregation": "none", "score": None, "per_criterion": {}}
+    values: dict[str, float] = {}
+    if isinstance(outcome, dict):
+        candidates = (
+            outcome.get("criteria") if isinstance(outcome.get("criteria"), dict) else outcome
+        )
+        if isinstance(candidates, dict):
+            for key, value in candidates.items():
+                score = _as_float(value)
+                if score is not None:
+                    values[str(key)] = score
+    return _score(rubric.criteria, values, rubric.aggregation)

synth_ai/task/rubrics/strict.py ADDED

@@ -0,0 +1,149 @@
+"""Strict rubric validators for step-wise judges.
+These validators enforce stricter constraints than the general-purpose rubrics:
+- Weights must be ≤ 1.0 and sum to exactly 1.0
+- Only weighted_sum aggregation is allowed
+- All required fields must be non-empty
+Used primarily for validation in judge configurations.
+"""
+from __future__ import annotations
+import json
+import math
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any, Literal
+import pydantic
+class StrictCriterion(pydantic.BaseModel):
+    """Single scoring criterion with strict validation.
+    Enforces:
+    - Weight ≤ 1.0 (for proper normalization)
+    - Weight > 0.0 (positive)
+    - Non-empty strings
+    """
+    id: str
+    description: str
+    weight: float
+    scale: str | None = None
+    @pydantic.field_validator("weight")
+    @classmethod
+    def _validate_weight(cls, value: float) -> float:
+        if not math.isfinite(value):
+            raise ValueError("weight must be a finite number")
+        if value <= 0.0:
+            raise ValueError("weight must be positive")
+        if value > 1.0:
+            raise ValueError("weight must be <= 1.0")
+        return value
+    @pydantic.field_validator("id", "description", mode="before")
+    @classmethod
+    def _strip_string(cls, value: Any) -> Any:
+        if isinstance(value, str):
+            return value.strip()
+        return value
+class StrictRubric(pydantic.BaseModel):
+    """Strict rubric definition for step-wise judges.
+    Enforces:
+    - Weights must sum to 1.0
+    - Only weighted_sum aggregation
+    - Non-empty version and goal_text
+    - At least one criterion
+    """
+    version: str
+    goal_text: str
+    aggregation: Literal["weighted_sum"]
+    criteria: list[StrictCriterion]
+    @pydantic.model_validator(mode="after")
+    def _validate_weights(self) -> StrictRubric:
+        if not self.criteria:
+            raise ValueError("rubric must declare at least one criterion")
+        total_weight = sum(criterion.weight for criterion in self.criteria)
+        if not math.isclose(total_weight, 1.0, abs_tol=1e-6, rel_tol=1e-6):
+            raise ValueError(
+                f"criterion weights must sum to 1 (got {total_weight:.6f})"
+            )
+        return self
+    @pydantic.field_validator("version")
+    @classmethod
+    def _non_empty_version(cls, value: str) -> str:
+        value = value.strip()
+        if not value:
+            raise ValueError("version string must not be empty")
+        return value
+    @pydantic.field_validator("goal_text")
+    @classmethod
+    def _non_empty_goal_text(cls, value: str) -> str:
+        value = value.strip()
+        if not value:
+            raise ValueError("goal_text must not be empty")
+        return value
+# Re-export pydantic's ValidationError for convenience
+ValidationError = pydantic.ValidationError
+def validate_rubric_dict(payload: dict[str, Any]) -> StrictRubric:
+    """Validate an in-memory rubric payload with strict rules.
+    Args:
+        payload: Dictionary representing the rubric JSON
+    Returns:
+        Validated StrictRubric instance
+    Raises:
+        ValidationError: If payload is invalid or doesn't meet strict constraints
+    """
+    if not isinstance(payload, dict):
+        raise TypeError("rubric payload must be a dictionary")
+    return StrictRubric.model_validate(payload)
+def _load_payload_from_file(path: Path) -> dict[str, Any]:
+    """Load JSON rubric from file."""
+    if path.suffix.lower() != ".json":
+        raise ValueError(f"Unsupported rubric file type: {path}")
+    text = path.read_text(encoding="utf-8")
+    return json.loads(text)
+def validate_rubric_file(path: Path) -> StrictRubric:
+    """Load and validate a rubric file with strict rules.
+    Args:
+        path: Path to a JSON rubric document
+    Returns:
+        Validated StrictRubric instance
+    """
+    payload = _load_payload_from_file(path)
+    return validate_rubric_dict(payload)
+def validate_rubric_files(paths: Iterable[Path]) -> list[StrictRubric]:
+    """Validate multiple rubric files with strict rules.
+    Useful for bulk validation inside tests or CI checks.
+    """
+    validated: list[StrictRubric] = []
+    for path in paths:
+        validated.append(validate_rubric_file(path))
+    return validated

synth_ai/task/server.py CHANGED

@@ -70,7 +70,7 @@ class TaskAppConfig:
     provide_task_instances: InstanceProvider
     rollout: RolloutExecutor
     dataset_registry: TaskDatasetRegistry | None = None
-    rubrics: RubricBundle = field(default_factory=RubricBundle)
+    rubrics: RubricBundle | None = field(default_factory=RubricBundle)
     proxy: ProxyConfig | None = None
     routers: Sequence[APIRouter] = field(default_factory=tuple)
     middleware: Sequence[Middleware] = field(default_factory=tuple)
@@ -93,7 +93,7 @@ class TaskAppConfig:
             provide_task_instances=self.provide_task_instances,
             rollout=self.rollout,
             dataset_registry=self.dataset_registry,
-            rubrics=self.rubrics,
+            rubrics=self.rubrics or RubricBundle(),
             proxy=self.proxy,
             routers=tuple(self.routers),
             middleware=tuple(self.middleware),
@@ -221,6 +221,7 @@ def _auth_dependency_factory(config: TaskAppConfig) -> Callable[[Request], None]
 def create_task_app(config: TaskAppConfig) -> FastAPI:
     cfg = config.clone()
+    cfg.rubrics = cfg.rubrics or RubricBundle()
     app = FastAPI(title=cfg.name, description=cfg.description)
     for key, value in cfg.app_state.items():
@@ -310,20 +311,20 @@ def create_task_app(config: TaskAppConfig) -> FastAPI:
     async def info() -> Mapping[str, Any]:
         dataset_meta = cfg.base_task_info.dataset
         rubrics: dict[str, Any] | None = None
-        if cfg.rubrics.outcome or cfg.rubrics.events:
+        rubric_bundle = cfg.rubrics
+        if rubric_bundle and (rubric_bundle.outcome or rubric_bundle.events):
             rubrics = {
-                "outcome": cfg.rubrics.outcome.model_dump() if cfg.rubrics.outcome else None,
-                "events": cfg.rubrics.events.model_dump() if cfg.rubrics.events else None,
+                "outcome": rubric_bundle.outcome.model_dump() if rubric_bundle.outcome else None,
+                "events": rubric_bundle.events.model_dump() if rubric_bundle.events else None,
             }
         payload = {
             "service": {
                 "task": cfg.base_task_info.task,
-                "version": cfg.base_task_info.task.get("version"),
+                "version": cfg.base_task_info.task.version,
             },
             "dataset": dataset_meta,
             "rubrics": rubrics,
             "inference": cfg.base_task_info.inference,
-            "capabilities": cfg.base_task_info.capabilities,
             "limits": cfg.base_task_info.limits,
         }
         return to_jsonable(payload)

synth_ai/task/validators.py CHANGED

@@ -1,11 +1,274 @@
+"""Task app validation utilities."""
 from __future__ import annotations
-from urllib.parse import urlparse
+import re
+from typing import Any
+import click
+import httpx
+from synth_ai.task.contracts import TaskAppEndpoints  # type: ignore[attr-defined]
+def validate_task_app_url(url: str | None) -> str:
+    """Validate and normalize a task app URL.
+    Args:
+        url: URL to validate
+    Returns:
+        Normalized URL
+    Raises:
+        ValueError: If URL is invalid
+    """
+    if not url:
+        raise ValueError("Task app URL is required")
+    url = url.strip().rstrip("/")
+    # Basic URL validation
+    url_pattern = re.compile(
+        r"^https?://"  # http:// or https://
+        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|"  # domain...
+        r"localhost|"  # localhost...
+        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
+        r"(?::\d+)?"  # optional port
+        r"(?:/?|[/?]\S+)$",
+        re.IGNORECASE,
+    )
+    if not url_pattern.match(url):
+        raise ValueError(f"Invalid task app URL: {url}")
+    return url
+def _print_success(msg: str) -> None:
+    """Print success message in green."""
+    click.echo(click.style(f"✓ {msg}", fg="green"))
+def _print_error(msg: str) -> None:
+    """Print error message in red."""
+    click.echo(click.style(f"✗ {msg}", fg="red"), err=True)
+def _print_warning(msg: str) -> None:
+    """Print warning message in yellow."""
+    click.echo(click.style(f"⚠ {msg}", fg="yellow"))
+def _print_info(msg: str) -> None:
+    """Print info message."""
+    click.echo(f"  {msg}")
-def validate_task_app_url(url: str, *, name: str = "TASK_APP_BASE_URL") -> None:
-    """Validate a Task App base URL (scheme + host present)."""
-    p = urlparse(url)
-    if p.scheme not in ("http", "https") or not p.netloc:
-        raise ValueError(f"Invalid {name}: malformed: {url}")
+async def validate_task_app_endpoint(
+    url: str,
+    api_key: str | None = None,
+    min_instances: int = 10,
+    verbose: bool = False,
+) -> tuple[bool, dict[str, Any]]:
+    """Validate a task app deployment.
+    Returns:
+        (success: bool, results: dict)
+    """
+    results: dict[str, Any] = {
+        "url": url,
+        "endpoints": {},
+        "auth": {},
+        "task_instances": {},
+        "overall": False,
+    }
+    all_passed = True
+    endpoints = TaskAppEndpoints()
+    # Set up headers
+    headers = {}
+    if api_key:
+        headers["X-API-Key"] = api_key
+    click.echo(f"\n{'='*60}")
+    click.echo(f"Validating Task App: {url}")
+    click.echo(f"{'='*60}\n")
+    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
+        # 1. Check root endpoint
+        click.echo("1. Checking root endpoint...")
+        try:
+            resp = await client.get(f"{url}{endpoints.root}")
+            if resp.status_code == 200:
+                data = resp.json()
+                _print_success(f"Root endpoint responds (status: {data.get('status')})")
+                results["endpoints"]["root"] = {"passed": True, "data": data}
+                if verbose:
+                    _print_info(f"Service: {data.get('service', 'N/A')}")
+            else:
+                _print_error(f"Root endpoint returned {resp.status_code}")
+                results["endpoints"]["root"] = {"passed": False, "status": resp.status_code}
+                all_passed = False
+        except Exception as e:
+            _print_error(f"Root endpoint failed: {e}")
+            results["endpoints"]["root"] = {"passed": False, "error": str(e)}
+            all_passed = False
+        # 2. Check health endpoint
+        click.echo("\n2. Checking health endpoint...")
+        try:
+            resp = await client.get(f"{url}{endpoints.health}", headers=headers)
+            if resp.status_code == 200:
+                data = resp.json()
+                _print_success(f"Health endpoint responds (healthy: {data.get('healthy')})")
+                results["endpoints"]["health"] = {"passed": True, "data": data}
+                # Check auth configuration
+                auth_info = data.get("auth", {})
+                if auth_info.get("required"):
+                    _print_info(f"Auth required: {auth_info.get('required')}")
+                    _print_info(f"Expected key prefix: {auth_info.get('expected_prefix', 'N/A')}")
+                    if api_key:
+                        _print_success("API key provided and accepted")
+                        results["auth"]["provided"] = True
+                        results["auth"]["accepted"] = True
+                    else:
+                        _print_warning("No API key provided but may be required")
+                        results["auth"]["provided"] = False
+                        results["auth"]["required"] = True
+            else:
+                _print_error(f"Health endpoint returned {resp.status_code}")
+                results["endpoints"]["health"] = {"passed": False, "status": resp.status_code}
+                all_passed = False
+                if resp.status_code == 403:
+                    _print_error("Authentication failed - provide API key with --api-key")
+                    results["auth"]["error"] = "Authentication failed"
+        except Exception as e:
+            _print_error(f"Health endpoint failed: {e}")
+            results["endpoints"]["health"] = {"passed": False, "error": str(e)}
+            all_passed = False
+        # 3. Check info endpoint
+        click.echo("\n3. Checking info endpoint...")
+        try:
+            resp = await client.get(f"{url}{endpoints.info}", headers=headers)
+            if resp.status_code == 200:
+                data = resp.json()
+                _print_success("Info endpoint responds")
+                results["endpoints"]["info"] = {"passed": True, "data": data}
+                if verbose:
+                    service = data.get("service", {})
+                    task_info = service.get("task", {})
+                    if isinstance(task_info, dict):
+                        _print_info(f"Task: {task_info.get('name', 'N/A')}")
+                    _print_info(f"Version: {service.get('version', 'N/A')}")
+                    dataset = data.get("dataset", {})
+                    if isinstance(dataset, dict):
+                        _print_info(f"Dataset: {dataset.get('id', 'N/A')}")
+            else:
+                _print_error(f"Info endpoint returned {resp.status_code}")
+                results["endpoints"]["info"] = {"passed": False, "status": resp.status_code}
+                all_passed = False
+        except Exception as e:
+            _print_error(f"Info endpoint failed: {e}")
+            results["endpoints"]["info"] = {"passed": False, "error": str(e)}
+            all_passed = False
+        # 4. Check task_info endpoint and instance count
+        click.echo("\n4. Checking task_info endpoint and instance availability...")
+        try:
+            # Get taskset descriptor first
+            resp = await client.get(f"{url}{endpoints.task_info}", headers=headers)
+            if resp.status_code == 200:
+                data = resp.json()
+                _print_success("Task info endpoint responds")
+                results["endpoints"]["task_info"] = {"passed": True}
+                taskset = data.get("taskset", {})
+                if verbose and taskset:
+                    if isinstance(taskset, dict):
+                        _print_info(f"Taskset: {taskset.get('id', 'N/A')}")
+                    else:
+                        _print_info(f"Taskset: {taskset}")
+                # Try to get specific task instances (seeds 0-19)
+                # Fetch instances one by one to verify we can get at least min_instances
+                instances = []
+                for seed in range(min_instances + 5):  # Try a few extra
+                    try:
+                        resp_seed = await client.get(
+                            f"{url}{endpoints.task_info}",
+                            params={"seed": seed},
+                            headers=headers,
+                        )
+                        if resp_seed.status_code == 200:
+                            instance = resp_seed.json()
+                            instances.append(instance)
+                        else:
+                            break  # Stop if we hit an invalid seed
+                    except Exception:
+                        break
+                instance_count = len(instances)
+                results["task_instances"]["count"] = instance_count
+                results["task_instances"]["requested"] = min_instances
+                if instance_count >= min_instances:
+                    _print_success(f"Found {instance_count} task instances (≥ {min_instances} required)")
+                    results["task_instances"]["passed"] = True
+                    if verbose and instances:
+                        sample = instances[0]
+                        task_info_sample = sample.get('task', {})
+                        if isinstance(task_info_sample, dict):
+                            _print_info(f"Sample task: {task_info_sample.get('name', 'N/A')}")
+                        _print_info(f"Environment: {sample.get('environment', 'N/A')}")
+                else:
+                    _print_error(f"Only {instance_count} task instances available (need ≥ {min_instances})")
+                    results["task_instances"]["passed"] = False
+                    all_passed = False
+            else:
+                _print_error(f"Task info endpoint returned {resp.status_code}")
+                results["endpoints"]["task_info"] = {"passed": False, "status": resp.status_code}
+                all_passed = False
+        except Exception as e:
+            _print_error(f"Task info endpoint failed: {e}")
+            results["endpoints"]["task_info"] = {"passed": False, "error": str(e)}
+            results["task_instances"]["passed"] = False
+            all_passed = False
+        # 5. Check rollout endpoint structure (don't actually run a rollout)
+        click.echo("\n5. Checking rollout endpoint availability...")
+        try:
+            # Just check if it's registered (OPTIONS or a lightweight probe)
+            resp = await client.options(f"{url}{endpoints.rollout}", headers=headers)
+            # Many servers return 200 for OPTIONS, some return 405
+            if resp.status_code in (200, 204, 405):
+                _print_success("Rollout endpoint is registered")
+                results["endpoints"]["rollout"] = {"passed": True}
+            else:
+                _print_warning(f"Rollout endpoint returned unexpected status: {resp.status_code}")
+                results["endpoints"]["rollout"] = {"passed": True, "note": "endpoint exists"}
+        except Exception as e:
+            # OPTIONS might not be supported, that's okay
+            _print_info(f"Rollout endpoint check skipped (OPTIONS not supported): {e}")
+            results["endpoints"]["rollout"] = {"passed": True, "note": "assumed present"}
+    # Summary
+    click.echo(f"\n{'='*60}")
+    if all_passed:
+        _print_success("All validations passed!")
+        click.echo(f"{'='*60}\n")
+    else:
+        _print_error("Some validations failed. See errors above.")
+        click.echo(f"{'='*60}\n")
+    results["overall"] = all_passed
+    return all_passed, results

synth_ai/tracing_v3/decorators.py CHANGED

@@ -37,10 +37,14 @@ from .utils import calculate_cost, detect_provider
 # Context variables for session and turn tracking
 # These variables automatically propagate across async call boundaries,
 # allowing deeply nested code to access tracing context without explicit passing
-_session_id_ctx: contextvars.ContextVar[str | None] = contextvars.ContextVar("session_id")
-_turn_number_ctx: contextvars.ContextVar[int | None] = contextvars.ContextVar("turn_number")
+_session_id_ctx: contextvars.ContextVar[str | None] = contextvars.ContextVar(
+    "session_id"
+)
+_turn_number_ctx: contextvars.ContextVar[int | None] = contextvars.ContextVar(
+    "turn_number"
+)
 _session_tracer_ctx: contextvars.ContextVar[Any | None] = contextvars.ContextVar(
-    "session_tracer", default=None
+    "session_tracer"
 )

synth_ai/tracing_v3/replica_sync.py CHANGED

@@ -25,15 +25,15 @@ application to continue without blocking on sync operations.
 """
 import asyncio
+import importlib
 import logging
-from typing import Any
-import libsql
+from typing import Any, cast
 from .config import CONFIG
 logger = logging.getLogger(__name__)
+libsql = cast(Any, importlib.import_module("libsql"))
 class ReplicaSync:
     """Manages synchronization of embedded SQLite replica with remote Turso database.
@@ -53,7 +53,7 @@ class ReplicaSync:
         db_path: str = "embedded.db",
         sync_url: str | None = None,
         auth_token: str | None = None,
-        sync_interval: int | None = None,
+        sync_interval: float | None = None,
     ):
         """Initialize replica sync manager.

synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

Potentially problematic release.

synth-ai 0.2.12__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl