PyPI - synth-ai - Versions diffs - 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl - Mend

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (226) hide show

synth_ai/task/rubrics/scoring.py ADDED Viewed

@@ -0,0 +1,113 @@
+"""Rubric scoring utilities for events and outcomes."""
+from __future__ import annotations
+from collections.abc import Iterable
+from typing import Any
+from .models import Criterion, Rubric
+def _as_float(value: Any) -> float | None:
+    """Convert value to a finite float, returning None when that is not possible.
+    Args:
+        value: Object to convert (typically a number or a numeric string)
+    Returns:
+        The converted float, or None if the conversion raises or the result
+        is NaN or +/-infinity
+    """
+    import math
+    try:
+        number = float(value)
+    except Exception:
+        # Deliberately broad: a custom __float__ may raise anything, and this
+        # helper's contract is to return None on any conversion failure.
+        return None
+    # A NaN or infinite score would propagate into every aggregate built from
+    # it, so treat it the same as an unparseable value.
+    return number if math.isfinite(number) else None
+def _score(
+    criteria: Iterable[Criterion], values: dict[str, float], aggregation: str
+) -> dict[str, Any]:
+    """Aggregate per-criterion scores into a single rubric result.
+    Args:
+        criteria: Criteria to score; each supplies ``id``, ``weight`` and ``required``
+        values: Map of criterion IDs to scores; an ID missing here counts as 0.0
+        aggregation: "sum", "weighted_sum", "custom", or "inherit" (handled as
+            "weighted_sum"); any other value leaves the total at 0.0
+    Returns:
+        Dict with "aggregation" (the mode after resolving "inherit"), "score"
+        (None for "custom") and "per_criterion" (score/weight/required per ID)
+    """
+    mode = "weighted_sum" if aggregation == "inherit" else aggregation
+    breakdown: dict[str, dict[str, Any]] = {}
+    running = 0.0
+    weight_sum = 0.0
+    for item in criteria:
+        item_score = values.get(item.id, 0.0)
+        breakdown[item.id] = {
+            "score": item_score,
+            "weight": item.weight,
+            "required": item.required,
+        }
+        if mode == "weighted_sum":
+            running += item_score * item.weight
+            weight_sum += item.weight
+        elif mode == "sum":
+            running += item_score
+    result: float | None = running
+    if mode == "weighted_sum" and weight_sum > 0:
+        # Divide by the accumulated weight so the total is a weighted average.
+        result = running / weight_sum
+    elif mode == "custom":
+        # No total is computed here for custom aggregation; only the breakdown is reported.
+        result = None
+    return {
+        "aggregation": mode,
+        "score": result,
+        "per_criterion": breakdown,
+    }
+def score_events_against_rubric(
+    events: list[dict[str, Any]], rubric: Rubric | None
+) -> dict[str, Any]:
+    """Score a list of evaluation events against a rubric.
+    Each event names its criterion under "criterion_id", "id" or "criterion"
+    (first truthy one wins) and carries its value under "score". Non-dict
+    events, events without a criterion ID and events whose score cannot be
+    converted to a float are ignored; a later event for the same criterion
+    replaces an earlier one.
+    Args:
+        events: List of event dicts with scoring info (a falsy value means no events)
+        rubric: Rubric defining criteria and aggregation
+    Returns:
+        Scoring result with total and per-criterion scores; when ``rubric`` is
+        None, a placeholder with aggregation "none" and no score
+    """
+    if rubric is None:
+        return {"aggregation": "none", "score": None, "per_criterion": {}}
+    id_fields = ("criterion_id", "id", "criterion")
+    collected: dict[str, float] = {}
+    for entry in events or []:
+        if isinstance(entry, dict):
+            key = next((entry[name] for name in id_fields if entry.get(name)), None)
+            numeric = _as_float(entry.get("score"))
+            if key and numeric is not None:
+                collected[str(key)] = numeric
+    return _score(rubric.criteria, collected, rubric.aggregation)
+def score_outcome_against_rubric(outcome: dict[str, Any], rubric: Rubric | None) -> dict[str, Any]:
+    """Score a rollout outcome against a rubric.
+    The criterion scores are read from ``outcome["criteria"]`` when that entry
+    is a dict, otherwise from ``outcome`` itself. Entries whose value cannot be
+    converted to a float are skipped; a non-dict outcome contributes no scores.
+    Args:
+        outcome: Outcome dict with criterion scores
+        rubric: Rubric defining criteria and aggregation
+    Returns:
+        Scoring result with total and per-criterion scores; when ``rubric`` is
+        None, a placeholder with aggregation "none" and no score
+    """
+    if rubric is None:
+        return {"aggregation": "none", "score": None, "per_criterion": {}}
+    source: dict[Any, Any] = {}
+    if isinstance(outcome, dict):
+        nested = outcome.get("criteria")
+        source = nested if isinstance(nested, dict) else outcome
+    collected: dict[str, float] = {}
+    for name, raw in source.items():
+        numeric = _as_float(raw)
+        if numeric is None:
+            continue
+        collected[str(name)] = numeric
+    return _score(rubric.criteria, collected, rubric.aggregation)

synth_ai/{rubrics/validators.py → task/rubrics/strict.py} RENAMED Viewed

@@ -1,15 +1,32 @@
+"""Strict rubric validators for step-wise judges.
+These validators enforce stricter constraints than the general-purpose rubrics:
+- Weights must be ≤ 1.0 and sum to exactly 1.0
+- Only weighted_sum aggregation is allowed
+- All required fields must be non-empty
+Used primarily for validation in judge configurations.
+"""
 from __future__ import annotations
 import json
 import math
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable, Literal
+from typing import Any, Literal
 import pydantic
-class RubricCriterion(pydantic.BaseModel):
-    """Single scoring criterion within a rubric."""
+class StrictCriterion(pydantic.BaseModel):
+    """Single scoring criterion with strict validation.
+    Enforces:
+    - Weight ≤ 1.0 (for proper normalization)
+    - Weight > 0.0 (positive)
+    - Non-empty strings
+    """
     id: str
     description: str
@@ -35,16 +52,23 @@ class RubricCriterion(pydantic.BaseModel):
         return value
-class RubricSpec(pydantic.BaseModel):
-    """High-level rubric definition used by step-wise judges."""
+class StrictRubric(pydantic.BaseModel):
+    """Strict rubric definition for step-wise judges.
+    Enforces:
+    - Weights must sum to 1.0
+    - Only weighted_sum aggregation
+    - Non-empty version and goal_text
+    - At least one criterion
+    """
     version: str
     goal_text: str
     aggregation: Literal["weighted_sum"]
-    criteria: list[RubricCriterion]
+    criteria: list[StrictCriterion]
     @pydantic.model_validator(mode="after")
-    def _validate_weights(self) -> "RubricSpec":
+    def _validate_weights(self) -> StrictRubric:
         if not self.criteria:
             raise ValueError("rubric must declare at least one criterion")
         total_weight = sum(criterion.weight for criterion in self.criteria)
@@ -71,56 +95,55 @@ class RubricSpec(pydantic.BaseModel):
         return value
+# Re-export pydantic's ValidationError for convenience
 ValidationError = pydantic.ValidationError
-def validate_rubric_dict(payload: dict[str, Any]) -> RubricSpec:
-    """
-    Validate an in-memory rubric payload and return the parsed model.
+def validate_rubric_dict(payload: dict[str, Any]) -> StrictRubric:
+    """Validate an in-memory rubric payload with strict rules.
     Args:
-        payload: Dictionary representing the rubric JSON.
+        payload: Dictionary representing the rubric JSON
     Returns:
-        Validated RubricSpec instance.
+        Validated StrictRubric instance
     Raises:
-        ValidationError: If the payload is missing required fields or contains
-        invalid weights.
+        ValidationError: If payload is invalid or doesn't meet strict constraints
     """
     if not isinstance(payload, dict):
         raise TypeError("rubric payload must be a dictionary")
-    return RubricSpec.model_validate(payload)
+    return StrictRubric.model_validate(payload)
 def _load_payload_from_file(path: Path) -> dict[str, Any]:
+    """Load JSON rubric from file."""
     if path.suffix.lower() != ".json":
         raise ValueError(f"Unsupported rubric file type: {path}")
     text = path.read_text(encoding="utf-8")
     return json.loads(text)
-def validate_rubric_file(path: Path) -> RubricSpec:
-    """
-    Load and validate a rubric file.
+def validate_rubric_file(path: Path) -> StrictRubric:
+    """Load and validate a rubric file with strict rules.
     Args:
-        path: Path to a JSON rubric document.
+        path: Path to a JSON rubric document
     Returns:
-        Validated RubricSpec instance.
+        Validated StrictRubric instance
     """
     payload = _load_payload_from_file(path)
     return validate_rubric_dict(payload)
-def validate_rubric_files(paths: Iterable[Path]) -> list[RubricSpec]:
-    """
-    Validate multiple rubric files and return their parsed models.
+def validate_rubric_files(paths: Iterable[Path]) -> list[StrictRubric]:
+    """Validate multiple rubric files with strict rules.
     Useful for bulk validation inside tests or CI checks.
     """
-    validated: list[RubricSpec] = []
+    validated: list[StrictRubric] = []
     for path in paths:
         validated.append(validate_rubric_file(path))
     return validated

synth_ai/task/server.py CHANGED Viewed

@@ -70,7 +70,7 @@ class TaskAppConfig:
     provide_task_instances: InstanceProvider
     rollout: RolloutExecutor
     dataset_registry: TaskDatasetRegistry | None = None
-    rubrics: RubricBundle = field(default_factory=RubricBundle)
+    rubrics: RubricBundle | None = field(default_factory=RubricBundle)
     proxy: ProxyConfig | None = None
     routers: Sequence[APIRouter] = field(default_factory=tuple)
     middleware: Sequence[Middleware] = field(default_factory=tuple)
@@ -93,7 +93,7 @@ class TaskAppConfig:
             provide_task_instances=self.provide_task_instances,
             rollout=self.rollout,
             dataset_registry=self.dataset_registry,
-            rubrics=self.rubrics,
+            rubrics=self.rubrics or RubricBundle(),
             proxy=self.proxy,
             routers=tuple(self.routers),
             middleware=tuple(self.middleware),
@@ -221,6 +221,7 @@ def _auth_dependency_factory(config: TaskAppConfig) -> Callable[[Request], None]
 def create_task_app(config: TaskAppConfig) -> FastAPI:
     cfg = config.clone()
+    cfg.rubrics = cfg.rubrics or RubricBundle()
     app = FastAPI(title=cfg.name, description=cfg.description)
     for key, value in cfg.app_state.items():
@@ -310,20 +311,20 @@ def create_task_app(config: TaskAppConfig) -> FastAPI:
     async def info() -> Mapping[str, Any]:
         dataset_meta = cfg.base_task_info.dataset
         rubrics: dict[str, Any] | None = None
-        if cfg.rubrics.outcome or cfg.rubrics.events:
+        rubric_bundle = cfg.rubrics
+        if rubric_bundle and (rubric_bundle.outcome or rubric_bundle.events):
             rubrics = {
-                "outcome": cfg.rubrics.outcome.model_dump() if cfg.rubrics.outcome else None,
-                "events": cfg.rubrics.events.model_dump() if cfg.rubrics.events else None,
+                "outcome": rubric_bundle.outcome.model_dump() if rubric_bundle.outcome else None,
+                "events": rubric_bundle.events.model_dump() if rubric_bundle.events else None,
             }
         payload = {
             "service": {
                 "task": cfg.base_task_info.task,
-                "version": cfg.base_task_info.task.get("version"),
+                "version": cfg.base_task_info.task.version,
             },
             "dataset": dataset_meta,
             "rubrics": rubrics,
             "inference": cfg.base_task_info.inference,
-            "capabilities": cfg.base_task_info.capabilities,
             "limits": cfg.base_task_info.limits,
         }
         return to_jsonable(payload)

synth_ai/task/validators.py CHANGED Viewed

@@ -1,11 +1,274 @@
+"""Task app validation utilities."""
 from __future__ import annotations
-from urllib.parse import urlparse
+import re
+from typing import Any
+import click
+import httpx
+from synth_ai.task.contracts import TaskAppEndpoints  # type: ignore[attr-defined]
+def validate_task_app_url(url: str | None) -> str:
+    """Validate and normalize a task app URL.
+    The URL must use the http or https scheme and name a host. Surrounding
+    whitespace and trailing slashes are removed before validation. Hostnames
+    are not matched against a fixed pattern, so single-label hosts
+    (e.g. "http://task-app:8000"), TLDs of any length and bracketed IPv6
+    literals are accepted.
+    Args:
+        url: URL to validate
+    Returns:
+        Normalized URL (stripped, without trailing slashes)
+    Raises:
+        ValueError: If URL is missing, malformed, or not http(s)
+    """
+    from urllib.parse import urlsplit  # local import: the module no longer imports urllib.parse
+    if not url:
+        raise ValueError("Task app URL is required")
+    url = url.strip().rstrip("/")
+    error = ValueError(f"Invalid task app URL: {url}")
+    # Embedded whitespace or control characters are never valid in a URL.
+    if not url or any(ch.isspace() or not ch.isprintable() for ch in url):
+        raise error
+    try:
+        parts = urlsplit(url)
+        # Reading .port raises ValueError for a non-numeric or out-of-range port.
+        _ = parts.port
+    except ValueError:
+        raise error from None
+    if parts.scheme not in ("http", "https") or not parts.hostname:
+        raise error
+    # "http://host:" parses with port None; a dangling colon is still malformed.
+    if parts.netloc.endswith(":"):
+        raise error
+    return url
+def _print_success(msg: str) -> None:
+    """Echo ``msg`` prefixed with a check mark, styled green."""
+    styled = click.style(f"✓ {msg}", fg="green")
+    click.echo(styled)
+def _print_error(msg: str) -> None:
+    """Echo ``msg`` prefixed with a cross, styled red, to the error stream."""
+    styled = click.style(f"✗ {msg}", fg="red")
+    click.echo(styled, err=True)
+def _print_warning(msg: str) -> None:
+    """Echo ``msg`` prefixed with a warning sign, styled yellow."""
+    styled = click.style(f"⚠ {msg}", fg="yellow")
+    click.echo(styled)
+def _print_info(msg: str) -> None:
+    """Echo ``msg`` unstyled, indented by two spaces."""
+    indented = f"  {msg}"
+    click.echo(indented)
-def validate_task_app_url(url: str, *, name: str = "TASK_APP_BASE_URL") -> None:
-    """Validate a Task App base URL (scheme + host present)."""
-    p = urlparse(url)
-    if p.scheme not in ("http", "https") or not p.netloc:
-        raise ValueError(f"Invalid {name}: malformed: {url}")
+async def validate_task_app_endpoint(
+    url: str,
+    api_key: str | None = None,
+    min_instances: int = 10,
+    verbose: bool = False,
+) -> tuple[bool, dict[str, Any]]:
+    """Validate a task app deployment by probing its HTTP endpoints.
+    Runs five checks in order (root, health, info, task_info with per-seed
+    instance fetches, rollout) and prints a progress report via click as it goes.
+    A failing check does not stop the later ones.
+    Args:
+        url: Base URL of the task app; endpoint paths are appended to it as-is
+        api_key: Optional key sent as the ``X-API-Key`` header (not sent to the root endpoint)
+        min_instances: Minimum number of consecutive seeds (starting at 0) that
+            must return a task instance
+        verbose: Print extra details from the responses
+    Returns:
+        (success: bool, results: dict) where ``results`` has the keys "url",
+        "endpoints", "auth", "task_instances" and "overall"
+    """
+    results: dict[str, Any] = {
+        "url": url,
+        "endpoints": {},
+        "auth": {},
+        "task_instances": {},
+        "overall": False,
+    }
+    all_passed = True
+    endpoints = TaskAppEndpoints()
+    # Set up headers
+    headers: dict[str, str] = {}
+    if api_key:
+        headers["X-API-Key"] = api_key
+    click.echo(f"\n{'='*60}")
+    click.echo(f"Validating Task App: {url}")
+    click.echo(f"{'='*60}\n")
+    async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
+        # 1. Check root endpoint
+        click.echo("1. Checking root endpoint...")
+        try:
+            resp = await client.get(f"{url}{endpoints.root}")
+            if resp.status_code == 200:
+                data = resp.json()
+                _print_success(f"Root endpoint responds (status: {data.get('status')})")
+                results["endpoints"]["root"] = {"passed": True, "data": data}
+                if verbose:
+                    _print_info(f"Service: {data.get('service', 'N/A')}")
+            else:
+                _print_error(f"Root endpoint returned {resp.status_code}")
+                results["endpoints"]["root"] = {"passed": False, "status": resp.status_code}
+                all_passed = False
+        except Exception as e:
+            _print_error(f"Root endpoint failed: {e}")
+            results["endpoints"]["root"] = {"passed": False, "error": str(e)}
+            all_passed = False
+        # 2. Check health endpoint
+        click.echo("\n2. Checking health endpoint...")
+        try:
+            resp = await client.get(f"{url}{endpoints.health}", headers=headers)
+            if resp.status_code == 200:
+                data = resp.json()
+                _print_success(f"Health endpoint responds (healthy: {data.get('healthy')})")
+                results["endpoints"]["health"] = {"passed": True, "data": data}
+                # Check auth configuration
+                auth_info = data.get("auth", {})
+                if auth_info.get("required"):
+                    _print_info(f"Auth required: {auth_info.get('required')}")
+                    _print_info(f"Expected key prefix: {auth_info.get('expected_prefix', 'N/A')}")
+                    if api_key:
+                        _print_success("API key provided and accepted")
+                        results["auth"]["provided"] = True
+                        results["auth"]["accepted"] = True
+                    else:
+                        _print_warning("No API key provided but may be required")
+                        results["auth"]["provided"] = False
+                        results["auth"]["required"] = True
+            else:
+                _print_error(f"Health endpoint returned {resp.status_code}")
+                results["endpoints"]["health"] = {"passed": False, "status": resp.status_code}
+                all_passed = False
+                # 401 (missing/invalid credentials) and 403 (rejected) both mean the key did not work.
+                if resp.status_code in (401, 403):
+                    _print_error("Authentication failed - provide API key with --api-key")
+                    results["auth"]["error"] = "Authentication failed"
+        except Exception as e:
+            _print_error(f"Health endpoint failed: {e}")
+            results["endpoints"]["health"] = {"passed": False, "error": str(e)}
+            all_passed = False
+        # 3. Check info endpoint
+        click.echo("\n3. Checking info endpoint...")
+        try:
+            resp = await client.get(f"{url}{endpoints.info}", headers=headers)
+            if resp.status_code == 200:
+                data = resp.json()
+                _print_success("Info endpoint responds")
+                results["endpoints"]["info"] = {"passed": True, "data": data}
+                if verbose:
+                    service = data.get("service", {})
+                    task_info = service.get("task", {})
+                    if isinstance(task_info, dict):
+                        _print_info(f"Task: {task_info.get('name', 'N/A')}")
+                    _print_info(f"Version: {service.get('version', 'N/A')}")
+                    dataset = data.get("dataset", {})
+                    if isinstance(dataset, dict):
+                        _print_info(f"Dataset: {dataset.get('id', 'N/A')}")
+            else:
+                _print_error(f"Info endpoint returned {resp.status_code}")
+                results["endpoints"]["info"] = {"passed": False, "status": resp.status_code}
+                all_passed = False
+        except Exception as e:
+            _print_error(f"Info endpoint failed: {e}")
+            results["endpoints"]["info"] = {"passed": False, "error": str(e)}
+            all_passed = False
+        # 4. Check task_info endpoint and instance count
+        click.echo("\n4. Checking task_info endpoint and instance availability...")
+        try:
+            # Get taskset descriptor first
+            resp = await client.get(f"{url}{endpoints.task_info}", headers=headers)
+            if resp.status_code == 200:
+                data = resp.json()
+                _print_success("Task info endpoint responds")
+                results["endpoints"]["task_info"] = {"passed": True}
+                taskset = data.get("taskset", {})
+                if verbose and taskset:
+                    if isinstance(taskset, dict):
+                        _print_info(f"Taskset: {taskset.get('id', 'N/A')}")
+                    else:
+                        _print_info(f"Taskset: {taskset}")
+                # Try to get specific task instances (seeds 0 .. min_instances + 4)
+                # Fetch instances one by one to verify we can get at least min_instances
+                instances = []
+                for seed in range(min_instances + 5):  # Try a few extra
+                    try:
+                        resp_seed = await client.get(
+                            f"{url}{endpoints.task_info}",
+                            params={"seed": seed},
+                            headers=headers,
+                        )
+                        if resp_seed.status_code == 200:
+                            instance = resp_seed.json()
+                            instances.append(instance)
+                        else:
+                            break  # Stop if we hit an invalid seed
+                    except Exception:
+                        break
+                instance_count = len(instances)
+                results["task_instances"]["count"] = instance_count
+                results["task_instances"]["requested"] = min_instances
+                if instance_count >= min_instances:
+                    _print_success(f"Found {instance_count} task instances (≥ {min_instances} required)")
+                    results["task_instances"]["passed"] = True
+                    if verbose and instances:
+                        sample = instances[0]
+                        task_info_sample = sample.get('task', {})
+                        if isinstance(task_info_sample, dict):
+                            _print_info(f"Sample task: {task_info_sample.get('name', 'N/A')}")
+                        _print_info(f"Environment: {sample.get('environment', 'N/A')}")
+                else:
+                    _print_error(f"Only {instance_count} task instances available (need ≥ {min_instances})")
+                    results["task_instances"]["passed"] = False
+                    all_passed = False
+            else:
+                _print_error(f"Task info endpoint returned {resp.status_code}")
+                results["endpoints"]["task_info"] = {"passed": False, "status": resp.status_code}
+                all_passed = False
+        except Exception as e:
+            _print_error(f"Task info endpoint failed: {e}")
+            results["endpoints"]["task_info"] = {"passed": False, "error": str(e)}
+            results["task_instances"]["passed"] = False
+            all_passed = False
+        # 5. Check rollout endpoint structure (don't actually run a rollout)
+        click.echo("\n5. Checking rollout endpoint availability...")
+        try:
+            # Just check if it's registered (OPTIONS or a lightweight probe)
+            resp = await client.options(f"{url}{endpoints.rollout}", headers=headers)
+            # Many servers return 200 for OPTIONS, some return 405
+            if resp.status_code in (200, 204, 405):
+                _print_success("Rollout endpoint is registered")
+                results["endpoints"]["rollout"] = {"passed": True}
+            elif resp.status_code == 404:
+                # 404 means no route is mounted at this path, so it must not count as present.
+                _print_error("Rollout endpoint not found (404)")
+                results["endpoints"]["rollout"] = {"passed": False, "status": resp.status_code}
+                all_passed = False
+            else:
+                _print_warning(f"Rollout endpoint returned unexpected status: {resp.status_code}")
+                results["endpoints"]["rollout"] = {"passed": True, "note": "endpoint exists"}
+        except Exception as e:
+            # OPTIONS might not be supported, that's okay
+            _print_info(f"Rollout endpoint check skipped (OPTIONS not supported): {e}")
+            results["endpoints"]["rollout"] = {"passed": True, "note": "assumed present"}
+    # Summary
+    click.echo(f"\n{'='*60}")
+    if all_passed:
+        _print_success("All validations passed!")
+        click.echo(f"{'='*60}\n")
+    else:
+        _print_error("Some validations failed. See errors above.")
+        click.echo(f"{'='*60}\n")
+    results["overall"] = all_passed
+    return all_passed, results

synth_ai/tracing_v3/decorators.py CHANGED Viewed

@@ -37,10 +37,14 @@ from .utils import calculate_cost, detect_provider
 # Context variables for session and turn tracking
 # These variables automatically propagate across async call boundaries,
 # allowing deeply nested code to access tracing context without explicit passing
-_session_id_ctx: contextvars.ContextVar[str | None] = contextvars.ContextVar("session_id")
-_turn_number_ctx: contextvars.ContextVar[int | None] = contextvars.ContextVar("turn_number")
+_session_id_ctx: contextvars.ContextVar[str | None] = contextvars.ContextVar(
+    "session_id"
+)
+_turn_number_ctx: contextvars.ContextVar[int | None] = contextvars.ContextVar(
+    "turn_number"
+)
 _session_tracer_ctx: contextvars.ContextVar[Any | None] = contextvars.ContextVar(
-    "session_tracer", default=None
+    "session_tracer"
 )

synth_ai/tracing_v3/replica_sync.py CHANGED Viewed

@@ -25,15 +25,15 @@ application to continue without blocking on sync operations.
 """
 import asyncio
+import importlib
 import logging
-from typing import Any
-import libsql
+from typing import Any, cast
 from .config import CONFIG
 logger = logging.getLogger(__name__)
+libsql = cast(Any, importlib.import_module("libsql"))
 class ReplicaSync:
     """Manages synchronization of embedded SQLite replica with remote Turso database.
@@ -53,7 +53,7 @@ class ReplicaSync:
         db_path: str = "embedded.db",
         sync_url: str | None = None,
         auth_token: str | None = None,
-        sync_interval: int | None = None,
+        sync_interval: float | None = None,
     ):
         """Initialize replica sync manager.

synth_ai/tracing_v3/serialization.py CHANGED Viewed

@@ -55,11 +55,11 @@ def normalize_for_json(value: Any) -> Any:
         return {str(k): normalize_for_json(v) for k, v in value.items()}
     # Sequences
-    if isinstance(value, (list, tuple, set)):
+    if isinstance(value, list | tuple | set):
         return [normalize_for_json(v) for v in value]
     # Datetime / Date
-    if isinstance(value, (datetime, date)):
+    if isinstance(value, datetime | date):
         return value.isoformat()
     # Decimal
@@ -73,7 +73,7 @@ def normalize_for_json(value: Any) -> Any:
             return str(value)
     # Bytes-like
-    if isinstance(value, (bytes, bytearray)):
+    if isinstance(value, bytes | bytearray):
         return base64.b64encode(bytes(value)).decode("ascii")
     # Enum
@@ -82,9 +82,9 @@ def normalize_for_json(value: Any) -> Any:
     # Numpy scalars / arrays
     if _np is not None:
-        if isinstance(value, (_np.generic,)):  # type: ignore[attr-defined]
+        if isinstance(value, _np.generic):  # type: ignore[attr-defined]
             return normalize_for_json(value.item())
-        if isinstance(value, (_np.ndarray,)):
+        if isinstance(value, _np.ndarray):
             return normalize_for_json(value.tolist())
     # Floats: sanitize NaN / Infinity to None

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl

Potentially problematic release.

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.13.dev2__py3-none-any.whl