PyPI - synth-ai - Versions diffs - 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl - Mend

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (291) hide show

synth_ai/task/rubrics/__init__.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""Rubric schema, loading, and scoring helpers for Task Apps.
+This module provides:
+- Flexible rubric models (Criterion, Rubric) for general task app use
+- Strict validators (StrictCriterion, StrictRubric) for step-wise judges
+- Loading utilities supporting JSON, YAML, and HTTP sources
+- Blending utilities for composing rubrics
+- Scoring utilities for events and outcomes
+"""
+# Core models (flexible validation)
+from .models import Criterion, Rubric
+# Loading and blending
+from .loaders import blend_rubrics, load_rubric
+# Scoring
+from .scoring import score_events_against_rubric, score_outcome_against_rubric
+# Strict validators (for judge configs)
+from .strict import (
+    StrictCriterion,
+    StrictRubric,
+    ValidationError,
+    validate_rubric_dict,
+    validate_rubric_file,
+    validate_rubric_files,
+)
+__all__ = [
+    # Flexible models
+    "Criterion",
+    "Rubric",
+    # Loaders
+    "load_rubric",
+    "blend_rubrics",
+    # Scoring
+    "score_events_against_rubric",
+    "score_outcome_against_rubric",
+    # Strict validators
+    "StrictCriterion",
+    "StrictRubric",
+    "ValidationError",
+    "validate_rubric_dict",
+    "validate_rubric_file",
+    "validate_rubric_files",
+]
+# Maintain backwards compatibility
+# Old code may import these names expecting the flexible variants
+RubricCriterion = StrictCriterion
+RubricSpec = StrictRubric

synth_ai/task/rubrics/loaders.py ADDED Viewed

@@ -0,0 +1,152 @@
+"""Rubric loading and blending utilities."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Any
+from .models import Criterion, Rubric
+def _load_text(source: str) -> tuple[str, str | None]:
+    """Load text from file path or return as-is."""
+    path = Path(source)
+    if path.exists():
+        return path.read_text(encoding="utf-8"), path.suffix.lower()
+    return source, None
+def _parse_structured(text: str, suffix: str | None) -> dict[str, Any]:
+    """Parse JSON or YAML text into a dictionary."""
+    text = text.strip()
+    if not text:
+        raise ValueError("Rubric source is empty")
+    if suffix in (".yaml", ".yml"):
+        try:
+            import yaml  # type: ignore
+        except Exception as exc:  # pragma: no cover - optional dependency
+            raise RuntimeError("PyYAML is required to load YAML rubrics") from exc
+        data = yaml.safe_load(text)
+        if not isinstance(data, dict):
+            raise ValueError("Rubric YAML must produce a mapping") from None
+        return data
+    if text.startswith("{"):
+        return json.loads(text)
+    if text.startswith("http://") or text.startswith("https://"):
+        import requests  # type: ignore
+        response = requests.get(text, timeout=15)
+        response.raise_for_status()
+        return _parse_structured(response.text, suffix)
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        try:
+            import yaml  # type: ignore
+        except Exception as exc:  # pragma: no cover - optional dependency
+            raise RuntimeError("PyYAML is required to load rubric text") from exc
+        data = yaml.safe_load(text)
+        if not isinstance(data, dict):
+            raise ValueError("Rubric text must decode to a mapping") from None
+        return data
+def load_rubric(source: str | dict[str, Any] | Rubric | None) -> Rubric | None:
+    """Load rubric from file path, dict, or return existing Rubric.
+    Args:
+        source: File path (JSON/YAML), dict, existing Rubric, or None
+    Returns:
+        Parsed Rubric instance or None if source is None
+    Raises:
+        ValueError: If the rubric format is incorrect (e.g., backend judge format)
+        ValidationError: If the rubric fails schema validation
+    """
+    if source is None:
+        return None
+    if isinstance(source, Rubric):
+        return source
+    # Load and parse the data
+    if isinstance(source, dict):
+        data = source
+    else:
+        text, suffix = _load_text(str(source))
+        data = _parse_structured(text, suffix)
+    # Check if this looks like a backend judge rubric (wrong format)
+    if isinstance(data, dict) and "event" in data and "outcome" in data:
+        # Missing required task app rubric fields
+        if "version" not in data and "goal_text" not in data and "criteria" not in data:
+            source_hint = f" ({source})" if isinstance(source, str) else ""
+            raise ValueError(
+                f"Rubric appears to be in backend judge format (has 'event'/'outcome' keys){source_hint}. "
+                f"Task apps require rubrics with 'version', 'goal_text', and 'criteria' fields. "
+                f"Backend judge rubrics should be named '*_backend_judge.json' and loaded by judge functions."
+            )
+    return Rubric.model_validate(data)
+def _merge_weights(base: Criterion, override: Criterion) -> float:
+    """Merge criterion weights from base and override rubrics."""
+    if override.weight != 1.0 and base.weight != 1.0:
+        return base.weight * override.weight
+    if override.weight != 1.0:
+        return override.weight
+    return base.weight
+def blend_rubrics(base: Rubric | None, override: Rubric | None) -> Rubric | None:
+    """Blend two rubrics by merging criteria and inheriting properties.
+    Override rubric takes precedence for descriptions and settings.
+    Weights are merged multiplicatively when both are non-default.
+    Args:
+        base: Base rubric providing defaults
+        override: Override rubric with specific customizations
+    Returns:
+        Blended rubric or None if both inputs are None
+    """
+    if override is None and base is None:
+        return None
+    if base is None:
+        return override
+    if override is None:
+        return base
+    base_map = {criterion.id: criterion for criterion in base.criteria}
+    merged: list[Criterion] = []
+    for ov in override.criteria:
+        if ov.id in base_map:
+            existing = base_map.pop(ov.id)
+            merged.append(
+                Criterion(
+                    id=ov.id,
+                    description=ov.description or existing.description,
+                    weight=_merge_weights(existing, ov),
+                    required=ov.required if ov.required is not None else existing.required,
+                )
+            )
+        else:
+            merged.append(ov)
+    merged.extend(base_map.values())
+    aggregation = override.aggregation
+    if aggregation == "inherit":
+        aggregation = base.aggregation
+    return Rubric(
+        version=override.version or base.version,
+        goal_text=override.goal_text or base.goal_text,
+        criteria=merged,
+        aggregation=aggregation,
+    )

synth_ai/task/rubrics/models.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""Rubric and Criterion data models."""
+from __future__ import annotations
+from pydantic import BaseModel, Field, field_validator
+class Criterion(BaseModel):
+    """Single scoring criterion within a rubric.
+    Flexible variant allowing weights > 1.0 and no normalization requirement.
+    Used by task apps for general rubric scoring.
+    """
+    id: str
+    description: str
+    weight: float = 1.0
+    required: bool = False
+    @field_validator("weight")
+    @classmethod
+    def _validate_weight(cls, value: float) -> float:
+        if value <= 0:
+            raise ValueError("criterion weight must be positive")
+        return value
+class Rubric(BaseModel):
+    """Rubric definition for scoring task app outcomes.
+    Supports flexible aggregation and blending. Criteria weights do not need
+    to sum to 1.0, making this suitable for general task app usage.
+    """
+    version: str
+    goal_text: str | None = None
+    criteria: list[Criterion] = Field(default_factory=list)
+    aggregation: str = "weighted_sum"
+    @field_validator("aggregation")
+    @classmethod
+    def _validate_aggregation(cls, value: str) -> str:
+        allowed = {"sum", "weighted_sum", "custom", "inherit"}
+        if value not in allowed:
+            raise ValueError(f"aggregation must be one of {sorted(allowed)}")
+        return value
+    @field_validator("criteria")
+    @classmethod
+    def _validate_criteria(cls, criteria: list[Criterion]) -> list[Criterion]:
+        seen = set()
+        for criterion in criteria:
+            if criterion.id in seen:
+                raise ValueError(f"duplicate criterion id: {criterion.id}")
+            seen.add(criterion.id)
+        return criteria

synth_ai/task/rubrics/scoring.py ADDED Viewed

@@ -0,0 +1,116 @@
+"""Rubric scoring utilities for events and outcomes."""
+from __future__ import annotations
+from collections.abc import Iterable
+from typing import Any
+from .models import Criterion, Rubric
+def _as_float(value: Any) -> float | None:
+    """Safely convert value to float, returning None on failure."""
+    try:
+        return float(value)
+    except Exception:
+        return None
+def _score(
+    criteria: Iterable[Criterion], values: dict[str, float], aggregation: str
+) -> dict[str, Any]:
+    """Compute aggregate score from criterion values.
+    Args:
+        criteria: List of criteria defining scoring dimensions
+        values: Map of criterion IDs to scores
+        aggregation: How to aggregate ("sum", "weighted_sum", "custom")
+    Returns:
+        Dict with aggregation method, total score, and per-criterion breakdown
+    """
+    if aggregation == "inherit":
+        aggregation = "weighted_sum"
+    per_criterion: dict[str, dict[str, Any]] = {}
+    total = 0.0
+    total_weight = 0.0
+    for criterion in criteria:
+        score = values.get(criterion.id, 0.0)
+        per_criterion[criterion.id] = {
+            "score": score,
+            "weight": criterion.weight,
+            "required": criterion.required,
+        }
+        if aggregation == "sum":
+            total += score
+        elif aggregation == "weighted_sum":
+            total += score * criterion.weight
+            total_weight += criterion.weight
+    if aggregation == "weighted_sum" and total_weight > 0:
+        total = total / total_weight
+    if aggregation == "custom":
+        total = None  # type: ignore[assignment]
+    return {
+        "aggregation": aggregation,
+        "score": total,
+        "per_criterion": per_criterion,
+    }
+def score_events_against_rubric(
+    events: list[dict[str, Any]], rubric: Rubric | None
+) -> dict[str, Any]:
+    """Score a list of evaluation events against a rubric.
+    Events should contain criterion_id/id/criterion and score fields.
+    Args:
+        events: List of event dicts with scoring info
+        rubric: Rubric defining criteria and aggregation
+    Returns:
+        Scoring result with total and per-criterion scores
+    """
+    if rubric is None:
+        return {"aggregation": "none", "score": None, "per_criterion": {}}
+    values: dict[str, float] = {}
+    for event in events or []:
+        if not isinstance(event, dict):
+            continue
+        cid = event.get("criterion_id") or event.get("id") or event.get("criterion")
+        score = _as_float(event.get("score"))
+        if cid and score is not None:
+            values[str(cid)] = score
+    return _score(rubric.criteria, values, rubric.aggregation)
+def score_outcome_against_rubric(outcome: dict[str, Any], rubric: Rubric | None) -> dict[str, Any]:
+    """Score a rollout outcome against a rubric.
+    Outcome should be a dict mapping criterion IDs to scores, optionally
+    nested under a "criteria" key.
+    Args:
+        outcome: Outcome dict with criterion scores
+        rubric: Rubric defining criteria and aggregation
+    Returns:
+        Scoring result with total and per-criterion scores
+    """
+    if rubric is None:
+        return {"aggregation": "none", "score": None, "per_criterion": {}}
+    values: dict[str, float] = {}
+    if isinstance(outcome, dict):
+        candidates = (
+            outcome.get("criteria") if isinstance(outcome.get("criteria"), dict) else outcome
+        )
+        if isinstance(candidates, dict):
+            for key, value in candidates.items():
+                score = _as_float(value)
+                if score is not None:
+                    values[str(key)] = score
+    return _score(rubric.criteria, values, rubric.aggregation)

synth_ai/{rubrics/validators.py → task/rubrics/strict.py} RENAMED Viewed

@@ -1,15 +1,32 @@
+"""Strict rubric validators for step-wise judges.
+These validators enforce stricter constraints than the general-purpose rubrics:
+- Weights must be ≤ 1.0 and sum to exactly 1.0
+- Only weighted_sum aggregation is allowed
+- All required fields must be non-empty
+Used primarily for validation in judge configurations.
+"""
 from __future__ import annotations
 import json
 import math
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable, Literal
+from typing import Any, Literal
 import pydantic
-class RubricCriterion(pydantic.BaseModel):
-    """Single scoring criterion within a rubric."""
+class StrictCriterion(pydantic.BaseModel):
+    """Single scoring criterion with strict validation.
+    Enforces:
+    - Weight ≤ 1.0 (for proper normalization)
+    - Weight > 0.0 (positive)
+    - Non-empty strings
+    """
     id: str
     description: str
@@ -35,16 +52,23 @@ class RubricCriterion(pydantic.BaseModel):
         return value
-class RubricSpec(pydantic.BaseModel):
-    """High-level rubric definition used by step-wise judges."""
+class StrictRubric(pydantic.BaseModel):
+    """Strict rubric definition for step-wise judges.
+    Enforces:
+    - Weights must sum to 1.0
+    - Only weighted_sum aggregation
+    - Non-empty version and goal_text
+    - At least one criterion
+    """
     version: str
     goal_text: str
     aggregation: Literal["weighted_sum"]
-    criteria: list[RubricCriterion]
+    criteria: list[StrictCriterion]
     @pydantic.model_validator(mode="after")
-    def _validate_weights(self) -> "RubricSpec":
+    def _validate_weights(self) -> StrictRubric:
         if not self.criteria:
             raise ValueError("rubric must declare at least one criterion")
         total_weight = sum(criterion.weight for criterion in self.criteria)
@@ -71,56 +95,55 @@ class RubricSpec(pydantic.BaseModel):
         return value
+# Re-export pydantic's ValidationError for convenience
 ValidationError = pydantic.ValidationError
-def validate_rubric_dict(payload: dict[str, Any]) -> RubricSpec:
-    """
-    Validate an in-memory rubric payload and return the parsed model.
+def validate_rubric_dict(payload: dict[str, Any]) -> StrictRubric:
+    """Validate an in-memory rubric payload with strict rules.
     Args:
-        payload: Dictionary representing the rubric JSON.
+        payload: Dictionary representing the rubric JSON
     Returns:
-        Validated RubricSpec instance.
+        Validated StrictRubric instance
     Raises:
-        ValidationError: If the payload is missing required fields or contains
-        invalid weights.
+        ValidationError: If payload is invalid or doesn't meet strict constraints
     """
     if not isinstance(payload, dict):
         raise TypeError("rubric payload must be a dictionary")
-    return RubricSpec.model_validate(payload)
+    return StrictRubric.model_validate(payload)
 def _load_payload_from_file(path: Path) -> dict[str, Any]:
+    """Load JSON rubric from file."""
     if path.suffix.lower() != ".json":
         raise ValueError(f"Unsupported rubric file type: {path}")
     text = path.read_text(encoding="utf-8")
     return json.loads(text)
-def validate_rubric_file(path: Path) -> RubricSpec:
-    """
-    Load and validate a rubric file.
+def validate_rubric_file(path: Path) -> StrictRubric:
+    """Load and validate a rubric file with strict rules.
     Args:
-        path: Path to a JSON rubric document.
+        path: Path to a JSON rubric document
     Returns:
-        Validated RubricSpec instance.
+        Validated StrictRubric instance
     """
     payload = _load_payload_from_file(path)
     return validate_rubric_dict(payload)
-def validate_rubric_files(paths: Iterable[Path]) -> list[RubricSpec]:
-    """
-    Validate multiple rubric files and return their parsed models.
+def validate_rubric_files(paths: Iterable[Path]) -> list[StrictRubric]:
+    """Validate multiple rubric files with strict rules.
     Useful for bulk validation inside tests or CI checks.
     """
-    validated: list[RubricSpec] = []
+    validated: list[StrictRubric] = []
     for path in paths:
         validated.append(validate_rubric_file(path))
     return validated

synth_ai/task/server.py CHANGED Viewed

@@ -70,7 +70,7 @@ class TaskAppConfig:
     provide_task_instances: InstanceProvider
     rollout: RolloutExecutor
     dataset_registry: TaskDatasetRegistry | None = None
-    rubrics: RubricBundle = field(default_factory=RubricBundle)
+    rubrics: RubricBundle | None = field(default_factory=RubricBundle)
     proxy: ProxyConfig | None = None
     routers: Sequence[APIRouter] = field(default_factory=tuple)
     middleware: Sequence[Middleware] = field(default_factory=tuple)
@@ -93,7 +93,7 @@ class TaskAppConfig:
             provide_task_instances=self.provide_task_instances,
             rollout=self.rollout,
             dataset_registry=self.dataset_registry,
-            rubrics=self.rubrics,
+            rubrics=self.rubrics or RubricBundle(),
             proxy=self.proxy,
             routers=tuple(self.routers),
             middleware=tuple(self.middleware),
@@ -221,6 +221,7 @@ def _auth_dependency_factory(config: TaskAppConfig) -> Callable[[Request], None]
 def create_task_app(config: TaskAppConfig) -> FastAPI:
     cfg = config.clone()
+    cfg.rubrics = cfg.rubrics or RubricBundle()
     app = FastAPI(title=cfg.name, description=cfg.description)
     for key, value in cfg.app_state.items():
@@ -310,20 +311,20 @@ def create_task_app(config: TaskAppConfig) -> FastAPI:
     async def info() -> Mapping[str, Any]:
         dataset_meta = cfg.base_task_info.dataset
         rubrics: dict[str, Any] | None = None
-        if cfg.rubrics.outcome or cfg.rubrics.events:
+        rubric_bundle = cfg.rubrics
+        if rubric_bundle and (rubric_bundle.outcome or rubric_bundle.events):
             rubrics = {
-                "outcome": cfg.rubrics.outcome.model_dump() if cfg.rubrics.outcome else None,
-                "events": cfg.rubrics.events.model_dump() if cfg.rubrics.events else None,
+                "outcome": rubric_bundle.outcome.model_dump() if rubric_bundle.outcome else None,
+                "events": rubric_bundle.events.model_dump() if rubric_bundle.events else None,
             }
         payload = {
             "service": {
                 "task": cfg.base_task_info.task,
-                "version": cfg.base_task_info.task.get("version"),
+                "version": cfg.base_task_info.task.version,
             },
             "dataset": dataset_meta,
             "rubrics": rubrics,
             "inference": cfg.base_task_info.inference,
-            "capabilities": cfg.base_task_info.capabilities,
             "limits": cfg.base_task_info.limits,
         }
         return to_jsonable(payload)

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl

Potentially problematic release.

synth-ai 0.2.13.dev1__py3-none-any.whl → 0.2.14__py3-none-any.whl