PyPI - synth-ai - Versions diffs - 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl - Mend

synth-ai 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (153) hide show

synth_ai/__init__.py +13 -13
synth_ai/cli/__init__.py +6 -15
synth_ai/cli/commands/eval/__init__.py +6 -15
synth_ai/cli/commands/eval/config.py +338 -0
synth_ai/cli/commands/eval/core.py +236 -1091
synth_ai/cli/commands/eval/runner.py +704 -0
synth_ai/cli/commands/eval/validation.py +44 -117
synth_ai/cli/commands/filter/core.py +7 -7
synth_ai/cli/commands/filter/validation.py +2 -2
synth_ai/cli/commands/smoke/core.py +7 -17
synth_ai/cli/commands/status/__init__.py +1 -64
synth_ai/cli/commands/status/client.py +50 -151
synth_ai/cli/commands/status/config.py +3 -83
synth_ai/cli/commands/status/errors.py +4 -13
synth_ai/cli/commands/status/subcommands/__init__.py +2 -8
synth_ai/cli/commands/status/subcommands/config.py +13 -0
synth_ai/cli/commands/status/subcommands/files.py +18 -63
synth_ai/cli/commands/status/subcommands/jobs.py +28 -311
synth_ai/cli/commands/status/subcommands/models.py +18 -62
synth_ai/cli/commands/status/subcommands/runs.py +16 -63
synth_ai/cli/commands/status/subcommands/session.py +67 -172
synth_ai/cli/commands/status/subcommands/summary.py +24 -32
synth_ai/cli/commands/status/subcommands/utils.py +41 -0
synth_ai/cli/commands/status/utils.py +16 -107
synth_ai/cli/commands/train/__init__.py +18 -20
synth_ai/cli/commands/train/errors.py +3 -3
synth_ai/cli/commands/train/prompt_learning_validation.py +15 -16
synth_ai/cli/commands/train/validation.py +7 -7
synth_ai/cli/commands/train/{judge_schemas.py → verifier_schemas.py} +33 -34
synth_ai/cli/commands/train/verifier_validation.py +235 -0
synth_ai/cli/demo_apps/demo_task_apps/math/config.toml +0 -1
synth_ai/cli/demo_apps/demo_task_apps/math/modal_task_app.py +2 -6
synth_ai/cli/demo_apps/math/config.toml +0 -1
synth_ai/cli/demo_apps/math/modal_task_app.py +2 -6
synth_ai/cli/demo_apps/mipro/task_app.py +25 -47
synth_ai/cli/lib/apps/task_app.py +12 -13
synth_ai/cli/lib/task_app_discovery.py +6 -6
synth_ai/cli/lib/train_cfgs.py +10 -10
synth_ai/cli/task_apps/__init__.py +11 -0
synth_ai/cli/task_apps/commands.py +7 -15
synth_ai/core/env.py +12 -1
synth_ai/core/errors.py +1 -2
synth_ai/core/integrations/cloudflare.py +209 -33
synth_ai/core/tracing_v3/abstractions.py +46 -0
synth_ai/data/__init__.py +3 -30
synth_ai/data/enums.py +1 -20
synth_ai/data/rewards.py +100 -3
synth_ai/products/graph_evolve/__init__.py +1 -2
synth_ai/products/graph_evolve/config.py +16 -16
synth_ai/products/graph_evolve/converters/__init__.py +3 -3
synth_ai/products/graph_evolve/converters/openai_sft.py +7 -7
synth_ai/products/graph_evolve/examples/hotpotqa/config.toml +1 -1
synth_ai/products/graph_gepa/__init__.py +23 -0
synth_ai/products/graph_gepa/converters/__init__.py +19 -0
synth_ai/products/graph_gepa/converters/openai_sft.py +29 -0
synth_ai/sdk/__init__.py +45 -35
synth_ai/sdk/api/eval/__init__.py +33 -0
synth_ai/sdk/api/eval/job.py +732 -0
synth_ai/sdk/api/research_agent/__init__.py +276 -66
synth_ai/sdk/api/train/builders.py +181 -0
synth_ai/sdk/api/train/cli.py +41 -33
synth_ai/sdk/api/train/configs/__init__.py +6 -4
synth_ai/sdk/api/train/configs/prompt_learning.py +127 -33
synth_ai/sdk/api/train/configs/rl.py +264 -16
synth_ai/sdk/api/train/configs/sft.py +165 -1
synth_ai/sdk/api/train/graph_validators.py +12 -12
synth_ai/sdk/api/train/graphgen.py +169 -51
synth_ai/sdk/api/train/graphgen_models.py +95 -45
synth_ai/sdk/api/train/local_api.py +10 -0
synth_ai/sdk/api/train/pollers.py +36 -0
synth_ai/sdk/api/train/prompt_learning.py +390 -60
synth_ai/sdk/api/train/rl.py +41 -5
synth_ai/sdk/api/train/sft.py +2 -0
synth_ai/sdk/api/train/task_app.py +20 -0
synth_ai/sdk/api/train/validators.py +17 -17
synth_ai/sdk/graphs/completions.py +239 -33
synth_ai/sdk/{judging/schemas.py → graphs/verifier_schemas.py} +23 -23
synth_ai/sdk/learning/__init__.py +35 -5
synth_ai/sdk/learning/context_learning_client.py +531 -0
synth_ai/sdk/learning/context_learning_types.py +294 -0
synth_ai/sdk/learning/prompt_learning_client.py +1 -1
synth_ai/sdk/learning/prompt_learning_types.py +2 -1
synth_ai/sdk/learning/rl/__init__.py +0 -4
synth_ai/sdk/learning/rl/contracts.py +0 -4
synth_ai/sdk/localapi/__init__.py +40 -0
synth_ai/sdk/localapi/apps/__init__.py +28 -0
synth_ai/sdk/localapi/client.py +10 -0
synth_ai/sdk/localapi/contracts.py +10 -0
synth_ai/sdk/localapi/helpers.py +519 -0
synth_ai/sdk/localapi/rollouts.py +93 -0
synth_ai/sdk/localapi/server.py +29 -0
synth_ai/sdk/localapi/template.py +49 -0
synth_ai/sdk/streaming/handlers.py +6 -6
synth_ai/sdk/streaming/streamer.py +10 -6
synth_ai/sdk/task/__init__.py +18 -5
synth_ai/sdk/task/apps/__init__.py +37 -1
synth_ai/sdk/task/client.py +9 -1
synth_ai/sdk/task/config.py +6 -11
synth_ai/sdk/task/contracts.py +137 -95
synth_ai/sdk/task/in_process.py +32 -22
synth_ai/sdk/task/in_process_runner.py +9 -4
synth_ai/sdk/task/rubrics/__init__.py +2 -3
synth_ai/sdk/task/rubrics/loaders.py +4 -4
synth_ai/sdk/task/rubrics/strict.py +3 -4
synth_ai/sdk/task/server.py +76 -16
synth_ai/sdk/task/trace_correlation_helpers.py +190 -139
synth_ai/sdk/task/validators.py +34 -49
synth_ai/sdk/training/__init__.py +7 -16
synth_ai/sdk/tunnels/__init__.py +118 -0
synth_ai/sdk/tunnels/cleanup.py +83 -0
synth_ai/sdk/tunnels/ports.py +120 -0
synth_ai/sdk/tunnels/tunneled_api.py +363 -0
{synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/METADATA +71 -4
{synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/RECORD +118 -128
synth_ai/cli/commands/baseline/__init__.py +0 -12
synth_ai/cli/commands/baseline/core.py +0 -636
synth_ai/cli/commands/baseline/list.py +0 -94
synth_ai/cli/commands/eval/errors.py +0 -81
synth_ai/cli/commands/status/formatters.py +0 -164
synth_ai/cli/commands/status/subcommands/pricing.py +0 -23
synth_ai/cli/commands/status/subcommands/usage.py +0 -203
synth_ai/cli/commands/train/judge_validation.py +0 -305
synth_ai/cli/usage.py +0 -159
synth_ai/data/specs.py +0 -36
synth_ai/sdk/api/research_agent/cli.py +0 -428
synth_ai/sdk/api/research_agent/config.py +0 -357
synth_ai/sdk/api/research_agent/job.py +0 -717
synth_ai/sdk/baseline/__init__.py +0 -25
synth_ai/sdk/baseline/config.py +0 -209
synth_ai/sdk/baseline/discovery.py +0 -216
synth_ai/sdk/baseline/execution.py +0 -154
synth_ai/sdk/judging/__init__.py +0 -15
synth_ai/sdk/judging/base.py +0 -24
synth_ai/sdk/judging/client.py +0 -191
synth_ai/sdk/judging/types.py +0 -42
synth_ai/sdk/research_agent/__init__.py +0 -34
synth_ai/sdk/research_agent/container_builder.py +0 -328
synth_ai/sdk/research_agent/container_spec.py +0 -198
synth_ai/sdk/research_agent/defaults.py +0 -34
synth_ai/sdk/research_agent/results_collector.py +0 -69
synth_ai/sdk/specs/__init__.py +0 -46
synth_ai/sdk/specs/dataclasses.py +0 -149
synth_ai/sdk/specs/loader.py +0 -144
synth_ai/sdk/specs/serializer.py +0 -199
synth_ai/sdk/specs/validation.py +0 -250
synth_ai/sdk/tracing/__init__.py +0 -39
synth_ai/sdk/usage/__init__.py +0 -37
synth_ai/sdk/usage/client.py +0 -171
synth_ai/sdk/usage/models.py +0 -261
{synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/WHEEL +0 -0
{synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/entry_points.txt +0 -0
{synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/licenses/LICENSE +0 -0
{synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/top_level.txt +0 -0

synth_ai/sdk/baseline/__init__.py DELETED Viewed

@@ -1,25 +0,0 @@
-"""Baseline file system for self-contained task evaluation.
-This package provides abstractions for defining and executing baseline evaluations
-without requiring deployed task apps. Supports both class-based and function-based
-task runners with first-class train/val/test split support.
-"""
-from __future__ import annotations
-from synth_ai.sdk.baseline.config import (
-    BaselineConfig,
-    BaselineResults,
-    BaselineTaskRunner,
-    DataSplit,
-    TaskResult,
-)
-__all__ = [
-    "BaselineConfig",
-    "BaselineTaskRunner",
-    "DataSplit",
-    "TaskResult",
-    "BaselineResults",
-]

synth_ai/sdk/baseline/config.py DELETED Viewed

@@ -1,209 +0,0 @@
-"""Core dataclasses for baseline configuration and results."""
-from __future__ import annotations
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional
-class BaselineTaskRunner:
-    """
-    Base class for task runners.
-    Subclasses should implement `run_task` method for class-based approach,
-    or you can use standalone async functions for function-based approach.
-    """
-    def __init__(
-        self,
-        policy_config: Dict[str, Any],
-        env_config: Dict[str, Any],
-    ):
-        """
-        Initialize task runner with configuration.
-        Args:
-            policy_config: Policy configuration (model, temperature, etc.)
-            env_config: Environment configuration (max_steps, difficulty, etc.)
-        """
-        self.policy_config = policy_config
-        self.env_config = env_config
-    async def run_task(self, seed: int) -> TaskResult:
-        """
-        Execute a single task instance.
-        This method is called for each seed in the selected split.
-        Args:
-            seed: The seed/index for this task instance
-        Returns:
-            TaskResult: Structured result containing success, rewards, metadata, trace
-        """
-        raise NotImplementedError("Subclasses must implement run_task method")
-@dataclass
-class DataSplit:
-    """Definition of a data split (train/val/test)."""
-    name: str  # "train", "val", "test"
-    seeds: List[int]  # Seed/index values for this split
-    metadata: Dict[str, Any] = field(default_factory=dict)  # Optional metadata
-@dataclass
-class TaskResult:
-    """Result from a single task execution."""
-    # Required: Seed/index that was evaluated
-    seed: int
-    # Required: Did the task complete successfully?
-    success: bool
-    # Required: Outcome reward for the episode
-    outcome_reward: float
-    # Optional: Event rewards (step-level)
-    event_rewards: List[Dict[str, Any]] = field(default_factory=list)
-    # Optional: Total steps/turns taken
-    total_steps: int = 0
-    # Optional: Metadata (achievements, completion info, etc.)
-    metadata: Dict[str, Any] = field(default_factory=dict)
-    # Optional: Error information if success=False
-    error: Optional[str] = None
-    # Optional: v3 trace (SessionTrace dict)
-    trace: Optional[Dict[str, Any]] = None
-# Type alias for task runner (can be class or function)
-TaskRunnerType = (
-    type[BaselineTaskRunner]
-    | Callable[[int, dict[str, Any], dict[str, Any]], Any]  # Function signature
-)
-# Type alias for result aggregator (can be class or function)
-AggregatorType = (
-    type[Any]  # Class with aggregate() method
-    | Callable[[list[TaskResult]], dict[str, Any]]  # Function signature
-)
-@dataclass
-class BaselineConfig:
-    """Configuration for a baseline file.
-    A baseline file defines how to evaluate a task without requiring
-    a deployed task app. It provides self-contained evaluation logic
-    with first-class support for train/val/test splits.
-    Supports both class-based and function-based task runners:
-    - Class-based: Pass a class that inherits from BaselineTaskRunner
-    - Function-based: Pass an async function with signature:
-      async def task_runner(seed: int, policy_config: Dict[str, Any],
-                           env_config: Dict[str, Any]) -> TaskResult
-    """
-    # Required: Unique identifier for this baseline config
-    baseline_id: str
-    # Required: Human-readable name
-    name: str
-    # Required: Task runner (class or function)
-    # Class-based: Pass a class inheriting from BaselineTaskRunner
-    #   The class will be instantiated with policy_config and env_config,
-    #   and run_task(seed) will be called for each seed.
-    # Function-based: Pass an async function with signature:
-    #   async def task_runner(seed: int, policy_config: Dict[str, Any],
-    #                        env_config: Dict[str, Any]) -> TaskResult
-    task_runner: TaskRunnerType
-    # Required: Data splits (train/val/test)
-    splits: Dict[str, DataSplit]
-    # Optional: Description for documentation
-    description: str = ""
-    # Optional: Default policy configuration
-    default_policy_config: Dict[str, Any] = field(default_factory=dict)
-    # Optional: Default environment configuration
-    default_env_config: Dict[str, Any] = field(default_factory=dict)
-    # Optional: Metadata for filtering/organization
-    metadata: Dict[str, Any] = field(default_factory=dict)
-    # Optional: Tags for filtering and discovery
-    tags: List[str] = field(default_factory=list)
-    # Optional: Custom result aggregator (class or function)
-    # Class-based: Pass a class with aggregate(results: List[TaskResult]) method
-    #   The class will be instantiated and aggregate() called.
-    # Function-based: Pass a function with signature:
-    #   def aggregate_results(results: List[TaskResult]) -> Dict[str, Any]
-    result_aggregator: Optional[AggregatorType] = None
-    # Optional: Path to this baseline file (set by discovery)
-    _source_path: Optional[Path] = None
-    def matches_tag(self, tag: str) -> bool:
-        """Check if baseline matches a tag (case-insensitive)."""
-        return tag.lower() in [t.lower() for t in self.tags]
-    def matches_metadata(self, key: str, value: Any) -> bool:
-        """Check if baseline metadata matches key-value pair."""
-        return self.metadata.get(key) == value
-@dataclass
-class BaselineResults:
-    """Aggregate results from a baseline evaluation."""
-    # Configuration that was used
-    config: BaselineConfig
-    # Split that was evaluated
-    split_name: str
-    # Per-seed results
-    results: List[TaskResult]
-    # Aggregate metrics
-    aggregate_metrics: Dict[str, Any]
-    # Execution metadata
-    execution_time_seconds: float
-    model_name: str
-    timestamp: str
-    def to_dict(self) -> Dict[str, Any]:
-        """Serialize to dictionary for JSON output."""
-        return {
-            "baseline_id": self.config.baseline_id,
-            "name": self.config.name,
-            "split": self.split_name,
-            "model": self.model_name,
-            "timestamp": self.timestamp,
-            "execution_time_seconds": self.execution_time_seconds,
-            "aggregate_metrics": self.aggregate_metrics,
-            "results": [
-                {
-                    "seed": r.seed,
-                    "success": r.success,
-                    "outcome_reward": r.outcome_reward,
-                    "total_steps": r.total_steps,
-                    "metadata": r.metadata,
-                    "error": r.error,
-                }
-                for r in self.results
-            ],
-        }

synth_ai/sdk/baseline/discovery.py DELETED Viewed

@@ -1,216 +0,0 @@
-"""AST-based discovery mechanism for baseline files."""
-from __future__ import annotations
-import ast
-import importlib.util
-from dataclasses import dataclass
-from pathlib import Path
-from typing import List, Optional, Tuple
-from synth_ai.sdk.baseline.config import BaselineConfig
-# Search patterns for baseline files
-BASELINE_FILE_PATTERNS = [
-    "**/baseline/*.py",
-    "**/baselines/*.py",
-    "**/*_baseline.py",
-]
-# Directories to ignore during discovery
-IGNORE_PATTERNS = {
-    "__pycache__",
-    ".git",
-    ".venv",
-    "venv",
-    "node_modules",
-    "build",
-    "dist",
-    ".mypy_cache",
-    ".pytest_cache",
-}
-@dataclass
-class BaselineChoice:
-    """Represents a discovered baseline configuration."""
-    baseline_id: str
-    path: Path
-    lineno: int
-    source: str  # "discovered" or "registered"
-    config: Optional[BaselineConfig] = None
-class BaselineConfigVisitor(ast.NodeVisitor):
-    """AST visitor to find BaselineConfig instances."""
-    def __init__(self):
-        self.matches: List[Tuple[str, int]] = []  # (baseline_id, lineno)
-    def visit_Assign(self, node: ast.Assign) -> None:
-        """Visit assignment statements looking for BaselineConfig."""
-        if not isinstance(node.value, ast.Call):
-            self.generic_visit(node)
-            return
-        # Check if right-hand side is BaselineConfig(...)
-        func = node.value.func
-        if isinstance(func, ast.Name) and func.id == "BaselineConfig":
-            # Extract baseline_id from constructor args
-            baseline_id = self._extract_baseline_id(node.value)
-            if baseline_id:
-                self.matches.append((baseline_id, node.lineno))
-        self.generic_visit(node)
-    def _extract_baseline_id(self, call_node: ast.Call) -> Optional[str]:
-        """Extract baseline_id from BaselineConfig constructor."""
-        for keyword in call_node.keywords:
-            if keyword.arg == "baseline_id" and isinstance(keyword.value, ast.Constant):
-                val = keyword.value.value
-                if isinstance(val, str):
-                    return val
-        return None
-def should_ignore_path(path: Path) -> bool:
-    """Check if a path should be ignored during discovery."""
-    return any(part in IGNORE_PATTERNS for part in path.parts)
-def discover_baseline_files(search_roots: List[Path]) -> List[BaselineChoice]:
-    """Discover baseline files via AST scanning.
-    Args:
-        search_roots: List of root directories to search in
-    Returns:
-        List of BaselineChoice objects representing discovered baselines
-    """
-    results: List[BaselineChoice] = []
-    seen = set()
-    for root in search_roots:
-        if not root.exists():
-            continue
-        for pattern in BASELINE_FILE_PATTERNS:
-            for path in root.glob(pattern):
-                if should_ignore_path(path):
-                    continue
-                try:
-                    source = path.read_text(encoding="utf-8")
-                    tree = ast.parse(source, filename=str(path))
-                except (OSError, SyntaxError):
-                    continue
-                visitor = BaselineConfigVisitor()
-                visitor.visit(tree)
-                for baseline_id, lineno in visitor.matches:
-                    key = (baseline_id, path.resolve())
-                    if key in seen:
-                        continue
-                    seen.add(key)
-                    results.append(
-                        BaselineChoice(
-                            baseline_id=baseline_id,
-                            path=path.resolve(),
-                            lineno=lineno,
-                            source="discovered",
-                        )
-                    )
-    return results
-def load_baseline_config_from_file(
-    baseline_id: str,
-    path: Path,
-) -> BaselineConfig:
-    """Load a BaselineConfig from a Python file.
-    Args:
-        baseline_id: The baseline_id to look for
-        path: Path to the Python file
-    Returns:
-        BaselineConfig instance
-    Raises:
-        ValueError: If baseline_id not found or file cannot be loaded
-    """
-    # Load the module
-    spec = importlib.util.spec_from_file_location("baseline_module", path)
-    if spec is None or spec.loader is None:
-        raise ValueError(f"Cannot load baseline file: {path}")
-    module = importlib.util.module_from_spec(spec)
-    try:
-        spec.loader.exec_module(module)
-    except ModuleNotFoundError as e:
-        missing_module = str(e).split("'")[1] if "'" in str(e) else str(e)
-        raise ImportError(
-            f"❌ Missing dependency for baseline '{baseline_id}'\n"
-            f"   File: {path}\n"
-            f"   Missing module: {missing_module}\n"
-            f"   Fix: pip install {missing_module}  (or 'uv add {missing_module}')"
-        ) from e
-    except SyntaxError as e:
-        raise ValueError(
-            f"❌ Syntax error in baseline file '{baseline_id}'\n"
-            f"   File: {path}\n"
-            f"   Error at line {e.lineno}: {e.msg}\n"
-            f"   Text: {e.text.strip() if e.text else 'N/A'}\n"
-            f"   Fix: Check the Python syntax in the baseline file"
-        ) from e
-    except Exception as e:
-        error_type = type(e).__name__
-        raise ValueError(
-            f"❌ Failed to load baseline '{baseline_id}'\n"
-            f"   File: {path}\n"
-            f"   Error type: {error_type}\n"
-            f"   Message: {str(e)}\n"
-            f"   This may be due to:\n"
-            f"     - Missing dependencies (check imports)\n"
-            f"     - Configuration errors in the baseline file\n"
-            f"     - Environment variables not set\n"
-            f"   Tip: Run with --verbose for more details"
-        ) from e
-    # Find the BaselineConfig instance
-    for attr_name in dir(module):
-        if attr_name.startswith("_"):
-            continue
-        attr = getattr(module, attr_name)
-        if isinstance(attr, BaselineConfig) and attr.baseline_id == baseline_id:
-            # Set source path for reference
-            attr._source_path = path
-            return attr
-    # Provide helpful error message
-    found_configs = []
-    for attr_name in dir(module):
-        if attr_name.startswith("_"):
-            continue
-        attr = getattr(module, attr_name)
-        if isinstance(attr, BaselineConfig):
-            found_configs.append(attr.baseline_id)
-    if found_configs:
-        raise ValueError(
-            f"❌ Baseline '{baseline_id}' not found in {path}\n"
-            f"   Found baselines in this file: {', '.join(found_configs)}\n"
-            f"   Fix: Use one of the above baseline IDs or check the baseline_id parameter"
-        )
-    else:
-        raise ValueError(
-            f"❌ No BaselineConfig instances found in {path}\n"
-            f"   Expected to find a BaselineConfig with baseline_id='{baseline_id}'\n"
-            f"   Fix: Ensure the file defines a BaselineConfig instance with baseline_id='{baseline_id}'"
-        )

synth_ai/sdk/baseline/execution.py DELETED Viewed

@@ -1,154 +0,0 @@
-"""Execution engine for baseline evaluations."""
-from __future__ import annotations
-import asyncio
-from typing import Any, Dict, List, Optional
-from synth_ai.sdk.baseline.config import (
-    BaselineConfig,
-    BaselineTaskRunner,
-    TaskResult,
-)
-def default_aggregator(results: List[TaskResult]) -> Dict[str, Any]:
-    """Default result aggregation function.
-    Computes mean, std, min, max, success rate, and other basic metrics.
-    Args:
-        results: List of TaskResult objects from all seeds
-    Returns:
-        Dict with aggregate metrics
-    """
-    successful_results = [r for r in results if r.success]
-    outcome_rewards = [r.outcome_reward for r in successful_results]
-    if not outcome_rewards:
-        return {
-            "mean_outcome_reward": 0.0,
-            "std_outcome_reward": 0.0,
-            "min_outcome_reward": 0.0,
-            "max_outcome_reward": 0.0,
-            "success_rate": 0.0,
-            "total_tasks": len(results),
-            "successful_tasks": 0,
-            "failed_tasks": len(results),
-        }
-    mean_reward = sum(outcome_rewards) / len(outcome_rewards)
-    # Calculate standard deviation
-    variance = sum((x - mean_reward) ** 2 for x in outcome_rewards) / len(outcome_rewards)
-    std_reward = variance ** 0.5
-    return {
-        "mean_outcome_reward": mean_reward,
-        "std_outcome_reward": std_reward,
-        "min_outcome_reward": min(outcome_rewards),
-        "max_outcome_reward": max(outcome_rewards),
-        "success_rate": len(successful_results) / len(results),
-        "total_tasks": len(results),
-        "successful_tasks": len(successful_results),
-        "failed_tasks": len(results) - len(successful_results),
-    }
-def _is_class_based_runner(task_runner: Any) -> bool:
-    """Check if task_runner is a class (not a function)."""
-    return (
-        isinstance(task_runner, type)
-        and issubclass(task_runner, BaselineTaskRunner)
-    )
-async def run_baseline_evaluation(
-    config: BaselineConfig,
-    seeds: List[int],
-    policy_config: Dict[str, Any],
-    env_config: Dict[str, Any],
-    concurrency: int = 4,
-) -> List[TaskResult]:
-    """Run baseline evaluation for given seeds.
-    Args:
-        config: BaselineConfig instance
-        seeds: List of seeds to evaluate
-        policy_config: Policy configuration (merged from defaults + overrides)
-        env_config: Environment configuration (merged from defaults + overrides)
-        concurrency: Maximum concurrent task executions
-    Returns:
-        List of TaskResult objects, one per seed
-    """
-    # Determine if we're using class-based or function-based runner
-    is_class_based = _is_class_based_runner(config.task_runner)
-    # Instantiate runner if class-based
-    runner_instance: Optional[BaselineTaskRunner] = None
-    if is_class_based:
-        # task_runner is a class - instantiate with policy_config and env_config
-        # as documented in BaselineConfig and BaselineTaskRunner
-        runner_instance = config.task_runner(policy_config, env_config)  # type: ignore[call-arg]
-    # Create semaphore for concurrency control
-    semaphore = asyncio.Semaphore(concurrency)
-    async def run_task(seed: int) -> TaskResult:
-        """Execute a single task with error handling."""
-        async with semaphore:
-            try:
-                if is_class_based and runner_instance:
-                    # Class-based: call run_task method
-                    return await runner_instance.run_task(seed)
-                else:
-                    # Function-based: call function directly
-                    task_runner_fn = config.task_runner
-                    if callable(task_runner_fn):
-                        result = task_runner_fn(seed, policy_config, env_config)  # type: ignore[call-arg]
-                        # Handle both sync and async functions
-                        if hasattr(result, "__await__"):
-                            return await result
-                        return result
-                    raise RuntimeError("task_runner is not callable")
-            except Exception as exc:
-                # Return error result
-                return TaskResult(
-                    seed=seed,
-                    success=False,
-                    outcome_reward=0.0,
-                    error=str(exc),
-                )
-    # Execute all tasks concurrently
-    results = await asyncio.gather(*[run_task(seed) for seed in seeds])
-    return list(results)
-def aggregate_results(
-    config: BaselineConfig,
-    results: List[TaskResult],
-) -> Dict[str, Any]:
-    """Aggregate results using custom aggregator or default.
-    Args:
-        config: BaselineConfig instance
-        results: List of TaskResult objects
-    Returns:
-        Dict with aggregate metrics
-    """
-    if config.result_aggregator is None:
-        return default_aggregator(results)
-    # Check if aggregator is a class or function
-    if isinstance(config.result_aggregator, type):
-        # Class-based: instantiate and call aggregate()
-        aggregator_instance = config.result_aggregator()
-        return aggregator_instance.aggregate(results)
-    else:
-        # Function-based: call directly
-        return config.result_aggregator(results)

synth_ai/sdk/judging/__init__.py DELETED Viewed

@@ -1,15 +0,0 @@
-from .client import JudgeClient, JudgeOptions, JudgeScoreResponse, VerifierClient
-from .types import Judgement, RewardJudgement, RewardMetadata, Track, TrackAggregate
-__all__ = [
-	"JudgeClient",
-	"VerifierClient",
-	"JudgeOptions",
-	"JudgeScoreResponse",
-	"Judgement",
-	"RewardJudgement",
-	"RewardMetadata",
-	"Track",
-	"TrackAggregate",
-]

synth_ai/sdk/judging/base.py DELETED Viewed

@@ -1,24 +0,0 @@
-from __future__ import annotations
-from abc import ABC, abstractmethod
-from typing import Any
-class Judgement:
-    def __init__(
-        self,
-        criteria: str,
-        score: float,
-        reasoning: str = "",
-        evidence: list[str] | None = None,
-    ) -> None:
-        self.criteria = criteria
-        self.score = score
-        self.reasoning = reasoning
-        self.evidence = evidence or []
-class BaseEval(ABC):
-    @abstractmethod
-    async def run(self, data: Any) -> list[Judgement]:
-        """Execute the evaluation and return a list of judgements."""

synth-ai 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl

Potentially problematic release.

synth-ai 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl