synth-ai 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (153) hide show
  1. synth_ai/__init__.py +13 -13
  2. synth_ai/cli/__init__.py +6 -15
  3. synth_ai/cli/commands/eval/__init__.py +6 -15
  4. synth_ai/cli/commands/eval/config.py +338 -0
  5. synth_ai/cli/commands/eval/core.py +236 -1091
  6. synth_ai/cli/commands/eval/runner.py +704 -0
  7. synth_ai/cli/commands/eval/validation.py +44 -117
  8. synth_ai/cli/commands/filter/core.py +7 -7
  9. synth_ai/cli/commands/filter/validation.py +2 -2
  10. synth_ai/cli/commands/smoke/core.py +7 -17
  11. synth_ai/cli/commands/status/__init__.py +1 -64
  12. synth_ai/cli/commands/status/client.py +50 -151
  13. synth_ai/cli/commands/status/config.py +3 -83
  14. synth_ai/cli/commands/status/errors.py +4 -13
  15. synth_ai/cli/commands/status/subcommands/__init__.py +2 -8
  16. synth_ai/cli/commands/status/subcommands/config.py +13 -0
  17. synth_ai/cli/commands/status/subcommands/files.py +18 -63
  18. synth_ai/cli/commands/status/subcommands/jobs.py +28 -311
  19. synth_ai/cli/commands/status/subcommands/models.py +18 -62
  20. synth_ai/cli/commands/status/subcommands/runs.py +16 -63
  21. synth_ai/cli/commands/status/subcommands/session.py +67 -172
  22. synth_ai/cli/commands/status/subcommands/summary.py +24 -32
  23. synth_ai/cli/commands/status/subcommands/utils.py +41 -0
  24. synth_ai/cli/commands/status/utils.py +16 -107
  25. synth_ai/cli/commands/train/__init__.py +18 -20
  26. synth_ai/cli/commands/train/errors.py +3 -3
  27. synth_ai/cli/commands/train/prompt_learning_validation.py +15 -16
  28. synth_ai/cli/commands/train/validation.py +7 -7
  29. synth_ai/cli/commands/train/{judge_schemas.py → verifier_schemas.py} +33 -34
  30. synth_ai/cli/commands/train/verifier_validation.py +235 -0
  31. synth_ai/cli/demo_apps/demo_task_apps/math/config.toml +0 -1
  32. synth_ai/cli/demo_apps/demo_task_apps/math/modal_task_app.py +2 -6
  33. synth_ai/cli/demo_apps/math/config.toml +0 -1
  34. synth_ai/cli/demo_apps/math/modal_task_app.py +2 -6
  35. synth_ai/cli/demo_apps/mipro/task_app.py +25 -47
  36. synth_ai/cli/lib/apps/task_app.py +12 -13
  37. synth_ai/cli/lib/task_app_discovery.py +6 -6
  38. synth_ai/cli/lib/train_cfgs.py +10 -10
  39. synth_ai/cli/task_apps/__init__.py +11 -0
  40. synth_ai/cli/task_apps/commands.py +7 -15
  41. synth_ai/core/env.py +12 -1
  42. synth_ai/core/errors.py +1 -2
  43. synth_ai/core/integrations/cloudflare.py +209 -33
  44. synth_ai/core/tracing_v3/abstractions.py +46 -0
  45. synth_ai/data/__init__.py +3 -30
  46. synth_ai/data/enums.py +1 -20
  47. synth_ai/data/rewards.py +100 -3
  48. synth_ai/products/graph_evolve/__init__.py +1 -2
  49. synth_ai/products/graph_evolve/config.py +16 -16
  50. synth_ai/products/graph_evolve/converters/__init__.py +3 -3
  51. synth_ai/products/graph_evolve/converters/openai_sft.py +7 -7
  52. synth_ai/products/graph_evolve/examples/hotpotqa/config.toml +1 -1
  53. synth_ai/products/graph_gepa/__init__.py +23 -0
  54. synth_ai/products/graph_gepa/converters/__init__.py +19 -0
  55. synth_ai/products/graph_gepa/converters/openai_sft.py +29 -0
  56. synth_ai/sdk/__init__.py +45 -35
  57. synth_ai/sdk/api/eval/__init__.py +33 -0
  58. synth_ai/sdk/api/eval/job.py +732 -0
  59. synth_ai/sdk/api/research_agent/__init__.py +276 -66
  60. synth_ai/sdk/api/train/builders.py +181 -0
  61. synth_ai/sdk/api/train/cli.py +41 -33
  62. synth_ai/sdk/api/train/configs/__init__.py +6 -4
  63. synth_ai/sdk/api/train/configs/prompt_learning.py +127 -33
  64. synth_ai/sdk/api/train/configs/rl.py +264 -16
  65. synth_ai/sdk/api/train/configs/sft.py +165 -1
  66. synth_ai/sdk/api/train/graph_validators.py +12 -12
  67. synth_ai/sdk/api/train/graphgen.py +169 -51
  68. synth_ai/sdk/api/train/graphgen_models.py +95 -45
  69. synth_ai/sdk/api/train/local_api.py +10 -0
  70. synth_ai/sdk/api/train/pollers.py +36 -0
  71. synth_ai/sdk/api/train/prompt_learning.py +390 -60
  72. synth_ai/sdk/api/train/rl.py +41 -5
  73. synth_ai/sdk/api/train/sft.py +2 -0
  74. synth_ai/sdk/api/train/task_app.py +20 -0
  75. synth_ai/sdk/api/train/validators.py +17 -17
  76. synth_ai/sdk/graphs/completions.py +239 -33
  77. synth_ai/sdk/{judging/schemas.py → graphs/verifier_schemas.py} +23 -23
  78. synth_ai/sdk/learning/__init__.py +35 -5
  79. synth_ai/sdk/learning/context_learning_client.py +531 -0
  80. synth_ai/sdk/learning/context_learning_types.py +294 -0
  81. synth_ai/sdk/learning/prompt_learning_client.py +1 -1
  82. synth_ai/sdk/learning/prompt_learning_types.py +2 -1
  83. synth_ai/sdk/learning/rl/__init__.py +0 -4
  84. synth_ai/sdk/learning/rl/contracts.py +0 -4
  85. synth_ai/sdk/localapi/__init__.py +40 -0
  86. synth_ai/sdk/localapi/apps/__init__.py +28 -0
  87. synth_ai/sdk/localapi/client.py +10 -0
  88. synth_ai/sdk/localapi/contracts.py +10 -0
  89. synth_ai/sdk/localapi/helpers.py +519 -0
  90. synth_ai/sdk/localapi/rollouts.py +93 -0
  91. synth_ai/sdk/localapi/server.py +29 -0
  92. synth_ai/sdk/localapi/template.py +49 -0
  93. synth_ai/sdk/streaming/handlers.py +6 -6
  94. synth_ai/sdk/streaming/streamer.py +10 -6
  95. synth_ai/sdk/task/__init__.py +18 -5
  96. synth_ai/sdk/task/apps/__init__.py +37 -1
  97. synth_ai/sdk/task/client.py +9 -1
  98. synth_ai/sdk/task/config.py +6 -11
  99. synth_ai/sdk/task/contracts.py +137 -95
  100. synth_ai/sdk/task/in_process.py +32 -22
  101. synth_ai/sdk/task/in_process_runner.py +9 -4
  102. synth_ai/sdk/task/rubrics/__init__.py +2 -3
  103. synth_ai/sdk/task/rubrics/loaders.py +4 -4
  104. synth_ai/sdk/task/rubrics/strict.py +3 -4
  105. synth_ai/sdk/task/server.py +76 -16
  106. synth_ai/sdk/task/trace_correlation_helpers.py +190 -139
  107. synth_ai/sdk/task/validators.py +34 -49
  108. synth_ai/sdk/training/__init__.py +7 -16
  109. synth_ai/sdk/tunnels/__init__.py +118 -0
  110. synth_ai/sdk/tunnels/cleanup.py +83 -0
  111. synth_ai/sdk/tunnels/ports.py +120 -0
  112. synth_ai/sdk/tunnels/tunneled_api.py +363 -0
  113. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/METADATA +71 -4
  114. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/RECORD +118 -128
  115. synth_ai/cli/commands/baseline/__init__.py +0 -12
  116. synth_ai/cli/commands/baseline/core.py +0 -636
  117. synth_ai/cli/commands/baseline/list.py +0 -94
  118. synth_ai/cli/commands/eval/errors.py +0 -81
  119. synth_ai/cli/commands/status/formatters.py +0 -164
  120. synth_ai/cli/commands/status/subcommands/pricing.py +0 -23
  121. synth_ai/cli/commands/status/subcommands/usage.py +0 -203
  122. synth_ai/cli/commands/train/judge_validation.py +0 -305
  123. synth_ai/cli/usage.py +0 -159
  124. synth_ai/data/specs.py +0 -36
  125. synth_ai/sdk/api/research_agent/cli.py +0 -428
  126. synth_ai/sdk/api/research_agent/config.py +0 -357
  127. synth_ai/sdk/api/research_agent/job.py +0 -717
  128. synth_ai/sdk/baseline/__init__.py +0 -25
  129. synth_ai/sdk/baseline/config.py +0 -209
  130. synth_ai/sdk/baseline/discovery.py +0 -216
  131. synth_ai/sdk/baseline/execution.py +0 -154
  132. synth_ai/sdk/judging/__init__.py +0 -15
  133. synth_ai/sdk/judging/base.py +0 -24
  134. synth_ai/sdk/judging/client.py +0 -191
  135. synth_ai/sdk/judging/types.py +0 -42
  136. synth_ai/sdk/research_agent/__init__.py +0 -34
  137. synth_ai/sdk/research_agent/container_builder.py +0 -328
  138. synth_ai/sdk/research_agent/container_spec.py +0 -198
  139. synth_ai/sdk/research_agent/defaults.py +0 -34
  140. synth_ai/sdk/research_agent/results_collector.py +0 -69
  141. synth_ai/sdk/specs/__init__.py +0 -46
  142. synth_ai/sdk/specs/dataclasses.py +0 -149
  143. synth_ai/sdk/specs/loader.py +0 -144
  144. synth_ai/sdk/specs/serializer.py +0 -199
  145. synth_ai/sdk/specs/validation.py +0 -250
  146. synth_ai/sdk/tracing/__init__.py +0 -39
  147. synth_ai/sdk/usage/__init__.py +0 -37
  148. synth_ai/sdk/usage/client.py +0 -171
  149. synth_ai/sdk/usage/models.py +0 -261
  150. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/WHEEL +0 -0
  151. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/entry_points.txt +0 -0
  152. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/licenses/LICENSE +0 -0
  153. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,732 @@
1
+ """First-class SDK API for evaluation jobs.
2
+
3
+ This module provides high-level abstractions for running evaluation jobs
4
+ that route through the backend for trace capture and cost tracking.
5
+
6
+ Example:
7
+ from synth_ai.sdk.api.eval import EvalJob, EvalResult
8
+
9
+ job = EvalJob(config)
10
+ job.submit()
11
+
12
+ # progress=True provides built-in status printing:
13
+ # [00:05] running | 3/10 completed
14
+ # [00:10] running | 7/10 completed
15
+ # [00:15] completed | mean_score: 0.85
16
+ result = job.poll_until_complete(progress=True)
17
+
18
+ # Typed result access (not raw dict)
19
+ if result.succeeded:
20
+ print(f"Mean score: {result.mean_score}")
21
+ print(f"Total cost: ${result.total_cost_usd:.4f}")
22
+ for seed_result in result.seed_results:
23
+ print(f" Seed {seed_result['seed']}: {seed_result['score']}")
24
+ elif result.failed:
25
+ print(f"Error: {result.error}")
26
+
27
+ See Also:
28
+ - `synth_ai.cli.commands.eval`: CLI implementation
29
+ - `synth_ai.sdk.api.train.prompt_learning`: Similar pattern for training
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import asyncio
35
+ import os
36
+ import time
37
+ from dataclasses import dataclass, field
38
+ from enum import Enum
39
+ from pathlib import Path
40
+ from typing import Any, Callable, Dict, List, Optional
41
+
42
+ import httpx
43
+
44
+ from synth_ai.core.telemetry import log_info
45
+
46
+
47
class EvalStatus(str, Enum):
    """Lifecycle states reported by the backend for an evaluation job."""

    PENDING = "pending"
    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"

    @classmethod
    def from_string(cls, status: str) -> "EvalStatus":
        """Parse *status* case-insensitively; unknown values map to PENDING."""
        try:
            return cls(status.lower())
        except ValueError:
            return cls.PENDING

    @property
    def is_terminal(self) -> bool:
        """Whether this status is terminal (job won't change further)."""
        return self in (EvalStatus.COMPLETED, EvalStatus.FAILED, EvalStatus.CANCELLED)

    @property
    def is_success(self) -> bool:
        """Whether this status indicates success."""
        return self == EvalStatus.COMPLETED


@dataclass
class EvalResult:
    """Typed result from an evaluation job.

    Provides clean accessors for common fields instead of raw dict access.

    Example:
        >>> result = job.poll_until_complete(progress=True)
        >>> if result.succeeded:
        ...     print(f"Mean score: {result.mean_score:.2%}")
        ...     print(f"Total cost: ${result.total_cost_usd:.4f}")
        >>> else:
        ...     print(f"Failed: {result.error}")
    """

    job_id: str
    status: EvalStatus
    mean_score: Optional[float] = None
    total_tokens: Optional[int] = None
    total_cost_usd: Optional[float] = None
    num_completed: int = 0
    num_total: int = 0
    seed_results: List[Dict[str, Any]] = field(default_factory=list)
    error: Optional[str] = None
    # Full backend payload, kept for fields without a typed accessor.
    raw: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_response(cls, job_id: str, data: Dict[str, Any]) -> "EvalResult":
        """Create a result from an API response dict.

        Summary metrics may live either in a nested ``summary`` dict or inline
        on the top-level payload; the summary value wins when present.
        """
        status_str = data.get("status", "pending")
        status = EvalStatus.from_string(status_str)

        # ``or {}`` also covers an explicit ``"summary": None`` in the payload.
        summary = data.get("summary") or {}
        results_info = data.get("results", {})

        def _metric(key: str) -> Any:
            # Explicit None checks: ``summary.get(key) or data.get(key)`` would
            # discard a legitimate falsy metric (mean score 0.0, 0 tokens).
            value = summary.get(key)
            return value if value is not None else data.get(key)

        mean_score = _metric("mean_score")
        total_tokens = _metric("total_tokens")
        total_cost_usd = _metric("total_cost_usd")

        # Completion progress is only available when "results" is a dict.
        num_completed = results_info.get("completed", 0) if isinstance(results_info, dict) else 0
        num_total = results_info.get("total", 0) if isinstance(results_info, dict) else 0

        # Per-seed results: either a top-level list or nested under "items".
        seed_results = data.get("results", [])
        if isinstance(seed_results, dict):
            seed_results = seed_results.get("items", [])

        return cls(
            job_id=job_id,
            status=status,
            mean_score=mean_score,
            total_tokens=total_tokens,
            total_cost_usd=total_cost_usd,
            num_completed=num_completed,
            num_total=num_total,
            seed_results=list(seed_results) if isinstance(seed_results, list) else [],
            error=data.get("error"),
            raw=data,
        )

    @property
    def succeeded(self) -> bool:
        """Whether the job completed successfully."""
        return self.status.is_success

    @property
    def failed(self) -> bool:
        """Whether the job failed."""
        return self.status == EvalStatus.FAILED

    @property
    def is_terminal(self) -> bool:
        """Whether the job has reached a terminal state."""
        return self.status.is_terminal
152
+
153
+
154
@dataclass
class EvalJobConfig:
    """Configuration for an evaluation job.

    Holds everything needed to submit and run an evaluation job via the
    backend. Validation happens in ``__post_init__``.

    Attributes:
        task_app_url: URL of the task app to evaluate (e.g., "http://localhost:8103").
            Required for job submission. Alias: local_api_url
        backend_url: Base URL of the Synth API backend (e.g., "https://api.usesynth.ai").
            Can also be set via SYNTH_BASE_URL or BACKEND_BASE_URL environment variables.
        api_key: Synth API key for authentication with the backend.
            Can also be set via SYNTH_API_KEY environment variable.
        task_app_api_key: API key for authenticating with the task app.
            Defaults to ENVIRONMENT_API_KEY env var if not provided. Alias: local_api_key
        app_id: Task app identifier (optional, for logging/tracking).
        env_name: Environment name within the task app.
        seeds: List of seeds/indices to evaluate.
        policy_config: Model and provider configuration for the policy.
        env_config: Additional environment configuration.
        concurrency: Maximum number of parallel rollouts (default: 5).
        timeout: Maximum seconds per rollout (default: 600.0).

    Example:
        >>> config = EvalJobConfig(
        ...     task_app_url="http://localhost:8103",
        ...     backend_url="https://api.usesynth.ai",
        ...     api_key="sk_live_...",
        ...     env_name="banking77",
        ...     seeds=[0, 1, 2, 3, 4],
        ...     policy_config={"model": "gpt-4", "provider": "openai"},
        ... )
    """

    task_app_url: str = field(default="")
    backend_url: str = field(default="")
    api_key: str = field(default="")
    task_app_api_key: Optional[str] = None
    app_id: Optional[str] = None
    env_name: Optional[str] = None
    seeds: List[int] = field(default_factory=list)
    policy_config: Dict[str, Any] = field(default_factory=dict)
    env_config: Dict[str, Any] = field(default_factory=dict)
    concurrency: int = 5
    timeout: float = 600.0
    # Aliases for backwards compatibility (not stored, just used in __init__)
    local_api_url: str = field(default="", repr=False)
    local_api_key: Optional[str] = field(default=None, repr=False)

    def __post_init__(self) -> None:
        """Resolve legacy aliases, validate required fields, apply env fallback."""
        # Backwards-compatible aliases: local_api_* fill the canonical
        # task_app_* fields only when those were left empty.
        if self.local_api_url and not self.task_app_url:
            self.task_app_url = self.local_api_url
        if self.local_api_key and not self.task_app_api_key:
            self.task_app_api_key = self.local_api_key

        # Required-field guard table; checked in declaration order so the
        # first missing field wins.
        checks = (
            (self.task_app_url, "task_app_url (or local_api_url) is required"),
            (self.backend_url, "backend_url is required"),
            (self.api_key, "api_key is required"),
            (self.seeds, "seeds list is required and cannot be empty"),
        )
        for value, message in checks:
            if not value:
                raise ValueError(message)

        # Last-resort fallback: task app key from the environment.
        if not self.task_app_api_key:
            self.task_app_api_key = os.environ.get("ENVIRONMENT_API_KEY")
224
+
225
+
226
class EvalJob:
    """High-level SDK class for running evaluation jobs via the backend.

    This class provides a clean API for:
    1. Submitting evaluation jobs to the backend
    2. Polling job status until completion
    3. Retrieving detailed results with metrics, tokens, and costs
    4. Downloading traces for analysis

    The backend routes LLM calls through the inference interceptor, which:
    - Captures traces automatically
    - Tracks token usage
    - Calculates costs based on model pricing

    Example:
        >>> from synth_ai.sdk.api.eval import EvalJob
        >>>
        >>> # Create job from config file
        >>> job = EvalJob.from_config(
        ...     config_path="banking77_eval.toml",
        ...     backend_url="https://api.usesynth.ai",
        ...     api_key=os.environ["SYNTH_API_KEY"],
        ... )
        >>>
        >>> # Submit job
        >>> job_id = job.submit()
        >>> print(f"Job submitted: {job_id}")
        >>>
        >>> # Poll until complete (returns a typed EvalResult, not a dict)
        >>> result = job.poll_until_complete(timeout=1200.0)
        >>> print(f"Mean score: {result.mean_score}")
        >>>
        >>> # Download traces
        >>> job.download_traces("./traces")

    See Also:
        - `PromptLearningJob`: Similar pattern for prompt learning jobs
        - Backend API: POST /api/eval/jobs, GET /api/eval/jobs/{job_id}
    """

    # Default poll settings.
    # NOTE(review): neither constant is read by poll_until_complete, which
    # takes explicit ``timeout``/``interval`` arguments — confirm whether
    # these are dead or used by external callers.
    _POLL_INTERVAL_S = 2.0
    _MAX_POLL_ATTEMPTS = 600  # 20 minutes max

    def __init__(
        self,
        config: EvalJobConfig,
        job_id: Optional[str] = None,
    ) -> None:
        """Initialize an evaluation job.

        Args:
            config: Job configuration with task app URL, seeds, policy, etc.
            job_id: Existing job ID (if resuming a previous job)
        """
        self.config = config
        # None until submit() succeeds (or a job_id was supplied for resume).
        self._job_id = job_id

    @classmethod
    def from_config(
        cls,
        config_path: str | Path,
        backend_url: Optional[str] = None,
        api_key: Optional[str] = None,
        task_app_api_key: Optional[str] = None,
        task_app_url: Optional[str] = None,
        seeds: Optional[List[int]] = None,
    ) -> EvalJob:
        """Create a job from a TOML config file.

        Loads evaluation configuration from a TOML file and allows
        overriding specific values via arguments. Supports both an ``[eval]``
        table and, as a fallback, a ``[prompt_learning]`` table whose GEPA
        sub-tables are mapped onto the eval fields.

        Args:
            config_path: Path to TOML config file
            backend_url: Backend API URL (defaults to SYNTH_BASE_URL /
                BACKEND_BASE_URL env vars, then the production URL)
            api_key: API key (defaults to SYNTH_API_KEY env var)
            task_app_api_key: Task app API key (defaults to ENVIRONMENT_API_KEY)
            task_app_url: Override task app URL from config
            seeds: Override seeds list from config

        Returns:
            EvalJob instance ready for submission

        Raises:
            ValueError: If required config is missing
            FileNotFoundError: If config file doesn't exist

        Example:
            >>> job = EvalJob.from_config(
            ...     "banking77_eval.toml",
            ...     backend_url="https://api.usesynth.ai",
            ...     api_key="sk_live_...",
            ...     seeds=[0, 1, 2],  # Override seeds
            ... )
        """
        import tomllib

        config_path_obj = Path(config_path)
        if not config_path_obj.exists():
            raise FileNotFoundError(f"Config file not found: {config_path}")

        # tomllib requires binary mode.
        with open(config_path_obj, "rb") as f:
            toml_data = tomllib.load(f)

        # Extract eval section (supports both [eval] and [prompt_learning] formats)
        eval_config = toml_data.get("eval", {})
        if not eval_config:
            pl_config = toml_data.get("prompt_learning", {})
            if pl_config:
                # Map the prompt-learning layout onto the flat eval layout.
                eval_config = {
                    "app_id": pl_config.get("task_app_id"),
                    "url": pl_config.get("task_app_url"),
                    "env_name": pl_config.get("gepa", {}).get("env_name"),
                    "seeds": pl_config.get("gepa", {}).get("evaluation", {}).get("seeds", []),
                    "policy_config": pl_config.get("gepa", {}).get("policy", {}),
                }

        # Resolve backend URL: explicit arg > env vars > production default.
        if not backend_url:
            backend_url = os.environ.get("SYNTH_BASE_URL") or os.environ.get("BACKEND_BASE_URL")
        if not backend_url:
            backend_url = "https://api.usesynth.ai"

        # Resolve API key: explicit arg > env var; required.
        if not api_key:
            api_key = os.environ.get("SYNTH_API_KEY")
        if not api_key:
            raise ValueError("api_key is required (provide explicitly or set SYNTH_API_KEY env var)")

        # Build config with overrides; "url" and "task_app_url" keys both accepted.
        final_task_app_url = task_app_url or eval_config.get("url") or eval_config.get("task_app_url")
        if not final_task_app_url:
            raise ValueError("task_app_url is required (in config or as argument)")

        final_seeds = seeds or eval_config.get("seeds", [])
        if not final_seeds:
            raise ValueError("seeds list is required (in config or as argument)")

        config = EvalJobConfig(
            task_app_url=final_task_app_url,
            backend_url=backend_url,
            api_key=api_key,
            task_app_api_key=task_app_api_key,
            app_id=eval_config.get("app_id"),
            env_name=eval_config.get("env_name"),
            seeds=list(final_seeds),
            policy_config=eval_config.get("policy_config", {}),
            env_config=eval_config.get("env_config", {}),
            concurrency=eval_config.get("concurrency", 5),
            timeout=eval_config.get("timeout", 600.0),
        )

        return cls(config)

    @classmethod
    def from_job_id(
        cls,
        job_id: str,
        backend_url: Optional[str] = None,
        api_key: Optional[str] = None,
    ) -> EvalJob:
        """Resume an existing job by ID.

        Use this to check status or get results of a previously submitted job.
        The returned instance carries a placeholder config (task_app_url and
        seeds are dummies), so it supports status/results/traces calls but
        must not be submit()-ed again.

        Args:
            job_id: Existing job ID (e.g., "eval-abc123")
            backend_url: Backend API URL (defaults to env or production)
            api_key: API key (defaults to SYNTH_API_KEY env var)

        Returns:
            EvalJob instance for the existing job

        Example:
            >>> job = EvalJob.from_job_id("eval-abc123")
            >>> status = job.get_status()
            >>> if status["status"] == "completed":
            ...     results = job.get_results()
        """
        # Resolve backend URL: explicit arg > env vars > production default.
        if not backend_url:
            backend_url = os.environ.get("SYNTH_BASE_URL") or os.environ.get("BACKEND_BASE_URL")
        if not backend_url:
            backend_url = "https://api.usesynth.ai"

        # Resolve API key: explicit arg > env var; required.
        if not api_key:
            api_key = os.environ.get("SYNTH_API_KEY")
        if not api_key:
            raise ValueError("api_key is required (provide explicitly or set SYNTH_API_KEY env var)")

        # Create minimal config for resumed job
        config = EvalJobConfig(
            task_app_url="resumed",  # Placeholder - not needed for status/results
            backend_url=backend_url,
            api_key=api_key,
            seeds=[0],  # Placeholder
        )

        return cls(config, job_id=job_id)

    def _base_url(self) -> str:
        """Get normalized base URL for API calls (always ends in ``/api``)."""
        base = self.config.backend_url.rstrip("/")
        if not base.endswith("/api"):
            base = f"{base}/api"
        return base

    def _headers(self) -> Dict[str, str]:
        """Get bearer-auth JSON headers for backend API calls."""
        return {
            "Authorization": f"Bearer {self.config.api_key}",
            "Content-Type": "application/json",
        }

    def submit(self) -> str:
        """Submit the job to the backend.

        Creates an eval job on the backend which will:
        1. Route LLM calls through the inference interceptor
        2. Capture traces and token usage
        3. Calculate costs based on model pricing

        Returns:
            Job ID (e.g., "eval-abc123")

        Raises:
            RuntimeError: If job submission fails or job already submitted
            ValueError: If configuration is invalid

        Example:
            >>> job = EvalJob.from_config("eval.toml")
            >>> job_id = job.submit()
            >>> print(f"Submitted: {job_id}")
        """
        ctx: Dict[str, Any] = {"task_app_url": self.config.task_app_url}
        log_info("EvalJob.submit invoked", ctx=ctx)

        # Guard against double-submission (including resumed jobs).
        if self._job_id:
            raise RuntimeError(f"Job already submitted: {self._job_id}")

        # Build job request payload
        policy = dict(self.config.policy_config)

        job_request = {
            "task_app_url": self.config.task_app_url,
            "task_app_api_key": self.config.task_app_api_key,
            "app_id": self.config.app_id,
            "env_name": self.config.env_name,
            "seeds": self.config.seeds,
            "policy": policy,
            "env_config": self.config.env_config,
            "max_concurrent": self.config.concurrency,
            "timeout": self.config.timeout,
        }

        # Submit synchronously using httpx
        url = f"{self._base_url()}/eval/jobs"

        with httpx.Client(timeout=httpx.Timeout(30.0)) as client:
            resp = client.post(url, json=job_request, headers=self._headers())

        if resp.status_code not in (200, 201):
            # Body is truncated to keep the exception message bounded.
            raise RuntimeError(
                f"Job submission failed with status {resp.status_code}: {resp.text[:500]}"
            )

        job_data = resp.json()
        job_id = job_data.get("job_id")
        if not job_id:
            raise RuntimeError(f"No job_id in response: {job_data}")

        self._job_id = job_id
        ctx["job_id"] = job_id
        log_info("EvalJob.submit completed", ctx=ctx)
        return job_id

    @property
    def job_id(self) -> Optional[str]:
        """Get the job ID (None if not yet submitted)."""
        return self._job_id

    def get_status(self) -> Dict[str, Any]:
        """Get current job status.

        Returns:
            Job status dictionary with keys:
            - job_id: Job identifier
            - status: "running", "completed", or "failed"
            - error: Error message if failed
            - created_at, started_at, completed_at: Timestamps
            - config: Original job configuration
            - results: Summary results if completed
            (keys as documented by the backend API — not validated here)

        Raises:
            RuntimeError: If job hasn't been submitted yet, or the backend
                returns a non-200 response

        Example:
            >>> status = job.get_status()
            >>> print(f"Status: {status['status']}")
            >>> if status["status"] == "completed":
            ...     print(f"Mean score: {status['results']['mean_score']}")
        """
        if not self._job_id:
            raise RuntimeError("Job not yet submitted. Call submit() first.")

        url = f"{self._base_url()}/eval/jobs/{self._job_id}"

        with httpx.Client(timeout=httpx.Timeout(30.0)) as client:
            resp = client.get(url, headers=self._headers())

        if resp.status_code != 200:
            raise RuntimeError(f"Failed to get status: {resp.status_code} {resp.text}")

        return resp.json()

    def poll_until_complete(
        self,
        *,
        timeout: float = 1200.0,
        interval: float = 2.0,
        progress: bool = False,
        on_status: Optional[Callable[[Dict[str, Any]], None]] = None,
    ) -> EvalResult:
        """Poll job until it reaches a terminal state, then return results.

        Polls the backend until the job completes or fails, then fetches
        and returns the detailed results. Transient polling errors (including
        non-200 responses from get_status) are logged and retried until the
        timeout elapses.

        Args:
            timeout: Maximum seconds to wait (default: 1200 = 20 minutes)
            interval: Seconds between poll attempts (default: 2)
            progress: If True, print status updates during polling (useful for notebooks)
            on_status: Optional callback called on each status update (for custom progress handling)

        Returns:
            EvalResult with typed status, mean_score, seed_results, etc.
            If the timeout elapses first, an EvalResult built from the last
            status snapshot is returned (its status may be non-terminal) —
            no TimeoutError is raised.

        Raises:
            RuntimeError: If job hasn't been submitted yet

        Example:
            >>> result = job.poll_until_complete(progress=True)
            [00:05] running | 3/10 completed
            [00:10] running | 7/10 completed
            [00:15] completed | mean_score: 0.85
            >>> result.succeeded
            True
            >>> result.mean_score
            0.85
        """
        if not self._job_id:
            raise RuntimeError("Job not yet submitted. Call submit() first.")

        job_id = self._job_id
        start_time = time.time()
        # Last successful status payload; used for the timeout return path.
        last_data: Dict[str, Any] = {}

        while True:
            elapsed = time.time() - start_time
            if elapsed >= timeout:
                if progress:
                    print(f"[poll] timeout after {timeout:.0f}s")
                # Return with whatever data we have
                return EvalResult.from_response(job_id, last_data)

            try:
                status_data = self.get_status()
                last_data = status_data

                status = EvalStatus.from_string(status_data.get("status", "pending"))

                # Extract progress info
                results_info = status_data.get("results", {})
                completed = results_info.get("completed", 0) if isinstance(results_info, dict) else 0
                total = results_info.get("total", len(self.config.seeds)) if isinstance(results_info, dict) else len(self.config.seeds)

                # Progress output
                if progress:
                    mins, secs = divmod(int(elapsed), 60)
                    if status.is_terminal:
                        # Get final results for mean_score
                        try:
                            final_results = self.get_results()
                            mean_score = final_results.get("summary", {}).get("mean_score")
                            score_str = f"mean_score: {mean_score:.2f}" if mean_score is not None else ""
                            print(f"[{mins:02d}:{secs:02d}] {status.value} | {score_str}")
                            # Use final results for the return value
                            last_data = final_results
                        except Exception:
                            # Results endpoint may not be ready; fall back to status line.
                            print(f"[{mins:02d}:{secs:02d}] {status.value}")
                    else:
                        print(f"[{mins:02d}:{secs:02d}] {status.value} | {completed}/{total} completed")

                # Callback for custom handling
                if on_status:
                    on_status(status_data)

                # Check terminal state
                if status.is_terminal:
                    # Fetch full results if completed
                    if status == EvalStatus.COMPLETED:
                        try:
                            final_results = self.get_results()
                            return EvalResult.from_response(job_id, final_results)
                        except Exception:
                            # Best effort: fall back to the status snapshot below.
                            pass
                    return EvalResult.from_response(job_id, last_data)

            except Exception as exc:
                # Broad by design: any polling hiccup is retried until timeout.
                if progress:
                    print(f"[poll] error: {exc}")
                log_info("poll request failed", ctx={"error": str(exc), "job_id": job_id})

            time.sleep(interval)

    def get_results(self) -> Dict[str, Any]:
        """Get detailed job results.

        Fetches the full results including per-seed scores, tokens, and costs.

        Returns:
            Results dictionary with:
            - job_id: Job identifier
            - status: Job status
            - summary: Aggregate metrics
                - mean_score: Average score across seeds
                - total_tokens: Total token usage
                - total_cost_usd: Total cost
                - num_seeds: Number of seeds evaluated
                - num_successful: Seeds that completed
                - num_failed: Seeds that failed
            - results: List of per-seed results
                - seed: Seed number
                - score: Evaluation score
                - tokens: Token count
                - cost_usd: Cost for this seed
                - latency_ms: Execution time
                - error: Error message if failed
            (keys as documented by the backend API — not validated here)

        Raises:
            RuntimeError: If job hasn't been submitted yet, or the backend
                returns a non-200 response

        Example:
            >>> results = job.get_results()
            >>> for r in results["results"]:
            ...     print(f"Seed {r['seed']}: score={r['score']}, tokens={r['tokens']}")
        """
        if not self._job_id:
            raise RuntimeError("Job not yet submitted. Call submit() first.")

        url = f"{self._base_url()}/eval/jobs/{self._job_id}/results"

        with httpx.Client(timeout=httpx.Timeout(30.0)) as client:
            resp = client.get(url, headers=self._headers())

        if resp.status_code != 200:
            raise RuntimeError(f"Failed to get results: {resp.status_code} {resp.text}")

        return resp.json()

    def download_traces(self, output_dir: str | Path) -> Path:
        """Download traces for the job to a directory.

        Downloads the traces ZIP file from the backend and extracts
        it to the specified directory (created if missing).

        Args:
            output_dir: Directory to extract traces to

        Returns:
            Path to the output directory

        Raises:
            RuntimeError: If job hasn't been submitted or download fails

        Example:
            >>> traces_dir = job.download_traces("./traces")
            >>> for trace_file in traces_dir.glob("*.json"):
            ...     print(f"Trace: {trace_file}")
        """
        import io
        import zipfile

        if not self._job_id:
            raise RuntimeError("Job not yet submitted. Call submit() first.")

        url = f"{self._base_url()}/eval/jobs/{self._job_id}/traces"
        output_path = Path(output_dir)

        # Longer timeout than status calls: the ZIP payload can be large.
        with httpx.Client(timeout=httpx.Timeout(60.0)) as client:
            resp = client.get(url, headers=self._headers())

        if resp.status_code != 200:
            raise RuntimeError(f"Failed to download traces: {resp.status_code} {resp.text}")

        output_path.mkdir(parents=True, exist_ok=True)

        # Extract in-memory: the ZIP bytes never touch disk.
        with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
            zf.extractall(output_path)

        return output_path
730
+
731
+
732
# Explicit public API of this module (consumed by ``import *`` and docs tooling).
__all__ = ["EvalJob", "EvalJobConfig", "EvalResult", "EvalStatus"]