synth-ai 0.2.10__py3-none-any.whl → 0.2.13.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of synth-ai has been flagged as potentially problematic in its registry listing.
- examples/agora_ex/README_MoE.md +224 -0
- examples/agora_ex/__init__.py +7 -0
- examples/agora_ex/agora_ex.py +65 -0
- examples/agora_ex/agora_ex_task_app.py +590 -0
- examples/agora_ex/configs/rl_lora_qwen3_moe_2xh200.toml +121 -0
- examples/agora_ex/reward_fn_grpo-human.py +129 -0
- examples/agora_ex/system_prompt_CURRENT.md +63 -0
- examples/agora_ex/task_app/agora_ex_task_app.py +590 -0
- examples/agora_ex/task_app/reward_fn_grpo-human.py +129 -0
- examples/agora_ex/task_app/system_prompt_CURRENT.md +63 -0
- examples/multi_step/configs/crafter_rl_outcome.toml +74 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +175 -0
- examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +83 -0
- examples/multi_step/configs/crafter_rl_stepwise_simple.toml +78 -0
- examples/multi_step/crafter_rl_lora.md +51 -10
- examples/multi_step/sse_metrics_streaming_notes.md +357 -0
- examples/multi_step/task_app_config_notes.md +494 -0
- examples/warming_up_to_rl/configs/eval_stepwise_complex.toml +35 -0
- examples/warming_up_to_rl/configs/eval_stepwise_consistent.toml +26 -0
- examples/warming_up_to_rl/configs/eval_stepwise_per_achievement.toml +36 -0
- examples/warming_up_to_rl/configs/eval_stepwise_simple.toml +32 -0
- examples/warming_up_to_rl/run_eval.py +267 -41
- examples/warming_up_to_rl/task_app/grpo_crafter.py +3 -33
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +109 -45
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +42 -46
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +376 -193
- synth_ai/__init__.py +41 -1
- synth_ai/api/train/builders.py +74 -33
- synth_ai/api/train/cli.py +29 -6
- synth_ai/api/train/configs/__init__.py +44 -0
- synth_ai/api/train/configs/rl.py +133 -0
- synth_ai/api/train/configs/sft.py +94 -0
- synth_ai/api/train/configs/shared.py +24 -0
- synth_ai/api/train/env_resolver.py +18 -19
- synth_ai/api/train/supported_algos.py +8 -5
- synth_ai/api/train/utils.py +6 -1
- synth_ai/cli/__init__.py +4 -2
- synth_ai/cli/_storage.py +19 -0
- synth_ai/cli/balance.py +14 -2
- synth_ai/cli/calc.py +37 -22
- synth_ai/cli/demo.py +38 -39
- synth_ai/cli/legacy_root_backup.py +12 -14
- synth_ai/cli/recent.py +12 -7
- synth_ai/cli/rl_demo.py +81 -102
- synth_ai/cli/status.py +4 -3
- synth_ai/cli/task_apps.py +146 -137
- synth_ai/cli/traces.py +4 -3
- synth_ai/cli/watch.py +3 -2
- synth_ai/demos/core/cli.py +121 -159
- synth_ai/environments/examples/crafter_classic/environment.py +16 -0
- synth_ai/evals/__init__.py +15 -0
- synth_ai/evals/client.py +85 -0
- synth_ai/evals/types.py +42 -0
- synth_ai/jobs/client.py +15 -3
- synth_ai/judge_schemas.py +127 -0
- synth_ai/rubrics/__init__.py +22 -0
- synth_ai/rubrics/validators.py +126 -0
- synth_ai/task/server.py +14 -7
- synth_ai/tracing_v3/decorators.py +51 -26
- synth_ai/tracing_v3/examples/basic_usage.py +12 -7
- synth_ai/tracing_v3/llm_call_record_helpers.py +107 -53
- synth_ai/tracing_v3/replica_sync.py +8 -4
- synth_ai/tracing_v3/serialization.py +130 -0
- synth_ai/tracing_v3/storage/utils.py +11 -9
- synth_ai/tracing_v3/turso/__init__.py +12 -0
- synth_ai/tracing_v3/turso/daemon.py +2 -1
- synth_ai/tracing_v3/turso/native_manager.py +28 -15
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/METADATA +4 -2
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/RECORD +73 -40
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/entry_points.txt +0 -1
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.10.dist-info → synth_ai-0.2.13.dev1.dist-info}/top_level.txt +0 -0
synth_ai/judge_schemas.py
ADDED
@@ -0,0 +1,127 @@
+"""
+Judge API Contract Schemas
+
+These schemas define the expected structure for requests and responses
+to the judge scoring endpoint at POST /api/judge/v1/score.
+
+This is the canonical contract that the backend MUST conform to.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List, Literal, Optional
+
+from pydantic import BaseModel, Field
+
+
+class CriterionScorePayload(BaseModel):
+    """Per-criterion score returned by the judge."""
+
+    score: float = Field(..., description="Numeric score for this criterion")
+    reason: str = Field(default="", description="Explanation for the score")
+    weight: float = Field(default=1.0, description="Weight of this criterion")
+    description: str = Field(default="", description="Description of the criterion")
+
+
+class ReviewPayload(BaseModel):
+    """Rubric review (event-level or outcome-level)."""
+
+    criteria: Dict[str, CriterionScorePayload] = Field(
+        default_factory=dict,
+        description="Map of criterion keys to their scores"
+    )
+    total: float = Field(default=0.0, description="Aggregated total score")
+    summary: Optional[str] = Field(None, description="Optional text summary")
+
+
+class JudgeScoreResponse(BaseModel):
+    """
+    Response body for POST /api/judge/v1/score.
+
+    This is the canonical contract that judge backends MUST return.
+    """
+
+    status: Literal["ok", "failed"] = Field(default="ok", description="Request status")
+    event_reviews: List[ReviewPayload] = Field(
+        default_factory=list,
+        description="List of per-event rubric reviews (one per step)"
+    )
+    outcome_review: Optional[ReviewPayload] = Field(
+        None,
+        description="Optional outcome-level rubric review"
+    )
+    event_totals: List[float] = Field(
+        default_factory=list,
+        description="List of aggregated scores per event (matches event_reviews length)"
+    )
+    details: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Additional details (provider, latency, etc.)"
+    )
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Request metadata (provider, options, etc.)"
+    )
+
+    def aggregate_event_reward(self) -> float | None:
+        """
+        Aggregate all event totals into a single reward.
+
+        Returns:
+            Sum of all event_totals, or None if empty
+        """
+        if not self.event_totals:
+            return None
+        return sum(self.event_totals)
+
+    def aggregate_outcome_reward(self) -> float | None:
+        """
+        Extract outcome reward from outcome_review.
+
+        Returns:
+            outcome_review.total, or None if no outcome review
+        """
+        if self.outcome_review is None:
+            return None
+        return self.outcome_review.total
+
+
+# Request schemas for completeness
+
+class JudgeTaskApp(BaseModel):
+    """Task application metadata."""
+
+    id: str = Field(..., description="Task app identifier")
+    base_url: Optional[str] = Field(None, description="Optional base URL for task app")
+
+
+class JudgeOptions(BaseModel):
+    """Judge provider and configuration options."""
+
+    provider: Optional[str] = Field(None, description="Judge provider (e.g., 'openai', 'groq')")
+    model: Optional[str] = Field(None, description="Model identifier")
+    rubric_id: Optional[str] = Field(None, description="Rubric identifier")
+    event: bool = Field(True, description="Enable event-level judging")
+    outcome: bool = Field(True, description="Enable outcome-level judging")
+
+
+class JudgeTracePayload(BaseModel):
+    """Trace payload containing trajectory context."""
+
+    event_history: List[Dict[str, Any]] = Field(..., description="List of events/steps")
+    markov_blanket_message_history: List[Dict[str, Any]] = Field(
+        default_factory=list,
+        description="Optional message history for context"
+    )
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Trace metadata")
+
+
+class JudgeScoreRequest(BaseModel):
+    """Request body for POST /api/judge/v1/score."""
+
+    policy_name: str = Field(..., description="Name of the policy being evaluated")
+    task_app: JudgeTaskApp = Field(..., description="Task application metadata")
+    trace: JudgeTracePayload = Field(..., description="Trajectory trace to evaluate")
+    options: JudgeOptions = Field(default_factory=lambda: JudgeOptions(), description="Judge options")
+    rubric: Optional[Dict[str, Any]] = Field(None, description="Optional explicit rubric criteria")
+
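For orientation, here is a minimal usage sketch (not part of the diff; the payload values are invented) showing how a client could parse a judge response with these schemas and apply the aggregation helpers:

```python
# Hypothetical consumer of the schemas above; payload values are illustrative.
from synth_ai.judge_schemas import JudgeScoreResponse

payload = {
    "status": "ok",
    "event_reviews": [
        {"criteria": {"progress": {"score": 0.5}}, "total": 0.5},
        {"criteria": {"progress": {"score": 1.0}}, "total": 1.0},
    ],
    "outcome_review": {"criteria": {}, "total": 0.75},
    "event_totals": [0.5, 1.0],
}

response = JudgeScoreResponse.model_validate(payload)
assert response.aggregate_event_reward() == 1.5     # sum of event_totals
assert response.aggregate_outcome_reward() == 0.75  # outcome_review.total
```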
synth_ai/rubrics/__init__.py
ADDED
@@ -0,0 +1,22 @@
+"""
+Rubric utilities.
+
+Exposes helpers for validating rubric specifications that are used across
+Crafter-style judge configurations.
+"""
+
+from .validators import (
+    RubricCriterion,
+    RubricSpec,
+    ValidationError,
+    validate_rubric_dict,
+    validate_rubric_file,
+)
+
+__all__ = [
+    "RubricCriterion",
+    "RubricSpec",
+    "ValidationError",
+    "validate_rubric_dict",
+    "validate_rubric_file",
+]
synth_ai/rubrics/validators.py
ADDED
@@ -0,0 +1,126 @@
+from __future__ import annotations
+
+import json
+import math
+from pathlib import Path
+from typing import Any, Iterable, Literal
+
+import pydantic
+
+
+class RubricCriterion(pydantic.BaseModel):
+    """Single scoring criterion within a rubric."""
+
+    id: str
+    description: str
+    weight: float
+    scale: str | None = None
+
+    @pydantic.field_validator("weight")
+    @classmethod
+    def _validate_weight(cls, value: float) -> float:
+        if not math.isfinite(value):
+            raise ValueError("weight must be a finite number")
+        if value <= 0.0:
+            raise ValueError("weight must be positive")
+        if value > 1.0:
+            raise ValueError("weight must be <= 1.0")
+        return value
+
+    @pydantic.field_validator("id", "description", mode="before")
+    @classmethod
+    def _strip_string(cls, value: Any) -> Any:
+        if isinstance(value, str):
+            return value.strip()
+        return value
+
+
+class RubricSpec(pydantic.BaseModel):
+    """High-level rubric definition used by step-wise judges."""
+
+    version: str
+    goal_text: str
+    aggregation: Literal["weighted_sum"]
+    criteria: list[RubricCriterion]
+
+    @pydantic.model_validator(mode="after")
+    def _validate_weights(self) -> "RubricSpec":
+        if not self.criteria:
+            raise ValueError("rubric must declare at least one criterion")
+        total_weight = sum(criterion.weight for criterion in self.criteria)
+        if not math.isclose(total_weight, 1.0, abs_tol=1e-6, rel_tol=1e-6):
+            raise ValueError(
+                f"criterion weights must sum to 1 (got {total_weight:.6f})"
+            )
+        return self
+
+    @pydantic.field_validator("version")
+    @classmethod
+    def _non_empty_version(cls, value: str) -> str:
+        value = value.strip()
+        if not value:
+            raise ValueError("version string must not be empty")
+        return value
+
+    @pydantic.field_validator("goal_text")
+    @classmethod
+    def _non_empty_goal_text(cls, value: str) -> str:
+        value = value.strip()
+        if not value:
+            raise ValueError("goal_text must not be empty")
+        return value
+
+
+ValidationError = pydantic.ValidationError
+
+
+def validate_rubric_dict(payload: dict[str, Any]) -> RubricSpec:
+    """
+    Validate an in-memory rubric payload and return the parsed model.
+
+    Args:
+        payload: Dictionary representing the rubric JSON.
+    Returns:
+        Validated RubricSpec instance.
+    Raises:
+        ValidationError: If the payload is missing required fields or contains
+            invalid weights.
+    """
+
+    if not isinstance(payload, dict):
+        raise TypeError("rubric payload must be a dictionary")
+    return RubricSpec.model_validate(payload)
+
+
+def _load_payload_from_file(path: Path) -> dict[str, Any]:
+    if path.suffix.lower() != ".json":
+        raise ValueError(f"Unsupported rubric file type: {path}")
+    text = path.read_text(encoding="utf-8")
+    return json.loads(text)
+
+
+def validate_rubric_file(path: Path) -> RubricSpec:
+    """
+    Load and validate a rubric file.
+
+    Args:
+        path: Path to a JSON rubric document.
+    Returns:
+        Validated RubricSpec instance.
+    """
+
+    payload = _load_payload_from_file(path)
+    return validate_rubric_dict(payload)
+
+
+def validate_rubric_files(paths: Iterable[Path]) -> list[RubricSpec]:
+    """
+    Validate multiple rubric files and return their parsed models.
+
+    Useful for bulk validation inside tests or CI checks.
+    """
+
+    validated: list[RubricSpec] = []
+    for path in paths:
+        validated.append(validate_rubric_file(path))
+    return validated
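A quick validation sketch (assumed usage based on the module above; the rubric content is invented): each weight must be positive and at most 1.0, and the weights together must sum to 1, so dropping a criterion makes validation fail.

```python
# Exercises validate_rubric_dict from the new module.
from synth_ai.rubrics import ValidationError, validate_rubric_dict

rubric = {
    "version": "1",
    "goal_text": "Collect wood and craft a table",
    "aggregation": "weighted_sum",
    "criteria": [
        {"id": "progress", "description": "Moved toward the goal", "weight": 0.7},
        {"id": "efficiency", "description": "Avoided wasted steps", "weight": 0.3},
    ],
}

spec = validate_rubric_dict(rubric)  # returns a parsed RubricSpec
print(spec.criteria[0].id)           # -> progress

try:
    validate_rubric_dict({**rubric, "criteria": rubric["criteria"][:1]})
except ValidationError as exc:       # weights now sum to 0.7, not 1.0
    print(exc.errors()[0]["msg"])
```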
synth_ai/task/server.py
CHANGED
@@ -6,6 +6,7 @@ import asyncio
 import inspect
 import os
 from collections.abc import Awaitable, Callable, Iterable, Mapping, MutableMapping, Sequence
+from contextlib import asynccontextmanager
 from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any
@@ -34,6 +35,10 @@ InstanceProvider = Callable[[Sequence[int]], Iterable[TaskInfo] | Awaitable[Iter
 RolloutExecutor = Callable[[RolloutRequest, Request], Any | Awaitable[Any]]
 
 
+def _default_app_state() -> dict[str, Any]:
+    return {}
+
+
 @dataclass(slots=True)
 class RubricBundle:
     """Optional rubrics advertised by the task app."""
@@ -69,7 +74,7 @@ class TaskAppConfig:
     proxy: ProxyConfig | None = None
     routers: Sequence[APIRouter] = field(default_factory=tuple)
     middleware: Sequence[Middleware] = field(default_factory=tuple)
-    app_state:
+    app_state: MutableMapping[str, Any] = field(default_factory=_default_app_state)
     require_api_key: bool = True
     expose_debug_env: bool = True
     cors_origins: Sequence[str] | None = None
@@ -260,17 +265,19 @@ def create_task_app(config: TaskAppConfig) -> FastAPI:
            return _maybe_await(hook(app))  # type: ignore[misc]
        return _maybe_await(hook())
 
-    @
-    async def
+    @asynccontextmanager
+    async def lifespan(_: FastAPI):
        normalize_environment_api_key()
        normalize_vendor_keys()
        for hook in cfg.startup_hooks:
            await _call_hook(hook)
+        try:
+            yield
+        finally:
+            for hook in cfg.shutdown_hooks:
+                await _call_hook(hook)
 
-
-    async def _shutdown() -> None:  # pragma: no cover - FastAPI lifecycle
-        for hook in cfg.shutdown_hooks:
-            await _call_hook(hook)
+    app.router.lifespan_context = lifespan
 
    @app.get("/")
    async def root() -> Mapping[str, Any]:
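The change above consolidates the separate startup and shutdown handlers into a single lifespan context manager attached to the existing router. A generic standalone sketch of the same pattern (the print statements stand in for the real hooks):

```python
# Minimal sketch of the lifespan pattern adopted in server.py: startup work
# runs before `yield`, and shutdown hooks run in `finally` even on failure.
from contextlib import asynccontextmanager

from fastapi import FastAPI

app = FastAPI()

@asynccontextmanager
async def lifespan(_: FastAPI):
    print("startup: normalize keys, run startup hooks")
    try:
        yield  # the application serves requests while suspended here
    finally:
        print("shutdown: run shutdown hooks")

# Attach after construction, as create_task_app now does:
app.router.lifespan_context = lifespan
```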
synth_ai/tracing_v3/decorators.py
CHANGED
@@ -28,8 +28,8 @@ import asyncio
 import contextvars
 import functools
 import time
-from collections.abc import Callable
-from typing import Any, TypeVar
+from collections.abc import Awaitable, Callable, Mapping
+from typing import Any, TypeVar, cast, overload
 
 from .abstractions import LMCAISEvent, TimeRecord
 from .utils import calculate_cost, detect_provider
@@ -88,6 +88,16 @@ def get_session_tracer() -> Any:
 T = TypeVar("T")
 
 
+@overload
+def with_session(require: bool = True) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
+    ...
+
+
+@overload
+def with_session(require: bool = True) -> Callable[[Callable[..., T]], Callable[..., T]]:
+    ...
+
+
 def with_session(require: bool = True):
     """Decorator that ensures a session is active.
 
@@ -109,29 +119,31 @@ def with_session(require: bool = True):
     ```
     """
 
-    def decorator(fn: Callable[..., T]) -> Callable[..., T]:
+    def decorator(fn: Callable[..., Awaitable[T]] | Callable[..., T]) -> Callable[..., Awaitable[T]] | Callable[..., T]:
         if asyncio.iscoroutinefunction(fn):
 
             @functools.wraps(fn)
-            async def async_wrapper(*args, **kwargs):
+            async def async_wrapper(*args: Any, **kwargs: Any) -> T:
                 session_id = get_session_id()
                 if require and session_id is None:
                     raise RuntimeError(
                         f"No active session for {getattr(fn, '__name__', 'unknown')}"
                     )
-
+                async_fn = cast(Callable[..., Awaitable[T]], fn)
+                return await async_fn(*args, **kwargs)
 
             return async_wrapper
         else:
 
             @functools.wraps(fn)
-            def sync_wrapper(*args, **kwargs):
+            def sync_wrapper(*args: Any, **kwargs: Any) -> T:
                 session_id = get_session_id()
                 if require and session_id is None:
                     raise RuntimeError(
                         f"No active session for {getattr(fn, '__name__', 'unknown')}"
                     )
-
+                sync_fn = cast(Callable[..., T], fn)
+                return sync_fn(*args, **kwargs)
 
             return sync_wrapper
 
@@ -172,31 +184,36 @@ def trace_llm_call(
     ```
     """
 
-    def decorator(fn: Callable[..., T]) -> Callable[..., T]:
+    def decorator(fn: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
         if asyncio.iscoroutinefunction(fn):
+            async_fn: Callable[..., Awaitable[T]] = fn
 
             @functools.wraps(fn)
-            async def async_wrapper(*args, **kwargs):
+            async def async_wrapper(*args: Any, **kwargs: Any) -> T:
                 tracer = get_session_tracer()
                 if not tracer:
-                    return await
+                    return await async_fn(*args, **kwargs)
 
                 start_time = time.time()
                 system_state_before = kwargs.get("state_before", {})
 
                 try:
-                    result = await
+                    result = await async_fn(*args, **kwargs)
 
                     # Extract metrics from result - this assumes the result follows
                     # common LLM API response formats (OpenAI, Anthropic, etc.)
-
-
-
-
-
-
-
-
+                    input_tokens = output_tokens = total_tokens = None
+                    actual_model = model_name
+                    if extract_tokens and isinstance(result, Mapping):
+                        result_mapping = cast(Mapping[str, Any], result)
+                        usage = result_mapping.get("usage")
+                        if isinstance(usage, Mapping):
+                            input_tokens = usage.get("prompt_tokens")
+                            output_tokens = usage.get("completion_tokens")
+                            total_tokens = usage.get("total_tokens")
+                        value = result_mapping.get("model")
+                        if isinstance(value, str):
+                            actual_model = value
 
                     latency_ms = int((time.time() - start_time) * 1000)
 
@@ -272,19 +289,26 @@ def trace_method(event_type: str = "runtime", system_id: str | None = None):
     ```
     """
 
-    def decorator(
+    def decorator(
+        fn: Callable[..., Awaitable[T]] | Callable[..., T]
+    ) -> Callable[..., Awaitable[T]] | Callable[..., T]:
         if asyncio.iscoroutinefunction(fn):
+            async_fn = cast(Callable[..., Awaitable[T]], fn)
 
             @functools.wraps(fn)
-            async def async_wrapper(
+            async def async_wrapper(*args: Any, **kwargs: Any) -> T:
                 tracer = get_session_tracer()
                 if not tracer:
-                    return await
+                    return await async_fn(*args, **kwargs)
 
                 from .abstractions import RuntimeEvent
 
                 # Use class name as system_id if not provided
-
+                self_obj = args[0] if args else None
+                inferred_system_id = (
+                    self_obj.__class__.__name__ if self_obj is not None else "unknown"
+                )
+                actual_system_id = system_id or inferred_system_id
 
                 event = RuntimeEvent(
                     system_instance_id=actual_system_id,
@@ -298,17 +322,18 @@ def trace_method(event_type: str = "runtime", system_id: str | None = None):
                 )
 
                 await tracer.record_event(event)
-                return await
+                return await async_fn(*args, **kwargs)
 
             return async_wrapper
         else:
 
             @functools.wraps(fn)
-            def sync_wrapper(
+            def sync_wrapper(*args: Any, **kwargs: Any) -> T:
                 # For sync methods, we can't easily trace without blocking
                 # the event loop. This is a limitation of the async-first design.
                 # Consider converting to async or using a different approach
-
+                sync_fn = cast(Callable[..., T], fn)
+                return sync_fn(*args, **kwargs)
 
             return sync_wrapper
 
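The decorator hunks above share one typing pattern: `@overload` advertises both sync and async signatures, `asyncio.iscoroutinefunction` picks the branch at runtime, and `cast` narrows `fn` so each wrapper can call it precisely. A condensed standalone sketch of that pattern (illustrative names, not the library's API):

```python
import asyncio
import functools
from collections.abc import Awaitable, Callable
from typing import Any, TypeVar, cast, overload

T = TypeVar("T")

@overload
def traced(fn: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]: ...
@overload
def traced(fn: Callable[..., T]) -> Callable[..., T]: ...

def traced(fn: Callable[..., Any]) -> Callable[..., Any]:
    """Wrap sync or async callables with the same decorator."""
    if asyncio.iscoroutinefunction(fn):
        # Narrow the union so the wrapper can await it without type errors.
        async_fn = cast(Callable[..., Awaitable[Any]], fn)

        @functools.wraps(fn)
        async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
            # tracing/bookkeeping would go here
            return await async_fn(*args, **kwargs)

        return async_wrapper

    sync_fn = cast(Callable[..., Any], fn)

    @functools.wraps(fn)
    def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
        return sync_fn(*args, **kwargs)

    return sync_wrapper
```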
synth_ai/tracing_v3/examples/basic_usage.py
CHANGED
@@ -2,13 +2,14 @@
 
 import asyncio
 import time
+from typing import Any
 
-from
-from
-from
+from .. import SessionTracer
+from ..abstractions import EnvironmentEvent, LMCAISEvent, RuntimeEvent, TimeRecord
+from ..turso.daemon import SqldDaemon
 
 
-async def simulate_llm_call(model: str, prompt: str) -> dict:
+async def simulate_llm_call(model: str, prompt: str) -> dict[str, Any]:
     """Simulate an LLM API call."""
     await asyncio.sleep(0.1)  # Simulate network latency
 
@@ -133,6 +134,9 @@ async def main():
     print("\n--- Example 3: Querying Data ---")
 
     # Get model usage statistics
+    if tracer.db is None:
+        raise RuntimeError("Tracer database backend is not initialized")
+
     model_usage = await tracer.db.get_model_usage()
     print("\nModel Usage:")
     print(model_usage)
@@ -150,9 +154,10 @@ async def main():
     # Get specific session details
     if recent_sessions:
         session_detail = await tracer.db.get_session_trace(recent_sessions[0]["session_id"])
-
-
-
+        if session_detail:
+            print(f"\nSession Detail for {session_detail['session_id']}:")
+            print(f"  Created: {session_detail['created_at']}")
+            print(f"  Timesteps: {len(session_detail['timesteps'])}")
 
     # Example 4: Using hooks
     print("\n--- Example 4: Hooks ---")