synth-ai 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (153)
  1. synth_ai/__init__.py +13 -13
  2. synth_ai/cli/__init__.py +6 -15
  3. synth_ai/cli/commands/eval/__init__.py +6 -15
  4. synth_ai/cli/commands/eval/config.py +338 -0
  5. synth_ai/cli/commands/eval/core.py +236 -1091
  6. synth_ai/cli/commands/eval/runner.py +704 -0
  7. synth_ai/cli/commands/eval/validation.py +44 -117
  8. synth_ai/cli/commands/filter/core.py +7 -7
  9. synth_ai/cli/commands/filter/validation.py +2 -2
  10. synth_ai/cli/commands/smoke/core.py +7 -17
  11. synth_ai/cli/commands/status/__init__.py +1 -64
  12. synth_ai/cli/commands/status/client.py +50 -151
  13. synth_ai/cli/commands/status/config.py +3 -83
  14. synth_ai/cli/commands/status/errors.py +4 -13
  15. synth_ai/cli/commands/status/subcommands/__init__.py +2 -8
  16. synth_ai/cli/commands/status/subcommands/config.py +13 -0
  17. synth_ai/cli/commands/status/subcommands/files.py +18 -63
  18. synth_ai/cli/commands/status/subcommands/jobs.py +28 -311
  19. synth_ai/cli/commands/status/subcommands/models.py +18 -62
  20. synth_ai/cli/commands/status/subcommands/runs.py +16 -63
  21. synth_ai/cli/commands/status/subcommands/session.py +67 -172
  22. synth_ai/cli/commands/status/subcommands/summary.py +24 -32
  23. synth_ai/cli/commands/status/subcommands/utils.py +41 -0
  24. synth_ai/cli/commands/status/utils.py +16 -107
  25. synth_ai/cli/commands/train/__init__.py +18 -20
  26. synth_ai/cli/commands/train/errors.py +3 -3
  27. synth_ai/cli/commands/train/prompt_learning_validation.py +15 -16
  28. synth_ai/cli/commands/train/validation.py +7 -7
  29. synth_ai/cli/commands/train/{judge_schemas.py → verifier_schemas.py} +33 -34
  30. synth_ai/cli/commands/train/verifier_validation.py +235 -0
  31. synth_ai/cli/demo_apps/demo_task_apps/math/config.toml +0 -1
  32. synth_ai/cli/demo_apps/demo_task_apps/math/modal_task_app.py +2 -6
  33. synth_ai/cli/demo_apps/math/config.toml +0 -1
  34. synth_ai/cli/demo_apps/math/modal_task_app.py +2 -6
  35. synth_ai/cli/demo_apps/mipro/task_app.py +25 -47
  36. synth_ai/cli/lib/apps/task_app.py +12 -13
  37. synth_ai/cli/lib/task_app_discovery.py +6 -6
  38. synth_ai/cli/lib/train_cfgs.py +10 -10
  39. synth_ai/cli/task_apps/__init__.py +11 -0
  40. synth_ai/cli/task_apps/commands.py +7 -15
  41. synth_ai/core/env.py +12 -1
  42. synth_ai/core/errors.py +1 -2
  43. synth_ai/core/integrations/cloudflare.py +209 -33
  44. synth_ai/core/tracing_v3/abstractions.py +46 -0
  45. synth_ai/data/__init__.py +3 -30
  46. synth_ai/data/enums.py +1 -20
  47. synth_ai/data/rewards.py +100 -3
  48. synth_ai/products/graph_evolve/__init__.py +1 -2
  49. synth_ai/products/graph_evolve/config.py +16 -16
  50. synth_ai/products/graph_evolve/converters/__init__.py +3 -3
  51. synth_ai/products/graph_evolve/converters/openai_sft.py +7 -7
  52. synth_ai/products/graph_evolve/examples/hotpotqa/config.toml +1 -1
  53. synth_ai/products/graph_gepa/__init__.py +23 -0
  54. synth_ai/products/graph_gepa/converters/__init__.py +19 -0
  55. synth_ai/products/graph_gepa/converters/openai_sft.py +29 -0
  56. synth_ai/sdk/__init__.py +45 -35
  57. synth_ai/sdk/api/eval/__init__.py +33 -0
  58. synth_ai/sdk/api/eval/job.py +732 -0
  59. synth_ai/sdk/api/research_agent/__init__.py +276 -66
  60. synth_ai/sdk/api/train/builders.py +181 -0
  61. synth_ai/sdk/api/train/cli.py +41 -33
  62. synth_ai/sdk/api/train/configs/__init__.py +6 -4
  63. synth_ai/sdk/api/train/configs/prompt_learning.py +127 -33
  64. synth_ai/sdk/api/train/configs/rl.py +264 -16
  65. synth_ai/sdk/api/train/configs/sft.py +165 -1
  66. synth_ai/sdk/api/train/graph_validators.py +12 -12
  67. synth_ai/sdk/api/train/graphgen.py +169 -51
  68. synth_ai/sdk/api/train/graphgen_models.py +95 -45
  69. synth_ai/sdk/api/train/local_api.py +10 -0
  70. synth_ai/sdk/api/train/pollers.py +36 -0
  71. synth_ai/sdk/api/train/prompt_learning.py +390 -60
  72. synth_ai/sdk/api/train/rl.py +41 -5
  73. synth_ai/sdk/api/train/sft.py +2 -0
  74. synth_ai/sdk/api/train/task_app.py +20 -0
  75. synth_ai/sdk/api/train/validators.py +17 -17
  76. synth_ai/sdk/graphs/completions.py +239 -33
  77. synth_ai/sdk/{judging/schemas.py → graphs/verifier_schemas.py} +23 -23
  78. synth_ai/sdk/learning/__init__.py +35 -5
  79. synth_ai/sdk/learning/context_learning_client.py +531 -0
  80. synth_ai/sdk/learning/context_learning_types.py +294 -0
  81. synth_ai/sdk/learning/prompt_learning_client.py +1 -1
  82. synth_ai/sdk/learning/prompt_learning_types.py +2 -1
  83. synth_ai/sdk/learning/rl/__init__.py +0 -4
  84. synth_ai/sdk/learning/rl/contracts.py +0 -4
  85. synth_ai/sdk/localapi/__init__.py +40 -0
  86. synth_ai/sdk/localapi/apps/__init__.py +28 -0
  87. synth_ai/sdk/localapi/client.py +10 -0
  88. synth_ai/sdk/localapi/contracts.py +10 -0
  89. synth_ai/sdk/localapi/helpers.py +519 -0
  90. synth_ai/sdk/localapi/rollouts.py +93 -0
  91. synth_ai/sdk/localapi/server.py +29 -0
  92. synth_ai/sdk/localapi/template.py +49 -0
  93. synth_ai/sdk/streaming/handlers.py +6 -6
  94. synth_ai/sdk/streaming/streamer.py +10 -6
  95. synth_ai/sdk/task/__init__.py +18 -5
  96. synth_ai/sdk/task/apps/__init__.py +37 -1
  97. synth_ai/sdk/task/client.py +9 -1
  98. synth_ai/sdk/task/config.py +6 -11
  99. synth_ai/sdk/task/contracts.py +137 -95
  100. synth_ai/sdk/task/in_process.py +32 -22
  101. synth_ai/sdk/task/in_process_runner.py +9 -4
  102. synth_ai/sdk/task/rubrics/__init__.py +2 -3
  103. synth_ai/sdk/task/rubrics/loaders.py +4 -4
  104. synth_ai/sdk/task/rubrics/strict.py +3 -4
  105. synth_ai/sdk/task/server.py +76 -16
  106. synth_ai/sdk/task/trace_correlation_helpers.py +190 -139
  107. synth_ai/sdk/task/validators.py +34 -49
  108. synth_ai/sdk/training/__init__.py +7 -16
  109. synth_ai/sdk/tunnels/__init__.py +118 -0
  110. synth_ai/sdk/tunnels/cleanup.py +83 -0
  111. synth_ai/sdk/tunnels/ports.py +120 -0
  112. synth_ai/sdk/tunnels/tunneled_api.py +363 -0
  113. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/METADATA +71 -4
  114. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/RECORD +118 -128
  115. synth_ai/cli/commands/baseline/__init__.py +0 -12
  116. synth_ai/cli/commands/baseline/core.py +0 -636
  117. synth_ai/cli/commands/baseline/list.py +0 -94
  118. synth_ai/cli/commands/eval/errors.py +0 -81
  119. synth_ai/cli/commands/status/formatters.py +0 -164
  120. synth_ai/cli/commands/status/subcommands/pricing.py +0 -23
  121. synth_ai/cli/commands/status/subcommands/usage.py +0 -203
  122. synth_ai/cli/commands/train/judge_validation.py +0 -305
  123. synth_ai/cli/usage.py +0 -159
  124. synth_ai/data/specs.py +0 -36
  125. synth_ai/sdk/api/research_agent/cli.py +0 -428
  126. synth_ai/sdk/api/research_agent/config.py +0 -357
  127. synth_ai/sdk/api/research_agent/job.py +0 -717
  128. synth_ai/sdk/baseline/__init__.py +0 -25
  129. synth_ai/sdk/baseline/config.py +0 -209
  130. synth_ai/sdk/baseline/discovery.py +0 -216
  131. synth_ai/sdk/baseline/execution.py +0 -154
  132. synth_ai/sdk/judging/__init__.py +0 -15
  133. synth_ai/sdk/judging/base.py +0 -24
  134. synth_ai/sdk/judging/client.py +0 -191
  135. synth_ai/sdk/judging/types.py +0 -42
  136. synth_ai/sdk/research_agent/__init__.py +0 -34
  137. synth_ai/sdk/research_agent/container_builder.py +0 -328
  138. synth_ai/sdk/research_agent/container_spec.py +0 -198
  139. synth_ai/sdk/research_agent/defaults.py +0 -34
  140. synth_ai/sdk/research_agent/results_collector.py +0 -69
  141. synth_ai/sdk/specs/__init__.py +0 -46
  142. synth_ai/sdk/specs/dataclasses.py +0 -149
  143. synth_ai/sdk/specs/loader.py +0 -144
  144. synth_ai/sdk/specs/serializer.py +0 -199
  145. synth_ai/sdk/specs/validation.py +0 -250
  146. synth_ai/sdk/tracing/__init__.py +0 -39
  147. synth_ai/sdk/usage/__init__.py +0 -37
  148. synth_ai/sdk/usage/client.py +0 -171
  149. synth_ai/sdk/usage/models.py +0 -261
  150. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/WHEEL +0 -0
  151. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/entry_points.txt +0 -0
  152. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/licenses/LICENSE +0 -0
  153. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/top_level.txt +0 -0
@@ -335,10 +335,10 @@ class IntegrationTestHandler(StreamHandler):


 class GraphGenHandler(StreamHandler):
-    """Handler for ADAS jobs that delegate child job streams to an underlying handler.
+    """Handler for Graph Opt jobs that delegate child job streams to an underlying handler.

-    ADAS jobs emit events from child jobs (GEPA, MIPRO, RL, SFT, etc.). This handler
-    provides light ADAS-aware filtering and routing while keeping child job output
+    Graph Opt jobs emit events from child jobs (GEPA, MIPRO, RL, SFT, etc.). This handler
+    provides light Graph Opt-aware filtering and routing while keeping child job output
     intact via a delegate handler. The delegate can be supplied directly or created
     via a factory; by default we choose a prompt-learning handler for GEPA/MIPRO and
     a basic CLI handler for other job types.
@@ -365,7 +365,7 @@ class GraphGenHandler(StreamHandler):
         self._pl_show_validation = show_validation

         self.filter_verbose_events = filter_verbose_events
-        # If False, skip ADAS-specific filtering/transformations and just pass through.
+        # If False, skip Graph Opt-specific filtering/transformations and just pass through.
         self.wrap_child_events = wrap_child_events

         # Detected child job type (gepa/mipro/rl/sft/etc.)
@@ -436,7 +436,7 @@ class GraphGenHandler(StreamHandler):
         elif event_type.startswith("sft.") or ".sft." in event_type:
             self.child_job_type = "sft"
         else:
-            # Fall back to the first segment as a hint (e.g., "adas.child_type")
+            # Fall back to the first segment as a hint (e.g., "graphgen.child_type")
             parts = event_type.split(".")
             if parts:
                 self.child_job_type = parts[0]
@@ -504,7 +504,7 @@ class GraphGenHandler(StreamHandler):
         return any(pattern in event_type_lower for pattern in verbose_patterns)

     def _transform_event_message(self, message: StreamMessage) -> StreamMessage:
-        """Transform event messages for ADAS context (currently passthrough)."""
+        """Transform event messages for Graph Opt context (currently passthrough)."""
         return message

     def flush(self) -> None:
@@ -142,11 +142,11 @@ class StreamEndpoints:
         )

     @classmethod
-    def adas(cls, job_id: str) -> StreamEndpoints:
-        """Endpoints for GraphGen (formerly ADAS) workflow optimization jobs.
+    def graphgen(cls, job_id: str) -> StreamEndpoints:
+        """Endpoints for GraphGen workflow optimization jobs.

         GraphGen jobs use /api/graphgen/jobs/{job_id} endpoints.
-        The backend handles GraphGen graph_evolve job ID resolution internally using the job_relationships table.
+        The backend handles GraphGen -> graph_evolve job ID resolution internally using job_relationships.
         No fallbacks needed - GraphGen endpoints resolve everything.
         """
         base = f"/graphgen/jobs/{job_id}"
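For callers of the streaming helpers, the rename is mechanical: the classmethod keeps the same job_id-only signature. A minimal sketch, assuming StreamEndpoints is importable from synth_ai.sdk.streaming.streamer (the exact module is not shown in this hunk) and using a hypothetical job ID:

    from synth_ai.sdk.streaming.streamer import StreamEndpoints  # import path assumed

    # Previously StreamEndpoints.adas(job_id); same shape under the new name.
    endpoints = StreamEndpoints.graphgen("graphgen-job-123")  # hypothetical job ID
    # Endpoints are rooted at /graphgen/jobs/{job_id}, as shown above.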
@@ -158,6 +158,7 @@ class StreamEndpoints:
         )


+
 class JobStreamer:
     """Poll job endpoints and dispatch messages to configured handlers."""

@@ -503,14 +504,17 @@ class JobStreamer:
             except Exception as e:
                 error_str = str(e)
                 print(f"[DEBUG] Error polling {path}: {e}", file=sys.stderr)
-                # Fail fast if we get 404 on both ADAS and fallback endpoints (indicates job ID mapping issue)
-                if "404" in error_str and ("adas" in path.lower() or "prompt-learning" in path.lower()):
+                # Fail fast if we get 404 on GraphGen and fallback endpoints (indicates job ID mapping issue)
+                if "404" in error_str and (
+                    "graphgen" in path.lower()
+                    or "prompt-learning" in path.lower()
+                ):
                     # Check if this is the last fallback path - if so, raise to fail fast
                     if path == self._event_paths[-1]:  # Last fallback path
                         raise RuntimeError(
                             f"Failed to poll events: All endpoints returned 404. "
                             f"This likely indicates a job ID mapping issue. "
-                            f"ADAS endpoints need ADAS job ID, GEPA fallback endpoints need GEPA job ID. "
+                            f"GraphGen endpoints need the GraphGen job ID; GEPA fallback endpoints need the GEPA job ID. "
                             f"Last error: {error_str}"
                         )
                     continue
@@ -1,14 +1,21 @@
+"""Task namespace (legacy).
+
+Prefer synth_ai.sdk.localapi.* moving forward. This module remains for backward
+compatibility during the naming transition.
+"""
+
 from .auth import (
     is_api_key_header_authorized,
     normalize_environment_api_key,
     require_api_key_dependency,
 )
-from .client import TaskAppClient
+from .client import LocalAPIClient, TaskAppClient
 from .config import EvalConfig, FilterConfig
 from .contracts import (
     DatasetInfo,
     InferenceInfo,
     LimitsInfo,
+    LocalAPIEndpoints,
     RolloutEnvSpec,
     RolloutMetrics,
     RolloutPolicySpec,
@@ -16,8 +23,6 @@ from .contracts import (
     RolloutRequest,
     RolloutResponse,
     RolloutSafetyConfig,
-    RolloutStep,
-    RolloutTrajectory,
     RubricInfo,
     RubricSection,
     TaskAppEndpoints,
@@ -54,15 +59,19 @@ from .rubrics import (
     score_outcome_against_rubric,
 )
 from .server import (
+    LocalAPIConfig,
     ProxyConfig,
     RubricBundle,
     TaskAppConfig,
     create_task_app,
+    run_server_background,
     run_task_app,
 )
 from .trace_correlation_helpers import (
+    build_trace_payload,
     build_trajectory_trace,
     extract_trace_correlation_id,
+    include_event_history_in_response,
     include_event_history_in_trajectories,
     include_trace_correlation_id_in_response,
     validate_trace_correlation_id,
@@ -89,14 +98,13 @@ __all__ = [
     "EvalConfig",
     "FilterConfig",
     "TaskAppEndpoints",
+    "LocalAPIEndpoints",
     "RolloutEnvSpec",
     "RolloutPolicySpec",
     "RolloutRecordConfig",
     "RolloutSafetyConfig",
     "RolloutRequest",
     "RolloutResponse",
-    "RolloutTrajectory",
-    "RolloutStep",
     "RolloutMetrics",
     "TaskDescriptor",
     "DatasetInfo",
@@ -127,14 +135,17 @@ __all__ = [
     "score_events_against_rubric",
     "score_outcome_against_rubric",
     "TaskAppClient",
+    "LocalAPIClient",
     "error_payload",
     "http_exception",
     "json_error_response",
     "run_task_app",
+    "run_server_background",
     "create_task_app",
     "RubricBundle",
     "ProxyConfig",
     "TaskAppConfig",
+    "LocalAPIConfig",
     "InferenceAPIClient",
     "InProcessTaskApp",
     "InProcessJobResult",
@@ -143,7 +154,9 @@ __all__ = [
     "run_in_process_job",
     "run_in_process_job_sync",
     "build_trajectory_trace",
+    "build_trace_payload",
     "extract_trace_correlation_id",
+    "include_event_history_in_response",
     "include_event_history_in_trajectories",
     "include_trace_correlation_id_in_response",
     "validate_trace_correlation_id",
@@ -1,4 +1,8 @@
-"""Registry for Task Apps exposed via the shared FastAPI harness."""
+"""Registry for Task Apps exposed via the shared FastAPI harness.
+
+Prefer synth_ai.sdk.localapi.apps moving forward. This module remains for
+backward compatibility during the naming transition.
+"""

 from __future__ import annotations

@@ -43,6 +47,22 @@ class TaskAppEntry:
     modal: ModalDeploymentConfig | None = None


+@dataclass(slots=True)
+class LocalAPIEntry:
+    """Metadata describing a registered local API."""
+
+    api_id: str
+    description: str
+    config_factory: Callable[[], TaskAppConfig]
+    aliases: Sequence[str] = field(default_factory=tuple)
+    env_files: Sequence[str] = field(default_factory=tuple)
+    modal: ModalDeploymentConfig | None = None
+
+    @property
+    def app_id(self) -> str:
+        return self.api_id
+
+
 class TaskAppRegistry:
     """In-memory registry of known task apps."""

@@ -86,6 +106,22 @@ def register_task_app(*, entry: TaskAppEntry) -> None:
     registry.register(entry)


+def register_local_api(*, entry: LocalAPIEntry | TaskAppEntry) -> None:
+    if isinstance(entry, LocalAPIEntry):
+        registry.register(
+            TaskAppEntry(
+                app_id=entry.api_id,
+                description=entry.description,
+                config_factory=entry.config_factory,
+                aliases=entry.aliases,
+                env_files=entry.env_files,
+                modal=entry.modal,
+            )
+        )
+        return
+    registry.register(entry)
+
+
 def discover_task_apps_from_cwd() -> None:
     """Discover and register task apps from the current working directory and subdirectories."""
     cwd = Path.cwd()
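A minimal sketch of registering through the new entry type, assuming LocalAPIEntry and register_local_api are imported from synth_ai.sdk.task.apps as defined above; the api_id, description, and factory are hypothetical:

    from synth_ai.sdk.task import TaskAppConfig
    from synth_ai.sdk.task.apps import LocalAPIEntry, register_local_api

    def build_config() -> TaskAppConfig:
        ...  # construct and return the TaskAppConfig for this local API

    register_local_api(
        entry=LocalAPIEntry(
            api_id="my-local-api",
            description="Example local API registration",
            config_factory=build_config,
            aliases=("my-api",),
        )
    )

Internally the entry is wrapped into a TaskAppEntry, so existing registry consumers see no difference.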
@@ -1,4 +1,8 @@
-"""Async HTTP client for interacting with Task Apps."""
+"""Async HTTP client for interacting with Task Apps.
+
+Prefer synth_ai.sdk.localapi.client moving forward. This module remains for
+backward compatibility during the naming transition.
+"""

 from __future__ import annotations

@@ -142,6 +146,10 @@ class TaskAppClient:
         return RolloutResponse.model_validate(data)


+class LocalAPIClient(TaskAppClient):
+    """Alias for TaskAppClient with LocalAPI naming."""
+
+
 class _TaskAppEnvironmentClient:
     def __init__(self, client: TaskAppClient) -> None:
         self._client = client
@@ -50,9 +50,6 @@ class EvalConfig:
     # Optional: Whether to return traces in response
     return_trace: bool = False

-    # Optional: Operations sequence (if not provided, generates default)
-    ops: list[str] | None = None
-
     # Optional: Environment config overrides
     env_config: dict[str, Any] = field(default_factory=dict)

@@ -115,7 +112,6 @@ class EvalConfig:
             "policy_name": data.get("policy_name"),
             "trace_format": data.get("trace_format", "compact"),
             "return_trace": data.get("return_trace", False),
-            "ops": data.get("ops"),
             "env_config": data.get("env_config", {}),
             "policy_config": data.get("policy_config", {}),
             "metadata": data.get("metadata", {}),
@@ -153,11 +149,11 @@ class FilterConfig:
     # Optional: Maximum official score threshold
     max_official_score: float | None = None

-    # Optional: Minimum judge scores (judge_name -> min_score)
-    min_judge_scores: dict[str, float] = field(default_factory=dict)
+    # Optional: Minimum verifier scores (verifier_name -> min_score)
+    min_verifier_scores: dict[str, float] = field(default_factory=dict)

-    # Optional: Maximum judge scores (judge_name -> max_score)
-    max_judge_scores: dict[str, float] = field(default_factory=dict)
+    # Optional: Maximum verifier scores (verifier_name -> max_score)
+    max_verifier_scores: dict[str, float] = field(default_factory=dict)

     # Optional: Limit number of examples
     limit: int | None = None
@@ -222,8 +218,8 @@ class FilterConfig:
             "models": data.get("models", []),
             "min_official_score": data.get("min_official_score"),
             "max_official_score": data.get("max_official_score"),
-            "min_judge_scores": data.get("min_judge_scores", {}),
-            "max_judge_scores": data.get("max_judge_scores", {}),
+            "min_verifier_scores": data.get("min_verifier_scores", {}),
+            "max_verifier_scores": data.get("max_verifier_scores", {}),
             "limit": data.get("limit"),
             "offset": data.get("offset"),
             "shuffle": data.get("shuffle", False),
@@ -258,4 +254,3 @@ class FilterConfig:
         return output_path


-
@@ -1,3 +1,9 @@
+"""Contracts for Task Apps.
+
+Prefer synth_ai.sdk.localapi.contracts moving forward. This module remains for
+backward compatibility during the naming transition.
+"""
+
 from __future__ import annotations

 from dataclasses import dataclass
@@ -64,6 +70,11 @@ class TaskAppEndpoints:
     rollout: str = "/rollout"


+@dataclass(frozen=True)
+class LocalAPIEndpoints(TaskAppEndpoints):
+    """Alias for TaskAppEndpoints with LocalAPI naming."""
+
+
 # --- Unified rollout schema used by Task App services and SDK utilities ---


@@ -91,7 +102,6 @@ class RolloutPolicySpec(BaseModel):


 class RolloutRecordConfig(BaseModel):
-    trajectories: bool = True
     logprobs: bool = False
     value: bool = False
     return_trace: bool = False
@@ -99,7 +109,6 @@ class RolloutRecordConfig(BaseModel):


 class RolloutSafetyConfig(BaseModel):
-    max_ops: int = 100000
     max_time_s: float = 3600.0


@@ -107,121 +116,148 @@ class RolloutRequest(BaseModel):
     run_id: str
     env: RolloutEnvSpec
     policy: RolloutPolicySpec
-    ops: list[dict[str, Any]] | list[str]
     record: RolloutRecordConfig = RolloutRecordConfig()
     on_done: str = "reset"
     safety: RolloutSafetyConfig = RolloutSafetyConfig()
     training_session_id: str | None = None
     synth_base_url: str | None = None
-    mode: RolloutMode  # Required: explicit RL vs EVAL mode
+    mode: RolloutMode = RolloutMode.RL  # Default to RL mode for training/optimization
+
+
+class RolloutMetrics(BaseModel):
+    """Metrics from a rollout execution.
+
+    ## Preferred Fields (New - Normalized)
+
+    - `outcome_reward`: The reward for this rollout (PREFERRED)
+    - `event_rewards`: Optional per-step rewards
+
+    ## Legacy Fields (Backward Compatibility)
+
+    - `episode_rewards`, `reward_mean`, `num_steps`: Still supported for backward
+      compatibility. For new implementations, just use `outcome_reward`.
+    - `outcome_score`: Alias for `outcome_reward` (deprecated)
+
+    ## Example - Minimal (New Style)

+        metrics = RolloutMetrics(
+            outcome_reward=1.0,  # PREFERRED - just provide the reward
+        )

-class RolloutStep(BaseModel):
-    """Single step in a rollout trajectory.
+    ## Example - Full (Backward Compatible)

-    DEPRECATED: This is part of the legacy trajectory format. New code should
-    consume v3 traces (RolloutResponse.trace) instead. See monorepo/trace_single_source.txt
-    for migration plan.
+        metrics = RolloutMetrics(
+            episode_rewards=[1.0],
+            reward_mean=1.0,
+            num_steps=1,
+            outcome_reward=1.0,  # PREFERRED
+        )
     """
-    obs: dict[str, Any]
-    tool_calls: list[dict[str, Any]] = Field(default_factory=list)
-    reward: float | None = None
-    done: bool = False
-    truncated: bool | None = None
-    info: dict[str, Any] | None = None
-
-    # Unified output fields (supports all output modes)
-    output: dict[str, Any] | str | None = Field(
+
+    # =========================================================================
+    # PREFERRED FIELDS (New - Normalized)
+    # =========================================================================
+    outcome_reward: float | None = Field(
         default=None,
-        description="Unified output: parsed JSON for STRUCTURED, raw text for TEXT, tool args for TOOL_CALLS"
+        description="The reward for this rollout. PREFERRED field for scoring.",
     )
-    output_mode: OutputMode | None = Field(
+    event_rewards: list[float] | None = Field(
         default=None,
-        description="Which output mode produced this step's output"
+        description="Optional per-step/event rewards for multi-step tasks.",
     )

-
-class RolloutTrajectory(BaseModel):
-    """Legacy trajectory format for rollout results.
-
-    DEPRECATED: This format duplicates data already present in v3 traces and will
-    be removed once training code migrates to consuming RolloutResponse.trace.
-
-    Current state:
-    - Task apps emit BOTH this format AND v3 traces (dual serialization)
-    - Training code (GSPO) reads from this format
-    - Eval/filter tools read from v3 traces
-
-    Migration plan:
-    - Phase 1: Training code learns to read from v3 traces (with fallback to this)
-    - Phase 2: Make this field optional once training is migrated
-    - Phase 3: Remove this field entirely and delete this class
-
-    See: monorepo/trace_single_source.txt for full migration plan and timeline.
-
-    Why v3 traces are better:
-    - Single source of truth (no duplication/drift)
-    - Richer data: token IDs, logprobs, reasoning, timing, images
-    - Built-in audit trail and replay capability
-    - Standard schema across all Synth AI tooling
-    """
-    env_id: str
-    policy_id: str
-    steps: list[RolloutStep]
-    final: dict[str, Any] | None = None
-    length: int
-
-    # Required for trace correlation with inference mesh (optional initially for backward compat)
-    # See: monorepo/INFERENCE_URL_REQUIREMENT_PLAN.md and trace_creation_and_judgement.txt
-    inference_url: str
-
-    # Required by monorepo trace_validation.py: trajectory-level trace with event_history
-    # The event_history contains LM call records for input/output extraction
-    trace: dict[str, Any] | None = Field(
+    # =========================================================================
+    # LEGACY FIELDS (Backward Compatibility)
+    # =========================================================================
+    episode_rewards: list[float] = Field(
+        default_factory=list,
+        description="[LEGACY] Per-episode rewards. Use outcome_reward instead.",
+    )
+    reward_mean: float = Field(
+        default=0.0,
+        description="[LEGACY] Mean reward. Use outcome_reward instead.",
+    )
+    num_steps: int = Field(
+        default=1,
+        description="[LEGACY] Step count. Can be derived from event_rewards or trace.",
+    )
+    num_episodes: int = Field(
+        default=1,
+        description="[LEGACY] Episode count. Usually 1 for GEPA tasks.",
+    )
+    outcome_score: float | None = Field(
+        default=None,
+        description="[DEPRECATED] Alias for outcome_reward. Use outcome_reward instead.",
+    )
+    events_score: float | None = Field(
         default=None,
-        description="V3 trace with event_history for this trajectory (required for trace strict mode)"
+        description="[LEGACY] Aggregate event score. Use event_rewards instead.",
+    )
+    details: dict[str, Any] = Field(
+        default_factory=dict,
+        description="Metadata only. Do NOT use details.correct for rewards.",
     )

-    decision_samples: list[dict[str, Any]] | None = None

+class RolloutResponse(BaseModel):
+    """Response from a rollout execution.

-class RolloutMetrics(BaseModel):
-    episode_returns: list[float]
-    mean_return: float
-    num_steps: int
-    num_episodes: int = 0
-    outcome_score: float | None = None
-    events_score: float | None = None
-    details: dict[str, Any] = Field(default_factory=dict)
+    ## Key Fields

+    - `run_id`: Echo from request (required)
+    - `metrics`: Rollout metrics with `outcome_reward` (required)
+    - `trace`: v3 trace payload (required for verifier scoring)

-class RolloutResponse(BaseModel):
-    """Response from a rollout execution.
-
-    Contains both legacy trajectory format (for backward compatibility) and
-    modern v3 trace format (preferred going forward).
+    ## Canonical Locations (Top-Level)
+
+    - `trace_correlation_id`: Correlation ID for trace recovery (TOP-LEVEL CANONICAL)
+    - `inference_url`: Inference URL used for this rollout (TOP-LEVEL CANONICAL)
+
+    These fields SHOULD be at top-level. The monorepo parses from top-level first,
+    with fallback to nested locations for backward compatibility.
+
+    ## Example
+
+        response = RolloutResponse(
+            run_id=request.run_id,
+            metrics=RolloutMetrics(outcome_reward=1.0),
+            trace=trace_payload,
+            trace_correlation_id="trace_abc123",
+            inference_url="https://api.usesynth.ai/v1/trial-xyz",
+        )
     """
+
     run_id: str
-
-    # DEPRECATED: Legacy format maintained for training code compatibility.
-    # Will be removed once training migrates to reading from `trace` field.
-    # See: monorepo/trace_single_source.txt for migration plan.
-    trajectories: list[RolloutTrajectory]
-
-    branches: dict[str, list[str]] = Field(default_factory=dict)
     metrics: RolloutMetrics
-    aborted: bool = False
-    ops_executed: int = 0
-
-    # OPTIONAL: correlation ID for linking rollout to inference traces
-    # If not provided, trainer will infer it from trajectory.inference_url ?cid=... parameter
-    trace_correlation_id: str | None = None
-
-    # PREFERRED: v3 trace format (SessionTrace). This is the single source of truth
-    # for rollout data and should be used by all new code. Contains richer data than
-    # trajectories including token IDs, logprobs, timing, and multimodal content.
     trace: dict[str, Any] | None = None
-    pipeline_metadata: dict[str, Any] = Field(default_factory=dict)
+
+    # =========================================================================
+    # CANONICAL LOCATIONS (Top-Level - Preferred for Parsing)
+    # =========================================================================
+    trace_correlation_id: str | None = Field(
+        default=None,
+        description="Correlation ID for trace recovery. TOP-LEVEL CANONICAL location.",
+    )
+    inference_url: str | None = Field(
+        default=None,
+        description="Inference URL used for this rollout. TOP-LEVEL CANONICAL location.",
+    )
+
+    # =========================================================================
+    # LEGACY FIELDS (Backward Compatibility)
+    # =========================================================================
+    branches: dict[str, list[str]] = Field(
+        default_factory=dict,
+        description="[LEGACY] Branch tracking. Usually empty for single-path rollouts.",
+    )
+    aborted: bool = Field(
+        default=False,
+        description="Whether the rollout was aborted early.",
+    )
+    pipeline_metadata: dict[str, Any] = Field(
+        default_factory=dict,
+        description="[LEGACY] Additional metadata. Prefer top-level fields instead.",
+    )


 class _ExtraAllowModel(BaseModel):
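With ops, trajectories, and ops_executed gone, a rollout handler only needs to return the slimmed-down response. A minimal sketch, importing from the module shown above (synth_ai/sdk/task/contracts.py); the trace payload and IDs are hypothetical:

    from synth_ai.sdk.task.contracts import RolloutMetrics, RolloutResponse

    response = RolloutResponse(
        run_id="run-123",                            # echo the request's run_id
        metrics=RolloutMetrics(outcome_reward=1.0),  # preferred scoring field
        trace={"event_history": []},                 # hypothetical v3 trace payload
        trace_correlation_id="trace_abc123",         # top-level canonical location
        inference_url="https://api.usesynth.ai/v1/trial-xyz",
    )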
@@ -262,7 +298,7 @@ class RubricSection(_ExtraAllowModel):


 class RubricInfo(_ExtraAllowModel):
-    """Outcome and event scoring definitions used by judges."""
+    """Outcome and event scoring definitions used by verifiers."""

     outcome: RubricSection | None = None
     events: RubricSection | None = None
@@ -287,11 +323,17 @@ class TaskInfo(_ExtraAllowModel):
     """Static metadata describing the capabilities of a Task App task."""

     task: TaskDescriptor
-    environment: str
     dataset: DatasetInfo
-    rubric: RubricInfo
     inference: InferenceInfo
     limits: LimitsInfo
+    environment: str | None = Field(
+        default=None,
+        description="[DEPRECATED] Legacy field not read by server. Will be removed in future version.",
+    )
+    rubric: RubricInfo | None = Field(
+        default=None,
+        description="[DEPRECATED] Use LocalAPIConfig.rubrics (RubricBundle) instead. Server ignores this field.",
+    )
     task_metadata: dict[str, Any] = Field(
         default_factory=dict,
         description="Task-specific extras (e.g. prompt version info, documentation links).",
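Because environment and rubric are now optional, a TaskInfo can be built without them. A minimal sketch; the nested descriptor objects are assumed to be constructed elsewhere, since their fields are outside this diff, and the metadata values are hypothetical:

    info = TaskInfo(
        task=task_descriptor,          # TaskDescriptor built elsewhere
        dataset=dataset_info,          # DatasetInfo built elsewhere
        inference=inference_info,      # InferenceInfo built elsewhere
        limits=limits_info,            # LimitsInfo built elsewhere
        task_metadata={"prompt_version": "v2"},
    )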