PyPI - synth-ai - Versions diffs - 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl - Mend

synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (110) hide show

examples/multi_step/configs/README_verilog_rl.md +77 -0
examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +5 -4
examples/multi_step/configs/crafter_synth_backend.md +40 -0
examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
examples/multi_step/configs/verilog_rl_lora.toml +190 -0
examples/multi_step/judges/crafter_backend_judge.py +220 -0
examples/multi_step/judges/verilog_backend_judge.py +234 -0
examples/multi_step/readme.md +48 -0
examples/multi_step/verilog_rl_lora.md +218 -0
examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
examples/sft/evaluate.py +2 -0
examples/sft/generate_traces.py +2 -0
examples/swe/task_app/grpo_swe_mini.py +1 -0
examples/swe/task_app/hosted/rollout.py +2 -0
examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
examples/task_apps/crafter/task_app/__init__.py +3 -0
examples/task_apps/crafter/task_app/grpo_crafter.py +306 -8
examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +16 -3
examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +52 -1
examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +111 -13
examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
examples/task_apps/enron/filter_sft.toml +5 -0
examples/task_apps/enron/tests/__init__.py +2 -0
examples/task_apps/enron/tests/integration/__init__.py +2 -0
examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
examples/task_apps/enron/tests/unit/__init__.py +2 -0
examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
examples/task_apps/pokemon_red/task_app.py +199 -6
examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
examples/task_apps/sokoban/filter_sft.toml +5 -0
examples/task_apps/sokoban/tests/__init__.py +2 -0
examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
examples/task_apps/verilog/filter_sft.toml +5 -0
examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
examples/task_apps/verilog/tests/__init__.py +2 -0
examples/task_apps/verilog/tests/integration/__init__.py +2 -0
examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
examples/task_apps/verilog/tests/unit/__init__.py +2 -0
examples/warming_up_to_rl/groq_test.py +2 -0
examples/warming_up_to_rl/run_local_rollout.py +2 -0
examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
examples/warming_up_to_rl/run_rollout_remote.py +2 -0
synth_ai/api/models/supported.py +1 -0
synth_ai/cli/__init__.py +46 -13
synth_ai/cli/_modal_wrapper.py +3 -2
synth_ai/cli/recent.py +1 -1
synth_ai/cli/status.py +1 -1
synth_ai/cli/task_apps.py +354 -143
synth_ai/cli/traces.py +1 -1
synth_ai/cli/tui.py +57 -0
synth_ai/cli/turso.py +1 -1
synth_ai/cli/watch.py +1 -1
synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
synth_ai/environments/examples/crafter_classic/environment.py +1 -1
synth_ai/environments/examples/verilog/engine.py +76 -10
synth_ai/judge_schemas.py +8 -8
synth_ai/task/__init__.py +11 -1
synth_ai/task/apps/__init__.py +1 -0
synth_ai/task/config.py +257 -0
synth_ai/task/contracts.py +15 -2
synth_ai/task/rubrics/__init__.py +3 -0
synth_ai/task/rubrics/loaders.py +22 -3
synth_ai/task/rubrics/scoring.py +3 -0
synth_ai/task/trace_correlation_helpers.py +315 -0
synth_ai/task/validators.py +144 -0
synth_ai/tracing_v3/abstractions.py +3 -3
synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
synth_ai/tracing_v3/session_tracer.py +16 -6
synth_ai/tracing_v3/storage/base.py +29 -29
synth_ai/tracing_v3/storage/config.py +3 -3
synth_ai/tracing_v3/turso/daemon.py +8 -7
synth_ai/tracing_v3/turso/native_manager.py +63 -40
synth_ai/tracing_v3/utils.py +3 -3
synth_ai/tui/__init__.py +5 -0
synth_ai/tui/__main__.py +13 -0
synth_ai/tui/cli/__init__.py +1 -0
synth_ai/tui/cli/query_experiments.py +164 -0
synth_ai/tui/cli/query_experiments_v3.py +164 -0
synth_ai/tui/dashboard.py +906 -0
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/METADATA +1 -1
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/RECORD +110 -71
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0

examples/task_apps/crafter/task_app/grpo_crafter.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from __future__ import annotations
+import json
 import logging
 import os
 import sys
@@ -11,11 +12,12 @@ from pathlib import Path
 from typing import Any
 from synth_ai.task.apps import ModalDeploymentConfig, TaskAppEntry, register_task_app
-from synth_ai.task.contracts import RolloutMetrics, RolloutRequest, RolloutResponse, TaskInfo
+from synth_ai.task.contracts import RolloutMetrics, RolloutMode, RolloutRequest, RolloutResponse, TaskInfo
 from synth_ai.task.datasets import TaskDatasetRegistry, TaskDatasetSpec
 from synth_ai.task.json import to_jsonable  # noqa: F401  (imported for side-effect compatibility)
 from synth_ai.task.rubrics import load_rubric
 from synth_ai.task.server import ProxyConfig, RubricBundle, TaskAppConfig
+from synth_ai.task.validators import normalize_inference_url
 from synth_ai.task.tracing_utils import (
     build_tracer_factory,
     resolve_sft_output_dir,
@@ -24,6 +26,18 @@ from synth_ai.task.tracing_utils import (
 )
 from synth_ai.tracing_v3.session_tracer import SessionTracer
+try:
+    from .synth_envs_hosted.utils import (
+        ensure_chat_completions_url,
+        extract_trace_correlation_id,
+    )
+except Exception:  # pragma: no cover - utils unavailable if optional deps missing
+    def ensure_chat_completions_url(raw_url, mode=None):
+        """Fallback to shared utility for URL normalization."""
+        return normalize_inference_url(raw_url) if raw_url else raw_url
+    def extract_trace_correlation_id(_raw_url):
+        return None
 logger = logging.getLogger(__name__)
 DEFAULT_ALIAS_OPS: list[str] = ["agent", "env"] * 10
@@ -95,6 +109,110 @@ SYNTH_ENVS_HOSTED_ROOT = (TASK_APP_ROOT / "synth_envs_hosted").resolve()
 EXAMPLES_ROOT = (REPO_ROOT / "examples").resolve()
 RUBRICS_ROOT = (EXAMPLES_ROOT / "multi_step" / "rubrics").resolve()
+DEFAULT_OUTCOME_RUBRIC_DATA: dict[str, Any] = {
+    "version": "1",
+    "goal_text": (
+        "Reward episodes that climb the Crafter achievement ladder, stockpile key resources "
+        "(especially wood), and finish alive with clear understanding of any failure."
+    ),
+    "aggregation": "weighted_sum",
+    "criteria": [
+        {
+            "id": "achievement_progression",
+            "description": (
+                "Weigh achievements by tier: late-game unlocks (iron tools, furnace, armor) earn "
+                "the most, mid-tier crafting (stone tools, furnace prep) gets partial credit, early "
+                "tasks (collecting saplings/wood tools) only lightly scored."
+            ),
+            "weight": 0.35,
+        },
+        {
+            "id": "resource_stockpile",
+            "description": (
+                "Assess resource totals with emphasis on wood stores; high scores require abundant "
+                "wood plus supporting materials (stone, coal, iron) that signal readiness for "
+                "crafting."
+            ),
+            "weight": 0.2,
+        },
+        {
+            "id": "survival_state",
+            "description": (
+                "Reward finishing alive with healthy food/drink bars and safe positioning; penalize "
+                "deaths, low vitals, or lingering hazards at episode end."
+            ),
+            "weight": 0.2,
+        },
+        {
+            "id": "failure_analysis",
+            "description": (
+                "If the run ends in death or timeout, clearly identify the cause and deduct unless "
+                "the agent mitigated risk; highlight when the agent survives despite danger."
+            ),
+            "weight": 0.15,
+        },
+        {
+            "id": "future_readiness",
+            "description": (
+                "Describe how prepared the agent is for the next objectives (tools crafted, shelters, "
+                "furnaces, smelted materials) and whether the inventory supports further progress."
+            ),
+            "weight": 0.1,
+        },
+    ],
+}
+DEFAULT_EVENTS_RUBRIC_DATA: dict[str, Any] = {
+    "version": "1",
+    "goal_text": (
+        "Score each decision in proportion to the concrete Crafter achievement progress it "
+        "delivers, topping out the scale when the log shows a fresh achievement unlock and keeping "
+        "routine upkeep near zero."
+    ),
+    "aggregation": "weighted_sum",
+    "criteria": [
+        {
+            "id": "achievement_unlocks",
+            "description": (
+                "Assign 0.9-1.0 when the decision explicitly unlocks a new Crafter achievement (look "
+                'for "Achievement unlocked" messages or equivalent deterministic completions such as '
+                "placing a furnace that immediately crafts ingots). Cap the score at 0.4 when no new "
+                "achievement fires, and drop to <=0.1 if the turn repeats known actions without "
+                "measurable progress."
+            ),
+            "weight": 0.55,
+        },
+        {
+            "id": "milestone_setup",
+            "description": (
+                "Give 0.5-0.7 when the action completes the last prerequisite for a specific upcoming "
+                "achievement (e.g., gathering the final ore before smelting, crafting sticks right "
+                "before a tool). Keep the score <=0.3 if the progress is speculative or still several "
+                "steps away."
+            ),
+            "weight": 0.2,
+        },
+        {
+            "id": "inventory_depth",
+            "description": (
+                "Reward 0.3-0.5 for pulls that clearly deepen critical buffers (fuel, food, ore) and "
+                "immediately unblock the next milestone. If resources are already plentiful or the "
+                "haul is generic filler, stay at <=0.2."
+            ),
+            "weight": 0.15,
+        },
+        {
+            "id": "execution_quality",
+            "description": (
+                "Only add up to 0.1 for clean, legal execution that avoids wasted turns; drop to 0.0 "
+                "whenever the agent idles, repeats failed moves, or takes damage without compensating "
+                "progress."
+            ),
+            "weight": 0.1,
+        },
+    ],
+}
 for path in (REPO_ROOT, TASK_APP_ROOT, SYNTH_ENVS_HOSTED_ROOT, EXAMPLES_ROOT):
     try:
         resolved = path.resolve()
@@ -115,6 +233,28 @@ try:
 except Exception:
     pass
+def _load_rubric_with_fallback(filename: str, fallback: dict[str, Any]):
+    """Load rubric from JSON file when available, otherwise use bundled fallback."""
+    search_paths = [RUBRICS_ROOT / filename, TASK_APP_ROOT / "rubrics" / filename]
+    for path in search_paths:
+        try:
+            if path.exists():
+                logger.debug("Loading rubric from %s", path)
+                return load_rubric(str(path))
+        except Exception as exc:
+            logger.warning("Failed to load rubric %s from %s: %s", filename, path, exc)
+    logger.warning("Falling back to inline rubric %s: file not available", filename)
+    try:
+        materialized = search_paths[0]
+        materialized.parent.mkdir(parents=True, exist_ok=True)
+        materialized.write_text(json.dumps(fallback, indent=2), encoding="utf-8")
+    except Exception:
+        logger.debug("Unable to materialize inline rubric %s", filename, exc_info=True)
+    return load_rubric(fallback)
 HAS_HOSTED = True
 try:
     import crafter  # type: ignore
@@ -343,9 +483,13 @@ def _base_task_info(dataset: CrafterDataset) -> TaskInfo:
     )
-OUTCOME_RUBRIC = load_rubric(str(RUBRICS_ROOT / "crafter_outcome_rubric.json"))
+OUTCOME_RUBRIC = _load_rubric_with_fallback(
+    "crafter_outcome_rubric.json", DEFAULT_OUTCOME_RUBRIC_DATA
+)
-EVENTS_RUBRIC = load_rubric(str(RUBRICS_ROOT / "crafter_events_rubric.json"))
+EVENTS_RUBRIC = _load_rubric_with_fallback(
+    "crafter_events_rubric.json", DEFAULT_EVENTS_RUBRIC_DATA
+)
 def describe_taskset(dataset: CrafterDataset) -> dict[str, Any]:
@@ -493,9 +637,94 @@ def _coerce_math_to_crafter(request: RolloutRequest) -> RolloutRequest:
     return coerced
+def _resolve_trace_correlation_id(policy_cfg: dict[str, Any], mode: Any = None) -> str | None:
+    """Best-effort extraction of the trace correlation identifier."""
+    candidates: list[Any] = [
+        policy_cfg.get("trace_correlation_id"),
+        policy_cfg.get("trace"),
+    ]
+    logger.debug(
+        "_resolve_trace_correlation_id: inspecting policy_cfg keys=%s candidates=%s",
+        sorted(policy_cfg.keys()),
+        candidates,
+    )
+    for candidate in candidates:
+        if isinstance(candidate, str):
+            stripped = candidate.strip()
+            if stripped:
+                return stripped
+    return extract_trace_correlation_id(policy_cfg.get("inference_url"), mode=mode)
 async def rollout_executor(request: RolloutRequest, fastapi_request) -> RolloutResponse:
+    request = _coerce_math_to_crafter(request)
+    policy_cfg = dict(request.policy.config or {})
+    logger.info(
+        "ROLLOUT_EXEC: incoming policy config keys=%s inference_url=%s run_id=%s mode=%s",
+        sorted(policy_cfg.keys()),
+        policy_cfg.get("inference_url"),
+        request.run_id,
+        request.mode,
+    )
+    inferred_url = ensure_chat_completions_url(policy_cfg.get("inference_url"), mode=request.mode)
+    if isinstance(inferred_url, str) and inferred_url:
+        if inferred_url != policy_cfg.get("inference_url"):
+            logger.warning(
+                "ROLLOUT_EXEC: normalized inference_url run_id=%s from %s to %s",
+                request.run_id,
+                policy_cfg.get("inference_url"),
+                inferred_url,
+            )
+        policy_cfg["inference_url"] = inferred_url
+    else:
+        logger.warning(
+            "ROLLOUT_EXEC: inference_url missing or not normalized run_id=%s raw=%s",
+            request.run_id,
+            policy_cfg.get("inference_url"),
+        )
+    trace_correlation_id = _resolve_trace_correlation_id(policy_cfg, mode=request.mode)
+    # ASSERTION: trace_correlation_id MUST be present for RL mode (but not EVAL mode)
+    if request.mode == RolloutMode.RL:
+        assert trace_correlation_id is not None, (
+            f"FATAL: trace_correlation_id extraction failed for run_id={request.run_id}. "
+            f"policy_cfg_keys={sorted(policy_cfg.keys())} "
+            f"inference_url={policy_cfg.get('inference_url')}"
+        )
+        assert isinstance(trace_correlation_id, str) and trace_correlation_id.strip(), (
+            f"FATAL: trace_correlation_id is empty for run_id={request.run_id}. "
+            f"Got: {trace_correlation_id!r}"
+        )
+    if trace_correlation_id:
+        policy_cfg["trace_correlation_id"] = trace_correlation_id
+    logger.info(
+        "ROLLOUT_EXEC: resolved trace_correlation_id=%s run_id=%s",
+        trace_correlation_id,
+        request.run_id,
+    )
+    pipeline_metadata: dict[str, Any] = {}
+    if trace_correlation_id:
+        pipeline_metadata["trace_correlation_id"] = trace_correlation_id
+    if isinstance(policy_cfg.get("inference_url"), str) and policy_cfg["inference_url"]:
+        pipeline_metadata.setdefault("inference_url", policy_cfg["inference_url"])
+    logger.info(
+        "ROLLOUT_EXEC: pipeline metadata prepared run_id=%s metadata=%s",
+        request.run_id,
+        pipeline_metadata,
+    )
     # If hosted env service code is not bundled, return a no-op rollout response compatible with contracts
     if not HAS_HOSTED:
+        logger.warning(
+            "ROLLOUT_EXEC: HAS_HOSTED disabled, returning stub response run_id=%s metadata=%s",
+            request.run_id,
+            pipeline_metadata,
+        )
         return RolloutResponse(
             run_id=request.run_id,
             trajectories=[],
@@ -510,11 +739,10 @@ async def rollout_executor(request: RolloutRequest, fastapi_request) -> RolloutR
             aborted=False,
             ops_executed=0,
             trace=None,
+            trace_correlation_id=trace_correlation_id or f"trace_{request.run_id}",
+            pipeline_metadata=pipeline_metadata,
         )
-    request = _coerce_math_to_crafter(request)
-    policy_cfg = dict(request.policy.config or {})
     try:
         max_llm_calls = int(policy_cfg.get("max_llm_calls") or 10)
     except Exception:
@@ -545,6 +773,7 @@ async def rollout_executor(request: RolloutRequest, fastapi_request) -> RolloutR
         converted_ops = converted_ops[:max_ops_allowed]
     legacy_request = LegacyRolloutRequest(
         run_id=request.run_id,
+        mode=request.mode,  # Preserve mode for nested requests
         env=LegacyRolloutEnvSpec(
             env_id=request.env.env_id,
             env_name=request.env.env_name,
@@ -568,12 +797,79 @@ async def rollout_executor(request: RolloutRequest, fastapi_request) -> RolloutR
     legacy_response: LegacyRolloutResponse = await legacy_execute_rollout(
         legacy_request, fastapi_request
     )
+    logger.info(
+        "ROLLOUT_EXEC: legacy rollout completed run_id=%s trace_id=%s",
+        request.run_id,
+        trace_correlation_id,
+    )
     data = legacy_response.model_dump()
     metrics = data.get("metrics", {}) or {}
     metrics.setdefault("outcome_score", None)
     metrics.setdefault("events_score", None)
     metrics.setdefault("details", {})
     data["metrics"] = metrics
+    # Add trace_correlation_id at TOP-LEVEL (REQUIRED for RL training pipeline)
+    # Use fallback if somehow missing
+    data["trace_correlation_id"] = trace_correlation_id or f"trace_{request.run_id}"
+    # Add trace_correlation_id to pipeline_metadata
+    existing_meta = data.get("pipeline_metadata")
+    if not isinstance(existing_meta, dict):
+        existing_meta = {}
+    # ALWAYS set trace_correlation_id (use fallback if needed)
+    final_cid = trace_correlation_id or f"trace_{request.run_id}"
+    existing_meta["trace_correlation_id"] = final_cid
+    if isinstance(policy_cfg.get("inference_url"), str) and policy_cfg["inference_url"]:
+        existing_meta.setdefault("inference_url", policy_cfg["inference_url"])
+    data["pipeline_metadata"] = existing_meta
+    # Add trace_correlation_id to each trajectory (required for RL training pipeline)
+    if "trajectories" in data:
+        for traj in data.get("trajectories", []):
+            if isinstance(traj, dict):
+                traj["trace_correlation_id"] = final_cid
+    logger.info(
+        "ROLLOUT_EXEC: final pipeline metadata run_id=%s metadata=%s",
+        request.run_id,
+        existing_meta,
+    )
+    if trace_correlation_id and existing_meta.get("trace_correlation_id") != trace_correlation_id:
+        logger.error(
+            "ROLLOUT_EXEC: metadata trace mismatch run_id=%s expected=%s actual=%s",
+            request.run_id,
+            trace_correlation_id,
+            existing_meta.get("trace_correlation_id"),
+        )
+    if not existing_meta.get("trace_correlation_id"):
+        logger.error(
+            "ROLLOUT_EXEC: final metadata missing trace_correlation_id run_id=%s metadata=%s",
+            request.run_id,
+            existing_meta,
+        )
+    # ASSERTION: Verify trace_correlation_id is present in response at all required levels
+    assert "trace_correlation_id" in data, (
+        f"FATAL: trace_correlation_id missing from top-level response data for run_id={request.run_id}. "
+        f"Keys: {list(data.keys())}"
+    )
+    assert data["trace_correlation_id"] == final_cid, (
+        f"FATAL: trace_correlation_id mismatch in response for run_id={request.run_id}. "
+        f"Expected: {final_cid!r}, Got: {data.get('trace_correlation_id')!r}"
+    )
+    assert "pipeline_metadata" in data, (
+        f"FATAL: pipeline_metadata missing from response for run_id={request.run_id}"
+    )
+    assert data["pipeline_metadata"].get("trace_correlation_id") == final_cid, (
+        f"FATAL: trace_correlation_id missing or mismatched in pipeline_metadata for run_id={request.run_id}. "
+        f"Expected: {final_cid!r}, Got: {data['pipeline_metadata'].get('trace_correlation_id')!r}"
+    )
+    logger.info(
+        "ROLLOUT_EXEC: assertions passed - trace_correlation_id present in response run_id=%s cid=%s",
+        request.run_id,
+        final_cid,
+    )
     return RolloutResponse.model_validate(data)
@@ -617,7 +913,7 @@ def build_config() -> TaskAppConfig:
     routers: tuple = (environment_router, policy_router, branching_router) if HAS_HOSTED else ()
     config = TaskAppConfig(
-        app_id="grpo-crafter",
+        app_id="grpo-crafter-task-app",
         name="GRPO Crafter Task App",
         description="Crafter Classic environment with GRPO task endpoints and LLM proxies.",
         base_task_info=base_info,
@@ -638,7 +934,7 @@ def build_config() -> TaskAppConfig:
 register_task_app(
     entry=TaskAppEntry(
-        app_id="grpo-crafter",
+        app_id="grpo-crafter-task-app",
         description="Crafter Classic task app with rollout + proxy endpoints",
         config_factory=build_config,
         aliases=("crafter", "crafter-task"),
@@ -665,6 +961,8 @@ register_task_app(
                 (str(REPO_ROOT), "/opt/synth_ai_repo"),
                 (str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai"),
                 (str(TASK_APP_ROOT), "/opt/synth_ai_repo/examples/task_apps/crafter/task_app"),
+                # Explicitly mount rubrics directory
+                (str(RUBRICS_ROOT), "/opt/synth_ai_repo/examples/multi_step/rubrics"),
             ),
             secret_names=("groq-api-key", "openai-api-key"),
             memory=16384,

examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py CHANGED Viewed

@@ -209,6 +209,16 @@ class CrafterEnvironmentWrapper:
             logger.info("No valid actions provided, defaulting to noop")
             normalized.append(EnvToolCall(tool="interact", args={"action": 0}))  # noop action
+        # Limit to first 20 actions to prevent spam from overly long tool calls
+        MAX_ACTIONS_PER_STEP = 20
+        if len(normalized) > MAX_ACTIONS_PER_STEP:
+            logger.warning(
+                "Tool call contained %d actions, limiting to first %d to prevent spam",
+                len(normalized),
+                MAX_ACTIONS_PER_STEP,
+            )
+            normalized = normalized[:MAX_ACTIONS_PER_STEP]
         # Pre-step logging: capture current public state and print concise summary
         before_state: dict[str, Any] | None = None
         try:

examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py CHANGED Viewed

@@ -45,6 +45,7 @@ class CrafterPolicy(Policy):
         self.model = model
         self.use_tools = True
         self.use_vision = False  # Enable vision for VLMs
+        self.image_only_mode = False  # If True, only send images without text observations
         # Sampling parameters (populated via initialize(config))
         self.temperature: float | None = None
         self.top_p: float | None = None
@@ -66,6 +67,11 @@ class CrafterPolicy(Policy):
             self.use_tools = bool(config["use_tools"])
         if "use_vision" in config:
             self.use_vision = bool(config["use_vision"])
+        if "image_only_mode" in config:
+            self.image_only_mode = bool(config["image_only_mode"])
+            # If image_only_mode is enabled, automatically enable vision
+            if self.image_only_mode:
+                self.use_vision = True
         # Auto-detect vision capability from model name if not explicitly set
         if "use_vision" not in config and self.model:
             self.use_vision = self._is_vision_model(self.model)
@@ -417,14 +423,21 @@ class CrafterPolicy(Policy):
         """Prepare an inference request (implementing abstract method)."""
         # Format observation with rich contextual information
         observation_text = self._format_observation_for_llm(observation)
-        image_parts = self._extract_image_parts(observation)
-        # Build messages (observation_text already formatted; no raw matrices)
+        # Extract image parts based on vision settings
+        if self.use_vision:
+            image_parts = self._extract_image_parts(observation)
+        else:
+            # Text-only mode: don't include any images
+            image_parts = []
+        # Build messages with appropriate mode
         messages = CrafterReActAgent.build_messages(
             observation=observation_text,
             history=history,
             turn=self.turn_index,
             image_parts=image_parts,
+            image_only_mode=self.image_only_mode,
         )
         # Return messages and tools schema

examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py CHANGED Viewed

@@ -85,8 +85,17 @@ class CrafterReActAgent:
         history: list[dict[str, Any]] | None = None,
         turn: int | None = None,
         image_parts: list[dict[str, Any]] | None = None,
+        image_only_mode: bool = False,
     ) -> list[dict[str, Any]]:
-        """Construct OpenAI-style messages list for vLLM generation."""
+        """Construct OpenAI-style messages list for vLLM generation.
+        Args:
+            observation: Text observation to include
+            history: Previous conversation history
+            turn: Current turn number
+            image_parts: Image content parts in OpenAI format
+            image_only_mode: If True, only include images without text observation
+        """
         msgs: list[dict[str, Any]] = [
             {"role": "system", "content": CrafterReActAgent.get_system_prompt()}
         ]
@@ -94,8 +103,14 @@ class CrafterReActAgent:
             msgs.extend(history)
         user_content: Any
         if image_parts:
-            user_content = [{"type": "text", "text": observation}] + list(image_parts)
+            # Image-only mode: send only images without text observation
+            if image_only_mode:
+                user_content = list(image_parts)
+            else:
+                # Normal vision mode: send both text and images
+                user_content = [{"type": "text", "text": observation}] + list(image_parts)
         else:
+            # Text-only mode (default): no images
             user_content = observation
         msgs.append({"role": "user", "content": user_content})
         return msgs

examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py CHANGED Viewed

@@ -149,7 +149,11 @@ class OpenAIClient:
             OpenAI-compatible chat completion response
         """
         base = (base_url or self.base_url).rstrip("/")
-        url = base + "/v1/chat/completions"
+        # Don't append /v1/chat/completions if the URL already contains it
+        if "/v1/chat/completions" in base:
+            url = base
+        else:
+            url = base + "/v1/chat/completions"
         timeout = timeout_s or self.timeout_s
         # Merge headers
@@ -164,10 +168,28 @@ class OpenAIClient:
         except Exception:
             pass
-        # If target is our in-app Groq proxy, force Authorization to use GROQ_API_KEY
+        # Set Authorization header based on the target URL
         try:
             low_url = (url or "").lower()
-            if "/proxy/groq" in low_url or "groq" in low_url:
+            # If calling OpenAI directly (api.openai.com)
+            if "api.openai.com" in low_url:
+                openai_key = os.getenv("OPENAI_API_KEY")
+                if openai_key and isinstance(openai_key, str):
+                    headers["Authorization"] = f"Bearer {openai_key}"
+            # If target is Synth backend (any deployment), use SYNTH_API_KEY
+            # Matches: synth-backend-*, agent-learning*, localhost:8000, 127.0.0.1:8000
+            elif any(pattern in low_url for pattern in [
+                "synth-backend", "synth.run", "agent-learning",
+                "localhost:8000", "127.0.0.1:8000"
+            ]):
+                synth_key = os.getenv("SYNTH_API_KEY")
+                if synth_key and isinstance(synth_key, str):
+                    headers["Authorization"] = f"Bearer {synth_key}"
+            # If target is Groq, use GROQ_API_KEY
+            elif "/proxy/groq" in low_url or "api.groq.com" in low_url:
                 gk = os.getenv("GROQ_API_KEY")
                 if gk and isinstance(gk, str):
                     headers["Authorization"] = f"Bearer {gk}"

examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py CHANGED Viewed

@@ -10,11 +10,13 @@ from fastapi import APIRouter, HTTPException, Request
 from pydantic import BaseModel
 from synth_ai.task.auth import allowed_environment_api_keys, normalize_environment_api_key
+from synth_ai.task.contracts import RolloutMode
 from .envs.crafter.policy import CrafterPolicy
 from .inference.openai_client import create_inference_client
 from .registry import registry
 from .storage.volume import storage
+from .utils import ensure_chat_completions_url
 # Token budgeting (shared logic with inference server)
 try:
@@ -40,6 +42,7 @@ class PolicyCreateRequest(BaseModel):
     parent_policy_id: str | None = None
     rl_run_id: str
     bound_env_id: str | None = None
+    mode: RolloutMode
 class PolicyCreateResponse(BaseModel):
@@ -119,6 +122,14 @@ async def create_policy(
             config.setdefault("inference_url", f"{base_url}/proxy")
             config["provider"] = "openai"
+        received_url = config.get("inference_url")
+        logger.info(
+            "POLICY_CREATE: policy=%s provider=%s raw_inference_url=%s",
+            request.policy_name,
+            provider,
+            received_url,
+        )
         if "inference_url" not in config and task_app is not None:
             task_base_url = getattr(task_app, "vllm_base_url", None)
             if task_base_url:
@@ -133,6 +144,31 @@ async def create_policy(
                 detail="Policy configuration must include 'inference_url' and 'model'.",
             )
+        # Get mode from PolicyCreateRequest (defaults to "rl" for backward compatibility)
+        mode = request.mode
+        logger.info("POLICY_CREATE: Using mode=%s for URL processing", mode)
+        sanitized_url = ensure_chat_completions_url(config.get("inference_url"), mode=mode)
+        if isinstance(sanitized_url, str) and sanitized_url:
+            if sanitized_url != config.get("inference_url"):
+                logger.warning(
+                    "POLICY_CREATE: normalized inference_url for policy=%s provider=%s mode=%s from %s to %s",
+                    request.policy_name,
+                    provider,
+                    mode,
+                    config.get("inference_url"),
+                    sanitized_url,
+                )
+            config["inference_url"] = sanitized_url
+        else:
+            logger.warning(
+                "POLICY_CREATE: unable to normalize inference_url for policy=%s provider=%s mode=%s raw=%s",
+                request.policy_name,
+                mode,
+                provider,
+                config.get("inference_url"),
+            )
         # Create policy instance based on name
         pname = request.policy_name.lower()
         if pname in ["crafter-react", "crafter"]:
@@ -507,7 +543,22 @@ async def step_policy(
             # Ensure meta carries the final target URL for downstream logging/clients
             with contextlib.suppress(Exception):
-                meta["inference_url"] = target_url
+                sanitized_target = ensure_chat_completions_url(target_url)
+                if sanitized_target and sanitized_target != target_url:
+                    logger.warning(
+                        "POLICY_STEP: normalized inference_url mid-flight policy=%s from %s to %s",
+                        policy_name,
+                        target_url,
+                        sanitized_target,
+                    )
+                elif not sanitized_target:
+                    logger.info(
+                        "POLICY_STEP: inference_url unchanged policy=%s target=%s",
+                        policy_name,
+                        target_url,
+                    )
+                meta["inference_url"] = sanitized_target if sanitized_target else target_url
+                target_url = sanitized_target or target_url
             # Select API key based on resolved target URL
             api_key_override = None

synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl

Potentially problematic release.

synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl