synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of synth-ai has been flagged as possibly problematic.
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +5 -4
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
- examples/sft/evaluate.py +2 -0
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/grpo_swe_mini.py +1 -0
- examples/swe/task_app/hosted/rollout.py +2 -0
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +3 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +306 -8
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +16 -3
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +52 -1
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +111 -13
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
- examples/task_apps/pokemon_red/task_app.py +199 -6
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- synth_ai/api/models/supported.py +1 -0
- synth_ai/cli/__init__.py +46 -13
- synth_ai/cli/_modal_wrapper.py +3 -2
- synth_ai/cli/recent.py +1 -1
- synth_ai/cli/status.py +1 -1
- synth_ai/cli/task_apps.py +354 -143
- synth_ai/cli/traces.py +1 -1
- synth_ai/cli/tui.py +57 -0
- synth_ai/cli/turso.py +1 -1
- synth_ai/cli/watch.py +1 -1
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/verilog/engine.py +76 -10
- synth_ai/judge_schemas.py +8 -8
- synth_ai/task/__init__.py +11 -1
- synth_ai/task/apps/__init__.py +1 -0
- synth_ai/task/config.py +257 -0
- synth_ai/task/contracts.py +15 -2
- synth_ai/task/rubrics/__init__.py +3 -0
- synth_ai/task/rubrics/loaders.py +22 -3
- synth_ai/task/rubrics/scoring.py +3 -0
- synth_ai/task/trace_correlation_helpers.py +315 -0
- synth_ai/task/validators.py +144 -0
- synth_ai/tracing_v3/abstractions.py +3 -3
- synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
- synth_ai/tracing_v3/session_tracer.py +16 -6
- synth_ai/tracing_v3/storage/base.py +29 -29
- synth_ai/tracing_v3/storage/config.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +8 -7
- synth_ai/tracing_v3/turso/native_manager.py +63 -40
- synth_ai/tracing_v3/utils.py +3 -3
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +906 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/METADATA +1 -1
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/RECORD +110 -71
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
examples/task_apps/verilog/task_app/grpo_verilog.py CHANGED

@@ -34,6 +34,7 @@ from synth_ai.task.contracts import (
 from synth_ai.task.datasets import TaskDatasetRegistry, TaskDatasetSpec
 from synth_ai.task.rubrics import load_rubric
 from synth_ai.task.server import ProxyConfig, RubricBundle, TaskAppConfig
+from synth_ai.task.validators import normalize_inference_url
 from synth_ai.task.tracing_utils import (
     build_tracer_factory,
     resolve_sft_output_dir,
@@ -45,7 +46,36 @@ from synth_ai.tracing_v3.session_tracer import SessionTracer
 logger = logging.getLogger(__name__)
 
 _HERE = Path(__file__).resolve()
-
+
+
+def _resolve_repo_root() -> Path:
+    """Find synth-ai repo root, checking env var and parent traversal."""
+    candidates: list[Path] = []
+    env_root = os.getenv("SYNTH_AI_REPO_ROOT")
+    if env_root:
+        candidates.append(Path(env_root).expanduser())
+
+    # Try Modal mount point
+    candidates.append(Path("/opt/synth_ai_repo"))
+
+    # Traverse up from current file
+    current = _HERE
+    for _ in range(6):
+        current = current.parent
+        candidates.append(current)
+        if (current / "synth_ai").is_dir() and (current / "examples").is_dir():
+            return current
+
+    # Return first existing candidate
+    for candidate in candidates:
+        if candidate.is_dir() and (candidate / "synth_ai").exists():
+            return candidate
+
+    # Fallback to current parent structure (may not work in Modal)
+    return _HERE.parent.parent.parent.parent
+
+
+REPO_ROOT = _resolve_repo_root()
 
 DATASET_SPEC = TaskDatasetSpec(
     id="verilog_eval_v2",
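The new `_resolve_repo_root` helper above decides where the checkout lives when the task app runs under Modal versus locally. Below is a small self-contained sketch of the same heuristic for experimentation; the marker directories (`synth_ai/`, `examples/`) and the Modal mount path come from the diff, while the wrapper function itself is illustrative:

```python
# Illustrative sketch of the repo-root heuristic (wrapper name and max_hops are assumptions).
import os
from pathlib import Path


def resolve_repo_root(start: Path, max_hops: int = 6) -> Path:
    """Walk upward from `start`, preferring SYNTH_AI_REPO_ROOT, then a Modal mount."""
    candidates: list[Path] = []
    env_root = os.getenv("SYNTH_AI_REPO_ROOT")
    if env_root:
        candidates.append(Path(env_root).expanduser())
    candidates.append(Path("/opt/synth_ai_repo"))  # Modal mount point used by the task app

    current = start
    for _ in range(max_hops):
        current = current.parent
        candidates.append(current)
        # A checkout root is recognized by sibling synth_ai/ and examples/ directories.
        if (current / "synth_ai").is_dir() and (current / "examples").is_dir():
            return current

    # Otherwise fall back to the first candidate that at least contains synth_ai/.
    for candidate in candidates:
        if candidate.is_dir() and (candidate / "synth_ai").exists():
            return candidate
    return start.parent


if __name__ == "__main__":
    print(resolve_repo_root(Path(__file__).resolve()))
```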
@@ -161,23 +191,6 @@ def _base_task_info(dataset: VerilogDataset) -> TaskInfo:
     )
 
 
-def _normalize_inference_url(url: str | None) -> str:
-    candidate = (url or DEFAULT_INFERENCE_URL).strip()
-    if not candidate:
-        candidate = DEFAULT_INFERENCE_URL
-    if candidate.endswith("/v1/chat/completions"):
-        return candidate
-    if candidate.endswith("/chat/completions"):
-        return candidate
-    if candidate.endswith("/v1"):
-        return f"{candidate.rstrip('/')}/chat/completions"
-    if candidate.endswith("/v1/"):
-        return f"{candidate.rstrip('/')}/chat/completions"
-    if candidate.endswith("/chat"):
-        return f"{candidate.rstrip('/')}/completions"
-    if candidate.endswith("/chat/"):
-        return f"{candidate.rstrip('/')}/completions"
-    return f"{candidate.rstrip('/')}/v1/chat/completions"
 
 
 def _format_file_previews(files: dict[str, str]) -> str:
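This hunk removes the module-private `_normalize_inference_url` in favor of the shared `normalize_inference_url` imported from `synth_ai.task.validators` in the first hunk. Below is a runnable sketch mirroring the deleted logic, handy for checking endpoint handling in isolation; the function name and default URL are placeholders, not the library's API:

```python
# Placeholder default; the real DEFAULT_INFERENCE_URL is defined elsewhere in the task app.
DEFAULT_URL = "https://example-inference.local"


def to_chat_completions_endpoint(url: str | None, default: str = DEFAULT_URL) -> str:
    """Coerce a base URL or partial endpoint into a full /v1/chat/completions endpoint,
    mirroring the branches of the deleted helper."""
    candidate = (url or default).strip() or default
    if candidate.endswith(("/v1/chat/completions", "/chat/completions")):
        return candidate
    if candidate.rstrip("/").endswith("/v1"):
        return f"{candidate.rstrip('/')}/chat/completions"
    if candidate.rstrip("/").endswith("/chat"):
        return f"{candidate.rstrip('/')}/completions"
    return f"{candidate.rstrip('/')}/v1/chat/completions"


assert to_chat_completions_endpoint("http://host:8000/v1") == "http://host:8000/v1/chat/completions"
assert to_chat_completions_endpoint("http://host:8000") == "http://host:8000/v1/chat/completions"
```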
@@ -336,7 +349,7 @@ class VerilogLLMAgent:
         max_tokens: int,
     ) -> None:
         self.instructions = instructions.strip()
-        self.inference_url = _normalize_inference_url(inference_url)
+        self.inference_url = normalize_inference_url(inference_url, default=DEFAULT_INFERENCE_URL)
         self.model = model or DEFAULT_MODEL
         self.temperature = temperature
         self.max_tokens = max_tokens
@@ -349,7 +362,16 @@ class VerilogLLMAgent:
             if not api_key:
                 raise RuntimeError("GROQ_API_KEY is not configured for Verilog inference.")
             self.headers["Authorization"] = f"Bearer {api_key.strip()}"
-
+        # If target is Synth backend (any deployment), use SYNTH_API_KEY
+        elif any(pattern in lowered for pattern in [
+            "synth-backend", "synth.run", "agent-learning",
+            "localhost:8000", "127.0.0.1:8000"
+        ]):
+            api_key = os.getenv("SYNTH_API_KEY")
+            if not api_key:
+                raise RuntimeError("SYNTH_API_KEY is not configured for Verilog inference with Synth backend.")
+            self.headers["Authorization"] = f"Bearer {api_key.strip()}"
+        elif "openai" in lowered or "api.openai.com" in lowered:
             api_key = os.getenv("OPENAI_API_KEY")
             if not api_key:
                 raise RuntimeError("OPENAI_API_KEY is not configured for Verilog inference.")
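The agent now picks an API key by matching substrings of the inference URL: Groq URLs use `GROQ_API_KEY`, Synth backend deployments (including localhost:8000) use `SYNTH_API_KEY`, and OpenAI URLs use `OPENAI_API_KEY`. A compact sketch of that dispatch, assuming only the patterns and env-var names visible in the diff:

```python
# Sketch of the URL-keyed auth dispatch; patterns and env-var names come from the diff,
# the helper itself is illustrative.
import os

SYNTH_PATTERNS = ("synth-backend", "synth.run", "agent-learning", "localhost:8000", "127.0.0.1:8000")


def resolve_api_key(inference_url: str) -> str:
    """Return the bearer token implied by the inference URL, or raise like the agent does."""
    lowered = inference_url.lower()
    if "groq" in lowered:
        var = "GROQ_API_KEY"
    elif any(pattern in lowered for pattern in SYNTH_PATTERNS):
        var = "SYNTH_API_KEY"
    elif "openai" in lowered or "api.openai.com" in lowered:
        var = "OPENAI_API_KEY"
    else:
        raise RuntimeError(f"No known provider for {inference_url!r}")
    api_key = os.getenv(var)
    if not api_key:
        raise RuntimeError(f"{var} is not configured for Verilog inference.")
    return api_key.strip()
```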
@@ -574,6 +596,21 @@ async def rollout_executor(
     total_reward = 0.0
     final_observation: dict[str, Any] | None = None
     truncated_due_to_limit = False
+
+    # Log episode start
+    problem_id = getattr(instance, "problem_id", "unknown")
+    logger.info("=" * 80)
+    logger.info(f"[EPISODE START] run_id={request.run_id}")
+    logger.info(f"  Problem ID: {problem_id}")
+    logger.info(f"  Policy: {policy_id}")
+    logger.info(f"  Model: {policy_model}")
+    logger.info(f"  Max steps: {max_steps}")
+    logger.info(f"  Temperature: {temperature}")
+    logger.info(f"  Max tokens: {max_tokens}")
+    if instructions:
+        instructions_preview = instructions[:150] + "..." if len(instructions) > 150 else instructions
+        logger.info(f"  Instructions: {instructions_preview}")
+    logger.info("=" * 80)
     code_dirty = False
     last_compile_success = False
     simulate_since_last_compile = False
@@ -648,7 +685,7 @@
                 and not code_dirty
             )
             if skip_env_step:
-                reward_last =
+                reward_last = 0.0  # No reward for blocked operations
                 total_reward += reward_last
                 current_observation = dict(current_observation)
                 current_observation["reward_last"] = reward_last
@@ -669,6 +706,23 @@
                 or current_observation.get("task_completed")
             )
             truncated_flag = bool(current_observation.get("truncated"))
+
+            # Log what the environment returned
+            print(f"\n{'='*80}")
+            print(f"[STEP {step_index}] TOOL CALL:")
+            print(f"  Tool: {env_call.tool}")
+            print(f"  Args: {env_call.args}")
+            print(f"\n[STEP {step_index}] ENVIRONMENT RESPONSE:")
+            print(f"  Reward: {reward_last:.4f} (cumulative: {total_reward:.4f})")
+            print(f"  Task completed: {step_observation.get('task_completed')}")
+            print(f"  Done: {done_flag} | Truncated: {truncated_flag}")
+            if 'compile_status' in step_observation and step_observation.get('compile_status'):
+                print(f"  Compile status:\n{step_observation.get('compile_status')}")
+            if 'simulate_status' in step_observation and step_observation.get('simulate_status'):
+                print(f"  Simulate status:\n{step_observation.get('simulate_status')}")
+            if 'files' in step_observation:
+                print(f"  Files: {list(step_observation.get('files', {}).keys())}")
+            print(f"{'='*80}\n")
 
             executed_tool_name = str(primary_call["tool"])
             normalized_executed_tool = executed_tool_name.strip().lower()
@@ -698,10 +752,40 @@
                 {"tool_name": call["tool"], "arguments": call["args"]}
                 for call in tool_calls
             ]
+
+            # Print tool calls for debugging
+            logger.info(f"[STEP {step_index}] Tool calls executed:")
+            for call in tool_calls:
+                tool_name = call["tool"]
+                args = call["args"]
+                # Truncate long arguments for readability
+                if "code" in args or "content" in args:
+                    args_preview = {k: (v[:100] + "..." if isinstance(v, str) and len(v) > 100 else v)
+                                    for k, v in args.items()}
+                else:
+                    args_preview = args
+                logger.info(f"  └─ {tool_name}({args_preview})")
+
+            # Log reward details for debugging
+            logger.info(f"[STEP {step_index}] Reward details:")
+            logger.info(f"  └─ reward_last: {reward_last:.4f}")
+            logger.info(f"  └─ total_reward: {total_reward:.4f}")
+            logger.info(f"  └─ skip_env_step: {skip_env_step}")
+            if not skip_env_step:
+                logger.info(f"  └─ obs.task_completed: {current_observation.get('task_completed', False)}")
+                logger.info(f"  └─ obs.compile_status: {current_observation.get('compile_status', 'N/A')}")
+                logger.info(f"  └─ obs.simulate_status: {current_observation.get('simulate_status', 'N/A')}")
+                logger.info(f"  └─ obs.terminated: {current_observation.get('terminated', False)}")
+            else:
+                logger.info(f"  └─ (blocked operation - no env step)")
+
             step_info = {
                 "assistant_message": assistant_text,
                 "model_response": raw_response,
                 "llm_request": request_payload,
+                "meta": {
+                    "inference_url": policy_config.get("inference_url") or resolved_inference,  # CRITICAL: Required by RL trainer for trace extraction (must have ?cid=...)
+                },
             }
             if override_info:
                 step_info["auto_override"] = override_info
@@ -756,6 +840,9 @@
                 "model_response": raw_response,
                 "llm_request": request_payload,
                 "error": error_text,
+                "meta": {
+                    "inference_url": policy_config.get("inference_url") or resolved_inference,  # CRITICAL: Required by RL trainer
+                },
             }
             steps.append(
                 RolloutStep(
@@ -797,6 +884,25 @@
         },
     )
 
+    # Extract inference_url from policy config (REQUIRED for RL trace correlation)
+    # The trainer injects this with ?cid=trace_xxxxx parameter for trace linking
+    final_inference_url = policy_config.get("inference_url")
+    if not isinstance(final_inference_url, str) or not final_inference_url.strip():
+        # Fallback to agent's inference_url if not in policy config
+        final_inference_url = agent.inference_url
+        logger.warning(
+            "VERILOG_ROLLOUT: inference_url not found in policy_config, using agent.inference_url run_id=%s url=%s",
+            request.run_id,
+            final_inference_url,
+        )
+    else:
+        logger.info(
+            "VERILOG_ROLLOUT: using inference_url from policy_config run_id=%s url=%s has_cid=%s",
+            request.run_id,
+            final_inference_url,
+            "?cid=" in final_inference_url,
+        )
+
    trajectory = RolloutTrajectory(
        env_id=str(env_id),
        policy_id=str(policy_id),
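The comments above stress that the trainer appends a correlation id to the inference URL as a `?cid=trace_...` query parameter. A minimal sketch of recovering that id with the standard library, assuming the query-parameter convention shown in the logs:

```python
# Recovering the ?cid=... correlation id referenced above (convention assumed from the diff).
from urllib.parse import parse_qs, urlparse


def extract_cid(inference_url: str) -> str | None:
    """Return the `cid` query parameter, or None when no correlation id is present."""
    values = parse_qs(urlparse(inference_url).query).get("cid")
    return values[0] if values else None


assert extract_cid("https://infer.example/v1/chat/completions?cid=trace_abc123") == "trace_abc123"
assert extract_cid("https://infer.example/v1/chat/completions") is None
```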
@@ -810,11 +916,11 @@
                 "total_reward": final_total_reward,
                 "task_completed": bool(final_observation.get("task_completed")),
                 "policy_model": policy_model,
-                "inference_url":
+                "inference_url": final_inference_url,
             },
         },
         length=len(steps),
-        inference_url=
+        inference_url=final_inference_url,  # CRITICAL: Must contain ?cid=... for trace correlation
         decision_samples=None,
     )
 
@@ -836,6 +942,133 @@
         }
     }
 
+    # Build pipeline_metadata (required for RL training)
+    pipeline_metadata = {
+        "reward_score": final_total_reward,
+        "policy_id": policy_id,
+        "inference_url": final_inference_url,  # CRITICAL: Must be at top level for RL trainer (expects ?cid=...)
+        "inference": {
+            "provider": "groq",
+            "model": policy_model,
+            "url": final_inference_url,  # Use final_inference_url (has ?cid=...)
+        },
+        "env_name": env_id,
+        "task_id": getattr(instance, "problem_id", None),
+        "task_split": getattr(instance, "split", "val"),
+    }
+
+    # Log episode summary with reward breakdown
+    compile_status = final_observation.get("compile_status", "N/A")
+    simulate_status = final_observation.get("simulate_status", "N/A")
+    task_completed = bool(final_observation.get("task_completed", False))
+
+    logger.info("=" * 80)
+    logger.info(f"[EPISODE COMPLETE] run_id={request.run_id}")
+    logger.info(f"  Steps taken: {len(steps)}")
+    logger.info(f"  Total reward: {final_total_reward:.3f}")
+    logger.info(f"  Task completed: {task_completed}")
+    logger.info(f"  Compile status: {compile_status}")
+    logger.info(f"  Simulate status: {simulate_status}")
+    logger.info(f"  Done/Truncated: {final_done}/{final_truncated}")
+    logger.info(f"  Problem ID: {getattr(instance, 'problem_id', 'N/A')}")
+
+    # DEBUG: Log each step's reward for RL debugging
+    print(f"\n[REWARD DEBUG] Step-by-step breakdown:")
+    for idx, step in enumerate(steps):
+        print(f"  Step {idx}: reward={step.reward:.4f} tool_calls={[tc.get('tool_name') for tc in step.tool_calls]}")
+    print(f"[REWARD DEBUG] Final observation keys: {list(final_observation.keys())}")
+    print(f"[REWARD DEBUG] Final obs total_reward: {final_observation.get('total_reward')}")
+    print(f"[REWARD DEBUG] Metrics outcome_score: {metrics.outcome_score}")
+    print(f"[REWARD DEBUG] Metrics mean_return: {metrics.mean_return}")
+
+    # Reward breakdown for debugging
+    logger.info("\n[REWARD BREAKDOWN]")
+    compile_count = sum(1 for s in steps if any(tc.get("tool_name") == "compile" for tc in s.tool_calls))
+    simulate_count = sum(1 for s in steps if any(tc.get("tool_name") == "simulate" for tc in s.tool_calls))
+    submit_count = sum(1 for s in steps if any(tc.get("tool_name") == "submit" for tc in s.tool_calls))
+    write_count = sum(1 for s in steps if any(tc.get("tool_name") == "write_file" for tc in s.tool_calls))
+
+    logger.info(f"  Tool usage: write_file={write_count}, compile={compile_count}, simulate={simulate_count}, submit={submit_count}")
+
+    # Show per-step rewards
+    step_rewards = [s.reward for s in steps]
+    nonzero_rewards = [r for r in step_rewards if r != 0.0]
+    logger.info(f"  Step rewards: {step_rewards}")
+    if nonzero_rewards:
+        logger.info(f"  Non-zero rewards: {nonzero_rewards}")
+    else:
+        logger.info(f"  ⚠️ ALL REWARDS ZERO! Possible reasons:")
+        logger.info(f"    - No successful compiles (compile reward = 0.01)")
+        logger.info(f"    - No successful simulations (simulate reward = 0.1)")
+        logger.info(f"    - No successful submits (submit reward = 1.0)")
+        logger.info(f"    - Check if task_completed={task_completed}")
+        logger.info(f"    - Check compile_status='{compile_status}'")
+        logger.info(f"    - Check simulate_status='{simulate_status}'")
+    logger.info("=" * 80)
+
+    # Log for debugging RL training
+    logger.info(
+        "VERILOG_ROLLOUT: pipeline_metadata run_id=%s reward=%.3f inference_url=%s",
+        request.run_id,
+        final_total_reward,
+        final_inference_url,
+    )
+
+    # DEBUG: Log what we're returning to the RL trainer
+    print(f"\n[RETURN DEBUG] Trajectory structure being returned:")
+    print(f"  trajectory.steps count: {len(steps)}")
+    print(f"  trajectory.final.reward: {trajectory.final.get('reward') if trajectory.final else 'None'}")
+    print(f"  trajectory.length: {trajectory.length}")
+    print(f"  metrics.outcome_score: {metrics.outcome_score}")
+    print(f"  metrics.mean_return: {metrics.mean_return}")
+    print(f"  metrics.episode_returns: {metrics.episode_returns}")
+    print(f"  pipeline_metadata.reward_score: {pipeline_metadata.get('reward_score')}")
+
+    # ASSERTIONS: Validate RL-required fields before returning
+    # These catch structural issues early (before they reach the backend trainer)
+    # Only enforce for RL mode, not EVAL mode
+    is_rl_mode = hasattr(request, 'mode') and str(getattr(request, 'mode', '')).lower() == 'rl'
+
+    assert isinstance(pipeline_metadata, dict), (
+        f"VERILOG_ROLLOUT_VALIDATION: pipeline_metadata must be dict, got {type(pipeline_metadata).__name__}"
+    )
+    assert "inference_url" in pipeline_metadata, (
+        f"VERILOG_ROLLOUT_VALIDATION: pipeline_metadata missing 'inference_url' (REQUIRED for RL training)"
+    )
+    assert isinstance(pipeline_metadata["inference_url"], str), (
+        f"VERILOG_ROLLOUT_VALIDATION: pipeline_metadata['inference_url'] must be string, got {type(pipeline_metadata['inference_url']).__name__}"
+    )
+    # Only require ?cid= for RL mode (not needed for EVAL)
+    if is_rl_mode:
+        assert "?cid=" in pipeline_metadata["inference_url"], (
+            f"VERILOG_ROLLOUT_VALIDATION: pipeline_metadata['inference_url'] must contain '?cid=' for trace correlation in RL mode. "
+            f"Got: {pipeline_metadata['inference_url'][:100]}"
+        )
+
+    # Validate each step has meta.inference_url (backend expects this nested structure)
+    for step_idx, step in enumerate(steps):
+        step_dict = step if isinstance(step, dict) else (step.model_dump() if hasattr(step, "model_dump") else {})
+        step_info = step_dict.get("info", {})
+        assert isinstance(step_info, dict), (
+            f"VERILOG_ROLLOUT_VALIDATION: step[{step_idx}].info must be dict, got {type(step_info).__name__}"
+        )
+        step_meta = step_info.get("meta", {})
+        assert isinstance(step_meta, dict), (
+            f"VERILOG_ROLLOUT_VALIDATION: step[{step_idx}].info.meta must be dict, got {type(step_meta).__name__}"
+        )
+        assert "inference_url" in step_meta, (
+            f"VERILOG_ROLLOUT_VALIDATION: step[{step_idx}].info.meta missing 'inference_url' (REQUIRED for RL training)"
+        )
+        assert isinstance(step_meta["inference_url"], str), (
+            f"VERILOG_ROLLOUT_VALIDATION: step[{step_idx}].info.meta['inference_url'] must be string, got {type(step_meta['inference_url']).__name__}"
+        )
+
+    logger.info(
+        "VERILOG_ROLLOUT_VALIDATION: ✓ All RL-required fields present run_id=%s steps=%d",
+        request.run_id,
+        len(steps),
+    )
+
     return RolloutResponse(
         run_id=request.run_id,
         trajectories=[trajectory],
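The assertions above effectively define a contract for `pipeline_metadata`. The following is a hypothetical payload that would pass them in RL mode; every value is illustrative:

```python
# Hypothetical payload that satisfies the validation above in RL mode (all values illustrative).
pipeline_metadata = {
    "reward_score": 1.11,
    "policy_id": "verilog-policy-0",
    "inference_url": "https://infer.example/v1/chat/completions?cid=trace_abc123",
    "inference": {
        "provider": "groq",
        "model": "example-model",
        "url": "https://infer.example/v1/chat/completions?cid=trace_abc123",
    },
    "env_name": "verilog",
    "task_id": "problem_001",
    "task_split": "val",
}

assert isinstance(pipeline_metadata["inference_url"], str)
assert "?cid=" in pipeline_metadata["inference_url"]  # enforced only in RL mode
```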
@@ -844,6 +1077,7 @@
         aborted=False,
         ops_executed=len(steps),
         trace=trace_payload,
+        pipeline_metadata=pipeline_metadata,
     )
 
 
@@ -917,6 +1151,7 @@ register_task_app(
         "python-dotenv>=1.0.1",
         "datasets>=2.10.0",
     ),
+    apt_packages=("iverilog",),  # Icarus Verilog compiler and simulator (provides iverilog and vvp)
     extra_local_dirs=(
         (str(REPO_ROOT), "/opt/synth_ai_repo"),
         (str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai"),
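`apt_packages=("iverilog",)` installs Icarus Verilog into the task app image, supplying both the `iverilog` compiler and the `vvp` runtime the environment needs. As a rough illustration of how an engine might drive that toolchain (the real logic lives in `synth_ai/environments/examples/verilog/engine.py`, also changed in this release; this helper is an assumption-level sketch):

```python
# Sketch: compiling and simulating with the Icarus Verilog toolchain the image now installs.
# Paths and file names are hypothetical; the real logic lives in the verilog engine module.
import subprocess
import tempfile
from pathlib import Path


def compile_and_simulate(sources: dict[str, str]) -> tuple[bool, str]:
    """Compile all Verilog sources with iverilog, then run the result under vvp."""
    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        for name, text in sources.items():
            (workdir / name).write_text(text)
        out = workdir / "sim.out"
        compile_proc = subprocess.run(
            ["iverilog", "-o", str(out), *[str(workdir / n) for n in sources]],
            capture_output=True, text=True,
        )
        if compile_proc.returncode != 0:
            return False, compile_proc.stderr  # compile errors end the attempt early
        sim_proc = subprocess.run(["vvp", str(out)], capture_output=True, text=True)
        return sim_proc.returncode == 0, sim_proc.stdout + sim_proc.stderr
```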
@@ -47,8 +47,10 @@ async def run(args: argparse.Namespace) -> None:
 
     inference_url = args.inference_url or f"{args.base_url.rstrip('/')}/proxy/groq"
 
+    from synth_ai.task.contracts import RolloutMode
     request = RolloutRequest(
         run_id=args.run_id,
+        mode=RolloutMode.EVAL,
         env=RolloutEnvSpec(env_name="crafter", seed=args.seed, config={"seed": args.seed}),
         policy=RolloutPolicySpec(
             policy_name="groq-smoke",
@@ -42,8 +42,10 @@ def build_rollout_request(
         trace_format=trace_format,
         return_trace=return_trace,
     )
+    from synth_ai.task.contracts import RolloutMode
     return RolloutRequest(
         run_id=run_id,
+        mode=RolloutMode.EVAL,
         env=RolloutEnvSpec(env_name="crafter", seed=seed, config={}),
         policy=RolloutPolicySpec(policy_name="crafter-react", config=policy_config),
         ops=ops,
@@ -33,12 +33,14 @@ def build_rollout_request(
             "Authorization": f"Bearer {api_key}",
         },
     }
+    from synth_ai.task.contracts import RolloutMode
     return RolloutRequest(
         run_id=run_id,
         env=RolloutEnvSpec(env_name="crafter", seed=seed, config={}),
         policy=RolloutPolicySpec(policy_name="crafter-react", config=policy_config),
         ops=ops,
         record=RolloutRecordConfig(trajectories=True),
+        mode=RolloutMode.EVAL,
         on_done="reset",
         safety=RolloutSafetyConfig(),
     )
@@ -46,12 +46,14 @@ def build_rollout_request(
         trace_format=trace_format,
         return_trace=return_trace,
     )
+    from synth_ai.task.contracts import RolloutMode
     return RolloutRequest(
         run_id=run_id,
         env=RolloutEnvSpec(env_name="crafter", seed=seed, config={}),
         policy=RolloutPolicySpec(policy_name="crafter-react", config=policy_config),
         ops=ops,
         record=record_cfg,
+        mode=RolloutMode.EVAL,
         on_done="reset",
         safety=RolloutSafetyConfig(),
     )
@@ -53,12 +53,14 @@ def build_rollout_request(
         trace_format=trace_format,
     )
 
+    from synth_ai.task.contracts import RolloutMode
     return RolloutRequest(
         run_id=run_id,
         env=RolloutEnvSpec(env_name="crafter", seed=seed, config={}),
         policy=RolloutPolicySpec(policy_name="crafter-react", config=policy_config),
         ops=ops,
         record=record,
+        mode=RolloutMode.EVAL,
         on_done="reset",
         safety=RolloutSafetyConfig(),
     )
@@ -60,12 +60,14 @@ def build_request(
     for _ in range(max(llm_calls, 1)):
         ops.extend(["agent", "env"])
 
+    from synth_ai.task.contracts import RolloutMode
     return RolloutRequest(
         run_id=run_id,
         env=RolloutEnvSpec(env_name="crafter", seed=seed, config={}),
         policy=RolloutPolicySpec(policy_name="crafter-react", config=policy_config),
         ops=ops,
         record=RolloutRecordConfig(trajectories=True),
+        mode=RolloutMode.EVAL,
         on_done="reset",
         safety=RolloutSafetyConfig(),
     )
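Each of the six example scripts above gains the same two lines: a local import of `RolloutMode` and an explicit `mode=RolloutMode.EVAL` on the request. A minimal sketch of the resulting request shape, using only contract names that appear in these hunks; field values are illustrative:

```python
# Minimal sketch of the pattern added above; field values are illustrative.
from synth_ai.task.contracts import (
    RolloutEnvSpec,
    RolloutMode,
    RolloutPolicySpec,
    RolloutRecordConfig,
    RolloutRequest,
    RolloutSafetyConfig,
)

request = RolloutRequest(
    run_id="smoke-test-001",
    mode=RolloutMode.EVAL,  # explicit EVAL mode; RL mode additionally expects ?cid= trace URLs
    env=RolloutEnvSpec(env_name="crafter", seed=0, config={}),
    policy=RolloutPolicySpec(policy_name="crafter-react", config={}),
    ops=["agent", "env"],
    record=RolloutRecordConfig(trajectories=True),
    on_done="reset",
    safety=RolloutSafetyConfig(),
)
```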
synth_ai/api/models/supported.py CHANGED
synth_ai/cli/__init__.py CHANGED
@@ -22,26 +22,26 @@ except Exception:
     pass
 
 try:
-    from ._typer_patch import patch_typer_make_metavar
+    from synth_ai.cli._typer_patch import patch_typer_make_metavar
 
     patch_typer_make_metavar()
 except Exception:
     pass
 
 
-from .root import cli  # new canonical CLI entrypoint
+from synth_ai.cli.root import cli  # new canonical CLI entrypoint
 
 # Register subcommands from this package onto the group
 # Deprecated/legacy commands intentionally not registered: watch/experiments, balance, calc,
 # man, recent, status, traces
 try:
-    from . import demo as _demo
+    from synth_ai.cli import demo as _demo
 
     _demo.register(cli)
 except Exception:
     pass
 try:
-    from . import turso as _turso
+    from synth_ai.cli import turso as _turso
 
     _turso.register(cli)
 except Exception:
@@ -54,20 +54,53 @@ except Exception:
     pass
 
 
-
-
-cli.
+# Import task_app_group conditionally
+try:
+    from synth_ai.cli.task_apps import task_app_group
+    cli.add_command(task_app_group, name="task-app")
+except Exception:
+    # Task app functionality not available
+    pass
 
 
 try:
-
+    # Make task_apps import more robust to handle missing optional dependencies
+    import importlib
+    task_apps_module = importlib.import_module('synth_ai.cli.task_apps')
+    task_apps_module.register(cli)
+except (ImportError, ModuleNotFoundError, TypeError, RuntimeError) as e:
+    # Task apps module not available (missing optional dependencies)
+    # This is expected - silently skip
+    pass
 
-
+# Register TUI command - make import completely isolated
+def _register_tui_command():
+    """Register TUI command only when called, not during CLI startup."""
+    try:
+        # Import TUI only when the command is actually used
+        from synth_ai.cli.tui import register as tui_register
+        tui_register(cli)
+    except Exception:
+        # TUI not available - this is expected if dependencies are missing
+        pass
+
+# Add TUI command as a lazy-registered command
+try:
+    # Try to import and register immediately for normal cases
+    from synth_ai.cli.tui import register as tui_register
+    tui_register(cli)
 except Exception:
+    # If that fails, add a lazy registration that will only happen when called
+    # For now, just skip - the command won't be available but CLI won't crash
     pass
 
-
-
-
-cli.add_command(task_app_group.commands["
+# Add task app commands if available
+try:
+    if 'task_app_group' in locals() and hasattr(task_app_group, 'commands'):
+        cli.add_command(task_app_group.commands["serve"], name="serve")
+        cli.add_command(task_app_group.commands["deploy"], name="deploy")
+        cli.add_command(task_app_group.commands["modal-serve"], name="modal-serve")
+except Exception:
+    # Task app commands not available
+    pass
 # Top-level 'info' alias removed; use `synth-ai task-app info` instead
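The rewritten `synth_ai/cli/__init__.py` wraps every optional registration in try/except so that a missing extra cannot break the base CLI. A generic sketch of that pattern; `click` is assumed as the underlying framework since the module calls `cli.add_command(...)`, and the module paths here are examples:

```python
# Generic sketch of the defensive-registration pattern; click is assumed, paths are examples.
import importlib

import click


@click.group()
def cli() -> None:
    """Root command group."""


def register_optional(module_path: str) -> None:
    """Import a subcommand module and let it register itself; skip quietly if unavailable."""
    try:
        module = importlib.import_module(module_path)
        module.register(cli)
    except Exception:
        # Optional dependency missing or registration failed: keep the base CLI usable.
        pass


register_optional("synth_ai.cli.turso")
register_optional("synth_ai.cli.task_apps")
```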
synth_ai/cli/_modal_wrapper.py CHANGED
@@ -6,7 +6,7 @@ import sys
 def main() -> int:
     # Apply Typer compatibility patch before Modal CLI bootstraps Click/Typer internals.
     try:
-        from ._typer_patch import patch_typer_make_metavar
+        from synth_ai.cli._typer_patch import patch_typer_make_metavar
 
         patch_typer_make_metavar()
     except Exception:
@@ -20,7 +20,8 @@ def main() -> int:
     else:
         sys.argv = ["modal"]
 
-    return modal_main()
+    result = modal_main()
+    return result if result is not None else 0
 
 
 if __name__ == "__main__":
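The wrapper change normalizes the exit code because a wrapped CLI main may return None on success while `main() -> int` promises an int. A tiny sketch of the pattern; `run_wrapped_cli` is a hypothetical stand-in for `modal_main`:

```python
# Sketch of the exit-code normalization; run_wrapped_cli is a hypothetical stand-in for modal_main.
import sys


def run_wrapped_cli() -> int | None:
    return None  # many CLI mains return None on success


def main() -> int:
    result = run_wrapped_cli()
    return result if result is not None else 0


if __name__ == "__main__":
    sys.exit(main())
```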
synth_ai/cli/recent.py CHANGED
synth_ai/cli/status.py CHANGED