PyPI - synth-ai - Versions diffs - 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl - Mend

synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic. Click here for more details.

Files changed (110)

examples/multi_step/configs/README_verilog_rl.md +77 -0
examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +5 -4
examples/multi_step/configs/crafter_synth_backend.md +40 -0
examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
examples/multi_step/configs/verilog_rl_lora.toml +190 -0
examples/multi_step/judges/crafter_backend_judge.py +220 -0
examples/multi_step/judges/verilog_backend_judge.py +234 -0
examples/multi_step/readme.md +48 -0
examples/multi_step/verilog_rl_lora.md +218 -0
examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
examples/sft/evaluate.py +2 -0
examples/sft/generate_traces.py +2 -0
examples/swe/task_app/grpo_swe_mini.py +1 -0
examples/swe/task_app/hosted/rollout.py +2 -0
examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
examples/task_apps/crafter/task_app/__init__.py +3 -0
examples/task_apps/crafter/task_app/grpo_crafter.py +306 -8
examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +16 -3
examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +52 -1
examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +111 -13
examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
examples/task_apps/enron/filter_sft.toml +5 -0
examples/task_apps/enron/tests/__init__.py +2 -0
examples/task_apps/enron/tests/integration/__init__.py +2 -0
examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
examples/task_apps/enron/tests/unit/__init__.py +2 -0
examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
examples/task_apps/pokemon_red/task_app.py +199 -6
examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
examples/task_apps/sokoban/filter_sft.toml +5 -0
examples/task_apps/sokoban/tests/__init__.py +2 -0
examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
examples/task_apps/verilog/filter_sft.toml +5 -0
examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
examples/task_apps/verilog/tests/__init__.py +2 -0
examples/task_apps/verilog/tests/integration/__init__.py +2 -0
examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
examples/task_apps/verilog/tests/unit/__init__.py +2 -0
examples/warming_up_to_rl/groq_test.py +2 -0
examples/warming_up_to_rl/run_local_rollout.py +2 -0
examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
examples/warming_up_to_rl/run_rollout_remote.py +2 -0
synth_ai/api/models/supported.py +1 -0
synth_ai/cli/__init__.py +46 -13
synth_ai/cli/_modal_wrapper.py +3 -2
synth_ai/cli/recent.py +1 -1
synth_ai/cli/status.py +1 -1
synth_ai/cli/task_apps.py +354 -143
synth_ai/cli/traces.py +1 -1
synth_ai/cli/tui.py +57 -0
synth_ai/cli/turso.py +1 -1
synth_ai/cli/watch.py +1 -1
synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
synth_ai/environments/examples/crafter_classic/environment.py +1 -1
synth_ai/environments/examples/verilog/engine.py +76 -10
synth_ai/judge_schemas.py +8 -8
synth_ai/task/__init__.py +11 -1
synth_ai/task/apps/__init__.py +1 -0
synth_ai/task/config.py +257 -0
synth_ai/task/contracts.py +15 -2
synth_ai/task/rubrics/__init__.py +3 -0
synth_ai/task/rubrics/loaders.py +22 -3
synth_ai/task/rubrics/scoring.py +3 -0
synth_ai/task/trace_correlation_helpers.py +315 -0
synth_ai/task/validators.py +144 -0
synth_ai/tracing_v3/abstractions.py +3 -3
synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
synth_ai/tracing_v3/session_tracer.py +16 -6
synth_ai/tracing_v3/storage/base.py +29 -29
synth_ai/tracing_v3/storage/config.py +3 -3
synth_ai/tracing_v3/turso/daemon.py +8 -7
synth_ai/tracing_v3/turso/native_manager.py +63 -40
synth_ai/tracing_v3/utils.py +3 -3
synth_ai/tui/__init__.py +5 -0
synth_ai/tui/__main__.py +13 -0
synth_ai/tui/cli/__init__.py +1 -0
synth_ai/tui/cli/query_experiments.py +164 -0
synth_ai/tui/cli/query_experiments_v3.py +164 -0
synth_ai/tui/dashboard.py +906 -0
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/METADATA +1 -1
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/RECORD +110 -71
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
{synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0

examples/task_apps/pokemon_red/task_app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import logging
 from typing import Any, Dict, Iterable, Mapping, Sequence
 from fastapi import HTTPException, Request
@@ -21,6 +22,15 @@ from synth_ai.task.contracts import (
     TaskInfo,
 )
 from synth_ai.task.server import ProxyConfig, TaskAppConfig
+from synth_ai.task.tracing_utils import (
+    build_tracer_factory,
+    resolve_sft_output_dir,
+    resolve_tracing_db_url,
+    tracing_env_enabled,
+)
+from synth_ai.tracing_v3.session_tracer import SessionTracer
+logger = logging.getLogger(__name__)
 def _base_task_info() -> TaskInfo:
@@ -182,7 +192,70 @@ def _calculate_outcome_score(final_state: dict[str, Any], total_reward: float) -
 async def rollout_executor(request: RolloutRequest, fastapi_request: Request) -> RolloutResponse:
+    # Initialize SessionTracer for this rollout
+    tracer_factory = getattr(fastapi_request.app.state, "session_tracer_factory", None)
+    tracer_instance: SessionTracer | None = None
+    if callable(tracer_factory):
+        try:
+            inst = tracer_factory()
+            tracer_instance = inst if isinstance(inst, SessionTracer) else None
+        except Exception as exc:
+            logger.debug(f"TRACER_FACTORY_FAIL: {exc}")
+    # Start tracing session
+    if tracer_instance is not None:
+        try:
+            await tracer_instance.initialize()
+            await tracer_instance.start_session(
+                session_id=request.run_id,
+                metadata={
+                    "run_id": request.run_id,
+                    "env_name": "pokemon_red",
+                    "policy_name": request.policy.policy_name or "default",
+                    "seed": request.env.seed,
+                }
+            )
+            logger.info(f"[pokemon_red] tracing enabled for run_id={request.run_id}")
+        except Exception as exc:
+            logger.warning(f"[pokemon_red] tracing init failed: {exc}")
+            tracer_instance = None
     async def _call_inference(policy_cfg: Mapping[str, Any], observation: Mapping[str, Any]) -> Mapping[str, Any]:
+        # Check if vision mode is enabled
+        use_vision = bool(policy_cfg.get("use_vision", False))
+        image_only_mode = bool(policy_cfg.get("image_only_mode", False))
+        # Build user message content
+        if use_vision and "observation_image_data_url" in observation:
+            # Extract image data URL
+            image_data_url = observation["observation_image_data_url"]
+            # Build state summary (text observation)
+            state_summary = "State summary: " + str({
+                k: observation.get(k)
+                for k in observation.keys()
+                if k not in ["error", "observation_image_base64", "observation_image_data_url",
+                            "observation_image_format", "observation_image_width", "observation_image_height"]
+            })
+            # Image-only mode: only send image, no text
+            if image_only_mode:
+                user_content = [
+                    {"type": "image_url", "image_url": {"url": image_data_url}}
+                ]
+            else:
+                # Vision mode with text: send both text and image
+                user_content = [
+                    {"type": "text", "text": state_summary},
+                    {"type": "image_url", "image_url": {"url": image_data_url}}
+                ]
+        else:
+            # Text-only mode (default)
+            state_summary = "State summary: " + str({
+                k: observation.get(k) for k in observation.keys() if k != "error"
+            })
+            user_content = state_summary
         messages = [
             {
                 "role": "system",
@@ -193,9 +266,7 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
             },
             {
                 "role": "user",
-                "content": (
-                    "State summary: " + str({k: observation.get(k) for k in observation.keys() if k != "error"})
-                ),
+                "content": user_content,
             },
         ]
         payload = {
@@ -262,6 +333,10 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
             "max_tokens": int(policy_cfg.get("max_tokens") or 500),
         }
         inference_url = str(policy_cfg.get("inference_url") or "").rstrip("/")
+        # Determine if this is an external URL or internal proxy
+        is_external = inference_url.startswith("http://") or inference_url.startswith("https://")
         if not inference_url:
             # Prefer built-in proxy endpoints from app if no external URL
             provider = (policy_cfg.get("provider") or "").lower()
@@ -269,8 +344,31 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
                 inference_url = "/proxy/groq/v1/chat/completions"
             else:
                 inference_url = "/proxy/v1/chat/completions"
-        async with httpx.AsyncClient(base_url="http://127.0.0.1:" + str(fastapi_request.url.port or 8913), timeout=httpx.Timeout(60.0)) as client:  # best-effort
-            resp = await client.post(inference_url, json=payload)
+            is_external = False
+        elif is_external:
+            # Add /v1/chat/completions if using OpenAI directly
+            if "api.openai.com" in inference_url and not inference_url.endswith("/chat/completions"):
+                inference_url = inference_url + "/v1/chat/completions"
+        if is_external:
+            # External API: use direct HTTP client with auth header
+            headers = {}
+            if "api.openai.com" in inference_url:
+                import os
+                api_key = os.getenv("OPENAI_API_KEY")
+                if api_key:
+                    headers["Authorization"] = f"Bearer {api_key}"
+            async with httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client:
+                resp = await client.post(inference_url, json=payload, headers=headers)
+        else:
+            # Internal proxy: use local base_url
+            async with httpx.AsyncClient(
+                base_url="http://127.0.0.1:" + str(fastapi_request.url.port or 8913),
+                timeout=httpx.Timeout(60.0)
+            ) as client:
+                resp = await client.post(inference_url, json=payload)
         resp.raise_for_status()
         data = resp.json()
         # Extract first tool call
@@ -555,6 +653,72 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
         inference_url=inference_url,  # NEW: Required for trace correlation
     )
+    # Record outcome rewards and end session
+    trace_payload = None
+    if tracer_instance is not None:
+        try:
+            # Count achievements (milestones)
+            achievements_count = len(milestone_events)
+            # Build metadata with all relevant info
+            reward_metadata = {
+                "run_id": request.run_id,
+                "env_name": "pokemon_red",
+                "final_map": final_state.get("map_id", -1),
+                "party_count": final_state.get("party_count", 0),
+                "badges": final_state.get("badges", 0),
+                "steps": len(steps),
+                "milestone_events": milestone_events,
+                "reward_components": all_reward_components,
+            }
+            # Record outcome reward to Turso
+            await tracer_instance.record_outcome_reward(
+                total_reward=int(total_reward),
+                achievements_count=achievements_count,
+                total_steps=len(steps),
+                reward_metadata=reward_metadata,
+            )
+            logger.info(f"[pokemon_red] recorded outcome: reward={total_reward}, achievements={achievements_count}")
+            # End session and get trace
+            session_trace = await tracer_instance.end_session()
+            # Build trace payload if requested
+            record_config = getattr(request, 'record', None)
+            if record_config and getattr(record_config, 'return_trace', False) and session_trace:
+                trace_payload = {
+                    "session_id": session_trace.session_id,
+                    "created_at": session_trace.created_at.isoformat() if session_trace.created_at else None,
+                    "metadata": dict(session_trace.metadata or {}),
+                    "num_timesteps": session_trace.num_timesteps,
+                    "num_events": session_trace.num_events,
+                    "num_messages": session_trace.num_messages,
+                }
+        except Exception as exc:
+            logger.warning(f"[pokemon_red] tracing finalization failed: {exc}")
+    # Fallback trace payload if no tracer but CLI needs it
+    if trace_payload is None:
+        record_config = getattr(request, 'record', None)
+        if record_config and getattr(record_config, 'return_trace', False):
+            trace_payload = {
+                "session_id": request.run_id,
+                "created_at": import_datetime().now().isoformat(),
+                "metadata": {
+                    "run_id": request.run_id,
+                    "env_name": "pokemon_red",
+                    "total_reward": int(total_reward),
+                    "final_map": final_state.get("map_id", -1),
+                    "party_count": final_state.get("party_count", 0),
+                    "badges": final_state.get("badges", 0),
+                    "steps": len(steps),
+                },
+                "num_timesteps": len(steps),
+                "num_events": len(steps),
+                "num_messages": len(steps) * 2,
+            }
     return RolloutResponse(
         run_id=request.run_id,
         trajectories=[trajectory],
@@ -562,11 +726,40 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
         metrics=metrics,
         aborted=False,
         ops_executed=len(request.ops or []),
+        trace=trace_payload,
     )
+def import_datetime():
+    """Helper to import datetime for trace timestamps."""
+    from datetime import datetime
+    return datetime
 def build_config() -> TaskAppConfig:
     base_info = _base_task_info()
+    # Set up tracing
+    tracing_enabled = tracing_env_enabled()
+    tracing_db_url = resolve_tracing_db_url()
+    tracer_factory = build_tracer_factory(
+        SessionTracer, enabled=tracing_enabled, db_url=tracing_db_url
+    )
+    sft_output_dir = resolve_sft_output_dir()
+    app_state: dict[str, Any] = {
+        "tracing_enabled": tracing_enabled,
+    }
+    if tracer_factory is not None:
+        app_state["session_tracer_factory"] = tracer_factory
+    if sft_output_dir:
+        app_state["sft_output_dir"] = sft_output_dir
+    if tracing_enabled:
+        status_msg = f"[task:tracing] enabled (db={tracing_db_url or 'default'})"
+        logger.info(status_msg)
+        print(status_msg, flush=True)
     return TaskAppConfig(
         app_id="pokemon_red",
         name="Pokémon Red Task App",
@@ -585,7 +778,7 @@ def build_config() -> TaskAppConfig:
                 "Example: {\"tool\": \"execute_sequence\", \"args\": {\"actions\": [{\"button\": \"DOWN\", \"frames\": 30}, ...]}}"
             ),
         ),
-        app_state={},
+        app_state=app_state,
         require_api_key=False,
         expose_debug_env=True,
         cors_origins=["*"],

examples/task_apps/pokemon_red/test_pallet_town_rewards.py CHANGED Viewed

@@ -189,3 +189,5 @@ async def main():
 if __name__ == "__main__":
     asyncio.run(main())

examples/task_apps/sokoban/filter_sft.toml ADDED Viewed

@@ -0,0 +1,5 @@
+[filter]
+db = "traces/v3/synth_ai.db"
+output = "ft_data/sokoban_sft.jsonl"
+min_official_score = 0.01

examples/task_apps/sokoban/tests/__init__.py CHANGED Viewed

@@ -1,2 +1,4 @@
 # Sokoban task app tests
 
+
+

examples/task_apps/sokoban/tests/integration/__init__.py CHANGED Viewed

@@ -1,2 +1,4 @@
 # Integration tests for Sokoban task app
 
+
+

examples/task_apps/sokoban/tests/unit/__init__.py CHANGED Viewed

@@ -1,2 +1,4 @@
 # Unit tests for Sokoban task app
 
+
+

examples/task_apps/verilog/eval_groq_qwen32b.toml CHANGED Viewed

@@ -1,12 +1,14 @@
 # Verilog Eval Config for Groq Qwen3-32B
+# Quick eval to test Verilog task app before RL training
 [task_app]
-url = "http://localhost:8103"  # Verilog task app port
+# Update this with your Modal URL after deployment
+url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
 [eval]
-num_episodes = 5
-seeds = [0, 1, 2, 3, 4]
-max_steps = 10
+num_episodes = 3  # Quick test with 3 seeds
+seeds = [0, 1, 2]
+max_steps = 15    # More steps for Verilog compilation chains
 [policy]
 provider = "groq"
@@ -18,3 +20,5 @@ inference_url = "https://api.groq.com/openai/v1/chat/completions"
 [env]
 difficulty = "medium"  # Can be "easy", "medium", or "hard"

examples/task_apps/verilog/filter_sft.toml ADDED Viewed

@@ -0,0 +1,5 @@
+[filter]
+db = "traces/v3/synth_ai.db"
+output = "ft_data/verilog_sft.jsonl"
+min_official_score = 0.01

synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl

Potentially problematic release.

synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl