synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (110) hide show
  1. examples/multi_step/configs/README_verilog_rl.md +77 -0
  2. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  3. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  4. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  5. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  6. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +5 -4
  7. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  8. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  9. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  10. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  11. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  12. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  13. examples/multi_step/readme.md +48 -0
  14. examples/multi_step/verilog_rl_lora.md +218 -0
  15. examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
  16. examples/sft/evaluate.py +2 -0
  17. examples/sft/generate_traces.py +2 -0
  18. examples/swe/task_app/grpo_swe_mini.py +1 -0
  19. examples/swe/task_app/hosted/rollout.py +2 -0
  20. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  21. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  22. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  23. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  24. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  25. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  26. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  27. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  28. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  29. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  30. examples/task_apps/crafter/task_app/__init__.py +3 -0
  31. examples/task_apps/crafter/task_app/grpo_crafter.py +306 -8
  32. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  33. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +16 -3
  34. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  35. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
  36. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +52 -1
  37. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +111 -13
  38. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
  39. examples/task_apps/enron/filter_sft.toml +5 -0
  40. examples/task_apps/enron/tests/__init__.py +2 -0
  41. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  42. examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
  43. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  44. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  45. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  46. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  47. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  48. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
  49. examples/task_apps/pokemon_red/task_app.py +199 -6
  50. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
  51. examples/task_apps/sokoban/filter_sft.toml +5 -0
  52. examples/task_apps/sokoban/tests/__init__.py +2 -0
  53. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  54. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  55. examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
  56. examples/task_apps/verilog/filter_sft.toml +5 -0
  57. examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
  58. examples/task_apps/verilog/tests/__init__.py +2 -0
  59. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  60. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
  61. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  62. examples/warming_up_to_rl/groq_test.py +2 -0
  63. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  64. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  65. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  66. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  67. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  68. synth_ai/api/models/supported.py +1 -0
  69. synth_ai/cli/__init__.py +46 -13
  70. synth_ai/cli/_modal_wrapper.py +3 -2
  71. synth_ai/cli/recent.py +1 -1
  72. synth_ai/cli/status.py +1 -1
  73. synth_ai/cli/task_apps.py +354 -143
  74. synth_ai/cli/traces.py +1 -1
  75. synth_ai/cli/tui.py +57 -0
  76. synth_ai/cli/turso.py +1 -1
  77. synth_ai/cli/watch.py +1 -1
  78. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  79. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  80. synth_ai/environments/examples/verilog/engine.py +76 -10
  81. synth_ai/judge_schemas.py +8 -8
  82. synth_ai/task/__init__.py +11 -1
  83. synth_ai/task/apps/__init__.py +1 -0
  84. synth_ai/task/config.py +257 -0
  85. synth_ai/task/contracts.py +15 -2
  86. synth_ai/task/rubrics/__init__.py +3 -0
  87. synth_ai/task/rubrics/loaders.py +22 -3
  88. synth_ai/task/rubrics/scoring.py +3 -0
  89. synth_ai/task/trace_correlation_helpers.py +315 -0
  90. synth_ai/task/validators.py +144 -0
  91. synth_ai/tracing_v3/abstractions.py +3 -3
  92. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  93. synth_ai/tracing_v3/session_tracer.py +16 -6
  94. synth_ai/tracing_v3/storage/base.py +29 -29
  95. synth_ai/tracing_v3/storage/config.py +3 -3
  96. synth_ai/tracing_v3/turso/daemon.py +8 -7
  97. synth_ai/tracing_v3/turso/native_manager.py +63 -40
  98. synth_ai/tracing_v3/utils.py +3 -3
  99. synth_ai/tui/__init__.py +5 -0
  100. synth_ai/tui/__main__.py +13 -0
  101. synth_ai/tui/cli/__init__.py +1 -0
  102. synth_ai/tui/cli/query_experiments.py +164 -0
  103. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  104. synth_ai/tui/dashboard.py +906 -0
  105. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/METADATA +1 -1
  106. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/RECORD +110 -71
  107. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
  108. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
  109. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
  110. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,48 @@
1
+
2
+
3
+ Crafter
4
+
5
+ cd /Users/joshpurtell/Documents/GitHub/synth-ai && uvx synth-ai modal-serve grpo-crafter-task-app --name grpo-crafter-task-app --env-file /Users/joshpurtell/Documents/GitHub/monorepo/environments/crafter/.env
6
+
7
+ cd /Users/joshpurtell/Documents/GitHub/monorepo && uv run modal deploy backend/app/routes/clustered_training/core/algorithms/gspo/app.py --env dev
8
+
9
+ uvx synth-ai eval --config /Users/joshpurtell/Documents/GitHub/synth-ai/examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml
10
+
11
+
12
+ uvx synth-ai train \
13
+ --type rl \
14
+ --config /Users/joshpurtell/Documents/GitHub/synth-ai/examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml \
15
+ --task-url https://synth-laboratories--grpo-crafter-task-app-fastapi-app-dev.modal.run \
16
+ --backend https://synth-backend-dev-docker.onrender.com/api \
17
+ --env-file /Users/joshpurtell/Documents/GitHub/monorepo/environments/crafter/.env
18
+
19
+
20
+
21
+ ---
22
+
23
+ Verilog
24
+
25
+ # 1. Deploy Verilog task app
26
+ cd /Users/joshpurtell/Documents/GitHub/synth-ai && uvx synth-ai modal-serve grpo-verilog --name grpo-verilog-task-app --env-file /Users/joshpurtell/Documents/GitHub/monorepo/environments/verilog/.env
27
+
28
+ # 2. Baseline eval using Synth backend (pre-training)
29
+ uvx synth-ai eval --config /Users/joshpurtell/Documents/GitHub/synth-ai/examples/multi_step/configs/verilog_eval_synth_qwen8b.toml
30
+
31
+ # 3. (Optional) External reference eval using Groq Qwen 32B
32
+ uvx synth-ai eval --config /Users/joshpurtell/Documents/GitHub/synth-ai/examples/multi_step/configs/verilog_eval_groq_qwen32b.toml
33
+
34
+ # 4. Deploy training backend
35
+ cd /Users/joshpurtell/Documents/GitHub/monorepo && uv run modal deploy backend/app/routes/clustered_training/core/algorithms/gspo/app.py --env dev
36
+
37
+ # 5. Run RL training
38
+ uvx synth-ai train \
39
+ --type rl \
40
+ --config /Users/joshpurtell/Documents/GitHub/synth-ai/examples/multi_step/configs/verilog_rl_lora.toml \
41
+ --task-url https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run \
42
+ --backend https://synth-backend-dev-docker.onrender.com/api \
43
+ --env-file /Users/joshpurtell/Documents/GitHub/monorepo/environments/verilog/.env
44
+
45
+ # 6. Post-training eval (update job_id in config first!)
46
+ # After training, note the job_id from logs (e.g., job_19a1823e56303de604f)
47
+ # Update verilog_eval_synth_trained_qwen8b.toml with your job_id
48
+ uvx synth-ai eval --config /Users/joshpurtell/Documents/GitHub/synth-ai/examples/multi_step/configs/verilog_eval_synth_trained_qwen8b.toml
@@ -0,0 +1,218 @@
1
+ # Verilog RL with LoRA Analysis
2
+
3
+ ## Executive Summary
4
+
5
+ **✅ YES, Verilog can absolutely do RL with LoRA just like Crafter!** The architecture is nearly identical, but there are important considerations around model size and task complexity.
6
+
7
+ ## Architecture Compatibility ✅
8
+
9
+ ### **Same Foundation** (No changes needed)
10
+ - ✅ **Contracts**: Uses identical `RolloutRequest`/`RolloutResponse` as Crafter
11
+ - ✅ **Task App Framework**: Same `synth_ai.task.apps` framework
12
+ - ✅ **Environment Pattern**: Same `StatefulEnvironment` + tool-based architecture
13
+ - ✅ **Rubrics System**: Same evaluation and reward system
14
+ - ✅ **Trace Correlation**: Already implemented in `rollout_executor` (line 817 in `grpo_verilog.py`)
15
+ - ✅ **Modal Deployment**: Same deployment pattern as Crafter
16
+
17
+ ### **Key Differences** (Considerations for LoRA)
18
+
19
+ #### 1. **Model Size: 8x Larger** ⚠️
20
+ ```toml
21
+ # Verilog (current)
22
+ model = "qwen/qwen3-32b" # 32B parameters
23
+
24
+ # Crafter (working)
25
+ model = "Qwen/Qwen3-4B" # 4B parameters
26
+ ```
27
+ **Impact**: Memory requirements 8x higher for LoRA training
28
+ **Solution**: Use gradient checkpointing, smaller batch sizes, or distributed training
29
+
30
+ #### 2. **Tool Set: Simpler but More Structured**
31
+ ```python
32
+ # Verilog Tools (4 tools)
33
+ TOOLS = ["write_file", "compile", "simulate", "submit"]
34
+
35
+ # Crafter Tools (20+ tools)
36
+ # craft, move, attack, gather, etc.
37
+ ```
38
+
39
+ **Verilog Advantages**:
40
+ - ✅ **Deterministic**: Write → Compile → Simulate → Submit workflow
41
+ - ✅ **Clear Success Criteria**: Tests pass = high reward
42
+ - ✅ **Sparse but Meaningful Rewards**: +10 for submit success, +1 for simulation pass
43
+
44
+ **Verilog Challenges**:
45
+ - ❌ **Sparser Rewards**: Fewer intermediate signals for learning
46
+ - ❌ **Longer Sequences**: Multi-step compilation chains
47
+ - ❌ **Error Recovery**: Must debug compilation failures
48
+
49
+ #### 3. **State Representation**
50
+ ```python
51
+ # Verilog State (file-based)
52
+ {
53
+ "files": {"TopModule.v": "module TopModule(..."},
54
+ "compile_status": "Last compile: Success",
55
+ "simulate_status": "Last simulation: Passed",
56
+ "task_completed": false
57
+ }
58
+
59
+ # Crafter State (world-based)
60
+ {
61
+ "inventory": {"wood": 5, "stone": 3},
62
+ "position": [x, y],
63
+ "nearby_entities": [...],
64
+ "achievement_unlocked": true
65
+ }
66
+ ```
67
+
68
+ ## Configuration for LoRA RL
69
+
70
+ ### **Option 1: Qwen3-0.6B (Recommended for testing)** ⭐
71
+ ```toml
72
+ [algorithm]
73
+ type = "online"
74
+ method = "policy_gradient"
75
+ variety = "gspo"
76
+
77
+ [model]
78
+ base = "Qwen/Qwen3-0.6B" # ✅ Same as existing SFT configs
79
+ trainer_mode = "lora"
80
+
81
+ [lora]
82
+ r = 16
83
+ alpha = 32
84
+ dropout = 0.05
85
+ target_modules = ["all-linear"]
86
+
87
+ [rollout]
88
+ env_name = "verilog"
89
+ max_turns = 15
90
+ policy_name = "verilog-designer"
91
+
92
+ [training]
93
+ batch_size = 4 # ✅ Same as Crafter
94
+ gradient_accumulation_steps = 1
95
+ ```
96
+
97
+ ### **Option 2: Qwen3-32B (Production)** ⚠️
98
+ ```toml
99
+ [algorithm]
100
+ type = "online"
101
+ method = "policy_gradient"
102
+ variety = "gspo"
103
+
104
+ [model]
105
+ base = "qwen/qwen3-32b" # ⚠️ 8x memory vs Crafter's 4B
106
+ trainer_mode = "lora"
107
+
108
+ [lora]
109
+ r = 16
110
+ alpha = 32
111
+ dropout = 0.05
112
+ target_modules = ["all-linear"]
113
+
114
+ [rollout]
115
+ env_name = "verilog"
116
+ max_turns = 15
117
+ policy_name = "verilog-designer"
118
+ ```
119
+
120
+ ### **Memory Optimization** (for 32B model)
121
+ ```toml
122
+ [vllm]
123
+ max_model_len = 4096 # Shorter than Crafter's 8192
124
+ tensor_parallel_size = 2 # Distribute across GPUs
125
+
126
+ [training]
127
+ batch_size = 2 # Smaller than Crafter's 4
128
+ gradient_accumulation_steps = 4
129
+ ```
130
+
131
+ ## Task App Changes Needed
132
+
133
+ ### **1. Mode Parameter Support** ✅ (Already implemented)
134
+ The Verilog task app already handles `mode="rl"` correctly:
135
+ ```python
136
+ # In grpo_verilog.py rollout_executor
137
+ policy_config = dict(policy_config_raw)
138
+ # ... mode parameter flows through naturally
139
+ ```
140
+
141
+ ### **2. Trace Correlation** ✅ (Already implemented)
142
+ ```python
143
+ # Line 817 in grpo_verilog.py
144
+ trajectory = RolloutTrajectory(
145
+ # ...
146
+ inference_url=agent.inference_url, # ✅ Required for trace correlation
147
+ decision_samples=None,
148
+ )
149
+ ```
150
+
151
+ ### **3. Rubric Integration** ✅ (Already configured)
152
+ ```python
153
+ # In grpo_verilog.py
154
+ rubrics=RubricBundle(
155
+ outcome=OUTCOME_RUBRIC, # Tests pass reward
156
+ events=EVENTS_RUBRIC, # Process efficiency reward
157
+ )
158
+ ```
159
+
160
+ ## RL Training Feasibility
161
+
162
+ ### **✅ Works Great**
163
+ 1. **Clear Success Signal**: Submit passing tests = +10 reward
164
+ 2. **Guided Process**: Natural write→compile→simulate→submit progression
165
+ 3. **Error Learning**: Agent must learn to debug compilation failures
166
+ 4. **Hardware Design**: Real-world applicable skills
167
+
168
+ ### **⚠️ Challenges**
169
+ 1. **Model Size**: 32B vs 4B = 8x memory, slower training
170
+ 2. **Sparse Rewards**: Fewer learning signals than Crafter's dense rewards
171
+ 3. **Longer Episodes**: Up to 15 steps (`max_turns = 15`) vs Crafter's 10 steps
172
+ 4. **Compilation Errors**: Must learn to interpret and fix syntax errors
173
+
174
+ ## Recommended Approach
175
+
176
+ ### **Phase 1: Start with Qwen3-0.6B** ⭐
177
+ ```toml
178
+ # Perfect for testing - same model used in existing SFT configs
179
+ model = "Qwen/Qwen3-0.6B"
180
+ batch_size = 4 # Same as Crafter
181
+ ```
182
+ - ✅ **Zero setup**: Already configured in `synth-ai/examples/sft/configs/crafter_lora_qwen0p6b.toml`
183
+ - ✅ **Fast iteration**: 0.6B parameters = quick training cycles
184
+ - ✅ **Memory efficient**: Fits on single GPU easily
185
+ - ✅ **Proven baseline**: Same model used in RL demos and SFT examples
186
+
187
+ ### **Phase 2: Scale to Qwen3-8B** (if 0.6B works well)
188
+ ```toml
189
+ model = "qwen/qwen3-8b"
190
+ batch_size = 2
191
+ gradient_accumulation_steps = 2
192
+ ```
193
+
194
+ ### **Phase 3: Production with Qwen3-32B**
195
+ ```toml
196
+ model = "qwen/qwen3-32b"
197
+ tensor_parallel_size = 2
198
+ batch_size = 1
199
+ gradient_accumulation_steps = 4
200
+ ```
201
+
202
+ ### **Phase 4: Optimize for Verilog Domain**
203
+ Consider fine-tuning the base model on:
204
+ - Verilog syntax and semantics
205
+ - Hardware design patterns
206
+ - Compilation error messages
207
+ - Testbench writing
208
+
209
+ ## Conclusion
210
+
211
+ **✅ Verilog RL with LoRA is absolutely feasible** and should work with the same pipeline as Crafter. The main differences are:
212
+
213
+ 1. **Larger model** (32B vs 4B) requires memory optimization
214
+ 2. **Sparser rewards** may need different reward shaping
215
+ 3. **More structured tasks** could actually make learning easier
216
+ 4. **Real hardware skills** make it more valuable than game tasks
217
+
218
+ **Recommended next step**: Create a `verilog_rl_lora.toml` config starting with Qwen3-0.6B (Phase 1 of the recommended approach), scale to Qwen3-8B once that works, and adapt the reward rubrics for the compilation workflow.
@@ -3,7 +3,7 @@
3
3
  [algorithm]
4
4
  type = "offline"
5
5
  method = "sft"
6
- variety = "fft"
6
+ variety = "lora"
7
7
 
8
8
  [job]
9
9
  model = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
examples/sft/evaluate.py CHANGED
@@ -44,6 +44,7 @@ def _ops(n: int) -> list[str]:
44
44
 
45
45
 
46
46
  def _request(seed: int, a: EvalArgs) -> RolloutRequest:
47
+ from synth_ai.task.contracts import RolloutMode
47
48
  return RolloutRequest(
48
49
  run_id=f"eval-{seed}",
49
50
  env=RolloutEnvSpec(env_name="crafter", seed=seed, config={}),
@@ -53,6 +54,7 @@ def _request(seed: int, a: EvalArgs) -> RolloutRequest:
53
54
  ),
54
55
  ops=_ops(a.max_llm_calls),
55
56
  record=RolloutRecordConfig(trajectories=True, return_trace=False, trace_format="compact"),
57
+ mode=RolloutMode.EVAL,
56
58
  )
57
59
 
58
60
 
@@ -42,6 +42,7 @@ def _build_ops(max_llm_calls: int) -> list[str]:
42
42
 
43
43
 
44
44
  def _build_request(seed: int, run_id: str, model: str, inference_url: str, api_key: str, *, max_llm_calls: int, return_trace: bool) -> RolloutRequest:
45
+ from synth_ai.task.contracts import RolloutMode
45
46
  policy_cfg: dict[str, Any] = {
46
47
  "model": model,
47
48
  "inference_url": inference_url,
@@ -54,6 +55,7 @@ def _build_request(seed: int, run_id: str, model: str, inference_url: str, api_k
54
55
  policy=RolloutPolicySpec(policy_name="crafter-react", config=policy_cfg),
55
56
  ops=_build_ops(max_llm_calls),
56
57
  record=record,
58
+ mode=RolloutMode.EVAL,
57
59
  )
58
60
 
59
61
 
@@ -484,6 +484,7 @@ def build_config() -> TaskAppConfig:
484
484
 
485
485
  legacy_request = LegacyRolloutRequest(
486
486
  run_id=request.run_id,
487
+ mode=request.mode, # Preserve mode for nested requests
487
488
  env=LegacyRolloutEnvSpec(
488
489
  env_id=request.env.env_id,
489
490
  env_name=env_spec.env_name or "swe-mini",
@@ -12,6 +12,7 @@ from fastapi import APIRouter, HTTPException, Request, status
12
12
  from pydantic import BaseModel
13
13
  from synth_ai.lm.vendors.base import BaseLMResponse
14
14
  from synth_ai.task.tracing_utils import unique_sft_path
15
+ from synth_ai.task.contracts import RolloutMode
15
16
  from synth_ai.tracing_v3.abstractions import EnvironmentEvent, LMCAISEvent, TimeRecord
16
17
  from synth_ai.tracing_v3.llm_call_record_helpers import create_llm_call_record_from_response
17
18
  from synth_ai.tracing_v3.session_tracer import SessionTracer
@@ -120,6 +121,7 @@ class RolloutRequest(BaseModel):
120
121
  # Optional run/session context
121
122
  training_session_id: str | None = None
122
123
  synth_base_url: str | None = None
124
+ mode: RolloutMode # Required: explicit RL vs EVAL mode
123
125
 
124
126
 
125
127
  class RolloutStep(BaseModel):
@@ -0,0 +1,258 @@
1
+ # Image-Only Evaluation - Quick Reference
2
+
3
+ This document provides a quick reference for running image-only evaluations on **Crafter** and **Pokemon Red** with Turso tracing.
4
+
5
+ ## 📚 Full Documentation
6
+
7
+ - **Crafter**: [`crafter/README_IMAGE_ONLY_EVAL.md`](crafter/README_IMAGE_ONLY_EVAL.md)
8
+ - **Pokemon Red**: [`pokemon_red/README_IMAGE_ONLY_EVAL.md`](pokemon_red/README_IMAGE_ONLY_EVAL.md)
9
+
10
+ ## ⚡ Quick Start
11
+
12
+ ### Prerequisites
13
+
14
+ ```bash
15
+ # 1. Set OpenAI API key in .env
16
+ echo "OPENAI_API_KEY=sk-proj-..." >> .env
17
+
18
+ # 2. Navigate to synth-ai repo
19
+ cd /path/to/synth-ai
20
+ ```
21
+
22
+ ### Run Crafter (Easier - 70% Success Rate)
23
+
24
+ ```bash
25
+ # Set up tracing
26
+ export TASKAPP_TRACING_ENABLED=1
27
+ export TURSO_NATIVE=1
28
+ export SQLD_DB_PATH="traces/v3/crafter_eval.db"
29
+
30
+ # Run evaluation
31
+ uv run synth-ai eval grpo-crafter \
32
+ --config examples/task_apps/crafter/eval_image_only_gpt4o.toml
33
+
34
+ # Check results
35
+ sqlite3 -header -column traces/v3/crafter_eval.db \
36
+ "SELECT total_reward, achievements_count,
37
+ json_extract(reward_metadata, '$.final_achievements') as achievements
38
+ FROM outcome_rewards WHERE total_reward > 0;"
39
+ ```
40
+
41
+ ### Run Pokemon Red (Harder - 0% with Default Config)
42
+
43
+ ```bash
44
+ # Set up tracing
45
+ export TASKAPP_TRACING_ENABLED=1
46
+ export TURSO_NATIVE=1
47
+ export SQLD_DB_PATH="traces/v3/pokemon_red_eval.db"
48
+
49
+ # Run evaluation
50
+ uv run synth-ai eval pokemon_red \
51
+ --config examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml
52
+
53
+ # Check results
54
+ sqlite3 -header -column traces/v3/pokemon_red_eval.db \
55
+ "SELECT total_reward, achievements_count,
56
+ json_extract(reward_metadata, '$.final_map') as map,
57
+ json_extract(reward_metadata, '$.party_count') as party
58
+ FROM outcome_rewards;"
59
+ ```
60
+
61
+ ## 📊 Comparison
62
+
63
+ | Feature | Crafter | Pokemon Red |
64
+ |---------|---------|-------------|
65
+ | **Difficulty** | Easier | Harder |
66
+ | **Default success** | ~70% earn rewards | ~0% (needs tuning) |
67
+ | **Typical reward** | 1-3 achievements | 0 (10 steps too short) |
68
+ | **Best for** | Testing vision models | RL research |
69
+ | **Recommended steps** | 10 (default works) | 100-500 (need more) |
70
+
71
+ ## 🔧 Configuration Files
72
+
73
+ ### Crafter Config
74
+ **Location**: `examples/task_apps/crafter/eval_image_only_gpt4o.toml`
75
+
76
+ ```toml
77
+ [eval]
78
+ app_id = "grpo-crafter"
79
+ model = "gpt-4o-mini-2024-07-18"
80
+ seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
81
+ max_turns = 10
82
+ env_name = "crafter"
83
+ policy_name = "crafter-react"
84
+
85
+ [eval.policy_config]
86
+ use_vision = true
87
+ image_only_mode = true # Only images, no text
88
+ ```
89
+
90
+ ### Pokemon Red Config
91
+ **Location**: `examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml`
92
+
93
+ ```toml
94
+ [eval]
95
+ app_id = "pokemon_red"
96
+ model = "gpt-4o-mini-2024-07-18"
97
+ seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
98
+ max_turns = 10
99
+ env_name = "pokemon_red"
100
+
101
+ [eval.policy_config]
102
+ use_vision = true
103
+ image_only_mode = true # Only images, no text
104
+ ```
105
+
106
+ ## 📈 Improving Pokemon Red Results
107
+
108
+ Pokemon Red is harder and needs more steps. To get non-zero rewards:
109
+
110
+ ```toml
111
+ [eval]
112
+ model = "gpt-4o-2024-08-06" # Use full GPT-4o
113
+ max_turns = 100
114
+
115
+ [eval.env_config]
116
+ env_params = {max_steps_per_episode = 500}
117
+
118
+ [eval.policy_config]
119
+ model = "gpt-4o-2024-08-06"
120
+ image_only_mode = false # Enable text too (multimodal)
121
+ max_llm_calls = 100
122
+ ```
123
+
124
+ ## 🗄️ Database Queries
125
+
126
+ ### Get All Rewards
127
+
128
+ ```sql
129
+ -- Crafter
130
+ SELECT
131
+ json_extract(reward_metadata, '$.env_seed') as seed,
132
+ total_reward,
133
+ achievements_count,
134
+ json_extract(reward_metadata, '$.final_achievements') as achievements
135
+ FROM outcome_rewards
136
+ ORDER BY total_reward DESC;
137
+
138
+ -- Pokemon Red
139
+ SELECT
140
+ session_id,
141
+ total_reward,
142
+ achievements_count,
143
+ json_extract(reward_metadata, '$.final_map') as map,
144
+ json_extract(reward_metadata, '$.party_count') as party
145
+ FROM outcome_rewards
146
+ ORDER BY total_reward DESC;
147
+ ```
148
+
149
+ ### Filter Non-Zero Rewards
150
+
151
+ ```sql
152
+ SELECT * FROM outcome_rewards WHERE total_reward > 0;
153
+ ```
154
+
155
+ ### Get Statistics
156
+
157
+ ```sql
158
+ SELECT
159
+ COUNT(*) as total,
160
+ SUM(CASE WHEN total_reward > 0 THEN 1 ELSE 0 END) as with_rewards,
161
+ AVG(total_reward) as avg_reward,
162
+ MAX(total_reward) as max_reward
163
+ FROM outcome_rewards;
164
+ ```
165
+
166
+ ## 🎯 What is Image-Only Mode?
167
+
168
+ **Image-Only Mode** means:
169
+ - ✅ Agent receives **only** base64-encoded PNG images
170
+ - ❌ Agent receives **no** text observations (HP, position, inventory, etc.)
171
+ - 🎓 Tests pure vision understanding
172
+
173
+ **Multimodal Mode** (recommended for Pokemon Red):
174
+ - ✅ Agent receives **both** images and text
175
+ - 🏆 Better performance but "easier"
176
+
177
+ Toggle with:
178
+ ```toml
179
+ [eval.policy_config]
180
+ use_vision = true # Enable vision
181
+ image_only_mode = false # false = send text too
182
+ ```
183
+
184
+ ## 📁 Files Created
185
+
186
+ ### Crafter
187
+ - `crafter/eval_image_only_gpt4o.toml` - Config
188
+ - `crafter/README_IMAGE_ONLY_EVAL.md` - Full guide
189
+ - `crafter/EVAL_IMAGE_ONLY_RESULTS.md` - Example results
190
+ - `crafter/QUERY_EXAMPLES.md` - SQL queries
191
+
192
+ ### Pokemon Red
193
+ - `pokemon_red/eval_image_only_gpt4o.toml` - Config
194
+ - `pokemon_red/README_IMAGE_ONLY_EVAL.md` - Full guide
195
+ - `pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md` - Implementation
196
+ - `pokemon_red/EVAL_IMAGE_ONLY_STATUS.md` - Status
197
+
198
+ ## 🐛 Common Issues
199
+
200
+ ### Database Not Created
201
+ ```bash
202
+ # Ensure variables are set
203
+ export TASKAPP_TRACING_ENABLED=1
204
+ export TURSO_NATIVE=1
205
+ export SQLD_DB_PATH="traces/v3/your_eval.db"
206
+ ```
207
+
208
+ ### 401 Unauthorized
209
+ ```bash
210
+ # Check API key in .env
211
+ cat .env | grep OPENAI_API_KEY
212
+ ```
213
+
214
+ ### Pokemon Red: ROM Not Found
215
+ ```bash
216
+ # Place ROM at expected location
217
+ cp pokemon_red.gb synth_ai/environments/examples/red/roms/
218
+ ```
219
+
220
+ ### All Rewards Zero
221
+ - **Crafter**: Should get ~70% non-zero by default
222
+ - **Pokemon Red**: Expected with 10 steps - increase to 100-500
223
+
224
+ ## 🎓 Understanding Results
225
+
226
+ ### Crafter Achievements
227
+ - `collect_wood` - Cut down trees
228
+ - `collect_sapling` - Collect tree saplings
229
+ - `collect_drink` - Drink from water
230
+
231
+ ### Pokemon Red Milestones
232
+ - Leave bedroom (+20)
233
+ - Exit house (+30)
234
+ - Find Oak's lab (+40)
235
+ - Get starter Pokemon (+100)
236
+ - Win first battle (+150)
237
+
238
+ **Total possible**: ~600 points
239
+
240
+ ## 🚀 Next Steps
241
+
242
+ 1. **Read full docs**: See task-specific READMEs for details
243
+ 2. **Run evaluations**: Start with Crafter (easier)
244
+ 3. **Query database**: Use SQL to analyze results
245
+ 4. **Tune configs**: Adjust steps/model for better performance
246
+ 5. **Compare modes**: Try image-only vs multimodal
247
+
248
+ ## 📞 Support
249
+
250
+ For issues or questions:
251
+ 1. Check full README for your task app
252
+ 2. Review example results files
253
+ 3. Query database to verify data
254
+ 4. Adjust config parameters
255
+
256
+ Happy evaluating! 🎮
257
+
258
+