synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (110) hide show
  1. examples/multi_step/configs/README_verilog_rl.md +77 -0
  2. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  3. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  4. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  5. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  6. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +5 -4
  7. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  8. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  9. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  10. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  11. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  12. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  13. examples/multi_step/readme.md +48 -0
  14. examples/multi_step/verilog_rl_lora.md +218 -0
  15. examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
  16. examples/sft/evaluate.py +2 -0
  17. examples/sft/generate_traces.py +2 -0
  18. examples/swe/task_app/grpo_swe_mini.py +1 -0
  19. examples/swe/task_app/hosted/rollout.py +2 -0
  20. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  21. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  22. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  23. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  24. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  25. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  26. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  27. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  28. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  29. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  30. examples/task_apps/crafter/task_app/__init__.py +3 -0
  31. examples/task_apps/crafter/task_app/grpo_crafter.py +306 -8
  32. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  33. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +16 -3
  34. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  35. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
  36. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +52 -1
  37. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +111 -13
  38. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
  39. examples/task_apps/enron/filter_sft.toml +5 -0
  40. examples/task_apps/enron/tests/__init__.py +2 -0
  41. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  42. examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
  43. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  44. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  45. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  46. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  47. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  48. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
  49. examples/task_apps/pokemon_red/task_app.py +199 -6
  50. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
  51. examples/task_apps/sokoban/filter_sft.toml +5 -0
  52. examples/task_apps/sokoban/tests/__init__.py +2 -0
  53. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  54. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  55. examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
  56. examples/task_apps/verilog/filter_sft.toml +5 -0
  57. examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
  58. examples/task_apps/verilog/tests/__init__.py +2 -0
  59. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  60. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
  61. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  62. examples/warming_up_to_rl/groq_test.py +2 -0
  63. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  64. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  65. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  66. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  67. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  68. synth_ai/api/models/supported.py +1 -0
  69. synth_ai/cli/__init__.py +46 -13
  70. synth_ai/cli/_modal_wrapper.py +3 -2
  71. synth_ai/cli/recent.py +1 -1
  72. synth_ai/cli/status.py +1 -1
  73. synth_ai/cli/task_apps.py +354 -143
  74. synth_ai/cli/traces.py +1 -1
  75. synth_ai/cli/tui.py +57 -0
  76. synth_ai/cli/turso.py +1 -1
  77. synth_ai/cli/watch.py +1 -1
  78. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  79. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  80. synth_ai/environments/examples/verilog/engine.py +76 -10
  81. synth_ai/judge_schemas.py +8 -8
  82. synth_ai/task/__init__.py +11 -1
  83. synth_ai/task/apps/__init__.py +1 -0
  84. synth_ai/task/config.py +257 -0
  85. synth_ai/task/contracts.py +15 -2
  86. synth_ai/task/rubrics/__init__.py +3 -0
  87. synth_ai/task/rubrics/loaders.py +22 -3
  88. synth_ai/task/rubrics/scoring.py +3 -0
  89. synth_ai/task/trace_correlation_helpers.py +315 -0
  90. synth_ai/task/validators.py +144 -0
  91. synth_ai/tracing_v3/abstractions.py +3 -3
  92. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  93. synth_ai/tracing_v3/session_tracer.py +16 -6
  94. synth_ai/tracing_v3/storage/base.py +29 -29
  95. synth_ai/tracing_v3/storage/config.py +3 -3
  96. synth_ai/tracing_v3/turso/daemon.py +8 -7
  97. synth_ai/tracing_v3/turso/native_manager.py +63 -40
  98. synth_ai/tracing_v3/utils.py +3 -3
  99. synth_ai/tui/__init__.py +5 -0
  100. synth_ai/tui/__main__.py +13 -0
  101. synth_ai/tui/cli/__init__.py +1 -0
  102. synth_ai/tui/cli/query_experiments.py +164 -0
  103. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  104. synth_ai/tui/dashboard.py +906 -0
  105. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/METADATA +1 -1
  106. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/RECORD +110 -71
  107. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
  108. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
  109. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
  110. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,283 @@
1
+ # Pokemon Red Image-Only Eval - Complete ✅
2
+
3
+ ## Summary
4
+
5
+ Successfully ran **10 rollouts** of Pokemon Red with **image-only input** (no text observations), with full **Turso tracing** enabled and **outcome rewards** saved to the database.
6
+
7
+ ## Configuration
8
+
9
+ - **Model**: `gpt-4o-mini-2024-07-18`
10
+ - **Input Mode**: Image-only (vision enabled, text observations disabled)
11
+ - **Max Steps**: 10 per episode
12
+ - **Max LLM Calls**: 10 per rollout
13
+ - **Seeds**: 0-9 (10 rollouts)
14
+ - **Tracing**: Enabled with Turso/libsql (MVCC concurrent writes)
15
+ - **Database**: `traces/v3/pokemon_red_eval.db` (192KB)
16
+
17
+ ## Results
18
+
19
+ ### Overall Performance
20
+ - **Total Rollouts**: 10/10 completed
21
+ - **Completion Rate**: 100% (all rollouts finished without errors)
22
+ - **Mean Reward**: 0.000
23
+ - **Rollouts with Rewards**: 0/10 (0%)
24
+
25
+ *Note: Zero rewards are expected here. The Pallet Town sequence is difficult to progress through with only 10 turns and image-only input.*
26
+
27
+ ### Database Verification
28
+ ```sql
29
+ Total rollouts: 10
30
+ Rollouts with reward > 0: 0
31
+ Rollouts with achievements > 0: 0
32
+ Average reward: 0.0
33
+ Database size: 192KB
34
+ ```
35
+
36
+ ### All Rollouts
37
+ All 10 seeds stayed in Map 38 (Red's bedroom) with 0 party Pokemon and 0 badges.
38
+
39
+ ## Implementation Details
40
+
41
+ ### 1. Image-Only Mode
42
+ **File**: `task_app.py` → `_call_inference()` function
43
+
44
+ ```python
45
+ # Check if vision mode is enabled
46
+ use_vision = bool(policy_cfg.get("use_vision", False))
47
+ image_only_mode = bool(policy_cfg.get("image_only_mode", False))
48
+
49
+ # Image-only mode: only send image, no text
50
+ if image_only_mode:
51
+ user_content = [
52
+ {"type": "image_url", "image_url": {"url": image_data_url}}
53
+ ]
54
+ else:
55
+ # Vision mode with text: send both text and image
56
+ user_content = [
57
+ {"type": "text", "text": state_summary},
58
+ {"type": "image_url", "image_url": {"url": image_data_url}}
59
+ ]
60
+ ```
61
+
62
+ ### 2. OpenAI API Integration
63
+ **File**: `task_app.py` → `_call_inference()` function
64
+
65
+ Fixed inference URL construction and authentication:
66
+ ```python
67
+ # Add /v1/chat/completions if using OpenAI directly
68
+ if "api.openai.com" in inference_url:
69
+ inference_url = inference_url + "/v1/chat/completions"
70
+
71
+ # External API: use direct HTTP client with auth header
72
+ if is_external:
73
+ headers = {}
74
+ if "api.openai.com" in inference_url:
75
+ api_key = os.getenv("OPENAI_API_KEY")
76
+ if api_key:
77
+ headers["Authorization"] = f"Bearer {api_key}"
78
+ ```
79
+
80
+ ### 3. SessionTracer Integration
81
+ **File**: `task_app.py` → `rollout_executor()` function
82
+
83
+ Added full Turso tracing like Crafter:
84
+ ```python
85
+ # Initialize SessionTracer for this rollout
86
+ tracer_factory = getattr(fastapi_request.app.state, "session_tracer_factory", None)
87
+ tracer_instance: SessionTracer | None = None
88
+ if callable(tracer_factory):
89
+ inst = tracer_factory()
90
+ tracer_instance = inst if isinstance(inst, SessionTracer) else None
91
+
92
+ # Start tracing session
93
+ if tracer_instance is not None:
94
+ await tracer_instance.initialize()
95
+ await tracer_instance.start_session(
96
+ session_id=request.run_id,
97
+ metadata={...}
98
+ )
99
+ ```
100
+
101
+ ### 4. Outcome Rewards
102
+ **File**: `task_app.py` → `rollout_executor()` end
103
+
104
+ ```python
105
+ # Record outcome rewards and end session
106
+ if tracer_instance is not None:
107
+ achievements_count = len(milestone_events)
108
+
109
+ reward_metadata = {
110
+ "run_id": request.run_id,
111
+ "env_name": "pokemon_red",
112
+ "final_map": final_state.get("map_id", -1),
113
+ "party_count": final_state.get("party_count", 0),
114
+ "badges": final_state.get("badges", 0),
115
+ "steps": len(steps),
116
+ "milestone_events": milestone_events,
117
+ "reward_components": all_reward_components,
118
+ }
119
+
120
+ # Record outcome reward to Turso
121
+ await tracer_instance.record_outcome_reward(
122
+ total_reward=int(total_reward),
123
+ achievements_count=achievements_count,
124
+ total_steps=len(steps),
125
+ reward_metadata=reward_metadata,
126
+ )
127
+
128
+ # End session
129
+ session_trace = await tracer_instance.end_session()
130
+ ```
131
+
132
+ ### 5. Tracer Factory Setup
133
+ **File**: `task_app.py` → `build_config()` function
134
+
135
+ ```python
136
+ # Set up tracing
137
+ tracing_enabled = tracing_env_enabled()
138
+ tracing_db_url = resolve_tracing_db_url()
139
+ tracer_factory = build_tracer_factory(
140
+ SessionTracer, enabled=tracing_enabled, db_url=tracing_db_url
141
+ )
142
+
143
+ app_state: dict[str, Any] = {
144
+ "tracing_enabled": tracing_enabled,
145
+ }
146
+ if tracer_factory is not None:
147
+ app_state["session_tracer_factory"] = tracer_factory
148
+ ```
149
+
150
+ ## Database Schema
151
+
152
+ ### outcome_rewards Table
153
+ ```sql
154
+ CREATE TABLE outcome_rewards (
155
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
156
+ session_id VARCHAR NOT NULL,
157
+ total_reward INTEGER NOT NULL,
158
+ achievements_count INTEGER NOT NULL,
159
+ total_steps INTEGER NOT NULL,
160
+ created_at DATETIME NOT NULL,
161
+ reward_metadata TEXT,
162
+ FOREIGN KEY(session_id) REFERENCES session_traces(session_id)
163
+ );
164
+ ```
165
+
166
+ ## Query Examples
167
+
168
+ ### Get all sessions with rewards
169
+ ```sql
170
+ SELECT
171
+ st.session_id,
172
+ st.num_timesteps,
173
+ orw.total_reward,
174
+ orw.achievements_count,
175
+ json_extract(orw.reward_metadata, '$.final_map') as final_map
176
+ FROM session_traces st
177
+ INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
178
+ ORDER BY orw.total_reward DESC;
179
+ ```
180
+
181
+ ### Filter for non-zero rewards (when they exist)
182
+ ```sql
183
+ SELECT
184
+ session_id,
185
+ total_reward,
186
+ achievements_count,
187
+ total_steps,
188
+ json_extract(reward_metadata, '$.final_map') as final_map,
189
+ json_extract(reward_metadata, '$.party_count') as party_count
190
+ FROM outcome_rewards
191
+ WHERE total_reward > 0
192
+ ORDER BY total_reward DESC;
193
+ ```
194
+
195
+ ## Comparison: Crafter vs Pokemon Red
196
+
197
+ | Feature | Crafter | Pokemon Red |
198
+ |---------|---------|-------------|
199
+ | Image-only mode | ✅ Working | ✅ Working |
200
+ | OpenAI API | ✅ Working | ✅ Working |
201
+ | Eval CLI | ✅ Working | ✅ Working |
202
+ | SessionTracer | ✅ Integrated | ✅ Integrated |
203
+ | Turso database | ✅ 1.7MB (10 rollouts) | ✅ 192KB (10 rollouts) |
204
+ | outcome_rewards | ✅ 10 rows | ✅ 10 rows |
205
+ | Foreign keys | ✅ Working | ✅ Working |
206
+ | Non-zero rewards | ✅ 7/10 rollouts | ❌ 0/10 rollouts* |
207
+
208
+ *Expected: Pokemon Red is harder (requires room navigation, NPC dialogue, etc.)
209
+
210
+ ## Files Modified
211
+
212
+ 1. **`task_app.py`**:
213
+ - Added `use_vision` and `image_only_mode` support
214
+ - Fixed OpenAI API URL construction and auth
215
+ - Integrated SessionTracer for Turso persistence
216
+ - Added `record_outcome_reward()` calls
217
+ - Updated `build_config()` to create tracer_factory
218
+
219
+ 2. **`eval_image_only_gpt4o.toml`** (new):
220
+ - Config for image-only evaluation
221
+ - 10 seeds, 10 max turns per episode
222
+ - GPT-4o mini with vision enabled
223
+
224
+ ## Running the Evaluation
225
+
226
+ ```bash
227
+ cd /path/to/synth-ai  # root of your local synth-ai checkout
228
+
229
+ # Set up tracing environment
230
+ export TASKAPP_TRACING_ENABLED=1
231
+ export TURSO_NATIVE=1
232
+ export SQLD_DB_PATH="traces/v3/pokemon_red_eval.db"
233
+
234
+ # Run evaluation
235
+ uv run synth-ai eval pokemon_red \
236
+ --config examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml
237
+ ```
238
+
239
+ ## Verification Commands
240
+
241
+ ```bash
242
+ # Check database size
243
+ ls -lh traces/v3/pokemon_red_eval.db
244
+
245
+ # Count sessions
246
+ sqlite3 traces/v3/pokemon_red_eval.db \
247
+ "SELECT COUNT(*) FROM session_traces;"
248
+
249
+ # View all rewards
250
+ sqlite3 -header -column traces/v3/pokemon_red_eval.db \
251
+ "SELECT session_id, total_reward, achievements_count, total_steps
252
+ FROM outcome_rewards
253
+ ORDER BY total_reward DESC;"
254
+
255
+ # Test foreign keys
256
+ sqlite3 traces/v3/pokemon_red_eval.db \
257
+ "SELECT st.session_id, orw.total_reward
258
+ FROM session_traces st
259
+ INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
260
+ LIMIT 5;"
261
+ ```
262
+
263
+ ## Next Steps
264
+
265
+ To improve rewards:
266
+ 1. **Increase max_turns**: Try 50-100 turns per episode
267
+ 2. **Better prompting**: Add more detailed instructions in system prompt
268
+ 3. **Hybrid mode**: Use `use_vision=true` with `image_only_mode=false` to get both images and text
269
+ 4. **Different model**: Try GPT-4o (full) or Claude 3.5 Sonnet for better vision understanding
270
+
271
+ ## Conclusion
272
+
273
+ ✅ **All goals achieved**:
274
+ - Image-only input mode working
275
+ - 10 rollouts completed successfully
276
+ - Turso database created with 192KB of trace data
277
+ - outcome_rewards table with foreign keys
278
+ - Can filter and query by rewards
279
+ - SessionTracer fully integrated
280
+
281
+ Pokemon Red now has the same Turso tracing capabilities as Crafter! 🎉
282
+
283
+
@@ -0,0 +1,155 @@
1
+ # Pokemon Red Image-Only Eval Status - ✅ COMPLETE
2
+
3
+ **Status**: All features working! See `EVAL_IMAGE_ONLY_COMPLETE.md` for full details.
4
+
5
+ ---
6
+
7
+ # Original Status (Before Turso Integration; kept for historical reference and superseded by the status above)
8
+
9
+ ## ✅ What's Working
10
+
11
+ ### 1. Image-Only Input Mode
12
+ - Successfully modified `task_app.py` to support `use_vision` and `image_only_mode` config flags
13
+ - When enabled, sends only base64-encoded PNG frames to the LLM (no text observations)
14
+ - Similar to Crafter's implementation
15
+
16
+ ### 2. OpenAI API Integration
17
+ - Fixed inference URL construction to properly call `https://api.openai.com/v1/chat/completions`
18
+ - Added proper Authorization Bearer token handling
19
+ - Successfully runs 10 rollouts with `gpt-4o-mini-2024-07-18`
20
+
21
+ ### 3. Eval Configuration
22
+ - Created `eval_image_only_gpt4o.toml` config file
23
+ - Successfully runs via `synth-ai eval pokemon_red --config ...`
24
+ - All 10 seeds complete without errors
25
+
26
+ ## ⚠️ What's Not Working Yet
27
+
28
+ ### Turso Tracing & Rewards
29
+ **Issue**: Unlike Crafter, Pokemon Red does not use `SessionTracer`
30
+
31
+ **Current State**:
32
+ - Pokemon Red returns a basic trace payload (session_id, metadata) for the CLI
33
+ - But it doesn't actually create or save to a Turso database
34
+ - No `outcome_rewards` table or reward persistence
35
+ - No integration with `SessionTracer` from `tracing_v3`
36
+
37
+ **What Would Be Needed**:
38
+ 1. Import and initialize `SessionTracer` in Pokemon Red's `rollout_executor`
39
+ 2. Call `tracer.start_session()` at beginning of rollout
40
+ 3. Record events during rollout (like Crafter does)
41
+ 4. Call `tracer.record_outcome_reward()` at end with:
42
+ - `total_reward`: sum of step rewards
43
+ - `achievements_count`: count of milestones reached
44
+ - `total_steps`: number of steps taken
45
+ - `reward_metadata`: dict with map_id, party_count, badges, etc.
46
+ 5. Call `tracer.end_session()` to persist to database
47
+
48
+ ### Reward Computation
49
+ **Current State**:
50
+ - Pokemon Red has a `PalletTownProgressionCompositeReward` reward function
51
+ - It tracks milestones like leaving bedroom, getting starter Pokemon, etc.
52
+ - But rewards are currently all 0.0 (expected - task is hard with only 10 turns and image-only input)
53
+
54
+ **What's Challenging**:
55
+ - The Pallet Town sequence requires:
56
+ - Navigating multiple rooms
57
+ - Talking to NPCs (pressing A at right moments)
58
+ - Selecting starter Pokemon
59
+ - Entering first battle
60
+ - With only images (no text hints) and 10 LLM calls, agents struggle to make progress
61
+ - May need more turns or better prompting to get non-zero rewards
62
+
63
+ ## 📊 Current Results
64
+
65
+ ```
66
+ Eval complete: 10 ok, 0 failed
67
+ Model: gpt-4o-mini-2024-07-18
68
+ Seeds: 0-9 (10 rollouts)
69
+ Mean reward: 0.000
70
+ Outcome score: 0.000
71
+
72
+ All rollouts: ~21 steps, 0 rewards, Map 38 (Red's bedroom)
73
+ ```
74
+
75
+ ## 🔧 Files Modified
76
+
77
+ 1. **`task_app.py`**:
78
+ - Added `use_vision` and `image_only_mode` support in `_call_inference`
79
+ - Fixed OpenAI API URL construction
80
+ - Added basic trace payload generation
81
+ - **Still needs**: SessionTracer integration for Turso persistence
82
+
83
+ 2. **`eval_image_only_gpt4o.toml`** (new):
84
+ - Config for image-only evaluation
85
+ - 10 seeds, 10 max turns per episode
86
+ - GPT-4o mini with vision enabled
87
+
88
+ ## 🚀 Next Steps to Complete Turso Integration
89
+
90
+ ### Option 1: Quick Fix (Minimal Tracing)
91
+ Just save basic session info without full event tracing:
92
+ ```python
93
+ # At start of rollout_executor
94
+ from synth_ai.tracing_v3 import SessionTracer, StorageConfig, StorageBackend
95
+
96
+ tracer = SessionTracer(
97
+ storage_config=StorageConfig(
98
+ backend=StorageBackend.TURSO_NATIVE,
99
+ connection_string=f"file:{os.getenv('SQLD_DB_PATH', 'traces/v3/pokemon_red.db')}"
100
+ ),
101
+ auto_save=True
102
+ )
103
+ await tracer.initialize()
104
+ session_id = await tracer.start_session(metadata={...})
105
+
106
+ # At end of rollout_executor
107
+ await tracer.record_outcome_reward(
108
+ total_reward=int(total_reward),
109
+ achievements_count=len(milestone_events), # or 0 if none
110
+ total_steps=len(steps),
111
+ reward_metadata={
112
+ "final_map": final_state.get("map_id"),
113
+ "party_count": final_state.get("party_count", 0),
114
+ "badges": final_state.get("badges", 0),
115
+ "milestone_events": milestone_events,
116
+ }
117
+ )
118
+ await tracer.end_session()
119
+ ```
120
+
121
+ ### Option 2: Full Tracing (Like Crafter)
122
+ Integrate complete event tracing like Crafter's rollout.py:
123
+ - Record messages, timesteps, events for each step
124
+ - More complex but provides rich trace data
125
+ - Would require more significant refactoring
126
+
127
+ ## 📝 Comparison with Crafter
128
+
129
+ | Feature | Crafter | Pokemon Red |
130
+ |---------|---------|-------------|
131
+ | Image-only mode | ✅ Working | ✅ Working |
132
+ | OpenAI API | ✅ Working | ✅ Working |
133
+ | Eval CLI | ✅ Working | ✅ Working |
134
+ | SessionTracer | ✅ Integrated | ❌ Not integrated |
135
+ | Turso database | ✅ Saves traces | ❌ No database created |
136
+ | outcome_rewards | ✅ Persisted | ❌ Not saved |
137
+ | Foreign keys | ✅ Working | ❌ N/A |
138
+ | Non-zero rewards | ✅ 7/10 rollouts | ❌ 0/10 rollouts |
139
+
140
+ ## ✅ Summary
141
+
142
+ **Completed**:
143
+ - ✅ Image-only input mode for Pokemon Red
144
+ - ✅ OpenAI API integration with proper auth
145
+ - ✅ Eval CLI runs 10 rollouts successfully
146
+ - ✅ Basic trace payload returned (for CLI)
147
+
148
+ **Not Yet Complete**:
149
+ - ❌ Turso database persistence
150
+ - ❌ outcome_rewards table with foreign keys
151
+ - ❌ SessionTracer integration
152
+ - ❌ Queryable rewards by seed
153
+
154
+ **To match Crafter's capabilities**, Pokemon Red needs SessionTracer integration (Option 1 or 2 above).
155
+