synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/README.md +1 -0
- examples/multi_step/SFT_README.md +147 -0
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -11
- examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/convert_traces_to_sft.py +84 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/run_sft_qwen30b.sh +45 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +3 -2
- examples/qwen_coder/configs/coder_lora_4b.toml +2 -1
- examples/qwen_coder/configs/coder_lora_small.toml +2 -1
- examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
- examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
- examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
- examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
- examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
- examples/qwen_vl/QUICKSTART.md +327 -0
- examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
- examples/qwen_vl/README.md +154 -0
- examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
- examples/qwen_vl/RL_VISION_TESTING.md +333 -0
- examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
- examples/qwen_vl/SETUP_COMPLETE.md +275 -0
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +490 -0
- examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
- examples/qwen_vl/__init__.py +2 -0
- examples/qwen_vl/collect_data_via_cli.md +423 -0
- examples/qwen_vl/collect_vision_traces.py +368 -0
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +127 -0
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +60 -0
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +43 -0
- examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +45 -0
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +44 -0
- examples/qwen_vl/configs/filter_qwen2vl_sft.toml +50 -0
- examples/qwen_vl/configs/filter_vision_sft.toml +53 -0
- examples/qwen_vl/configs/filter_vision_test.toml +8 -0
- examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
- examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
- examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
- examples/qwen_vl/run_vision_comparison.sh +62 -0
- examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
- examples/qwen_vl/test_image_validation.py +201 -0
- examples/qwen_vl/test_sft_vision_data.py +110 -0
- examples/rl/README.md +1 -1
- examples/rl/configs/eval_base_qwen.toml +17 -0
- examples/rl/configs/eval_rl_qwen.toml +13 -0
- examples/rl/configs/rl_from_base_qwen.toml +37 -0
- examples/rl/configs/rl_from_base_qwen17.toml +76 -0
- examples/rl/configs/rl_from_ft_qwen.toml +37 -0
- examples/rl/run_eval.py +436 -0
- examples/rl/run_rl_and_save.py +111 -0
- examples/rl/task_app/README.md +22 -0
- examples/rl/task_app/math_single_step.py +990 -0
- examples/rl/task_app/math_task_app.py +111 -0
- examples/sft/README.md +5 -5
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -2
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -3
- examples/sft/evaluate.py +4 -4
- examples/sft/export_dataset.py +7 -4
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/README.md +1 -1
- examples/swe/task_app/grpo_swe_mini.py +1 -1
- examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +13 -13
- examples/swe/task_app/hosted/policy_routes.py +0 -2
- examples/swe/task_app/hosted/rollout.py +2 -8
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +3 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +309 -14
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +75 -4
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +55 -3
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +114 -32
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +127 -27
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
- examples/task_apps/pokemon_red/task_app.py +199 -6
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/vlm/README.md +3 -3
- examples/vlm/configs/crafter_vlm_gpt4o.toml +2 -0
- examples/vlm/crafter_openai_vlm_agent.py +3 -5
- examples/vlm/filter_image_rows.py +1 -1
- examples/vlm/run_crafter_vlm_benchmark.py +2 -2
- examples/warming_up_to_rl/_utils.py +92 -0
- examples/warming_up_to_rl/analyze_trace_db.py +1 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +2 -0
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
- examples/warming_up_to_rl/export_trace_sft.py +174 -60
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/readme.md +63 -132
- examples/warming_up_to_rl/run_fft_and_save.py +1 -1
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rl_and_save.py +1 -1
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- examples/warming_up_to_rl/task_app/README.md +42 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +696 -0
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +478 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1081 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
- synth_ai/__init__.py +44 -30
- synth_ai/_utils/__init__.py +47 -0
- synth_ai/_utils/base_url.py +10 -0
- synth_ai/_utils/http.py +10 -0
- synth_ai/_utils/prompts.py +10 -0
- synth_ai/_utils/task_app_state.py +12 -0
- synth_ai/_utils/user_config.py +10 -0
- synth_ai/api/models/supported.py +145 -7
- synth_ai/api/train/__init__.py +13 -1
- synth_ai/api/train/cli.py +30 -7
- synth_ai/api/train/config_finder.py +18 -11
- synth_ai/api/train/env_resolver.py +13 -10
- synth_ai/cli/__init__.py +66 -49
- synth_ai/cli/_modal_wrapper.py +9 -6
- synth_ai/cli/_typer_patch.py +0 -2
- synth_ai/cli/_validate_task_app.py +22 -4
- synth_ai/cli/legacy_root_backup.py +3 -1
- synth_ai/cli/lib/__init__.py +10 -0
- synth_ai/cli/lib/task_app_discovery.py +7 -0
- synth_ai/cli/lib/task_app_env.py +518 -0
- synth_ai/cli/recent.py +1 -0
- synth_ai/cli/setup.py +266 -0
- synth_ai/cli/task_app_deploy.py +16 -0
- synth_ai/cli/task_app_list.py +25 -0
- synth_ai/cli/task_app_modal_serve.py +16 -0
- synth_ai/cli/task_app_serve.py +18 -0
- synth_ai/cli/task_apps.py +392 -141
- synth_ai/cli/train.py +18 -0
- synth_ai/cli/tui.py +62 -0
- synth_ai/demos/__init__.py +10 -0
- synth_ai/demos/core/__init__.py +28 -1
- synth_ai/demos/crafter/__init__.py +1 -0
- synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
- synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
- synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
- synth_ai/demos/demo_registry.py +176 -0
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/math/__init__.py +1 -0
- synth_ai/demos/math/_common.py +16 -0
- synth_ai/demos/math/app.py +38 -0
- synth_ai/demos/math/config.toml +76 -0
- synth_ai/demos/math/deploy_modal.py +54 -0
- synth_ai/demos/math/modal_task_app.py +702 -0
- synth_ai/demos/math/task_app_entry.py +51 -0
- synth_ai/environments/environment/core.py +7 -1
- synth_ai/environments/examples/bandit/engine.py +0 -1
- synth_ai/environments/examples/bandit/environment.py +0 -1
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/verilog/engine.py +76 -10
- synth_ai/environments/examples/wordle/environment.py +0 -1
- synth_ai/evals/base.py +16 -5
- synth_ai/evals/client.py +1 -1
- synth_ai/inference/client.py +1 -1
- synth_ai/learning/client.py +1 -1
- synth_ai/learning/health.py +1 -1
- synth_ai/learning/jobs.py +1 -1
- synth_ai/learning/rl/client.py +1 -1
- synth_ai/learning/rl/env_keys.py +1 -1
- synth_ai/learning/rl/secrets.py +1 -1
- synth_ai/learning/sft/client.py +1 -1
- synth_ai/learning/sft/data.py +407 -4
- synth_ai/learning/validators.py +4 -1
- synth_ai/task/__init__.py +11 -1
- synth_ai/task/apps/__init__.py +5 -2
- synth_ai/task/config.py +259 -0
- synth_ai/task/contracts.py +15 -2
- synth_ai/task/rubrics/__init__.py +4 -2
- synth_ai/task/rubrics/loaders.py +27 -4
- synth_ai/task/rubrics/scoring.py +3 -0
- synth_ai/task/rubrics.py +219 -0
- synth_ai/task/trace_correlation_helpers.py +328 -0
- synth_ai/task/tracing_utils.py +14 -3
- synth_ai/task/validators.py +145 -2
- synth_ai/tracing_v3/config.py +15 -13
- synth_ai/tracing_v3/constants.py +21 -0
- synth_ai/tracing_v3/db_config.py +3 -1
- synth_ai/tracing_v3/decorators.py +10 -7
- synth_ai/tracing_v3/session_tracer.py +10 -0
- synth_ai/tracing_v3/turso/daemon.py +2 -2
- synth_ai/tracing_v3/turso/native_manager.py +108 -77
- synth_ai/tracing_v3/utils.py +1 -1
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +911 -0
- synth_ai/utils/__init__.py +101 -0
- synth_ai/utils/base_url.py +94 -0
- synth_ai/utils/cli.py +131 -0
- synth_ai/utils/env.py +287 -0
- synth_ai/utils/http.py +169 -0
- synth_ai/utils/modal.py +308 -0
- synth_ai/utils/process.py +212 -0
- synth_ai/utils/prompts.py +39 -0
- synth_ai/utils/sqld.py +122 -0
- synth_ai/utils/task_app_discovery.py +882 -0
- synth_ai/utils/task_app_env.py +186 -0
- synth_ai/utils/task_app_state.py +318 -0
- synth_ai/utils/user_config.py +137 -0
- synth_ai/v0/config/__init__.py +1 -5
- synth_ai/v0/config/base_url.py +1 -7
- synth_ai/v0/tracing/config.py +1 -1
- synth_ai/v0/tracing/decorators.py +1 -1
- synth_ai/v0/tracing/upload.py +1 -1
- synth_ai/v0/tracing_v1/config.py +1 -1
- synth_ai/v0/tracing_v1/decorators.py +1 -1
- synth_ai/v0/tracing_v1/upload.py +1 -1
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/METADATA +85 -31
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/RECORD +286 -135
- synth_ai/cli/man.py +0 -106
- synth_ai/compound/cais.py +0 -0
- synth_ai/core/experiment.py +0 -13
- synth_ai/core/system.py +0 -15
- synth_ai/demo_registry.py +0 -295
- synth_ai/handshake.py +0 -109
- synth_ai/http.py +0 -26
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
# Image-Only Evaluation - Quick Reference
|
|
2
|
+
|
|
3
|
+
This document provides a quick reference for running image-only evaluations on **Crafter** and **Pokemon Red** with Turso tracing.
|
|
4
|
+
|
|
5
|
+
## 📚 Full Documentation
|
|
6
|
+
|
|
7
|
+
- **Crafter**: [`crafter/README_IMAGE_ONLY_EVAL.md`](crafter/README_IMAGE_ONLY_EVAL.md)
|
|
8
|
+
- **Pokemon Red**: [`pokemon_red/README_IMAGE_ONLY_EVAL.md`](pokemon_red/README_IMAGE_ONLY_EVAL.md)
|
|
9
|
+
|
|
10
|
+
## ⚡ Quick Start
|
|
11
|
+
|
|
12
|
+
### Prerequisites
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
# 1. Set OpenAI API key in .env
|
|
16
|
+
echo "OPENAI_API_KEY=sk-proj-..." >> .env
|
|
17
|
+
|
|
18
|
+
# 2. Navigate to synth-ai repo
|
|
19
|
+
cd /path/to/synth-ai
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### Run Crafter (Easier - 70% Success Rate)
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# Set up tracing
|
|
26
|
+
export TASKAPP_TRACING_ENABLED=1
|
|
27
|
+
export TURSO_NATIVE=1
|
|
28
|
+
export SQLD_DB_PATH="traces/v3/crafter_eval.db"
|
|
29
|
+
|
|
30
|
+
# Run evaluation
|
|
31
|
+
uv run synth-ai eval grpo-crafter \
|
|
32
|
+
--config examples/task_apps/crafter/eval_image_only_gpt4o.toml
|
|
33
|
+
|
|
34
|
+
# Check results
|
|
35
|
+
sqlite3 -header -column traces/v3/crafter_eval.db \
|
|
36
|
+
"SELECT total_reward, achievements_count,
|
|
37
|
+
json_extract(reward_metadata, '$.final_achievements') as achievements
|
|
38
|
+
FROM outcome_rewards WHERE total_reward > 0;"
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Run Pokemon Red (Harder - 0% with Default Config)
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# Set up tracing
|
|
45
|
+
export TASKAPP_TRACING_ENABLED=1
|
|
46
|
+
export TURSO_NATIVE=1
|
|
47
|
+
export SQLD_DB_PATH="traces/v3/pokemon_red_eval.db"
|
|
48
|
+
|
|
49
|
+
# Run evaluation
|
|
50
|
+
uv run synth-ai eval pokemon_red \
|
|
51
|
+
--config examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml
|
|
52
|
+
|
|
53
|
+
# Check results
|
|
54
|
+
sqlite3 -header -column traces/v3/pokemon_red_eval.db \
|
|
55
|
+
"SELECT total_reward, achievements_count,
|
|
56
|
+
json_extract(reward_metadata, '$.final_map') as map,
|
|
57
|
+
json_extract(reward_metadata, '$.party_count') as party
|
|
58
|
+
FROM outcome_rewards;"
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## 📊 Comparison
|
|
62
|
+
|
|
63
|
+
| Feature | Crafter | Pokemon Red |
|
|
64
|
+
|---------|---------|-------------|
|
|
65
|
+
| **Difficulty** | Easier | Harder |
|
|
66
|
+
| **Default success** | ~70% earn rewards | ~0% (needs tuning) |
|
|
67
|
+
| **Typical reward** | 1-3 achievements | 0 (10 steps too short) |
|
|
68
|
+
| **Best for** | Testing vision models | RL research |
|
|
69
|
+
| **Recommended steps** | 10 (default works) | 100-500 (need more) |
|
|
70
|
+
|
|
71
|
+
## 🔧 Configuration Files
|
|
72
|
+
|
|
73
|
+
### Crafter Config
|
|
74
|
+
**Location**: `examples/task_apps/crafter/eval_image_only_gpt4o.toml`
|
|
75
|
+
|
|
76
|
+
```toml
|
|
77
|
+
[eval]
|
|
78
|
+
app_id = "grpo-crafter"
|
|
79
|
+
model = "gpt-4o-mini-2024-07-18"
|
|
80
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
|
|
81
|
+
max_turns = 10
|
|
82
|
+
env_name = "crafter"
|
|
83
|
+
policy_name = "crafter-react"
|
|
84
|
+
|
|
85
|
+
[eval.policy_config]
|
|
86
|
+
use_vision = true
|
|
87
|
+
image_only_mode = true # Only images, no text
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Pokemon Red Config
|
|
91
|
+
**Location**: `examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml`
|
|
92
|
+
|
|
93
|
+
```toml
|
|
94
|
+
[eval]
|
|
95
|
+
app_id = "pokemon_red"
|
|
96
|
+
model = "gpt-4o-mini-2024-07-18"
|
|
97
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
|
|
98
|
+
max_turns = 10
|
|
99
|
+
env_name = "pokemon_red"
|
|
100
|
+
|
|
101
|
+
[eval.policy_config]
|
|
102
|
+
use_vision = true
|
|
103
|
+
image_only_mode = true # Only images, no text
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## 📈 Improving Pokemon Red Results
|
|
107
|
+
|
|
108
|
+
Pokemon Red is harder and needs more steps. To get non-zero rewards:
|
|
109
|
+
|
|
110
|
+
```toml
|
|
111
|
+
[eval]
|
|
112
|
+
model = "gpt-4o-2024-08-06" # Use full GPT-4o
|
|
113
|
+
max_turns = 100
|
|
114
|
+
|
|
115
|
+
[eval.env_config]
|
|
116
|
+
env_params = {max_steps_per_episode = 500}
|
|
117
|
+
|
|
118
|
+
[eval.policy_config]
|
|
119
|
+
model = "gpt-4o-2024-08-06"
|
|
120
|
+
image_only_mode = false # Enable text too (multimodal)
|
|
121
|
+
max_llm_calls = 100
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## 🗄️ Database Queries
|
|
125
|
+
|
|
126
|
+
### Get All Rewards
|
|
127
|
+
|
|
128
|
+
```sql
|
|
129
|
+
-- Crafter
|
|
130
|
+
SELECT
|
|
131
|
+
json_extract(reward_metadata, '$.env_seed') as seed,
|
|
132
|
+
total_reward,
|
|
133
|
+
achievements_count,
|
|
134
|
+
json_extract(reward_metadata, '$.final_achievements') as achievements
|
|
135
|
+
FROM outcome_rewards
|
|
136
|
+
ORDER BY total_reward DESC;
|
|
137
|
+
|
|
138
|
+
-- Pokemon Red
|
|
139
|
+
SELECT
|
|
140
|
+
session_id,
|
|
141
|
+
total_reward,
|
|
142
|
+
achievements_count,
|
|
143
|
+
json_extract(reward_metadata, '$.final_map') as map,
|
|
144
|
+
json_extract(reward_metadata, '$.party_count') as party
|
|
145
|
+
FROM outcome_rewards
|
|
146
|
+
ORDER BY total_reward DESC;
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Filter Non-Zero Rewards
|
|
150
|
+
|
|
151
|
+
```sql
|
|
152
|
+
SELECT * FROM outcome_rewards WHERE total_reward > 0;
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Get Statistics
|
|
156
|
+
|
|
157
|
+
```sql
|
|
158
|
+
SELECT
|
|
159
|
+
COUNT(*) as total,
|
|
160
|
+
SUM(CASE WHEN total_reward > 0 THEN 1 ELSE 0 END) as with_rewards,
|
|
161
|
+
AVG(total_reward) as avg_reward,
|
|
162
|
+
MAX(total_reward) as max_reward
|
|
163
|
+
FROM outcome_rewards;
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## 🎯 What is Image-Only Mode?
|
|
167
|
+
|
|
168
|
+
**Image-Only Mode** means:
|
|
169
|
+
- ✅ Agent receives **only** base64-encoded PNG images
|
|
170
|
+
- ❌ Agent receives **no** text observations (HP, position, inventory, etc.)
|
|
171
|
+
- 🎓 Tests pure vision understanding
|
|
172
|
+
|
|
173
|
+
**Multimodal Mode** (recommended for Pokemon Red):
|
|
174
|
+
- ✅ Agent receives **both** images and text
|
|
175
|
+
- 🏆 Better performance but "easier"
|
|
176
|
+
|
|
177
|
+
Toggle with:
|
|
178
|
+
```toml
|
|
179
|
+
[eval.policy_config]
|
|
180
|
+
use_vision = true # Enable vision
|
|
181
|
+
image_only_mode = false # false = send text too
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## 📁 Files Created
|
|
185
|
+
|
|
186
|
+
### Crafter
|
|
187
|
+
- `crafter/eval_image_only_gpt4o.toml` - Config
|
|
188
|
+
- `crafter/README_IMAGE_ONLY_EVAL.md` - Full guide
|
|
189
|
+
- `crafter/EVAL_IMAGE_ONLY_RESULTS.md` - Example results
|
|
190
|
+
- `crafter/QUERY_EXAMPLES.md` - SQL queries
|
|
191
|
+
|
|
192
|
+
### Pokemon Red
|
|
193
|
+
- `pokemon_red/eval_image_only_gpt4o.toml` - Config
|
|
194
|
+
- `pokemon_red/README_IMAGE_ONLY_EVAL.md` - Full guide
|
|
195
|
+
- `pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md` - Implementation
|
|
196
|
+
- `pokemon_red/EVAL_IMAGE_ONLY_STATUS.md` - Status
|
|
197
|
+
|
|
198
|
+
## 🐛 Common Issues
|
|
199
|
+
|
|
200
|
+
### Database Not Created
|
|
201
|
+
```bash
|
|
202
|
+
# Ensure variables are set
|
|
203
|
+
export TASKAPP_TRACING_ENABLED=1
|
|
204
|
+
export TURSO_NATIVE=1
|
|
205
|
+
export SQLD_DB_PATH="traces/v3/your_eval.db"
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### 401 Unauthorized
|
|
209
|
+
```bash
|
|
210
|
+
# Check API key in .env
|
|
211
|
+
cat .env | grep OPENAI_API_KEY
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Pokemon Red: ROM Not Found
|
|
215
|
+
```bash
|
|
216
|
+
# Place ROM at expected location
|
|
217
|
+
cp pokemon_red.gb synth_ai/environments/examples/red/roms/
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
### All Rewards Zero
|
|
221
|
+
- **Crafter**: Should get ~70% non-zero by default
|
|
222
|
+
- **Pokemon Red**: Expected with 10 steps - increase to 100-500
|
|
223
|
+
|
|
224
|
+
## 🎓 Understanding Results
|
|
225
|
+
|
|
226
|
+
### Crafter Achievements
|
|
227
|
+
- `collect_wood` - Cut down trees
|
|
228
|
+
- `collect_sapling` - Collect tree saplings
|
|
229
|
+
- `collect_drink` - Drink from water
|
|
230
|
+
|
|
231
|
+
### Pokemon Red Milestones
|
|
232
|
+
- Leave bedroom (+20)
|
|
233
|
+
- Exit house (+30)
|
|
234
|
+
- Find Oak's lab (+40)
|
|
235
|
+
- Get starter Pokemon (+100)
|
|
236
|
+
- Win first battle (+150)
|
|
237
|
+
|
|
238
|
+
**Total possible**: ~600 points
|
|
239
|
+
|
|
240
|
+
## 🚀 Next Steps
|
|
241
|
+
|
|
242
|
+
1. **Read full docs**: See task-specific READMEs for details
|
|
243
|
+
2. **Run evaluations**: Start with Crafter (easier)
|
|
244
|
+
3. **Query database**: Use SQL to analyze results
|
|
245
|
+
4. **Tune configs**: Adjust steps/model for better performance
|
|
246
|
+
5. **Compare modes**: Try image-only vs multimodal
|
|
247
|
+
|
|
248
|
+
## 📞 Support
|
|
249
|
+
|
|
250
|
+
For issues or questions:
|
|
251
|
+
1. Check full README for your task app
|
|
252
|
+
2. Review example results files
|
|
253
|
+
3. Query database to verify data
|
|
254
|
+
4. Adjust config parameters
|
|
255
|
+
|
|
256
|
+
Happy evaluating! 🎮
|
|
257
|
+
|
|
258
|
+
|
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
# Creating SFT Datasets from Crafter Traces
|
|
2
|
+
|
|
3
|
+
There are two approaches to creating SFT (Supervised Fine-Tuning) datasets from Crafter rollouts:
|
|
4
|
+
|
|
5
|
+
## Approach 1: Direct SFT Recording (Recommended)
|
|
6
|
+
|
|
7
|
+
Crafter's rollout system can write SFT-ready JSONL files directly during evaluation when the `SFT_OUTPUT_DIR` environment variable is set.
|
|
8
|
+
|
|
9
|
+
### Setup
|
|
10
|
+
|
|
11
|
+
1. Set the SFT output directory environment variable:
|
|
12
|
+
```bash
|
|
13
|
+
export SFT_OUTPUT_DIR="ft_data/crafter_sft"
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
2. Run evaluation:
|
|
17
|
+
```bash
|
|
18
|
+
cd /path/to/synth-ai
|
|
19
|
+
|
|
20
|
+
export TASKAPP_TRACING_ENABLED=1
|
|
21
|
+
export TURSO_NATIVE=1
|
|
22
|
+
export SQLD_DB_PATH="traces/v3/crafter_eval.db"
|
|
23
|
+
export SFT_OUTPUT_DIR="ft_data/crafter_sft" # Enable SFT recording
|
|
24
|
+
|
|
25
|
+
uv run synth-ai eval grpo-crafter \
|
|
26
|
+
--config examples/task_apps/crafter/eval_image_only_gpt4o.toml
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
3. SFT files will be written to:
|
|
30
|
+
```
|
|
31
|
+
ft_data/crafter_sft/
|
|
32
|
+
├── sft_<run_id_1>.jsonl
|
|
33
|
+
├── sft_<run_id_2>.jsonl
|
|
34
|
+
├── ...
|
|
35
|
+
└── sft_<run_id_10>.jsonl
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### SFT Record Format
|
|
39
|
+
|
|
40
|
+
Each JSONL file contains records like:
|
|
41
|
+
```json
|
|
42
|
+
{
|
|
43
|
+
"messages": [
|
|
44
|
+
{"role": "system", "content": "...system prompt..."},
|
|
45
|
+
{"role": "user", "content": "...observation..."},
|
|
46
|
+
{"role": "assistant", "content": "...action..."}
|
|
47
|
+
],
|
|
48
|
+
"metadata": {
|
|
49
|
+
"run_id": "...",
|
|
50
|
+
"turn": 5,
|
|
51
|
+
"reward": 1.0,
|
|
52
|
+
...
|
|
53
|
+
}
|
|
54
|
+
}
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Combine Multiple Files
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
# Combine all SFT files into one
|
|
61
|
+
cat ft_data/crafter_sft/sft_*.jsonl > ft_data/crafter_combined.jsonl
|
|
62
|
+
|
|
63
|
+
# Count examples
|
|
64
|
+
wc -l ft_data/crafter_combined.jsonl
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Approach 2: Extract from Turso Database (Not Currently Supported)
|
|
68
|
+
|
|
69
|
+
The `synth-ai filter` command is designed for traces with a different structure (where prompt/completion are stored in session metadata).
|
|
70
|
+
|
|
71
|
+
**Current Limitation**: Crafter's SessionTracer-based traces don't store messages in the format expected by the filter command.
|
|
72
|
+
|
|
73
|
+
### Why Filter Doesn't Work
|
|
74
|
+
|
|
75
|
+
The filter command expects:
|
|
76
|
+
```python
|
|
77
|
+
metadata = {
|
|
78
|
+
"prompt": "...", # User message
|
|
79
|
+
"completion": "..." # Assistant response
|
|
80
|
+
}
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
But Crafter traces store:
|
|
84
|
+
- Messages in a separate `messages` table (currently 0 messages - not recorded during eval)
|
|
85
|
+
- Rewards in `outcome_rewards` table
|
|
86
|
+
- Metadata without prompt/completion fields
|
|
87
|
+
|
|
88
|
+
### Future Enhancement
|
|
89
|
+
|
|
90
|
+
To make filter work with Crafter traces, we would need to:
|
|
91
|
+
1. Modify rollout to record messages to the `messages` table
|
|
92
|
+
2. Update filter command to query `messages` table directly
|
|
93
|
+
3. Join with `outcome_rewards` to filter by achievements
|
|
94
|
+
|
|
95
|
+
## Comparison
|
|
96
|
+
|
|
97
|
+
| Feature | Direct SFT | Filter Command |
|
|
98
|
+
|---------|-----------|----------------|
|
|
99
|
+
| **Setup** | Set `SFT_OUTPUT_DIR` | Create filter config |
|
|
100
|
+
| **When** | During rollout | After rollout |
|
|
101
|
+
| **Format** | JSONL per rollout | Combined JSONL |
|
|
102
|
+
| **Filtering** | Manual (combine files) | Automatic (SQL queries) |
|
|
103
|
+
| **Status** | ✅ Works now | ❌ Needs implementation |
|
|
104
|
+
|
|
105
|
+
## Recommended Workflow
|
|
106
|
+
|
|
107
|
+
### 1. Run evaluation with SFT recording:
|
|
108
|
+
```bash
|
|
109
|
+
export SFT_OUTPUT_DIR="ft_data/crafter_sft"
|
|
110
|
+
uv run synth-ai eval grpo-crafter --config examples/task_apps/crafter/eval_image_only_gpt4o.toml
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
### 2. Filter for successful rollouts:
|
|
114
|
+
|
|
115
|
+
Since we can't use the filter command yet, manually select files:
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
# Query database to find session_ids with rewards
|
|
119
|
+
sqlite3 traces/v3/crafter_eval.db \
|
|
120
|
+
"SELECT session_id FROM outcome_rewards WHERE total_reward > 0" \
|
|
121
|
+
> successful_sessions.txt
|
|
122
|
+
|
|
123
|
+
# Create directory for filtered SFT
|
|
124
|
+
mkdir -p ft_data/crafter_sft_filtered
|
|
125
|
+
|
|
126
|
+
# Copy only successful rollout SFT files
|
|
127
|
+
while read session_id; do
|
|
128
|
+
if [ -f "ft_data/crafter_sft/sft_${session_id}.jsonl" ]; then
|
|
129
|
+
cp "ft_data/crafter_sft/sft_${session_id}.jsonl" ft_data/crafter_sft_filtered/
|
|
130
|
+
fi
|
|
131
|
+
done < successful_sessions.txt
|
|
132
|
+
|
|
133
|
+
# Combine filtered files
|
|
134
|
+
cat ft_data/crafter_sft_filtered/sft_*.jsonl > ft_data/crafter_high_reward.jsonl
|
|
135
|
+
|
|
136
|
+
echo "Created SFT dataset: ft_data/crafter_high_reward.jsonl"
|
|
137
|
+
wc -l ft_data/crafter_high_reward.jsonl
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
### 3. Verify dataset:
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
# Look at first example
|
|
144
|
+
head -1 ft_data/crafter_high_reward.jsonl | jq .
|
|
145
|
+
|
|
146
|
+
# Count examples
|
|
147
|
+
wc -l ft_data/crafter_high_reward.jsonl
|
|
148
|
+
|
|
149
|
+
# Check message types
|
|
150
|
+
jq -r '.messages[].role' ft_data/crafter_high_reward.jsonl | sort | uniq -c
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Example: Complete Pipeline
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
#!/bin/bash
|
|
157
|
+
# complete_sft_pipeline.sh
|
|
158
|
+
|
|
159
|
+
cd /path/to/synth-ai
|
|
160
|
+
|
|
161
|
+
# 1. Run evaluation with SFT recording
|
|
162
|
+
export TASKAPP_TRACING_ENABLED=1
|
|
163
|
+
export TURSO_NATIVE=1
|
|
164
|
+
export SQLD_DB_PATH="traces/v3/crafter_eval.db"
|
|
165
|
+
export SFT_OUTPUT_DIR="ft_data/crafter_sft"
|
|
166
|
+
|
|
167
|
+
echo "Running evaluation..."
|
|
168
|
+
uv run synth-ai eval grpo-crafter \
|
|
169
|
+
--config examples/task_apps/crafter/eval_image_only_gpt4o.toml
|
|
170
|
+
|
|
171
|
+
# 2. Filter for successful rollouts
|
|
172
|
+
echo "Filtering for successful rollouts..."
|
|
173
|
+
mkdir -p ft_data/crafter_sft_filtered
|
|
174
|
+
|
|
175
|
+
sqlite3 traces/v3/crafter_eval.db \
|
|
176
|
+
"SELECT session_id FROM outcome_rewards WHERE total_reward > 0" | \
|
|
177
|
+
while read session_id; do
|
|
178
|
+
if [ -f "ft_data/crafter_sft/sft_${session_id}.jsonl" ]; then
|
|
179
|
+
cp "ft_data/crafter_sft/sft_${session_id}.jsonl" ft_data/crafter_sft_filtered/
|
|
180
|
+
fi
|
|
181
|
+
done
|
|
182
|
+
|
|
183
|
+
# 3. Combine into single dataset
|
|
184
|
+
echo "Creating combined dataset..."
|
|
185
|
+
cat ft_data/crafter_sft_filtered/sft_*.jsonl > ft_data/crafter_high_reward.jsonl
|
|
186
|
+
|
|
187
|
+
# 4. Report statistics
|
|
188
|
+
echo ""
|
|
189
|
+
echo "=== SFT Dataset Created ==="
|
|
190
|
+
echo "Total examples: $(wc -l < ft_data/crafter_high_reward.jsonl)"
|
|
191
|
+
echo "Location: ft_data/crafter_high_reward.jsonl"
|
|
192
|
+
echo ""
|
|
193
|
+
echo "Rollouts included:"
|
|
194
|
+
sqlite3 traces/v3/crafter_eval.db \
|
|
195
|
+
"SELECT
|
|
196
|
+
COUNT(*) as count,
|
|
197
|
+
SUM(total_reward) as total_reward,
|
|
198
|
+
AVG(achievements_count) as avg_achievements
|
|
199
|
+
FROM outcome_rewards
|
|
200
|
+
WHERE total_reward > 0"
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## Troubleshooting
|
|
204
|
+
|
|
205
|
+
### No SFT Files Created
|
|
206
|
+
|
|
207
|
+
**Issue**: `ft_data/crafter_sft/` is empty after evaluation
|
|
208
|
+
|
|
209
|
+
**Possible causes**:
|
|
210
|
+
1. `SFT_OUTPUT_DIR` environment variable not set
|
|
211
|
+
2. The rollout doesn't record SFT data by default in eval mode
|
|
212
|
+
3. Directory permissions issue
|
|
213
|
+
|
|
214
|
+
**Debug**:
|
|
215
|
+
```bash
|
|
216
|
+
# Check if variable is set
|
|
217
|
+
echo $SFT_OUTPUT_DIR
|
|
218
|
+
|
|
219
|
+
# Check directory exists and is writable
|
|
220
|
+
ls -la ft_data/
|
|
221
|
+
|
|
222
|
+
# Try with explicit path
|
|
223
|
+
export SFT_OUTPUT_DIR="/Users/joshpurtell/Documents/GitHub/synth-ai/ft_data/crafter_sft"
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
### SFT Files Don't Match Successful Rollouts
|
|
227
|
+
|
|
228
|
+
**Issue**: SFT files exist for rollouts with zero reward
|
|
229
|
+
|
|
230
|
+
**Solution**: This is expected - SFT is recorded for all rollouts. Use the filtering step to keep only successful ones.
|
|
231
|
+
|
|
232
|
+
## Future Work
|
|
233
|
+
|
|
234
|
+
To enable the `synth-ai filter` command for Crafter traces:
|
|
235
|
+
|
|
236
|
+
1. **Modify Crafter rollout** to record messages to database:
|
|
237
|
+
```python
|
|
238
|
+
# In RolloutTracingContext
|
|
239
|
+
await self.tracer.record_message(
|
|
240
|
+
content=user_prompt,
|
|
241
|
+
message_type="user",
|
|
242
|
+
metadata={"turn": turn}
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
await self.tracer.record_message(
|
|
246
|
+
content=assistant_response,
|
|
247
|
+
message_type="assistant",
|
|
248
|
+
metadata={"turn": turn, "reward": step_reward}
|
|
249
|
+
)
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
2. **Update filter command** to query messages table:
|
|
253
|
+
```python
|
|
254
|
+
# Instead of looking for metadata.prompt/completion
|
|
255
|
+
# Query messages table directly
|
|
256
|
+
messages = await tracer.db.get_messages(session_id)
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
3. **Create filter config** that works:
|
|
260
|
+
```toml
|
|
261
|
+
[filter]
|
|
262
|
+
db = "traces/v3/crafter_eval.db"
|
|
263
|
+
output = "ft_data/crafter_filtered.jsonl"
|
|
264
|
+
min_official_score = 0.01 # Filter by outcome_rewards
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
## See Also
|
|
268
|
+
|
|
269
|
+
- `README_IMAGE_ONLY_EVAL.md` - How to run evaluations
|
|
270
|
+
- `EVAL_IMAGE_ONLY_RESULTS.md` - Example results
|
|
271
|
+
- `QUERY_EXAMPLES.md` - SQL queries for trace analysis
|
|
272
|
+
|
|
273
|
+
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# Crafter Image-Only Eval Results
|
|
2
|
+
|
|
3
|
+
## Summary
|
|
4
|
+
Successfully ran 10 rollouts of the Crafter task app using **image-only input** (no text observations), with full tracing and rewards saved to a Turso database.
|
|
5
|
+
|
|
6
|
+
## Configuration
|
|
7
|
+
- **Model**: `gpt-4o-mini-2024-07-18`
|
|
8
|
+
- **Input Mode**: Image-only (vision enabled, text observations disabled)
|
|
9
|
+
- **Max Steps**: 10 per episode
|
|
10
|
+
- **Max LLM Calls**: 10 per rollout
|
|
11
|
+
- **Seeds**: 0-9 (10 rollouts)
|
|
12
|
+
- **Tracing**: Enabled with Turso/libsql (MVCC concurrent writes)
|
|
13
|
+
- **Database**: `traces/v3/crafter_eval.db` (1.7MB)
|
|
14
|
+
|
|
15
|
+
## Results
|
|
16
|
+
|
|
17
|
+
### Overall Performance
|
|
18
|
+
- **Total Rollouts**: 10
|
|
19
|
+
- **Success Rate**: 100% (10/10 completed)
|
|
20
|
+
- **Mean Official Score**: 0.700 (70%)
|
|
21
|
+
- **Rollouts with Achievements**: 7/10 (70%)
|
|
22
|
+
|
|
23
|
+
### Achievement Distribution
|
|
24
|
+
| Achievements Count | Number of Rollouts |
|
|
25
|
+
|-------------------|-------------------|
|
|
26
|
+
| 3 | 1 |
|
|
27
|
+
| 2 | 4 |
|
|
28
|
+
| 1 | 2 |
|
|
29
|
+
| 0 | 3 |
|
|
30
|
+
|
|
31
|
+
### Top Performing Rollouts
|
|
32
|
+
1. **Seed 0** - 3 achievements: `collect_drink`, `collect_sapling`, `collect_wood` (reward: 3)
|
|
33
|
+
2. **Seed 1** - 2 achievements: `collect_sapling`, `collect_wood` (reward: 2)
|
|
34
|
+
3. **Seed 3** - 2 achievements: `collect_sapling`, `collect_wood` (reward: 2)
|
|
35
|
+
4. **Seed 6** - 2 achievements: `collect_sapling`, `collect_wood` (reward: 2)
|
|
36
|
+
5. **Seed 9** - 2 achievements: `collect_sapling`, `collect_wood` (reward: 2)
|
|
37
|
+
6. **Seed 4** - 1 achievement: `collect_wood` (reward: 1)
|
|
38
|
+
7. **Seed 7** - 1 achievement: `collect_wood` (reward: 1)
|
|
39
|
+
|
|
40
|
+
### Rollouts with No Achievements
|
|
41
|
+
- Seeds 2, 5, 8 - No achievements earned
|
|
42
|
+
|
|
43
|
+
## Database Schema
|
|
44
|
+
|
|
45
|
+
### outcome_rewards Table
|
|
46
|
+
```sql
|
|
47
|
+
CREATE TABLE outcome_rewards (
|
|
48
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
49
|
+
session_id VARCHAR NOT NULL,
|
|
50
|
+
total_reward INTEGER NOT NULL,
|
|
51
|
+
achievements_count INTEGER NOT NULL,
|
|
52
|
+
total_steps INTEGER NOT NULL,
|
|
53
|
+
created_at DATETIME NOT NULL,
|
|
54
|
+
reward_metadata TEXT,
|
|
55
|
+
FOREIGN KEY(session_id) REFERENCES session_traces(session_id)
|
|
56
|
+
);
|
|
57
|
+
CREATE INDEX idx_outcome_rewards_session ON outcome_rewards (session_id);
|
|
58
|
+
CREATE INDEX idx_outcome_rewards_total ON outcome_rewards (total_reward);
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Query Examples
|
|
62
|
+
|
|
63
|
+
### Get rollouts with achievements > 0
|
|
64
|
+
```sql
|
|
65
|
+
SELECT
|
|
66
|
+
st.session_id,
|
|
67
|
+
st.num_timesteps,
|
|
68
|
+
orw.achievements_count,
|
|
69
|
+
orw.total_reward,
|
|
70
|
+
json_extract(orw.reward_metadata, '$.final_achievements') as achievements
|
|
71
|
+
FROM session_traces st
|
|
72
|
+
INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
|
|
73
|
+
WHERE orw.achievements_count > 0
|
|
74
|
+
ORDER BY orw.achievements_count DESC, orw.total_reward DESC;
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Count rollouts by achievement count
|
|
78
|
+
```sql
|
|
79
|
+
SELECT achievements_count, COUNT(*) as count
|
|
80
|
+
FROM outcome_rewards
|
|
81
|
+
GROUP BY achievements_count
|
|
82
|
+
ORDER BY achievements_count DESC;
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Get top performers
|
|
86
|
+
```sql
|
|
87
|
+
SELECT session_id, total_reward, achievements_count, reward_metadata
|
|
88
|
+
FROM outcome_rewards
|
|
89
|
+
WHERE achievements_count > 0 OR total_reward > 0
|
|
90
|
+
ORDER BY achievements_count DESC, total_reward DESC
|
|
91
|
+
LIMIT 10;
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Key Changes Made
|
|
95
|
+
|
|
96
|
+
### 1. OpenAI Authorization Fix
|
|
97
|
+
Updated `openai_client.py` to properly set the `Authorization: Bearer` header for OpenAI API calls:
|
|
98
|
+
```python
|
|
99
|
+
# If calling OpenAI directly (api.openai.com)
|
|
100
|
+
if "api.openai.com" in low_url:
|
|
101
|
+
openai_key = os.getenv("OPENAI_API_KEY")
|
|
102
|
+
if openai_key and isinstance(openai_key, str):
|
|
103
|
+
headers["Authorization"] = f"Bearer {openai_key}"
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### 2. Image-Only Mode Implementation
|
|
107
|
+
Added `image_only_mode` support to `CrafterPolicy` and `CrafterReActAgent`:
|
|
108
|
+
- When enabled, only image observations are sent to the LLM
|
|
109
|
+
- Text observations are set to empty string
|
|
110
|
+
- Vision mode is automatically enabled
|
|
111
|
+
|
|
112
|
+
### 3. Trace Format Support
|
|
113
|
+
Fixed CLI to properly handle both "compact" and "full" trace formats:
|
|
114
|
+
```python
|
|
115
|
+
# Handle both "compact" and "full" trace formats
|
|
116
|
+
session_trace_dict = trace_namespace.get("session_trace")
|
|
117
|
+
if not isinstance(session_trace_dict, dict):
|
|
118
|
+
if "session_id" in trace_namespace:
|
|
119
|
+
session_trace_dict = trace_namespace
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### 4. Request Body Structure
|
|
123
|
+
Fixed rollout request to properly nest tracing parameters:
|
|
124
|
+
```python
|
|
125
|
+
"record": {
|
|
126
|
+
"return_trace": True,
|
|
127
|
+
"trace_format": "full",
|
|
128
|
+
}
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Files Modified
|
|
132
|
+
1. `/Users/joshpurtell/Documents/GitHub/synth-ai/examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py`
|
|
133
|
+
2. `/Users/joshpurtell/Documents/GitHub/synth-ai/examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py`
|
|
134
|
+
3. `/Users/joshpurtell/Documents/GitHub/synth-ai/examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py`
|
|
135
|
+
4. `/Users/joshpurtell/Documents/GitHub/synth-ai/synth_ai/cli/task_apps.py`
|
|
136
|
+
5. `/Users/joshpurtell/Documents/GitHub/synth-ai/examples/task_apps/crafter/eval_image_only_gpt4o.toml`
|
|
137
|
+
|
|
138
|
+
## Verification
|
|
139
|
+
- ✅ All 10 rollouts completed successfully
|
|
140
|
+
- ✅ Image-only input confirmed (base64 PNG images in prompts)
|
|
141
|
+
- ✅ Achievements computed and saved
|
|
142
|
+
- ✅ Foreign keys working (can join session_traces and outcome_rewards)
|
|
143
|
+
- ✅ Can query rollouts by achievement count and rewards
|
|
144
|
+
- ✅ Database size: 1.7MB with full trace data
|
|
145
|
+
|
|
146
|
+
## Next Steps
|
|
147
|
+
- Increase `max_steps_per_episode` for longer episodes
|
|
148
|
+
- Try different models (e.g., gpt-4o, claude-3.5-sonnet)
|
|
149
|
+
- Analyze which actions lead to the most achievements
|
|
150
|
+
- Use concurrent writes with higher concurrency (Turso MVCC supports this)
|
|
151
|
+
|
|
152
|
+
|