synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/README.md +1 -0
- examples/multi_step/SFT_README.md +147 -0
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -11
- examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/convert_traces_to_sft.py +84 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/run_sft_qwen30b.sh +45 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +3 -2
- examples/qwen_coder/configs/coder_lora_4b.toml +2 -1
- examples/qwen_coder/configs/coder_lora_small.toml +2 -1
- examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
- examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
- examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
- examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
- examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
- examples/qwen_vl/QUICKSTART.md +327 -0
- examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
- examples/qwen_vl/README.md +154 -0
- examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
- examples/qwen_vl/RL_VISION_TESTING.md +333 -0
- examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
- examples/qwen_vl/SETUP_COMPLETE.md +275 -0
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +490 -0
- examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
- examples/qwen_vl/__init__.py +2 -0
- examples/qwen_vl/collect_data_via_cli.md +423 -0
- examples/qwen_vl/collect_vision_traces.py +368 -0
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +127 -0
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +60 -0
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +43 -0
- examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +45 -0
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +44 -0
- examples/qwen_vl/configs/filter_qwen2vl_sft.toml +50 -0
- examples/qwen_vl/configs/filter_vision_sft.toml +53 -0
- examples/qwen_vl/configs/filter_vision_test.toml +8 -0
- examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
- examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
- examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
- examples/qwen_vl/run_vision_comparison.sh +62 -0
- examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
- examples/qwen_vl/test_image_validation.py +201 -0
- examples/qwen_vl/test_sft_vision_data.py +110 -0
- examples/rl/README.md +1 -1
- examples/rl/configs/eval_base_qwen.toml +17 -0
- examples/rl/configs/eval_rl_qwen.toml +13 -0
- examples/rl/configs/rl_from_base_qwen.toml +37 -0
- examples/rl/configs/rl_from_base_qwen17.toml +76 -0
- examples/rl/configs/rl_from_ft_qwen.toml +37 -0
- examples/rl/run_eval.py +436 -0
- examples/rl/run_rl_and_save.py +111 -0
- examples/rl/task_app/README.md +22 -0
- examples/rl/task_app/math_single_step.py +990 -0
- examples/rl/task_app/math_task_app.py +111 -0
- examples/sft/README.md +5 -5
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -2
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -3
- examples/sft/evaluate.py +4 -4
- examples/sft/export_dataset.py +7 -4
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/README.md +1 -1
- examples/swe/task_app/grpo_swe_mini.py +1 -1
- examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +13 -13
- examples/swe/task_app/hosted/policy_routes.py +0 -2
- examples/swe/task_app/hosted/rollout.py +2 -8
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +3 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +309 -14
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +75 -4
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +55 -3
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +114 -32
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +127 -27
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
- examples/task_apps/pokemon_red/task_app.py +199 -6
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/vlm/README.md +3 -3
- examples/vlm/configs/crafter_vlm_gpt4o.toml +2 -0
- examples/vlm/crafter_openai_vlm_agent.py +3 -5
- examples/vlm/filter_image_rows.py +1 -1
- examples/vlm/run_crafter_vlm_benchmark.py +2 -2
- examples/warming_up_to_rl/_utils.py +92 -0
- examples/warming_up_to_rl/analyze_trace_db.py +1 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +2 -0
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
- examples/warming_up_to_rl/export_trace_sft.py +174 -60
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/readme.md +63 -132
- examples/warming_up_to_rl/run_fft_and_save.py +1 -1
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rl_and_save.py +1 -1
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- examples/warming_up_to_rl/task_app/README.md +42 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +696 -0
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +478 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1081 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
- synth_ai/__init__.py +44 -30
- synth_ai/_utils/__init__.py +47 -0
- synth_ai/_utils/base_url.py +10 -0
- synth_ai/_utils/http.py +10 -0
- synth_ai/_utils/prompts.py +10 -0
- synth_ai/_utils/task_app_state.py +12 -0
- synth_ai/_utils/user_config.py +10 -0
- synth_ai/api/models/supported.py +145 -7
- synth_ai/api/train/__init__.py +13 -1
- synth_ai/api/train/cli.py +30 -7
- synth_ai/api/train/config_finder.py +18 -11
- synth_ai/api/train/env_resolver.py +13 -10
- synth_ai/cli/__init__.py +66 -49
- synth_ai/cli/_modal_wrapper.py +9 -6
- synth_ai/cli/_typer_patch.py +0 -2
- synth_ai/cli/_validate_task_app.py +22 -4
- synth_ai/cli/legacy_root_backup.py +3 -1
- synth_ai/cli/lib/__init__.py +10 -0
- synth_ai/cli/lib/task_app_discovery.py +7 -0
- synth_ai/cli/lib/task_app_env.py +518 -0
- synth_ai/cli/recent.py +1 -0
- synth_ai/cli/setup.py +266 -0
- synth_ai/cli/task_app_deploy.py +16 -0
- synth_ai/cli/task_app_list.py +25 -0
- synth_ai/cli/task_app_modal_serve.py +16 -0
- synth_ai/cli/task_app_serve.py +18 -0
- synth_ai/cli/task_apps.py +392 -141
- synth_ai/cli/train.py +18 -0
- synth_ai/cli/tui.py +62 -0
- synth_ai/demos/__init__.py +10 -0
- synth_ai/demos/core/__init__.py +28 -1
- synth_ai/demos/crafter/__init__.py +1 -0
- synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
- synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
- synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
- synth_ai/demos/demo_registry.py +176 -0
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/math/__init__.py +1 -0
- synth_ai/demos/math/_common.py +16 -0
- synth_ai/demos/math/app.py +38 -0
- synth_ai/demos/math/config.toml +76 -0
- synth_ai/demos/math/deploy_modal.py +54 -0
- synth_ai/demos/math/modal_task_app.py +702 -0
- synth_ai/demos/math/task_app_entry.py +51 -0
- synth_ai/environments/environment/core.py +7 -1
- synth_ai/environments/examples/bandit/engine.py +0 -1
- synth_ai/environments/examples/bandit/environment.py +0 -1
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/verilog/engine.py +76 -10
- synth_ai/environments/examples/wordle/environment.py +0 -1
- synth_ai/evals/base.py +16 -5
- synth_ai/evals/client.py +1 -1
- synth_ai/inference/client.py +1 -1
- synth_ai/learning/client.py +1 -1
- synth_ai/learning/health.py +1 -1
- synth_ai/learning/jobs.py +1 -1
- synth_ai/learning/rl/client.py +1 -1
- synth_ai/learning/rl/env_keys.py +1 -1
- synth_ai/learning/rl/secrets.py +1 -1
- synth_ai/learning/sft/client.py +1 -1
- synth_ai/learning/sft/data.py +407 -4
- synth_ai/learning/validators.py +4 -1
- synth_ai/task/__init__.py +11 -1
- synth_ai/task/apps/__init__.py +5 -2
- synth_ai/task/config.py +259 -0
- synth_ai/task/contracts.py +15 -2
- synth_ai/task/rubrics/__init__.py +4 -2
- synth_ai/task/rubrics/loaders.py +27 -4
- synth_ai/task/rubrics/scoring.py +3 -0
- synth_ai/task/rubrics.py +219 -0
- synth_ai/task/trace_correlation_helpers.py +328 -0
- synth_ai/task/tracing_utils.py +14 -3
- synth_ai/task/validators.py +145 -2
- synth_ai/tracing_v3/config.py +15 -13
- synth_ai/tracing_v3/constants.py +21 -0
- synth_ai/tracing_v3/db_config.py +3 -1
- synth_ai/tracing_v3/decorators.py +10 -7
- synth_ai/tracing_v3/session_tracer.py +10 -0
- synth_ai/tracing_v3/turso/daemon.py +2 -2
- synth_ai/tracing_v3/turso/native_manager.py +108 -77
- synth_ai/tracing_v3/utils.py +1 -1
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +911 -0
- synth_ai/utils/__init__.py +101 -0
- synth_ai/utils/base_url.py +94 -0
- synth_ai/utils/cli.py +131 -0
- synth_ai/utils/env.py +287 -0
- synth_ai/utils/http.py +169 -0
- synth_ai/utils/modal.py +308 -0
- synth_ai/utils/process.py +212 -0
- synth_ai/utils/prompts.py +39 -0
- synth_ai/utils/sqld.py +122 -0
- synth_ai/utils/task_app_discovery.py +882 -0
- synth_ai/utils/task_app_env.py +186 -0
- synth_ai/utils/task_app_state.py +318 -0
- synth_ai/utils/user_config.py +137 -0
- synth_ai/v0/config/__init__.py +1 -5
- synth_ai/v0/config/base_url.py +1 -7
- synth_ai/v0/tracing/config.py +1 -1
- synth_ai/v0/tracing/decorators.py +1 -1
- synth_ai/v0/tracing/upload.py +1 -1
- synth_ai/v0/tracing_v1/config.py +1 -1
- synth_ai/v0/tracing_v1/decorators.py +1 -1
- synth_ai/v0/tracing_v1/upload.py +1 -1
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/METADATA +85 -31
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/RECORD +286 -135
- synth_ai/cli/man.py +0 -106
- synth_ai/compound/cais.py +0 -0
- synth_ai/core/experiment.py +0 -13
- synth_ai/core/system.py +0 -15
- synth_ai/demo_registry.py +0 -295
- synth_ai/handshake.py +0 -109
- synth_ai/http.py +0 -26
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# Filter Command Status for Crafter
|
|
2
|
+
|
|
3
|
+
## Summary
|
|
4
|
+
|
|
5
|
+
The `synth-ai filter` command has been updated to work with Crafter's SessionTracer v3 traces, but there's currently an issue with message persistence that needs to be resolved.
|
|
6
|
+
|
|
7
|
+
## What Was Changed
|
|
8
|
+
|
|
9
|
+
### 1. Updated Filter Command (`synth_ai/cli/task_apps.py`)
|
|
10
|
+
|
|
11
|
+
The filter command now:
|
|
12
|
+
- ✅ Queries `outcome_rewards` table to filter by `total_reward`
|
|
13
|
+
- ✅ Queries `messages` table to extract user/assistant pairs
|
|
14
|
+
- ✅ Falls back to metadata-based filtering for backwards compatibility
|
|
15
|
+
- ✅ Supports filtering by achievements/rewards from Crafter rollouts
|
|
16
|
+
- ✅ Extracts text from structured message content (JSON payloads)
|
|
17
|
+
|
|
18
|
+
### 2. Created Filter Config
|
|
19
|
+
|
|
20
|
+
**File**: `examples/task_apps/crafter/filter_sft_dataset.toml`
|
|
21
|
+
|
|
22
|
+
```toml
|
|
23
|
+
[filter]
|
|
24
|
+
db = "traces/v3/crafter_eval.db"
|
|
25
|
+
output = "ft_data/crafter_image_only_sft.jsonl"
|
|
26
|
+
min_official_score = 0.01 # Only traces with rewards > 0
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Current Issue: Messages Not Being Saved
|
|
30
|
+
|
|
31
|
+
### Problem
|
|
32
|
+
|
|
33
|
+
When running evaluations, the database ends up with:
|
|
34
|
+
- ✅ 2 `session_traces` (metadata saved)
|
|
35
|
+
- ✅ 2 `outcome_rewards` (rewards saved)
|
|
36
|
+
- ❌ 0 `messages` (messages NOT saved)
|
|
37
|
+
- ✅ 40 `events` (environment events saved)
|
|
38
|
+
- ✅ 20 `session_timesteps` (timesteps saved)
|
|
39
|
+
|
|
40
|
+
### Expected Behavior
|
|
41
|
+
|
|
42
|
+
The rollout code calls:
|
|
43
|
+
1. `tracer.initialize()` - Opens database connection
|
|
44
|
+
2. `tracer.start_session()` - Creates session
|
|
45
|
+
3. `tracer.record_message()` - Records system/user prompts (via `record_policy_prompts`)
|
|
46
|
+
4. `tracer.end_session()` - Saves session with `auto_save=True`
|
|
47
|
+
|
|
48
|
+
The `insert_session_trace` method (in `NativeLibsqlTraceManager`) SHOULD iterate through `trace.markov_blanket_message_history` and save each message to the `messages` table.
|
|
49
|
+
|
|
50
|
+
### Actual Behavior
|
|
51
|
+
|
|
52
|
+
Messages are NOT being persisted to the database, even though:
|
|
53
|
+
- The code path looks correct
|
|
54
|
+
- `end_session()` is being called
|
|
55
|
+
- `auto_save=True` is the default
|
|
56
|
+
- The trace JSON payload includes `markov_blanket_message_history`
|
|
57
|
+
|
|
58
|
+
### Debugging Observations
|
|
59
|
+
|
|
60
|
+
1. **Trace payload includes messages**: The eval output shows a large JSON structure with `markov_blanket_message_history` containing all the prompts
|
|
61
|
+
2. **No errors logged**: The `try/except` around `end_session()` doesn't log any failures
|
|
62
|
+
3. **Reproduces with both TURSO_NATIVE=0 and TURSO_NATIVE=1**: Neither backend saves messages
|
|
63
|
+
4. **Database is writable**: `outcome_rewards` and `events` are being saved successfully
|
|
64
|
+
|
|
65
|
+
## Possible Causes
|
|
66
|
+
|
|
67
|
+
1. **Silent exception during message insertion**: The `insert_message_row` might be failing without raising
|
|
68
|
+
2. **Transaction not committed**: Messages might be inserted but not committed
|
|
69
|
+
3. **Messages not in trace object**: `markov_blanket_message_history` might be empty when `end_session` is called
|
|
70
|
+
4. **Record message not adding to history**: `tracer.record_message()` might not be appending to the list properly
|
|
71
|
+
|
|
72
|
+
## Next Steps to Fix
|
|
73
|
+
|
|
74
|
+
### Option 1: Debug Message Persistence
|
|
75
|
+
|
|
76
|
+
Add logging to trace the message save path:
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
# In rollout.py, finalize method
|
|
80
|
+
logger.info(f"[finalize] trace has {len(self.tracer._current_trace.markov_blanket_message_history)} messages before end_session")
|
|
81
|
+
|
|
82
|
+
# In native_manager.py, insert_session_trace
|
|
83
|
+
logger.info(f"[insert_session_trace] saving {len(trace.markov_blanket_message_history)} messages")
|
|
84
|
+
for msg in trace.markov_blanket_message_history:
|
|
85
|
+
logger.info(f" - message type={msg.message_type}")
|
|
86
|
+
await self.insert_message_row(...)
|
|
87
|
+
logger.info(f" - message saved")
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Option 2: Verify Messages Are Being Recorded
|
|
91
|
+
|
|
92
|
+
Check if `record_policy_prompts` is actually being called and adding messages:
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
# In rollout.py, after record_policy_prompts
|
|
96
|
+
if self.tracer and self.tracer._current_trace:
|
|
97
|
+
msg_count = len(self.tracer._current_trace.markov_blanket_message_history)
|
|
98
|
+
logger.info(f"[record_policy_prompts] trace now has {msg_count} messages")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Option 3: Manual Message Recording
|
|
102
|
+
|
|
103
|
+
As a workaround, explicitly save messages outside of SessionTracer:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
# In finalize(), before end_session()
|
|
107
|
+
if self.enabled and self.tracer is not None:
|
|
108
|
+
conn = await self.tracer.db.get_connection()
|
|
109
|
+
for msg in self.tracer._current_trace.markov_blanket_message_history:
|
|
110
|
+
await conn.execute(
|
|
111
|
+
"INSERT INTO messages (session_id, message_type, content, timestamp) VALUES (?, ?, ?, ?)",
|
|
112
|
+
(self.run_id, msg.message_type, str(msg.content), msg.time_record.event_time)
|
|
113
|
+
)
|
|
114
|
+
await conn.commit()
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Option 4: Use SFT Records Instead
|
|
118
|
+
|
|
119
|
+
Crafter already has working SFT record generation that writes directly to JSONL files. Use that instead of the filter command:
|
|
120
|
+
|
|
121
|
+
```bash
|
|
122
|
+
export SFT_OUTPUT_DIR="ft_data/crafter_sft"
|
|
123
|
+
uv run synth-ai eval grpo-crafter-task-app --config eval_image_only_gpt4o.toml
|
|
124
|
+
|
|
125
|
+
# Then combine the per-run SFT files (filter for successful runs manually afterwards)
|
|
126
|
+
cat ft_data/crafter_sft/sft_*.jsonl > ft_data/crafter_combined.jsonl
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Current Workaround
|
|
130
|
+
|
|
131
|
+
Until message persistence is fixed, use the direct SFT recording approach (Option 4) documented in `CREATE_SFT_DATASET.md`.
|
|
132
|
+
|
|
133
|
+
## Testing the Filter Command
|
|
134
|
+
|
|
135
|
+
Once messages are being saved:
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
# 1. Run eval to populate database
|
|
139
|
+
export TASKAPP_TRACING_ENABLED=1
|
|
140
|
+
export TURSO_NATIVE=0
|
|
141
|
+
export SQLD_DB_PATH="traces/v3/crafter_eval.db"
|
|
142
|
+
uv run synth-ai eval grpo-crafter-task-app --config eval_image_only_gpt4o.toml
|
|
143
|
+
|
|
144
|
+
# 2. Verify messages were saved
|
|
145
|
+
sqlite3 traces/v3/crafter_eval.db "SELECT COUNT(*) FROM messages;"
|
|
146
|
+
# Should be > 0
|
|
147
|
+
|
|
148
|
+
# 3. Run filter
|
|
149
|
+
uv run synth-ai filter --config filter_sft_dataset.toml
|
|
150
|
+
|
|
151
|
+
# 4. Check output
|
|
152
|
+
cat ft_data/crafter_image_only_sft.jsonl | jq .
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Related Files
|
|
156
|
+
|
|
157
|
+
- `synth_ai/cli/task_apps.py` - Filter command implementation (updated)
|
|
158
|
+
- `synth_ai/tracing_v3/session_tracer.py` - SessionTracer class
|
|
159
|
+
- `synth_ai/tracing_v3/turso/native_manager.py` - `insert_session_trace` method (should save messages)
|
|
160
|
+
- `examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py` - Rollout tracing context
|
|
161
|
+
- `filter_sft_dataset.toml` - Filter configuration
|
|
162
|
+
- `CREATE_SFT_DATASET.md` - Alternative approach using direct SFT recording
|
|
163
|
+
|
|
164
|
+
## Status
|
|
165
|
+
|
|
166
|
+
- ✅ Filter command updated to query messages table
|
|
167
|
+
- ✅ Filter command can join with outcome_rewards
|
|
168
|
+
- ✅ Filter config created
|
|
169
|
+
- ❌ Messages not being persisted to database
|
|
170
|
+
- ❌ Filter command cannot extract SFT data without messages
|
|
171
|
+
|
|
172
|
+
**Action Required**: Debug why messages aren't being saved to the database even though the code path appears correct.
|
|
173
|
+
|
|
174
|
+
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# Filter Command Success - SFT Dataset Creation Working!
|
|
2
|
+
|
|
3
|
+
## ✅ Complete Success!
|
|
4
|
+
|
|
5
|
+
The `uv run synth-ai eval` → `uv run synth-ai filter` loop is now working end-to-end for Crafter!
|
|
6
|
+
|
|
7
|
+
## What Was Fixed
|
|
8
|
+
|
|
9
|
+
### Issue 1: Early Return in `insert_session_trace`
|
|
10
|
+
**Problem**: Sessions created by `start_session` already existed in the database, so `insert_session_trace` returned early without saving messages.
|
|
11
|
+
|
|
12
|
+
**Fix**: Modified `synth_ai/tracing_v3/turso/native_manager.py` to continue processing messages even when the session already exists:
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
if session_exists:
|
|
16
|
+
# Update metadata but don't return early
|
|
17
|
+
# Continue to save messages
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
### Issue 2: Invalid Message Types
|
|
21
|
+
**Problem**: Crafter was using custom message types (`policy_system_prompt`, `policy_user_prompt`, `policy_tool_call`) that violated the database CHECK constraint.
|
|
22
|
+
|
|
23
|
+
**Fix**: Modified `examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py` to use standard message types:
|
|
24
|
+
- `policy_system_prompt` → `system`
|
|
25
|
+
- `policy_user_prompt` → `user`
|
|
26
|
+
- `policy_tool_call` → `assistant` (with `is_tool_call: true` metadata)
|
|
27
|
+
|
|
28
|
+
## Full Working Pipeline
|
|
29
|
+
|
|
30
|
+
### 1. Run Evaluation with Tracing
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
cd /path/to/synth-ai  # repository root
|
|
34
|
+
|
|
35
|
+
export TASKAPP_TRACING_ENABLED=1
|
|
36
|
+
export TURSO_NATIVE=0
|
|
37
|
+
export SQLD_DB_PATH="traces/v3/crafter_eval.db"
|
|
38
|
+
|
|
39
|
+
uv run synth-ai eval grpo-crafter-task-app \
|
|
40
|
+
--config examples/task_apps/crafter/eval_image_only_gpt4o.toml
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
**Result**:
|
|
44
|
+
- ✅ 2 rollouts completed
|
|
45
|
+
- ✅ 120 messages saved to database (40 system + 40 user + 40 assistant)
|
|
46
|
+
- ✅ 2 outcome_rewards saved with achievements
|
|
47
|
+
- ✅ Traces returned successfully
|
|
48
|
+
|
|
49
|
+
### 2. Filter to Create SFT Dataset
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
uv run synth-ai filter \
|
|
53
|
+
--config examples/task_apps/crafter/filter_sft_dataset.toml
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
**Result**:
|
|
57
|
+
```
|
|
58
|
+
Wrote 40 examples -> ft_data/crafter_image_only_sft.jsonl
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### 3. Verify SFT Data
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
# Check first example
|
|
65
|
+
head -1 ft_data/crafter_image_only_sft.jsonl | jq .
|
|
66
|
+
|
|
67
|
+
# Count examples
|
|
68
|
+
wc -l ft_data/crafter_image_only_sft.jsonl
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## SFT Dataset Format
|
|
72
|
+
|
|
73
|
+
Each line in the JSONL contains:
|
|
74
|
+
|
|
75
|
+
```json
|
|
76
|
+
{
|
|
77
|
+
"messages": [
|
|
78
|
+
{
|
|
79
|
+
"role": "user",
|
|
80
|
+
"content": "=== CRAFTER GAME STATE ===\nStep: 0/10000\n..."
|
|
81
|
+
},
|
|
82
|
+
{
|
|
83
|
+
"role": "assistant",
|
|
84
|
+
"content": "[{'tool_name': 'interact_many', 'arguments': {...}}]"
|
|
85
|
+
}
|
|
86
|
+
],
|
|
87
|
+
"metadata": {
|
|
88
|
+
"session_id": "...",
|
|
89
|
+
"env_name": "crafter",
|
|
90
|
+
"policy_name": "crafter-react",
|
|
91
|
+
"seed": 0,
|
|
92
|
+
"total_reward": 1,
|
|
93
|
+
"achievements_count": 1,
|
|
94
|
+
"created_at": "2025-10-22T23:55:25.533188+00:00"
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Database Schema
|
|
100
|
+
|
|
101
|
+
The filter command queries these tables:
|
|
102
|
+
|
|
103
|
+
### messages table
|
|
104
|
+
```sql
|
|
105
|
+
SELECT message_type, content, timestamp
|
|
106
|
+
FROM messages
|
|
107
|
+
WHERE session_id = :session_id
|
|
108
|
+
ORDER BY timestamp ASC
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
- ✅ 120 messages total
|
|
112
|
+
- System (40) + User (40) + Assistant (40) messages
|
|
113
|
+
- Pairs extracted: user → assistant
|
|
114
|
+
|
|
115
|
+
### outcome_rewards table
|
|
116
|
+
```sql
|
|
117
|
+
SELECT total_reward, achievements_count
|
|
118
|
+
FROM outcome_rewards
|
|
119
|
+
WHERE session_id = :session_id
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
- Used to filter for successful rollouts
|
|
123
|
+
- `min_official_score = 0.01` filters for rewards > 0
|
|
124
|
+
- Both rollouts had `total_reward = 1` (1 achievement each)
|
|
125
|
+
|
|
126
|
+
## Filter Configuration
|
|
127
|
+
|
|
128
|
+
**File**: `examples/task_apps/crafter/filter_sft_dataset.toml`
|
|
129
|
+
|
|
130
|
+
```toml
|
|
131
|
+
[filter]
|
|
132
|
+
db = "traces/v3/crafter_eval.db"
|
|
133
|
+
output = "ft_data/crafter_image_only_sft.jsonl"
|
|
134
|
+
min_official_score = 0.01 # Only traces with rewards > 0
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Available Filter Options
|
|
138
|
+
|
|
139
|
+
```toml
|
|
140
|
+
[filter]
|
|
141
|
+
db = "path/to/traces.db" # Required
|
|
142
|
+
output = "path/to/output.jsonl" # Required
|
|
143
|
+
|
|
144
|
+
# Optional filters
|
|
145
|
+
min_official_score = 0.01 # Filter by reward
|
|
146
|
+
splits = ["train", "test"] # Filter by split
|
|
147
|
+
task_ids = ["task_1"] # Filter by task
|
|
148
|
+
models = ["gpt-4o"] # Filter by model
|
|
149
|
+
limit = 100 # Limit number of examples
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## Statistics
|
|
153
|
+
|
|
154
|
+
From 2 rollouts with 10 turns each:
|
|
155
|
+
|
|
156
|
+
| Metric | Count |
|
|
157
|
+
|--------|-------|
|
|
158
|
+
| Total rollouts | 2 |
|
|
159
|
+
| Rollouts with rewards | 2 (100%) |
|
|
160
|
+
| Total messages saved | 120 |
|
|
161
|
+
| System messages | 40 |
|
|
162
|
+
| User messages | 40 |
|
|
163
|
+
| Assistant messages | 40 |
|
|
164
|
+
| **SFT examples** | **40** |
|
|
165
|
+
| Average turns per rollout | 10 |
|
|
166
|
+
| Examples per rollout | 20 |
|
|
167
|
+
|
|
168
|
+
## Next Steps
|
|
169
|
+
|
|
170
|
+
### Scale Up
|
|
171
|
+
|
|
172
|
+
Run with more seeds for a larger dataset:
|
|
173
|
+
|
|
174
|
+
```toml
|
|
175
|
+
# In eval_image_only_gpt4o.toml
|
|
176
|
+
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] # 10 rollouts
|
|
177
|
+
max_turns = 50 # More examples per rollout
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
Expected output: ~1000 SFT examples from 10 rollouts @ 50 turns each
|
|
181
|
+
|
|
182
|
+
### Use the SFT Data
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
# For OpenAI fine-tuning
|
|
186
|
+
# The JSONL format is compatible with OpenAI's fine-tuning API
|
|
187
|
+
|
|
188
|
+
# For local fine-tuning
|
|
189
|
+
# Convert to your preferred format (HuggingFace, etc.)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
### Filter Variations
|
|
193
|
+
|
|
194
|
+
```toml
|
|
195
|
+
# Only high-reward traces
|
|
196
|
+
min_official_score = 2.0
|
|
197
|
+
|
|
198
|
+
# Only specific achievements
|
|
199
|
+
# Query manually then filter by session_id
|
|
200
|
+
|
|
201
|
+
# Time-based filtering
|
|
202
|
+
min_created_at = "2025-10-22T00:00:00"
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Files Modified
|
|
206
|
+
|
|
207
|
+
1. **`synth_ai/tracing_v3/turso/native_manager.py`**
|
|
208
|
+
- Fixed early return when session exists
|
|
209
|
+
- Added logging for debugging
|
|
210
|
+
|
|
211
|
+
2. **`examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py`**
|
|
212
|
+
- Changed message types to standard values
|
|
213
|
+
- Added debug logging
|
|
214
|
+
|
|
215
|
+
3. **`synth_ai/cli/task_apps.py`**
|
|
216
|
+
- Updated filter command to query messages table
|
|
217
|
+
- Added support for outcome_rewards filtering
|
|
218
|
+
- Fixed SQL parameter format
|
|
219
|
+
|
|
220
|
+
4. **`examples/task_apps/crafter/filter_sft_dataset.toml`**
|
|
221
|
+
- Created filter configuration
|
|
222
|
+
|
|
223
|
+
## Troubleshooting
|
|
224
|
+
|
|
225
|
+
### No messages in database
|
|
226
|
+
|
|
227
|
+
**Check**:
|
|
228
|
+
```bash
|
|
229
|
+
sqlite3 traces/v3/crafter_eval.db "SELECT COUNT(*) FROM messages;"
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
**Fix**: Ensure `TASKAPP_TRACING_ENABLED=1` and `TURSO_NATIVE=0`
|
|
233
|
+
|
|
234
|
+
### Filter returns no examples
|
|
235
|
+
|
|
236
|
+
**Check**:
|
|
237
|
+
```bash
|
|
238
|
+
sqlite3 traces/v3/crafter_eval.db \
|
|
239
|
+
"SELECT COUNT(*) FROM outcome_rewards WHERE total_reward > 0;"
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
**Fix**: Lower `min_official_score` or remove it to include all traces
|
|
243
|
+
|
|
244
|
+
### Invalid message types
|
|
245
|
+
|
|
246
|
+
**Error**: `CHECK constraint failed: message_type IN (...)`
|
|
247
|
+
|
|
248
|
+
**Fix**: Already fixed in rollout.py - update to latest code
|
|
249
|
+
|
|
250
|
+
## Related Documentation
|
|
251
|
+
|
|
252
|
+
- `README_IMAGE_ONLY_EVAL.md` - How to run evaluations
|
|
253
|
+
- `EVAL_IMAGE_ONLY_RESULTS.md` - Example evaluation results
|
|
254
|
+
- `QUERY_EXAMPLES.md` - SQL query examples
|
|
255
|
+
- `CREATE_SFT_DATASET.md` - Original approach (now superseded)
|
|
256
|
+
|
|
257
|
+
## Success Metrics
|
|
258
|
+
|
|
259
|
+
✅ Eval completes without errors
|
|
260
|
+
✅ Messages saved to database (system, user, assistant)
|
|
261
|
+
✅ Outcome rewards saved with foreign keys
|
|
262
|
+
✅ Filter command extracts user/assistant pairs
|
|
263
|
+
✅ SFT JSONL created with proper format
|
|
264
|
+
✅ Metadata includes rewards and achievements
|
|
265
|
+
|
|
266
|
+
**Status**: 🎉 **WORKING END-TO-END!**
|
|
267
|
+
|
|
268
|
+
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# Crafter Eval Database Query Examples
|
|
2
|
+
|
|
3
|
+
## Database Location
|
|
4
|
+
```bash
|
|
5
|
+
traces/v3/crafter_eval.db  # relative to the synth-ai repository root
|
|
6
|
+
```
|
|
7
|
+
|
|
8
|
+
## Quick Stats
|
|
9
|
+
|
|
10
|
+
Run this query to get an overview:
|
|
11
|
+
```sql
|
|
12
|
+
SELECT
|
|
13
|
+
'Total rollouts' as metric,
|
|
14
|
+
CAST(COUNT(*) as TEXT) as value
|
|
15
|
+
FROM outcome_rewards
|
|
16
|
+
UNION ALL
|
|
17
|
+
SELECT
|
|
18
|
+
'Rollouts with reward > 0',
|
|
19
|
+
CAST(COUNT(*) as TEXT)
|
|
20
|
+
FROM outcome_rewards
|
|
21
|
+
WHERE total_reward > 0
|
|
22
|
+
UNION ALL
|
|
23
|
+
SELECT
|
|
24
|
+
'Average reward',
|
|
25
|
+
CAST(ROUND(AVG(total_reward), 2) as TEXT)
|
|
26
|
+
FROM outcome_rewards
|
|
27
|
+
UNION ALL
|
|
28
|
+
SELECT
|
|
29
|
+
'Max reward',
|
|
30
|
+
CAST(MAX(total_reward) as TEXT)
|
|
31
|
+
FROM outcome_rewards;
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
**Current Results:**
|
|
35
|
+
- Total rollouts: 10
|
|
36
|
+
- Rollouts with reward > 0: 7
|
|
37
|
+
- Average reward: 1.3
|
|
38
|
+
- Max reward: 3
|
|
39
|
+
|
|
40
|
+
## Filter for Non-Zero Rewards
|
|
41
|
+
|
|
42
|
+
### Simple Query
|
|
43
|
+
```sql
|
|
44
|
+
SELECT
|
|
45
|
+
session_id,
|
|
46
|
+
total_reward,
|
|
47
|
+
achievements_count,
|
|
48
|
+
json_extract(reward_metadata, '$.env_seed') as seed,
|
|
49
|
+
json_extract(reward_metadata, '$.final_achievements') as achievements
|
|
50
|
+
FROM outcome_rewards
|
|
51
|
+
WHERE total_reward > 0
|
|
52
|
+
ORDER BY total_reward DESC, achievements_count DESC;
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### With Full Session Context
|
|
56
|
+
```sql
|
|
57
|
+
SELECT
|
|
58
|
+
st.session_id,
|
|
59
|
+
st.created_at,
|
|
60
|
+
st.num_timesteps,
|
|
61
|
+
st.num_events,
|
|
62
|
+
orw.total_reward,
|
|
63
|
+
orw.achievements_count,
|
|
64
|
+
json_extract(orw.reward_metadata, '$.final_achievements') as achievements,
|
|
65
|
+
json_extract(orw.reward_metadata, '$.env_seed') as seed
|
|
66
|
+
FROM session_traces st
|
|
67
|
+
INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
|
|
68
|
+
WHERE orw.total_reward > 0
|
|
69
|
+
ORDER BY orw.total_reward DESC;
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Filter by Achievement Count
|
|
73
|
+
|
|
74
|
+
### Get rollouts with 2+ achievements
|
|
75
|
+
```sql
|
|
76
|
+
SELECT
|
|
77
|
+
session_id,
|
|
78
|
+
total_reward,
|
|
79
|
+
achievements_count,
|
|
80
|
+
json_extract(reward_metadata, '$.final_achievements') as achievements
|
|
81
|
+
FROM outcome_rewards
|
|
82
|
+
WHERE achievements_count >= 2
|
|
83
|
+
ORDER BY achievements_count DESC, total_reward DESC;
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Get rollouts with specific achievement
|
|
87
|
+
```sql
|
|
88
|
+
SELECT
|
|
89
|
+
session_id,
|
|
90
|
+
total_reward,
|
|
91
|
+
achievements_count,
|
|
92
|
+
json_extract(reward_metadata, '$.final_achievements') as achievements
|
|
93
|
+
FROM outcome_rewards
|
|
94
|
+
WHERE reward_metadata LIKE '%collect_drink%'
|
|
95
|
+
ORDER BY total_reward DESC;
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Group by Achievement Count
|
|
99
|
+
```sql
|
|
100
|
+
SELECT
|
|
101
|
+
achievements_count,
|
|
102
|
+
COUNT(*) as num_rollouts,
|
|
103
|
+
ROUND(AVG(total_reward), 2) as avg_reward,
|
|
104
|
+
SUM(total_reward) as total_reward_sum,
|
|
105
|
+
GROUP_CONCAT(DISTINCT json_extract(reward_metadata, '$.env_seed')) as seeds
|
|
106
|
+
FROM outcome_rewards
|
|
107
|
+
GROUP BY achievements_count
|
|
108
|
+
ORDER BY achievements_count DESC;
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Top Performers
|
|
112
|
+
```sql
|
|
113
|
+
SELECT
|
|
114
|
+
json_extract(orw.reward_metadata, '$.env_seed') as seed,
|
|
115
|
+
orw.total_reward,
|
|
116
|
+
orw.achievements_count,
|
|
117
|
+
orw.total_steps,
|
|
118
|
+
st.num_events,
|
|
119
|
+
json_extract(orw.reward_metadata, '$.final_achievements') as achievements
|
|
120
|
+
FROM session_traces st
|
|
121
|
+
INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
|
|
122
|
+
ORDER BY orw.total_reward DESC, orw.achievements_count DESC
|
|
123
|
+
LIMIT 5;
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Get Event Details for High-Reward Rollouts
|
|
127
|
+
```sql
|
|
128
|
+
SELECT
|
|
129
|
+
e.event_type,
|
|
130
|
+
e.model_name,
|
|
131
|
+
e.input_tokens,
|
|
132
|
+
e.output_tokens,
|
|
133
|
+
e.latency_ms,
|
|
134
|
+
e.reward as step_reward
|
|
135
|
+
FROM events e
|
|
136
|
+
INNER JOIN outcome_rewards orw ON e.session_id = orw.session_id
|
|
137
|
+
WHERE orw.total_reward >= 2
|
|
138
|
+
ORDER BY e.session_id, e.id
|
|
139
|
+
LIMIT 20;
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Running Queries
|
|
143
|
+
|
|
144
|
+
### From Command Line
|
|
145
|
+
```bash
|
|
146
|
+
cd /path/to/synth-ai  # your local checkout of the repository root
|
|
147
|
+
sqlite3 traces/v3/crafter_eval.db "YOUR_QUERY_HERE"
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### With Formatted Output
|
|
151
|
+
```bash
|
|
152
|
+
sqlite3 -header -column traces/v3/crafter_eval.db "YOUR_QUERY_HERE"
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### With JSON Output
|
|
156
|
+
```bash
|
|
157
|
+
sqlite3 -json traces/v3/crafter_eval.db "YOUR_QUERY_HERE" | jq .
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
## Example: Get CSV Export of Non-Zero Rewards
|
|
161
|
+
```bash
|
|
162
|
+
sqlite3 -header -csv traces/v3/crafter_eval.db \
|
|
163
|
+
"SELECT
|
|
164
|
+
json_extract(reward_metadata, '$.env_seed') as seed,
|
|
165
|
+
total_reward,
|
|
166
|
+
achievements_count,
|
|
167
|
+
total_steps,
|
|
168
|
+
json_extract(reward_metadata, '$.final_achievements') as achievements
|
|
169
|
+
FROM outcome_rewards
|
|
170
|
+
WHERE total_reward > 0
|
|
171
|
+
ORDER BY total_reward DESC" \
|
|
172
|
+
> crafter_rewards_nonzero.csv
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## Current Data Summary
|
|
176
|
+
|
|
177
|
+
| Reward | Count | Seeds | Achievements |
|
|
178
|
+
|--------|-------|-------|--------------|
|
|
179
|
+
| 3 | 1 | 0 | collect_drink, collect_sapling, collect_wood |
|
|
180
|
+
| 2 | 4 | 1,3,6,9 | collect_sapling, collect_wood |
|
|
181
|
+
| 1 | 2 | 4,7 | collect_wood |
|
|
182
|
+
| 0 | 3 | 2,5,8 | none |
|
|
183
|
+
|
|
184
|
+
## Verifying Foreign Keys Work
|
|
185
|
+
|
|
186
|
+
```sql
|
|
187
|
+
-- This should return a single row containing 7 (the number of rollouts with reward > 0)
|
|
188
|
+
SELECT COUNT(*)
|
|
189
|
+
FROM session_traces st
|
|
190
|
+
INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
|
|
191
|
+
WHERE orw.total_reward > 0;
|
|
192
|
+
|
|
193
|
+
-- This should return the same 7 session_ids
|
|
194
|
+
SELECT st.session_id
|
|
195
|
+
FROM session_traces st
|
|
196
|
+
WHERE st.session_id IN (
|
|
197
|
+
SELECT session_id FROM outcome_rewards WHERE total_reward > 0
|
|
198
|
+
);
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
✅ **Confirmed: Foreign keys are working correctly and can be used to join tables!**
|
|
202
|
+
|
|
203
|
+
|