synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/README.md +1 -0
- examples/multi_step/SFT_README.md +147 -0
- examples/multi_step/configs/README_verilog_rl.md +77 -0
- examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
- examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
- examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
- examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -11
- examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
- examples/multi_step/configs/crafter_synth_backend.md +40 -0
- examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
- examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
- examples/multi_step/configs/verilog_rl_lora.toml +190 -0
- examples/multi_step/convert_traces_to_sft.py +84 -0
- examples/multi_step/judges/crafter_backend_judge.py +220 -0
- examples/multi_step/judges/verilog_backend_judge.py +234 -0
- examples/multi_step/readme.md +48 -0
- examples/multi_step/run_sft_qwen30b.sh +45 -0
- examples/multi_step/verilog_rl_lora.md +218 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +3 -2
- examples/qwen_coder/configs/coder_lora_4b.toml +2 -1
- examples/qwen_coder/configs/coder_lora_small.toml +2 -1
- examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
- examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
- examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
- examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
- examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
- examples/qwen_vl/QUICKSTART.md +327 -0
- examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
- examples/qwen_vl/README.md +154 -0
- examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
- examples/qwen_vl/RL_VISION_TESTING.md +333 -0
- examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
- examples/qwen_vl/SETUP_COMPLETE.md +275 -0
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +490 -0
- examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
- examples/qwen_vl/__init__.py +2 -0
- examples/qwen_vl/collect_data_via_cli.md +423 -0
- examples/qwen_vl/collect_vision_traces.py +368 -0
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +127 -0
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +60 -0
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +43 -0
- examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +45 -0
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +44 -0
- examples/qwen_vl/configs/filter_qwen2vl_sft.toml +50 -0
- examples/qwen_vl/configs/filter_vision_sft.toml +53 -0
- examples/qwen_vl/configs/filter_vision_test.toml +8 -0
- examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
- examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
- examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
- examples/qwen_vl/run_vision_comparison.sh +62 -0
- examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
- examples/qwen_vl/test_image_validation.py +201 -0
- examples/qwen_vl/test_sft_vision_data.py +110 -0
- examples/rl/README.md +1 -1
- examples/rl/configs/eval_base_qwen.toml +17 -0
- examples/rl/configs/eval_rl_qwen.toml +13 -0
- examples/rl/configs/rl_from_base_qwen.toml +37 -0
- examples/rl/configs/rl_from_base_qwen17.toml +76 -0
- examples/rl/configs/rl_from_ft_qwen.toml +37 -0
- examples/rl/run_eval.py +436 -0
- examples/rl/run_rl_and_save.py +111 -0
- examples/rl/task_app/README.md +22 -0
- examples/rl/task_app/math_single_step.py +990 -0
- examples/rl/task_app/math_task_app.py +111 -0
- examples/sft/README.md +5 -5
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -2
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -3
- examples/sft/evaluate.py +4 -4
- examples/sft/export_dataset.py +7 -4
- examples/sft/generate_traces.py +2 -0
- examples/swe/task_app/README.md +1 -1
- examples/swe/task_app/grpo_swe_mini.py +1 -1
- examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +13 -13
- examples/swe/task_app/hosted/policy_routes.py +0 -2
- examples/swe/task_app/hosted/rollout.py +2 -8
- examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
- examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
- examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
- examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
- examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
- examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
- examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
- examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
- examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
- examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
- examples/task_apps/crafter/task_app/__init__.py +3 -0
- examples/task_apps/crafter/task_app/grpo_crafter.py +309 -14
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +75 -4
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +55 -3
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +114 -32
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +127 -27
- examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
- examples/task_apps/enron/__init__.py +1 -0
- examples/task_apps/enron/filter_sft.toml +5 -0
- examples/task_apps/enron/tests/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/__init__.py +2 -0
- examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
- examples/task_apps/enron/tests/unit/__init__.py +2 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
- examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
- examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
- examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
- examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
- examples/task_apps/pokemon_red/task_app.py +199 -6
- examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
- examples/task_apps/sokoban/filter_sft.toml +5 -0
- examples/task_apps/sokoban/tests/__init__.py +2 -0
- examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
- examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
- examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
- examples/task_apps/verilog/filter_sft.toml +5 -0
- examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
- examples/task_apps/verilog/tests/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/__init__.py +2 -0
- examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
- examples/task_apps/verilog/tests/unit/__init__.py +2 -0
- examples/vlm/README.md +3 -3
- examples/vlm/configs/crafter_vlm_gpt4o.toml +2 -0
- examples/vlm/crafter_openai_vlm_agent.py +3 -5
- examples/vlm/filter_image_rows.py +1 -1
- examples/vlm/run_crafter_vlm_benchmark.py +2 -2
- examples/warming_up_to_rl/_utils.py +92 -0
- examples/warming_up_to_rl/analyze_trace_db.py +1 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +2 -0
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
- examples/warming_up_to_rl/export_trace_sft.py +174 -60
- examples/warming_up_to_rl/groq_test.py +2 -0
- examples/warming_up_to_rl/readme.md +63 -132
- examples/warming_up_to_rl/run_fft_and_save.py +1 -1
- examples/warming_up_to_rl/run_local_rollout.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
- examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
- examples/warming_up_to_rl/run_rl_and_save.py +1 -1
- examples/warming_up_to_rl/run_rollout_remote.py +2 -0
- examples/warming_up_to_rl/task_app/README.md +42 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +696 -0
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +478 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1081 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
- synth_ai/__init__.py +44 -30
- synth_ai/_utils/__init__.py +47 -0
- synth_ai/_utils/base_url.py +10 -0
- synth_ai/_utils/http.py +10 -0
- synth_ai/_utils/prompts.py +10 -0
- synth_ai/_utils/task_app_state.py +12 -0
- synth_ai/_utils/user_config.py +10 -0
- synth_ai/api/models/supported.py +145 -7
- synth_ai/api/train/__init__.py +13 -1
- synth_ai/api/train/cli.py +30 -7
- synth_ai/api/train/config_finder.py +18 -11
- synth_ai/api/train/env_resolver.py +13 -10
- synth_ai/cli/__init__.py +66 -49
- synth_ai/cli/_modal_wrapper.py +9 -6
- synth_ai/cli/_typer_patch.py +0 -2
- synth_ai/cli/_validate_task_app.py +22 -4
- synth_ai/cli/legacy_root_backup.py +3 -1
- synth_ai/cli/lib/__init__.py +10 -0
- synth_ai/cli/lib/task_app_discovery.py +7 -0
- synth_ai/cli/lib/task_app_env.py +518 -0
- synth_ai/cli/recent.py +1 -0
- synth_ai/cli/setup.py +266 -0
- synth_ai/cli/task_app_deploy.py +16 -0
- synth_ai/cli/task_app_list.py +25 -0
- synth_ai/cli/task_app_modal_serve.py +16 -0
- synth_ai/cli/task_app_serve.py +18 -0
- synth_ai/cli/task_apps.py +392 -141
- synth_ai/cli/train.py +18 -0
- synth_ai/cli/tui.py +62 -0
- synth_ai/demos/__init__.py +10 -0
- synth_ai/demos/core/__init__.py +28 -1
- synth_ai/demos/crafter/__init__.py +1 -0
- synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
- synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
- synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
- synth_ai/demos/demo_registry.py +176 -0
- synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
- synth_ai/demos/math/__init__.py +1 -0
- synth_ai/demos/math/_common.py +16 -0
- synth_ai/demos/math/app.py +38 -0
- synth_ai/demos/math/config.toml +76 -0
- synth_ai/demos/math/deploy_modal.py +54 -0
- synth_ai/demos/math/modal_task_app.py +702 -0
- synth_ai/demos/math/task_app_entry.py +51 -0
- synth_ai/environments/environment/core.py +7 -1
- synth_ai/environments/examples/bandit/engine.py +0 -1
- synth_ai/environments/examples/bandit/environment.py +0 -1
- synth_ai/environments/examples/crafter_classic/environment.py +1 -1
- synth_ai/environments/examples/verilog/engine.py +76 -10
- synth_ai/environments/examples/wordle/environment.py +0 -1
- synth_ai/evals/base.py +16 -5
- synth_ai/evals/client.py +1 -1
- synth_ai/inference/client.py +1 -1
- synth_ai/learning/client.py +1 -1
- synth_ai/learning/health.py +1 -1
- synth_ai/learning/jobs.py +1 -1
- synth_ai/learning/rl/client.py +1 -1
- synth_ai/learning/rl/env_keys.py +1 -1
- synth_ai/learning/rl/secrets.py +1 -1
- synth_ai/learning/sft/client.py +1 -1
- synth_ai/learning/sft/data.py +407 -4
- synth_ai/learning/validators.py +4 -1
- synth_ai/task/__init__.py +11 -1
- synth_ai/task/apps/__init__.py +5 -2
- synth_ai/task/config.py +259 -0
- synth_ai/task/contracts.py +15 -2
- synth_ai/task/rubrics/__init__.py +4 -2
- synth_ai/task/rubrics/loaders.py +27 -4
- synth_ai/task/rubrics/scoring.py +3 -0
- synth_ai/task/rubrics.py +219 -0
- synth_ai/task/trace_correlation_helpers.py +328 -0
- synth_ai/task/tracing_utils.py +14 -3
- synth_ai/task/validators.py +145 -2
- synth_ai/tracing_v3/config.py +15 -13
- synth_ai/tracing_v3/constants.py +21 -0
- synth_ai/tracing_v3/db_config.py +3 -1
- synth_ai/tracing_v3/decorators.py +10 -7
- synth_ai/tracing_v3/session_tracer.py +10 -0
- synth_ai/tracing_v3/turso/daemon.py +2 -2
- synth_ai/tracing_v3/turso/native_manager.py +108 -77
- synth_ai/tracing_v3/utils.py +1 -1
- synth_ai/tui/__init__.py +5 -0
- synth_ai/tui/__main__.py +13 -0
- synth_ai/tui/cli/__init__.py +1 -0
- synth_ai/tui/cli/query_experiments.py +164 -0
- synth_ai/tui/cli/query_experiments_v3.py +164 -0
- synth_ai/tui/dashboard.py +911 -0
- synth_ai/utils/__init__.py +101 -0
- synth_ai/utils/base_url.py +94 -0
- synth_ai/utils/cli.py +131 -0
- synth_ai/utils/env.py +287 -0
- synth_ai/utils/http.py +169 -0
- synth_ai/utils/modal.py +308 -0
- synth_ai/utils/process.py +212 -0
- synth_ai/utils/prompts.py +39 -0
- synth_ai/utils/sqld.py +122 -0
- synth_ai/utils/task_app_discovery.py +882 -0
- synth_ai/utils/task_app_env.py +186 -0
- synth_ai/utils/task_app_state.py +318 -0
- synth_ai/utils/user_config.py +137 -0
- synth_ai/v0/config/__init__.py +1 -5
- synth_ai/v0/config/base_url.py +1 -7
- synth_ai/v0/tracing/config.py +1 -1
- synth_ai/v0/tracing/decorators.py +1 -1
- synth_ai/v0/tracing/upload.py +1 -1
- synth_ai/v0/tracing_v1/config.py +1 -1
- synth_ai/v0/tracing_v1/decorators.py +1 -1
- synth_ai/v0/tracing_v1/upload.py +1 -1
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/METADATA +85 -31
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/RECORD +286 -135
- synth_ai/cli/man.py +0 -106
- synth_ai/compound/cais.py +0 -0
- synth_ai/core/experiment.py +0 -13
- synth_ai/core/system.py +0 -15
- synth_ai/demo_registry.py +0 -295
- synth_ai/handshake.py +0 -109
- synth_ai/http.py +0 -26
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/top_level.txt +0 -0
|
@@ -1,9 +1,165 @@
|
|
|
1
1
|
"""Utility functions for the task service."""
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from typing import Any
|
|
5
|
+
from urllib.parse import parse_qs, urlparse, urlunparse
|
|
4
6
|
|
|
5
7
|
import numpy as np
|
|
6
8
|
|
|
9
|
+
# Module-level logger named after this module; the URL helpers in this file log through it.
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
# Path suffix that ensure_chat_completions_url appends to inference URLs outside EVAL mode.
_CHAT_COMPLETIONS_SUFFIX = "/v1/chat/completions"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def ensure_chat_completions_url(raw_url: Any, mode: str | None = None) -> Any:
    """
    Ensure inference URLs point at the chat completions endpoint.

    In EVAL mode the URL is returned untouched. In every other mode a string
    URL is stripped of surrounding whitespace and its path is completed so
    that it ends in ``/v1/chat/completions``. The remaining URL components
    (scheme, netloc, params, query, fragment) are carried over unchanged.

    Path handling outside EVAL mode (trailing slashes are ignored for matching):
        - path already ends in ``/chat/completions``: stripped URL returned as-is
        - path ends in ``/v1``: only ``/chat/completions`` is appended, so
          ``https://host/v1`` does not become ``https://host/v1/v1/chat/completions``
        - anything else (including an empty path): ``/v1/chat/completions`` is appended

    Args:
        raw_url: The inference URL to process. Non-string values and
            blank/whitespace-only strings are returned exactly as given.
        mode: "eval", ``RolloutMode.EVAL``, or any object whose ``.value`` is
            "eval" uses the URL as-is; any other value (including None) applies
            the RL-mode transformations. The plain-string form is deprecated -
            use the RolloutMode enum.

    Returns:
        Processed URL (transformed in RL mode, unchanged in EVAL mode)
    """
    # Function-scope import kept from the original layout
    # (presumably avoids an import cycle - TODO confirm).
    from synth_ai.task.contracts import RolloutMode

    # Accept both string "eval" (legacy) and RolloutMode.EVAL.
    is_eval_mode = (
        mode == "eval"
        or mode == RolloutMode.EVAL
        or getattr(mode, "value", None) == "eval"
    )
    if is_eval_mode:
        # In EVAL mode, use URLs exactly as provided - no transformations.
        logger.info("ensure_chat_completions_url: EVAL mode - using URL as-is: %s", raw_url)
        return raw_url

    # RL mode: apply transformations for compatibility.
    if not isinstance(raw_url, str):
        logger.debug("ensure_chat_completions_url: non-string input %r (type=%s)", raw_url, type(raw_url))
        return raw_url

    url = raw_url.strip()
    if not url:
        logger.debug("ensure_chat_completions_url: blank/whitespace URL input")
        return raw_url

    parsed = urlparse(url)
    path = (parsed.path or "").rstrip("/")

    if path.endswith("/chat/completions"):
        logger.debug("ensure_chat_completions_url: URL already normalized %s", url)
        # Already targeting the desired endpoint; return the stripped input
        # (not a rebuilt URL) so a trailing slash is preserved.
        return url

    if path.endswith("/v1"):
        # Base URL already carries the version segment; do not duplicate it.
        new_path = f"{path}/chat/completions"
    else:
        # Covers the empty path too: "" + suffix == suffix.
        new_path = f"{path}{_CHAT_COMPLETIONS_SUFFIX}"

    normalized = urlunparse(parsed._replace(path=new_path))
    logger.info(
        "ensure_chat_completions_url: RL mode - normalized inference URL from %s to %s",
        url,
        normalized,
    )
    return normalized
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def inference_url_to_trace_correlation_id(raw_url: Any, *, required: bool = False, mode: Any = None) -> str | None:
    """
    Extract trace_correlation_id from inference URL query params.

    The inference URL should contain ?cid=trace_xxxxx parameter.
    This is THE canonical source for trace_correlation_id - it's what the
    inference server uses to tag traces, so we extract it here.

    The query parameters ``cid``, ``trace`` and ``trace_correlation_id`` are
    checked in that order; the first non-blank value (after stripping
    whitespace) wins. A parameter holding only whitespace no longer hides a
    valid value in a lower-priority parameter.

    Args:
        raw_url: Inference URL (should contain ?cid=... query param)
        required: If True, raises AssertionError if trace_correlation_id not found
            (or if raw_url is not a string)
        mode: RolloutMode or string ("rl" or "eval"). Controls log level when
            no ID is found - debug for EVAL mode, warning otherwise.

    Returns:
        trace_correlation_id if found in URL, None otherwise

    Raises:
        AssertionError: If required=True and trace_correlation_id not found,
            or (regardless of ``required``) if an ID is found that does not
            start with ``trace_``.
    """
    if not isinstance(raw_url, str):
        logger.debug(
            "inference_url_to_trace_correlation_id: non-string input %r (type=%s)",
            raw_url,
            type(raw_url),
        )
        if required:
            raise AssertionError(
                f"FATAL: inference_url_to_trace_correlation_id requires string URL, got {type(raw_url)}: {raw_url!r}"
            )
        return None

    query_params = parse_qs(urlparse(raw_url).query or "")

    # Check all possible parameter names in priority order (cid is primary).
    for param_name in ("cid", "trace", "trace_correlation_id"):
        for value in query_params.get(param_name, ()):
            correlation_id = value.strip()
            if not correlation_id:
                continue
            # Explicit raise instead of an `assert` statement so the format
            # check still runs when Python is started with -O.
            if not correlation_id.startswith("trace_"):
                raise AssertionError(
                    f"FATAL: trace_correlation_id has unexpected format: {correlation_id!r}. "
                    f"Expected to start with 'trace_'"
                )
            logger.info(
                "inference_url_to_trace_correlation_id: ✅ extracted id=%s from url=%s",
                correlation_id,
                raw_url,
            )
            return correlation_id

    # Not found - check if we're in EVAL mode (trace_correlation_id not required for eval).
    # Function-scope import kept from the original layout.
    from synth_ai.task.contracts import RolloutMode

    is_eval_mode = (
        mode == "eval"
        or mode == RolloutMode.EVAL
        or getattr(mode, "value", None) == "eval"
    )

    if is_eval_mode:
        # For EVAL mode, missing trace_correlation_id is expected - log as debug, not warning
        logger.debug(
            "inference_url_to_trace_correlation_id: No trace_correlation_id in EVAL mode (expected) url=%s query_params=%s",
            raw_url,
            list(query_params.keys()),
        )
    else:
        # For RL mode, missing trace_correlation_id is concerning
        logger.warning(
            "inference_url_to_trace_correlation_id: ❌ NO trace_correlation_id found in url=%s query_params=%s",
            raw_url,
            list(query_params.keys()),
        )

    if required:
        raise AssertionError(
            f"FATAL: trace_correlation_id REQUIRED but not found in inference_url!\n"
            f"\n"
            f"URL: {raw_url}\n"
            f"Query params found: {list(query_params.keys())}\n"
            f"\n"
            f"The inference_url MUST contain ?cid=trace_xxxxx parameter.\n"
            f"This is set by the trainer when generating rollout requests.\n"
        )

    return None
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# Legacy alias for backward compatibility
|
|
159
|
+
def extract_trace_correlation_id(raw_url: Any, mode: Any = None) -> str | None:
    """
    DEPRECATED: Use inference_url_to_trace_correlation_id instead.

    Legacy alias kept for backward compatibility. It forwards ``raw_url`` and
    ``mode`` to inference_url_to_trace_correlation_id with ``required=False``
    and returns that function's result.
    """
    correlation_id = inference_url_to_trace_correlation_id(
        raw_url,
        mode=mode,
        required=False,
    )
    return correlation_id
|
|
162
|
+
|
|
7
163
|
|
|
8
164
|
def convert_numpy_to_python(obj: Any) -> Any:
|
|
9
165
|
"""
|
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
# Pokemon Red Image-Only Eval - Complete ✅
|
|
2
|
+
|
|
3
|
+
## Summary
|
|
4
|
+
|
|
5
|
+
Successfully ran **10 rollouts** of Pokemon Red with **image-only input** (no text observations), with full **Turso tracing** and **outcome rewards** saved to database.
|
|
6
|
+
|
|
7
|
+
## Configuration
|
|
8
|
+
|
|
9
|
+
- **Model**: `gpt-4o-mini-2024-07-18`
|
|
10
|
+
- **Input Mode**: Image-only (vision enabled, text observations disabled)
|
|
11
|
+
- **Max Steps**: 10 per episode
|
|
12
|
+
- **Max LLM Calls**: 10 per rollout
|
|
13
|
+
- **Seeds**: 0-9 (10 rollouts)
|
|
14
|
+
- **Tracing**: Enabled with Turso/libsql (MVCC concurrent writes)
|
|
15
|
+
- **Database**: `traces/v3/pokemon_red_eval.db` (192KB)
|
|
16
|
+
|
|
17
|
+
## Results
|
|
18
|
+
|
|
19
|
+
### Overall Performance
|
|
20
|
+
- **Total Rollouts**: 10/10 completed
|
|
21
|
+
- **Success Rate**: 100% (no errors)
|
|
22
|
+
- **Mean Reward**: 0.000
|
|
23
|
+
- **Rollouts with Rewards**: 0/10 (0%)
|
|
24
|
+
|
|
25
|
+
*Note: 0 rewards are expected - the Pallet Town sequence is challenging with only 10 turns and image-only input*
|
|
26
|
+
|
|
27
|
+
### Database Verification
|
|
28
|
+
```text
|
|
29
|
+
Total rollouts: 10
|
|
30
|
+
Rollouts with reward > 0: 0
|
|
31
|
+
Rollouts with achievements > 0: 0
|
|
32
|
+
Average reward: 0.0
|
|
33
|
+
Database size: 192KB
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### All Rollouts
|
|
37
|
+
All 10 seeds stayed in Map 38 (Red's bedroom) with 0 party Pokemon and 0 badges.
|
|
38
|
+
|
|
39
|
+
## Implementation Details
|
|
40
|
+
|
|
41
|
+
### 1. Image-Only Mode
|
|
42
|
+
**File**: `task_app.py` → `_call_inference()` function
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
# Check if vision mode is enabled
|
|
46
|
+
use_vision = bool(policy_cfg.get("use_vision", False))
|
|
47
|
+
image_only_mode = bool(policy_cfg.get("image_only_mode", False))
|
|
48
|
+
|
|
49
|
+
# Image-only mode: only send image, no text
|
|
50
|
+
if image_only_mode:
|
|
51
|
+
user_content = [
|
|
52
|
+
{"type": "image_url", "image_url": {"url": image_data_url}}
|
|
53
|
+
]
|
|
54
|
+
else:
|
|
55
|
+
# Vision mode with text: send both text and image
|
|
56
|
+
user_content = [
|
|
57
|
+
{"type": "text", "text": state_summary},
|
|
58
|
+
{"type": "image_url", "image_url": {"url": image_data_url}}
|
|
59
|
+
]
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### 2. OpenAI API Integration
|
|
63
|
+
**File**: `task_app.py` → `_call_inference()` function
|
|
64
|
+
|
|
65
|
+
Fixed inference URL construction and authentication:
|
|
66
|
+
```python
|
|
67
|
+
# Add /v1/chat/completions if using OpenAI directly
|
|
68
|
+
if "api.openai.com" in inference_url:
|
|
69
|
+
inference_url = inference_url + "/v1/chat/completions"
|
|
70
|
+
|
|
71
|
+
# External API: use direct HTTP client with auth header
|
|
72
|
+
if is_external:
|
|
73
|
+
headers = {}
|
|
74
|
+
if "api.openai.com" in inference_url:
|
|
75
|
+
api_key = os.getenv("OPENAI_API_KEY")
|
|
76
|
+
if api_key:
|
|
77
|
+
headers["Authorization"] = f"Bearer {api_key}"
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### 3. SessionTracer Integration
|
|
81
|
+
**File**: `task_app.py` → `rollout_executor()` function
|
|
82
|
+
|
|
83
|
+
Added full Turso tracing like Crafter:
|
|
84
|
+
```python
|
|
85
|
+
# Initialize SessionTracer for this rollout
|
|
86
|
+
tracer_factory = getattr(fastapi_request.app.state, "session_tracer_factory", None)
|
|
87
|
+
tracer_instance: SessionTracer | None = None
|
|
88
|
+
if callable(tracer_factory):
|
|
89
|
+
inst = tracer_factory()
|
|
90
|
+
tracer_instance = inst if isinstance(inst, SessionTracer) else None
|
|
91
|
+
|
|
92
|
+
# Start tracing session
|
|
93
|
+
if tracer_instance is not None:
|
|
94
|
+
await tracer_instance.initialize()
|
|
95
|
+
await tracer_instance.start_session(
|
|
96
|
+
session_id=request.run_id,
|
|
97
|
+
metadata={...}
|
|
98
|
+
)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### 4. Outcome Rewards
|
|
102
|
+
**File**: `task_app.py` → `rollout_executor()` end
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
# Record outcome rewards and end session
|
|
106
|
+
if tracer_instance is not None:
|
|
107
|
+
achievements_count = len(milestone_events)
|
|
108
|
+
|
|
109
|
+
reward_metadata = {
|
|
110
|
+
"run_id": request.run_id,
|
|
111
|
+
"env_name": "pokemon_red",
|
|
112
|
+
"final_map": final_state.get("map_id", -1),
|
|
113
|
+
"party_count": final_state.get("party_count", 0),
|
|
114
|
+
"badges": final_state.get("badges", 0),
|
|
115
|
+
"steps": len(steps),
|
|
116
|
+
"milestone_events": milestone_events,
|
|
117
|
+
"reward_components": all_reward_components,
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
# Record outcome reward to Turso
|
|
121
|
+
await tracer_instance.record_outcome_reward(
|
|
122
|
+
total_reward=int(total_reward),
|
|
123
|
+
achievements_count=achievements_count,
|
|
124
|
+
total_steps=len(steps),
|
|
125
|
+
reward_metadata=reward_metadata,
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
# End session
|
|
129
|
+
session_trace = await tracer_instance.end_session()
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### 5. Tracer Factory Setup
|
|
133
|
+
**File**: `task_app.py` → `build_config()` function
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
# Set up tracing
|
|
137
|
+
tracing_enabled = tracing_env_enabled()
|
|
138
|
+
tracing_db_url = resolve_tracing_db_url()
|
|
139
|
+
tracer_factory = build_tracer_factory(
|
|
140
|
+
SessionTracer, enabled=tracing_enabled, db_url=tracing_db_url
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
app_state: dict[str, Any] = {
|
|
144
|
+
"tracing_enabled": tracing_enabled,
|
|
145
|
+
}
|
|
146
|
+
if tracer_factory is not None:
|
|
147
|
+
app_state["session_tracer_factory"] = tracer_factory
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## Database Schema
|
|
151
|
+
|
|
152
|
+
### outcome_rewards Table
|
|
153
|
+
```sql
|
|
154
|
+
CREATE TABLE outcome_rewards (
|
|
155
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
156
|
+
session_id VARCHAR NOT NULL,
|
|
157
|
+
total_reward INTEGER NOT NULL,
|
|
158
|
+
achievements_count INTEGER NOT NULL,
|
|
159
|
+
total_steps INTEGER NOT NULL,
|
|
160
|
+
created_at DATETIME NOT NULL,
|
|
161
|
+
reward_metadata TEXT,
|
|
162
|
+
FOREIGN KEY(session_id) REFERENCES session_traces(session_id)
|
|
163
|
+
);
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Query Examples
|
|
167
|
+
|
|
168
|
+
### Get all sessions with rewards
|
|
169
|
+
```sql
|
|
170
|
+
SELECT
|
|
171
|
+
st.session_id,
|
|
172
|
+
st.num_timesteps,
|
|
173
|
+
orw.total_reward,
|
|
174
|
+
orw.achievements_count,
|
|
175
|
+
json_extract(orw.reward_metadata, '$.final_map') as final_map
|
|
176
|
+
FROM session_traces st
|
|
177
|
+
INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
|
|
178
|
+
ORDER BY orw.total_reward DESC;
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Filter for non-zero rewards (when they exist)
|
|
182
|
+
```sql
|
|
183
|
+
SELECT
|
|
184
|
+
session_id,
|
|
185
|
+
total_reward,
|
|
186
|
+
achievements_count,
|
|
187
|
+
total_steps,
|
|
188
|
+
json_extract(reward_metadata, '$.final_map') as final_map,
|
|
189
|
+
json_extract(reward_metadata, '$.party_count') as party_count
|
|
190
|
+
FROM outcome_rewards
|
|
191
|
+
WHERE total_reward > 0
|
|
192
|
+
ORDER BY total_reward DESC;
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Comparison: Crafter vs Pokemon Red
|
|
196
|
+
|
|
197
|
+
| Feature | Crafter | Pokemon Red |
|
|
198
|
+
|---------|---------|-------------|
|
|
199
|
+
| Image-only mode | ✅ Working | ✅ Working |
|
|
200
|
+
| OpenAI API | ✅ Working | ✅ Working |
|
|
201
|
+
| Eval CLI | ✅ Working | ✅ Working |
|
|
202
|
+
| SessionTracer | ✅ Integrated | ✅ Integrated |
|
|
203
|
+
| Turso database | ✅ 1.7MB (10 rollouts) | ✅ 192KB (10 rollouts) |
|
|
204
|
+
| outcome_rewards | ✅ 10 rows | ✅ 10 rows |
|
|
205
|
+
| Foreign keys | ✅ Working | ✅ Working |
|
|
206
|
+
| Non-zero rewards | ✅ 7/10 rollouts | ❌ 0/10 rollouts* |
|
|
207
|
+
|
|
208
|
+
*Expected: Pokemon Red is harder (requires room navigation, NPC dialogue, etc.)
|
|
209
|
+
|
|
210
|
+
## Files Modified
|
|
211
|
+
|
|
212
|
+
1. **`task_app.py`**:
|
|
213
|
+
- Added `use_vision` and `image_only_mode` support
|
|
214
|
+
- Fixed OpenAI API URL construction and auth
|
|
215
|
+
- Integrated SessionTracer for Turso persistence
|
|
216
|
+
- Added `record_outcome_reward()` calls
|
|
217
|
+
- Updated `build_config()` to create tracer_factory
|
|
218
|
+
|
|
219
|
+
2. **`eval_image_only_gpt4o.toml`** (new):
|
|
220
|
+
- Config for image-only evaluation
|
|
221
|
+
- 10 seeds, 10 max turns per episode
|
|
222
|
+
- GPT-4o mini with vision enabled
|
|
223
|
+
|
|
224
|
+
## Running the Evaluation
|
|
225
|
+
|
|
226
|
+
```bash
|
|
227
|
+
cd /path/to/synth-ai  # repository root
|
|
228
|
+
|
|
229
|
+
# Set up tracing environment
|
|
230
|
+
export TASKAPP_TRACING_ENABLED=1
|
|
231
|
+
export TURSO_NATIVE=1
|
|
232
|
+
export SQLD_DB_PATH="traces/v3/pokemon_red_eval.db"
|
|
233
|
+
|
|
234
|
+
# Run evaluation
|
|
235
|
+
uv run synth-ai eval pokemon_red \
|
|
236
|
+
--config examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
## Verification Commands
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
# Check database size
|
|
243
|
+
ls -lh traces/v3/pokemon_red_eval.db
|
|
244
|
+
|
|
245
|
+
# Count sessions
|
|
246
|
+
sqlite3 traces/v3/pokemon_red_eval.db \
|
|
247
|
+
"SELECT COUNT(*) FROM session_traces;"
|
|
248
|
+
|
|
249
|
+
# View all rewards
|
|
250
|
+
sqlite3 -header -column traces/v3/pokemon_red_eval.db \
|
|
251
|
+
"SELECT session_id, total_reward, achievements_count, total_steps
|
|
252
|
+
FROM outcome_rewards
|
|
253
|
+
ORDER BY total_reward DESC;"
|
|
254
|
+
|
|
255
|
+
# Test foreign keys
|
|
256
|
+
sqlite3 traces/v3/pokemon_red_eval.db \
|
|
257
|
+
"SELECT st.session_id, orw.total_reward
|
|
258
|
+
FROM session_traces st
|
|
259
|
+
INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
|
|
260
|
+
LIMIT 5;"
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## Next Steps
|
|
264
|
+
|
|
265
|
+
To improve rewards:
|
|
266
|
+
1. **Increase max_turns**: Try 50-100 turns per episode
|
|
267
|
+
2. **Better prompting**: Add more detailed instructions in system prompt
|
|
268
|
+
3. **Hybrid mode**: Use `use_vision=true` with `image_only_mode=false` to get both images and text
|
|
269
|
+
4. **Different model**: Try GPT-4o (full) or Claude 3.5 Sonnet for better vision understanding
|
|
270
|
+
|
|
271
|
+
## Summary
|
|
272
|
+
|
|
273
|
+
✅ **All goals achieved**:
|
|
274
|
+
- Image-only input mode working
|
|
275
|
+
- 10 rollouts completed successfully
|
|
276
|
+
- Turso database created with 192KB of trace data
|
|
277
|
+
- outcome_rewards table with foreign keys
|
|
278
|
+
- Can filter and query by rewards
|
|
279
|
+
- SessionTracer fully integrated
|
|
280
|
+
|
|
281
|
+
Pokemon Red now has the same Turso tracing capabilities as Crafter! 🎉
|
|
282
|
+
|
|
283
|
+
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
# Pokemon Red Image-Only Eval Status - ✅ COMPLETE
|
|
2
|
+
|
|
3
|
+
**Status**: All features working! See `EVAL_IMAGE_ONLY_COMPLETE.md` for full details.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Original Status (Before Turso Integration)
|
|
8
|
+
|
|
9
|
+
## ✅ What's Working
|
|
10
|
+
|
|
11
|
+
### 1. Image-Only Input Mode
|
|
12
|
+
- Successfully modified `task_app.py` to support `use_vision` and `image_only_mode` config flags
|
|
13
|
+
- When enabled, sends only base64-encoded PNG frames to the LLM (no text observations)
|
|
14
|
+
- Similar to Crafter's implementation
|
|
15
|
+
|
|
16
|
+
### 2. OpenAI API Integration
|
|
17
|
+
- Fixed inference URL construction to properly call `https://api.openai.com/v1/chat/completions`
|
|
18
|
+
- Added proper Authorization Bearer token handling
|
|
19
|
+
- Successfully runs 10 rollouts with `gpt-4o-mini-2024-07-18`
|
|
20
|
+
|
|
21
|
+
### 3. Eval Configuration
|
|
22
|
+
- Created `eval_image_only_gpt4o.toml` config file
|
|
23
|
+
- Successfully runs via `synth-ai eval pokemon_red --config ...`
|
|
24
|
+
- All 10 seeds complete without errors
|
|
25
|
+
|
|
26
|
+
## ⚠️ What's Not Working Yet
|
|
27
|
+
|
|
28
|
+
### Turso Tracing & Rewards
|
|
29
|
+
**Issue**: Pokemon Red doesn't use SessionTracer like Crafter does
|
|
30
|
+
|
|
31
|
+
**Current State**:
|
|
32
|
+
- Pokemon Red returns a basic trace payload (session_id, metadata) for the CLI
|
|
33
|
+
- But it doesn't actually create or save to a Turso database
|
|
34
|
+
- No `outcome_rewards` table or reward persistence
|
|
35
|
+
- No integration with `SessionTracer` from `tracing_v3`
|
|
36
|
+
|
|
37
|
+
**What Would Be Needed**:
|
|
38
|
+
1. Import and initialize `SessionTracer` in Pokemon Red's `rollout_executor`
|
|
39
|
+
2. Call `tracer.start_session()` at beginning of rollout
|
|
40
|
+
3. Record events during rollout (like Crafter does)
|
|
41
|
+
4. Call `tracer.record_outcome_reward()` at end with:
|
|
42
|
+
- `total_reward`: sum of step rewards
|
|
43
|
+
- `achievements_count`: count of milestones reached
|
|
44
|
+
- `total_steps`: number of steps taken
|
|
45
|
+
- `reward_metadata`: dict with map_id, party_count, badges, etc.
|
|
46
|
+
5. Call `tracer.end_session()` to persist to database
|
|
47
|
+
|
|
48
|
+
### Reward Computation
|
|
49
|
+
**Current State**:
|
|
50
|
+
- Pokemon Red has a `PalletTownProgressionCompositeReward` reward function
|
|
51
|
+
- It tracks milestones like leaving bedroom, getting starter Pokemon, etc.
|
|
52
|
+
- But rewards are currently all 0.0 (expected: the task is hard with only 10 turns and image-only input)
|
|
53
|
+
|
|
54
|
+
**What's Challenging**:
|
|
55
|
+
- The Pallet Town sequence requires:
|
|
56
|
+
- Navigating multiple rooms
|
|
57
|
+
- Talking to NPCs (pressing A at right moments)
|
|
58
|
+
- Selecting starter Pokemon
|
|
59
|
+
- Entering first battle
|
|
60
|
+
- With only images (no text hints) and 10 LLM calls, agents struggle to make progress
|
|
61
|
+
- May need more turns or better prompting to get non-zero rewards
|
|
62
|
+
|
|
63
|
+
## 📊 Current Results
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
Eval complete: 10 ok, 0 failed
|
|
67
|
+
Model: gpt-4o-mini-2024-07-18
|
|
68
|
+
Seeds: 0-9 (10 rollouts)
|
|
69
|
+
Mean reward: 0.000
|
|
70
|
+
Outcome score: 0.000
|
|
71
|
+
|
|
72
|
+
All rollouts: ~21 steps, 0 rewards, Map 38 (Red's bedroom)
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## 🔧 Files Modified
|
|
76
|
+
|
|
77
|
+
1. **`task_app.py`**:
|
|
78
|
+
- Added `use_vision` and `image_only_mode` support in `_call_inference`
|
|
79
|
+
- Fixed OpenAI API URL construction
|
|
80
|
+
- Added basic trace payload generation
|
|
81
|
+
- **Still needs**: SessionTracer integration for Turso persistence
|
|
82
|
+
|
|
83
|
+
2. **`eval_image_only_gpt4o.toml`** (new):
|
|
84
|
+
- Config for image-only evaluation
|
|
85
|
+
- 10 seeds, 10 max turns per episode
|
|
86
|
+
- GPT-4o mini with vision enabled
|
|
87
|
+
|
|
88
|
+
## 🚀 Next Steps to Complete Turso Integration
|
|
89
|
+
|
|
90
|
+
### Option 1: Quick Fix (Minimal Tracing)
|
|
91
|
+
Just save basic session info without full event tracing:
|
|
92
|
+
```python
|
|
93
|
+
# At start of rollout_executor
|
|
94
|
+
from synth_ai.tracing_v3 import SessionTracer, StorageConfig, StorageBackend
|
|
95
|
+
|
|
96
|
+
tracer = SessionTracer(
|
|
97
|
+
storage_config=StorageConfig(
|
|
98
|
+
backend=StorageBackend.TURSO_NATIVE,
|
|
99
|
+
connection_string=f"file:{os.getenv('SQLD_DB_PATH', 'traces/v3/pokemon_red.db')}"
|
|
100
|
+
),
|
|
101
|
+
auto_save=True
|
|
102
|
+
)
|
|
103
|
+
await tracer.initialize()
|
|
104
|
+
session_id = await tracer.start_session(metadata={...})
|
|
105
|
+
|
|
106
|
+
# At end of rollout_executor
|
|
107
|
+
await tracer.record_outcome_reward(
|
|
108
|
+
total_reward=int(total_reward),
|
|
109
|
+
achievements_count=len(milestone_events), # or 0 if none
|
|
110
|
+
total_steps=len(steps),
|
|
111
|
+
reward_metadata={
|
|
112
|
+
"final_map": final_state.get("map_id"),
|
|
113
|
+
"party_count": final_state.get("party_count", 0),
|
|
114
|
+
"badges": final_state.get("badges", 0),
|
|
115
|
+
"milestone_events": milestone_events,
|
|
116
|
+
}
|
|
117
|
+
)
|
|
118
|
+
await tracer.end_session()
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Option 2: Full Tracing (Like Crafter)
|
|
122
|
+
Integrate complete event tracing like Crafter's `rollout.py`:
|
|
123
|
+
- Record messages, timesteps, events for each step
|
|
124
|
+
- More complex but provides rich trace data
|
|
125
|
+
- Would require more significant refactoring
|
|
126
|
+
|
|
127
|
+
## 📝 Comparison with Crafter
|
|
128
|
+
|
|
129
|
+
| Feature | Crafter | Pokemon Red |
|
|
130
|
+
|---------|---------|-------------|
|
|
131
|
+
| Image-only mode | ✅ Working | ✅ Working |
|
|
132
|
+
| OpenAI API | ✅ Working | ✅ Working |
|
|
133
|
+
| Eval CLI | ✅ Working | ✅ Working |
|
|
134
|
+
| SessionTracer | ✅ Integrated | ❌ Not integrated |
|
|
135
|
+
| Turso database | ✅ Saves traces | ❌ No database created |
|
|
136
|
+
| outcome_rewards | ✅ Persisted | ❌ Not saved |
|
|
137
|
+
| Foreign keys | ✅ Working | ❌ N/A |
|
|
138
|
+
| Non-zero rewards | ✅ 7/10 rollouts | ❌ 0/10 rollouts |
|
|
139
|
+
|
|
140
|
+
## ✅ Summary
|
|
141
|
+
|
|
142
|
+
**Completed**:
|
|
143
|
+
- ✅ Image-only input mode for Pokemon Red
|
|
144
|
+
- ✅ OpenAI API integration with proper auth
|
|
145
|
+
- ✅ Eval CLI runs 10 rollouts successfully
|
|
146
|
+
- ✅ Basic trace payload returned (for CLI)
|
|
147
|
+
|
|
148
|
+
**Not Yet Complete**:
|
|
149
|
+
- ❌ Turso database persistence
|
|
150
|
+
- ❌ outcome_rewards table with foreign keys
|
|
151
|
+
- ❌ SessionTracer integration
|
|
152
|
+
- ❌ Queryable rewards by seed
|
|
153
|
+
|
|
154
|
+
**To match Crafter's capabilities**, Pokemon Red needs SessionTracer integration (Option 1 or 2 above).
|
|
155
|
+
|