synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (293) hide show
  1. examples/README.md +1 -0
  2. examples/multi_step/SFT_README.md +147 -0
  3. examples/multi_step/configs/README_verilog_rl.md +77 -0
  4. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  5. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  6. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  7. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  8. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +12 -11
  9. examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
  10. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  11. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  12. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  13. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  14. examples/multi_step/convert_traces_to_sft.py +84 -0
  15. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  16. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  17. examples/multi_step/readme.md +48 -0
  18. examples/multi_step/run_sft_qwen30b.sh +45 -0
  19. examples/multi_step/verilog_rl_lora.md +218 -0
  20. examples/qwen_coder/configs/coder_lora_30b.toml +3 -2
  21. examples/qwen_coder/configs/coder_lora_4b.toml +2 -1
  22. examples/qwen_coder/configs/coder_lora_small.toml +2 -1
  23. examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
  24. examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
  25. examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
  26. examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
  27. examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
  28. examples/qwen_vl/QUICKSTART.md +327 -0
  29. examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
  30. examples/qwen_vl/README.md +154 -0
  31. examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
  32. examples/qwen_vl/RL_VISION_TESTING.md +333 -0
  33. examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
  34. examples/qwen_vl/SETUP_COMPLETE.md +275 -0
  35. examples/qwen_vl/VISION_TESTS_COMPLETE.md +490 -0
  36. examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
  37. examples/qwen_vl/__init__.py +2 -0
  38. examples/qwen_vl/collect_data_via_cli.md +423 -0
  39. examples/qwen_vl/collect_vision_traces.py +368 -0
  40. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +127 -0
  41. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +60 -0
  42. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +43 -0
  43. examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
  44. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +45 -0
  45. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +44 -0
  46. examples/qwen_vl/configs/filter_qwen2vl_sft.toml +50 -0
  47. examples/qwen_vl/configs/filter_vision_sft.toml +53 -0
  48. examples/qwen_vl/configs/filter_vision_test.toml +8 -0
  49. examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
  50. examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
  51. examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
  52. examples/qwen_vl/run_vision_comparison.sh +62 -0
  53. examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
  54. examples/qwen_vl/test_image_validation.py +201 -0
  55. examples/qwen_vl/test_sft_vision_data.py +110 -0
  56. examples/rl/README.md +1 -1
  57. examples/rl/configs/eval_base_qwen.toml +17 -0
  58. examples/rl/configs/eval_rl_qwen.toml +13 -0
  59. examples/rl/configs/rl_from_base_qwen.toml +37 -0
  60. examples/rl/configs/rl_from_base_qwen17.toml +76 -0
  61. examples/rl/configs/rl_from_ft_qwen.toml +37 -0
  62. examples/rl/run_eval.py +436 -0
  63. examples/rl/run_rl_and_save.py +111 -0
  64. examples/rl/task_app/README.md +22 -0
  65. examples/rl/task_app/math_single_step.py +990 -0
  66. examples/rl/task_app/math_task_app.py +111 -0
  67. examples/sft/README.md +5 -5
  68. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -2
  69. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -3
  70. examples/sft/evaluate.py +4 -4
  71. examples/sft/export_dataset.py +7 -4
  72. examples/sft/generate_traces.py +2 -0
  73. examples/swe/task_app/README.md +1 -1
  74. examples/swe/task_app/grpo_swe_mini.py +1 -1
  75. examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
  76. examples/swe/task_app/hosted/envs/mini_swe/environment.py +13 -13
  77. examples/swe/task_app/hosted/policy_routes.py +0 -2
  78. examples/swe/task_app/hosted/rollout.py +2 -8
  79. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  80. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  81. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  82. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  83. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  84. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  85. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  86. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  87. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  88. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  89. examples/task_apps/crafter/task_app/__init__.py +3 -0
  90. examples/task_apps/crafter/task_app/grpo_crafter.py +309 -14
  91. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  92. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +75 -4
  93. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  94. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +55 -3
  95. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +114 -32
  96. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +127 -27
  97. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
  98. examples/task_apps/enron/__init__.py +1 -0
  99. examples/task_apps/enron/filter_sft.toml +5 -0
  100. examples/task_apps/enron/tests/__init__.py +2 -0
  101. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  102. examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
  103. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  104. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  105. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  106. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  107. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  108. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
  109. examples/task_apps/pokemon_red/task_app.py +199 -6
  110. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
  111. examples/task_apps/sokoban/filter_sft.toml +5 -0
  112. examples/task_apps/sokoban/tests/__init__.py +2 -0
  113. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  114. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  115. examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
  116. examples/task_apps/verilog/filter_sft.toml +5 -0
  117. examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
  118. examples/task_apps/verilog/tests/__init__.py +2 -0
  119. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  120. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
  121. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  122. examples/vlm/README.md +3 -3
  123. examples/vlm/configs/crafter_vlm_gpt4o.toml +2 -0
  124. examples/vlm/crafter_openai_vlm_agent.py +3 -5
  125. examples/vlm/filter_image_rows.py +1 -1
  126. examples/vlm/run_crafter_vlm_benchmark.py +2 -2
  127. examples/warming_up_to_rl/_utils.py +92 -0
  128. examples/warming_up_to_rl/analyze_trace_db.py +1 -1
  129. examples/warming_up_to_rl/configs/crafter_fft.toml +2 -0
  130. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +2 -0
  131. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
  132. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
  133. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
  134. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
  135. examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
  136. examples/warming_up_to_rl/export_trace_sft.py +174 -60
  137. examples/warming_up_to_rl/groq_test.py +2 -0
  138. examples/warming_up_to_rl/readme.md +63 -132
  139. examples/warming_up_to_rl/run_fft_and_save.py +1 -1
  140. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  141. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  142. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  143. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  144. examples/warming_up_to_rl/run_rl_and_save.py +1 -1
  145. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  146. examples/warming_up_to_rl/task_app/README.md +42 -0
  147. examples/warming_up_to_rl/task_app/grpo_crafter.py +696 -0
  148. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
  149. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
  150. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
  151. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
  152. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
  153. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  154. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  155. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  156. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
  157. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +478 -0
  158. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
  159. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
  160. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  161. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
  162. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  163. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
  164. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
  165. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1081 -0
  166. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
  167. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
  168. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  169. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
  170. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
  171. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
  172. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
  173. synth_ai/__init__.py +44 -30
  174. synth_ai/_utils/__init__.py +47 -0
  175. synth_ai/_utils/base_url.py +10 -0
  176. synth_ai/_utils/http.py +10 -0
  177. synth_ai/_utils/prompts.py +10 -0
  178. synth_ai/_utils/task_app_state.py +12 -0
  179. synth_ai/_utils/user_config.py +10 -0
  180. synth_ai/api/models/supported.py +145 -7
  181. synth_ai/api/train/__init__.py +13 -1
  182. synth_ai/api/train/cli.py +30 -7
  183. synth_ai/api/train/config_finder.py +18 -11
  184. synth_ai/api/train/env_resolver.py +13 -10
  185. synth_ai/cli/__init__.py +66 -49
  186. synth_ai/cli/_modal_wrapper.py +9 -6
  187. synth_ai/cli/_typer_patch.py +0 -2
  188. synth_ai/cli/_validate_task_app.py +22 -4
  189. synth_ai/cli/legacy_root_backup.py +3 -1
  190. synth_ai/cli/lib/__init__.py +10 -0
  191. synth_ai/cli/lib/task_app_discovery.py +7 -0
  192. synth_ai/cli/lib/task_app_env.py +518 -0
  193. synth_ai/cli/recent.py +1 -0
  194. synth_ai/cli/setup.py +266 -0
  195. synth_ai/cli/task_app_deploy.py +16 -0
  196. synth_ai/cli/task_app_list.py +25 -0
  197. synth_ai/cli/task_app_modal_serve.py +16 -0
  198. synth_ai/cli/task_app_serve.py +18 -0
  199. synth_ai/cli/task_apps.py +392 -141
  200. synth_ai/cli/train.py +18 -0
  201. synth_ai/cli/tui.py +62 -0
  202. synth_ai/demos/__init__.py +10 -0
  203. synth_ai/demos/core/__init__.py +28 -1
  204. synth_ai/demos/crafter/__init__.py +1 -0
  205. synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
  206. synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
  207. synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
  208. synth_ai/demos/demo_registry.py +176 -0
  209. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  210. synth_ai/demos/math/__init__.py +1 -0
  211. synth_ai/demos/math/_common.py +16 -0
  212. synth_ai/demos/math/app.py +38 -0
  213. synth_ai/demos/math/config.toml +76 -0
  214. synth_ai/demos/math/deploy_modal.py +54 -0
  215. synth_ai/demos/math/modal_task_app.py +702 -0
  216. synth_ai/demos/math/task_app_entry.py +51 -0
  217. synth_ai/environments/environment/core.py +7 -1
  218. synth_ai/environments/examples/bandit/engine.py +0 -1
  219. synth_ai/environments/examples/bandit/environment.py +0 -1
  220. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  221. synth_ai/environments/examples/verilog/engine.py +76 -10
  222. synth_ai/environments/examples/wordle/environment.py +0 -1
  223. synth_ai/evals/base.py +16 -5
  224. synth_ai/evals/client.py +1 -1
  225. synth_ai/inference/client.py +1 -1
  226. synth_ai/learning/client.py +1 -1
  227. synth_ai/learning/health.py +1 -1
  228. synth_ai/learning/jobs.py +1 -1
  229. synth_ai/learning/rl/client.py +1 -1
  230. synth_ai/learning/rl/env_keys.py +1 -1
  231. synth_ai/learning/rl/secrets.py +1 -1
  232. synth_ai/learning/sft/client.py +1 -1
  233. synth_ai/learning/sft/data.py +407 -4
  234. synth_ai/learning/validators.py +4 -1
  235. synth_ai/task/__init__.py +11 -1
  236. synth_ai/task/apps/__init__.py +5 -2
  237. synth_ai/task/config.py +259 -0
  238. synth_ai/task/contracts.py +15 -2
  239. synth_ai/task/rubrics/__init__.py +4 -2
  240. synth_ai/task/rubrics/loaders.py +27 -4
  241. synth_ai/task/rubrics/scoring.py +3 -0
  242. synth_ai/task/rubrics.py +219 -0
  243. synth_ai/task/trace_correlation_helpers.py +328 -0
  244. synth_ai/task/tracing_utils.py +14 -3
  245. synth_ai/task/validators.py +145 -2
  246. synth_ai/tracing_v3/config.py +15 -13
  247. synth_ai/tracing_v3/constants.py +21 -0
  248. synth_ai/tracing_v3/db_config.py +3 -1
  249. synth_ai/tracing_v3/decorators.py +10 -7
  250. synth_ai/tracing_v3/session_tracer.py +10 -0
  251. synth_ai/tracing_v3/turso/daemon.py +2 -2
  252. synth_ai/tracing_v3/turso/native_manager.py +108 -77
  253. synth_ai/tracing_v3/utils.py +1 -1
  254. synth_ai/tui/__init__.py +5 -0
  255. synth_ai/tui/__main__.py +13 -0
  256. synth_ai/tui/cli/__init__.py +1 -0
  257. synth_ai/tui/cli/query_experiments.py +164 -0
  258. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  259. synth_ai/tui/dashboard.py +911 -0
  260. synth_ai/utils/__init__.py +101 -0
  261. synth_ai/utils/base_url.py +94 -0
  262. synth_ai/utils/cli.py +131 -0
  263. synth_ai/utils/env.py +287 -0
  264. synth_ai/utils/http.py +169 -0
  265. synth_ai/utils/modal.py +308 -0
  266. synth_ai/utils/process.py +212 -0
  267. synth_ai/utils/prompts.py +39 -0
  268. synth_ai/utils/sqld.py +122 -0
  269. synth_ai/utils/task_app_discovery.py +882 -0
  270. synth_ai/utils/task_app_env.py +186 -0
  271. synth_ai/utils/task_app_state.py +318 -0
  272. synth_ai/utils/user_config.py +137 -0
  273. synth_ai/v0/config/__init__.py +1 -5
  274. synth_ai/v0/config/base_url.py +1 -7
  275. synth_ai/v0/tracing/config.py +1 -1
  276. synth_ai/v0/tracing/decorators.py +1 -1
  277. synth_ai/v0/tracing/upload.py +1 -1
  278. synth_ai/v0/tracing_v1/config.py +1 -1
  279. synth_ai/v0/tracing_v1/decorators.py +1 -1
  280. synth_ai/v0/tracing_v1/upload.py +1 -1
  281. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/METADATA +85 -31
  282. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/RECORD +286 -135
  283. synth_ai/cli/man.py +0 -106
  284. synth_ai/compound/cais.py +0 -0
  285. synth_ai/core/experiment.py +0 -13
  286. synth_ai/core/system.py +0 -15
  287. synth_ai/demo_registry.py +0 -295
  288. synth_ai/handshake.py +0 -109
  289. synth_ai/http.py +0 -26
  290. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/WHEEL +0 -0
  291. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/entry_points.txt +0 -0
  292. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/licenses/LICENSE +0 -0
  293. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.16.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,165 @@
1
1
  """Utility functions for the task service."""
2
2
 
3
+ import logging
3
4
  from typing import Any
5
+ from urllib.parse import parse_qs, urlparse, urlunparse
4
6
 
5
7
  import numpy as np
6
8
 
9
+ logger = logging.getLogger(__name__)
10
+
11
+ _CHAT_COMPLETIONS_SUFFIX = "/v1/chat/completions"
12
+
13
+
14
+ def ensure_chat_completions_url(raw_url: Any, mode: str | None = None) -> Any:
15
+ """
16
+ Ensure inference URLs point at the chat completions endpoint.
17
+
18
+ Args:
19
+ raw_url: The inference URL to process
20
+ mode: "rl" applies URL transformations, "eval" uses URLs as-is (deprecated - use RolloutMode enum)
21
+
22
+ Returns:
23
+ Processed URL (transformed in RL mode, unchanged in EVAL mode)
24
+ """
25
+ # In EVAL mode, use URLs exactly as provided - no transformations
26
+ # Accept both string "eval" (legacy) and RolloutMode.EVAL
27
+ from synth_ai.task.contracts import RolloutMode
28
+ is_eval_mode = (mode == "eval" or mode == RolloutMode.EVAL or
29
+ (hasattr(mode, 'value') and mode.value == "eval"))
30
+
31
+ if is_eval_mode:
32
+ logger.info("ensure_chat_completions_url: EVAL mode - using URL as-is: %s", raw_url)
33
+ return raw_url
34
+
35
+ # RL mode: apply transformations for compatibility
36
+ if not isinstance(raw_url, str):
37
+ logger.debug("ensure_chat_completions_url: non-string input %r (type=%s)", raw_url, type(raw_url))
38
+ return raw_url
39
+ url = raw_url.strip()
40
+ if not url:
41
+ logger.debug("ensure_chat_completions_url: blank/whitespace URL input")
42
+ return raw_url
43
+
44
+ parsed = urlparse(url)
45
+ path = (parsed.path or "").rstrip("/")
46
+ if path.endswith("/v1/chat/completions"):
47
+ logger.debug("ensure_chat_completions_url: URL already normalized %s", url)
48
+ # Already targeting the desired endpoint; keep original to preserve trailing slash.
49
+ return url
50
+
51
+ if not path:
52
+ new_path = _CHAT_COMPLETIONS_SUFFIX
53
+ else:
54
+ new_path = f"{path}{_CHAT_COMPLETIONS_SUFFIX}"
55
+
56
+ rebuilt = parsed._replace(path=new_path)
57
+ normalized = urlunparse(rebuilt)
58
+ logger.info(
59
+ "ensure_chat_completions_url: RL mode - normalized inference URL from %s to %s",
60
+ url,
61
+ normalized,
62
+ )
63
+ return normalized
64
+
65
+
66
+ def inference_url_to_trace_correlation_id(raw_url: Any, *, required: bool = False, mode: Any = None) -> str | None:
67
+ """
68
+ Extract trace_correlation_id from inference URL query params.
69
+
70
+ The inference URL should contain ?cid=trace_xxxxx parameter.
71
+ This is THE canonical source for trace_correlation_id - it's what the
72
+ inference server uses to tag traces, so we extract it here.
73
+
74
+ Args:
75
+ raw_url: Inference URL (should contain ?cid=... query param)
76
+ required: If True, raises AssertionError if trace_correlation_id not found
77
+ mode: RolloutMode or string ("rl" or "eval"). Controls warning behavior -
78
+ warnings only logged for RL mode, not EVAL mode.
79
+
80
+ Returns:
81
+ trace_correlation_id if found in URL, None otherwise
82
+
83
+ Raises:
84
+ AssertionError: If required=True and trace_correlation_id not found
85
+ """
86
+ if not isinstance(raw_url, str):
87
+ logger.debug(
88
+ "inference_url_to_trace_correlation_id: non-string input %r (type=%s)",
89
+ raw_url,
90
+ type(raw_url)
91
+ )
92
+ if required:
93
+ raise AssertionError(
94
+ f"FATAL: inference_url_to_trace_correlation_id requires string URL, got {type(raw_url)}: {raw_url!r}"
95
+ )
96
+ return None
97
+
98
+ parsed = urlparse(raw_url)
99
+ query_params = parse_qs(parsed.query or "")
100
+
101
+ # Check all possible parameter names (cid is primary)
102
+ candidates = (
103
+ query_params.get("cid") or
104
+ query_params.get("trace") or
105
+ query_params.get("trace_correlation_id") or
106
+ []
107
+ )
108
+
109
+ for value in candidates:
110
+ if isinstance(value, str) and value.strip():
111
+ correlation_id = value.strip()
112
+ logger.info(
113
+ "inference_url_to_trace_correlation_id: ✅ extracted id=%s from url=%s",
114
+ correlation_id,
115
+ raw_url,
116
+ )
117
+ # ASSERTION: Correlation ID should look like trace_xxxxx
118
+ assert correlation_id.startswith("trace_"), (
119
+ f"FATAL: trace_correlation_id has unexpected format: {correlation_id!r}. "
120
+ f"Expected to start with 'trace_'"
121
+ )
122
+ return correlation_id
123
+
124
+ # Not found - check if we're in EVAL mode (trace_correlation_id not required for eval)
125
+ from synth_ai.task.contracts import RolloutMode
126
+ is_eval_mode = (mode == "eval" or mode == RolloutMode.EVAL or
127
+ (hasattr(mode, 'value') and mode.value == "eval"))
128
+
129
+ if is_eval_mode:
130
+ # For EVAL mode, missing trace_correlation_id is expected - log as debug, not warning
131
+ logger.debug(
132
+ "inference_url_to_trace_correlation_id: No trace_correlation_id in EVAL mode (expected) url=%s query_params=%s",
133
+ raw_url,
134
+ list(query_params.keys())
135
+ )
136
+ else:
137
+ # For RL mode, missing trace_correlation_id is concerning
138
+ logger.warning(
139
+ "inference_url_to_trace_correlation_id: ❌ NO trace_correlation_id found in url=%s query_params=%s",
140
+ raw_url,
141
+ list(query_params.keys())
142
+ )
143
+
144
+ if required:
145
+ raise AssertionError(
146
+ f"FATAL: trace_correlation_id REQUIRED but not found in inference_url!\n"
147
+ f"\n"
148
+ f"URL: {raw_url}\n"
149
+ f"Query params found: {list(query_params.keys())}\n"
150
+ f"\n"
151
+ f"The inference_url MUST contain ?cid=trace_xxxxx parameter.\n"
152
+ f"This is set by the trainer when generating rollout requests.\n"
153
+ )
154
+
155
+ return None
156
+
157
+
158
+ # Legacy alias for backward compatibility
159
+ def extract_trace_correlation_id(raw_url: Any, mode: Any = None) -> str | None:
160
+ """DEPRECATED: Use inference_url_to_trace_correlation_id instead."""
161
+ return inference_url_to_trace_correlation_id(raw_url, required=False, mode=mode)
162
+
7
163
 
8
164
  def convert_numpy_to_python(obj: Any) -> Any:
9
165
  """
@@ -1 +1,2 @@
1
1
 
2
+
@@ -0,0 +1,5 @@
1
+ [filter]
2
+ db = "traces/v3/synth_ai.db"
3
+ output = "ft_data/enron_sft.jsonl"
4
+ min_official_score = 0.01
5
+
@@ -1,2 +1,4 @@
1
1
  # Enron task app tests
2
2
 
3
+
4
+
@@ -1,2 +1,4 @@
1
1
  # Integration tests for Enron task app
2
2
 
3
+
4
+
@@ -175,3 +175,5 @@ def test_enron_eval_with_groq(enron_server: str) -> None:
175
175
  # Check that we got a meaningful score
176
176
  assert "official" in result.stdout.lower() or "mean_return" in result.stdout.lower()
177
177
 
178
+
179
+
@@ -1,2 +1,4 @@
1
1
  # Unit tests for Enron task app
2
2
 
3
+
4
+
@@ -0,0 +1,283 @@
1
+ # Pokemon Red Image-Only Eval - Complete ✅
2
+
3
+ ## Summary
4
+
5
+ Successfully ran **10 rollouts** of Pokemon Red with **image-only input** (no text observations), with full **Turso tracing** and **outcome rewards** saved to database.
6
+
7
+ ## Configuration
8
+
9
+ - **Model**: `gpt-4o-mini-2024-07-18`
10
+ - **Input Mode**: Image-only (vision enabled, text observations disabled)
11
+ - **Max Steps**: 10 per episode
12
+ - **Max LLM Calls**: 10 per rollout
13
+ - **Seeds**: 0-9 (10 rollouts)
14
+ - **Tracing**: Enabled with Turso/libsql (MVCC concurrent writes)
15
+ - **Database**: `traces/v3/pokemon_red_eval.db` (192KB)
16
+
17
+ ## Results
18
+
19
+ ### Overall Performance
20
+ - **Total Rollouts**: 10/10 completed
21
+ - **Success Rate**: 100% (all rollouts ran to completion without errors; this is a completion rate, not task success)
22
+ - **Mean Reward**: 0.000
23
+ - **Rollouts with Rewards**: 0/10 (0%)
24
+
25
+ *Note: 0 rewards are expected - the Pallet Town sequence is challenging with only 10 turns and image-only input*
26
+
27
+ ### Database Verification
28
+ ```text
29
+ Total rollouts: 10
30
+ Rollouts with reward > 0: 0
31
+ Rollouts with achievements > 0: 0
32
+ Average reward: 0.0
33
+ Database size: 192KB
34
+ ```
35
+
36
+ ### All Rollouts
37
+ All 10 seeds stayed in Map 38 (Red's bedroom) with 0 party Pokemon and 0 badges.
38
+
39
+ ## Implementation Details
40
+
41
+ ### 1. Image-Only Mode
42
+ **File**: `task_app.py` → `_call_inference()` function
43
+
44
+ ```python
45
+ # Check if vision mode is enabled
46
+ use_vision = bool(policy_cfg.get("use_vision", False))
47
+ image_only_mode = bool(policy_cfg.get("image_only_mode", False))
48
+
49
+ # Image-only mode: only send image, no text
50
+ if image_only_mode:
51
+ user_content = [
52
+ {"type": "image_url", "image_url": {"url": image_data_url}}
53
+ ]
54
+ else:
55
+ # Vision mode with text: send both text and image
56
+ user_content = [
57
+ {"type": "text", "text": state_summary},
58
+ {"type": "image_url", "image_url": {"url": image_data_url}}
59
+ ]
60
+ ```
61
+
62
+ ### 2. OpenAI API Integration
63
+ **File**: `task_app.py` → `_call_inference()` function
64
+
65
+ Fixed inference URL construction and authentication:
66
+ ```python
67
+ # Add /v1/chat/completions if using OpenAI directly
68
+ if "api.openai.com" in inference_url:
69
+ inference_url = inference_url + "/v1/chat/completions"
70
+
71
+ # External API: use direct HTTP client with auth header
72
+ if is_external:
73
+ headers = {}
74
+ if "api.openai.com" in inference_url:
75
+ api_key = os.getenv("OPENAI_API_KEY")
76
+ if api_key:
77
+ headers["Authorization"] = f"Bearer {api_key}"
78
+ ```
79
+
80
+ ### 3. SessionTracer Integration
81
+ **File**: `task_app.py` → `rollout_executor()` function
82
+
83
+ Added full Turso tracing like Crafter:
84
+ ```python
85
+ # Initialize SessionTracer for this rollout
86
+ tracer_factory = getattr(fastapi_request.app.state, "session_tracer_factory", None)
87
+ tracer_instance: SessionTracer | None = None
88
+ if callable(tracer_factory):
89
+ inst = tracer_factory()
90
+ tracer_instance = inst if isinstance(inst, SessionTracer) else None
91
+
92
+ # Start tracing session
93
+ if tracer_instance is not None:
94
+ await tracer_instance.initialize()
95
+ await tracer_instance.start_session(
96
+ session_id=request.run_id,
97
+ metadata={...}
98
+ )
99
+ ```
100
+
101
+ ### 4. Outcome Rewards
102
+ **File**: `task_app.py` → `rollout_executor()` end
103
+
104
+ ```python
105
+ # Record outcome rewards and end session
106
+ if tracer_instance is not None:
107
+ achievements_count = len(milestone_events)
108
+
109
+ reward_metadata = {
110
+ "run_id": request.run_id,
111
+ "env_name": "pokemon_red",
112
+ "final_map": final_state.get("map_id", -1),
113
+ "party_count": final_state.get("party_count", 0),
114
+ "badges": final_state.get("badges", 0),
115
+ "steps": len(steps),
116
+ "milestone_events": milestone_events,
117
+ "reward_components": all_reward_components,
118
+ }
119
+
120
+ # Record outcome reward to Turso
121
+ await tracer_instance.record_outcome_reward(
122
+ total_reward=int(total_reward),
123
+ achievements_count=achievements_count,
124
+ total_steps=len(steps),
125
+ reward_metadata=reward_metadata,
126
+ )
127
+
128
+ # End session
129
+ session_trace = await tracer_instance.end_session()
130
+ ```
131
+
132
+ ### 5. Tracer Factory Setup
133
+ **File**: `task_app.py` → `build_config()` function
134
+
135
+ ```python
136
+ # Set up tracing
137
+ tracing_enabled = tracing_env_enabled()
138
+ tracing_db_url = resolve_tracing_db_url()
139
+ tracer_factory = build_tracer_factory(
140
+ SessionTracer, enabled=tracing_enabled, db_url=tracing_db_url
141
+ )
142
+
143
+ app_state: dict[str, Any] = {
144
+ "tracing_enabled": tracing_enabled,
145
+ }
146
+ if tracer_factory is not None:
147
+ app_state["session_tracer_factory"] = tracer_factory
148
+ ```
149
+
150
+ ## Database Schema
151
+
152
+ ### outcome_rewards Table
153
+ ```sql
154
+ CREATE TABLE outcome_rewards (
155
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
156
+ session_id VARCHAR NOT NULL,
157
+ total_reward INTEGER NOT NULL,
158
+ achievements_count INTEGER NOT NULL,
159
+ total_steps INTEGER NOT NULL,
160
+ created_at DATETIME NOT NULL,
161
+ reward_metadata TEXT,
162
+ FOREIGN KEY(session_id) REFERENCES session_traces(session_id)
163
+ );
164
+ ```
165
+
166
+ ## Query Examples
167
+
168
+ ### Get all sessions with rewards
169
+ ```sql
170
+ SELECT
171
+ st.session_id,
172
+ st.num_timesteps,
173
+ orw.total_reward,
174
+ orw.achievements_count,
175
+ json_extract(orw.reward_metadata, '$.final_map') as final_map
176
+ FROM session_traces st
177
+ INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
178
+ ORDER BY orw.total_reward DESC;
179
+ ```
180
+
181
+ ### Filter for non-zero rewards (when they exist)
182
+ ```sql
183
+ SELECT
184
+ session_id,
185
+ total_reward,
186
+ achievements_count,
187
+ total_steps,
188
+ json_extract(reward_metadata, '$.final_map') as final_map,
189
+ json_extract(reward_metadata, '$.party_count') as party_count
190
+ FROM outcome_rewards
191
+ WHERE total_reward > 0
192
+ ORDER BY total_reward DESC;
193
+ ```
194
+
195
+ ## Comparison: Crafter vs Pokemon Red
196
+
197
+ | Feature | Crafter | Pokemon Red |
198
+ |---------|---------|-------------|
199
+ | Image-only mode | ✅ Working | ✅ Working |
200
+ | OpenAI API | ✅ Working | ✅ Working |
201
+ | Eval CLI | ✅ Working | ✅ Working |
202
+ | SessionTracer | ✅ Integrated | ✅ Integrated |
203
+ | Turso database | ✅ 1.7MB (10 rollouts) | ✅ 192KB (10 rollouts) |
204
+ | outcome_rewards | ✅ 10 rows | ✅ 10 rows |
205
+ | Foreign keys | ✅ Working | ✅ Working |
206
+ | Non-zero rewards | ✅ 7/10 rollouts | ❌ 0/10 rollouts* |
207
+
208
+ *Expected: Pokemon Red is harder (requires room navigation, NPC dialogue, etc.)
209
+
210
+ ## Files Modified
211
+
212
+ 1. **`task_app.py`**:
213
+ - Added `use_vision` and `image_only_mode` support
214
+ - Fixed OpenAI API URL construction and auth
215
+ - Integrated SessionTracer for Turso persistence
216
+ - Added `record_outcome_reward()` calls
217
+ - Updated `build_config()` to create tracer_factory
218
+
219
+ 2. **`eval_image_only_gpt4o.toml`** (new):
220
+ - Config for image-only evaluation
221
+ - 10 seeds, 10 max turns per episode
222
+ - GPT-4o mini with vision enabled
223
+
224
+ ## Running the Evaluation
225
+
226
+ ```bash
227
+ cd /path/to/synth-ai  # root of your local synth-ai checkout
228
+
229
+ # Set up tracing environment
230
+ export TASKAPP_TRACING_ENABLED=1
231
+ export TURSO_NATIVE=1
232
+ export SQLD_DB_PATH="traces/v3/pokemon_red_eval.db"
233
+
234
+ # Run evaluation
235
+ uv run synth-ai eval pokemon_red \
236
+ --config examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml
237
+ ```
238
+
239
+ ## Verification Commands
240
+
241
+ ```bash
242
+ # Check database size
243
+ ls -lh traces/v3/pokemon_red_eval.db
244
+
245
+ # Count sessions
246
+ sqlite3 traces/v3/pokemon_red_eval.db \
247
+ "SELECT COUNT(*) FROM session_traces;"
248
+
249
+ # View all rewards
250
+ sqlite3 -header -column traces/v3/pokemon_red_eval.db \
251
+ "SELECT session_id, total_reward, achievements_count, total_steps
252
+ FROM outcome_rewards
253
+ ORDER BY total_reward DESC;"
254
+
255
+ # Test foreign keys
256
+ sqlite3 traces/v3/pokemon_red_eval.db \
257
+ "SELECT st.session_id, orw.total_reward
258
+ FROM session_traces st
259
+ INNER JOIN outcome_rewards orw ON st.session_id = orw.session_id
260
+ LIMIT 5;"
261
+ ```
262
+
263
+ ## Next Steps
264
+
265
+ To improve rewards:
266
+ 1. **Increase max_turns**: Try 50-100 turns per episode
267
+ 2. **Better prompting**: Add more detailed instructions in system prompt
268
+ 3. **Hybrid mode**: Use `use_vision=true` with `image_only_mode=false` to get both images and text
269
+ 4. **Different model**: Try GPT-4o (full) or Claude 3.5 Sonnet for better vision understanding
270
+
271
+ ## Conclusion
272
+
273
+ ✅ **All goals achieved**:
274
+ - Image-only input mode working
275
+ - 10 rollouts completed successfully
276
+ - Turso database created with 192KB of trace data
277
+ - outcome_rewards table with foreign keys
278
+ - Can filter and query by rewards
279
+ - SessionTracer fully integrated
280
+
281
+ Pokemon Red now has the same Turso tracing capabilities as Crafter! 🎉
282
+
283
+
@@ -0,0 +1,155 @@
1
+ # Pokemon Red Image-Only Eval Status - ✅ COMPLETE
2
+
3
+ **Status**: All features working! See `EVAL_IMAGE_ONLY_COMPLETE.md` for full details.
4
+
5
+ ---
6
+
7
+ # Original Status (Before Turso Integration) — kept for historical reference
8
+
9
+ ## ✅ What's Working
10
+
11
+ ### 1. Image-Only Input Mode
12
+ - Successfully modified `task_app.py` to support `use_vision` and `image_only_mode` config flags
13
+ - When enabled, sends only base64-encoded PNG frames to the LLM (no text observations)
14
+ - Similar to Crafter's implementation
15
+
16
+ ### 2. OpenAI API Integration
17
+ - Fixed inference URL construction to properly call `https://api.openai.com/v1/chat/completions`
18
+ - Added proper Authorization Bearer token handling
19
+ - Successfully runs 10 rollouts with `gpt-4o-mini-2024-07-18`
20
+
21
+ ### 3. Eval Configuration
22
+ - Created `eval_image_only_gpt4o.toml` config file
23
+ - Successfully runs via `synth-ai eval pokemon_red --config ...`
24
+ - All 10 seeds complete without errors
25
+
26
+ ## ⚠️ What's Not Working Yet
27
+
28
+ ### Turso Tracing & Rewards
29
+ **Issue**: Unlike Crafter, Pokemon Red doesn't use `SessionTracer`
30
+
31
+ **Current State**:
32
+ - Pokemon Red returns a basic trace payload (session_id, metadata) for the CLI
33
+ - However, it doesn't actually create a Turso database or save traces to one
34
+ - No `outcome_rewards` table or reward persistence
35
+ - No integration with `SessionTracer` from `tracing_v3`
36
+
37
+ **What Would Be Needed**:
38
+ 1. Import and initialize `SessionTracer` in Pokemon Red's `rollout_executor`
39
+ 2. Call `tracer.start_session()` at beginning of rollout
40
+ 3. Record events during rollout (like Crafter does)
41
+ 4. Call `tracer.record_outcome_reward()` at end with:
42
+ - `total_reward`: sum of step rewards
43
+ - `achievements_count`: count of milestones reached
44
+ - `total_steps`: number of steps taken
45
+ - `reward_metadata`: dict with map_id, party_count, badges, etc.
46
+ 5. Call `tracer.end_session()` to persist to database
47
+
48
+ ### Reward Computation
49
+ **Current State**:
50
+ - Pokemon Red has a `PalletTownProgressionCompositeReward` reward function
51
+ - It tracks milestones like leaving bedroom, getting starter Pokemon, etc.
52
+ - However, all rewards are currently 0.0, which is expected: the task is hard with only 10 turns and image-only input
53
+
54
+ **What's Challenging**:
55
+ - The Pallet Town sequence requires:
56
+ - Navigating multiple rooms
57
+ - Talking to NPCs (pressing A at right moments)
58
+ - Selecting starter Pokemon
59
+ - Entering first battle
60
+ - With only images (no text hints) and 10 LLM calls, agents struggle to make progress
61
+ - May need more turns or better prompting to get non-zero rewards
62
+
63
+ ## 📊 Current Results
64
+
65
+ ```
66
+ Eval complete: 10 ok, 0 failed
67
+ Model: gpt-4o-mini-2024-07-18
68
+ Seeds: 0-9 (10 rollouts)
69
+ Mean reward: 0.000
70
+ Outcome score: 0.000
71
+
72
+ All rollouts: ~21 steps, 0 rewards, Map 38 (Red's bedroom)
73
+ ```
74
+
75
+ ## 🔧 Files Modified
76
+
77
+ 1. **`task_app.py`**:
78
+ - Added `use_vision` and `image_only_mode` support in `_call_inference`
79
+ - Fixed OpenAI API URL construction
80
+ - Added basic trace payload generation
81
+ - **Still needs**: SessionTracer integration for Turso persistence
82
+
83
+ 2. **`eval_image_only_gpt4o.toml`** (new):
84
+ - Config for image-only evaluation
85
+ - 10 seeds, 10 max turns per episode
86
+ - GPT-4o mini with vision enabled
87
+
88
+ ## 🚀 Next Steps to Complete Turso Integration
89
+
90
+ ### Option 1: Quick Fix (Minimal Tracing)
91
+ Just save basic session info without full event tracing:
92
+ ```python
93
+ # At start of rollout_executor
94
+ from synth_ai.tracing_v3 import SessionTracer, StorageConfig, StorageBackend
95
+
96
+ tracer = SessionTracer(
97
+ storage_config=StorageConfig(
98
+ backend=StorageBackend.TURSO_NATIVE,
99
+ connection_string=f"file:{os.getenv('SQLD_DB_PATH', 'traces/v3/pokemon_red.db')}"
100
+ ),
101
+ auto_save=True
102
+ )
103
+ await tracer.initialize()
104
+ session_id = await tracer.start_session(metadata={...})
105
+
106
+ # At end of rollout_executor
107
+ await tracer.record_outcome_reward(
108
+ total_reward=int(total_reward),
109
+ achievements_count=len(milestone_events), # or 0 if none
110
+ total_steps=len(steps),
111
+ reward_metadata={
112
+ "final_map": final_state.get("map_id"),
113
+ "party_count": final_state.get("party_count", 0),
114
+ "badges": final_state.get("badges", 0),
115
+ "milestone_events": milestone_events,
116
+ }
117
+ )
118
+ await tracer.end_session()
119
+ ```
120
+
121
+ ### Option 2: Full Tracing (Like Crafter)
122
+ Integrate complete event tracing like Crafter's rollout.py:
123
+ - Record messages, timesteps, events for each step
124
+ - More complex but provides rich trace data
125
+ - Would require more significant refactoring
126
+
127
+ ## 📝 Comparison with Crafter
128
+
129
+ | Feature | Crafter | Pokemon Red |
130
+ |---------|---------|-------------|
131
+ | Image-only mode | ✅ Working | ✅ Working |
132
+ | OpenAI API | ✅ Working | ✅ Working |
133
+ | Eval CLI | ✅ Working | ✅ Working |
134
+ | SessionTracer | ✅ Integrated | ❌ Not integrated |
135
+ | Turso database | ✅ Saves traces | ❌ No database created |
136
+ | outcome_rewards | ✅ Persisted | ❌ Not saved |
137
+ | Foreign keys | ✅ Working | ❌ N/A |
138
+ | Non-zero rewards | ✅ 7/10 rollouts | ❌ 0/10 rollouts |
139
+
140
+ ## ✅ Summary
141
+
142
+ **Completed**:
143
+ - ✅ Image-only input mode for Pokemon Red
144
+ - ✅ OpenAI API integration with proper auth
145
+ - ✅ Eval CLI runs 10 rollouts successfully
146
+ - ✅ Basic trace payload returned (for CLI)
147
+
148
+ **Not Yet Complete**:
149
+ - ❌ Turso database persistence
150
+ - ❌ outcome_rewards table with foreign keys
151
+ - ❌ SessionTracer integration
152
+ - ❌ Queryable rewards by seed
153
+
154
+ **To match Crafter's capabilities**, Pokemon Red needs SessionTracer integration (Option 1 or 2 above).
155
+