synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (299)
  1. examples/analyze_semantic_words.sh +2 -2
  2. examples/baseline/banking77_baseline.py +204 -0
  3. examples/baseline/crafter_baseline.py +407 -0
  4. examples/baseline/pokemon_red_baseline.py +326 -0
  5. examples/baseline/simple_baseline.py +56 -0
  6. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  7. examples/blog_posts/gepa/README.md +355 -0
  8. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  9. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
  10. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
  11. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
  12. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
  13. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
  14. examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
  15. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
  16. examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
  17. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
  18. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
  19. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
  20. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
  21. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
  22. examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
  23. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  24. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  25. examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
  26. examples/blog_posts/gepa/task_apps.py +105 -0
  27. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  28. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  29. examples/blog_posts/pokemon_vl/README.md +98 -0
  30. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  31. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
  32. examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
  33. examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
  34. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
  35. examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
  36. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  37. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  38. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  39. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  40. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  41. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  42. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  43. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  44. examples/blog_posts/warming_up_to_rl/README.md +158 -0
  45. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  46. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  47. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  48. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
  49. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  50. examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
  51. examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
  52. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
  53. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  54. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
  55. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
  56. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  57. examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
  58. examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
  59. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
  60. examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
  61. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
  62. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
  63. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
  64. examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
  65. examples/multi_step/configs/verilog_rl_lora.toml +80 -123
  66. examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
  67. examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
  68. examples/qwen_coder/configs/coder_lora_small.toml +1 -3
  69. examples/qwen_vl/README.md +10 -12
  70. examples/qwen_vl/SETUP_COMPLETE.md +7 -8
  71. examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
  72. examples/qwen_vl/collect_data_via_cli.md +76 -84
  73. examples/qwen_vl/collect_vision_traces.py +4 -4
  74. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
  75. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
  76. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
  77. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
  78. examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
  79. examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
  80. examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
  81. examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
  82. examples/qwen_vl/run_vision_comparison.sh +6 -7
  83. examples/rl/README.md +5 -5
  84. examples/rl/configs/rl_from_base_qwen.toml +26 -1
  85. examples/rl/configs/rl_from_base_qwen17.toml +6 -2
  86. examples/rl/task_app/README.md +1 -2
  87. examples/rl/task_app/math_single_step.py +2 -2
  88. examples/run_crafter_demo.sh +2 -2
  89. examples/sft/README.md +1 -1
  90. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
  91. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
  92. examples/swe/task_app/README.md +32 -2
  93. examples/swe/task_app/grpo_swe_mini.py +4 -0
  94. examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
  95. examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
  96. examples/swe/task_app/hosted/inference/openai_client.py +4 -38
  97. examples/swe/task_app/hosted/policy_routes.py +17 -0
  98. examples/swe/task_app/hosted/rollout.py +4 -2
  99. examples/swe/task_app/morph_backend.py +178 -0
  100. examples/task_apps/banking77/__init__.py +6 -0
  101. examples/task_apps/banking77/banking77_task_app.py +841 -0
  102. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  103. examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
  104. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
  105. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
  106. examples/task_apps/crafter/task_app/README.md +1 -1
  107. examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
  108. examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
  109. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
  110. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
  111. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
  112. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
  113. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
  114. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
  115. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
  116. examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
  117. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  118. examples/task_apps/gepa_benchmarks/common.py +260 -0
  119. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  120. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  121. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  122. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  123. examples/task_apps/math/README.md +1 -2
  124. examples/task_apps/pokemon_red/README.md +3 -4
  125. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
  126. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
  127. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
  128. examples/task_apps/pokemon_red/task_app.py +288 -39
  129. examples/task_apps/sokoban/README.md +2 -3
  130. examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
  131. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
  132. examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
  133. examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
  134. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
  135. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
  136. examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
  137. examples/warming_up_to_rl/task_app/README.md +1 -1
  138. examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
  139. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
  140. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
  141. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
  142. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
  143. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
  144. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
  145. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
  146. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
  147. examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
  148. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
  149. synth_ai/api/train/builders.py +99 -4
  150. synth_ai/api/train/cli.py +516 -26
  151. synth_ai/api/train/config_finder.py +13 -2
  152. synth_ai/api/train/configs/__init__.py +23 -2
  153. synth_ai/api/train/configs/prompt_learning.py +442 -0
  154. synth_ai/api/train/configs/rl.py +61 -7
  155. synth_ai/api/train/configs/sft.py +6 -2
  156. synth_ai/api/train/configs/shared.py +59 -2
  157. synth_ai/api/train/task_app.py +1 -1
  158. synth_ai/api/train/validators.py +277 -0
  159. synth_ai/auth/credentials.py +119 -0
  160. synth_ai/baseline/__init__.py +25 -0
  161. synth_ai/baseline/config.py +209 -0
  162. synth_ai/baseline/discovery.py +214 -0
  163. synth_ai/baseline/execution.py +146 -0
  164. synth_ai/cli/__init__.py +94 -18
  165. synth_ai/cli/__main__.py +0 -0
  166. synth_ai/cli/claude.py +70 -0
  167. synth_ai/cli/codex.py +84 -0
  168. synth_ai/cli/commands/__init__.py +18 -0
  169. synth_ai/cli/commands/baseline/__init__.py +12 -0
  170. synth_ai/cli/commands/baseline/core.py +637 -0
  171. synth_ai/cli/commands/baseline/list.py +93 -0
  172. synth_ai/cli/commands/demo/__init__.py +6 -0
  173. synth_ai/cli/commands/demo/core.py +163 -0
  174. synth_ai/cli/commands/eval/__init__.py +19 -0
  175. synth_ai/cli/commands/eval/core.py +1112 -0
  176. synth_ai/cli/commands/eval/errors.py +81 -0
  177. synth_ai/cli/commands/eval/validation.py +133 -0
  178. synth_ai/cli/commands/filter/__init__.py +12 -0
  179. synth_ai/cli/commands/filter/core.py +424 -0
  180. synth_ai/cli/commands/filter/errors.py +55 -0
  181. synth_ai/cli/commands/filter/validation.py +77 -0
  182. synth_ai/cli/commands/help/__init__.py +177 -0
  183. synth_ai/cli/commands/help/core.py +72 -0
  184. synth_ai/cli/commands/smoke/__init__.py +7 -0
  185. synth_ai/cli/commands/smoke/core.py +1436 -0
  186. synth_ai/cli/commands/status/__init__.py +64 -0
  187. synth_ai/cli/commands/status/client.py +192 -0
  188. synth_ai/cli/commands/status/config.py +92 -0
  189. synth_ai/cli/commands/status/errors.py +20 -0
  190. synth_ai/cli/commands/status/formatters.py +164 -0
  191. synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
  192. synth_ai/cli/commands/status/subcommands/files.py +79 -0
  193. synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
  194. synth_ai/cli/commands/status/subcommands/models.py +79 -0
  195. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  196. synth_ai/cli/commands/status/subcommands/runs.py +81 -0
  197. synth_ai/cli/commands/status/subcommands/summary.py +47 -0
  198. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  199. synth_ai/cli/commands/status/utils.py +114 -0
  200. synth_ai/cli/commands/train/__init__.py +53 -0
  201. synth_ai/cli/commands/train/core.py +21 -0
  202. synth_ai/cli/commands/train/errors.py +117 -0
  203. synth_ai/cli/commands/train/judge_schemas.py +200 -0
  204. synth_ai/cli/commands/train/judge_validation.py +305 -0
  205. synth_ai/cli/commands/train/validation.py +386 -0
  206. synth_ai/cli/demo.py +30 -158
  207. synth_ai/cli/deploy/__init__.py +43 -0
  208. synth_ai/cli/deploy.py +162 -0
  209. synth_ai/cli/eval/__init__.py +36 -0
  210. synth_ai/cli/eval/core.py +5 -0
  211. synth_ai/cli/eval/errors.py +31 -0
  212. synth_ai/cli/eval/validation.py +5 -0
  213. synth_ai/cli/filter/__init__.py +28 -0
  214. synth_ai/cli/filter/core.py +5 -0
  215. synth_ai/cli/filter/errors.py +23 -0
  216. synth_ai/cli/filter/validation.py +5 -0
  217. synth_ai/cli/legacy_root_backup.py +14 -8
  218. synth_ai/cli/modal_serve/__init__.py +12 -0
  219. synth_ai/cli/modal_serve/core.py +14 -0
  220. synth_ai/cli/modal_serve/errors.py +8 -0
  221. synth_ai/cli/modal_serve/validation.py +11 -0
  222. synth_ai/cli/opencode.py +107 -0
  223. synth_ai/cli/root.py +9 -5
  224. synth_ai/cli/serve/__init__.py +12 -0
  225. synth_ai/cli/serve/core.py +14 -0
  226. synth_ai/cli/serve/errors.py +8 -0
  227. synth_ai/cli/serve/validation.py +11 -0
  228. synth_ai/cli/setup.py +20 -265
  229. synth_ai/cli/status.py +7 -126
  230. synth_ai/cli/task_app_deploy.py +1 -10
  231. synth_ai/cli/task_app_modal_serve.py +4 -9
  232. synth_ai/cli/task_app_serve.py +4 -11
  233. synth_ai/cli/task_apps.py +51 -1480
  234. synth_ai/cli/train/__init__.py +12 -0
  235. synth_ai/cli/train/core.py +21 -0
  236. synth_ai/cli/train/errors.py +8 -0
  237. synth_ai/cli/train/validation.py +24 -0
  238. synth_ai/cli/train.py +1 -14
  239. synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
  240. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  241. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  242. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  243. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  244. synth_ai/environments/examples/red/engine.py +33 -12
  245. synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
  246. synth_ai/environments/examples/red/environment.py +26 -0
  247. synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
  248. synth_ai/http.py +12 -0
  249. synth_ai/judge_schemas.py +10 -10
  250. synth_ai/learning/__init__.py +10 -0
  251. synth_ai/learning/prompt_learning_client.py +276 -0
  252. synth_ai/learning/prompt_learning_types.py +184 -0
  253. synth_ai/learning/rl/client.py +3 -1
  254. synth_ai/pricing/__init__.py +2 -0
  255. synth_ai/pricing/model_pricing.py +57 -0
  256. synth_ai/streaming/__init__.py +29 -0
  257. synth_ai/streaming/config.py +94 -0
  258. synth_ai/streaming/handlers.py +518 -0
  259. synth_ai/streaming/streamer.py +320 -0
  260. synth_ai/streaming/types.py +95 -0
  261. synth_ai/task/apps/__init__.py +1 -0
  262. synth_ai/task/config.py +2 -0
  263. synth_ai/task/tracing_utils.py +25 -25
  264. synth_ai/task/validators.py +45 -9
  265. synth_ai/task_app_cfgs.py +21 -0
  266. synth_ai/tracing_v3/config.py +162 -19
  267. synth_ai/tracing_v3/constants.py +1 -1
  268. synth_ai/tracing_v3/db_config.py +24 -38
  269. synth_ai/tracing_v3/migration_helper.py +1 -2
  270. synth_ai/tracing_v3/storage/config.py +47 -13
  271. synth_ai/tracing_v3/storage/factory.py +3 -3
  272. synth_ai/tracing_v3/turso/daemon.py +113 -11
  273. synth_ai/tracing_v3/turso/native_manager.py +92 -16
  274. synth_ai/types.py +8 -0
  275. synth_ai/urls.py +11 -0
  276. synth_ai/utils/__init__.py +30 -1
  277. synth_ai/utils/agents.py +74 -0
  278. synth_ai/utils/bin.py +39 -0
  279. synth_ai/utils/cli.py +149 -5
  280. synth_ai/utils/env.py +40 -33
  281. synth_ai/utils/http.py +4 -1
  282. synth_ai/utils/json.py +72 -0
  283. synth_ai/utils/modal.py +285 -3
  284. synth_ai/utils/paths.py +48 -0
  285. synth_ai/utils/uvicorn.py +113 -0
  286. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
  287. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
  288. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
  289. synth_ai/cli/tui.py +0 -62
  290. synth_ai/tui/__init__.py +0 -5
  291. synth_ai/tui/__main__.py +0 -13
  292. synth_ai/tui/cli/__init__.py +0 -1
  293. synth_ai/tui/cli/query_experiments.py +0 -164
  294. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  295. synth_ai/tui/dashboard.py +0 -911
  296. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
  297. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
  298. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
  299. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,81 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+
6
class EvalCliError(RuntimeError):
    """Base exception for eval CLI failures.

    Every other error class declared in this module derives from it, so a
    single ``except EvalCliError`` handles all of them.
    """
8
+
9
+
10
@dataclass(slots=True)
class TomlUnavailableError(EvalCliError):
    """Eval CLI error signalling that TOML support is unavailable.

    NOTE(review): meaning inferred from the class name; the raise site is not
    visible in this module.
    """

    # Optional hint text (presumably remediation advice for the user);
    # defaults to None.
    hint: str | None = None
13
+
14
+
15
@dataclass(slots=True)
class EvalConfigNotFoundError(EvalCliError):
    """Eval CLI error for a config file that could not be found.

    NOTE(review): meaning inferred from the class name; confirm at the raise
    site.
    """

    # Path associated with the failure, stored as a string.
    path: str
18
+
19
+
20
@dataclass(slots=True)
class EvalConfigParseError(EvalCliError):
    """Eval CLI error for a config file that could not be parsed.

    NOTE(review): meaning inferred from the class name; confirm at the raise
    site.
    """

    # Path associated with the failure, stored as a string.
    path: str
    # Free-form description of what went wrong (presumably the parser's
    # message — confirm at the raise site).
    detail: str
24
+
25
+
26
@dataclass(slots=True)
class MissingEvalTableError(EvalCliError):
    """Raised when the eval config lacks an [eval] table.

    Declares no fields, so the generated initializer takes no arguments.
    """
29
+
30
+
31
@dataclass(slots=True)
class InvalidEvalConfigError(EvalCliError):
    """Eval CLI error for an eval config whose contents are invalid.

    NOTE(review): meaning inferred from the class name; confirm at the raise
    site.
    """

    # Free-form description of the validation problem.
    detail: str
34
+
35
+
36
@dataclass(slots=True)
class SeedParseError(EvalCliError):
    """Eval CLI error for a seed specification that could not be parsed.

    NOTE(review): meaning inferred from the class name; confirm at the raise
    site.
    """

    # The offending value (presumably the raw seed text — confirm).
    value: str
39
+
40
+
41
@dataclass(slots=True)
class MetadataFilterFormatError(EvalCliError):
    """Eval CLI error for a malformed metadata filter entry.

    NOTE(review): meaning inferred from the class name; confirm at the raise
    site.
    """

    # The offending filter entry (presumably the raw text — confirm).
    entry: str
44
+
45
+
46
@dataclass(slots=True)
class TaskInfoUnavailableError(EvalCliError):
    """Raised when metadata filters require task info but the task app does not expose it.

    Declares no fields, so the generated initializer takes no arguments.
    """
49
+
50
+
51
@dataclass(slots=True)
class NoSeedsMatchedError(EvalCliError):
    """Eval CLI error for a seed selection that matched nothing.

    NOTE(review): meaning inferred from the class name; confirm at the raise
    site.
    """

    # Optional hint text (presumably remediation advice for the user);
    # defaults to None.
    hint: str | None = None
54
+
55
+
56
@dataclass(slots=True)
class MetadataSQLExecutionError(EvalCliError):
    """Eval CLI error for a metadata SQL query that failed to execute.

    NOTE(review): meaning inferred from the class name; confirm at the raise
    site.
    """

    # The SQL text associated with the failure.
    query: str
    # Free-form description of what went wrong.
    detail: str
60
+
61
+
62
@dataclass(slots=True)
class MetadataSQLResultError(EvalCliError):
    """Eval CLI error for a metadata SQL query whose result was unusable.

    NOTE(review): meaning inferred from the class name; confirm at the raise
    site.
    """

    # The SQL text associated with the failure.
    query: str
    # Free-form description of what went wrong.
    detail: str
66
+
67
+
68
+ __all__ = [
69
+ "EvalCliError",
70
+ "TomlUnavailableError",
71
+ "EvalConfigNotFoundError",
72
+ "EvalConfigParseError",
73
+ "MissingEvalTableError",
74
+ "InvalidEvalConfigError",
75
+ "SeedParseError",
76
+ "MetadataFilterFormatError",
77
+ "TaskInfoUnavailableError",
78
+ "NoSeedsMatchedError",
79
+ "MetadataSQLExecutionError",
80
+ "MetadataSQLResultError",
81
+ ]
@@ -0,0 +1,133 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from collections.abc import MutableMapping
5
+ from typing import Any
6
+
7
+ __all__ = ["validate_eval_options"]
8
+
9
# Matches a whole-string "start-end" seed range such as "3-7". Either bound may
# carry a leading minus sign, and whitespace around the bounds and the
# separating hyphen is tolerated. Group 1 is the start, group 2 the end.
_SEED_RANGE = re.compile(r"^\s*(-?\d+)\s*-\s*(-?\d+)\s*$")
10
+
11
+
12
+ def _coerce_bool(value: Any) -> bool:
13
+ if isinstance(value, str):
14
+ return value.strip().lower() in {"1", "true", "yes", "on"}
15
+ return bool(value)
16
+
17
+
18
+ def _coerce_int(value: Any) -> int | None:
19
+ if value is None or value == "":
20
+ return None
21
+ return int(value)
22
+
23
+
24
def _parse_seeds(value: Any) -> list[int]:
    """Expand a seed specification into a flat list of integers.

    Accepted inputs:
      * ``None`` -> empty list.
      * a string -> split on commas, blank pieces dropped.
      * a list, tuple or set -> each element is one entry.
      * anything else -> treated as a single entry.

    Integer entries are kept as they are. Other entries are stringified and
    trimmed; text matching ``_SEED_RANGE`` ("start-end") expands to every
    integer from start to end inclusive, and any other text goes through
    ``int()``.

    Raises:
        ValueError: if a range has start > end, or an entry is not an integer.
    """
    if value is None:
        return []

    if isinstance(value, str):
        entries = [part.strip() for part in value.split(",") if part.strip()]
    elif isinstance(value, list | tuple | set):
        entries = list(value)
    else:
        entries = [value]

    expanded: list[int] = []
    for entry in entries:
        if isinstance(entry, int):
            expanded.append(entry)
            continue

        token = str(entry).strip()
        if not token:
            continue

        span = _SEED_RANGE.match(token)
        if span is None:
            expanded.append(int(token))
            continue

        low = int(span.group(1))
        high = int(span.group(2))
        if low > high:
            raise ValueError(f"Invalid seed range '{token}': start must be <= end")
        expanded.extend(range(low, high + 1))
    return expanded
51
+
52
+
53
+ def _normalize_metadata(value: Any) -> dict[str, str]:
54
+ if value is None:
55
+ return {}
56
+ if isinstance(value, MutableMapping):
57
+ return {str(k): str(v) for k, v in value.items()}
58
+ if isinstance(value, list | tuple):
59
+ result: dict[str, str] = {}
60
+ for item in value:
61
+ if isinstance(item, str) and "=" in item:
62
+ key, val = item.split("=", 1)
63
+ result[key.strip()] = val.strip()
64
+ return result
65
+ if isinstance(value, str) and "=" in value:
66
+ key, val = value.split("=", 1)
67
+ return {key.strip(): val.strip()}
68
+ return {}
69
+
70
+
71
+ def _ensure_list(value: Any) -> list[str] | None:
72
+ if value is None:
73
+ return None
74
+ if isinstance(value, list | tuple | set):
75
+ return [str(item) for item in value]
76
+ return [str(value)]
77
+
78
+
79
+ def _ensure_dict(value: Any) -> dict[str, Any]:
80
+ if isinstance(value, MutableMapping):
81
+ return dict(value)
82
+ return {}
83
+
84
+
85
def validate_eval_options(options: MutableMapping[str, Any]) -> MutableMapping[str, Any]:
    """Validate and normalise eval configuration options.

    Works on a shallow copy, so *options* itself is not modified. The
    returned dict has:

    * ``seeds`` expanded to a list of ints (only when the key is present);
    * ``max_turns``, ``max_llm_calls`` and ``concurrency`` always present and
      coerced to ``int`` or ``None``, with ``max_llm_calls`` defaulting to 10
      and ``concurrency`` to 1 when unset;
    * ``return_trace`` coerced to ``bool`` and ``ops`` to a list of strings
      (each only when present);
    * ``metadata``, ``env_config`` and ``policy_config`` always present as
      dicts;
    * ``trace_format``, ``metadata_sql``, ``model`` and ``app_id``
      stringified when they are not ``None``.

    Raises:
        ValueError: if one of the integer fields cannot be converted, or the
            seed specification is invalid.
    """
    normalized: dict[str, Any] = dict(options)

    if "seeds" in normalized:
        normalized["seeds"] = _parse_seeds(normalized.get("seeds"))

    # Integer fields are always written back, even when absent from the input.
    for name in ("max_turns", "max_llm_calls", "concurrency"):
        raw = normalized.get(name)
        try:
            normalized[name] = _coerce_int(raw)
        except Exception as exc:
            raise ValueError(f"Invalid value for {name}: {raw}") from exc

    for name, fallback in (("max_llm_calls", 10), ("concurrency", 1)):
        if normalized.get(name) is None:
            normalized[name] = fallback

    if "return_trace" in normalized:
        normalized["return_trace"] = _coerce_bool(normalized.get("return_trace"))

    normalized["metadata"] = _normalize_metadata(normalized.get("metadata"))

    if "ops" in normalized:
        normalized["ops"] = _ensure_list(normalized.get("ops"))

    for name in ("env_config", "policy_config"):
        normalized[name] = _ensure_dict(normalized.get(name))

    # metadata_sql is only converted when it is not already a string; the
    # other text fields are always passed through str().
    sql = normalized.get("metadata_sql")
    if not (sql is None or isinstance(sql, str)):
        normalized["metadata_sql"] = str(sql)

    for name in ("trace_format", "model", "app_id"):
        current = normalized.get(name)
        if current is not None:
            normalized[name] = str(current)

    return normalized
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ from .core import command, get_command
4
+ from .errors import FilterCliError
5
+ from .validation import validate_filter_options
6
+
7
+ __all__ = [
8
+ "command",
9
+ "get_command",
10
+ "FilterCliError",
11
+ "validate_filter_options",
12
+ ]
@@ -0,0 +1,424 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ from datetime import UTC, datetime
6
+ from pathlib import Path
7
+ from typing import Any, Sequence
8
+
9
+ import click
10
+
11
+ try: # Python 3.11+
12
+ import tomllib as _toml # type: ignore[attr-defined]
13
+ except Exception: # pragma: no cover
14
+ _toml = None # type: ignore[assignment]
15
+
16
+ from synth_ai.task.config import FilterConfig
17
+ from synth_ai.tracing_v3 import SessionTracer # type: ignore[import-untyped]
18
+
19
+ from .errors import (
20
+ FilterCliError,
21
+ FilterConfigNotFoundError,
22
+ FilterConfigParseError,
23
+ InvalidFilterConfigError,
24
+ MissingFilterTableError,
25
+ NoSessionsMatchedError,
26
+ NoTracesFoundError,
27
+ TomlUnavailableError,
28
+ )
29
+ from .validation import validate_filter_options
30
+
31
+ __all__ = ["command", "get_command", "filter_command"]
32
+
33
+
34
+ def _parse_datetime_for_trace(value: Any) -> datetime | None:
35
+ if isinstance(value, datetime):
36
+ return value if value.tzinfo else value.replace(tzinfo=UTC)
37
+ if isinstance(value, str):
38
+ value = value.replace("Z", "+00:00")
39
+ try:
40
+ dt = datetime.fromisoformat(value)
41
+ except ValueError:
42
+ try:
43
+ dt = datetime.fromtimestamp(float(value), tz=UTC)
44
+ except Exception:
45
+ return None
46
+ return dt if dt.tzinfo else dt.replace(tzinfo=UTC)
47
+ if isinstance(value, int | float):
48
+ try:
49
+ return datetime.fromtimestamp(float(value), tz=UTC)
50
+ except Exception:
51
+ return None
52
+ return None
53
+
54
+
55
+ def _score_ok(value: Any, min_val: Any, max_val: Any) -> bool:
56
+ try:
57
+ if value is None:
58
+ return min_val is None
59
+ value = float(value)
60
+ except Exception:
61
+ return False
62
+ if min_val is not None and value < float(min_val):
63
+ return False
64
+ return not (max_val is not None and value > float(max_val))
65
+
66
+
67
def _load_filter_config(config_path: Path) -> tuple[FilterConfig, dict[str, Any]]:
    """Load, validate and summarise the ``[filter]`` table of a TOML config.

    Returns:
        The ``FilterConfig`` built from the validated options, together with
        the validated options as a plain dict.

    Raises:
        TomlUnavailableError: the module-level TOML parser is unavailable.
        FilterConfigNotFoundError: *config_path* does not exist.
        FilterConfigParseError: the file could not be read or parsed.
        MissingFilterTableError: the parsed data has no ``filter`` table.
        InvalidFilterConfigError: validation or ``FilterConfig.from_dict``
            raised ``ValueError`` or ``TypeError``.
    """
    if _toml is None:
        raise TomlUnavailableError(hint="Install tomli or use Python 3.11+")
    if not config_path.exists():
        raise FilterConfigNotFoundError(path=str(config_path))

    try:
        raw_text = config_path.read_text(encoding="utf-8")
        parsed = _toml.loads(raw_text)
    except Exception as exc:  # pragma: no cover - validation tests cover common cases
        raise FilterConfigParseError(path=str(config_path), detail=str(exc)) from exc

    filter_table = parsed.get("filter") if isinstance(parsed, dict) else None
    if not isinstance(filter_table, dict):
        raise MissingFilterTableError()

    try:
        normalized_dict = dict(validate_filter_options(filter_table))
        filter_cfg = FilterConfig.from_dict(normalized_dict)
    except (ValueError, TypeError) as validation_error:
        raise InvalidFilterConfigError(detail=str(validation_error)) from validation_error

    # Echo a short summary of the effective settings.
    click.echo(f"✓ Config validated: db={filter_cfg.db}, output={filter_cfg.output}")
    if filter_cfg.min_official_score is not None:
        click.echo(f" → Filtering for official score >= {filter_cfg.min_official_score}")
    if filter_cfg.limit:
        click.echo(f" → Limiting to {filter_cfg.limit} examples")

    return filter_cfg, normalized_dict
101
+
102
+
103
+ def _extract_content(content: Any) -> Any:
104
+ if isinstance(content, dict) and "content" in content:
105
+ return content["content"]
106
+ return content
107
+
108
+
109
+ def _extract_text(content: Any) -> str:
110
+ if isinstance(content, str):
111
+ return content
112
+ if isinstance(content, dict):
113
+ payload = content.get("payload") if isinstance(content.get("payload"), dict) else None
114
+ if payload and "content" in payload:
115
+ return _extract_text(payload["content"])
116
+ for key in ("text", "content", "content_text"):
117
+ if key in content:
118
+ value = content[key]
119
+ if isinstance(value, str):
120
+ return value
121
+ try:
122
+ return json.dumps(content)
123
+ except Exception: # pragma: no cover - defensive
124
+ return str(content)
125
+ if isinstance(content, list):
126
+ parts = []
127
+ for item in content:
128
+ if isinstance(item, dict) and item.get("type") == "text":
129
+ parts.append(item.get("text", ""))
130
+ return " ".join(parts) if parts else str(content)
131
+ return str(content)
132
+
133
+
134
def _select_messages(message_rows: Sequence[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build SFT chat records from a session's ordered message rows.

    One record is produced for every row whose ``message_type`` is ``"user"``
    or ``"policy_user_prompt"`` and whose content yields non-empty text. Each
    record is ``{"messages": [...]}`` containing, in order:

    * an optional ``system`` message taken from the nearest preceding
      ``"policy_system_prompt"`` row;
    * the ``user`` message (the decoded list when the content is a list,
      otherwise its text);
    * an ``assistant`` message taken from the first following row of type
      ``"assistant"`` or ``"policy_tool_call"``, or the placeholder
      ``"[no response recorded]"`` when there is none.

    NOTE(review): the forward search does not stop at the next user row, so a
    user turn without its own reply is paired with a later turn's reply.
    Confirm whether that is intended.

    Args:
        message_rows: Rows exposing ``message_type`` and ``content`` keys.

    Returns:
        The list of chat records, in row order.
    """

    def _decode(raw: Any) -> Any:
        # Stored bodies may be JSON strings or plain text. Parse when
        # possible and otherwise keep the raw value, then unwrap one
        # {"content": ...} layer.
        if not isinstance(raw, str):
            return _extract_content(raw)
        try:
            decoded = json.loads(raw)
        except Exception:
            decoded = raw
        return _extract_content(decoded)

    user_types = {"user", "policy_user_prompt"}
    response_types = {"assistant", "policy_tool_call"}
    total = len(message_rows)

    records: list[dict[str, Any]] = []
    for index, msg_row in enumerate(message_rows):
        if msg_row.get("message_type") not in user_types:
            continue

        # Nearest system prompt before this user turn, if any.
        system_msg = next(
            (
                message_rows[prev]
                for prev in range(index - 1, -1, -1)
                if message_rows[prev].get("message_type") == "policy_system_prompt"
            ),
            None,
        )
        # First assistant reply or tool call after this user turn, if any.
        response_msg = next(
            (
                message_rows[follow]
                for follow in range(index + 1, total)
                if message_rows[follow].get("message_type") in response_types
            ),
            None,
        )

        user_content = _decode(msg_row.get("content"))
        user_text = _extract_text(user_content)
        if not user_text:
            continue

        messages: list[dict[str, Any]] = []

        if system_msg is not None:
            # Plain-text system prompts used to be dropped because a JSON
            # parse failure was swallowed here; _decode now falls back to the
            # raw string exactly as the user and assistant paths do.
            try:
                system_text = _extract_text(_decode(system_msg.get("content")))
            except Exception:
                # Best effort: an unreadable system prompt must not discard
                # the whole training example.
                system_text = ""
            if system_text:
                messages.append({"role": "system", "content": system_text})

        user_payload = user_content if isinstance(user_content, list) else user_text
        messages.append({"role": "user", "content": user_payload})

        assistant_content = (
            _decode(response_msg.get("content")) if response_msg is not None else None
        )
        if isinstance(assistant_content, list):
            assistant_payload: Any = assistant_content
        elif assistant_content is not None:
            assistant_payload = _extract_text(assistant_content)
        else:
            assistant_payload = "[no response recorded]"
        messages.append({"role": "assistant", "content": assistant_payload})

        records.append({"messages": messages})
    return records
215
+
216
+
217
@click.command(
    "filter",
    help="Export filtered tracing sessions to SFT-ready JSONL based on a TOML config.",
)
@click.option(
    "--config",
    "config_path",
    type=click.Path(),
    required=True,
    help="Path to TOML config describing the input trace DB, score thresholds, and output JSONL.",
)
def filter_command(config_path: str) -> None:
    """Export tracing sessions that pass the configured filters as JSONL.

    The TOML file at ``config_path`` is loaded through ``_load_filter_config``.
    Sessions are read from the ``session_traces`` table in ``created_at``
    order and kept only when they satisfy the split / task / model sets, the
    creation-time window, the official-score bounds and the per-judge score
    bounds. Each kept session contributes the records produced by
    ``_select_messages``; a session without stored messages falls back to the
    ``prompt`` / ``completion`` pair found in its metadata, if both are set.

    Args:
        config_path: Path to the TOML filter configuration.

    Raises:
        click.ClickException: If loading the config or running the export
            raises a ``FilterCliError``; the message comes from
            ``_format_filter_error``.
    """
    try:
        filter_cfg, raw_cfg = _load_filter_config(Path(config_path))
    except FilterCliError as exc:
        raise click.ClickException(_format_filter_error(exc)) from exc

    db_url = filter_cfg.get_db_url()
    output_path = filter_cfg.get_output_path()

    splits = set(filter_cfg.splits)
    task_ids = set(filter_cfg.task_ids)
    models = set(filter_cfg.models)
    min_official = filter_cfg.min_official_score
    max_official = filter_cfg.max_official_score
    min_judge_scores = filter_cfg.min_judge_scores or {}
    max_judge_scores = filter_cfg.max_judge_scores or {}
    min_created = _parse_datetime_for_trace(raw_cfg.get("min_created_at"))
    max_created = _parse_datetime_for_trace(raw_cfg.get("max_created_at"))
    limit = filter_cfg.limit

    # A missing or non-positive limit means "export everything".
    has_limit = limit is not None and limit > 0
    check_official = min_official is not None or max_official is not None

    messages_query = (
        "\n SELECT message_type, content, timestamp \n FROM messages \n WHERE session_id = :session_id\n ORDER BY timestamp ASC, id ASC\n "
    )

    async def _run() -> None:
        tracer = SessionTracer(db_url=db_url, auto_save=False)
        await tracer.initialize()

        db = tracer.db
        if db is None:
            raise FilterCliError("Database not initialized")

        accepted: list[dict[str, Any]] = []
        try:
            df = await db.query_traces(
                "SELECT session_id, created_at, metadata FROM session_traces ORDER BY created_at"
            )
            if getattr(df, "empty", True):
                raise NoTracesFoundError(db_url=db_url)

            for row in df.to_dict("records"):
                # Only the first `limit` examples are written, so stop issuing
                # per-session queries once enough have been collected.
                if has_limit and len(accepted) >= limit:
                    break

                metadata_raw = row.get("metadata")
                if isinstance(metadata_raw, str):
                    try:
                        metadata = json.loads(metadata_raw)
                    except (TypeError, ValueError):
                        metadata = {}
                    # Valid JSON that is not an object (list, number, ...)
                    # cannot be queried with .get(); treat it as no metadata.
                    if not isinstance(metadata, dict):
                        metadata = {}
                elif isinstance(metadata_raw, dict):
                    metadata = dict(metadata_raw)
                else:
                    metadata = {}

                created_at_raw = row.get("created_at")
                created_at_dt = _parse_datetime_for_trace(created_at_raw)
                session_id = row.get("session_id")

                if splits and metadata.get("task_split") not in splits:
                    continue
                if task_ids and metadata.get("task_id") not in task_ids:
                    continue
                if models and metadata.get("model") not in models:
                    continue

                # When a time bound is set, sessions whose timestamp could not
                # be parsed are excluded.
                if min_created and (created_at_dt is None or created_at_dt < min_created):
                    continue
                if max_created and (created_at_dt is None or created_at_dt > max_created):
                    continue

                total_reward = None
                achievements_count = None
                if check_official:
                    reward_rows = await db.query_traces(
                        "SELECT total_reward, achievements_count FROM outcome_rewards WHERE session_id = :session_id",
                        {"session_id": session_id},
                    )
                    reward_records = (
                        reward_rows.to_dict("records")
                        if hasattr(reward_rows, "to_dict")
                        else []
                    )
                    if reward_records:
                        total_reward = reward_records[0].get("total_reward")
                        achievements_count = reward_records[0].get("achievements_count")
                        if not _score_ok(total_reward, min_official, max_official):
                            continue
                    elif min_official is not None:
                        # No recorded outcome cannot satisfy a minimum score.
                        continue

                judge_scores = metadata.get("judge_scores") or {}
                if not isinstance(judge_scores, dict):
                    judge_scores = {}
                if any(
                    not _score_ok(judge_scores.get(judge_name), threshold, None)
                    for judge_name, threshold in min_judge_scores.items()
                ):
                    continue
                if any(
                    not _score_ok(judge_scores.get(judge_name), None, threshold)
                    for judge_name, threshold in max_judge_scores.items()
                ):
                    continue

                msg_df = await db.query_traces(messages_query, {"session_id": session_id})
                message_rows = (
                    msg_df.to_dict("records") if hasattr(msg_df, "to_dict") else []
                )

                # Metadata attached to every example exported for this session.
                record_metadata = {
                    "session_id": session_id,
                    "env_name": metadata.get("env_name"),
                    "policy_name": metadata.get("policy_name"),
                    "seed": metadata.get("seed"),
                    "total_reward": total_reward,
                    "achievements_count": achievements_count,
                    "model": metadata.get("model"),
                    "created_at": created_at_dt.isoformat() if created_at_dt else created_at_raw,
                }

                if not message_rows:
                    # Fall back to a prompt/completion pair stored in metadata.
                    prompt = metadata.get("prompt") or ""
                    completion = metadata.get("completion") or ""
                    if prompt and completion:
                        accepted.append(
                            {
                                "messages": [
                                    {"role": "user", "content": str(prompt)},
                                    {"role": "assistant", "content": str(completion)},
                                ],
                                "metadata": record_metadata,
                            }
                        )
                    continue

                for record in _select_messages(message_rows):
                    record["metadata"] = dict(record_metadata)
                    accepted.append(record)
        finally:
            # Release the database on every exit path, including the
            # "no traces" and query-failure cases.
            await db.close()

        if not accepted:
            raise NoSessionsMatchedError()

        if has_limit:
            accepted[:] = accepted[:limit]

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with output_path.open("w", encoding="utf-8") as handle:
            for item in accepted:
                handle.write(json.dumps(item, ensure_ascii=False))
                handle.write("\n")

        click.echo(f"Wrote {len(accepted)} examples -> {output_path}")

    try:
        asyncio.run(_run())
    except FilterCliError as exc:
        raise click.ClickException(_format_filter_error(exc)) from exc
398
+
399
+
400
def _format_filter_error(err: FilterCliError) -> str:
    """Turn a ``FilterCliError`` into the message shown to the CLI user.

    The error types are tested in a fixed order and the first match decides
    the message; an error matching none of them is rendered with ``str(err)``.
    """
    if isinstance(err, TomlUnavailableError):
        suggestion = err.hint or "Install tomli or use Python 3.11+."
        message = f"TOML parser not available. {suggestion}"
    elif isinstance(err, FilterConfigNotFoundError):
        message = f"Filter config not found: {err.path}"
    elif isinstance(err, FilterConfigParseError):
        message = f"Failed to parse TOML '{err.path}': {err.detail}"
    elif isinstance(err, MissingFilterTableError):
        message = "Config must contain a [filter] table."
    elif isinstance(err, InvalidFilterConfigError):
        message = f"Invalid filter config: {err.detail}"
    elif isinstance(err, NoTracesFoundError):
        message = f"No traces found in database ({err.db_url})."
    elif isinstance(err, NoSessionsMatchedError):
        suggestion = err.hint or "Adjust the filter thresholds or choose a different dataset."
        message = f"No sessions matched the provided filters. {suggestion}"
    else:
        message = str(err)
    return message
418
+
419
+
420
# Module-level alias for the click command; this is the object that
# get_command() returns.
command = filter_command
421
+
422
+
423
def get_command() -> click.Command:
    """Return the click command bound to the module-level ``command`` name."""
    registered = command
    return registered