synth-ai 0.2.16__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (299) hide show
  1. examples/analyze_semantic_words.sh +2 -2
  2. examples/baseline/banking77_baseline.py +204 -0
  3. examples/baseline/crafter_baseline.py +407 -0
  4. examples/baseline/pokemon_red_baseline.py +326 -0
  5. examples/baseline/simple_baseline.py +56 -0
  6. examples/baseline/warming_up_to_rl_baseline.py +239 -0
  7. examples/blog_posts/gepa/README.md +355 -0
  8. examples/blog_posts/gepa/configs/banking77_gepa_local.toml +95 -0
  9. examples/blog_posts/gepa/configs/banking77_gepa_test.toml +82 -0
  10. examples/blog_posts/gepa/configs/banking77_mipro_local.toml +52 -0
  11. examples/blog_posts/gepa/configs/hotpotqa_gepa_local.toml +59 -0
  12. examples/blog_posts/gepa/configs/hotpotqa_gepa_qwen.toml +36 -0
  13. examples/blog_posts/gepa/configs/hotpotqa_mipro_local.toml +53 -0
  14. examples/blog_posts/gepa/configs/hover_gepa_local.toml +59 -0
  15. examples/blog_posts/gepa/configs/hover_gepa_qwen.toml +36 -0
  16. examples/blog_posts/gepa/configs/hover_mipro_local.toml +53 -0
  17. examples/blog_posts/gepa/configs/ifbench_gepa_local.toml +59 -0
  18. examples/blog_posts/gepa/configs/ifbench_gepa_qwen.toml +36 -0
  19. examples/blog_posts/gepa/configs/ifbench_mipro_local.toml +53 -0
  20. examples/blog_posts/gepa/configs/pupa_gepa_local.toml +60 -0
  21. examples/blog_posts/gepa/configs/pupa_mipro_local.toml +54 -0
  22. examples/blog_posts/gepa/deploy_banking77_task_app.sh +41 -0
  23. examples/blog_posts/gepa/gepa_baseline.py +204 -0
  24. examples/blog_posts/gepa/query_prompts_example.py +97 -0
  25. examples/blog_posts/gepa/run_gepa_banking77.sh +87 -0
  26. examples/blog_posts/gepa/task_apps.py +105 -0
  27. examples/blog_posts/gepa/test_gepa_local.sh +67 -0
  28. examples/blog_posts/gepa/verify_banking77_setup.sh +123 -0
  29. examples/blog_posts/pokemon_vl/README.md +98 -0
  30. examples/blog_posts/pokemon_vl/configs/eval_gpt5nano.toml +26 -0
  31. examples/blog_posts/pokemon_vl/configs/eval_qwen3_vl.toml +27 -0
  32. examples/blog_posts/pokemon_vl/configs/eval_rl_final.toml +24 -0
  33. examples/blog_posts/pokemon_vl/configs/filter_high_reward.toml +10 -0
  34. examples/blog_posts/pokemon_vl/configs/train_rl_from_sft.toml +43 -0
  35. examples/blog_posts/pokemon_vl/configs/train_sft_qwen4b_vl.toml +40 -0
  36. examples/blog_posts/pokemon_vl/extract_images.py +239 -0
  37. examples/blog_posts/pokemon_vl/pokemon_vl_baseline.py +326 -0
  38. examples/blog_posts/pokemon_vl/run_eval_extract_images.py +209 -0
  39. examples/blog_posts/pokemon_vl/run_qwen_eval_extract_images.py +212 -0
  40. examples/blog_posts/pokemon_vl/text_box_analysis.md +106 -0
  41. examples/blog_posts/warming_up_to_rl/ARCHITECTURE.md +195 -0
  42. examples/blog_posts/warming_up_to_rl/FINAL_TEST_RESULTS.md +127 -0
  43. examples/blog_posts/warming_up_to_rl/INFERENCE_SUCCESS.md +132 -0
  44. examples/blog_posts/warming_up_to_rl/README.md +158 -0
  45. examples/blog_posts/warming_up_to_rl/SMOKE_TESTING.md +164 -0
  46. examples/blog_posts/warming_up_to_rl/SMOKE_TEST_COMPLETE.md +253 -0
  47. examples/blog_posts/warming_up_to_rl/configs/eval_baseline_qwen32b_10x20.toml +25 -0
  48. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b.toml +25 -0
  49. examples/blog_posts/warming_up_to_rl/configs/eval_ft_qwen4b_10x20.toml +26 -0
  50. examples/blog_posts/warming_up_to_rl/configs/eval_groq_qwen32b.toml +25 -0
  51. examples/blog_posts/warming_up_to_rl/configs/eval_openai_gpt_oss_120b.toml +29 -0
  52. examples/blog_posts/warming_up_to_rl/configs/filter_high_reward_dataset.toml +10 -0
  53. examples/blog_posts/warming_up_to_rl/configs/smoke_test.toml +75 -0
  54. examples/blog_posts/warming_up_to_rl/configs/train_rl_from_sft.toml +91 -0
  55. examples/blog_posts/warming_up_to_rl/configs/train_sft_qwen4b.toml +40 -0
  56. examples/blog_posts/warming_up_to_rl/warming_up_to_rl_baseline.py +187 -0
  57. examples/dev/qwen3_32b_qlora_4xh100.toml +5 -0
  58. examples/multi_step/configs/VERILOG_REWARDS.md +4 -0
  59. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +4 -0
  60. examples/multi_step/configs/crafter_rl_outcome.toml +2 -1
  61. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +65 -107
  62. examples/multi_step/configs/crafter_rl_stepwise_shaped.toml +2 -1
  63. examples/multi_step/configs/crafter_rl_stepwise_simple.toml +2 -1
  64. examples/multi_step/configs/crafter_rl_stepwise_simple_NEW_FORMAT.toml +105 -0
  65. examples/multi_step/configs/verilog_rl_lora.toml +80 -123
  66. examples/qwen_coder/configs/coder_lora_30b.toml +1 -3
  67. examples/qwen_coder/configs/coder_lora_4b.toml +4 -1
  68. examples/qwen_coder/configs/coder_lora_small.toml +1 -3
  69. examples/qwen_vl/README.md +10 -12
  70. examples/qwen_vl/SETUP_COMPLETE.md +7 -8
  71. examples/qwen_vl/VISION_TESTS_COMPLETE.md +2 -3
  72. examples/qwen_vl/collect_data_via_cli.md +76 -84
  73. examples/qwen_vl/collect_vision_traces.py +4 -4
  74. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +40 -57
  75. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +1 -2
  76. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +20 -37
  77. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +21 -40
  78. examples/qwen_vl/configs/eval_qwen3vl_vision.toml +26 -0
  79. examples/qwen_vl/configs/{filter_qwen2vl_sft.toml → filter_qwen3vl_sft.toml} +4 -5
  80. examples/qwen_vl/configs/filter_vision_sft.toml +2 -3
  81. examples/qwen_vl/crafter_qwen_vl_agent.py +5 -5
  82. examples/qwen_vl/run_vision_comparison.sh +6 -7
  83. examples/rl/README.md +5 -5
  84. examples/rl/configs/rl_from_base_qwen.toml +26 -1
  85. examples/rl/configs/rl_from_base_qwen17.toml +6 -2
  86. examples/rl/task_app/README.md +1 -2
  87. examples/rl/task_app/math_single_step.py +2 -2
  88. examples/run_crafter_demo.sh +2 -2
  89. examples/sft/README.md +1 -1
  90. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -1
  91. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -1
  92. examples/swe/task_app/README.md +32 -2
  93. examples/swe/task_app/grpo_swe_mini.py +4 -0
  94. examples/swe/task_app/hosted/envs/crafter/react_agent.py +1 -1
  95. examples/swe/task_app/hosted/envs/mini_swe/environment.py +37 -10
  96. examples/swe/task_app/hosted/inference/openai_client.py +4 -38
  97. examples/swe/task_app/hosted/policy_routes.py +17 -0
  98. examples/swe/task_app/hosted/rollout.py +4 -2
  99. examples/swe/task_app/morph_backend.py +178 -0
  100. examples/task_apps/banking77/__init__.py +6 -0
  101. examples/task_apps/banking77/banking77_task_app.py +841 -0
  102. examples/task_apps/banking77/deploy_wrapper.py +46 -0
  103. examples/task_apps/crafter/CREATE_SFT_DATASET.md +4 -0
  104. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +4 -0
  105. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +4 -0
  106. examples/task_apps/crafter/task_app/README.md +1 -1
  107. examples/task_apps/crafter/task_app/grpo_crafter.py +90 -5
  108. examples/task_apps/crafter/task_app/grpo_crafter_task_app.py +1 -1
  109. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +4 -26
  110. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -2
  111. examples/task_apps/crafter/task_app/synth_envs_hosted/hosted_app.py +49 -0
  112. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +372 -107
  113. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +81 -12
  114. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +82 -11
  115. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +194 -1
  116. examples/task_apps/enron/task_app/grpo_enron_task_app.py +1 -1
  117. examples/task_apps/gepa_benchmarks/__init__.py +7 -0
  118. examples/task_apps/gepa_benchmarks/common.py +260 -0
  119. examples/task_apps/gepa_benchmarks/hotpotqa_task_app.py +507 -0
  120. examples/task_apps/gepa_benchmarks/hover_task_app.py +436 -0
  121. examples/task_apps/gepa_benchmarks/ifbench_task_app.py +563 -0
  122. examples/task_apps/gepa_benchmarks/pupa_task_app.py +460 -0
  123. examples/task_apps/math/README.md +1 -2
  124. examples/task_apps/pokemon_red/README.md +3 -4
  125. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +4 -0
  126. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +6 -5
  127. examples/task_apps/pokemon_red/eval_pokemon_red_policy.py +1 -2
  128. examples/task_apps/pokemon_red/task_app.py +288 -39
  129. examples/task_apps/sokoban/README.md +2 -3
  130. examples/task_apps/verilog/eval_groq_qwen32b.toml +12 -14
  131. examples/task_apps/verilog/task_app/grpo_verilog_task_app.py +1 -1
  132. examples/vlm/configs/crafter_vlm_gpt4o.toml +4 -1
  133. examples/warming_up_to_rl/configs/crafter_fft.toml +4 -1
  134. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -2
  135. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +3 -2
  136. examples/warming_up_to_rl/run_local_rollout_traced.py +1 -1
  137. examples/warming_up_to_rl/task_app/README.md +1 -1
  138. examples/warming_up_to_rl/task_app/grpo_crafter.py +185 -5
  139. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +1 -1
  140. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +3 -27
  141. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +1 -1
  142. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +49 -0
  143. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +156 -45
  144. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +37 -4
  145. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +33 -3
  146. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +67 -0
  147. examples/workflows/math_rl/configs/rl_from_base_qwen.toml +27 -0
  148. examples/workflows/math_rl/configs/rl_from_base_qwen17.toml +6 -0
  149. synth_ai/api/train/builders.py +99 -4
  150. synth_ai/api/train/cli.py +516 -26
  151. synth_ai/api/train/config_finder.py +13 -2
  152. synth_ai/api/train/configs/__init__.py +23 -2
  153. synth_ai/api/train/configs/prompt_learning.py +442 -0
  154. synth_ai/api/train/configs/rl.py +61 -7
  155. synth_ai/api/train/configs/sft.py +6 -2
  156. synth_ai/api/train/configs/shared.py +59 -2
  157. synth_ai/api/train/task_app.py +1 -1
  158. synth_ai/api/train/validators.py +277 -0
  159. synth_ai/auth/credentials.py +119 -0
  160. synth_ai/baseline/__init__.py +25 -0
  161. synth_ai/baseline/config.py +209 -0
  162. synth_ai/baseline/discovery.py +214 -0
  163. synth_ai/baseline/execution.py +146 -0
  164. synth_ai/cli/__init__.py +94 -18
  165. synth_ai/cli/__main__.py +0 -0
  166. synth_ai/cli/claude.py +70 -0
  167. synth_ai/cli/codex.py +84 -0
  168. synth_ai/cli/commands/__init__.py +18 -0
  169. synth_ai/cli/commands/baseline/__init__.py +12 -0
  170. synth_ai/cli/commands/baseline/core.py +637 -0
  171. synth_ai/cli/commands/baseline/list.py +93 -0
  172. synth_ai/cli/commands/demo/__init__.py +6 -0
  173. synth_ai/cli/commands/demo/core.py +163 -0
  174. synth_ai/cli/commands/eval/__init__.py +19 -0
  175. synth_ai/cli/commands/eval/core.py +1112 -0
  176. synth_ai/cli/commands/eval/errors.py +81 -0
  177. synth_ai/cli/commands/eval/validation.py +133 -0
  178. synth_ai/cli/commands/filter/__init__.py +12 -0
  179. synth_ai/cli/commands/filter/core.py +424 -0
  180. synth_ai/cli/commands/filter/errors.py +55 -0
  181. synth_ai/cli/commands/filter/validation.py +77 -0
  182. synth_ai/cli/commands/help/__init__.py +177 -0
  183. synth_ai/cli/commands/help/core.py +72 -0
  184. synth_ai/cli/commands/smoke/__init__.py +7 -0
  185. synth_ai/cli/commands/smoke/core.py +1436 -0
  186. synth_ai/cli/commands/status/__init__.py +64 -0
  187. synth_ai/cli/commands/status/client.py +192 -0
  188. synth_ai/cli/commands/status/config.py +92 -0
  189. synth_ai/cli/commands/status/errors.py +20 -0
  190. synth_ai/cli/commands/status/formatters.py +164 -0
  191. synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
  192. synth_ai/cli/commands/status/subcommands/files.py +79 -0
  193. synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
  194. synth_ai/cli/commands/status/subcommands/models.py +79 -0
  195. synth_ai/cli/commands/status/subcommands/pricing.py +22 -0
  196. synth_ai/cli/commands/status/subcommands/runs.py +81 -0
  197. synth_ai/cli/commands/status/subcommands/summary.py +47 -0
  198. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  199. synth_ai/cli/commands/status/utils.py +114 -0
  200. synth_ai/cli/commands/train/__init__.py +53 -0
  201. synth_ai/cli/commands/train/core.py +21 -0
  202. synth_ai/cli/commands/train/errors.py +117 -0
  203. synth_ai/cli/commands/train/judge_schemas.py +200 -0
  204. synth_ai/cli/commands/train/judge_validation.py +305 -0
  205. synth_ai/cli/commands/train/validation.py +386 -0
  206. synth_ai/cli/demo.py +30 -158
  207. synth_ai/cli/deploy/__init__.py +43 -0
  208. synth_ai/cli/deploy.py +162 -0
  209. synth_ai/cli/eval/__init__.py +36 -0
  210. synth_ai/cli/eval/core.py +5 -0
  211. synth_ai/cli/eval/errors.py +31 -0
  212. synth_ai/cli/eval/validation.py +5 -0
  213. synth_ai/cli/filter/__init__.py +28 -0
  214. synth_ai/cli/filter/core.py +5 -0
  215. synth_ai/cli/filter/errors.py +23 -0
  216. synth_ai/cli/filter/validation.py +5 -0
  217. synth_ai/cli/legacy_root_backup.py +14 -8
  218. synth_ai/cli/modal_serve/__init__.py +12 -0
  219. synth_ai/cli/modal_serve/core.py +14 -0
  220. synth_ai/cli/modal_serve/errors.py +8 -0
  221. synth_ai/cli/modal_serve/validation.py +11 -0
  222. synth_ai/cli/opencode.py +107 -0
  223. synth_ai/cli/root.py +9 -5
  224. synth_ai/cli/serve/__init__.py +12 -0
  225. synth_ai/cli/serve/core.py +14 -0
  226. synth_ai/cli/serve/errors.py +8 -0
  227. synth_ai/cli/serve/validation.py +11 -0
  228. synth_ai/cli/setup.py +20 -265
  229. synth_ai/cli/status.py +7 -126
  230. synth_ai/cli/task_app_deploy.py +1 -10
  231. synth_ai/cli/task_app_modal_serve.py +4 -9
  232. synth_ai/cli/task_app_serve.py +4 -11
  233. synth_ai/cli/task_apps.py +51 -1480
  234. synth_ai/cli/train/__init__.py +12 -0
  235. synth_ai/cli/train/core.py +21 -0
  236. synth_ai/cli/train/errors.py +8 -0
  237. synth_ai/cli/train/validation.py +24 -0
  238. synth_ai/cli/train.py +1 -14
  239. synth_ai/demos/crafter/grpo_crafter_task_app.py +1 -1
  240. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  241. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +7 -4
  242. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +9 -5
  243. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +4 -3
  244. synth_ai/environments/examples/red/engine.py +33 -12
  245. synth_ai/environments/examples/red/engine_helpers/reward_components.py +151 -179
  246. synth_ai/environments/examples/red/environment.py +26 -0
  247. synth_ai/environments/examples/red/trace_hooks_v3.py +168 -0
  248. synth_ai/http.py +12 -0
  249. synth_ai/judge_schemas.py +10 -10
  250. synth_ai/learning/__init__.py +10 -0
  251. synth_ai/learning/prompt_learning_client.py +276 -0
  252. synth_ai/learning/prompt_learning_types.py +184 -0
  253. synth_ai/learning/rl/client.py +3 -1
  254. synth_ai/pricing/__init__.py +2 -0
  255. synth_ai/pricing/model_pricing.py +57 -0
  256. synth_ai/streaming/__init__.py +29 -0
  257. synth_ai/streaming/config.py +94 -0
  258. synth_ai/streaming/handlers.py +518 -0
  259. synth_ai/streaming/streamer.py +320 -0
  260. synth_ai/streaming/types.py +95 -0
  261. synth_ai/task/apps/__init__.py +1 -0
  262. synth_ai/task/config.py +2 -0
  263. synth_ai/task/tracing_utils.py +25 -25
  264. synth_ai/task/validators.py +45 -9
  265. synth_ai/task_app_cfgs.py +21 -0
  266. synth_ai/tracing_v3/config.py +162 -19
  267. synth_ai/tracing_v3/constants.py +1 -1
  268. synth_ai/tracing_v3/db_config.py +24 -38
  269. synth_ai/tracing_v3/migration_helper.py +1 -2
  270. synth_ai/tracing_v3/storage/config.py +47 -13
  271. synth_ai/tracing_v3/storage/factory.py +3 -3
  272. synth_ai/tracing_v3/turso/daemon.py +113 -11
  273. synth_ai/tracing_v3/turso/native_manager.py +92 -16
  274. synth_ai/types.py +8 -0
  275. synth_ai/urls.py +11 -0
  276. synth_ai/utils/__init__.py +30 -1
  277. synth_ai/utils/agents.py +74 -0
  278. synth_ai/utils/bin.py +39 -0
  279. synth_ai/utils/cli.py +149 -5
  280. synth_ai/utils/env.py +40 -33
  281. synth_ai/utils/http.py +4 -1
  282. synth_ai/utils/json.py +72 -0
  283. synth_ai/utils/modal.py +285 -3
  284. synth_ai/utils/paths.py +48 -0
  285. synth_ai/utils/uvicorn.py +113 -0
  286. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/METADATA +109 -6
  287. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/RECORD +291 -142
  288. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +0 -44
  289. synth_ai/cli/tui.py +0 -62
  290. synth_ai/tui/__init__.py +0 -5
  291. synth_ai/tui/__main__.py +0 -13
  292. synth_ai/tui/cli/__init__.py +0 -1
  293. synth_ai/tui/cli/query_experiments.py +0 -164
  294. synth_ai/tui/cli/query_experiments_v3.py +0 -164
  295. synth_ai/tui/dashboard.py +0 -911
  296. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/WHEEL +0 -0
  297. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/entry_points.txt +0 -0
  298. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/licenses/LICENSE +0 -0
  299. {synth_ai-0.2.16.dist-info → synth_ai-0.2.19.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,563 @@
1
+ """IFBench instruction-following task app."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+ import os
7
+ import re
8
+ import uuid
9
+ from collections.abc import Iterable, Sequence
10
+ from pathlib import Path
11
+ from typing import Any, Mapping, cast
12
+
13
+ from datasets import load_dataset
14
+ from fastapi import APIRouter, HTTPException, Request
15
+
16
+ from synth_ai.task.apps import ModalDeploymentConfig, TaskAppEntry, register_task_app
17
+ from synth_ai.task.contracts import (
18
+ RolloutMetrics,
19
+ RolloutRequest,
20
+ RolloutResponse,
21
+ RolloutStep,
22
+ RolloutTrajectory,
23
+ TaskInfo,
24
+ )
25
+ from synth_ai.task.datasets import TaskDatasetRegistry, TaskDatasetSpec
26
+ from synth_ai.task.rubrics import Rubric, load_rubric
27
+ from synth_ai.task.server import ProxyConfig, RubricBundle, TaskAppConfig
28
+ from synth_ai.task.vendors import normalize_vendor_keys
29
+
30
+ from .common import (
31
+ call_chat_completion,
32
+ count_emojis,
33
+ count_numbers,
34
+ count_pronouns,
35
+ sentence_split,
36
+ tokenize,
37
+ unique_word_count,
38
+ )
39
+
40
# parents[3]: the directory three levels above the one that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[3]

# Hugging Face dataset identifier and the splits this app exposes.
DATASET_ID = "allenai/IFBench_test"
AVAILABLE_SPLITS: tuple[str, ...] = ("train",)
DEFAULT_SPLIT = "train"


# Router object created for this task app.
ifbench_router = APIRouter()


# Static dataset description shared by the task info and taskset helpers.
IFBENCH_DATASET_SPEC = TaskDatasetSpec(
    id="ifbench",
    name="IFBench Instruction Following",
    version="1.0.0",
    splits=[*AVAILABLE_SPLITS],
    default_split=DEFAULT_SPLIT,
    description="Instruction following benchmark with programmatically-checked constraints.",
)
58
+
59
# Constraint ids that this module knows how to check programmatically.
# Dataset rows carrying any other id are filtered out when a split is loaded.
SUPPORTED_INSTRUCTIONS = {
    # keyword / sentence constraints
    "count:keywords_multiple",
    "sentence:keyword",
    # counting constraints
    "count:numbers",
    "count:word_count_range",
    "count:unique_word_count",
    "count:pronouns",
    # formatting constraints
    "format:list",
    "format:emoji",
}
69
+
70
+
71
class IFBenchDataset:
    """Per-split cache of IFBench rows restricted to automatically scorable ones.

    A row is kept only when every id in its ``instruction_id_list`` is a member
    of ``SUPPORTED_INSTRUCTIONS``. Splits are downloaded on first use.
    """

    def __init__(self) -> None:
        # split name -> filtered rows; populated lazily by ``_load_split``.
        self._cache: dict[str, list[dict[str, Any]]] = {}

    def _load_split(self, split: str) -> list[dict[str, Any]]:
        """Return the filtered rows for *split*, downloading them on first access.

        Raises:
            ValueError: *split* is not listed in ``AVAILABLE_SPLITS``.
            RuntimeError: the download failed, or no row survived filtering.
        """
        if split not in AVAILABLE_SPLITS:
            raise ValueError(f"Unknown split '{split}'. Available: {AVAILABLE_SPLITS}")
        if split in self._cache:
            return self._cache[split]

        try:
            raw = load_dataset(DATASET_ID, split=split)
        except Exception as exc:  # pragma: no cover
            raise RuntimeError(
                f"Failed to download IFBench split '{split}'. Ensure network access."
            ) from exc

        supported_rows = []
        for row in raw:
            row_ids = row.get("instruction_id_list") or ()
            if set(row_ids).issubset(SUPPORTED_INSTRUCTIONS):
                supported_rows.append(row)
        if not supported_rows:
            raise RuntimeError(
                f"No IFBench samples remain after filtering for supported instructions ({SUPPORTED_INSTRUCTIONS})."
            )

        self._cache[split] = supported_rows
        return supported_rows

    def ensure_ready(self, splits: Sequence[str]) -> None:
        """Eagerly load every split named in *splits*."""
        for name in splits:
            self._load_split(name)

    def size(self, split: str) -> int:
        """Return the number of filtered rows available in *split*."""
        rows = self._load_split(split)
        return len(rows)

    def sample(self, *, split: str, index: int) -> dict[str, Any]:
        """Return the row at ``index`` (taken modulo the split size) as a plain dict.

        The result has the keys ``index``, ``split``, ``prompt`` and
        ``instructions``; each instruction is ``{"id": ..., "kwargs": ...}``
        built by pairing the row's instruction ids with its kwargs entries.

        Raises:
            RuntimeError: the split holds no rows.
        """
        rows = self._load_split(split)
        count = len(rows)
        if count == 0:
            raise RuntimeError(f"IFBench split '{split}' is empty")

        # Wrap around so any integer seed maps onto a valid row.
        position = int(index) % count
        row = rows[position]

        instr_ids = row.get("instruction_id_list") or []
        instr_kwargs = row.get("kwargs") or []
        instructions = [
            {"id": str(instr_id), "kwargs": params or {}}
            for instr_id, params in zip(instr_ids, instr_kwargs)
        ]

        return {
            "index": position,
            "split": split,
            "prompt": str(row.get("prompt") or ""),
            "instructions": instructions,
        }
131
+
132
+
133
+ def _summarise_kwargs(kwargs: Mapping[str, Any]) -> str:
134
+ items = []
135
+ for key, value in kwargs.items():
136
+ if value in (None, "", [], {}):
137
+ continue
138
+ items.append(f"{key}={value}")
139
+ return ", ".join(items) if items else "default"
140
+
141
+
142
+ _KEYWORD_PATTERN = re.compile(
143
+ r"keyword\s+([a-z0-9_-]+)\s+(once|twice|\d+\s+times?)",
144
+ flags=re.IGNORECASE,
145
+ )
146
+
147
+
148
+ def _extract_keyword_targets(prompt: str, keywords: Sequence[str]) -> dict[str, int]:
149
+ targets: dict[str, int] = {}
150
+ for match in _KEYWORD_PATTERN.finditer(prompt):
151
+ word = match.group(1)
152
+ if word not in keywords:
153
+ continue
154
+ count_str = match.group(2).lower()
155
+ if count_str == "once":
156
+ targets[word] = 1
157
+ elif count_str == "twice":
158
+ targets[word] = 2
159
+ else:
160
+ digit_match = re.search(r"\d+", count_str)
161
+ targets[word] = int(digit_match.group()) if digit_match else 1
162
+ return targets
163
+
164
+
165
def _evaluate_instruction(
    instr_id: str,
    kwargs: Mapping[str, Any],
    prompt: str,
    response: str,
) -> tuple[bool, dict[str, Any]]:
    """Check a single IFBench constraint against *response*.

    Args:
        instr_id: Constraint identifier, e.g. ``"count:numbers"``.
        kwargs: Constraint parameters. Missing or falsy entries fall back to
            the per-constraint defaults used in each branch below.
        prompt: Original task prompt; only ``count:keywords_multiple`` reads
            it, to recover the requested per-keyword counts.
        response: Model output being scored.

    Returns:
        ``(passed, details)`` where *details* records the values that were
        compared. Unrecognised ids yield ``(False, {"unsupported": True})``.
    """
    tokens = tokenize(response)
    details: dict[str, Any] = {}

    if instr_id == "count:keywords_multiple":
        raw_keywords = (kwargs.get(f"keyword{slot}") for slot in range(1, 6))
        keywords = [str(word) for word in raw_keywords if word]
        targets = _extract_keyword_targets(prompt, keywords)
        passes = True
        occurrences: dict[str, int] = {}
        for word in keywords:
            # A keyword with no explicit target must appear at least once.
            expected = targets.get(word, 1)
            actual = len(re.findall(rf"\b{re.escape(word)}\b", response, flags=re.IGNORECASE))
            occurrences[word] = actual
            if actual < expected:
                passes = False
        details.update({"keywords": keywords, "counts": occurrences, "targets": targets})
        return passes, details

    if instr_id == "sentence:keyword":
        target_word = str(kwargs.get("word") or "").strip()
        expected = int(kwargs.get("N") or 1)
        if not target_word:
            # An empty word would build the pattern r"\b\b", which matches
            # nearly any sentence and makes the check pass vacuously.
            details.update({"word": target_word, "required": expected, "actual": 0})
            return False, details
        word_pattern = re.compile(rf"\b{re.escape(target_word)}\b", re.IGNORECASE)
        satisfied = sum(
            1 for sentence in sentence_split(response) if word_pattern.search(sentence)
        )
        details.update({"word": target_word, "required": expected, "actual": satisfied})
        return satisfied >= expected, details

    if instr_id == "count:numbers":
        expected = int(kwargs.get("N") or 0)
        actual = count_numbers(response)
        details.update({"required": expected, "actual": actual})
        return actual >= expected, details

    if instr_id == "count:word_count_range":
        min_words = int(kwargs.get("min_words") or 0)
        max_words = int(kwargs.get("max_words") or 10_000)
        word_count = len(tokens)
        details.update({"min": min_words, "max": max_words, "actual": word_count})
        return min_words <= word_count <= max_words, details

    if instr_id == "count:unique_word_count":
        expected = int(kwargs.get("N") or 0)
        actual = unique_word_count(tokens)
        details.update({"required": expected, "actual": actual})
        return actual >= expected, details

    if instr_id == "count:pronouns":
        expected = int(kwargs.get("N") or 0)
        actual = count_pronouns(tokens)
        details.update({"required": expected, "actual": actual})
        return actual >= expected, details

    if instr_id == "format:list":
        # Fall back to "-" when the separator is missing or whitespace-only.
        # An empty separator would make startswith() true for every line.
        separator = str(kwargs.get("sep") or "").strip() or "-"
        lines = [line.strip() for line in response.splitlines() if line.strip()]
        bullet_lines = [line for line in lines if line.startswith(separator)]
        details.update({"separator": separator, "bullet_count": len(bullet_lines)})
        return len(bullet_lines) >= 2, details

    if instr_id == "format:emoji":
        expected = int(kwargs.get("N") or 1)
        emoji_count = count_emojis(response)
        details.update({"required": expected, "actual": emoji_count})
        return emoji_count >= expected, details

    return False, {"unsupported": True}
244
+
245
+
246
def evaluate_ifbench(prompt: str, instructions: Sequence[Mapping[str, Any]], response: str) -> tuple[float, dict[str, Any]]:
    """Score *response* against every constraint in *instructions*.

    Returns:
        ``(reward, summary)``. *reward* is the fraction of supported
        constraints that passed (``0.0`` when none are supported). *summary*
        holds ``passed``, ``total`` and per-constraint ``details`` keyed by
        instruction id. Ids outside ``SUPPORTED_INSTRUCTIONS`` are recorded in
        the details but do not count toward ``total``.
    """
    per_instruction: dict[str, Any] = {}
    satisfied = 0
    scored = 0
    for entry in instructions:
        instr_id = str(entry.get("id") or "")
        ok, info = _evaluate_instruction(instr_id, entry.get("kwargs") or {}, prompt, response)
        per_instruction[instr_id] = {"pass": ok, **info}
        if instr_id not in SUPPORTED_INSTRUCTIONS:
            continue
        scored += 1
        satisfied += 1 if ok else 0

    reward = satisfied / scored if scored else 0.0
    return reward, {"passed": satisfied, "total": scored, "details": per_instruction}
261
+
262
+
263
async def rollout_executor(request: RolloutRequest, fastapi_request: Request) -> RolloutResponse:
    """Run a single-step IFBench rollout and package the scored result.

    A task is sampled from the dataset stored on the app state (the seed is
    the row index). The policy is queried through ``call_chat_completion``,
    its reply is scored with ``evaluate_ifbench``, and the outcome is wrapped
    in a ``RolloutResponse``. A failed completion call is not raised: the
    error is stored in the step info and the empty response text is scored.
    """
    ifbench: IFBenchDataset = fastapi_request.app.state.ifbench_dataset

    env_config = request.env.config or {}
    split_name = str(env_config.get("split") or DEFAULT_SPLIT)
    seed = request.env.seed or 0

    task = ifbench.sample(split=split_name, index=seed)

    # One "- id (k=v, ...)" bullet per constraint, for the user message.
    constraints_text = "\n".join(
        f"- {instr['id']} ({_summarise_kwargs(instr['kwargs'])})" for instr in task["instructions"]
    )

    observation = {
        "prompt": task["prompt"],
        "instructions": task["instructions"],
        "index": task["index"],
        "split": task["split"],
    }

    placeholders = {
        "prompt": task["prompt"],
        "instructions": constraints_text,
    }

    system_message = {
        "role": "system",
        "pattern": (
            "You must follow every instruction exactly. Produce a single response that satisfies all constraints."
        ),
    }
    user_message = {
        "role": "user",
        "pattern": "Instructions:\n{instructions}\n\nTask:\n{prompt}",
    }
    default_messages = [system_message, user_message]

    response_text = ""
    response_json: dict[str, Any] | None = None
    error_info: dict[str, Any] = {}

    try:
        response_text, response_json, _ = await call_chat_completion(
            request.policy.config or {},
            placeholders,
            default_messages,
        )
    except HTTPException as http_err:  # pragma: no cover
        error_info = {"error": str(http_err.detail), "code": http_err.status_code}
    except Exception as exc:  # pragma: no cover
        error_info = {"error": str(exc)}

    reward, eval_details = evaluate_ifbench(task["prompt"], task["instructions"], response_text)
    eval_details["response_json"] = response_json
    eval_details.update(error_info)

    # Logging is best-effort; a failure here must not abort the rollout.
    with contextlib.suppress(Exception):
        print(
            f"[IFBENCH_ROLLOUT] run_id={request.run_id} index={task['index']} "
            f"passed={eval_details['passed']}/{eval_details['total']} reward={reward:.3f}",
            flush=True,
        )

    step = RolloutStep(
        obs=observation,
        tool_calls=[],
        reward=reward,
        done=True,
        info=eval_details,
    )

    inference_url = (request.policy.config or {}).get("inference_url")
    trajectory = RolloutTrajectory(
        env_id=f"ifbench::{task['split']}::{task['index']}",
        policy_id=request.policy.policy_id or request.policy.policy_name or "policy",
        steps=[step],
        final={"observation": observation, "reward": reward},
        length=1,
        inference_url=str(inference_url or ""),
    )

    constraints_passed = eval_details.get("passed")
    constraints_total = eval_details.get("total")

    metrics = RolloutMetrics(
        episode_returns=[reward],
        mean_return=reward,
        num_steps=1,
        num_episodes=1,
        outcome_score=reward,
        events_score=reward,
        details={"constraints_passed": constraints_passed, "constraints_total": constraints_total},
    )

    # A trace is attached when the request asks for one or the env var is set.
    requested_trace = request.record and getattr(request.record, "return_trace", False)
    include_trace = bool(requested_trace or os.getenv("TASKAPP_TRACING_ENABLED"))
    trace_payload = (
        {
            "session_id": str(uuid.uuid4()),
            "events_count": 1,
            "decision_rewards": [reward],
            "metadata": {
                "env": "ifbench",
                "split": task["split"],
                "index": task["index"],
                "constraints_passed": constraints_passed,
                "constraints_total": constraints_total,
            },
        }
        if include_trace
        else None
    )

    return RolloutResponse(
        run_id=request.run_id,
        trajectories=[trajectory],
        branches={},
        metrics=metrics,
        aborted=False,
        ops_executed=2,
        trace=trace_payload,
    )
383
+
384
+
385
def build_dataset() -> tuple[TaskDatasetRegistry, IFBenchDataset]:
    """Create the dataset registry together with a preloaded IFBench dataset.

    The default split is loaded eagerly, so download or filtering problems
    surface here. The registry entry always resolves to that same instance.
    """
    registry = TaskDatasetRegistry()
    dataset = IFBenchDataset()
    dataset.ensure_ready([DEFAULT_SPLIT])

    def _loader(_spec: Any) -> IFBenchDataset:
        # The spec argument is ignored; one shared dataset backs every lookup.
        return dataset

    registry.register(IFBENCH_DATASET_SPEC, _loader, cache=True)
    return registry, dataset
391
+
392
+
393
def _base_task_info() -> TaskInfo:
    """Build the ``TaskInfo`` template shared by every IFBench task instance."""
    task_descriptor = {
        "id": "ifbench",
        "name": "IFBench Instruction Following",
        "version": "1.0.0",
        "action_space": {
            "type": "free_text",
            "description": "Generate a completion that satisfies all constraints.",
        },
    }
    dataset_descriptor = {
        **IFBENCH_DATASET_SPEC.model_dump(),
        "hf_dataset": DATASET_ID,
    }
    rubric_descriptor = {
        "version": "1",
        "criteria_count": 1,
        "source": "inline",
    }
    inference_descriptor = {
        "supports_proxy": True,
        "tool": None,
    }
    return TaskInfo(
        task=task_descriptor,
        environment="ifbench",
        dataset=dataset_descriptor,
        rubric=rubric_descriptor,
        inference=inference_descriptor,
        limits={"max_turns": 1},
        task_metadata={"supported_instructions": sorted(SUPPORTED_INSTRUCTIONS)},
    )
421
+
422
+
423
def describe_taskset(dataset: IFBenchDataset) -> Mapping[str, Any]:
    """Describe the taskset: the dataset spec fields plus source id, supported ids and sizes."""
    summary: dict[str, Any] = {**IFBENCH_DATASET_SPEC.model_dump()}
    summary["hf_dataset"] = DATASET_ID
    summary["supported_instructions"] = sorted(SUPPORTED_INSTRUCTIONS)
    # Number of filtered rows per exposed split.
    summary["sizes"] = {name: dataset.size(name) for name in AVAILABLE_SPLITS}
    return summary
430
+
431
+
432
def provide_task_instances(dataset: IFBenchDataset, seeds: Sequence[int]) -> Iterable[TaskInfo]:
    """Yield one ``TaskInfo`` per seed, each describing a row of the default split.

    Every instance copies the base task info, then adds the sampled row's
    split and index to the dataset section and the first 80 characters of its
    prompt to the task metadata.
    """
    template = _base_task_info()
    for seed in seeds:
        picked = dataset.sample(split=DEFAULT_SPLIT, index=seed)
        dataset_section = {
            **template.dataset,
            "split": picked["split"],
            "index": picked["index"],
        }
        metadata = {**template.task_metadata, "prompt": picked["prompt"][:80]}
        yield TaskInfo(
            task=template.task,
            environment=template.environment,
            dataset=dataset_section,
            rubric=template.rubric,
            inference=template.inference,
            limits=template.limits,
            task_metadata=metadata,
        )
449
+
450
+
451
def _single_criterion_rubric(goal_text: str, criterion_id: str, description: str) -> Rubric:
    """Load a version "1" ``weighted_sum`` rubric holding one criterion of weight 1.0."""
    spec = {
        "version": "1",
        "goal_text": goal_text,
        "aggregation": "weighted_sum",
        "criteria": [
            {
                "id": criterion_id,
                "description": description,
                "weight": 1.0,
            }
        ],
    }
    # cast() only informs the type checker; presumably load_rubric is
    # annotated with a wider return type than Rubric -- TODO confirm.
    return cast(Rubric, load_rubric(spec))


# Rubric applied to the overall outcome: were the constraints satisfied?
OUTCOME_RUBRIC: Rubric = _single_criterion_rubric(
    "Satisfy the IFBench constraints.",
    "constraint_satisfaction",
    "Meets all programmatically-checked constraints.",
)

# Rubric applied to events: is the response concise?
EVENTS_RUBRIC: Rubric = _single_criterion_rubric(
    "Keep responses concise while following instructions.",
    "concise_answer",
    "Avoid unnecessary content while satisfying constraints.",
)
486
+
487
+
488
def build_config() -> TaskAppConfig:
    """Assemble the ``TaskAppConfig`` for the IFBench task app.

    A fresh dataset and registry are built on every call. Each vendor proxy
    is enabled only when ``normalize_vendor_keys()`` reports a non-``None``
    value for the matching API key.
    """
    registry, dataset = build_dataset()
    base_info = _base_task_info()

    vendor_keys = normalize_vendor_keys()
    openai_available = vendor_keys.get("OPENAI_API_KEY") is not None
    groq_available = vendor_keys.get("GROQ_API_KEY") is not None
    proxy_settings = ProxyConfig(
        enable_openai=openai_available,
        enable_groq=groq_available,
        system_hint="Follow every instruction exactly. Violations are failures.",
    )

    # The callbacks close over the dataset built above, so the config exposes
    # them without requiring a dataset argument.
    def _describe():
        return describe_taskset(dataset)

    def _instances(seeds):
        return provide_task_instances(dataset, seeds)

    return TaskAppConfig(
        app_id="ifbench",
        name="IFBench Instruction Following Task",
        description="IFBench task app with automatic constraint checking for prompt optimisation.",
        base_task_info=base_info,
        describe_taskset=_describe,
        provide_task_instances=_instances,
        rollout=rollout_executor,
        dataset_registry=registry,
        rubrics=RubricBundle(outcome=OUTCOME_RUBRIC, events=EVENTS_RUBRIC),
        proxy=proxy_settings,
        routers=(ifbench_router,),
        app_state={"ifbench_dataset": dataset},
        cors_origins=["*"],
    )
515
+
516
+
517
# Modal deployment settings for the app: the pip requirements plus the local
# ``synth_ai`` directory paired with its destination path.
_ifbench_modal = ModalDeploymentConfig(
    app_name="synth-ifbench",
    pip_packages=(
        "datasets>=2.14.0",
        "fastapi>=0.115.0",
        "pydantic>=2.0.0",
        "httpx>=0.26.0",
    ),
    extra_local_dirs=((str(REPO_ROOT / "synth_ai"), "/opt/synth_ai_repo/synth_ai"),),
)

# Registry entry: app id "ifbench" with the alias "ifbench-instructions".
# ``build_config`` is passed uncalled, as the config factory.
_ifbench_entry = TaskAppEntry(
    app_id="ifbench",
    description="IFBench task app using automatically scored constraint subsets.",
    config_factory=build_config,
    aliases=("ifbench-instructions",),
    modal=_ifbench_modal,
)

# Registration happens as an import-time side effect of this module.
register_task_app(entry=_ifbench_entry)
535
+
536
+
537
def _main() -> None:  # pragma: no cover - manual helper
    """Parse command-line options and serve the IFBench task app locally."""
    import argparse

    from synth_ai.task.server import run_task_app

    cli = argparse.ArgumentParser(description="Run the IFBench task app locally")
    cli.add_argument("--host", default="0.0.0.0")
    cli.add_argument("--port", type=int, default=8111)
    cli.add_argument("--reload", action="store_true", help="Enable uvicorn autoreload")
    cli.add_argument(
        "--env-file",
        action="append",
        default=[],
        help="Additional .env files to load before startup",
    )
    options = cli.parse_args()

    # A ``.env`` two directories above this file is listed first when it
    # exists; files passed via --env-file follow it.
    default_env = Path(__file__).resolve().parents[2] / ".env"
    env_files = []
    if default_env.exists():
        env_files.append(str(default_env))
    env_files.extend(options.env_file or [])

    run_task_app(
        build_config,
        host=options.host,
        port=options.port,
        reload=options.reload,
        env_files=env_files,
    )


if __name__ == "__main__":  # pragma: no cover - manual helper
    _main()