synth-ai 0.2.9.dev7__py3-none-any.whl → 0.2.9.dev8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of synth-ai might be problematic.

Files changed (327)
  1. examples/__init__.py +16 -0
  2. examples/crafter_debug_render.py +8 -11
  3. examples/qwen_coder/README.md +102 -0
  4. examples/qwen_coder/_shared.py +113 -0
  5. examples/qwen_coder/configs/coder_lora_30b.toml +61 -0
  6. examples/qwen_coder/configs/coder_lora_4b.toml +57 -0
  7. examples/qwen_coder/configs/coder_lora_small.toml +58 -0
  8. examples/qwen_coder/generate_dataset.py +98 -0
  9. examples/qwen_coder/infer_ft_smoke.py +64 -0
  10. examples/qwen_coder/infer_prod_proxy.py +73 -0
  11. examples/qwen_coder/infer_via_synth.py +87 -0
  12. examples/qwen_coder/scripts/infer_coder.sh +18 -0
  13. examples/qwen_coder/scripts/train_coder_30b.sh +21 -0
  14. examples/qwen_coder/sft_full_17b.py +103 -0
  15. examples/qwen_coder/sft_lora_30b.py +110 -0
  16. examples/qwen_coder/subset_jsonl.py +38 -0
  17. examples/qwen_coder/validate_jsonl.py +59 -0
  18. examples/rl/run_eval.py +36 -37
  19. examples/rl/run_rl_and_save.py +5 -5
  20. examples/rl/task_app/math_single_step.py +65 -43
  21. examples/rl/task_app/math_task_app.py +3 -3
  22. examples/sft/README.md +139 -0
  23. examples/sft/configs/crafter_fft_qwen0p6b.toml +44 -0
  24. examples/sft/configs/crafter_lora_qwen0p6b.toml +45 -0
  25. examples/sft/evaluate.py +117 -0
  26. examples/sft/export_dataset.py +117 -0
  27. examples/sft/generate_traces.py +162 -0
  28. examples/swe/__init__.py +12 -0
  29. examples/swe/task_app/README.md +105 -0
  30. examples/swe/task_app/__init__.py +2 -0
  31. examples/swe/task_app/grpo_swe_mini.py +571 -0
  32. examples/swe/task_app/grpo_swe_mini_task_app.py +136 -0
  33. examples/swe/task_app/hosted/README.md +173 -0
  34. examples/swe/task_app/hosted/__init__.py +5 -0
  35. examples/swe/task_app/hosted/branching.py +143 -0
  36. examples/swe/task_app/hosted/environment_routes.py +1289 -0
  37. examples/swe/task_app/hosted/envs/__init__.py +1 -0
  38. examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
  39. examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
  40. examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
  41. examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
  42. examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
  43. examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
  44. examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
  45. examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
  46. examples/swe/task_app/hosted/envs/mini_swe/environment.py +1164 -0
  47. examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
  48. examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
  49. examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
  50. examples/swe/task_app/hosted/hosted_app.py +204 -0
  51. examples/swe/task_app/hosted/inference/__init__.py +5 -0
  52. examples/swe/task_app/hosted/inference/openai_client.py +618 -0
  53. examples/swe/task_app/hosted/main.py +100 -0
  54. examples/swe/task_app/hosted/policy_routes.py +1079 -0
  55. examples/swe/task_app/hosted/registry.py +195 -0
  56. examples/swe/task_app/hosted/rollout.py +1869 -0
  57. examples/swe/task_app/hosted/storage/__init__.py +5 -0
  58. examples/swe/task_app/hosted/storage/volume.py +211 -0
  59. examples/swe/task_app/hosted/test_agents.py +161 -0
  60. examples/swe/task_app/hosted/test_service.py +137 -0
  61. examples/swe/task_app/hosted/utils.py +62 -0
  62. examples/vlm/README.md +68 -0
  63. examples/vlm/configs/crafter_vlm_gpt4o.toml +44 -0
  64. examples/vlm/crafter_image_only_agent.py +207 -0
  65. examples/vlm/crafter_openai_vlm_agent.py +277 -0
  66. examples/vlm/filter_image_rows.py +63 -0
  67. examples/vlm/run_crafter_vlm_benchmark.py +316 -0
  68. examples/warming_up_to_rl/analyze_trace_db.py +5 -5
  69. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +11 -1
  70. examples/warming_up_to_rl/export_trace_sft.py +78 -21
  71. examples/warming_up_to_rl/groq_test.py +4 -4
  72. examples/warming_up_to_rl/manage_secrets.py +13 -18
  73. examples/warming_up_to_rl/run_eval.py +42 -44
  74. examples/warming_up_to_rl/run_fft_and_save.py +11 -16
  75. examples/warming_up_to_rl/run_local_rollout.py +1 -3
  76. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -4
  77. examples/warming_up_to_rl/run_local_rollout_parallel.py +1 -4
  78. examples/warming_up_to_rl/run_local_rollout_traced.py +3 -5
  79. examples/warming_up_to_rl/run_rl_and_save.py +5 -6
  80. examples/warming_up_to_rl/run_rollout_remote.py +8 -10
  81. examples/warming_up_to_rl/task_app/README.md +6 -2
  82. examples/warming_up_to_rl/task_app/grpo_crafter.py +234 -35
  83. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +2 -3
  84. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +1 -1
  85. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +9 -11
  86. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +131 -114
  87. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +101 -41
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +73 -51
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +14 -6
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +16 -16
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +32 -34
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +94 -31
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -2
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +303 -203
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +21 -23
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +328 -225
  97. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +13 -13
  98. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +1 -0
  99. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +1 -0
  100. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +4 -3
  101. synth/__init__.py +14 -0
  102. synth_ai/__init__.py +26 -4
  103. synth_ai/api/models/supported.py +376 -0
  104. synth_ai/api/train/builders.py +128 -21
  105. synth_ai/api/train/cli.py +80 -64
  106. synth_ai/api/train/config_finder.py +7 -2
  107. synth_ai/api/train/env_resolver.py +1 -1
  108. synth_ai/api/train/pollers.py +2 -1
  109. synth_ai/api/train/supported_algos.py +139 -0
  110. synth_ai/api/train/task_app.py +1 -2
  111. synth_ai/api/train/utils.py +13 -44
  112. synth_ai/cli/__init__.py +8 -0
  113. synth_ai/cli/_modal_wrapper.py +28 -0
  114. synth_ai/cli/_typer_patch.py +49 -0
  115. synth_ai/cli/balance.py +1 -2
  116. synth_ai/cli/calc.py +1 -1
  117. synth_ai/cli/demo.py +2 -1
  118. synth_ai/cli/recent.py +2 -2
  119. synth_ai/cli/rl_demo.py +2 -1
  120. synth_ai/cli/root.py +11 -13
  121. synth_ai/cli/status.py +2 -2
  122. synth_ai/cli/task_apps.py +529 -179
  123. synth_ai/cli/traces.py +6 -4
  124. synth_ai/cli/watch.py +12 -18
  125. synth_ai/demo_registry.py +1 -1
  126. synth_ai/demos/core/cli.py +36 -43
  127. synth_ai/demos/demo_task_apps/__init__.py +3 -3
  128. synth_ai/demos/demo_task_apps/core.py +17 -25
  129. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +3 -4
  130. synth_ai/demos/demo_task_apps/math/app.py +2 -1
  131. synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -4
  132. synth_ai/demos/demo_task_apps/math/modal_task_app.py +16 -18
  133. synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -1
  134. synth_ai/environments/examples/crafter_classic/environment.py +76 -1
  135. synth_ai/environments/reproducibility/tree.py +2 -5
  136. synth_ai/environments/service/app.py +11 -12
  137. synth_ai/environments/service/core_routes.py +4 -7
  138. synth_ai/environments/stateful/engine.py +1 -1
  139. synth_ai/environments/tasks/core.py +1 -0
  140. synth_ai/environments/tasks/filters.py +5 -6
  141. synth_ai/environments/tasks/utils.py +4 -5
  142. synth_ai/handshake.py +9 -9
  143. synth_ai/http.py +1 -1
  144. synth_ai/http_client.py +18 -10
  145. synth_ai/inference/client.py +15 -5
  146. synth_ai/jobs/client.py +78 -83
  147. synth_ai/learning/__init__.py +41 -6
  148. synth_ai/learning/algorithms.py +14 -0
  149. synth_ai/learning/client.py +91 -24
  150. synth_ai/learning/config.py +2 -38
  151. synth_ai/learning/ft_client.py +4 -59
  152. synth_ai/learning/health.py +5 -6
  153. synth_ai/learning/jobs.py +31 -47
  154. synth_ai/{rl → learning/rl}/__init__.py +14 -4
  155. synth_ai/learning/rl/client.py +267 -0
  156. synth_ai/learning/rl/config.py +31 -0
  157. synth_ai/{rl → learning/rl}/contracts.py +5 -8
  158. synth_ai/{rl → learning/rl}/env_keys.py +39 -15
  159. synth_ai/learning/rl/secrets.py +13 -0
  160. synth_ai/learning/rl_client.py +2 -281
  161. synth_ai/learning/sft/__init__.py +29 -0
  162. synth_ai/learning/sft/client.py +68 -0
  163. synth_ai/learning/sft/config.py +270 -0
  164. synth_ai/learning/sft/data.py +295 -0
  165. synth_ai/learning/sse.py +25 -24
  166. synth_ai/learning/validators.py +25 -28
  167. synth_ai/lm/__init__.py +21 -47
  168. synth_ai/main.py +4 -0
  169. synth_ai/task/__init__.py +25 -27
  170. synth_ai/task/apps/__init__.py +7 -8
  171. synth_ai/task/auth.py +8 -8
  172. synth_ai/task/client.py +14 -14
  173. synth_ai/task/contracts.py +36 -35
  174. synth_ai/task/datasets.py +6 -5
  175. synth_ai/task/errors.py +10 -10
  176. synth_ai/task/health.py +17 -9
  177. synth_ai/task/json.py +58 -23
  178. synth_ai/task/proxy.py +13 -9
  179. synth_ai/task/rubrics.py +16 -15
  180. synth_ai/task/server.py +12 -12
  181. synth_ai/task/tracing_utils.py +4 -4
  182. synth_ai/task/vendors.py +5 -6
  183. synth_ai/tracing_v3/__init__.py +2 -0
  184. synth_ai/tracing_v3/abstractions.py +21 -4
  185. synth_ai/tracing_v3/decorators.py +18 -16
  186. synth_ai/tracing_v3/hooks.py +5 -5
  187. synth_ai/tracing_v3/llm_call_record_helpers.py +6 -6
  188. synth_ai/tracing_v3/session_tracer.py +40 -14
  189. synth_ai/tracing_v3/storage/base.py +85 -0
  190. synth_ai/tracing_v3/storage/config.py +21 -8
  191. synth_ai/tracing_v3/storage/factory.py +10 -7
  192. synth_ai/tracing_v3/storage/utils.py +4 -2
  193. synth_ai/tracing_v3/turso/daemon.py +7 -2
  194. synth_ai/tracing_v3/turso/models.py +2 -2
  195. synth_ai/tracing_v3/turso/native_manager.py +1173 -0
  196. synth_ai/tracing_v3/utils.py +4 -4
  197. synth_ai/v0/api/__init__.py +8 -0
  198. synth_ai/v0/api/models/__init__.py +8 -0
  199. synth_ai/v0/api/models/supported.py +8 -0
  200. synth_ai/v0/config/__init__.py +15 -0
  201. synth_ai/v0/config/base_url.py +12 -0
  202. synth_ai/v0/lm/__init__.py +51 -0
  203. synth_ai/{lm → v0/lm}/caching/ephemeral.py +2 -2
  204. synth_ai/{lm → v0/lm}/caching/handler.py +4 -4
  205. synth_ai/{lm → v0/lm}/caching/initialize.py +1 -1
  206. synth_ai/{lm → v0/lm}/caching/persistent.py +1 -1
  207. synth_ai/{lm → v0/lm}/config.py +6 -1
  208. synth_ai/{lm → v0/lm}/core/all.py +9 -9
  209. synth_ai/{lm → v0/lm}/core/main.py +6 -6
  210. synth_ai/{lm → v0/lm}/core/main_v3.py +10 -10
  211. synth_ai/{lm → v0/lm}/core/synth_models.py +2 -14
  212. synth_ai/{lm → v0/lm}/core/vendor_clients.py +2 -2
  213. synth_ai/{lm → v0/lm}/overrides.py +2 -2
  214. synth_ai/{lm → v0/lm}/provider_support/anthropic.py +4 -4
  215. synth_ai/{lm → v0/lm}/provider_support/openai.py +5 -5
  216. synth_ai/{lm → v0/lm}/structured_outputs/handler.py +5 -5
  217. synth_ai/{lm → v0/lm}/structured_outputs/rehabilitate.py +1 -1
  218. synth_ai/{lm → v0/lm}/vendors/core/anthropic_api.py +9 -9
  219. synth_ai/{lm → v0/lm}/vendors/core/gemini_api.py +5 -5
  220. synth_ai/{lm → v0/lm}/vendors/core/mistral_api.py +5 -5
  221. synth_ai/{lm → v0/lm}/vendors/core/openai_api.py +10 -10
  222. synth_ai/{lm → v0/lm}/vendors/openai_standard.py +8 -8
  223. synth_ai/{lm → v0/lm}/vendors/openai_standard_responses.py +2 -2
  224. synth_ai/{lm → v0/lm}/vendors/supported/custom_endpoint.py +3 -3
  225. synth_ai/{lm → v0/lm}/vendors/supported/deepseek.py +2 -2
  226. synth_ai/{lm → v0/lm}/vendors/supported/grok.py +2 -2
  227. synth_ai/{lm → v0/lm}/vendors/supported/groq.py +1 -1
  228. synth_ai/{lm → v0/lm}/vendors/supported/ollama.py +1 -1
  229. synth_ai/{lm → v0/lm}/vendors/supported/openrouter.py +3 -3
  230. synth_ai/{lm → v0/lm}/vendors/supported/together.py +1 -1
  231. synth_ai/{lm → v0/lm}/vendors/synth_client.py +1 -1
  232. synth_ai/v0/tracing_v3/__init__.py +10 -0
  233. synth_ai/v0/tracing_v3/abstractions.py +3 -0
  234. synth_ai/v0/tracing_v3/decorators.py +3 -0
  235. synth_ai/v0/tracing_v3/llm_call_record_helpers.py +3 -0
  236. synth_ai/v0/tracing_v3/session_tracer.py +3 -0
  237. synth_ai-0.2.9.dev8.dist-info/METADATA +191 -0
  238. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/RECORD +268 -238
  239. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/top_level.txt +1 -0
  240. examples/common_old/backend.py +0 -20
  241. examples/evals_old/README.md +0 -98
  242. examples/evals_old/__init__.py +0 -6
  243. examples/evals_old/compare_models.py +0 -1038
  244. examples/evals_old/example_log.md +0 -145
  245. examples/evals_old/run_demo.sh +0 -126
  246. examples/evals_old/trace_analysis.py +0 -270
  247. examples/finetuning_old/_backup_synth_qwen/config.toml +0 -29
  248. examples/finetuning_old/_backup_synth_qwen/example_log.md +0 -324
  249. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +0 -60
  250. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +0 -243
  251. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +0 -109
  252. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +0 -1924
  253. examples/finetuning_old/_backup_synth_qwen/readme.md +0 -49
  254. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +0 -114
  255. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +0 -195
  256. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +0 -119
  257. examples/finetuning_old/synth_qwen_v1/README.md +0 -68
  258. examples/finetuning_old/synth_qwen_v1/filter_traces.py +0 -60
  259. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +0 -243
  260. examples/finetuning_old/synth_qwen_v1/finetune.py +0 -46
  261. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +0 -71
  262. examples/finetuning_old/synth_qwen_v1/infer.py +0 -36
  263. examples/finetuning_old/synth_qwen_v1/poll.py +0 -46
  264. examples/finetuning_old/synth_qwen_v1/prepare_data.py +0 -35
  265. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +0 -109
  266. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +0 -1933
  267. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +0 -210
  268. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +0 -237
  269. examples/finetuning_old/synth_qwen_v1/upload_data.py +0 -34
  270. examples/finetuning_old/synth_qwen_v1/util.py +0 -152
  271. examples/rl_old/task_app.py +0 -1131
  272. examples/warming_up_to_rl/old/event_rewards.md +0 -234
  273. examples/warming_up_to_rl/old/notes.md +0 -73
  274. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +0 -738
  275. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +0 -580
  276. synth_ai/experimental/synth_oss.py +0 -445
  277. synth_ai/learning/filtering.py +0 -0
  278. synth_ai/learning/offline/dpo.py +0 -0
  279. synth_ai/learning/offline/providers.py +0 -7
  280. synth_ai/learning/offline/sft.py +0 -0
  281. synth_ai/learning/offline/shared.py +0 -0
  282. synth_ai/learning/online/grpo.py +0 -0
  283. synth_ai/learning/online/irft.py +0 -0
  284. synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
  285. synth_ai/learning/prompts/gepa.py +0 -0
  286. synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -211
  287. synth_ai/learning/prompts/mipro.py +0 -289
  288. synth_ai/learning/prompts/random_search.py +0 -249
  289. synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
  290. synth_ai/learning/prompts/run_random_search_banking77.py +0 -329
  291. synth_ai/rl/secrets.py +0 -19
  292. synth_ai/scripts/verify_rewards.py +0 -100
  293. synth_ai/tracing/__init__.py +0 -30
  294. synth_ai/tracing_v1/__init__.py +0 -33
  295. synth_ai/tracing_v3/turso/__init__.py +0 -25
  296. synth_ai/tracing_v3/turso/manager.py +0 -838
  297. synth_ai/zyk/__init__.py +0 -30
  298. synth_ai-0.2.9.dev7.dist-info/METADATA +0 -131
  299. /synth_ai/{lm → v0/lm}/caching/__init__.py +0 -0
  300. /synth_ai/{lm → v0/lm}/caching/constants.py +0 -0
  301. /synth_ai/{lm → v0/lm}/caching/dbs.py +0 -0
  302. /synth_ai/{lm → v0/lm}/constants.py +0 -0
  303. /synth_ai/{lm → v0/lm}/core/__init__.py +0 -0
  304. /synth_ai/{lm → v0/lm}/core/exceptions.py +0 -0
  305. /synth_ai/{lm → v0/lm}/cost/__init__.py +0 -0
  306. /synth_ai/{lm → v0/lm}/cost/monitor.py +0 -0
  307. /synth_ai/{lm → v0/lm}/cost/statefulness.py +0 -0
  308. /synth_ai/{lm → v0/lm}/injection.py +0 -0
  309. /synth_ai/{lm → v0/lm}/provider_support/__init__.py +0 -0
  310. /synth_ai/{lm → v0/lm}/provider_support/suppress_logging.py +0 -0
  311. /synth_ai/{lm → v0/lm}/structured_outputs/__init__.py +0 -0
  312. /synth_ai/{lm → v0/lm}/structured_outputs/inject.py +0 -0
  313. /synth_ai/{lm → v0/lm}/tools/__init__.py +0 -0
  314. /synth_ai/{lm → v0/lm}/tools/base.py +0 -0
  315. /synth_ai/{lm → v0/lm}/unified_interface.py +0 -0
  316. /synth_ai/{lm → v0/lm}/vendors/__init__.py +0 -0
  317. /synth_ai/{lm → v0/lm}/vendors/base.py +0 -0
  318. /synth_ai/{lm → v0/lm}/vendors/core/__init__.py +0 -0
  319. /synth_ai/{lm → v0/lm}/vendors/core/synth_dev_api.py +0 -0
  320. /synth_ai/{lm → v0/lm}/vendors/local/__init__.py +0 -0
  321. /synth_ai/{lm → v0/lm}/vendors/local/ollama.py +0 -0
  322. /synth_ai/{lm → v0/lm}/vendors/retries.py +0 -0
  323. /synth_ai/{lm → v0/lm}/vendors/supported/__init__.py +0 -0
  324. /synth_ai/{lm → v0/lm}/warmup.py +0 -0
  325. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/WHEEL +0 -0
  326. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/entry_points.txt +0 -0
  327. {synth_ai-0.2.9.dev7.dist-info → synth_ai-0.2.9.dev8.dist-info}/licenses/LICENSE +0 -0
@@ -1,234 +0,0 @@
- # Crafter Event-Level Rewards (NOTES)
-
- This note outlines how to support event-level reward layering for Crafter across the warming_up_to_rl task app and the monorepo clustered_training RL pipeline.
-
- ## Goals
- - Attribute reward at decision/step level (per tool call) instead of only using a single trajectory outcome reward.
- - Make this behavior controllable via TOML config flags (enable/disable and choose the source/kind of event reward).
- - Keep compatibility with existing trajectory-outcome paths; when disabled, the system behaves exactly as before.
-
- ## Definitions
- - "Decision": one LM tool call (e.g., `interact_many`) and the sequence of environment steps it triggers.
- - "Absolute achievement delta" (AchΔ): count of achievements that became true during a decision.
- - "Unique achievement delta" (UniqueΔ): count of achievements first unlocked in the episode by a decision.
- - "Env sparse reward": the environment’s own per-step reward (e.g., `reward_last_step`).
-
- ## What to compute per decision
- - From observation before and after the decision:
- `turned_true = achievements_after − achievements_before`
- `new_unique = episode_achievements_after − episode_achievements_before`
- - Scalars:
- `ach_delta = len(turned_true)`
- `unique_delta = len(new_unique)`
- - Optional: per-achievement markers for each `a ∈ new_unique` (reward 1.0) for fine-grained shaping.
-
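A minimal sketch of the per-decision delta computation described above, assuming achievements are exposed as `name -> bool` maps and the episode-level set is tracked by the caller (function and argument names are illustrative, not the task app's actual API):

```python
def decision_deltas(
    achievements_before: dict[str, bool],
    achievements_after: dict[str, bool],
    episode_achievements_before: set[str],
) -> dict:
    """Compute absolute and unique achievement deltas for one decision."""
    # Achievements that flipped to true during this decision.
    turned_true = {
        name
        for name, done in achievements_after.items()
        if done and not achievements_before.get(name, False)
    }
    # Achievements unlocked for the first time in the episode.
    new_unique = turned_true - episode_achievements_before
    return {
        "ach_delta": len(turned_true),
        "unique_delta": len(new_unique),
        "all": sorted(turned_true),
        "unique": sorted(new_unique),
    }
```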
- ## Switches/Flags in TOML
- Prefer reusing existing RL trainer flags in clustered_training (already present in code):
-
- ```
- [training]
- # Stepwise/event rewards
- step_rewards_enabled = true # master switch
- step_rewards_mode = "decision_stepwise" # "off" | "decision_stepwise" | "env_sparse"
- step_rewards_beta = 0.0 # optional coefficient for time weighting
- step_rewards_indicator_lambda = 0.0 # optional coefficient for indicator-based flips
-
- # Crafter-specific selection (proposed extension, optional)
- # event_rewards_kind = "unique" # "unique" | "absolute" (if omitted, default to "unique")
- ```
-
- - `step_rewards_enabled`: enables all event-level aggregation.
- - `step_rewards_mode`:
- - `off`: use only trajectory outcome reward (status quo).
- - `decision_stepwise`: use per-decision computed deltas (from policy app or collector), aggregate as returns.
- - `env_sparse`: use the environment’s `reward_last_step` per step.
- - `event_rewards_kind` (optional): if present, selects `unique_delta` (default) vs `ach_delta` for `decision_stepwise`.
-
- Warmup task TOML may place these under a `training` or `rollout` section; the launcher just forwards the full TOML blob to the backend, so the monorepo side should read the same keys.
-
- ## Warming_up_to_rl task app – producing decision rewards
- - In the Crafter policy (or rollout coordinator), for each decision:
- - Compute `ach_delta` and `unique_delta` as above.
- - Attach a compact record to the step metadata, e.g.:
- ```json
- {
-   "decision_rewards": {
-     "turn": 5,
-     "ach_delta": 1,
-     "unique_delta": 1,
-     "all": ["collect_wood"],
-     "unique": ["collect_wood"]
-   }
- }
- ```
- - When `step_rewards_enabled=false`, omit this block.
- - When `step_rewards_mode="env_sparse"`, rely on `reward_last_step` (no decision block required).
-
- Notes:
- - The app already records previous tool calls and environment results; this simply adds a small, structured payload per decision (turn).
- - If per-step `reward_last_step` is unavailable, `decision_stepwise` remains effective as long as achievements maps are present.
-
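A minimal sketch of how the policy might attach this record to step metadata when the flags allow it (the helper name and dict shapes are illustrative, not the task app's actual API):

```python
def attach_decision_rewards(
    step_metadata: dict,
    turn: int,
    deltas: dict,
    step_rewards_enabled: bool,
    step_rewards_mode: str,
) -> None:
    """Attach the per-decision reward record only in decision_stepwise mode."""
    if not step_rewards_enabled or step_rewards_mode != "decision_stepwise":
        # env_sparse / off: emit nothing; rely on reward_last_step or the outcome reward.
        return
    step_metadata["decision_rewards"] = {
        "turn": turn,
        "ach_delta": deltas["ach_delta"],
        "unique_delta": deltas["unique_delta"],
        "all": deltas["all"],
        "unique": deltas["unique"],
    }
```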
- ## Monorepo clustered_training – consuming event rewards
- Integration points (based on existing config structure):
- - `ClusteredTrainerConfig` already includes:
- - `step_rewards_enabled: bool`
- - `step_rewards_mode: str` (off | decision_stepwise)
- - `step_rewards_beta: float`
- - `step_rewards_indicator_lambda: float`
-
- Collector changes (conceptual):
- 1. During trajectory collection, build a vector `r_t` of per-time-step rewards:
- - If `step_rewards_mode == "decision_stepwise"`:
- - For time step `t` corresponding to a decision, set:
- - `r_t = unique_delta` if `event_rewards_kind=="unique"` (default), else `r_t = ach_delta`.
- - For non-decision steps, `r_t = 0.0` (unless you prefer to spread rewards over sub-steps; keep simple attribution by default).
- - If `step_rewards_mode == "env_sparse"`:
- - For each environment step, set `r_t = reward_last_step`.
- - Else (`off`):
- - Use a single scalar outcome reward at the end (status quo).
-
- 2. Compute returns/advantages as usual, summing event rewards:
- - For GRPO/GRPO-Ludic, the typical group-based advantage calculation remains unchanged; only the reward signal changes from a single scalar to a sequence `[r_1, …, r_T]`.
- - Optional time weighting: `r_t ← r_t + beta * (T − t) * indicator_flip_t`, where `indicator_flip_t` is 1 if any unique achievement flipped at `t`, else 0. Use `step_rewards_indicator_lambda` as a coefficient if needed.
-
- Pseudo-code (collector side):
- ```python
- r = [0.0] * T
- if cfg.step_rewards_enabled:
-     if cfg.step_rewards_mode == "decision_stepwise":
-         for ev in decision_events:  # each with fields {turn, ach_delta, unique_delta}
-             idx = ev["turn"] - 1  # 0-based
-             base = ev["unique_delta"] if event_kind == "unique" else ev["ach_delta"]
-             r[idx] += float(base)
-             if cfg.step_rewards_indicator_lambda > 0 and ev["unique_delta"] > 0:
-                 r[idx] += float(cfg.step_rewards_indicator_lambda)
-     elif cfg.step_rewards_mode == "env_sparse":
-         for t, step in enumerate(env_steps):
-             r[t] += float(step.get("reward_last_step", 0.0))
-     else:
-         r[-1] += float(trajectory_outcome_reward)
- ```
-
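The optional time weighting mentioned above can be layered onto `r` after that loop; a minimal sketch, assuming a `unique_flip` list marking the steps where a unique achievement flipped (`unique_flip` is an illustrative name, not an existing field):

```python
# Optional shaping: weight steps with unique-achievement flips by the remaining horizon.
if cfg.step_rewards_enabled and cfg.step_rewards_beta > 0:
    for t in range(T):
        indicator_flip_t = 1.0 if unique_flip[t] else 0.0  # 1 when a unique achievement flipped at t
        r[t] += cfg.step_rewards_beta * (T - t) * indicator_flip_t
```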
- ## Respecting the TOML switch
- - warming_up_to_rl launcher (`run_rl_and_save.py`) forwards the entire TOML to the backend.
- - clustered_training should read `[training].step_rewards_enabled` and `[training].step_rewards_mode` (and optionally `event_rewards_kind`) inside its config loader (already present fields in `ClusteredTrainerConfig`).
- - When disabled, the collector must not attempt to parse or rely on any per-decision metadata.
-
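A minimal sketch of reading these keys from the forwarded TOML on the trainer side, with the proposed default for `event_rewards_kind` (the loader name is an assumption; only the key names come from the note above):

```python
import tomllib  # Python 3.11+


def load_step_reward_flags(toml_path: str) -> dict:
    """Read [training] step-reward keys, keeping today's behavior when they are absent."""
    with open(toml_path, "rb") as fh:
        training = tomllib.load(fh).get("training", {})
    return {
        "step_rewards_enabled": bool(training.get("step_rewards_enabled", False)),
        "step_rewards_mode": str(training.get("step_rewards_mode", "off")),
        "step_rewards_beta": float(training.get("step_rewards_beta", 0.0)),
        "step_rewards_indicator_lambda": float(training.get("step_rewards_indicator_lambda", 0.0)),
        "event_rewards_kind": str(training.get("event_rewards_kind", "unique")),
    }
```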
- ## Debugging & metrics
- - Log per-trajectory aggregates: `ΣAchΔ`, `ΣUniqueΔ`, and a breakdown by decision turn (already added to the Groq rollout table in research). These can be mirrored in the backend logs for quick checks.
- - Add simple counters to training logs:
- - number of decisions with `unique_delta>0`
- - sum of deltas per batch
- - share of batches with nonzero event rewards
-
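A sketch of those counters computed from one batch of reward vectors and decision events (names are illustrative; the share of batches with nonzero event rewards would be the average of the boolean across batches):

```python
def step_reward_counters(batch_rewards: list[list[float]], batch_decisions: list[list[dict]]) -> dict:
    """Per-batch diagnostics for stepwise event rewards."""
    decisions_with_unique = sum(
        1 for decisions in batch_decisions for ev in decisions if ev.get("unique_delta", 0) > 0
    )
    batch_delta_sum = sum(sum(r) for r in batch_rewards)
    has_nonzero_event_reward = any(any(x != 0.0 for x in r) for r in batch_rewards)
    return {
        "decisions_with_unique_delta": decisions_with_unique,
        "batch_delta_sum": batch_delta_sum,
        "has_nonzero_event_reward": has_nonzero_event_reward,
    }
```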
- ## Backward compatibility
- - When flags are off, the pipeline uses trajectory outcome rewards only.
- - No schema migrations are required; event-level metadata is optional.
-
- ## Recommended defaults
- - `step_rewards_enabled = true`
- - `step_rewards_mode = "decision_stepwise"`
- - Prefer `unique` deltas for better credit assignment; set `event_rewards_kind = "unique"` (if adopted) or implicitly default to unique deltas.
-
- Here’s the exact file-by-file implementation checklist, scoped so another engineer can implement from this alone.
-
- Warming_up_to_rl (task app) – record decision rewards and honor flags
- - Config examples (ensure flags present and documented)
- - `examples/warming_up_to_rl/configs/*.toml`
- - Add under [training]:
- - `step_rewards_enabled = true|false`
- - `step_rewards_mode = "off" | "decision_stepwise" | "env_sparse"`
- - Optional: `event_rewards_kind = "unique" | "absolute"`
- - Optional shaping: `step_rewards_beta`, `step_rewards_indicator_lambda`
-
- - Policy (compute ach/unique deltas per decision; emit into step metadata when enabled)
- - `examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py`
- - Before/after each tool call sequence, compute:
- - `ach_delta = len(achievements_after − achievements_before)`
- - `unique_delta = len((episode_achievements_after) − (episode_achievements_before))`
- - When `[training].step_rewards_enabled` and `step_rewards_mode == "decision_stepwise"`:
- - Attach to the step’s returned metadata:
- - `decision_rewards = { turn, ach_delta, unique_delta, all: [...], unique: [...] }`
- - If `step_rewards_mode == "env_sparse"`, do not emit `decision_rewards` (leave environment’s `reward_last_step` as the only per-step reward).
- - Respect clipping for long “Previous tool calls” context (already added; keep).
-
- - Policy routes (surface flags to policy; store on policy instance or in request metadata)
- - `examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py`
- - Accept training flags from create/init endpoints (if provided via config).
- - Pass through/attach the flags into the policy or per-step metadata so `policy.step(...)` can read them.
-
- - Rollout coordinator (guarantee metadata flows out with each step)
- - `examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py`
- - Ensure the step response returned to the caller includes `decision_rewards` when set by the policy.
- - No compute here; just propagate metadata.
-
- - Environment adapter (ensure observation has fields needed by the deltas)
- - `examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py`
- - Confirm each step response includes `observation.achievements_status` and `observation.reward_last_step`.
- - No reward computation changes here; just guarantee the fields exist.
-
- Monorepo (clustered training, GSPO/GRPO) – use decision/env-sparse rewards to build per-step returns
- - Config loader (read flags; default behavior preserved)
- - `backend/app/routes/clustered_training/core/algorithms/gspo/training/clustered_trainer.py`
- - In `ClusteredTrainerConfig.from_dict(...)`:
- - Already present: `step_rewards_enabled`, `step_rewards_mode`, `step_rewards_beta`, `step_rewards_indicator_lambda`.
- - Add (optional) read: `event_rewards_kind` with default `"unique"` if not present.
-
- - Collector/rollout trajectory builder (construct r_t per episode)
- - The module that converts environment/policy step records into trajectories (collector). If it’s split, cover the point where step arrays are built just before advantage computation.
- - New logic:
- - Initialize `r = [0.0] * T`.
- - If `step_rewards_enabled`:
- - If `step_rewards_mode == "decision_stepwise"`:
- - For each step metadata with `decision_rewards`:
- - `idx = turn - 1`
- - `base = unique_delta` if `event_rewards_kind == "unique"` else `ach_delta`
- - `r[idx] += float(base)`
- - If `step_rewards_indicator_lambda > 0` and `unique_delta > 0`, `r[idx] += step_rewards_indicator_lambda`
- - Else if `step_rewards_mode == "env_sparse"`:
- - For each step, `r[t] += float(observation.reward_last_step or 0.0)`
- - Else (`off`): `r[-1] += float(outcome_reward)`
- - Optional shaping: `r[t] += step_rewards_beta * (T - t) * indicator_flip_t` where `indicator_flip_t = 1` if the step had `unique_delta > 0`, else 0.
- - Ensure this path does not run when flags are off; old outcome-only behavior remains.
-
- - Advantage/returns computation (no API change; just consume r)
- - The function/module that currently builds returns/advantages from rewards.
- - No interface changes; ensure it takes `r` from the collector path above instead of a single scalar outcome reward when event rewards are enabled.
-
- - Logging/metrics (help ops confirm it’s working)
- - Add counters in the training loop logs:
- - Sum of `r` per batch (stepwise mode).
- - Count of decisions with `unique_delta > 0`.
- - Mode/flags echoed on startup.
-
- - RL configs (dev example TOMLs with flags)
- - `backend/app/routes/clustered_training/dev/configs/crafter_online.toml`
- - Add the `[training]` keys above with comments showing choices.
- - Any job start scripts that inline TOML (e.g. `tests/applications/crafter/rl/start_qwen_full_clustered.py` if used)
- - Ensure they don’t strip the new keys; no code change needed if they pass through the TOML.
-
- Research (optional reference; not required for GSPO)
- - Reference rollout script demonstrating decision-delta computation
- - `research/testing/crafter/eval_rollout_table_groq.py`
- - Already computes/prints per-decision deltas; use as validation aid (no further changes required for GSPO).
-
- Docs/notes (keep implementers aligned)
- - Warming up to RL notes
- - `examples/warming_up_to_rl/event_rewards.md`
- - Already describes flags and expectations; keep this in sync if any naming changes happen.
-
- - Research spec
- - `research/testing/crafter/event_rewards.txt`
- - Already contains the full design and the “recording AND using stepwise rewards” plan.
-
- Sanity checklist (engineer can validate with these)
- - With `[training].step_rewards_enabled=false`: identical behavior to today (only outcome reward used).
- - With `decision_stepwise`:
- - The task app emits `decision_rewards` per decision (check one trajectory).
- - The collector constructs `r_t` from `unique_delta` (or `ach_delta` if configured).
- - Training logs show nonzero stepwise batch reward sums.
- - With `env_sparse`:
- - No decision payload; rewards come strictly from `reward_last_step`.
- - Switching `event_rewards_kind` between `"unique"` and `"absolute"` changes which scalar lands in r at a decision turn.
-
- If you want, I can generate minimal code diffs for each target file after you confirm these paths and flag names.
@@ -1,73 +0,0 @@
- # Crafter Task App Ops Cheatsheet
-
- ## Discover available task apps
- - `uvx synth-ai task-app list`
- - Lists the registered apps plus any aliases (e.g. `grpo-crafter`, `crafter`).
-
- ## Run locally with uvicorn
- - Launch the FastAPI server:
- - `uvx synth-ai serve grpo-crafter --port 8010 --force`
- - `--force` frees the port if a previous run is still bound.
- - Add `--reload` while iterating on code.
- - Enable tracing + SFT dumps while serving:
- - `uvx synth-ai serve grpo-crafter --port 8010 --force --trace ./traces --trace-db ./traces/v3/synth_ai.db`
- - `--trace` writes JSONL trajectories into the folder.
- - `--trace-db` points the sqlite/Turso-compatible tracing DB (defaults to `traces/v3/synth_ai.db`).
-
- ## Modal hot-reload (`modal serve`)
- - Run the hosted app locally inside Modal’s hot-reload loop:
- - `uvx synth-ai task-app modal-serve grpo-crafter --env-file .env`
- - CLI will prompt for a `.env` file if not supplied; secrets are loaded via `Secret.from_dotenv`.
- - Keeps watching the repo for changes and streams logs in your terminal.
-
- ## Modal deploy (persistent endpoint)
- - Build + deploy to the `modal deploy` target:
- - `uvx synth-ai task-app deploy grpo-crafter --env-file .env`
- - Use `--dry-run` first to inspect the generated `modal deploy …` command.
- - `--modal-cli` lets you point at a non-default Modal binary if needed.
-
- ## Collecting traces & rollouts
- - Local rollouts against a running server with full trace payloads:
- - `uv run python examples/warming_up_to_rl/run_local_rollout_traced.py --api-key "$ENVIRONMENT_API_KEY" --base-url http://localhost:8010 --model gpt-4o-mini --trace-format full --trace-path ./trace_full.json`
- - This script prints a reward summary, dumps the trace JSON, and warns if episode returns don’t line up with event rewards.
- - Remote rollouts against a deployed Modal endpoint:
- - `uv run python examples/warming_up_to_rl/run_rollout_remote.py --base-url https://<modal-app-url> --api-key "$ENVIRONMENT_API_KEY" --model gpt-4o-mini --max-llm-calls 10`
-
- ## Trace analytics
- - Summarise model usage, reward breakdowns, and achievement histograms:
- - `uv run python examples/warming_up_to_rl/analyze_trace_db.py --db traces/v3/synth_ai.db`
- - Output includes per-model achievement tallies and episode reward stats.
-
- ## Exporting behavioural-cloning datasets
- - Filter sessions via model, achievements, rewards, etc., then export JSONL:
- - `uv run python examples/warming_up_to_rl/export_trace_sft.py \`
- ` --db traces/v3/synth_ai.db \`
- ` --output traces/qwen32b_filtered.jsonl \`
- ` --model qwen/qwen3-32b \`
- ` --exclude-achievement collect_sapling \`
- ` --exclude-achievement collect_drink \`
- ` --min-unique 3 \`
- ` --event-reward unique_achievement_delta:1.0 \`
- ` --limit 100`
- - `--exclude-achievement` makes it easy to ignore easier unlocks when enforcing `--min-unique`.
- - Combine `--require-achievement`, `--min-outcome-reward`, or provider filters as needed.
-
- ## Training jobs (RL + SFT)
- - `uvx synth-ai train` is the consolidated entry point for RL or SFT launches.
- - Omit `--config` to let the CLI enumerate candidate TOMLs (RL + FFT) and pick interactively.
- - Omit `--env-file` to browse available `.env` files; the CLI never auto-selects.
- - Missing secrets trigger an interactive loop: enter manually, switch `.env`, or fetch from Modal (secrets/apps) before proceeding.
- - RL run (local backend + local task app):
- - `uvx synth-ai train --type rl --config examples/warming_up_to_rl/configs/crafter_cluster.toml --backend http://localhost:8000/api --task-url http://localhost:8010`
- - Performs task-app health checks using the resolved `ENVIRONMENT_API_KEY` before posting to `/rl/jobs`.
- - Polls job status until terminal unless `--no-poll` is supplied.
- - SFT run (FFT fine-tune):
- - `uvx synth-ai train --type sft --config examples/warming_up_to_rl/configs/fft_crafter.toml --dataset traces/crafter_sft.jsonl`
- - Uploads training/validation JSONL to `/learning/files` and starts the job.
- - Poll output mirrors the legacy `run_fft_and_save.py` script.
- - Common flags:
- - `--dry-run` previews payloads/uploads without making requests.
- - `--idempotency` sets the `Idempotency-Key` header for RL submissions.
- - `--poll-timeout` / `--poll-interval` tune the backend polling cadence.
-
- > Tip: all `uvx synth-ai …` subcommands accept `--help` if you need to inspect additional options on the fly.