synth-ai 0.2.9.dev5__py3-none-any.whl → 0.2.9.dev6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (351) hide show
  1. examples/__init__.py +16 -0
  2. examples/crafter_debug_render.py +23 -17
  3. examples/qwen_coder/README.md +102 -0
  4. examples/qwen_coder/_shared.py +113 -0
  5. examples/qwen_coder/configs/coder_lora_30b.toml +61 -0
  6. examples/qwen_coder/configs/coder_lora_4b.toml +57 -0
  7. examples/qwen_coder/configs/coder_lora_small.toml +58 -0
  8. examples/qwen_coder/generate_dataset.py +98 -0
  9. examples/qwen_coder/infer_ft_smoke.py +64 -0
  10. examples/qwen_coder/infer_prod_proxy.py +73 -0
  11. examples/qwen_coder/infer_via_synth.py +87 -0
  12. examples/qwen_coder/scripts/infer_coder.sh +18 -0
  13. examples/qwen_coder/scripts/train_coder_30b.sh +21 -0
  14. examples/qwen_coder/sft_full_17b.py +103 -0
  15. examples/qwen_coder/sft_lora_30b.py +110 -0
  16. examples/qwen_coder/subset_jsonl.py +38 -0
  17. examples/qwen_coder/validate_jsonl.py +59 -0
  18. examples/rl/configs/eval_base_qwen.toml +1 -1
  19. examples/rl/configs/rl_from_base_qwen17.toml +1 -1
  20. examples/rl/download_dataset.py +26 -10
  21. examples/rl/run_eval.py +53 -52
  22. examples/rl/run_rl_and_save.py +29 -12
  23. examples/rl/task_app/math_single_step.py +180 -41
  24. examples/rl/task_app/math_task_app.py +14 -6
  25. examples/sft/README.md +139 -0
  26. examples/sft/configs/crafter_fft_qwen0p6b.toml +44 -0
  27. examples/sft/configs/crafter_lora_qwen0p6b.toml +45 -0
  28. examples/sft/evaluate.py +117 -0
  29. examples/sft/export_dataset.py +117 -0
  30. examples/sft/generate_traces.py +162 -0
  31. examples/swe/__init__.py +12 -0
  32. examples/swe/task_app/README.md +105 -0
  33. examples/swe/task_app/__init__.py +2 -0
  34. examples/swe/task_app/grpo_swe_mini.py +571 -0
  35. examples/swe/task_app/grpo_swe_mini_task_app.py +136 -0
  36. examples/swe/task_app/hosted/README.md +173 -0
  37. examples/swe/task_app/hosted/__init__.py +5 -0
  38. examples/swe/task_app/hosted/branching.py +143 -0
  39. examples/swe/task_app/hosted/environment_routes.py +1289 -0
  40. examples/swe/task_app/hosted/envs/__init__.py +1 -0
  41. examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
  42. examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
  43. examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
  44. examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
  45. examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
  46. examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
  47. examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
  48. examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
  49. examples/swe/task_app/hosted/envs/mini_swe/environment.py +1164 -0
  50. examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
  51. examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
  52. examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
  53. examples/swe/task_app/hosted/hosted_app.py +204 -0
  54. examples/swe/task_app/hosted/inference/__init__.py +5 -0
  55. examples/swe/task_app/hosted/inference/openai_client.py +618 -0
  56. examples/swe/task_app/hosted/main.py +100 -0
  57. examples/swe/task_app/hosted/policy_routes.py +1079 -0
  58. examples/swe/task_app/hosted/registry.py +195 -0
  59. examples/swe/task_app/hosted/rollout.py +1869 -0
  60. examples/swe/task_app/hosted/storage/__init__.py +5 -0
  61. examples/swe/task_app/hosted/storage/volume.py +211 -0
  62. examples/swe/task_app/hosted/test_agents.py +161 -0
  63. examples/swe/task_app/hosted/test_service.py +137 -0
  64. examples/swe/task_app/hosted/utils.py +62 -0
  65. examples/vlm/README.md +68 -0
  66. examples/vlm/configs/crafter_vlm_gpt4o.toml +44 -0
  67. examples/vlm/crafter_image_only_agent.py +207 -0
  68. examples/vlm/crafter_openai_vlm_agent.py +277 -0
  69. examples/vlm/filter_image_rows.py +63 -0
  70. examples/vlm/run_crafter_vlm_benchmark.py +316 -0
  71. examples/warming_up_to_rl/analyze_trace_db.py +12 -10
  72. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +11 -1
  73. examples/warming_up_to_rl/export_trace_sft.py +218 -36
  74. examples/warming_up_to_rl/groq_test.py +15 -8
  75. examples/warming_up_to_rl/manage_secrets.py +29 -25
  76. examples/warming_up_to_rl/readme.md +9 -2
  77. examples/warming_up_to_rl/run_eval.py +137 -61
  78. examples/warming_up_to_rl/run_fft_and_save.py +131 -60
  79. examples/warming_up_to_rl/run_local_rollout.py +88 -39
  80. examples/warming_up_to_rl/run_local_rollout_modal.py +114 -28
  81. examples/warming_up_to_rl/run_local_rollout_parallel.py +81 -20
  82. examples/warming_up_to_rl/run_local_rollout_traced.py +126 -23
  83. examples/warming_up_to_rl/run_rl_and_save.py +35 -12
  84. examples/warming_up_to_rl/run_rollout_remote.py +44 -19
  85. examples/warming_up_to_rl/task_app/README.md +6 -2
  86. examples/warming_up_to_rl/task_app/grpo_crafter.py +319 -57
  87. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +11 -30
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +1 -1
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +9 -11
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +137 -182
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -1
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +1 -1
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -1
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +150 -57
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +105 -69
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +19 -7
  97. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +45 -42
  98. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +1 -1
  99. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +47 -45
  100. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +1 -1
  101. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +198 -92
  102. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -2
  103. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +361 -263
  104. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +21 -23
  105. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +394 -274
  106. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +1 -1
  107. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +56 -62
  108. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +1 -0
  109. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +6 -15
  110. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +4 -3
  111. synth/__init__.py +14 -0
  112. synth_ai/__init__.py +20 -4
  113. synth_ai/api/models/supported.py +376 -0
  114. synth_ai/api/train/builders.py +157 -26
  115. synth_ai/api/train/cli.py +213 -57
  116. synth_ai/api/train/config_finder.py +65 -5
  117. synth_ai/api/train/env_resolver.py +33 -15
  118. synth_ai/api/train/pollers.py +13 -4
  119. synth_ai/api/train/supported_algos.py +139 -0
  120. synth_ai/api/train/task_app.py +5 -3
  121. synth_ai/api/train/utils.py +33 -48
  122. synth_ai/cli/__init__.py +19 -4
  123. synth_ai/cli/_modal_wrapper.py +28 -0
  124. synth_ai/cli/_typer_patch.py +49 -0
  125. synth_ai/cli/balance.py +2 -3
  126. synth_ai/cli/calc.py +1 -1
  127. synth_ai/cli/demo.py +21 -6
  128. synth_ai/cli/recent.py +2 -2
  129. synth_ai/cli/rl_demo.py +77 -17
  130. synth_ai/cli/root.py +116 -39
  131. synth_ai/cli/status.py +2 -2
  132. synth_ai/cli/task_apps.py +1699 -259
  133. synth_ai/cli/traces.py +7 -4
  134. synth_ai/cli/turso.py +73 -0
  135. synth_ai/cli/watch.py +12 -18
  136. synth_ai/core/experiment.py +0 -2
  137. synth_ai/demo_registry.py +68 -31
  138. synth_ai/demos/core/cli.py +516 -194
  139. synth_ai/demos/demo_task_apps/__init__.py +3 -3
  140. synth_ai/demos/demo_task_apps/core.py +64 -28
  141. synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +2 -3
  142. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +37 -30
  143. synth_ai/demos/demo_task_apps/math/_common.py +1 -2
  144. synth_ai/demos/demo_task_apps/math/app.py +2 -1
  145. synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -6
  146. synth_ai/demos/demo_task_apps/math/modal_task_app.py +183 -82
  147. synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -2
  148. synth_ai/environments/examples/bandit/engine.py +12 -4
  149. synth_ai/environments/examples/bandit/taskset.py +4 -4
  150. synth_ai/environments/examples/crafter_classic/environment.py +76 -1
  151. synth_ai/environments/reproducibility/tree.py +5 -6
  152. synth_ai/environments/service/app.py +11 -12
  153. synth_ai/environments/service/core_routes.py +10 -9
  154. synth_ai/environments/stateful/engine.py +1 -1
  155. synth_ai/environments/tasks/core.py +1 -0
  156. synth_ai/environments/tasks/filters.py +5 -6
  157. synth_ai/environments/tasks/utils.py +4 -5
  158. synth_ai/evals/base.py +0 -2
  159. synth_ai/handshake.py +11 -9
  160. synth_ai/http.py +1 -1
  161. synth_ai/http_client.py +43 -11
  162. synth_ai/inference/__init__.py +0 -2
  163. synth_ai/inference/client.py +20 -6
  164. synth_ai/jobs/client.py +103 -78
  165. synth_ai/learning/__init__.py +41 -6
  166. synth_ai/learning/algorithms.py +14 -0
  167. synth_ai/learning/client.py +121 -29
  168. synth_ai/learning/config.py +2 -40
  169. synth_ai/learning/constants.py +0 -2
  170. synth_ai/learning/ft_client.py +4 -56
  171. synth_ai/learning/health.py +13 -7
  172. synth_ai/learning/jobs.py +43 -47
  173. synth_ai/{rl → learning/rl}/__init__.py +14 -5
  174. synth_ai/learning/rl/client.py +267 -0
  175. synth_ai/learning/rl/config.py +31 -0
  176. synth_ai/{rl → learning/rl}/contracts.py +5 -10
  177. synth_ai/{rl → learning/rl}/env_keys.py +45 -16
  178. synth_ai/learning/rl/secrets.py +13 -0
  179. synth_ai/learning/rl_client.py +2 -253
  180. synth_ai/learning/sft/__init__.py +29 -0
  181. synth_ai/learning/sft/client.py +68 -0
  182. synth_ai/learning/sft/config.py +270 -0
  183. synth_ai/learning/sft/data.py +295 -0
  184. synth_ai/learning/sse.py +25 -26
  185. synth_ai/learning/validators.py +25 -24
  186. synth_ai/lm/__init__.py +21 -47
  187. synth_ai/task/__init__.py +26 -27
  188. synth_ai/task/apps/__init__.py +18 -19
  189. synth_ai/task/auth.py +35 -23
  190. synth_ai/task/client.py +15 -13
  191. synth_ai/task/contracts.py +37 -35
  192. synth_ai/task/datasets.py +9 -6
  193. synth_ai/task/errors.py +11 -10
  194. synth_ai/task/health.py +17 -11
  195. synth_ai/task/json.py +58 -24
  196. synth_ai/task/proxy.py +15 -14
  197. synth_ai/task/rubrics.py +22 -15
  198. synth_ai/task/server.py +43 -17
  199. synth_ai/task/tracing_utils.py +12 -7
  200. synth_ai/task/validators.py +0 -1
  201. synth_ai/task/vendors.py +5 -7
  202. synth_ai/tracing_v3/__init__.py +2 -0
  203. synth_ai/tracing_v3/abstractions.py +21 -4
  204. synth_ai/tracing_v3/db_config.py +26 -1
  205. synth_ai/tracing_v3/decorators.py +18 -15
  206. synth_ai/tracing_v3/examples/basic_usage.py +3 -2
  207. synth_ai/tracing_v3/hooks.py +6 -4
  208. synth_ai/tracing_v3/llm_call_record_helpers.py +6 -6
  209. synth_ai/tracing_v3/replica_sync.py +1 -0
  210. synth_ai/tracing_v3/session_tracer.py +63 -16
  211. synth_ai/tracing_v3/storage/base.py +89 -1
  212. synth_ai/tracing_v3/storage/config.py +21 -8
  213. synth_ai/tracing_v3/storage/factory.py +10 -8
  214. synth_ai/tracing_v3/storage/utils.py +4 -2
  215. synth_ai/tracing_v3/turso/daemon.py +7 -2
  216. synth_ai/tracing_v3/turso/models.py +5 -2
  217. synth_ai/tracing_v3/turso/native_manager.py +1173 -0
  218. synth_ai/tracing_v3/utils.py +4 -3
  219. synth_ai/v0/api/__init__.py +8 -0
  220. synth_ai/v0/api/models/__init__.py +8 -0
  221. synth_ai/v0/api/models/supported.py +8 -0
  222. synth_ai/v0/config/__init__.py +15 -0
  223. synth_ai/v0/config/base_url.py +12 -0
  224. synth_ai/v0/lm/__init__.py +51 -0
  225. synth_ai/{lm → v0/lm}/caching/ephemeral.py +3 -5
  226. synth_ai/{lm → v0/lm}/caching/handler.py +4 -4
  227. synth_ai/{lm → v0/lm}/caching/initialize.py +1 -1
  228. synth_ai/{lm → v0/lm}/caching/persistent.py +1 -1
  229. synth_ai/{lm → v0/lm}/config.py +6 -1
  230. synth_ai/{lm → v0/lm}/core/all.py +9 -9
  231. synth_ai/{lm → v0/lm}/core/exceptions.py +0 -2
  232. synth_ai/{lm → v0/lm}/core/main.py +19 -7
  233. synth_ai/{lm → v0/lm}/core/main_v3.py +10 -10
  234. synth_ai/{lm → v0/lm}/core/synth_models.py +2 -15
  235. synth_ai/{lm → v0/lm}/core/vendor_clients.py +6 -4
  236. synth_ai/{lm → v0/lm}/overrides.py +4 -4
  237. synth_ai/{lm → v0/lm}/provider_support/anthropic.py +4 -4
  238. synth_ai/{lm → v0/lm}/provider_support/openai.py +5 -5
  239. synth_ai/{lm → v0/lm}/structured_outputs/handler.py +5 -5
  240. synth_ai/{lm → v0/lm}/structured_outputs/rehabilitate.py +1 -1
  241. synth_ai/{lm → v0/lm}/vendors/core/anthropic_api.py +16 -16
  242. synth_ai/{lm → v0/lm}/vendors/core/gemini_api.py +5 -5
  243. synth_ai/{lm → v0/lm}/vendors/core/mistral_api.py +5 -5
  244. synth_ai/{lm → v0/lm}/vendors/core/openai_api.py +12 -10
  245. synth_ai/{lm → v0/lm}/vendors/openai_standard.py +11 -9
  246. synth_ai/{lm → v0/lm}/vendors/openai_standard_responses.py +8 -5
  247. synth_ai/{lm → v0/lm}/vendors/supported/custom_endpoint.py +4 -6
  248. synth_ai/{lm → v0/lm}/vendors/supported/deepseek.py +2 -2
  249. synth_ai/{lm → v0/lm}/vendors/supported/grok.py +2 -2
  250. synth_ai/{lm → v0/lm}/vendors/supported/groq.py +1 -1
  251. synth_ai/{lm → v0/lm}/vendors/supported/ollama.py +1 -1
  252. synth_ai/{lm → v0/lm}/vendors/supported/openrouter.py +3 -3
  253. synth_ai/{lm → v0/lm}/vendors/supported/together.py +1 -1
  254. synth_ai/{lm → v0/lm}/vendors/synth_client.py +38 -11
  255. synth_ai/v0/tracing/upload.py +32 -135
  256. synth_ai/v0/tracing_v3/__init__.py +10 -0
  257. synth_ai/v0/tracing_v3/abstractions.py +3 -0
  258. synth_ai/v0/tracing_v3/decorators.py +3 -0
  259. synth_ai/v0/tracing_v3/llm_call_record_helpers.py +3 -0
  260. synth_ai/v0/tracing_v3/session_tracer.py +3 -0
  261. synth_ai-0.2.9.dev6.dist-info/METADATA +191 -0
  262. {synth_ai-0.2.9.dev5.dist-info → synth_ai-0.2.9.dev6.dist-info}/RECORD +291 -262
  263. {synth_ai-0.2.9.dev5.dist-info → synth_ai-0.2.9.dev6.dist-info}/top_level.txt +1 -0
  264. examples/common_old/backend.py +0 -21
  265. examples/evals_old/README.md +0 -98
  266. examples/evals_old/__init__.py +0 -6
  267. examples/evals_old/compare_models.py +0 -1037
  268. examples/evals_old/example_log.md +0 -145
  269. examples/evals_old/run_demo.sh +0 -126
  270. examples/evals_old/trace_analysis.py +0 -270
  271. examples/finetuning_old/_backup_synth_qwen/config.toml +0 -29
  272. examples/finetuning_old/_backup_synth_qwen/example_log.md +0 -324
  273. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +0 -60
  274. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +0 -239
  275. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +0 -109
  276. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +0 -1924
  277. examples/finetuning_old/_backup_synth_qwen/readme.md +0 -49
  278. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +0 -114
  279. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +0 -195
  280. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +0 -118
  281. examples/finetuning_old/synth_qwen_v1/README.md +0 -68
  282. examples/finetuning_old/synth_qwen_v1/filter_traces.py +0 -60
  283. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +0 -239
  284. examples/finetuning_old/synth_qwen_v1/finetune.py +0 -46
  285. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +0 -71
  286. examples/finetuning_old/synth_qwen_v1/infer.py +0 -37
  287. examples/finetuning_old/synth_qwen_v1/poll.py +0 -44
  288. examples/finetuning_old/synth_qwen_v1/prepare_data.py +0 -35
  289. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +0 -109
  290. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +0 -1932
  291. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +0 -207
  292. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +0 -232
  293. examples/finetuning_old/synth_qwen_v1/upload_data.py +0 -34
  294. examples/finetuning_old/synth_qwen_v1/util.py +0 -147
  295. examples/rl_old/task_app.py +0 -962
  296. examples/warming_up_to_rl/old/event_rewards.md +0 -234
  297. examples/warming_up_to_rl/old/notes.md +0 -73
  298. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +0 -738
  299. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +0 -580
  300. synth_ai/experimental/synth_oss.py +0 -446
  301. synth_ai/install_sqld.sh +0 -40
  302. synth_ai/learning/filtering.py +0 -0
  303. synth_ai/learning/offline/dpo.py +0 -0
  304. synth_ai/learning/offline/providers.py +0 -7
  305. synth_ai/learning/offline/sft.py +0 -0
  306. synth_ai/learning/offline/shared.py +0 -0
  307. synth_ai/learning/online/grpo.py +0 -0
  308. synth_ai/learning/online/irft.py +0 -0
  309. synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
  310. synth_ai/learning/prompts/gepa.py +0 -0
  311. synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -213
  312. synth_ai/learning/prompts/mipro.py +0 -289
  313. synth_ai/learning/prompts/random_search.py +0 -246
  314. synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
  315. synth_ai/learning/prompts/run_random_search_banking77.py +0 -324
  316. synth_ai/rl/secrets.py +0 -19
  317. synth_ai/scripts/verify_rewards.py +0 -100
  318. synth_ai/tracing/__init__.py +0 -30
  319. synth_ai/tracing_v1/__init__.py +0 -33
  320. synth_ai/tracing_v3/turso/__init__.py +0 -25
  321. synth_ai/tracing_v3/turso/manager.py +0 -774
  322. synth_ai/zyk/__init__.py +0 -30
  323. synth_ai-0.2.9.dev5.dist-info/METADATA +0 -131
  324. /synth_ai/{lm → v0/lm}/caching/__init__.py +0 -0
  325. /synth_ai/{lm → v0/lm}/caching/constants.py +0 -0
  326. /synth_ai/{lm → v0/lm}/caching/dbs.py +0 -0
  327. /synth_ai/{lm → v0/lm}/constants.py +0 -0
  328. /synth_ai/{lm → v0/lm}/core/__init__.py +0 -0
  329. /synth_ai/{lm → v0/lm}/cost/__init__.py +0 -0
  330. /synth_ai/{lm → v0/lm}/cost/monitor.py +0 -0
  331. /synth_ai/{lm → v0/lm}/cost/statefulness.py +0 -0
  332. /synth_ai/{lm → v0/lm}/injection.py +0 -0
  333. /synth_ai/{lm → v0/lm}/provider_support/__init__.py +0 -0
  334. /synth_ai/{lm → v0/lm}/provider_support/suppress_logging.py +0 -0
  335. /synth_ai/{lm → v0/lm}/structured_outputs/__init__.py +0 -0
  336. /synth_ai/{lm → v0/lm}/structured_outputs/inject.py +0 -0
  337. /synth_ai/{lm → v0/lm}/tools/__init__.py +0 -0
  338. /synth_ai/{lm → v0/lm}/tools/base.py +0 -0
  339. /synth_ai/{lm → v0/lm}/unified_interface.py +0 -0
  340. /synth_ai/{lm → v0/lm}/vendors/__init__.py +0 -0
  341. /synth_ai/{lm → v0/lm}/vendors/base.py +0 -0
  342. /synth_ai/{lm → v0/lm}/vendors/core/__init__.py +0 -0
  343. /synth_ai/{lm → v0/lm}/vendors/core/synth_dev_api.py +0 -0
  344. /synth_ai/{lm → v0/lm}/vendors/local/__init__.py +0 -0
  345. /synth_ai/{lm → v0/lm}/vendors/local/ollama.py +0 -0
  346. /synth_ai/{lm → v0/lm}/vendors/retries.py +0 -0
  347. /synth_ai/{lm → v0/lm}/vendors/supported/__init__.py +0 -0
  348. /synth_ai/{lm → v0/lm}/warmup.py +0 -0
  349. {synth_ai-0.2.9.dev5.dist-info → synth_ai-0.2.9.dev6.dist-info}/WHEEL +0 -0
  350. {synth_ai-0.2.9.dev5.dist-info → synth_ai-0.2.9.dev6.dist-info}/entry_points.txt +0 -0
  351. {synth_ai-0.2.9.dev5.dist-info → synth_ai-0.2.9.dev6.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,316 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Benchmark Crafter performance across prompt modalities (text-only, image-only, both).
4
+
5
+ For each mode we:
6
+ * Run 20 seeded episodes (configurable) with GPT-4o mini via OpenAI Chat Completions.
7
+ * Execute the returned tool calls in the local Crafter environment.
8
+ * Record achievements/steps and save every rendered frame under `examples/vlm/temp/`.
9
+
10
+ Concurrency is capped by an asyncio semaphore (default parallelism = 10).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import asyncio
17
+ import base64
18
+ import json
19
+ import os
20
+ from collections import Counter, defaultdict
21
+ from dataclasses import dataclass
22
+ from enum import Enum
23
+ from pathlib import Path
24
+ from typing import Any
25
+ from uuid import uuid4
26
+
27
+ from examples.warming_up_to_rl.task_app.synth_envs_hosted.envs.crafter.environment import (
28
+ CrafterEnvironmentWrapper,
29
+ )
30
+ from examples.warming_up_to_rl.task_app.synth_envs_hosted.envs.crafter.policy import CrafterPolicy
31
+ from openai import AsyncOpenAI
32
+ from synth_ai.environments.examples.crafter_classic.environment import CrafterClassicEnvironment
33
+ from synth_ai.environments.examples.crafter_classic.taskset import (
34
+ CrafterTaskInstance,
35
+ CrafterTaskInstanceMetadata,
36
+ )
37
+ from synth_ai.environments.tasks.core import Impetus, Intent
38
+
39
+ OUTPUT_ROOT = Path("examples/vlm/temp")
40
+
41
+
42
class Mode(str, Enum):
    """Prompt modality an episode is benchmarked under.

    The members are plain strings so they can be used directly in paths and
    as JSON keys via ``.value``.
    """

    # Image fields are stripped from the observation before prompting.
    TEXT = "text"
    # User messages are reduced to their image parts before the API call.
    IMAGE = "image"
    # The request is sent as the policy built it (text and image together).
    BOTH = "both"
46
+
47
+
48
@dataclass
class EpisodeResult:
    """Aggregate outcome of a single benchmark episode."""

    # Prompt modality the episode was run with.
    mode: Mode
    # Seed used to build the task instance and environment wrapper.
    seed: int
    # Number of environment steps that were actually executed.
    steps_taken: int
    # Names of achievements reported as unlocked during the episode.
    achievements: set[str]
    # Sum of the per-step rewards reported in the observations.
    total_reward: float
    # Total number of tool calls parsed from the model responses.
    tool_calls: int
56
+
57
+
58
def _ensure_openai_client(api_key: str | None) -> AsyncOpenAI:
    """Return an ``AsyncOpenAI`` client built from ``api_key``.

    Raises:
        RuntimeError: if ``api_key`` is ``None`` or an empty string.
    """
    if api_key:
        return AsyncOpenAI(api_key=api_key)
    raise RuntimeError(
        "OPENAI_API_KEY must be set to run the VLM benchmark (export the key or add to your .env)."
    )
64
+
65
+
66
def _build_task_instance(seed: int) -> CrafterTaskInstance:
    """Create a Crafter task instance configured for ``seed``.

    The instance gets a fresh random id, a fixed instruction/rubric pair and
    a ``config`` dict carrying the seed, episode length and world area.
    """
    task_parts = {
        "impetus": Impetus(instructions="Explore, survive, and unlock achievements."),
        "intent": Intent(
            rubric={"goal": "Unlock achievements"},
            gold_trajectories=None,
            gold_state_diff={},
        ),
        "metadata": CrafterTaskInstanceMetadata(
            difficulty="custom",
            seed=seed,
            num_trees_radius=0,
            num_cows_radius=0,
            num_hostiles_radius=0,
        ),
    }
    task = CrafterTaskInstance(
        id=uuid4(),
        is_reproducible=True,
        initial_engine_snapshot=None,
        **task_parts,
    )
    # Engine expects these config keys
    task.config = {"seed": seed, "length": 256, "area": [64, 64]}
    return task
87
+
88
+
89
def _save_observation_frame(observation_packet: dict[str, Any], dest_path: Path) -> None:
    """Write the base64-encoded frame from ``observation_packet`` to ``dest_path``.

    The frame is read from ``observation_packet["observation"]
    ["observation_image_base64"]``. Saving is best effort: a packet without a
    usable image string is skipped, and decode or filesystem failures are
    ignored so that a bad frame never aborts an episode.

    Args:
        observation_packet: Environment response that may contain an
            ``"observation"`` dict with an ``"observation_image_base64"`` string.
        dest_path: File to write; missing parent directories are created.
    """
    obs = observation_packet.get("observation")
    if not isinstance(obs, dict):
        return
    image_b64 = obs.get("observation_image_base64")
    if not isinstance(image_b64, str) or not image_b64:
        return
    try:
        # Decode first so an invalid payload does not leave an empty directory.
        frame_bytes = base64.b64decode(image_b64)
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        dest_path.write_bytes(frame_bytes)
    except (OSError, ValueError):
        # Best effort: binascii.Error (bad base64) is a ValueError and
        # filesystem problems are OSError; anything else is a real bug and
        # is allowed to propagate.
        return
101
+
102
+
103
def _strip_image_fields(observation_packet: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of the packet whose observation has no image fields.

    The copy is made with a JSON round trip, so the input is left untouched.
    Every key of the nested ``"observation"`` dict that starts with
    ``"observation_image"`` is removed from the copy.
    """
    packet_copy = json.loads(json.dumps(observation_packet))
    inner = packet_copy.get("observation")
    if not isinstance(inner, dict):
        return packet_copy
    image_keys = [name for name in inner if name.startswith("observation_image")]
    for name in image_keys:
        del inner[name]
    return packet_copy
111
+
112
+
113
def _make_image_only_request(request: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``request`` whose user messages carry only image parts.

    The request is copied with a JSON round trip. For each message whose role
    is ``"user"``:

    * list content is reduced to its ``image_url`` / ``image`` parts, unless
      there are none, in which case the content is kept as it was;
    * string content is replaced by an empty string.

    Messages with any other role are left unchanged.
    """
    image_types = {"image_url", "image"}
    result = json.loads(json.dumps(request))
    user_messages = (msg for msg in result.get("messages", []) if msg.get("role") == "user")
    for msg in user_messages:
        body = msg.get("content")
        if isinstance(body, str):
            # No structured parts available; leave as empty string
            msg["content"] = ""
            continue
        if not isinstance(body, list):
            continue
        kept = [
            part
            for part in body
            if isinstance(part, dict) and part.get("type") in image_types
        ]
        msg["content"] = kept if kept else body
    return result
130
+
131
+
132
async def _run_episode(
    *,
    mode: Mode,
    seed: int,
    client: AsyncOpenAI,
    model: str,
    max_steps: int,
    temperature: float,
    semaphore: asyncio.Semaphore,
) -> EpisodeResult:
    """Run one Crafter episode under ``mode`` and return its aggregate stats.

    The whole episode (environment setup, every model call and every
    environment step) runs while holding ``semaphore``. Each rendered frame is
    written under ``OUTPUT_ROOT/<mode>_frames/seed_<seed>/``.

    Args:
        mode: Prompt modality; TEXT strips image fields from the observation,
            IMAGE reduces the user messages to their image parts.
        seed: Seed used for the task instance and the environment wrapper.
        client: OpenAI client used for the chat-completions calls.
        model: Model id sent with every request.
        max_steps: Upper bound on the number of model/environment rounds.
        temperature: Sampling temperature sent with every request.
        semaphore: Limits how many episodes run concurrently.

    Returns:
        An ``EpisodeResult`` with the steps taken, unlocked achievements,
        summed reward and total number of tool calls.

    Raises:
        RuntimeError: if the environment returns a non-dict step response.
    """
    async with semaphore:
        task_instance = _build_task_instance(seed)
        env = CrafterClassicEnvironment(task_instance)
        wrapper = CrafterEnvironmentWrapper(env, seed=seed)

        policy = CrafterPolicy(inference_url="openai://chat-completions", model=model)
        await policy.initialize({"use_tools": True, "model": model})

        observation_packet = await wrapper.initialize()
        achievements: set[str] = set()
        total_reward = 0.0
        steps_taken = 0
        tool_calls_total = 0

        frames_dir = OUTPUT_ROOT / f"{mode.value}_frames" / f"seed_{seed:04d}"

        # Once the environment is initialised it must always be terminated,
        # even when a model call or an environment step raises.
        try:
            _save_observation_frame(observation_packet, frames_dir / "step_000.png")

            for step_idx in range(max_steps):
                obs_dict = observation_packet.get("observation")
                if not isinstance(obs_dict, dict):
                    break

                observation_for_policy: dict[str, Any]
                metadata_payload: dict[str, Any] = {}

                if mode == Mode.TEXT:
                    observation_for_policy = _strip_image_fields(observation_packet)
                else:
                    observation_for_policy = json.loads(json.dumps(observation_packet))
                    metadata_payload["raw_observation"] = observation_packet

                obs_text = policy._format_observation_for_llm(observation_for_policy)  # noqa: SLF001
                _, meta = await policy.step(
                    observation_text=obs_text,
                    metadata=metadata_payload,
                )
                # Work on a copy so the policy's own request record is not mutated.
                inference_request = json.loads(json.dumps(meta["inference_request"]))

                if mode == Mode.IMAGE:
                    inference_request = _make_image_only_request(inference_request)

                inference_request.update(
                    {
                        "model": model,
                        "temperature": temperature,
                        "max_tokens": inference_request.get("max_tokens", 512),
                    }
                )
                # These keys are removed before the request is handed to the
                # OpenAI chat-completions call.
                inference_request.pop("stop_after_tool_calls", None)
                inference_request.pop("thinking_mode", None)
                inference_request.pop("thinking_budget", None)

                response = await client.chat.completions.create(**inference_request)
                response_dict = response.model_dump()

                assistant_tool_calls = CrafterPolicy.parse_response_to_tool_calls(
                    response_dict,
                    use_tools=policy.use_tools,
                )
                if not assistant_tool_calls:
                    # Nothing to execute: the episode ends here.
                    break

                tool_calls_total += len(assistant_tool_calls)
                assistant_message = response_dict["choices"][0].get("message") or {}
                assistant_text = assistant_message.get("content")

                env_response = await wrapper.step(assistant_tool_calls)
                if not isinstance(env_response, dict):
                    raise RuntimeError(
                        f"Unexpected environment response type: {type(env_response)!r}"
                    )

                policy._append_assistant_turn(  # noqa: SLF001
                    assistant_text,
                    assistant_tool_calls,
                    env_response,
                )

                steps_taken += 1
                obs = env_response.get("observation")
                if isinstance(obs, dict):
                    ach = obs.get("achievements_status")
                    if isinstance(ach, dict):
                        for name, unlocked in ach.items():
                            if unlocked:
                                achievements.add(str(name))
                    reward = obs.get("reward_last_step")
                    if isinstance(reward, (int, float)):
                        total_reward += float(reward)

                _save_observation_frame(env_response, frames_dir / f"step_{step_idx + 1:03d}.png")

                if env_response.get("done"):
                    break
                observation_packet = env_response
        finally:
            await wrapper.terminate()

        return EpisodeResult(
            mode=mode,
            seed=seed,
            steps_taken=steps_taken,
            achievements=achievements,
            total_reward=total_reward,
            tool_calls=tool_calls_total,
        )
245
+
246
+
247
def _summarise(results: list[EpisodeResult]) -> dict[str, Any]:
    """Aggregate episode results into one summary entry per mode.

    Returns a dict keyed by ``mode.value`` (in order of first appearance in
    ``results``). Each entry holds the episode count, mean steps and mean
    achievement count (both rounded to two decimals), the total number of
    tool calls, and a name-sorted mapping of achievement name to the number
    of episodes that unlocked it.
    """
    by_mode: dict[Mode, list[EpisodeResult]] = {}
    for episode in results:
        by_mode.setdefault(episode.mode, []).append(episode)

    report: dict[str, Any] = {}
    for mode, episodes in by_mode.items():
        n_episodes = len(episodes)
        step_total = sum(ep.steps_taken for ep in episodes)
        unlocked_total = sum(len(ep.achievements) for ep in episodes)
        unlocked_by_name = Counter(name for ep in episodes for name in ep.achievements)
        report[mode.value] = {
            "episodes": n_episodes,
            "mean_steps": round(step_total / n_episodes, 2),
            "mean_achievements": round(unlocked_total / n_episodes, 2),
            "total_tool_calls": sum(ep.tool_calls for ep in episodes),
            "achievements": dict(sorted(unlocked_by_name.items())),
        }
    return report
269
+
270
+
271
async def main() -> None:
    """Run the benchmark for every mode and seed, then save and print a summary.

    Command-line options select the model, the number of seeds per mode, the
    maximum steps per episode, the sampling temperature and the concurrency
    cap. The API key is read from the ``OPENAI_API_KEY`` environment variable.
    The summary is written to ``OUTPUT_ROOT/vlm_benchmark_summary.json``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    cli_options: tuple[tuple[str, dict[str, Any]], ...] = (
        ("--model", {"default": "gpt-4o-mini-2024-07-18", "help": "OpenAI model id to benchmark"}),
        ("--seeds", {"type": int, "default": 20, "help": "Number of seeds per mode"}),
        ("--steps", {"type": int, "default": 10, "help": "Max steps per episode"}),
        ("--temperature", {"type": float, "default": 0.6, "help": "Sampling temperature"}),
        ("--concurrency", {"type": int, "default": 10, "help": "Max concurrent OpenAI calls"}),
    )
    for flag, spec in cli_options:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    client = _ensure_openai_client(os.getenv("OPENAI_API_KEY"))
    # Never allow fewer than one concurrent episode.
    semaphore = asyncio.Semaphore(max(1, args.concurrency))

    OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)

    # One task per (mode, seed) pair; the semaphore throttles actual execution.
    tasks: list[asyncio.Task[EpisodeResult]] = [
        asyncio.create_task(
            _run_episode(
                mode=mode,
                seed=seed,
                client=client,
                model=args.model,
                max_steps=args.steps,
                temperature=args.temperature,
                semaphore=semaphore,
            )
        )
        for mode in (Mode.TEXT, Mode.IMAGE, Mode.BOTH)
        for seed in range(args.seeds)
    ]

    results = await asyncio.gather(*tasks)
    summary = _summarise(results)
    summary_json = json.dumps(summary, indent=2)

    summary_path = OUTPUT_ROOT / "vlm_benchmark_summary.json"
    summary_path.write_text(summary_json, encoding="utf-8")

    print("\nBenchmark Summary")
    print("-----------------")
    print(summary_json)
    print(f"\nFrames stored under: {OUTPUT_ROOT}/<mode>_frames/seed_xxxx/")
    print(f"Summary saved to: {summary_path}")


if __name__ == "__main__":
    asyncio.run(main())
@@ -9,7 +9,7 @@ import sqlite3
9
9
  import sys
10
10
  from collections import Counter, defaultdict
11
11
  from pathlib import Path
12
- from typing import Any, Dict, List, Set, Tuple
12
+ from typing import Any
13
13
 
14
14
  Row = sqlite3.Row
15
15
 
@@ -56,7 +56,7 @@ def fetch_model_usage(conn: sqlite3.Connection) -> list[dict[str, Any]]:
56
56
  def _parse_json(value: Any) -> Any:
57
57
  if value is None:
58
58
  return None
59
- if isinstance(value, (dict, list)):
59
+ if isinstance(value, dict | list):
60
60
  return value
61
61
  try:
62
62
  return json.loads(value)
@@ -64,7 +64,7 @@ def _parse_json(value: Any) -> Any:
64
64
  return None
65
65
 
66
66
 
67
- AchievementMap = dict[Tuple[str, int], dict[str, list[str]]]
67
+ AchievementMap = dict[tuple[str, int], dict[str, list[str]]]
68
68
 
69
69
 
70
70
  def fetch_achievement_data(
@@ -162,7 +162,7 @@ def fetch_achievement_data(
162
162
  achievement_name_counts.update(achievement_set)
163
163
 
164
164
  achievement_size_counts: Counter = Counter()
165
- for session_id, count in unique_counts_per_session.items():
165
+ for _session_id, count in unique_counts_per_session.items():
166
166
  achievement_size_counts[count] += 1
167
167
 
168
168
  return (
@@ -229,7 +229,9 @@ def format_model_stats(stats: list[dict[str, Any]]) -> str:
229
229
  if not stats:
230
230
  return "No model usage recorded."
231
231
  lines = ["Model usage (by LLM calls):"]
232
- header = f"{'Model':30} {'Provider':10} {'Calls':>7} {'Tokens (in/out)':>20} {'Avg latency ms':>15}"
232
+ header = (
233
+ f"{'Model':30} {'Provider':10} {'Calls':>7} {'Tokens (in/out)':>20} {'Avg latency ms':>15}"
234
+ )
233
235
  lines.append(header)
234
236
  lines.append("-" * len(header))
235
237
  for item in stats:
@@ -243,9 +245,7 @@ def format_model_stats(stats: list[dict[str, Any]]) -> str:
243
245
  return "\n".join(lines)
244
246
 
245
247
 
246
- def format_achievement_summary(
247
- name_counts: Counter, size_counts: Counter
248
- ) -> str:
248
+ def format_achievement_summary(name_counts: Counter, size_counts: Counter) -> str:
249
249
  lines = ["Unique achievements unlocked:"]
250
250
  if name_counts:
251
251
  top = name_counts.most_common()
@@ -295,7 +295,7 @@ def format_reward_summary(outcome: dict[str, Any], breakdown: list[dict[str, Any
295
295
 
296
296
 
297
297
  def compute_model_achievement_stats(
298
- conn: sqlite3.Connection, session_unique_sets: dict[str, Set[str]]
298
+ conn: sqlite3.Connection, session_unique_sets: dict[str, set[str]]
299
299
  ) -> dict[str, dict[str, Any]]:
300
300
  """Aggregate unique-achievement stats per model."""
301
301
 
@@ -349,7 +349,9 @@ def format_model_achievement_stats(model_stats: dict[str, dict[str, Any]]) -> st
349
349
  return "Achievement stats by model:\n (no model sessions recorded)"
350
350
 
351
351
  lines = ["Achievement stats by model:"]
352
- for model_name in sorted(model_stats.keys(), key=lambda m: model_stats[m]["sessions"], reverse=True):
352
+ for model_name in sorted(
353
+ model_stats.keys(), key=lambda m: model_stats[m]["sessions"], reverse=True
354
+ ):
353
355
  stats = model_stats[model_name]
354
356
  providers = ", ".join(sorted(stats["providers"])) if stats["providers"] else "-"
355
357
  sessions = stats["sessions"]
@@ -42,9 +42,13 @@ base = "Qwen/Qwen3-4B"
42
42
  label = "crafter-rl-from-base"
43
43
 
44
44
  [rollout]
45
+ env_name = "crafter"
45
46
  max_turns = 10
46
47
  episodes_per_batch = 64
47
- policy_name = "crafter"
48
+ policy_name = "crafter-react"
49
+ max_concurrent_rollouts = 8
50
+ batches_per_step = 2
51
+ ops = ["agent", "env"]
48
52
 
49
53
  [evaluation]
50
54
  # Run baseline evaluation over the first 100 seeds every 20 training iterations
@@ -55,6 +59,12 @@ seeds = [
55
59
  ]
56
60
 
57
61
  [training]
62
+ num_epochs = 1
63
+ iterations_per_epoch = 10
64
+ batch_size = 16
65
+ group_size = 4
66
+ gradient_accumulation_steps = 1
67
+ learning_rate = 5e-5
58
68
  log_interval = 1
59
69
  weight_sync_interval = 1
60
70
  # Additional RL hyperparameters can go here