synth-ai 0.2.9.dev4__py3-none-any.whl → 0.2.9.dev6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


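This version of synth-ai has been flagged as potentially problematic. More details are available on the original diff page (the link target was not preserved in this text export).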

Files changed (353)
  1. examples/__init__.py +16 -0
  2. examples/crafter_debug_render.py +23 -17
  3. examples/qwen_coder/README.md +102 -0
  4. examples/qwen_coder/_shared.py +113 -0
  5. examples/qwen_coder/configs/coder_lora_30b.toml +61 -0
  6. examples/qwen_coder/configs/coder_lora_4b.toml +57 -0
  7. examples/qwen_coder/configs/coder_lora_small.toml +58 -0
  8. examples/qwen_coder/generate_dataset.py +98 -0
  9. examples/qwen_coder/infer_ft_smoke.py +64 -0
  10. examples/qwen_coder/infer_prod_proxy.py +73 -0
  11. examples/qwen_coder/infer_via_synth.py +87 -0
  12. examples/qwen_coder/scripts/infer_coder.sh +18 -0
  13. examples/qwen_coder/scripts/train_coder_30b.sh +21 -0
  14. examples/qwen_coder/sft_full_17b.py +103 -0
  15. examples/qwen_coder/sft_lora_30b.py +110 -0
  16. examples/qwen_coder/subset_jsonl.py +38 -0
  17. examples/qwen_coder/validate_jsonl.py +59 -0
  18. examples/rl/configs/eval_base_qwen.toml +1 -1
  19. examples/rl/configs/rl_from_base_qwen17.toml +1 -1
  20. examples/rl/download_dataset.py +26 -10
  21. examples/rl/run_eval.py +53 -52
  22. examples/rl/run_rl_and_save.py +29 -12
  23. examples/rl/task_app/math_single_step.py +180 -41
  24. examples/rl/task_app/math_task_app.py +14 -6
  25. examples/sft/README.md +139 -0
  26. examples/sft/configs/crafter_fft_qwen0p6b.toml +44 -0
  27. examples/sft/configs/crafter_lora_qwen0p6b.toml +45 -0
  28. examples/sft/evaluate.py +117 -0
  29. examples/sft/export_dataset.py +117 -0
  30. examples/sft/generate_traces.py +162 -0
  31. examples/swe/__init__.py +12 -0
  32. examples/swe/task_app/README.md +105 -0
  33. examples/swe/task_app/__init__.py +2 -0
  34. examples/swe/task_app/grpo_swe_mini.py +571 -0
  35. examples/swe/task_app/grpo_swe_mini_task_app.py +136 -0
  36. examples/swe/task_app/hosted/README.md +173 -0
  37. examples/swe/task_app/hosted/__init__.py +5 -0
  38. examples/swe/task_app/hosted/branching.py +143 -0
  39. examples/swe/task_app/hosted/environment_routes.py +1289 -0
  40. examples/swe/task_app/hosted/envs/__init__.py +1 -0
  41. examples/swe/task_app/hosted/envs/crafter/__init__.py +6 -0
  42. examples/swe/task_app/hosted/envs/crafter/app.py +1 -0
  43. examples/swe/task_app/hosted/envs/crafter/environment.py +522 -0
  44. examples/swe/task_app/hosted/envs/crafter/policy.py +478 -0
  45. examples/swe/task_app/hosted/envs/crafter/react_agent.py +108 -0
  46. examples/swe/task_app/hosted/envs/crafter/shared.py +305 -0
  47. examples/swe/task_app/hosted/envs/crafter/tools.py +47 -0
  48. examples/swe/task_app/hosted/envs/mini_swe/__init__.py +8 -0
  49. examples/swe/task_app/hosted/envs/mini_swe/environment.py +1164 -0
  50. examples/swe/task_app/hosted/envs/mini_swe/policy.py +355 -0
  51. examples/swe/task_app/hosted/envs/mini_swe/shared.py +83 -0
  52. examples/swe/task_app/hosted/envs/mini_swe/tools.py +96 -0
  53. examples/swe/task_app/hosted/hosted_app.py +204 -0
  54. examples/swe/task_app/hosted/inference/__init__.py +5 -0
  55. examples/swe/task_app/hosted/inference/openai_client.py +618 -0
  56. examples/swe/task_app/hosted/main.py +100 -0
  57. examples/swe/task_app/hosted/policy_routes.py +1079 -0
  58. examples/swe/task_app/hosted/registry.py +195 -0
  59. examples/swe/task_app/hosted/rollout.py +1869 -0
  60. examples/swe/task_app/hosted/storage/__init__.py +5 -0
  61. examples/swe/task_app/hosted/storage/volume.py +211 -0
  62. examples/swe/task_app/hosted/test_agents.py +161 -0
  63. examples/swe/task_app/hosted/test_service.py +137 -0
  64. examples/swe/task_app/hosted/utils.py +62 -0
  65. examples/vlm/README.md +68 -0
  66. examples/vlm/configs/crafter_vlm_gpt4o.toml +44 -0
  67. examples/vlm/crafter_image_only_agent.py +207 -0
  68. examples/vlm/crafter_openai_vlm_agent.py +277 -0
  69. examples/vlm/filter_image_rows.py +63 -0
  70. examples/vlm/run_crafter_vlm_benchmark.py +316 -0
  71. examples/warming_up_to_rl/analyze_trace_db.py +12 -10
  72. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +11 -1
  73. examples/warming_up_to_rl/export_trace_sft.py +218 -36
  74. examples/warming_up_to_rl/groq_test.py +15 -8
  75. examples/warming_up_to_rl/manage_secrets.py +29 -25
  76. examples/warming_up_to_rl/readme.md +9 -2
  77. examples/warming_up_to_rl/run_eval.py +137 -61
  78. examples/warming_up_to_rl/run_fft_and_save.py +131 -60
  79. examples/warming_up_to_rl/run_local_rollout.py +88 -39
  80. examples/warming_up_to_rl/run_local_rollout_modal.py +114 -28
  81. examples/warming_up_to_rl/run_local_rollout_parallel.py +81 -20
  82. examples/warming_up_to_rl/run_local_rollout_traced.py +126 -23
  83. examples/warming_up_to_rl/run_rl_and_save.py +35 -12
  84. examples/warming_up_to_rl/run_rollout_remote.py +44 -19
  85. examples/warming_up_to_rl/task_app/README.md +6 -2
  86. examples/warming_up_to_rl/task_app/grpo_crafter.py +319 -57
  87. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +11 -30
  88. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +1 -1
  89. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +9 -11
  90. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +137 -182
  91. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -1
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +1 -1
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -1
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +150 -57
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +105 -69
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +19 -7
  97. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +45 -42
  98. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +1 -1
  99. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +47 -45
  100. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +1 -1
  101. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +198 -92
  102. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -2
  103. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +361 -263
  104. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +21 -23
  105. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +394 -274
  106. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +1 -1
  107. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +56 -62
  108. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +1 -0
  109. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +6 -15
  110. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +4 -3
  111. synth/__init__.py +14 -0
  112. synth_ai/__init__.py +20 -4
  113. synth_ai/api/models/supported.py +376 -0
  114. synth_ai/api/train/builders.py +157 -26
  115. synth_ai/api/train/cli.py +213 -57
  116. synth_ai/api/train/config_finder.py +65 -5
  117. synth_ai/api/train/env_resolver.py +33 -15
  118. synth_ai/api/train/pollers.py +13 -4
  119. synth_ai/api/train/supported_algos.py +139 -0
  120. synth_ai/api/train/task_app.py +5 -3
  121. synth_ai/api/train/utils.py +33 -48
  122. synth_ai/cli/__init__.py +19 -4
  123. synth_ai/cli/_modal_wrapper.py +28 -0
  124. synth_ai/cli/_typer_patch.py +49 -0
  125. synth_ai/cli/balance.py +2 -3
  126. synth_ai/cli/calc.py +1 -1
  127. synth_ai/cli/demo.py +21 -6
  128. synth_ai/cli/recent.py +2 -2
  129. synth_ai/cli/rl_demo.py +77 -17
  130. synth_ai/cli/root.py +116 -39
  131. synth_ai/cli/status.py +2 -2
  132. synth_ai/cli/task_apps.py +1709 -243
  133. synth_ai/cli/traces.py +7 -4
  134. synth_ai/cli/turso.py +73 -0
  135. synth_ai/cli/watch.py +12 -18
  136. synth_ai/core/experiment.py +0 -2
  137. synth_ai/demo_registry.py +68 -31
  138. synth_ai/demos/core/cli.py +516 -194
  139. synth_ai/demos/demo_task_apps/__init__.py +3 -3
  140. synth_ai/demos/demo_task_apps/core.py +64 -28
  141. synth_ai/demos/demo_task_apps/crafter/configs/crafter_fft_4b.toml +2 -3
  142. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +37 -30
  143. synth_ai/demos/demo_task_apps/math/_common.py +1 -2
  144. synth_ai/demos/demo_task_apps/math/app.py +2 -1
  145. synth_ai/demos/demo_task_apps/math/deploy_modal.py +3 -6
  146. synth_ai/demos/demo_task_apps/math/modal_task_app.py +183 -82
  147. synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -2
  148. synth_ai/environments/examples/bandit/engine.py +12 -4
  149. synth_ai/environments/examples/bandit/taskset.py +4 -4
  150. synth_ai/environments/examples/crafter_classic/environment.py +76 -1
  151. synth_ai/environments/reproducibility/tree.py +5 -6
  152. synth_ai/environments/service/app.py +11 -12
  153. synth_ai/environments/service/core_routes.py +10 -9
  154. synth_ai/environments/stateful/engine.py +1 -1
  155. synth_ai/environments/tasks/core.py +1 -0
  156. synth_ai/environments/tasks/filters.py +5 -6
  157. synth_ai/environments/tasks/utils.py +4 -5
  158. synth_ai/evals/base.py +0 -2
  159. synth_ai/handshake.py +11 -9
  160. synth_ai/http.py +1 -1
  161. synth_ai/http_client.py +43 -11
  162. synth_ai/inference/__init__.py +0 -2
  163. synth_ai/inference/client.py +20 -6
  164. synth_ai/jobs/client.py +103 -78
  165. synth_ai/learning/__init__.py +41 -6
  166. synth_ai/learning/algorithms.py +14 -0
  167. synth_ai/learning/client.py +121 -29
  168. synth_ai/learning/config.py +2 -40
  169. synth_ai/learning/constants.py +0 -2
  170. synth_ai/learning/ft_client.py +4 -56
  171. synth_ai/learning/health.py +13 -7
  172. synth_ai/learning/jobs.py +43 -47
  173. synth_ai/{rl → learning/rl}/__init__.py +14 -5
  174. synth_ai/learning/rl/client.py +267 -0
  175. synth_ai/learning/rl/config.py +31 -0
  176. synth_ai/{rl → learning/rl}/contracts.py +5 -10
  177. synth_ai/{rl → learning/rl}/env_keys.py +45 -16
  178. synth_ai/learning/rl/secrets.py +13 -0
  179. synth_ai/learning/rl_client.py +2 -253
  180. synth_ai/learning/sft/__init__.py +29 -0
  181. synth_ai/learning/sft/client.py +68 -0
  182. synth_ai/learning/sft/config.py +270 -0
  183. synth_ai/learning/sft/data.py +295 -0
  184. synth_ai/learning/sse.py +25 -26
  185. synth_ai/learning/validators.py +25 -24
  186. synth_ai/lm/__init__.py +21 -47
  187. synth_ai/task/__init__.py +26 -27
  188. synth_ai/task/apps/__init__.py +18 -19
  189. synth_ai/task/auth.py +35 -23
  190. synth_ai/task/client.py +15 -13
  191. synth_ai/task/contracts.py +37 -35
  192. synth_ai/task/datasets.py +9 -6
  193. synth_ai/task/errors.py +11 -10
  194. synth_ai/task/health.py +17 -11
  195. synth_ai/task/json.py +58 -24
  196. synth_ai/task/proxy.py +15 -14
  197. synth_ai/task/rubrics.py +22 -15
  198. synth_ai/task/server.py +43 -17
  199. synth_ai/task/tracing_utils.py +12 -7
  200. synth_ai/task/validators.py +0 -1
  201. synth_ai/task/vendors.py +5 -7
  202. synth_ai/tracing_v3/__init__.py +2 -0
  203. synth_ai/tracing_v3/abstractions.py +21 -4
  204. synth_ai/tracing_v3/db_config.py +26 -1
  205. synth_ai/tracing_v3/decorators.py +18 -15
  206. synth_ai/tracing_v3/examples/basic_usage.py +3 -2
  207. synth_ai/tracing_v3/hooks.py +6 -4
  208. synth_ai/tracing_v3/llm_call_record_helpers.py +6 -6
  209. synth_ai/tracing_v3/replica_sync.py +1 -0
  210. synth_ai/tracing_v3/session_tracer.py +63 -16
  211. synth_ai/tracing_v3/storage/base.py +89 -1
  212. synth_ai/tracing_v3/storage/config.py +21 -8
  213. synth_ai/tracing_v3/storage/factory.py +10 -8
  214. synth_ai/tracing_v3/storage/utils.py +4 -2
  215. synth_ai/tracing_v3/turso/daemon.py +7 -2
  216. synth_ai/tracing_v3/turso/models.py +5 -2
  217. synth_ai/tracing_v3/turso/native_manager.py +1173 -0
  218. synth_ai/tracing_v3/utils.py +4 -3
  219. synth_ai/v0/api/__init__.py +8 -0
  220. synth_ai/v0/api/models/__init__.py +8 -0
  221. synth_ai/v0/api/models/supported.py +8 -0
  222. synth_ai/v0/config/__init__.py +15 -0
  223. synth_ai/v0/config/base_url.py +12 -0
  224. synth_ai/v0/lm/__init__.py +51 -0
  225. synth_ai/{lm → v0/lm}/caching/ephemeral.py +3 -5
  226. synth_ai/{lm → v0/lm}/caching/handler.py +4 -4
  227. synth_ai/{lm → v0/lm}/caching/initialize.py +1 -1
  228. synth_ai/{lm → v0/lm}/caching/persistent.py +1 -1
  229. synth_ai/{lm → v0/lm}/config.py +6 -1
  230. synth_ai/{lm → v0/lm}/core/all.py +9 -9
  231. synth_ai/{lm → v0/lm}/core/exceptions.py +0 -2
  232. synth_ai/{lm → v0/lm}/core/main.py +19 -7
  233. synth_ai/{lm → v0/lm}/core/main_v3.py +10 -10
  234. synth_ai/{lm → v0/lm}/core/synth_models.py +2 -15
  235. synth_ai/{lm → v0/lm}/core/vendor_clients.py +6 -4
  236. synth_ai/{lm → v0/lm}/overrides.py +4 -4
  237. synth_ai/{lm → v0/lm}/provider_support/anthropic.py +4 -4
  238. synth_ai/{lm → v0/lm}/provider_support/openai.py +5 -5
  239. synth_ai/{lm → v0/lm}/structured_outputs/handler.py +5 -5
  240. synth_ai/{lm → v0/lm}/structured_outputs/rehabilitate.py +1 -1
  241. synth_ai/{lm → v0/lm}/vendors/core/anthropic_api.py +16 -16
  242. synth_ai/{lm → v0/lm}/vendors/core/gemini_api.py +5 -5
  243. synth_ai/{lm → v0/lm}/vendors/core/mistral_api.py +5 -5
  244. synth_ai/{lm → v0/lm}/vendors/core/openai_api.py +12 -10
  245. synth_ai/{lm → v0/lm}/vendors/openai_standard.py +11 -9
  246. synth_ai/{lm → v0/lm}/vendors/openai_standard_responses.py +8 -5
  247. synth_ai/{lm → v0/lm}/vendors/supported/custom_endpoint.py +4 -6
  248. synth_ai/{lm → v0/lm}/vendors/supported/deepseek.py +2 -2
  249. synth_ai/{lm → v0/lm}/vendors/supported/grok.py +2 -2
  250. synth_ai/{lm → v0/lm}/vendors/supported/groq.py +1 -1
  251. synth_ai/{lm → v0/lm}/vendors/supported/ollama.py +1 -1
  252. synth_ai/{lm → v0/lm}/vendors/supported/openrouter.py +3 -3
  253. synth_ai/{lm → v0/lm}/vendors/supported/together.py +1 -1
  254. synth_ai/{lm → v0/lm}/vendors/synth_client.py +38 -11
  255. synth_ai/v0/tracing/upload.py +32 -135
  256. synth_ai/v0/tracing_v3/__init__.py +10 -0
  257. synth_ai/v0/tracing_v3/abstractions.py +3 -0
  258. synth_ai/v0/tracing_v3/decorators.py +3 -0
  259. synth_ai/v0/tracing_v3/llm_call_record_helpers.py +3 -0
  260. synth_ai/v0/tracing_v3/session_tracer.py +3 -0
  261. synth_ai-0.2.9.dev6.dist-info/METADATA +191 -0
  262. {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/RECORD +291 -264
  263. {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/top_level.txt +1 -0
  264. examples/common_old/backend.py +0 -21
  265. examples/evals_old/README.md +0 -98
  266. examples/evals_old/__init__.py +0 -6
  267. examples/evals_old/compare_models.py +0 -1037
  268. examples/evals_old/example_log.md +0 -145
  269. examples/evals_old/run_demo.sh +0 -126
  270. examples/evals_old/trace_analysis.py +0 -270
  271. examples/finetuning_old/_backup_synth_qwen/config.toml +0 -29
  272. examples/finetuning_old/_backup_synth_qwen/example_log.md +0 -324
  273. examples/finetuning_old/_backup_synth_qwen/filter_traces.py +0 -60
  274. examples/finetuning_old/_backup_synth_qwen/filter_traces_achievements.py +0 -239
  275. examples/finetuning_old/_backup_synth_qwen/purge_v3_traces.py +0 -109
  276. examples/finetuning_old/_backup_synth_qwen/react_agent_lm.py +0 -1924
  277. examples/finetuning_old/_backup_synth_qwen/readme.md +0 -49
  278. examples/finetuning_old/_backup_synth_qwen/run_crafter_qwen4b.py +0 -114
  279. examples/finetuning_old/_backup_synth_qwen/run_demo.sh +0 -195
  280. examples/finetuning_old/_backup_synth_qwen/sft_kickoff.py +0 -118
  281. examples/finetuning_old/synth_qwen_v1/README.md +0 -68
  282. examples/finetuning_old/synth_qwen_v1/filter_traces.py +0 -60
  283. examples/finetuning_old/synth_qwen_v1/filter_traces_achievements.py +0 -239
  284. examples/finetuning_old/synth_qwen_v1/finetune.py +0 -46
  285. examples/finetuning_old/synth_qwen_v1/hello_ft_model.py +0 -71
  286. examples/finetuning_old/synth_qwen_v1/infer.py +0 -37
  287. examples/finetuning_old/synth_qwen_v1/poll.py +0 -44
  288. examples/finetuning_old/synth_qwen_v1/prepare_data.py +0 -35
  289. examples/finetuning_old/synth_qwen_v1/purge_v3_traces.py +0 -109
  290. examples/finetuning_old/synth_qwen_v1/react_agent_lm.py +0 -1932
  291. examples/finetuning_old/synth_qwen_v1/run_crafter_sft_job.py +0 -207
  292. examples/finetuning_old/synth_qwen_v1/run_ft_job.py +0 -232
  293. examples/finetuning_old/synth_qwen_v1/upload_data.py +0 -34
  294. examples/finetuning_old/synth_qwen_v1/util.py +0 -147
  295. examples/rl_old/task_app.py +0 -962
  296. examples/warming_up_to_rl/old/event_rewards.md +0 -234
  297. examples/warming_up_to_rl/old/notes.md +0 -73
  298. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_stepwise_rewards.py +0 -58
  299. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +0 -738
  300. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +0 -580
  301. synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
  302. synth_ai/experimental/synth_oss.py +0 -446
  303. synth_ai/install_sqld.sh +0 -40
  304. synth_ai/learning/filtering.py +0 -0
  305. synth_ai/learning/offline/dpo.py +0 -0
  306. synth_ai/learning/offline/providers.py +0 -7
  307. synth_ai/learning/offline/sft.py +0 -0
  308. synth_ai/learning/offline/shared.py +0 -0
  309. synth_ai/learning/online/grpo.py +0 -0
  310. synth_ai/learning/online/irft.py +0 -0
  311. synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
  312. synth_ai/learning/prompts/gepa.py +0 -0
  313. synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -213
  314. synth_ai/learning/prompts/mipro.py +0 -289
  315. synth_ai/learning/prompts/random_search.py +0 -246
  316. synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
  317. synth_ai/learning/prompts/run_random_search_banking77.py +0 -324
  318. synth_ai/rl/secrets.py +0 -19
  319. synth_ai/scripts/verify_rewards.py +0 -100
  320. synth_ai/tracing/__init__.py +0 -30
  321. synth_ai/tracing_v1/__init__.py +0 -33
  322. synth_ai/tracing_v3/turso/__init__.py +0 -25
  323. synth_ai/tracing_v3/turso/manager.py +0 -774
  324. synth_ai/zyk/__init__.py +0 -30
  325. synth_ai-0.2.9.dev4.dist-info/METADATA +0 -131
  326. /synth_ai/{lm → v0/lm}/caching/__init__.py +0 -0
  327. /synth_ai/{lm → v0/lm}/caching/constants.py +0 -0
  328. /synth_ai/{lm → v0/lm}/caching/dbs.py +0 -0
  329. /synth_ai/{lm → v0/lm}/constants.py +0 -0
  330. /synth_ai/{lm → v0/lm}/core/__init__.py +0 -0
  331. /synth_ai/{lm → v0/lm}/cost/__init__.py +0 -0
  332. /synth_ai/{lm → v0/lm}/cost/monitor.py +0 -0
  333. /synth_ai/{lm → v0/lm}/cost/statefulness.py +0 -0
  334. /synth_ai/{lm → v0/lm}/injection.py +0 -0
  335. /synth_ai/{lm → v0/lm}/provider_support/__init__.py +0 -0
  336. /synth_ai/{lm → v0/lm}/provider_support/suppress_logging.py +0 -0
  337. /synth_ai/{lm → v0/lm}/structured_outputs/__init__.py +0 -0
  338. /synth_ai/{lm → v0/lm}/structured_outputs/inject.py +0 -0
  339. /synth_ai/{lm → v0/lm}/tools/__init__.py +0 -0
  340. /synth_ai/{lm → v0/lm}/tools/base.py +0 -0
  341. /synth_ai/{lm → v0/lm}/unified_interface.py +0 -0
  342. /synth_ai/{lm → v0/lm}/vendors/__init__.py +0 -0
  343. /synth_ai/{lm → v0/lm}/vendors/base.py +0 -0
  344. /synth_ai/{lm → v0/lm}/vendors/core/__init__.py +0 -0
  345. /synth_ai/{lm → v0/lm}/vendors/core/synth_dev_api.py +0 -0
  346. /synth_ai/{lm → v0/lm}/vendors/local/__init__.py +0 -0
  347. /synth_ai/{lm → v0/lm}/vendors/local/ollama.py +0 -0
  348. /synth_ai/{lm → v0/lm}/vendors/retries.py +0 -0
  349. /synth_ai/{lm → v0/lm}/vendors/supported/__init__.py +0 -0
  350. /synth_ai/{lm → v0/lm}/warmup.py +0 -0
  351. {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/WHEEL +0 -0
  352. {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/entry_points.txt +0 -0
  353. {synth_ai-0.2.9.dev4.dist-info → synth_ai-0.2.9.dev6.dist-info}/licenses/LICENSE +0 -0
@@ -5,17 +5,22 @@ Baseline evaluation script (public-friendly skeleton)
5
5
  - Uses a TaskAppClient interface (to be implemented in synth-ai SDK)
6
6
  - Keeps structure aligned with research/testing/crafter eval harness
7
7
  """
8
+
8
9
  from __future__ import annotations
9
- import os
10
+
11
+ import argparse
12
+ import asyncio
13
+ import contextlib
10
14
  import json
15
+ import os
11
16
  import re
12
- from typing import Any, Dict, List, Optional
13
- from collections import Counter
14
- import asyncio
15
- import httpx
16
- import argparse
17
17
  import tomllib
18
+ from collections import Counter
18
19
  from pathlib import Path
20
+ from typing import Any
21
+
22
+ import httpx
23
+
19
24
 
20
25
  class TaskAppClient:
21
26
  """Minimal async client for the task app initialize/step/terminate routes.
@@ -23,12 +28,12 @@ class TaskAppClient:
23
28
  This is a public-friendly shim for examples, pending SDK surface consolidation.
24
29
  """
25
30
 
26
- def __init__(self, base_url: str, api_key: Optional[str] = None) -> None:
31
+ def __init__(self, base_url: str, api_key: str | None = None) -> None:
27
32
  self.base_url = base_url.rstrip("/")
28
33
  self.api_key = api_key
29
- self._client: Optional[httpx.AsyncClient] = None
34
+ self._client: httpx.AsyncClient | None = None
30
35
 
31
- async def __aenter__(self) -> "TaskAppClient":
36
+ async def __aenter__(self) -> TaskAppClient:
32
37
  headers = {}
33
38
  if self.api_key:
34
39
  headers["X-API-Key"] = self.api_key
@@ -54,9 +59,9 @@ class TaskAppClient:
54
59
  )
55
60
  return self._client
56
61
 
57
- async def initialize(self, env_name: str, config: Dict[str, Any]) -> Dict[str, Any]:
62
+ async def initialize(self, env_name: str, config: dict[str, Any]) -> dict[str, Any]:
58
63
  """POST /env/{env_name}/initialize (compat route supported in task app)."""
59
- payload: Dict[str, Any] = {
64
+ payload: dict[str, Any] = {
60
65
  "seed": config.get("seed"),
61
66
  }
62
67
  # Allow both world_config and config inputs; env routes will normalize difficulty
@@ -68,29 +73,31 @@ class TaskAppClient:
68
73
  resp.raise_for_status()
69
74
  return resp.json()
70
75
 
71
- async def step(self, env_name: str, env_id: str, tool_calls: List[Dict[str, Any]]) -> Dict[str, Any]:
76
+ async def step(
77
+ self, env_name: str, env_id: str, tool_calls: list[dict[str, Any]]
78
+ ) -> dict[str, Any]:
72
79
  """POST /env/{env_name}/step with wrapped tool_calls in action."""
73
80
  payload = {"env_id": env_id, "action": {"tool_calls": tool_calls}}
74
81
  resp = await self.client.post(f"/env/{env_name}/step", json=payload)
75
82
  resp.raise_for_status()
76
83
  return resp.json()
77
84
 
78
- async def terminate(self, env_name: str, env_id: str) -> Dict[str, Any]:
85
+ async def terminate(self, env_name: str, env_id: str) -> dict[str, Any]:
79
86
  resp = await self.client.post(f"/env/{env_name}/terminate", json={"env_id": env_id})
80
87
  resp.raise_for_status()
81
88
  return resp.json()
82
89
 
83
- async def get_info(self) -> Dict[str, Any]:
90
+ async def get_info(self) -> dict[str, Any]:
84
91
  resp = await self.client.get("/info")
85
92
  resp.raise_for_status()
86
93
  return resp.json()
87
94
 
88
- async def proxy_groq_chat(self, payload: Dict[str, Any]) -> Dict[str, Any]:
95
+ async def proxy_groq_chat(self, payload: dict[str, Any]) -> dict[str, Any]:
89
96
  resp = await self.client.post("/proxy/groq/v1/chat/completions", json=payload)
90
97
  resp.raise_for_status()
91
98
  return resp.json()
92
99
 
93
- async def vllm_chat(self, vllm_base_url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
100
+ async def vllm_chat(self, vllm_base_url: str, payload: dict[str, Any]) -> dict[str, Any]:
94
101
  async with httpx.AsyncClient(base_url=vllm_base_url.rstrip("/"), timeout=60.0) as c:
95
102
  resp = await c.post("/v1/chat/completions", json=payload)
96
103
  # Do not raise for status to surface body in errors
@@ -102,11 +109,21 @@ class TaskAppClient:
102
109
  return {"error": data}
103
110
  return data
104
111
 
105
- async def rollout(self, *, run_id: str, env_name: str, seed: int, difficulty: str, policy_name: str, policy_config: Dict[str, Any], max_turns: int) -> Dict[str, Any]:
106
- ops: List[str] = []
112
+ async def rollout(
113
+ self,
114
+ *,
115
+ run_id: str,
116
+ env_name: str,
117
+ seed: int,
118
+ difficulty: str,
119
+ policy_name: str,
120
+ policy_config: dict[str, Any],
121
+ max_turns: int,
122
+ ) -> dict[str, Any]:
123
+ ops: list[str] = []
107
124
  for _ in range(max_turns):
108
125
  ops.extend(["agent", "env"])
109
- payload: Dict[str, Any] = {
126
+ payload: dict[str, Any] = {
110
127
  "run_id": run_id,
111
128
  "env": {
112
129
  "env_name": env_name,
@@ -128,35 +145,41 @@ class TaskAppClient:
128
145
  resp.raise_for_status()
129
146
  return resp.json()
130
147
 
148
+
131
149
  TASK_APP_URL = os.getenv("TASK_APP_URL", "https://YOUR-TASK-APP.modal.run").rstrip("/")
132
150
  MODEL = os.getenv("EVAL_MODEL", "qwen/qwen3-32b")
133
151
  NUM_EPISODES = int(os.getenv("NUM_EPISODES", "3"))
134
152
  MAX_TURNS = int(os.getenv("MAX_TURNS", "10"))
135
153
  CONCURRENCY = int(os.getenv("CONCURRENCY", "1"))
136
154
 
137
- def _interact_tool_schema() -> List[Dict[str, Any]]:
138
- return [{
139
- "type": "function",
140
- "function": {
141
- "name": "interact",
142
- "description": "Perform actions in the Crafter environment.",
143
- "parameters": {
144
- "type": "object",
145
- "properties": {
146
- "actions": {"type": "array", "items": {"type": "string"}},
147
- "reasoning": {"type": "string"},
155
+
156
+ def _interact_tool_schema() -> list[dict[str, Any]]:
157
+ return [
158
+ {
159
+ "type": "function",
160
+ "function": {
161
+ "name": "interact",
162
+ "description": "Perform actions in the Crafter environment.",
163
+ "parameters": {
164
+ "type": "object",
165
+ "properties": {
166
+ "actions": {"type": "array", "items": {"type": "string"}},
167
+ "reasoning": {"type": "string"},
168
+ },
169
+ "required": ["actions", "reasoning"],
148
170
  },
149
- "required": ["actions", "reasoning"],
150
171
  },
151
- },
152
- }]
172
+ }
173
+ ]
174
+
153
175
 
154
- def _build_messages_from_observation(observation: Dict[str, Any], history: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
176
+ def _build_messages_from_observation(
177
+ observation: dict[str, Any], history: list[dict[str, Any]]
178
+ ) -> list[dict[str, Any]]:
155
179
  inv = observation.get("inventory") or {}
156
180
  pos = observation.get("player_position") or []
157
181
  ach = observation.get("achievements_status") or {}
158
- turns_taken = observation.get("num_steps_taken") or 0
159
- user_lines: List[str] = []
182
+ user_lines: list[str] = []
160
183
  user_lines.append("Environment: CrafterClassic")
161
184
  user_lines.append(f"Player position: {pos}")
162
185
  user_lines.append(f"Inventory: {json.dumps(inv, ensure_ascii=False)}")
@@ -171,7 +194,8 @@ def _build_messages_from_observation(observation: Dict[str, Any], history: List[
171
194
  content = "\n".join(user_lines)
172
195
  return [{"role": "user", "content": content}]
173
196
 
174
- def _parse_tool_calls_from_openai_response(data: Dict[str, Any]) -> List[str]:
197
+
198
+ def _parse_tool_calls_from_openai_response(data: dict[str, Any]) -> list[str]:
175
199
  try:
176
200
  choices = data.get("choices")
177
201
  if isinstance(choices, list) and choices:
@@ -203,7 +227,11 @@ def _parse_tool_calls_from_openai_response(data: Dict[str, Any]) -> List[str]:
203
227
  if isinstance(content, str):
204
228
  text = content
205
229
  elif isinstance(content, list):
206
- text = "\n".join(str(part.get("text")) for part in content if isinstance(part, dict) and part.get("text"))
230
+ text = "\n".join(
231
+ str(part.get("text"))
232
+ for part in content
233
+ if isinstance(part, dict) and part.get("text")
234
+ )
207
235
  for raw in re.findall(r"\{[\s\S]*\}", text or ""):
208
236
  try:
209
237
  obj = json.loads(raw)
@@ -217,9 +245,16 @@ def _parse_tool_calls_from_openai_response(data: Dict[str, Any]) -> List[str]:
217
245
  pass
218
246
  return []
219
247
 
220
- async def _choose_actions_via_llm(client: TaskAppClient, provider: str, model: str, observation: Dict[str, Any], history: List[Dict[str, Any]]) -> List[str]:
248
+
249
+ async def _choose_actions_via_llm(
250
+ client: TaskAppClient,
251
+ provider: str,
252
+ model: str,
253
+ observation: dict[str, Any],
254
+ history: list[dict[str, Any]],
255
+ ) -> list[str]:
221
256
  messages = _build_messages_from_observation(observation, history)
222
- payload: Dict[str, Any] = {
257
+ payload: dict[str, Any] = {
223
258
  "model": model,
224
259
  "messages": messages,
225
260
  "tools": _interact_tool_schema(),
@@ -245,33 +280,40 @@ async def _choose_actions_via_llm(client: TaskAppClient, provider: str, model: s
245
280
  actions = _parse_tool_calls_from_openai_response(data)
246
281
  return actions or []
247
282
 
248
- def _expand_actions_to_tool_calls(actions: List[str]) -> List[Dict[str, Any]]:
249
- out: List[Dict[str, Any]] = []
283
+
284
+ def _expand_actions_to_tool_calls(actions: list[str]) -> list[dict[str, Any]]:
285
+ out: list[dict[str, Any]] = []
250
286
  for a in actions[:5]:
251
287
  out.append({"tool": "interact", "args": {"action": a}})
252
288
  return out
253
289
 
290
+
254
291
  def _detect_provider(model: str) -> str:
255
292
  m = (model or "").lower()
256
293
  if "qwen/qwen3-32b" in m or "qwen-2.5-" in m or m.startswith("groq:"):
257
294
  return "groq"
258
295
  return "vllm"
259
296
 
260
- def _rollout_inference_url_from_cfg(cfg: Dict[str, Any], default_vllm: Optional[str]) -> Optional[str]:
297
+
298
+ def _rollout_inference_url_from_cfg(cfg: dict[str, Any], default_vllm: str | None) -> str | None:
261
299
  # Prefer explicit inference_url in TOML; else fall back to discovered vLLM base
262
300
  url = cfg.get("inference_url")
263
301
  if isinstance(url, str) and url:
264
302
  return url
265
303
  return default_vllm
266
304
 
267
- async def eval_episode(client: TaskAppClient, seed: int) -> Dict[str, Any]:
305
+
306
+ async def eval_episode(client: TaskAppClient, seed: int) -> dict[str, Any]:
268
307
  env_name = "CrafterClassic"
269
- history: List[Dict[str, Any]] = []
308
+ history: list[dict[str, Any]] = []
270
309
  achievements: set[str] = set()
271
310
  turns = 0
272
311
 
273
312
  # Initialize environment
274
- init_cfg: Dict[str, Any] = {"seed": seed, "world_config": {"difficulty": os.getenv("DIFFICULTY", "easy")}}
313
+ init_cfg: dict[str, Any] = {
314
+ "seed": seed,
315
+ "world_config": {"difficulty": os.getenv("DIFFICULTY", "easy")},
316
+ }
275
317
  created = await client.initialize(env_name, init_cfg)
276
318
  env_id = created.get("env_id")
277
319
  if not isinstance(env_id, str) or not env_id:
@@ -285,7 +327,9 @@ async def eval_episode(client: TaskAppClient, seed: int) -> Dict[str, Any]:
285
327
  try:
286
328
  while turns < MAX_TURNS and not done:
287
329
  # Ask LLM for actions; fallback to a simple exploratory pair
288
- chosen_actions = await _choose_actions_via_llm(client, provider, MODEL, observation, history)
330
+ chosen_actions = await _choose_actions_via_llm(
331
+ client, provider, MODEL, observation, history
332
+ )
289
333
  if not chosen_actions:
290
334
  chosen_actions = ["move_up", "do"]
291
335
  tool_calls = _expand_actions_to_tool_calls(chosen_actions)
@@ -299,13 +343,12 @@ async def eval_episode(client: TaskAppClient, seed: int) -> Dict[str, Any]:
299
343
  if isinstance(nxt, dict):
300
344
  observation = nxt
301
345
  finally:
302
- try:
346
+ with contextlib.suppress(Exception):
303
347
  await client.terminate(env_name, env_id)
304
- except Exception:
305
- pass
306
348
 
307
349
  return {"seed": seed, "turns": turns, "achievements": sorted(achievements)}
308
350
 
351
+
309
352
  async def main() -> None:
310
353
  # Best-effort load local .env if present (ensures ENVIRONMENT_API_KEY for rollout)
311
354
  try:
@@ -322,13 +365,17 @@ async def main() -> None:
322
365
  except Exception:
323
366
  pass
324
367
 
325
- parser = argparse.ArgumentParser(description="Baseline eval against task app with optional TOML config")
368
+ parser = argparse.ArgumentParser(
369
+ description="Baseline eval against task app with optional TOML config"
370
+ )
326
371
  parser.add_argument("--toml", help="Path to TOML config file", default=None)
327
- parser.add_argument("--use-rollout", action="store_true", help="Use server-side rollout endpoint for eval")
372
+ parser.add_argument(
373
+ "--use-rollout", action="store_true", help="Use server-side rollout endpoint for eval"
374
+ )
328
375
  args = parser.parse_args()
329
376
 
330
377
  global TASK_APP_URL, MODEL, NUM_EPISODES, MAX_TURNS, CONCURRENCY
331
- cfg: Dict[str, Any] = {}
378
+ cfg: dict[str, Any] = {}
332
379
  if args.toml:
333
380
  with open(args.toml, "rb") as f:
334
381
  cfg = tomllib.load(f)
@@ -346,10 +393,14 @@ async def main() -> None:
346
393
  if env_url:
347
394
  TASK_APP_URL = env_url.rstrip("/")
348
395
  else:
349
- raise RuntimeError("TASK_APP_URL is a placeholder. Set task_app_url in TOML or export TASK_APP_URL.")
396
+ raise RuntimeError(
397
+ "TASK_APP_URL is a placeholder. Set task_app_url in TOML or export TASK_APP_URL."
398
+ )
350
399
 
351
400
  print(f"Task App: {TASK_APP_URL}")
352
- print(f"Model: {MODEL} Episodes: {NUM_EPISODES} Max turns: {MAX_TURNS} Concurrency: {CONCURRENCY}")
401
+ print(
402
+ f"Model: {MODEL} Episodes: {NUM_EPISODES} Max turns: {MAX_TURNS} Concurrency: {CONCURRENCY}"
403
+ )
353
404
  sem = asyncio.Semaphore(max(CONCURRENCY, 1))
354
405
  async with TaskAppClient(TASK_APP_URL, api_key=os.getenv("ENVIRONMENT_API_KEY")) as client:
355
406
  if args.use_rollout:
@@ -359,16 +410,24 @@ async def main() -> None:
359
410
  inf_url = _rollout_inference_url_from_cfg(cfg, default_vllm)
360
411
  if not inf_url:
361
412
  raise RuntimeError("Could not resolve inference URL for rollout")
413
+
362
414
  async def _run(seed: int):
363
415
  async with sem:
364
416
  try:
365
417
  run_id = f"eval-{seed}"
366
418
  # Build policy config from TOML (explicit control; no server-side guessing)
367
- policy_cfg: Dict[str, Any] = {
419
+ policy_cfg: dict[str, Any] = {
368
420
  "model": cfg.get("model", MODEL),
369
421
  "inference_url": inf_url,
370
422
  }
371
- for k in ("max_tokens", "temperature", "top_p", "thinking_mode", "thinking_budget", "use_tools"):
423
+ for k in (
424
+ "max_tokens",
425
+ "temperature",
426
+ "top_p",
427
+ "thinking_mode",
428
+ "thinking_budget",
429
+ "use_tools",
430
+ ):
372
431
  if k in cfg and cfg.get(k) is not None:
373
432
  policy_cfg[k] = cfg.get(k)
374
433
 
@@ -385,8 +444,16 @@ async def main() -> None:
385
444
  ach = []
386
445
  try:
387
446
  trajs = r.get("trajectories") or []
388
- final_obs = (trajs[0].get("final") or {}).get("observation") if trajs and isinstance(trajs[0], dict) else None
389
- ach_map = (final_obs or {}).get("achievements_status") if isinstance(final_obs, dict) else None
447
+ final_obs = (
448
+ (trajs[0].get("final") or {}).get("observation")
449
+ if trajs and isinstance(trajs[0], dict)
450
+ else None
451
+ )
452
+ ach_map = (
453
+ (final_obs or {}).get("achievements_status")
454
+ if isinstance(final_obs, dict)
455
+ else None
456
+ )
390
457
  if isinstance(ach_map, dict):
391
458
  ach = sorted([k for k, v in ach_map.items() if v])
392
459
  except Exception:
@@ -401,7 +468,11 @@ async def main() -> None:
401
468
  return {"seed": seed, "turns": length, "achievements": ach}
402
469
  except Exception as e:
403
470
  return {"seed": seed, "turns": 0, "achievements": [], "error": str(e)}
404
- results = await asyncio.gather(*[asyncio.create_task(_run(i)) for i in range(1, NUM_EPISODES + 1)], return_exceptions=False)
471
+
472
+ results = await asyncio.gather(
473
+ *[asyncio.create_task(_run(i)) for i in range(1, NUM_EPISODES + 1)],
474
+ return_exceptions=False,
475
+ )
405
476
  # Aggregate summary
406
477
  counts = [len(r.get("achievements") or []) for r in results if isinstance(r, dict)]
407
478
  turns = [int(r.get("turns") or 0) for r in results if isinstance(r, dict)]
@@ -424,11 +495,16 @@ async def main() -> None:
424
495
  }
425
496
  print(json.dumps(summary, indent=2))
426
497
  else:
498
+
427
499
  async def _run(seed: int):
428
500
  async with sem:
429
501
  return await eval_episode(client, seed)
430
- results = await asyncio.gather(*[asyncio.create_task(_run(i)) for i in range(1, NUM_EPISODES + 1)])
502
+
503
+ results = await asyncio.gather(
504
+ *[asyncio.create_task(_run(i)) for i in range(1, NUM_EPISODES + 1)]
505
+ )
431
506
  print(json.dumps({"episodes": results}, indent=2))
432
507
 
508
+
433
509
  if __name__ == "__main__":
434
510
  asyncio.run(main())