synth-ai 0.2.9.dev11__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (909)
  1. synth_ai/__init__.py +44 -45
  2. synth_ai/__main__.py +30 -3
  3. synth_ai/cli/__init__.py +104 -78
  4. synth_ai/cli/__main__.py +42 -0
  5. synth_ai/cli/_internal/__init__.py +5 -0
  6. synth_ai/cli/_internal/modal_wrapper.py +31 -0
  7. synth_ai/cli/_internal/storage.py +20 -0
  8. synth_ai/cli/_internal/typer_patch.py +47 -0
  9. synth_ai/cli/_internal/validate_task_app.py +29 -0
  10. synth_ai/cli/agents/__init__.py +17 -0
  11. synth_ai/cli/agents/claude.py +77 -0
  12. synth_ai/cli/agents/codex.py +265 -0
  13. synth_ai/cli/agents/opencode.py +253 -0
  14. synth_ai/cli/commands/__init__.py +18 -0
  15. synth_ai/cli/commands/artifacts/__init__.py +13 -0
  16. synth_ai/cli/commands/artifacts/client.py +119 -0
  17. synth_ai/cli/commands/artifacts/config.py +57 -0
  18. synth_ai/cli/commands/artifacts/core.py +24 -0
  19. synth_ai/cli/commands/artifacts/download.py +188 -0
  20. synth_ai/cli/commands/artifacts/export.py +186 -0
  21. synth_ai/cli/commands/artifacts/list.py +156 -0
  22. synth_ai/cli/commands/artifacts/parsing.py +250 -0
  23. synth_ai/cli/commands/artifacts/show.py +336 -0
  24. synth_ai/cli/commands/baseline/__init__.py +12 -0
  25. synth_ai/cli/commands/baseline/core.py +636 -0
  26. synth_ai/cli/commands/baseline/list.py +94 -0
  27. synth_ai/cli/commands/demo/__init__.py +3 -0
  28. synth_ai/cli/commands/demo/core.py +153 -0
  29. synth_ai/cli/commands/eval/__init__.py +19 -0
  30. synth_ai/cli/commands/eval/core.py +1113 -0
  31. synth_ai/cli/commands/eval/errors.py +81 -0
  32. synth_ai/cli/commands/eval/validation.py +133 -0
  33. synth_ai/cli/commands/filter/__init__.py +12 -0
  34. synth_ai/cli/commands/filter/core.py +424 -0
  35. synth_ai/cli/commands/filter/errors.py +55 -0
  36. synth_ai/cli/commands/filter/validation.py +77 -0
  37. synth_ai/cli/commands/help/__init__.py +185 -0
  38. synth_ai/cli/commands/help/core.py +72 -0
  39. synth_ai/cli/commands/scan/__init__.py +19 -0
  40. synth_ai/cli/commands/scan/cloudflare_scanner.py +403 -0
  41. synth_ai/cli/commands/scan/core.py +344 -0
  42. synth_ai/cli/commands/scan/health_checker.py +242 -0
  43. synth_ai/cli/commands/scan/local_scanner.py +278 -0
  44. synth_ai/cli/commands/scan/models.py +83 -0
  45. synth_ai/cli/commands/smoke/__init__.py +7 -0
  46. synth_ai/cli/commands/smoke/core.py +1438 -0
  47. synth_ai/cli/commands/status/__init__.py +66 -0
  48. synth_ai/cli/commands/status/client.py +192 -0
  49. synth_ai/cli/commands/status/config.py +92 -0
  50. synth_ai/cli/commands/status/errors.py +20 -0
  51. synth_ai/cli/commands/status/formatters.py +164 -0
  52. synth_ai/cli/commands/status/subcommands/__init__.py +9 -0
  53. synth_ai/cli/commands/status/subcommands/files.py +79 -0
  54. synth_ai/cli/commands/status/subcommands/jobs.py +334 -0
  55. synth_ai/cli/commands/status/subcommands/models.py +79 -0
  56. synth_ai/cli/commands/status/subcommands/pricing.py +23 -0
  57. synth_ai/cli/commands/status/subcommands/runs.py +81 -0
  58. synth_ai/cli/commands/status/subcommands/session.py +182 -0
  59. synth_ai/cli/commands/status/subcommands/summary.py +47 -0
  60. synth_ai/cli/commands/status/subcommands/usage.py +203 -0
  61. synth_ai/cli/commands/status/utils.py +114 -0
  62. synth_ai/cli/commands/train/__init__.py +53 -0
  63. synth_ai/cli/commands/train/core.py +22 -0
  64. synth_ai/cli/commands/train/errors.py +117 -0
  65. synth_ai/cli/commands/train/judge_schemas.py +201 -0
  66. synth_ai/cli/commands/train/judge_validation.py +305 -0
  67. synth_ai/cli/commands/train/prompt_learning_validation.py +633 -0
  68. synth_ai/cli/commands/train/validation.py +392 -0
  69. synth_ai/cli/demo_apps/__init__.py +10 -0
  70. synth_ai/cli/demo_apps/core/__init__.py +28 -0
  71. synth_ai/cli/demo_apps/core/cli.py +1735 -0
  72. synth_ai/cli/demo_apps/crafter/crafter_fft_4b.toml +55 -0
  73. synth_ai/cli/demo_apps/crafter/grpo_crafter_task_app.py +186 -0
  74. synth_ai/cli/demo_apps/crafter/rl_from_base_qwen4b.toml +74 -0
  75. synth_ai/cli/demo_apps/demo_registry.py +176 -0
  76. synth_ai/cli/demo_apps/demo_task_apps/core.py +440 -0
  77. synth_ai/cli/demo_apps/demo_task_apps/crafter/__init__.py +1 -0
  78. synth_ai/cli/demo_apps/demo_task_apps/crafter/grpo_crafter_task_app.py +185 -0
  79. synth_ai/cli/demo_apps/demo_task_apps/math/modal_task_app.py +742 -0
  80. synth_ai/cli/demo_apps/demo_task_apps/math/task_app_entry.py +39 -0
  81. synth_ai/cli/demo_apps/math/__init__.py +1 -0
  82. synth_ai/cli/demo_apps/math/_common.py +16 -0
  83. synth_ai/cli/demo_apps/math/app.py +38 -0
  84. synth_ai/cli/demo_apps/math/config.toml +76 -0
  85. synth_ai/cli/demo_apps/math/deploy_modal.py +54 -0
  86. synth_ai/cli/demo_apps/math/modal_task_app.py +702 -0
  87. synth_ai/cli/demo_apps/math/task_app_entry.py +53 -0
  88. synth_ai/cli/demo_apps/mipro/main.py +271 -0
  89. synth_ai/cli/demo_apps/mipro/task_app.py +933 -0
  90. synth_ai/cli/demo_apps/mipro/train_cfg.toml +92 -0
  91. synth_ai/cli/demos/__init__.py +12 -0
  92. synth_ai/cli/demos/demo.py +32 -0
  93. synth_ai/cli/demos/rl_demo.py +254 -0
  94. synth_ai/cli/deploy.py +216 -0
  95. synth_ai/cli/infra/__init__.py +14 -0
  96. synth_ai/cli/infra/balance.py +216 -0
  97. synth_ai/cli/infra/mcp.py +35 -0
  98. synth_ai/cli/infra/modal_app.py +36 -0
  99. synth_ai/cli/infra/setup.py +69 -0
  100. synth_ai/cli/infra/status.py +16 -0
  101. synth_ai/cli/infra/turso.py +77 -0
  102. synth_ai/cli/lib/__init__.py +10 -0
  103. synth_ai/cli/lib/agents.py +76 -0
  104. synth_ai/cli/lib/apps/modal_app.py +101 -0
  105. synth_ai/cli/lib/apps/task_app.py +643 -0
  106. synth_ai/cli/lib/bin.py +39 -0
  107. synth_ai/cli/lib/env.py +375 -0
  108. synth_ai/cli/lib/errors.py +85 -0
  109. synth_ai/cli/lib/modal.py +315 -0
  110. synth_ai/cli/lib/plotting.py +126 -0
  111. synth_ai/cli/lib/prompt_args.py +39 -0
  112. synth_ai/cli/lib/prompts.py +284 -0
  113. synth_ai/cli/lib/sqld.py +122 -0
  114. synth_ai/cli/lib/task_app_discovery.py +884 -0
  115. synth_ai/cli/lib/task_app_env.py +295 -0
  116. synth_ai/cli/lib/train_cfgs.py +300 -0
  117. synth_ai/cli/lib/tunnel_records.py +207 -0
  118. synth_ai/cli/local/__init__.py +14 -0
  119. synth_ai/cli/local/experiment_queue/__init__.py +72 -0
  120. synth_ai/cli/local/experiment_queue/api_schemas.py +221 -0
  121. synth_ai/cli/local/experiment_queue/celery_app.py +208 -0
  122. synth_ai/cli/local/experiment_queue/config.py +128 -0
  123. synth_ai/cli/local/experiment_queue/config_utils.py +272 -0
  124. synth_ai/cli/local/experiment_queue/database.py +175 -0
  125. synth_ai/cli/local/experiment_queue/dispatcher.py +119 -0
  126. synth_ai/cli/local/experiment_queue/models.py +231 -0
  127. synth_ai/cli/local/experiment_queue/progress_info.py +160 -0
  128. synth_ai/cli/local/experiment_queue/results.py +373 -0
  129. synth_ai/cli/local/experiment_queue/schemas.py +131 -0
  130. synth_ai/cli/local/experiment_queue/service.py +344 -0
  131. synth_ai/cli/local/experiment_queue/status.py +372 -0
  132. synth_ai/cli/local/experiment_queue/status_tracker.py +360 -0
  133. synth_ai/cli/local/experiment_queue/tasks.py +1984 -0
  134. synth_ai/cli/local/experiment_queue/trace_storage.py +65 -0
  135. synth_ai/cli/local/experiment_queue/validation.py +157 -0
  136. synth_ai/cli/local/session/__init__.py +92 -0
  137. synth_ai/cli/local/session/client.py +383 -0
  138. synth_ai/cli/local/session/constants.py +63 -0
  139. synth_ai/cli/local/session/exceptions.py +105 -0
  140. synth_ai/cli/local/session/manager.py +139 -0
  141. synth_ai/cli/local/session/models.py +89 -0
  142. synth_ai/cli/local/session/query.py +110 -0
  143. synth_ai/cli/root.py +30 -103
  144. synth_ai/cli/task_apps/__init__.py +26 -0
  145. synth_ai/cli/task_apps/commands.py +3153 -0
  146. synth_ai/cli/task_apps/deploy.py +7 -0
  147. synth_ai/cli/task_apps/list.py +26 -0
  148. synth_ai/cli/task_apps/main.py +36 -0
  149. synth_ai/cli/task_apps/modal_serve.py +11 -0
  150. synth_ai/cli/task_apps/serve.py +11 -0
  151. synth_ai/cli/training/__init__.py +8 -0
  152. synth_ai/cli/training/train.py +5 -0
  153. synth_ai/cli/training/train_cfg.py +34 -0
  154. synth_ai/cli/training/watch.py +506 -0
  155. synth_ai/cli/turso.py +34 -55
  156. synth_ai/cli/usage.py +159 -0
  157. synth_ai/cli/utils/__init__.py +8 -0
  158. synth_ai/cli/utils/experiments.py +235 -0
  159. synth_ai/cli/utils/queue.py +504 -0
  160. synth_ai/cli/utils/recent.py +133 -0
  161. synth_ai/cli/utils/traces.py +164 -0
  162. synth_ai/contracts/__init__.py +67 -0
  163. synth_ai/core/__init__.py +100 -0
  164. synth_ai/core/_utils/__init__.py +54 -0
  165. synth_ai/core/_utils/base_url.py +10 -0
  166. synth_ai/core/_utils/http.py +10 -0
  167. synth_ai/core/_utils/prompts.py +14 -0
  168. synth_ai/core/_utils/task_app_state.py +12 -0
  169. synth_ai/core/_utils/user_config.py +10 -0
  170. synth_ai/core/apps/common.py +116 -0
  171. synth_ai/core/auth.py +95 -0
  172. synth_ai/core/cfgs.py +240 -0
  173. synth_ai/core/config/__init__.py +16 -0
  174. synth_ai/core/config/base.py +168 -0
  175. synth_ai/core/config/resolver.py +89 -0
  176. synth_ai/core/env.py +220 -0
  177. synth_ai/core/errors.py +126 -0
  178. synth_ai/core/http.py +230 -0
  179. synth_ai/core/integrations/__init__.py +11 -0
  180. synth_ai/core/integrations/cloudflare.py +1710 -0
  181. synth_ai/core/integrations/mcp/__init__.py +6 -0
  182. synth_ai/core/integrations/mcp/__main__.py +8 -0
  183. synth_ai/core/integrations/mcp/claude.py +36 -0
  184. synth_ai/core/integrations/mcp/main.py +254 -0
  185. synth_ai/core/integrations/mcp/setup.py +100 -0
  186. synth_ai/core/integrations/modal.py +277 -0
  187. synth_ai/core/json.py +72 -0
  188. synth_ai/core/log_filter.py +99 -0
  189. synth_ai/core/logging.py +82 -0
  190. synth_ai/core/paths.py +107 -0
  191. synth_ai/core/pricing.py +109 -0
  192. synth_ai/core/process.py +233 -0
  193. synth_ai/core/ssl.py +25 -0
  194. synth_ai/core/storage/__init__.py +71 -0
  195. synth_ai/core/task_app_state.py +318 -0
  196. synth_ai/core/telemetry.py +282 -0
  197. synth_ai/core/tracing_v3/__init__.py +99 -0
  198. synth_ai/core/tracing_v3/config.py +229 -0
  199. synth_ai/core/tracing_v3/constants.py +21 -0
  200. synth_ai/core/tracing_v3/db_config.py +182 -0
  201. synth_ai/core/tracing_v3/decorators.py +401 -0
  202. synth_ai/core/tracing_v3/examples/basic_usage.py +194 -0
  203. synth_ai/core/tracing_v3/llm_call_record_helpers.py +437 -0
  204. synth_ai/core/tracing_v3/migration_helper.py +119 -0
  205. synth_ai/core/tracing_v3/replica_sync.py +262 -0
  206. synth_ai/core/tracing_v3/serialization.py +130 -0
  207. synth_ai/core/tracing_v3/session_tracer.py +542 -0
  208. synth_ai/core/tracing_v3/storage/base.py +211 -0
  209. synth_ai/core/tracing_v3/storage/config.py +109 -0
  210. synth_ai/core/tracing_v3/storage/factory.py +39 -0
  211. synth_ai/core/tracing_v3/storage/utils.py +206 -0
  212. synth_ai/core/tracing_v3/trace_utils.py +326 -0
  213. synth_ai/core/tracing_v3/turso/__init__.py +12 -0
  214. synth_ai/core/tracing_v3/turso/daemon.py +278 -0
  215. synth_ai/core/tracing_v3/turso/models.py +470 -0
  216. synth_ai/core/tracing_v3/turso/native_manager.py +1385 -0
  217. synth_ai/core/tracing_v3/utils.py +108 -0
  218. synth_ai/core/urls.py +18 -0
  219. synth_ai/core/user_config.py +137 -0
  220. synth_ai/core/uvicorn.py +222 -0
  221. synth_ai/data/__init__.py +110 -0
  222. synth_ai/data/enums.py +141 -0
  223. synth_ai/data/rewards.py +152 -0
  224. synth_ai/data/specs.py +36 -0
  225. synth_ai/data/traces.py +35 -0
  226. synth_ai/products/__init__.py +6 -0
  227. synth_ai/products/graph_evolve/__init__.py +46 -0
  228. synth_ai/products/graph_evolve/client.py +226 -0
  229. synth_ai/products/graph_evolve/config.py +591 -0
  230. synth_ai/products/graph_evolve/converters/__init__.py +42 -0
  231. synth_ai/products/graph_evolve/converters/openai_sft.py +484 -0
  232. synth_ai/products/graph_evolve/examples/hotpotqa/config.toml +109 -0
  233. synth_ai/products/graph_evolve/run.py +222 -0
  234. synth_ai/sdk/__init__.py +119 -0
  235. synth_ai/sdk/api/__init__.py +1 -0
  236. synth_ai/sdk/api/models/supported.py +514 -0
  237. synth_ai/sdk/api/research_agent/__init__.py +86 -0
  238. synth_ai/sdk/api/research_agent/cli.py +428 -0
  239. synth_ai/sdk/api/research_agent/config.py +357 -0
  240. synth_ai/sdk/api/research_agent/job.py +717 -0
  241. synth_ai/sdk/api/train/__init__.py +85 -0
  242. synth_ai/sdk/api/train/builders.py +895 -0
  243. synth_ai/sdk/api/train/cli.py +2188 -0
  244. synth_ai/sdk/api/train/config_finder.py +267 -0
  245. synth_ai/sdk/api/train/configs/__init__.py +65 -0
  246. synth_ai/sdk/api/train/configs/prompt_learning.py +1706 -0
  247. synth_ai/sdk/api/train/configs/rl.py +188 -0
  248. synth_ai/sdk/api/train/configs/sft.py +99 -0
  249. synth_ai/sdk/api/train/configs/shared.py +81 -0
  250. synth_ai/sdk/api/train/context_learning.py +312 -0
  251. synth_ai/sdk/api/train/env_resolver.py +418 -0
  252. synth_ai/sdk/api/train/graph_validators.py +216 -0
  253. synth_ai/sdk/api/train/graphgen.py +984 -0
  254. synth_ai/sdk/api/train/graphgen_models.py +823 -0
  255. synth_ai/sdk/api/train/graphgen_validators.py +109 -0
  256. synth_ai/sdk/api/train/pollers.py +124 -0
  257. synth_ai/sdk/api/train/progress/__init__.py +97 -0
  258. synth_ai/sdk/api/train/progress/dataclasses.py +569 -0
  259. synth_ai/sdk/api/train/progress/events.py +326 -0
  260. synth_ai/sdk/api/train/progress/results.py +428 -0
  261. synth_ai/sdk/api/train/progress/tracker.py +641 -0
  262. synth_ai/sdk/api/train/prompt_learning.py +470 -0
  263. synth_ai/sdk/api/train/rl.py +442 -0
  264. synth_ai/sdk/api/train/sft.py +396 -0
  265. synth_ai/sdk/api/train/summary.py +522 -0
  266. synth_ai/sdk/api/train/supported_algos.py +147 -0
  267. synth_ai/sdk/api/train/task_app.py +331 -0
  268. synth_ai/sdk/api/train/utils.py +279 -0
  269. synth_ai/sdk/api/train/validators.py +2424 -0
  270. synth_ai/sdk/baseline/__init__.py +25 -0
  271. synth_ai/sdk/baseline/config.py +209 -0
  272. synth_ai/sdk/baseline/discovery.py +216 -0
  273. synth_ai/sdk/baseline/execution.py +154 -0
  274. synth_ai/sdk/graphs/__init__.py +15 -0
  275. synth_ai/sdk/graphs/completions.py +570 -0
  276. synth_ai/sdk/inference/__init__.py +6 -0
  277. synth_ai/sdk/inference/client.py +128 -0
  278. synth_ai/sdk/jobs/__init__.py +16 -0
  279. synth_ai/sdk/jobs/client.py +371 -0
  280. synth_ai/sdk/judging/__init__.py +15 -0
  281. synth_ai/sdk/judging/base.py +24 -0
  282. synth_ai/sdk/judging/client.py +191 -0
  283. synth_ai/sdk/judging/schemas.py +222 -0
  284. synth_ai/sdk/judging/types.py +42 -0
  285. synth_ai/sdk/learning/__init__.py +69 -0
  286. synth_ai/sdk/learning/client.py +240 -0
  287. synth_ai/sdk/learning/ft_client.py +7 -0
  288. synth_ai/sdk/learning/health.py +49 -0
  289. synth_ai/sdk/learning/jobs.py +202 -0
  290. synth_ai/sdk/learning/prompt_extraction.py +334 -0
  291. synth_ai/sdk/learning/prompt_learning_client.py +455 -0
  292. synth_ai/sdk/learning/prompt_learning_types.py +185 -0
  293. synth_ai/sdk/learning/rl/client.py +268 -0
  294. synth_ai/sdk/learning/rl/contracts.py +27 -0
  295. synth_ai/sdk/learning/rl/env_keys.py +166 -0
  296. synth_ai/sdk/learning/rl/secrets.py +13 -0
  297. synth_ai/sdk/learning/sft/client.py +95 -0
  298. synth_ai/sdk/learning/sft/config.py +270 -0
  299. synth_ai/sdk/learning/sft/data.py +698 -0
  300. synth_ai/sdk/learning/validators.py +52 -0
  301. synth_ai/sdk/research_agent/__init__.py +34 -0
  302. synth_ai/sdk/research_agent/container_builder.py +328 -0
  303. synth_ai/sdk/research_agent/container_spec.py +198 -0
  304. synth_ai/sdk/research_agent/defaults.py +34 -0
  305. synth_ai/sdk/research_agent/results_collector.py +69 -0
  306. synth_ai/sdk/specs/__init__.py +46 -0
  307. synth_ai/sdk/specs/dataclasses.py +149 -0
  308. synth_ai/sdk/specs/loader.py +144 -0
  309. synth_ai/sdk/specs/serializer.py +199 -0
  310. synth_ai/sdk/specs/validation.py +250 -0
  311. synth_ai/sdk/streaming/__init__.py +35 -0
  312. synth_ai/sdk/streaming/config.py +94 -0
  313. synth_ai/sdk/streaming/handlers.py +1997 -0
  314. synth_ai/sdk/streaming/streamer.py +704 -0
  315. synth_ai/sdk/streaming/types.py +112 -0
  316. synth_ai/sdk/task/__init__.py +151 -0
  317. synth_ai/sdk/task/apps/__init__.py +133 -0
  318. synth_ai/sdk/task/config.py +261 -0
  319. synth_ai/sdk/task/contracts.py +298 -0
  320. synth_ai/sdk/task/datasets.py +108 -0
  321. synth_ai/sdk/task/in_process.py +1190 -0
  322. synth_ai/sdk/task/in_process_runner.py +309 -0
  323. synth_ai/sdk/task/inference_api.py +299 -0
  324. synth_ai/sdk/task/proxy.py +287 -0
  325. synth_ai/sdk/task/rubrics/__init__.py +55 -0
  326. synth_ai/sdk/task/rubrics/loaders.py +156 -0
  327. synth_ai/sdk/task/rubrics/models.py +57 -0
  328. synth_ai/sdk/task/rubrics/scoring.py +116 -0
  329. synth_ai/sdk/task/rubrics/strict.py +149 -0
  330. synth_ai/sdk/task/server.py +580 -0
  331. synth_ai/sdk/task/trace_correlation_helpers.py +506 -0
  332. synth_ai/sdk/task/tracing_utils.py +95 -0
  333. synth_ai/sdk/task/validators.py +456 -0
  334. synth_ai/sdk/tracing/__init__.py +39 -0
  335. synth_ai/sdk/training/__init__.py +102 -0
  336. synth_ai/sdk/usage/__init__.py +37 -0
  337. synth_ai/sdk/usage/client.py +171 -0
  338. synth_ai/sdk/usage/models.py +261 -0
  339. synth_ai/utils/__init__.py +213 -0
  340. synth_ai-0.4.1.dist-info/METADATA +195 -0
  341. synth_ai-0.4.1.dist-info/RECORD +379 -0
  342. synth_ai-0.4.1.dist-info/entry_points.txt +2 -0
  343. synth_ai-0.4.1.dist-info/top_level.txt +1 -0
  344. examples/__init__.py +0 -16
  345. examples/analyze_semantic_words.sh +0 -17
  346. examples/crafter_debug_render.py +0 -186
  347. examples/qwen_coder/README.md +0 -102
  348. examples/qwen_coder/_shared.py +0 -113
  349. examples/qwen_coder/configs/coder_lora_30b.toml +0 -61
  350. examples/qwen_coder/configs/coder_lora_4b.toml +0 -57
  351. examples/qwen_coder/configs/coder_lora_small.toml +0 -58
  352. examples/qwen_coder/generate_dataset.py +0 -98
  353. examples/qwen_coder/infer_ft_smoke.py +0 -64
  354. examples/qwen_coder/infer_prod_proxy.py +0 -73
  355. examples/qwen_coder/infer_via_synth.py +0 -87
  356. examples/qwen_coder/scripts/infer_coder.sh +0 -18
  357. examples/qwen_coder/scripts/train_coder_30b.sh +0 -21
  358. examples/qwen_coder/sft_full_17b.py +0 -103
  359. examples/qwen_coder/sft_lora_30b.py +0 -110
  360. examples/qwen_coder/subset_jsonl.py +0 -38
  361. examples/qwen_coder/validate_jsonl.py +0 -59
  362. examples/rl/README.md +0 -169
  363. examples/rl/configs/eval_base_qwen.toml +0 -15
  364. examples/rl/configs/eval_rl_qwen.toml +0 -11
  365. examples/rl/configs/rl_from_base_qwen.toml +0 -35
  366. examples/rl/configs/rl_from_base_qwen17.toml +0 -74
  367. examples/rl/configs/rl_from_ft_qwen.toml +0 -35
  368. examples/rl/download_dataset.py +0 -80
  369. examples/rl/run_eval.py +0 -436
  370. examples/rl/run_rl_and_save.py +0 -111
  371. examples/rl/task_app/README.md +0 -22
  372. examples/rl/task_app/math_single_step.py +0 -991
  373. examples/rl/task_app/math_task_app.py +0 -115
  374. examples/run_crafter_demo.sh +0 -10
  375. examples/sft/README.md +0 -139
  376. examples/sft/configs/crafter_fft_qwen0p6b.toml +0 -44
  377. examples/sft/configs/crafter_lora_qwen0p6b.toml +0 -45
  378. examples/sft/evaluate.py +0 -117
  379. examples/sft/export_dataset.py +0 -117
  380. examples/sft/generate_traces.py +0 -162
  381. examples/swe/__init__.py +0 -12
  382. examples/swe/task_app/README.md +0 -105
  383. examples/swe/task_app/__init__.py +0 -2
  384. examples/swe/task_app/grpo_swe_mini.py +0 -571
  385. examples/swe/task_app/grpo_swe_mini_task_app.py +0 -136
  386. examples/swe/task_app/hosted/README.md +0 -173
  387. examples/swe/task_app/hosted/__init__.py +0 -5
  388. examples/swe/task_app/hosted/branching.py +0 -143
  389. examples/swe/task_app/hosted/environment_routes.py +0 -1289
  390. examples/swe/task_app/hosted/envs/__init__.py +0 -1
  391. examples/swe/task_app/hosted/envs/crafter/__init__.py +0 -6
  392. examples/swe/task_app/hosted/envs/crafter/app.py +0 -1
  393. examples/swe/task_app/hosted/envs/crafter/environment.py +0 -522
  394. examples/swe/task_app/hosted/envs/crafter/policy.py +0 -478
  395. examples/swe/task_app/hosted/envs/crafter/react_agent.py +0 -108
  396. examples/swe/task_app/hosted/envs/crafter/shared.py +0 -305
  397. examples/swe/task_app/hosted/envs/crafter/tools.py +0 -47
  398. examples/swe/task_app/hosted/envs/mini_swe/__init__.py +0 -8
  399. examples/swe/task_app/hosted/envs/mini_swe/environment.py +0 -1164
  400. examples/swe/task_app/hosted/envs/mini_swe/policy.py +0 -355
  401. examples/swe/task_app/hosted/envs/mini_swe/shared.py +0 -83
  402. examples/swe/task_app/hosted/envs/mini_swe/tools.py +0 -96
  403. examples/swe/task_app/hosted/hosted_app.py +0 -204
  404. examples/swe/task_app/hosted/inference/__init__.py +0 -5
  405. examples/swe/task_app/hosted/inference/openai_client.py +0 -618
  406. examples/swe/task_app/hosted/main.py +0 -100
  407. examples/swe/task_app/hosted/policy_routes.py +0 -1079
  408. examples/swe/task_app/hosted/registry.py +0 -195
  409. examples/swe/task_app/hosted/rollout.py +0 -1869
  410. examples/swe/task_app/hosted/storage/__init__.py +0 -5
  411. examples/swe/task_app/hosted/storage/volume.py +0 -211
  412. examples/swe/task_app/hosted/test_agents.py +0 -161
  413. examples/swe/task_app/hosted/test_service.py +0 -137
  414. examples/swe/task_app/hosted/utils.py +0 -62
  415. examples/vlm/README.md +0 -68
  416. examples/vlm/configs/crafter_vlm_gpt4o.toml +0 -44
  417. examples/vlm/crafter_image_only_agent.py +0 -207
  418. examples/vlm/crafter_openai_vlm_agent.py +0 -277
  419. examples/vlm/filter_image_rows.py +0 -63
  420. examples/vlm/run_crafter_vlm_benchmark.py +0 -316
  421. examples/warming_up_to_rl/analyze_trace_db.py +0 -422
  422. examples/warming_up_to_rl/configs/crafter_fft.toml +0 -48
  423. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +0 -54
  424. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +0 -20
  425. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +0 -13
  426. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +0 -23
  427. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +0 -83
  428. examples/warming_up_to_rl/configs/rl_from_ft.toml +0 -56
  429. examples/warming_up_to_rl/export_trace_sft.py +0 -723
  430. examples/warming_up_to_rl/groq_test.py +0 -95
  431. examples/warming_up_to_rl/manage_secrets.py +0 -131
  432. examples/warming_up_to_rl/readme.md +0 -179
  433. examples/warming_up_to_rl/run_eval.py +0 -510
  434. examples/warming_up_to_rl/run_fft_and_save.py +0 -380
  435. examples/warming_up_to_rl/run_local_rollout.py +0 -237
  436. examples/warming_up_to_rl/run_local_rollout_modal.py +0 -246
  437. examples/warming_up_to_rl/run_local_rollout_parallel.py +0 -403
  438. examples/warming_up_to_rl/run_local_rollout_traced.py +0 -475
  439. examples/warming_up_to_rl/run_rl_and_save.py +0 -124
  440. examples/warming_up_to_rl/run_rollout_remote.py +0 -154
  441. examples/warming_up_to_rl/task_app/README.md +0 -42
  442. examples/warming_up_to_rl/task_app/grpo_crafter.py +0 -700
  443. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +0 -146
  444. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +0 -173
  445. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +0 -5
  446. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +0 -143
  447. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +0 -1226
  448. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +0 -1
  449. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +0 -6
  450. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +0 -1
  451. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +0 -522
  452. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +0 -478
  453. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +0 -108
  454. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +0 -305
  455. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +0 -47
  456. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +0 -204
  457. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +0 -5
  458. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +0 -618
  459. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +0 -100
  460. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +0 -1083
  461. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +0 -195
  462. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +0 -1869
  463. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +0 -5
  464. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +0 -211
  465. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +0 -161
  466. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +0 -137
  467. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +0 -62
  468. synth/__init__.py +0 -14
  469. synth_ai/api/models/supported.py +0 -376
  470. synth_ai/api/train/__init__.py +0 -5
  471. synth_ai/api/train/builders.py +0 -296
  472. synth_ai/api/train/cli.py +0 -606
  473. synth_ai/api/train/config_finder.py +0 -228
  474. synth_ai/api/train/env_resolver.py +0 -347
  475. synth_ai/api/train/pollers.py +0 -75
  476. synth_ai/api/train/supported_algos.py +0 -139
  477. synth_ai/api/train/task_app.py +0 -195
  478. synth_ai/api/train/utils.py +0 -217
  479. synth_ai/cli/_modal_wrapper.py +0 -28
  480. synth_ai/cli/_typer_patch.py +0 -49
  481. synth_ai/cli/balance.py +0 -203
  482. synth_ai/cli/calc.py +0 -69
  483. synth_ai/cli/demo.py +0 -159
  484. synth_ai/cli/legacy_root_backup.py +0 -470
  485. synth_ai/cli/man.py +0 -106
  486. synth_ai/cli/recent.py +0 -127
  487. synth_ai/cli/rl_demo.py +0 -274
  488. synth_ai/cli/status.py +0 -133
  489. synth_ai/cli/task_apps.py +0 -2782
  490. synth_ai/cli/traces.py +0 -163
  491. synth_ai/cli/watch.py +0 -505
  492. synth_ai/config/base_url.py +0 -107
  493. synth_ai/core/experiment.py +0 -13
  494. synth_ai/core/system.py +0 -15
  495. synth_ai/demo_registry.py +0 -295
  496. synth_ai/demos/core/__init__.py +0 -1
  497. synth_ai/demos/core/cli.py +0 -1756
  498. synth_ai/demos/demo_task_apps/core.py +0 -440
  499. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +0 -172
  500. synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +0 -22
  501. synth_ai/demos/demo_task_apps/math/modal_task_app.py +0 -739
  502. synth_ai/demos/demo_task_apps/math/task_app_entry.py +0 -37
  503. synth_ai/environments/__init__.py +0 -31
  504. synth_ai/environments/environment/__init__.py +0 -1
  505. synth_ai/environments/environment/artifacts/__init__.py +0 -1
  506. synth_ai/environments/environment/artifacts/base.py +0 -52
  507. synth_ai/environments/environment/core.py +0 -67
  508. synth_ai/environments/environment/db/__init__.py +0 -1
  509. synth_ai/environments/environment/db/sqlite.py +0 -45
  510. synth_ai/environments/environment/registry.py +0 -233
  511. synth_ai/environments/environment/resources/sqlite.py +0 -45
  512. synth_ai/environments/environment/results.py +0 -1
  513. synth_ai/environments/environment/rewards/__init__.py +0 -1
  514. synth_ai/environments/environment/rewards/core.py +0 -29
  515. synth_ai/environments/environment/shared_engine.py +0 -26
  516. synth_ai/environments/environment/tools/__init__.py +0 -200
  517. synth_ai/environments/examples/__init__.py +0 -1
  518. synth_ai/environments/examples/bandit/__init__.py +0 -33
  519. synth_ai/environments/examples/bandit/engine.py +0 -302
  520. synth_ai/environments/examples/bandit/environment.py +0 -194
  521. synth_ai/environments/examples/bandit/taskset.py +0 -200
  522. synth_ai/environments/examples/crafter_classic/__init__.py +0 -8
  523. synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +0 -250
  524. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +0 -59
  525. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +0 -152
  526. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +0 -24
  527. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +0 -1194
  528. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +0 -56
  529. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +0 -32
  530. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +0 -384
  531. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +0 -53
  532. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +0 -178
  533. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +0 -222
  534. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +0 -183
  535. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +0 -210
  536. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +0 -206
  537. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +0 -49
  538. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +0 -64
  539. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +0 -88
  540. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +0 -77
  541. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +0 -324
  542. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +0 -362
  543. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +0 -49
  544. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +0 -332
  545. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +0 -97
  546. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +0 -217
  547. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +0 -87
  548. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +0 -88
  549. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +0 -195
  550. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +0 -400
  551. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +0 -195
  552. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +0 -56
  553. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +0 -858
  554. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +0 -52
  555. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +0 -874
  556. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +0 -1412
  557. synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +0 -216
  558. synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +0 -296
  559. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +0 -58
  560. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +0 -464
  561. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +0 -152
  562. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +0 -51
  563. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +0 -1412
  564. synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +0 -112
  565. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +0 -203
  566. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +0 -305
  567. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +0 -126
  568. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +0 -94
  569. synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +0 -142
  570. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +0 -26
  571. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +0 -984
  572. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +0 -724
  573. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +0 -386
  574. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +0 -205
  575. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +0 -150
  576. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +0 -283
  577. synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +0 -280
  578. synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +0 -456
  579. synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +0 -166
  580. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +0 -102
  581. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +0 -128
  582. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +0 -655
  583. synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +0 -202
  584. synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +0 -166
  585. synth_ai/environments/examples/crafter_classic/config_logging.py +0 -111
  586. synth_ai/environments/examples/crafter_classic/debug_translation.py +0 -0
  587. synth_ai/environments/examples/crafter_classic/engine.py +0 -579
  588. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +0 -64
  589. synth_ai/environments/examples/crafter_classic/engine_helpers/action_map.py +0 -6
  590. synth_ai/environments/examples/crafter_classic/engine_helpers/serialization.py +0 -75
  591. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +0 -267
  592. synth_ai/environments/examples/crafter_classic/environment.py +0 -479
  593. synth_ai/environments/examples/crafter_classic/taskset.py +0 -233
  594. synth_ai/environments/examples/crafter_classic/trace_hooks_v3.py +0 -228
  595. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +0 -299
  596. synth_ai/environments/examples/crafter_custom/__init__.py +0 -4
  597. synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +0 -1
  598. synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +0 -202
  599. synth_ai/environments/examples/crafter_custom/crafter/__init__.py +0 -7
  600. synth_ai/environments/examples/crafter_custom/crafter/config.py +0 -182
  601. synth_ai/environments/examples/crafter_custom/crafter/constants.py +0 -8
  602. synth_ai/environments/examples/crafter_custom/crafter/engine.py +0 -269
  603. synth_ai/environments/examples/crafter_custom/crafter/env.py +0 -262
  604. synth_ai/environments/examples/crafter_custom/crafter/objects.py +0 -417
  605. synth_ai/environments/examples/crafter_custom/crafter/recorder.py +0 -187
  606. synth_ai/environments/examples/crafter_custom/crafter/worldgen.py +0 -118
  607. synth_ai/environments/examples/crafter_custom/dataset_builder.py +0 -373
  608. synth_ai/environments/examples/crafter_custom/environment.py +0 -312
  609. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +0 -159
  610. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +0 -158
  611. synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +0 -71
  612. synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +0 -105
  613. synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +0 -119
  614. synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +0 -52
  615. synth_ai/environments/examples/crafter_custom/run_dataset.py +0 -305
  616. synth_ai/environments/examples/enron/art_helpers/email_search_tools.py +0 -156
  617. synth_ai/environments/examples/enron/art_helpers/local_email_db.py +0 -281
  618. synth_ai/environments/examples/enron/art_helpers/types_enron.py +0 -25
  619. synth_ai/environments/examples/enron/engine.py +0 -295
  620. synth_ai/environments/examples/enron/environment.py +0 -166
  621. synth_ai/environments/examples/enron/taskset.py +0 -112
  622. synth_ai/environments/examples/enron/units/keyword_stats.py +0 -112
  623. synth_ai/environments/examples/minigrid/__init__.py +0 -48
  624. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +0 -1188
  625. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +0 -48
  626. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +0 -562
  627. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +0 -221
  628. synth_ai/environments/examples/minigrid/engine.py +0 -589
  629. synth_ai/environments/examples/minigrid/environment.py +0 -274
  630. synth_ai/environments/examples/minigrid/environment_mapping.py +0 -242
  631. synth_ai/environments/examples/minigrid/puzzle_loader.py +0 -417
  632. synth_ai/environments/examples/minigrid/taskset.py +0 -583
  633. synth_ai/environments/examples/nethack/__init__.py +0 -7
  634. synth_ai/environments/examples/nethack/achievements.py +0 -337
  635. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +0 -981
  636. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +0 -74
  637. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +0 -831
  638. synth_ai/environments/examples/nethack/engine.py +0 -739
  639. synth_ai/environments/examples/nethack/environment.py +0 -256
  640. synth_ai/environments/examples/nethack/helpers/__init__.py +0 -41
  641. synth_ai/environments/examples/nethack/helpers/action_mapping.py +0 -301
  642. synth_ai/environments/examples/nethack/helpers/nle_wrapper.py +0 -402
  643. synth_ai/environments/examples/nethack/helpers/observation_utils.py +0 -433
  644. synth_ai/environments/examples/nethack/helpers/recording_wrapper.py +0 -200
  645. synth_ai/environments/examples/nethack/helpers/trajectory_recorder.py +0 -269
  646. synth_ai/environments/examples/nethack/helpers/visualization/replay_viewer.py +0 -308
  647. synth_ai/environments/examples/nethack/helpers/visualization/visualizer.py +0 -431
  648. synth_ai/environments/examples/nethack/taskset.py +0 -323
  649. synth_ai/environments/examples/red/__init__.py +0 -7
  650. synth_ai/environments/examples/red/agent_demos/__init__.py +0 -1
  651. synth_ai/environments/examples/red/config_logging.py +0 -110
  652. synth_ai/environments/examples/red/engine.py +0 -694
  653. synth_ai/environments/examples/red/engine_helpers/__init__.py +0 -1
  654. synth_ai/environments/examples/red/engine_helpers/memory_map.py +0 -28
  655. synth_ai/environments/examples/red/engine_helpers/reward_components.py +0 -276
  656. synth_ai/environments/examples/red/engine_helpers/reward_library/__init__.py +0 -142
  657. synth_ai/environments/examples/red/engine_helpers/reward_library/adaptive_rewards.py +0 -57
  658. synth_ai/environments/examples/red/engine_helpers/reward_library/battle_rewards.py +0 -284
  659. synth_ai/environments/examples/red/engine_helpers/reward_library/composite_rewards.py +0 -150
  660. synth_ai/environments/examples/red/engine_helpers/reward_library/economy_rewards.py +0 -138
  661. synth_ai/environments/examples/red/engine_helpers/reward_library/efficiency_rewards.py +0 -57
  662. synth_ai/environments/examples/red/engine_helpers/reward_library/exploration_rewards.py +0 -331
  663. synth_ai/environments/examples/red/engine_helpers/reward_library/novelty_rewards.py +0 -121
  664. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_rewards.py +0 -559
  665. synth_ai/environments/examples/red/engine_helpers/reward_library/pokemon_rewards.py +0 -313
  666. synth_ai/environments/examples/red/engine_helpers/reward_library/social_rewards.py +0 -148
  667. synth_ai/environments/examples/red/engine_helpers/reward_library/story_rewards.py +0 -247
  668. synth_ai/environments/examples/red/engine_helpers/screen_analysis.py +0 -368
  669. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +0 -140
  670. synth_ai/environments/examples/red/environment.py +0 -238
  671. synth_ai/environments/examples/red/taskset.py +0 -79
  672. synth_ai/environments/examples/red/units/__init__.py +0 -1
  673. synth_ai/environments/examples/sokoban/__init__.py +0 -1
  674. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +0 -899
  675. synth_ai/environments/examples/sokoban/engine.py +0 -678
  676. synth_ai/environments/examples/sokoban/engine_helpers/__init__.py +0 -1
  677. synth_ai/environments/examples/sokoban/engine_helpers/room_utils.py +0 -657
  678. synth_ai/environments/examples/sokoban/engine_helpers/vendored/__init__.py +0 -18
  679. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/__init__.py +0 -3
  680. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/boxoban_env.py +0 -131
  681. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/render_utils.py +0 -370
  682. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/room_utils.py +0 -332
  683. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env.py +0 -306
  684. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_fixed_targets.py +0 -67
  685. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_pull.py +0 -115
  686. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_two_player.py +0 -123
  687. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_variations.py +0 -394
  688. synth_ai/environments/examples/sokoban/environment.py +0 -229
  689. synth_ai/environments/examples/sokoban/generate_verified_puzzles.py +0 -440
  690. synth_ai/environments/examples/sokoban/puzzle_loader.py +0 -312
  691. synth_ai/environments/examples/sokoban/taskset.py +0 -428
  692. synth_ai/environments/examples/tictactoe/__init__.py +0 -1
  693. synth_ai/environments/examples/tictactoe/engine.py +0 -368
  694. synth_ai/environments/examples/tictactoe/environment.py +0 -240
  695. synth_ai/environments/examples/tictactoe/taskset.py +0 -215
  696. synth_ai/environments/examples/verilog/__init__.py +0 -10
  697. synth_ai/environments/examples/verilog/engine.py +0 -329
  698. synth_ai/environments/examples/verilog/environment.py +0 -350
  699. synth_ai/environments/examples/verilog/taskset.py +0 -420
  700. synth_ai/environments/examples/wordle/__init__.py +0 -29
  701. synth_ai/environments/examples/wordle/engine.py +0 -398
  702. synth_ai/environments/examples/wordle/environment.py +0 -159
  703. synth_ai/environments/examples/wordle/helpers/generate_instances_wordfreq.py +0 -75
  704. synth_ai/environments/examples/wordle/taskset.py +0 -230
  705. synth_ai/environments/reproducibility/core.py +0 -42
  706. synth_ai/environments/reproducibility/helpers.py +0 -0
  707. synth_ai/environments/reproducibility/tree.py +0 -363
  708. synth_ai/environments/service/app.py +0 -97
  709. synth_ai/environments/service/core_routes.py +0 -1021
  710. synth_ai/environments/service/external_registry.py +0 -56
  711. synth_ai/environments/service/registry.py +0 -9
  712. synth_ai/environments/stateful/__init__.py +0 -1
  713. synth_ai/environments/stateful/core.py +0 -163
  714. synth_ai/environments/stateful/engine.py +0 -21
  715. synth_ai/environments/stateful/state.py +0 -7
  716. synth_ai/environments/tasks/api.py +0 -19
  717. synth_ai/environments/tasks/core.py +0 -81
  718. synth_ai/environments/tasks/filters.py +0 -40
  719. synth_ai/environments/tasks/utils.py +0 -90
  720. synth_ai/environments/v0_observability/history.py +0 -3
  721. synth_ai/environments/v0_observability/log.py +0 -2
  722. synth_ai/evals/base.py +0 -13
  723. synth_ai/handshake.py +0 -109
  724. synth_ai/http.py +0 -26
  725. synth_ai/http_client.py +0 -136
  726. synth_ai/inference/__init__.py +0 -5
  727. synth_ai/inference/client.py +0 -34
  728. synth_ai/jobs/client.py +0 -271
  729. synth_ai/learning/__init__.py +0 -59
  730. synth_ai/learning/client.py +0 -241
  731. synth_ai/learning/ft_client.py +0 -7
  732. synth_ai/learning/health.py +0 -49
  733. synth_ai/learning/jobs.py +0 -201
  734. synth_ai/learning/rl/client.py +0 -267
  735. synth_ai/learning/rl/contracts.py +0 -27
  736. synth_ai/learning/rl/env_keys.py +0 -166
  737. synth_ai/learning/rl/secrets.py +0 -13
  738. synth_ai/learning/sft/client.py +0 -68
  739. synth_ai/learning/sft/config.py +0 -270
  740. synth_ai/learning/sft/data.py +0 -295
  741. synth_ai/learning/validators.py +0 -49
  742. synth_ai/lm/__init__.py +0 -25
  743. synth_ai/main.py +0 -6
  744. synth_ai/task/__init__.py +0 -102
  745. synth_ai/task/apps/__init__.py +0 -128
  746. synth_ai/task/contracts.py +0 -137
  747. synth_ai/task/datasets.py +0 -108
  748. synth_ai/task/proxy.py +0 -259
  749. synth_ai/task/server.py +0 -424
  750. synth_ai/task/tracing_utils.py +0 -84
  751. synth_ai/task/validators.py +0 -11
  752. synth_ai/tracing_v3/__init__.py +0 -97
  753. synth_ai/tracing_v3/config.py +0 -84
  754. synth_ai/tracing_v3/db_config.py +0 -194
  755. synth_ai/tracing_v3/decorators.py +0 -369
  756. synth_ai/tracing_v3/examples/basic_usage.py +0 -189
  757. synth_ai/tracing_v3/llm_call_record_helpers.py +0 -337
  758. synth_ai/tracing_v3/migration_helper.py +0 -120
  759. synth_ai/tracing_v3/replica_sync.py +0 -258
  760. synth_ai/tracing_v3/session_tracer.py +0 -530
  761. synth_ai/tracing_v3/storage/base.py +0 -210
  762. synth_ai/tracing_v3/storage/config.py +0 -75
  763. synth_ai/tracing_v3/storage/factory.py +0 -39
  764. synth_ai/tracing_v3/storage/utils.py +0 -204
  765. synth_ai/tracing_v3/turso/daemon.py +0 -149
  766. synth_ai/tracing_v3/turso/models.py +0 -469
  767. synth_ai/tracing_v3/turso/native_manager.py +0 -1173
  768. synth_ai/tracing_v3/utils.py +0 -108
  769. synth_ai/v0/api/__init__.py +0 -8
  770. synth_ai/v0/api/models/__init__.py +0 -8
  771. synth_ai/v0/api/models/supported.py +0 -8
  772. synth_ai/v0/config/__init__.py +0 -15
  773. synth_ai/v0/config/base_url.py +0 -12
  774. synth_ai/v0/lm/__init__.py +0 -51
  775. synth_ai/v0/lm/caching/constants.py +0 -6
  776. synth_ai/v0/lm/caching/dbs.py +0 -0
  777. synth_ai/v0/lm/caching/ephemeral.py +0 -100
  778. synth_ai/v0/lm/caching/handler.py +0 -137
  779. synth_ai/v0/lm/caching/initialize.py +0 -11
  780. synth_ai/v0/lm/caching/persistent.py +0 -114
  781. synth_ai/v0/lm/config.py +0 -115
  782. synth_ai/v0/lm/constants.py +0 -32
  783. synth_ai/v0/lm/core/__init__.py +0 -8
  784. synth_ai/v0/lm/core/all.py +0 -73
  785. synth_ai/v0/lm/core/exceptions.py +0 -5
  786. synth_ai/v0/lm/core/main.py +0 -331
  787. synth_ai/v0/lm/core/main_v3.py +0 -594
  788. synth_ai/v0/lm/core/synth_models.py +0 -35
  789. synth_ai/v0/lm/core/vendor_clients.py +0 -190
  790. synth_ai/v0/lm/cost/__init__.py +0 -0
  791. synth_ai/v0/lm/cost/monitor.py +0 -1
  792. synth_ai/v0/lm/cost/statefulness.py +0 -1
  793. synth_ai/v0/lm/injection.py +0 -80
  794. synth_ai/v0/lm/overrides.py +0 -206
  795. synth_ai/v0/lm/provider_support/__init__.py +0 -8
  796. synth_ai/v0/lm/provider_support/anthropic.py +0 -972
  797. synth_ai/v0/lm/provider_support/openai.py +0 -1139
  798. synth_ai/v0/lm/provider_support/suppress_logging.py +0 -31
  799. synth_ai/v0/lm/structured_outputs/__init__.py +0 -0
  800. synth_ai/v0/lm/structured_outputs/handler.py +0 -440
  801. synth_ai/v0/lm/structured_outputs/inject.py +0 -297
  802. synth_ai/v0/lm/structured_outputs/rehabilitate.py +0 -185
  803. synth_ai/v0/lm/tools/__init__.py +0 -3
  804. synth_ai/v0/lm/tools/base.py +0 -172
  805. synth_ai/v0/lm/unified_interface.py +0 -202
  806. synth_ai/v0/lm/vendors/__init__.py +0 -0
  807. synth_ai/v0/lm/vendors/base.py +0 -81
  808. synth_ai/v0/lm/vendors/core/__init__.py +0 -0
  809. synth_ai/v0/lm/vendors/core/anthropic_api.py +0 -387
  810. synth_ai/v0/lm/vendors/core/gemini_api.py +0 -292
  811. synth_ai/v0/lm/vendors/core/mistral_api.py +0 -322
  812. synth_ai/v0/lm/vendors/core/openai_api.py +0 -227
  813. synth_ai/v0/lm/vendors/core/synth_dev_api.py +0 -0
  814. synth_ai/v0/lm/vendors/local/__init__.py +0 -0
  815. synth_ai/v0/lm/vendors/local/ollama.py +0 -0
  816. synth_ai/v0/lm/vendors/openai_standard.py +0 -782
  817. synth_ai/v0/lm/vendors/openai_standard_responses.py +0 -259
  818. synth_ai/v0/lm/vendors/retries.py +0 -22
  819. synth_ai/v0/lm/vendors/supported/__init__.py +0 -0
  820. synth_ai/v0/lm/vendors/supported/custom_endpoint.py +0 -415
  821. synth_ai/v0/lm/vendors/supported/deepseek.py +0 -69
  822. synth_ai/v0/lm/vendors/supported/grok.py +0 -75
  823. synth_ai/v0/lm/vendors/supported/groq.py +0 -16
  824. synth_ai/v0/lm/vendors/supported/ollama.py +0 -15
  825. synth_ai/v0/lm/vendors/supported/openrouter.py +0 -74
  826. synth_ai/v0/lm/vendors/supported/together.py +0 -11
  827. synth_ai/v0/lm/vendors/synth_client.py +0 -835
  828. synth_ai/v0/lm/warmup.py +0 -186
  829. synth_ai/v0/tracing/__init__.py +0 -0
  830. synth_ai/v0/tracing/abstractions.py +0 -224
  831. synth_ai/v0/tracing/base_client.py +0 -91
  832. synth_ai/v0/tracing/client_manager.py +0 -131
  833. synth_ai/v0/tracing/config.py +0 -142
  834. synth_ai/v0/tracing/context.py +0 -146
  835. synth_ai/v0/tracing/decorators.py +0 -682
  836. synth_ai/v0/tracing/events/__init__.py +0 -0
  837. synth_ai/v0/tracing/events/manage.py +0 -147
  838. synth_ai/v0/tracing/events/scope.py +0 -86
  839. synth_ai/v0/tracing/events/store.py +0 -228
  840. synth_ai/v0/tracing/immediate_client.py +0 -151
  841. synth_ai/v0/tracing/local.py +0 -18
  842. synth_ai/v0/tracing/log_client_base.py +0 -73
  843. synth_ai/v0/tracing/retry_queue.py +0 -186
  844. synth_ai/v0/tracing/trackers.py +0 -515
  845. synth_ai/v0/tracing/upload.py +0 -409
  846. synth_ai/v0/tracing/utils.py +0 -9
  847. synth_ai/v0/tracing_v1/__init__.py +0 -16
  848. synth_ai/v0/tracing_v1/abstractions.py +0 -224
  849. synth_ai/v0/tracing_v1/base_client.py +0 -91
  850. synth_ai/v0/tracing_v1/client_manager.py +0 -131
  851. synth_ai/v0/tracing_v1/config.py +0 -142
  852. synth_ai/v0/tracing_v1/context.py +0 -146
  853. synth_ai/v0/tracing_v1/decorators.py +0 -703
  854. synth_ai/v0/tracing_v1/events/__init__.py +0 -0
  855. synth_ai/v0/tracing_v1/events/manage.py +0 -147
  856. synth_ai/v0/tracing_v1/events/scope.py +0 -86
  857. synth_ai/v0/tracing_v1/events/store.py +0 -228
  858. synth_ai/v0/tracing_v1/immediate_client.py +0 -151
  859. synth_ai/v0/tracing_v1/local.py +0 -18
  860. synth_ai/v0/tracing_v1/log_client_base.py +0 -73
  861. synth_ai/v0/tracing_v1/retry_queue.py +0 -186
  862. synth_ai/v0/tracing_v1/trackers.py +0 -515
  863. synth_ai/v0/tracing_v1/upload.py +0 -527
  864. synth_ai/v0/tracing_v1/utils.py +0 -9
  865. synth_ai/v0/tracing_v3/__init__.py +0 -10
  866. synth_ai/v0/tracing_v3/abstractions.py +0 -3
  867. synth_ai/v0/tracing_v3/decorators.py +0 -3
  868. synth_ai/v0/tracing_v3/llm_call_record_helpers.py +0 -3
  869. synth_ai/v0/tracing_v3/session_tracer.py +0 -3
  870. synth_ai-0.2.9.dev11.dist-info/METADATA +0 -191
  871. synth_ai-0.2.9.dev11.dist-info/RECORD +0 -571
  872. synth_ai-0.2.9.dev11.dist-info/entry_points.txt +0 -3
  873. synth_ai-0.2.9.dev11.dist-info/top_level.txt +0 -3
  874. /synth_ai/{demos/demo_task_apps → cli/demo_apps}/crafter/__init__.py +0 -0
  875. /synth_ai/{demos → cli/demo_apps}/demo_task_apps/__init__.py +0 -0
  876. /synth_ai/{demos → cli/demo_apps}/demo_task_apps/crafter/configs/crafter_fft_4b.toml +0 -0
  877. /synth_ai/{demos → cli/demo_apps}/demo_task_apps/crafter/configs/rl_from_base_qwen4b.toml +0 -0
  878. /synth_ai/{demos → cli/demo_apps}/demo_task_apps/math/__init__.py +0 -0
  879. /synth_ai/{demos → cli/demo_apps}/demo_task_apps/math/_common.py +0 -0
  880. /synth_ai/{demos → cli/demo_apps}/demo_task_apps/math/app.py +0 -0
  881. /synth_ai/{demos → cli/demo_apps}/demo_task_apps/math/config.toml +0 -0
  882. /synth_ai/{demos → cli/demo_apps}/demo_task_apps/math/deploy_modal.py +0 -0
  883. /synth_ai/{v0/lm/caching → core/apps}/__init__.py +0 -0
  884. /synth_ai/{tracing_v3 → core/tracing_v3}/abstractions.py +0 -0
  885. /synth_ai/{tracing_v3 → core/tracing_v3}/hooks.py +0 -0
  886. /synth_ai/{tracing_v3 → core/tracing_v3}/lm_call_record_abstractions.py +0 -0
  887. /synth_ai/{tracing_v3 → core/tracing_v3}/storage/__init__.py +0 -0
  888. /synth_ai/{tracing_v3 → core/tracing_v3}/storage/exceptions.py +0 -0
  889. /synth_ai/{tracing_v3 → core/tracing_v3}/storage/types.py +0 -0
  890. /synth_ai/{compound/cais.py → py.typed} +0 -0
  891. /synth_ai/{learning → sdk/learning}/algorithms.py +0 -0
  892. /synth_ai/{learning → sdk/learning}/config.py +0 -0
  893. /synth_ai/{learning → sdk/learning}/constants.py +0 -0
  894. /synth_ai/{learning → sdk/learning}/core.py +0 -0
  895. /synth_ai/{learning → sdk/learning}/gateway.py +0 -0
  896. /synth_ai/{learning → sdk/learning}/rl/__init__.py +0 -0
  897. /synth_ai/{learning → sdk/learning}/rl/config.py +0 -0
  898. /synth_ai/{learning → sdk/learning}/rl_client.py +0 -0
  899. /synth_ai/{learning → sdk/learning}/sft/__init__.py +0 -0
  900. /synth_ai/{learning → sdk/learning}/sse.py +0 -0
  901. /synth_ai/{task → sdk/task}/auth.py +0 -0
  902. /synth_ai/{task → sdk/task}/client.py +0 -0
  903. /synth_ai/{task → sdk/task}/errors.py +0 -0
  904. /synth_ai/{task → sdk/task}/health.py +0 -0
  905. /synth_ai/{task → sdk/task}/json.py +0 -0
  906. /synth_ai/{task → sdk/task}/rubrics.py +0 -0
  907. /synth_ai/{task → sdk/task}/vendors.py +0 -0
  908. {synth_ai-0.2.9.dev11.dist-info → synth_ai-0.4.1.dist-info}/WHEEL +0 -0
  909. {synth_ai-0.2.9.dev11.dist-info → synth_ai-0.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,858 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Comprehensive script to run Crafter rollouts for multiple models and compare their performance.
4
- Updated to use tracing_v3 with async architecture.
5
-
6
- Runs experiments for:
7
- - gpt-4o-mini
8
- - gpt-4.1-mini
9
- - gpt-4.1-nano
10
- - gemini-1.5-flash
11
- - gemini-2.5-flash-lite
12
- - qwen3/32b
13
-
14
- Analyzes and compares:
15
- - Invalid action rates
16
- - Achievement frequencies by step
17
- - Achievement counts across models
18
- - Performance metrics
19
- - Cost analysis
20
- """
21
-
22
- import argparse
23
- import asyncio
24
- import json
25
- import logging
26
- import os
27
- import sys
28
- import time
29
- from collections import defaultdict
30
- from datetime import datetime
31
- from pathlib import Path
32
- from typing import Any
33
- from uuid import uuid4
34
-
35
- import numpy as np
36
- import pandas as pd
37
- from tqdm import tqdm
38
- from tqdm.asyncio import tqdm_asyncio as atqdm
39
-
40
- # Disable httpx logging for cleaner output
41
- logging.getLogger("httpx").setLevel(logging.WARNING)
42
-
43
- # Add parent directory to path for imports
44
- sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent.parent.parent))
45
-
46
- # Disable v1 logging to see v3 tracing clearly
47
- os.environ["LANGFUSE_ENABLED"] = "false"
48
- os.environ["SYNTH_LOGGING"] = "false"
49
-
50
- # Import enhanced LM with v3 tracing
51
- from synth_ai.lm.core.main_v3 import LM
52
- from synth_ai.tracing_v3.abstractions import (
53
- EnvironmentEvent,
54
- RuntimeEvent,
55
- SessionEventMarkovBlanketMessage,
56
- TimeRecord,
57
- )
58
- from synth_ai.tracing_v3.decorators import set_turn_number
59
-
60
- # Import session tracer for v3 tracing
61
- from synth_ai.tracing_v3.session_tracer import SessionTracer
62
-
63
- # from synth_ai.tracing_v3.utils import create_experiment_context # Not needed
64
- from synth_ai.tracing_v3.turso.manager import AsyncSQLTraceManager
65
-
66
- # Import Crafter hooks
67
- try:
68
- from synth_ai.environments.examples.crafter_classic.trace_hooks_v3 import CRAFTER_HOOKS
69
- print(f"✅ Loaded {len(CRAFTER_HOOKS.hooks)} Crafter achievement hooks (Easy, Medium, Hard)")
70
- except ImportError:
71
- print("Warning: Could not import CRAFTER_HOOKS for v3")
72
- from synth_ai.tracing_v3.hooks import HookManager
73
- CRAFTER_HOOKS = HookManager()
74
-
75
- import random
76
-
77
- import httpx
78
-
79
- # Global buckets for sessions
80
- _SESSIONS: dict[str, tuple[str, object]] = {} # session_id -> (experiment_id, trace)
81
-
82
- # Configuration
83
- MODELS_TO_TEST = [
84
- "gpt-4o-mini",
85
- "gpt-4.1-mini",
86
- ]
87
-
88
- # Service URLs (modify these based on your setup)
89
- CRAFTER_SERVICE_URL = "http://localhost:8901"
90
-
91
- # Database configuration - uses the centralized config which matches serve.sh
92
- from synth_ai.tracing_v3.db_config import get_default_db_config
93
-
94
- db_config = get_default_db_config()
95
- DATABASE_URL = db_config.database_url
96
-
97
- # Retry configuration for HTTP requests
98
- MAX_RETRIES = 3
99
- BASE_DELAY = 0.1
100
- MAX_DELAY = 2.0
101
- HTTP_TIMEOUT = 30.0
102
-
103
- class ExperimentConfig:
104
- """Configuration for the multi-model experiment."""
105
-
106
- def __init__(self):
107
- self.num_episodes = 10 # Number of episodes per model
108
- self.max_turns = 100 # Max turns per episode
109
- self.difficulty = "easy"
110
- self.save_traces = True
111
- self.verbose = True
112
- self.quiet = False # Default to verbose mode
113
- self.enable_v3_tracing = True
114
- self.v3_trace_dir = "./traces"
115
- self.crafter_service_url = CRAFTER_SERVICE_URL
116
- self.database_url = DATABASE_URL
117
- self.base_seed = 1000 # Base seed for episode generation
118
- self.turn_timeout = 30.0 # Timeout per turn in seconds
119
- self.episode_timeout = 300.0 # Total timeout per episode in seconds
120
-
121
-
122
- async def retry_http_request(client: httpx.AsyncClient, method: str, url: str, **kwargs) -> Any:
123
- """Retry HTTP requests with exponential backoff and jitter."""
124
- last_exception = None
125
-
126
- for attempt in range(MAX_RETRIES):
127
- try:
128
- if attempt > 0:
129
- delay = min(BASE_DELAY * (2 ** (attempt - 1)), MAX_DELAY)
130
- jitter = random.uniform(0, 0.1 * delay)
131
- total_delay = delay + jitter
132
- await asyncio.sleep(total_delay)
133
-
134
- response = await client.request(method, url, timeout=HTTP_TIMEOUT, **kwargs)
135
-
136
- if response.status_code < 500:
137
- return response
138
-
139
- last_exception = Exception(f"HTTP {response.status_code}: {response.text}")
140
-
141
- except httpx.ConnectError as e:
142
- last_exception = Exception(f"Connection failed to {url}: {e}")
143
- if attempt < MAX_RETRIES - 1:
144
- await asyncio.sleep(1.0 * (2 ** attempt))
145
- except httpx.ReadError as e:
146
- last_exception = e
147
- if attempt < MAX_RETRIES - 1:
148
- read_error_delay = min(1.0 * (2 ** attempt), 5.0)
149
- await asyncio.sleep(read_error_delay)
150
- except Exception as e:
151
- last_exception = e
152
-
153
- print(f" ❌ HTTP request failed after {MAX_RETRIES} attempts: {method} {url}")
154
- print(f" ❌ Error: {type(last_exception).__name__}: {str(last_exception)[:200]}")
155
- raise last_exception
156
-
157
-
158
- # Crafter action mapping
159
- CRAFTER_ACTIONS = {
160
- "noop": 0, "move_left": 1, "move_right": 2, "move_up": 3, "move_down": 4,
161
- "do": 5, "sleep": 6, "place_stone": 7, "place_table": 8, "place_furnace": 9,
162
- "place_plant": 10, "make_wood_pickaxe": 11, "make_stone_pickaxe": 12,
163
- "make_iron_pickaxe": 13, "make_wood_sword": 14, "make_stone_sword": 15,
164
- "make_iron_sword": 16, "eat_cow": 17, "eat_plant": 18
165
- }
166
-
167
- # Create reverse mapping for validation
168
- INT_TO_ACTION_STRING = {v: k for k, v in CRAFTER_ACTIONS.items()}
169
-
170
-
171
- def compress_observation_for_trace(obs: dict[str, Any]) -> str:
172
- """Compress observation data for storage in traces."""
173
- try:
174
- return json.dumps({
175
- "inv": {k: v for k, v in obs.get("inventory", {}).items() if v > 0},
176
- "nearby": obs.get("nearby", []),
177
- "hp": obs.get("status", {}).get("health", 0),
178
- "food": obs.get("status", {}).get("food", 0),
179
- "ach": sum(1 for v in obs.get("achievements_status", {}).values() if v)
180
- }, separators=(',', ':'))
181
- except Exception as e:
182
- return f"{{\"error\": \"{str(e)}\"}}"
183
-
184
-
185
- def create_message(content: str, message_type: str, system_id: str, turn: int) -> SessionEventMarkovBlanketMessage:
186
- """Create a SessionEventMarkovBlanketMessage with metadata."""
187
- return SessionEventMarkovBlanketMessage(
188
- content=content,
189
- message_type=message_type,
190
- metadata={"system_id": system_id, "turn": turn},
191
- time_record=TimeRecord(
192
- event_time=time.time(),
193
- message_time=turn
194
- )
195
- )
196
-
197
-
198
# Symbols used when rendering the semantic map around the player.
_MAP_SYMBOLS = {0: ".", 1: "T", 2: "S", 3: "C", 4: "W"}


def _render_local_map(semantic_map: Any, position: Any, view_size: int = 5) -> str:
    """Render a square text view of the semantic map centred on the player."""
    if semantic_map is None:
        return ""
    try:
        px, py = position
        half = view_size // 2
        map_lines = []
        for dy in range(-half, half + 1):
            row = []
            for dx in range(-half, half + 1):
                x, y = px + dx, py + dy
                if dx == 0 and dy == 0:
                    row.append("@")  # Player
                elif 0 <= x < len(semantic_map) and 0 <= y < len(semantic_map[0]):
                    row.append(_MAP_SYMBOLS.get(semantic_map[x][y], "?"))
                else:
                    row.append("#")  # Out of bounds
            map_lines.append(" ".join(row))
        return f"\nMap ({view_size}x{view_size} view, @ = you):\n" + "\n".join(map_lines)
    except Exception:
        # The map is optional context; never fail a turn because of it.
        return "\nMap view unavailable"


def _format_turn_prompt(obs: dict[str, Any], turn: int) -> str:
    """Build the per-turn user prompt describing the current game state."""
    inventory_str = ", ".join(f"{k}: {v}" for k, v in obs.get("inventory", {}).items() if v > 0) or "empty"
    nearby_str = ", ".join(obs.get("nearby", [])) or "nothing"

    status = obs.get("status", {})
    health = status.get("health", 0)
    hunger = status.get("food", 0)

    position = obs.get("position", [0, 0])
    unlocked = [name for name, flag in obs.get("achievements_status", {}).items() if flag]
    achievements_str = ", ".join(unlocked) if unlocked else "none"
    map_str = _render_local_map(obs.get("semantic_map", None), position)

    return f"""Game State (Turn {turn}):
- Position: {position}
- Health: {health}/9
- Hunger: {hunger}/9
- Inventory: {inventory_str}
- Nearby objects: {nearby_str}
- Achievements unlocked: {achievements_str}
{map_str}

Choose your next actions based on what you see. Use the 'interact' tool with a list of action IDs.

Tips:
- Look at the map! T=tree (wood), S=stone, C=cow (food), W=water
- To collect resources: move to them (actions 1-4) then use action 5 (do)
- To craft: place table (8) first, then craft tools (11-16)
- If hungry and see cow (C), move to it and eat (17)

What actions do you want to take?"""


def _build_system_message() -> str:
    """Build the system prompt listing every Crafter action ID."""
    action_list = "\n".join(f"{action_id}: {action}" for action, action_id in CRAFTER_ACTIONS.items())
    return f"""You are an agent playing Crafter, a 2D survival game. Your goal is to survive and unlock achievements.

You MUST use the 'interact' tool to execute actions. The tool takes a list of action IDs.

Action ID mapping:
{action_list}

Strategy tips:
- Start by collecting wood (move to trees and use action 5)
- Place a crafting table (action 8) to unlock crafting recipes
- Craft tools to collect resources more efficiently
- Eat when hungry, sleep when tired
- Explore to find different resources

IMPORTANT: Always use the 'interact' tool with a list of action IDs. For example: interact(actions=[2, 2, 5]) to move right twice and collect."""


async def _post_interact_step(client: Any, config: Any, instance_id: Any, action_id: int) -> Any:
    """POST one interact action to the Crafter service with a 5 second timeout."""
    return await asyncio.wait_for(
        retry_http_request(
            client, "POST", f"{config.crafter_service_url}/env/CrafterClassic/step",
            json={
                "env_id": instance_id,
                "action": {
                    "tool_calls": [
                        {"tool": "interact", "args": {"action": action_id}}
                    ]
                },
            },
        ),
        timeout=5.0,
    )


async def run_episode(config: ExperimentConfig,
                      model_name: str,
                      episode_num: int,
                      experiment_id: str) -> dict[str, Any]:
    """Run a single episode with a specific model using v3 tracing.

    Args:
        config: Experiment settings (service URL, seeds, turn and timeout
            limits, database URL, trace saving flag). If it carries a
            ``_pbar`` attribute, that progress bar is updated as the episode
            advances.
        model_name: Model identifier passed to the LM.
        episode_num: Episode index; added to ``config.base_seed`` to derive
            the environment seed.
        experiment_id: Identifier stored in the session metadata.

    Returns:
        A result dict with the keys ``model``, ``episode``,
        ``total_achievements``, ``achievements``, ``invalid_action_rate``,
        ``total_actions``, ``invalid_actions`` and ``session_id``. When the
        episode fails, the same keys are returned with zeroed statistics plus
        an ``error`` key holding the exception message.
    """
    # Each episode gets its own tracer/session.
    session_tracer = SessionTracer(hooks=CRAFTER_HOOKS, db_url=config.database_url)
    session_id = await session_tracer.start_session(
        metadata={
            "model": model_name,
            "episode": episode_num,
            "experiment_id": experiment_id,
            "difficulty": config.difficulty,
        }
    )

    # Store session in global bucket
    _SESSIONS[session_id] = (experiment_id, session_tracer)

    lm = LM(
        vendor="openai",
        model=model_name,
        temperature=0.1,  # Low temperature for more consistent gameplay
        session_tracer=session_tracer,
        system_id=f"crafter_agent_{model_name}",
        enable_v3_tracing=True,
    )

    async with httpx.AsyncClient() as client:
        try:
            # The tool definition and system prompt do not depend on the turn,
            # so build them once per episode.
            from pydantic import BaseModel, Field
            from synth_ai.lm.tools.base import BaseTool

            class InteractArgs(BaseModel):
                actions: list[int] = Field(..., description="List of action IDs to execute")

            interact_tool = BaseTool(
                name="interact",
                arguments=InteractArgs,
                description="Execute actions in the Crafter game",
            )
            system_message = _build_system_message()

            # Consecutive seeds: base seed + episode number.
            seed = config.base_seed + episode_num
            init_response = await retry_http_request(
                client, "POST", f"{config.crafter_service_url}/env/CrafterClassic/initialize",
                json={"config": {"difficulty": config.difficulty, "seed": seed}},
            )
            init_data = init_response.json()

            # The service has used different keys for the environment ID.
            for id_key in ("instance_id", "env_id", "id"):
                if id_key in init_data:
                    instance_id = init_data[id_key]
                    break
            else:
                print(f"❌ Unexpected response format from Crafter service: {init_data}")
                raise KeyError(f"Could not find environment ID in response. Available keys: {list(init_data.keys())}")

            env_system_id = f"crafter_env_{instance_id}"
            obs = init_data["observation"]
            done = False
            invalid_actions = 0
            total_actions = 0
            episode_start_time = time.time()
            turn = -1  # keeps the post-loop progress accounting defined when max_turns == 0

            for turn in range(config.max_turns):
                if done:
                    break

                if time.time() - episode_start_time > config.episode_timeout:
                    print(f" ⏰ Episode {episode_num} timed out after {config.episode_timeout}s")
                    done = True
                    break

                if hasattr(config, '_pbar'):
                    current_achievements = sum(1 for v in obs.get("achievements_status", {}).values() if v)
                    config._pbar.set_postfix({
                        f"ep{episode_num}": f"step {turn+1}/{config.max_turns}, ach: {current_achievements}"
                    })

                set_turn_number(turn)
                await session_tracer.start_timestep(f"turn_{turn}")

                prompt = _format_turn_prompt(obs, turn)

                # Record the (compressed) observation as a message.
                obs_msg = create_message(
                    f"Observation: {compress_observation_for_trace(obs)}",
                    "system",
                    env_system_id,
                    turn,
                )
                await session_tracer.record_message(
                    content=obs_msg.content,
                    message_type=obs_msg.message_type,
                    event_time=obs_msg.time_record.event_time,
                    message_time=obs_msg.time_record.message_time,
                    metadata=obs_msg.metadata,
                )

                try:
                    try:
                        action_response = await asyncio.wait_for(
                            lm.respond_async(
                                system_message=system_message,
                                user_message=prompt,
                                tools=[interact_tool],
                                turn_number=turn,
                            ),
                            timeout=config.turn_timeout,
                        )
                    except asyncio.TimeoutError:
                        print(f" ⏰ Turn {turn} timed out for episode {episode_num} after {config.turn_timeout}s")
                        done = True
                        break

                    tool_calls = getattr(action_response, 'tool_calls', None)
                    if tool_calls:
                        for tool_call in tool_calls:
                            if done:
                                # Do not keep stepping once the episode has ended.
                                break
                            if tool_call.get('function', {}).get('name') != 'interact':
                                continue

                            args = json.loads(tool_call.get('function', {}).get('arguments', '{}'))
                            # An empty action list is treated as a single noop.
                            actions = args.get('actions', []) or [0]

                            for action_id in actions:
                                total_actions += 1

                                # Unknown action IDs are replaced by noop and counted.
                                is_valid = action_id in INT_TO_ACTION_STRING
                                if not is_valid:
                                    action_id = 0
                                    invalid_actions += 1

                                try:
                                    step_response = await _post_interact_step(client, config, instance_id, action_id)
                                except asyncio.TimeoutError:
                                    print(f" ⏰ Action execution timed out in episode {episode_num}")
                                    done = True
                                    break

                                if step_response.status_code != 200:
                                    print(f" ❌ Step failed: {step_response.status_code} - {step_response.text}")
                                    done = True
                                    break

                                step_data = step_response.json()
                                new_obs = step_data["observation"]
                                reward = step_data["reward"]
                                done = step_data["done"]

                                action_name = INT_TO_ACTION_STRING.get(action_id, "unknown")
                                runtime_event = RuntimeEvent(
                                    system_instance_id=env_system_id,
                                    time_record=TimeRecord(
                                        event_time=time.time(),
                                        message_time=turn,
                                    ),
                                    actions=[action_id],
                                    metadata={
                                        "action_name": action_name,
                                        # Validity of this action only, not of the episode so far.
                                        "valid": is_valid,
                                    },
                                )
                                await session_tracer.record_event(runtime_event)

                                env_event = EnvironmentEvent(
                                    system_instance_id=env_system_id,
                                    time_record=TimeRecord(
                                        event_time=time.time(),
                                        message_time=turn,
                                    ),
                                    reward=reward,
                                    terminated=done,
                                    # `obs` is the observation the action was taken from.
                                    system_state_before={"observation": obs},
                                    system_state_after={
                                        "observation": new_obs,
                                        "public_state": {"achievements_status": new_obs.get("achievements_status", {})},
                                    },
                                )
                                await session_tracer.record_event(env_event)

                                obs = new_obs

                                if done:
                                    break

                                if hasattr(config, '_pbar'):
                                    config._pbar.update(1)
                    else:
                        # No tool calls provided: step with a noop and count it as invalid.
                        total_actions += 1
                        invalid_actions += 1

                        try:
                            step_response = await _post_interact_step(client, config, instance_id, 0)
                        except asyncio.TimeoutError:
                            print(f" ⏰ Noop action timed out in episode {episode_num}")
                            done = True
                            break

                        if step_response.status_code != 200:
                            print(f" ❌ Step failed: {step_response.status_code} - {step_response.text}")
                            done = True
                        else:
                            step_data = step_response.json()
                            obs = step_data["observation"]
                            done = step_data["done"]

                    await session_tracer.end_timestep(f"turn_{turn}")

                except Exception as e:
                    print(f" ❌ Environment step error: {e}")
                    done = True

            # Account for the turns that were never played if the episode ended early.
            if hasattr(config, '_pbar') and turn < config.max_turns - 1:
                config._pbar.update(config.max_turns - turn - 1)

            invalid_rate = invalid_actions / total_actions if total_actions > 0 else 0

            final_achievements = obs.get("achievements_status", {})
            total_achievements = sum(1 for v in final_achievements.values() if v)

            # Terminating the environment is best-effort.
            try:
                await retry_http_request(
                    client, "POST", f"{config.crafter_service_url}/env/CrafterClassic/terminate",
                    json={"env_id": instance_id},
                )
            except Exception as e:
                print(f" ⚠️ Failed to terminate environment: {e}")

            await session_tracer.end_session(save=config.save_traces)
            await session_tracer.close()

            return {
                "model": model_name,
                "episode": episode_num,
                "total_achievements": total_achievements,
                "achievements": final_achievements,
                "invalid_action_rate": invalid_rate,
                "total_actions": total_actions,
                "invalid_actions": invalid_actions,
                "session_id": session_id,
            }

        except Exception as e:
            print(f" ❌ Episode failed: {e}")
            import traceback
            traceback.print_exc()

            # End the session even if the episode failed.
            await session_tracer.end_session(save=config.save_traces)
            await session_tracer.close()

            return {
                "model": model_name,
                "episode": episode_num,
                "total_achievements": 0,
                "achievements": {},
                "invalid_action_rate": 1.0,
                "total_actions": 0,
                "invalid_actions": 0,
                "session_id": session_id,
                "error": str(e),
            }
637
-
638
-
639
async def run_model_experiment(config: ExperimentConfig, model_name: str, experiment_id: str) -> list[dict[str, Any]]:
    """Run multiple episodes for a single model in parallel.

    Args:
        config: Experiment settings; ``num_episodes`` and ``max_turns`` size
            the progress bar, which is attached to the config as ``_pbar`` so
            episodes can update it.
        model_name: Model to evaluate.
        experiment_id: Identifier forwarded to every episode.

    Returns:
        One result dict per episode, in episode order.
    """
    print(f"\n🚀 Running {config.num_episodes} episodes for {model_name} in parallel...\n")

    # One progress bar covering all steps across all episodes.
    total_steps = config.num_episodes * config.max_turns
    pbar = atqdm(total=total_steps, desc=f"{model_name}", unit="steps", leave=True)
    config._pbar = pbar  # Store in config so episodes can update it

    try:
        # Each episode creates its own tracer; run them all concurrently.
        results = await asyncio.gather(
            *(run_episode(config, model_name, i, experiment_id) for i in range(config.num_episodes))
        )

        # Summary stats over the episodes that did not report an error.
        successful_results = [r for r in results if "error" not in r]
        if successful_results:
            n_ok = len(successful_results)
            avg_achievements = sum(r["total_achievements"] for r in successful_results) / n_ok
            avg_invalid_rate = sum(r["invalid_action_rate"] for r in successful_results) / n_ok
            pbar.set_postfix({
                "avg_achievements": f"{avg_achievements:.1f}",
                "avg_invalid_rate": f"{avg_invalid_rate:.1%}",
                "success_rate": f"{n_ok}/{len(results)}",
            })
    finally:
        pbar.close()

    return results
672
-
673
-
674
async def analyze_results(config: ExperimentConfig, all_results: dict[str, list[dict[str, Any]]]):
    """Analyze results across all models using v3 database.

    Prints a per-model performance summary, an achievement frequency table and
    the model usage statistics returned by the trace database, then writes the
    detailed results to a timestamped JSON file in the working directory.

    Args:
        config: Experiment settings; ``database_url`` selects the trace
            database and the episode/turn/difficulty settings are exported.
        all_results: Mapping of model name to that model's episode results.
            Results containing an ``error`` key are excluded from statistics.
    """
    print("\n📊 Analysis Results:")
    print("=" * 80)

    db_manager = AsyncSQLTraceManager(config.database_url)
    await db_manager.initialize()

    try:
        # Episodes that completed without error, computed once per model.
        valid_by_model = {
            model: [r for r in results if "error" not in r]
            for model, results in all_results.items()
        }

        # Basic statistics by model
        model_stats = {}
        for model, valid_results in valid_by_model.items():
            if not valid_results:
                continue
            achievements = [r["total_achievements"] for r in valid_results]
            invalid_rates = [r["invalid_action_rate"] for r in valid_results]
            model_stats[model] = {
                # Plain floats keep the JSON export independent of NumPy scalar types.
                "avg_achievements": float(np.mean(achievements)),
                "std_achievements": float(np.std(achievements)),
                "max_achievements": max(achievements),
                "avg_invalid_rate": float(np.mean(invalid_rates)),
                "success_rate": len(valid_results) / len(all_results[model]),
            }

        # Print model comparison
        print("\n📈 Model Performance Summary:")
        print(f"{'Model':<20} {'Avg Achievements':<18} {'Max Achievements':<18} {'Invalid Rate':<15} {'Success Rate':<15}")
        print("-" * 86)

        for model, stats in sorted(model_stats.items(), key=lambda x: x[1]["avg_achievements"], reverse=True):
            print(f"{model:<20} {stats['avg_achievements']:>6.2f} ± {stats['std_achievements']:>4.2f} "
                  f"{stats['max_achievements']:>16} {stats['avg_invalid_rate']:>12.2%} {stats['success_rate']:>12.2%}")

        # Achievement frequency analysis
        print("\n🏆 Achievement Frequencies:")
        achievement_counts = defaultdict(lambda: defaultdict(int))
        for model, valid_results in valid_by_model.items():
            for result in valid_results:
                for achievement, unlocked in result["achievements"].items():
                    if unlocked:
                        achievement_counts[model][achievement] += 1

        all_achievements = set()
        for model_achievements in achievement_counts.values():
            all_achievements.update(model_achievements)

        if all_achievements:
            sorted_models = sorted(all_results)
            cell_width = 15  # width of one " nnn/nnn (ppp%)" cell below
            print(f"{'Achievement':<25}" + "".join(f"{model[:8]:>{cell_width}}" for model in sorted_models))
            print("-" * (25 + cell_width * len(sorted_models)))

            for achievement in sorted(all_achievements):
                row = f"{achievement:<25}"
                for model in sorted_models:
                    count = achievement_counts[model].get(achievement, 0)
                    total = len(valid_by_model[model])
                    pct = (count / total * 100) if total > 0 else 0
                    row += f" {count:>3}/{total:<3} ({pct:>3.0f}%)"
                print(row)

        # Model usage from the database, restricted to this experiment's models.
        print("\n💰 Model Usage Statistics from Current Experiment:")
        model_usage_df = await db_manager.get_model_usage()

        if model_usage_df is not None and not model_usage_df.empty:
            experiment_models = set(all_results.keys())
            filtered_df = model_usage_df[model_usage_df['model_name'].isin(experiment_models)]

            if not filtered_df.empty:
                print(f"{'Model':<20} {'Provider':<10} {'Usage Count':<12} {'Avg Latency (ms)':<18} {'Total Cost':<12}")
                print("-" * 72)
                for _, row in filtered_df.iterrows():
                    avg_latency = row['avg_latency_ms']
                    if pd.notna(avg_latency):
                        print(f"{row['model_name']:<20} {row['provider'] or 'N/A':<10} {row['usage_count']:<12} "
                              f"{avg_latency:<18.2f} ${row['total_cost_usd']:<11.4f}")
                    else:
                        print(f"{row['model_name']:<20} {row['provider'] or 'N/A':<10} {row['usage_count']:<12} "
                              f"{'N/A':<18} ${row['total_cost_usd']:<11.4f}")

        # Export detailed results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = f"crafter_experiment_results_{timestamp}.json"

        with open(results_file, "w", encoding="utf-8") as f:
            json.dump({
                "config": {
                    "num_episodes": config.num_episodes,
                    "max_turns": config.max_turns,
                    "difficulty": config.difficulty,
                    "models": list(all_results.keys()),
                },
                "results": all_results,
                "statistics": model_stats,
                "timestamp": timestamp,
            }, f, indent=2)

        print(f"\n💾 Detailed results saved to: {results_file}")

    finally:
        await db_manager.close()
782
-
783
-
784
async def main():
    """Main entry point for the experiment.

    Parses the command line, fills an ExperimentConfig from it, checks that
    the Crafter service answers its health endpoint, runs every requested
    model in turn and finally analyzes the collected results. Returns early
    (after printing a message) if the service is unreachable or unhealthy.
    """
    parser = argparse.ArgumentParser(description="Run Crafter experiments with multiple models")
    parser.add_argument("--episodes", type=int, default=5, help="Number of episodes per model")
    parser.add_argument("--max-turns", type=int, default=100, help="Maximum turns per episode")
    parser.add_argument("--difficulty", choices=["easy", "medium", "hard"], default="easy", help="Game difficulty")
    parser.add_argument("--models", nargs="+", default=MODELS_TO_TEST, help="Models to test")
    parser.add_argument("--no-save", action="store_true", help="Don't save traces to database")
    parser.add_argument("--quiet", action="store_true", help="Reduce output verbosity")
    parser.add_argument("--db-url", default=DATABASE_URL, help="Database URL for tracing")
    parser.add_argument("--base-seed", type=int, default=1000, help="Base seed for episodes (episodes use base_seed+episode_num)")
    parser.add_argument("--turn-timeout", type=float, default=30.0, help="Timeout per turn in seconds")
    parser.add_argument("--episode-timeout", type=float, default=300.0, help="Total timeout per episode in seconds")

    args = parser.parse_args()

    # Copy the command-line options onto the experiment configuration.
    config = ExperimentConfig()
    config.num_episodes = args.episodes
    config.max_turns = args.max_turns
    config.difficulty = args.difficulty
    config.save_traces = not args.no_save
    config.verbose = not args.quiet
    config.quiet = args.quiet
    config.database_url = args.db_url
    config.base_seed = args.base_seed
    config.turn_timeout = args.turn_timeout
    config.episode_timeout = args.episode_timeout

    experiment_id = f"crafter_multi_model_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

    print("🎮 Crafter Multi-Model Experiment")
    print("=" * 50)
    print(f"Experiment ID: {experiment_id}")
    print(f"Models: {', '.join(args.models)}")
    print(f"Episodes per model: {config.num_episodes}")
    print(f"Max turns per episode: {config.max_turns}")
    print(f"Difficulty: {config.difficulty}")
    print(f"Seeds: {config.base_seed} to {config.base_seed + config.num_episodes - 1}")
    print(f"Turn timeout: {config.turn_timeout}s")
    print(f"Episode timeout: {config.episode_timeout}s")
    print(f"Save traces: {config.save_traces}")
    print(f"Database URL: {config.database_url}")
    print("=" * 50)

    # Refuse to start unless the Crafter service reports healthy.
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(f"{config.crafter_service_url}/health", timeout=5.0)
            if response.status_code != 200:
                print(f"❌ Crafter service not healthy at {config.crafter_service_url}")
                return
    except Exception as e:
        print(f"❌ Cannot connect to Crafter service at {config.crafter_service_url}: {e}")
        print("Please ensure the Crafter service is running.")
        return

    print("✅ Crafter service is running")

    # Models are evaluated one after another; episodes within a model run in parallel.
    all_results = {}
    for model in args.models:
        all_results[model] = await run_model_experiment(config, model, experiment_id)

    await analyze_results(config, all_results)

    print("\n✅ Experiment complete!")
855
-
856
-
857
# Script entry point: run the async experiment driver only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())