synth-ai 0.2.8.dev2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (740)
  1. synth_ai/__init__.py +44 -24
  2. synth_ai/__main__.py +30 -3
  3. synth_ai/cli/__init__.py +103 -48
  4. synth_ai/cli/__main__.py +42 -0
  5. synth_ai/cli/_internal/__init__.py +5 -0
  6. synth_ai/cli/_internal/modal_wrapper.py +31 -0
  7. synth_ai/cli/_internal/storage.py +20 -0
  8. synth_ai/cli/_internal/typer_patch.py +47 -0
  9. synth_ai/cli/_internal/validate_task_app.py +29 -0
  10. synth_ai/cli/agents/__init__.py +17 -0
  11. synth_ai/cli/agents/claude.py +77 -0
  12. synth_ai/cli/agents/codex.py +265 -0
  13. synth_ai/cli/agents/opencode.py +253 -0
  14. synth_ai/cli/commands/__init__.py +18 -0
  15. synth_ai/cli/commands/artifacts/__init__.py +13 -0
  16. synth_ai/cli/commands/artifacts/client.py +119 -0
  17. synth_ai/cli/commands/artifacts/config.py +57 -0
  18. synth_ai/cli/commands/artifacts/core.py +24 -0
  19. synth_ai/cli/commands/artifacts/download.py +188 -0
  20. synth_ai/cli/commands/artifacts/export.py +186 -0
  21. synth_ai/cli/commands/artifacts/list.py +156 -0
  22. synth_ai/cli/commands/artifacts/parsing.py +250 -0
  23. synth_ai/cli/commands/artifacts/show.py +336 -0
  24. synth_ai/cli/commands/demo/__init__.py +3 -0
  25. synth_ai/cli/commands/demo/core.py +153 -0
  26. synth_ai/cli/commands/eval/__init__.py +10 -0
  27. synth_ai/cli/commands/eval/config.py +338 -0
  28. synth_ai/cli/commands/eval/core.py +256 -0
  29. synth_ai/cli/commands/eval/runner.py +704 -0
  30. synth_ai/cli/commands/eval/validation.py +60 -0
  31. synth_ai/cli/commands/filter/__init__.py +12 -0
  32. synth_ai/cli/commands/filter/core.py +424 -0
  33. synth_ai/cli/commands/filter/errors.py +55 -0
  34. synth_ai/cli/commands/filter/validation.py +77 -0
  35. synth_ai/cli/commands/help/__init__.py +185 -0
  36. synth_ai/cli/commands/help/core.py +72 -0
  37. synth_ai/cli/commands/scan/__init__.py +19 -0
  38. synth_ai/cli/commands/scan/cloudflare_scanner.py +403 -0
  39. synth_ai/cli/commands/scan/core.py +344 -0
  40. synth_ai/cli/commands/scan/health_checker.py +242 -0
  41. synth_ai/cli/commands/scan/local_scanner.py +278 -0
  42. synth_ai/cli/commands/scan/models.py +83 -0
  43. synth_ai/cli/commands/smoke/__init__.py +7 -0
  44. synth_ai/cli/commands/smoke/core.py +1428 -0
  45. synth_ai/cli/commands/status/__init__.py +3 -0
  46. synth_ai/cli/commands/status/client.py +91 -0
  47. synth_ai/cli/commands/status/config.py +12 -0
  48. synth_ai/cli/commands/status/errors.py +11 -0
  49. synth_ai/cli/commands/status/subcommands/__init__.py +3 -0
  50. synth_ai/cli/commands/status/subcommands/config.py +13 -0
  51. synth_ai/cli/commands/status/subcommands/files.py +34 -0
  52. synth_ai/cli/commands/status/subcommands/jobs.py +51 -0
  53. synth_ai/cli/commands/status/subcommands/models.py +35 -0
  54. synth_ai/cli/commands/status/subcommands/runs.py +34 -0
  55. synth_ai/cli/commands/status/subcommands/session.py +77 -0
  56. synth_ai/cli/commands/status/subcommands/summary.py +39 -0
  57. synth_ai/cli/commands/status/subcommands/utils.py +41 -0
  58. synth_ai/cli/commands/status/utils.py +23 -0
  59. synth_ai/cli/commands/train/__init__.py +53 -0
  60. synth_ai/cli/commands/train/core.py +22 -0
  61. synth_ai/cli/commands/train/errors.py +117 -0
  62. synth_ai/cli/commands/train/judge_schemas.py +201 -0
  63. synth_ai/cli/commands/train/judge_validation.py +305 -0
  64. synth_ai/cli/commands/train/prompt_learning_validation.py +633 -0
  65. synth_ai/cli/commands/train/validation.py +392 -0
  66. synth_ai/cli/demo_apps/__init__.py +10 -0
  67. synth_ai/cli/demo_apps/core/__init__.py +28 -0
  68. synth_ai/{demos → cli/demo_apps}/core/cli.py +783 -441
  69. synth_ai/cli/demo_apps/crafter/__init__.py +1 -0
  70. synth_ai/cli/demo_apps/crafter/crafter_fft_4b.toml +55 -0
  71. synth_ai/cli/demo_apps/crafter/grpo_crafter_task_app.py +186 -0
  72. synth_ai/cli/demo_apps/crafter/rl_from_base_qwen4b.toml +74 -0
  73. synth_ai/cli/demo_apps/demo_registry.py +176 -0
  74. synth_ai/cli/demo_apps/demo_task_apps/__init__.py +7 -0
  75. synth_ai/{demos → cli/demo_apps}/demo_task_apps/core.py +75 -37
  76. synth_ai/cli/demo_apps/demo_task_apps/crafter/__init__.py +1 -0
  77. synth_ai/cli/demo_apps/demo_task_apps/crafter/configs/crafter_fft_4b.toml +53 -0
  78. synth_ai/cli/demo_apps/demo_task_apps/crafter/configs/rl_from_base_qwen4b.toml +73 -0
  79. synth_ai/cli/demo_apps/demo_task_apps/crafter/grpo_crafter_task_app.py +185 -0
  80. synth_ai/{demos → cli/demo_apps}/demo_task_apps/math/_common.py +1 -2
  81. synth_ai/{demos → cli/demo_apps}/demo_task_apps/math/app.py +2 -1
  82. synth_ai/cli/demo_apps/demo_task_apps/math/config.toml +73 -0
  83. synth_ai/{demos → cli/demo_apps}/demo_task_apps/math/deploy_modal.py +3 -6
  84. synth_ai/cli/demo_apps/demo_task_apps/math/modal_task_app.py +738 -0
  85. synth_ai/cli/demo_apps/demo_task_apps/math/task_app_entry.py +39 -0
  86. synth_ai/cli/demo_apps/math/__init__.py +1 -0
  87. synth_ai/cli/demo_apps/math/_common.py +16 -0
  88. synth_ai/cli/demo_apps/math/app.py +38 -0
  89. synth_ai/cli/demo_apps/math/config.toml +75 -0
  90. synth_ai/cli/demo_apps/math/deploy_modal.py +54 -0
  91. synth_ai/cli/demo_apps/math/modal_task_app.py +698 -0
  92. synth_ai/cli/demo_apps/math/task_app_entry.py +53 -0
  93. synth_ai/cli/demo_apps/mipro/main.py +271 -0
  94. synth_ai/cli/demo_apps/mipro/task_app.py +922 -0
  95. synth_ai/cli/demo_apps/mipro/train_cfg.toml +92 -0
  96. synth_ai/cli/demos/__init__.py +12 -0
  97. synth_ai/cli/demos/demo.py +32 -0
  98. synth_ai/cli/demos/rl_demo.py +254 -0
  99. synth_ai/cli/deploy.py +216 -0
  100. synth_ai/cli/infra/__init__.py +14 -0
  101. synth_ai/cli/{balance.py → infra/balance.py} +16 -4
  102. synth_ai/cli/infra/mcp.py +35 -0
  103. synth_ai/cli/infra/modal_app.py +36 -0
  104. synth_ai/cli/infra/setup.py +69 -0
  105. synth_ai/cli/infra/status.py +16 -0
  106. synth_ai/cli/infra/turso.py +77 -0
  107. synth_ai/cli/lib/__init__.py +10 -0
  108. synth_ai/cli/lib/agents.py +76 -0
  109. synth_ai/cli/lib/apps/modal_app.py +101 -0
  110. synth_ai/cli/lib/apps/task_app.py +642 -0
  111. synth_ai/cli/lib/bin.py +39 -0
  112. synth_ai/cli/lib/env.py +375 -0
  113. synth_ai/cli/lib/errors.py +85 -0
  114. synth_ai/cli/lib/modal.py +315 -0
  115. synth_ai/cli/lib/plotting.py +126 -0
  116. synth_ai/cli/lib/prompt_args.py +39 -0
  117. synth_ai/cli/lib/prompts.py +284 -0
  118. synth_ai/cli/lib/sqld.py +122 -0
  119. synth_ai/cli/lib/task_app_discovery.py +884 -0
  120. synth_ai/cli/lib/task_app_env.py +295 -0
  121. synth_ai/cli/lib/train_cfgs.py +300 -0
  122. synth_ai/cli/lib/tunnel_records.py +207 -0
  123. synth_ai/cli/local/__init__.py +14 -0
  124. synth_ai/cli/local/experiment_queue/__init__.py +72 -0
  125. synth_ai/cli/local/experiment_queue/api_schemas.py +221 -0
  126. synth_ai/cli/local/experiment_queue/celery_app.py +208 -0
  127. synth_ai/cli/local/experiment_queue/config.py +128 -0
  128. synth_ai/cli/local/experiment_queue/config_utils.py +272 -0
  129. synth_ai/cli/local/experiment_queue/database.py +175 -0
  130. synth_ai/cli/local/experiment_queue/dispatcher.py +119 -0
  131. synth_ai/cli/local/experiment_queue/models.py +231 -0
  132. synth_ai/cli/local/experiment_queue/progress_info.py +160 -0
  133. synth_ai/cli/local/experiment_queue/results.py +373 -0
  134. synth_ai/cli/local/experiment_queue/schemas.py +131 -0
  135. synth_ai/cli/local/experiment_queue/service.py +344 -0
  136. synth_ai/cli/local/experiment_queue/status.py +372 -0
  137. synth_ai/cli/local/experiment_queue/status_tracker.py +360 -0
  138. synth_ai/cli/local/experiment_queue/tasks.py +1984 -0
  139. synth_ai/cli/local/experiment_queue/trace_storage.py +65 -0
  140. synth_ai/cli/local/experiment_queue/validation.py +157 -0
  141. synth_ai/cli/local/session/__init__.py +92 -0
  142. synth_ai/cli/local/session/client.py +383 -0
  143. synth_ai/cli/local/session/constants.py +63 -0
  144. synth_ai/cli/local/session/exceptions.py +105 -0
  145. synth_ai/cli/local/session/manager.py +139 -0
  146. synth_ai/cli/local/session/models.py +89 -0
  147. synth_ai/cli/local/session/query.py +110 -0
  148. synth_ai/cli/root.py +150 -108
  149. synth_ai/cli/task_apps/__init__.py +37 -0
  150. synth_ai/cli/task_apps/commands.py +3145 -0
  151. synth_ai/cli/task_apps/deploy.py +7 -0
  152. synth_ai/cli/task_apps/list.py +26 -0
  153. synth_ai/cli/task_apps/main.py +36 -0
  154. synth_ai/cli/task_apps/modal_serve.py +11 -0
  155. synth_ai/cli/task_apps/serve.py +11 -0
  156. synth_ai/cli/training/__init__.py +8 -0
  157. synth_ai/cli/training/train.py +5 -0
  158. synth_ai/cli/training/train_cfg.py +34 -0
  159. synth_ai/cli/{watch.py → training/watch.py} +13 -18
  160. synth_ai/cli/turso.py +52 -0
  161. synth_ai/cli/utils/__init__.py +8 -0
  162. synth_ai/cli/utils/experiments.py +235 -0
  163. synth_ai/cli/utils/queue.py +504 -0
  164. synth_ai/cli/{recent.py → utils/recent.py} +13 -7
  165. synth_ai/cli/{traces.py → utils/traces.py} +9 -5
  166. synth_ai/contracts/__init__.py +67 -0
  167. synth_ai/core/__init__.py +100 -0
  168. synth_ai/core/_utils/__init__.py +54 -0
  169. synth_ai/core/_utils/base_url.py +10 -0
  170. synth_ai/core/_utils/http.py +10 -0
  171. synth_ai/core/_utils/prompts.py +14 -0
  172. synth_ai/core/_utils/task_app_state.py +12 -0
  173. synth_ai/core/_utils/user_config.py +10 -0
  174. synth_ai/core/apps/common.py +116 -0
  175. synth_ai/core/auth.py +95 -0
  176. synth_ai/core/cfgs.py +240 -0
  177. synth_ai/core/config/__init__.py +16 -0
  178. synth_ai/core/config/base.py +168 -0
  179. synth_ai/core/config/resolver.py +89 -0
  180. synth_ai/core/env.py +231 -0
  181. synth_ai/core/errors.py +126 -0
  182. synth_ai/core/http.py +230 -0
  183. synth_ai/core/integrations/__init__.py +11 -0
  184. synth_ai/core/integrations/cloudflare.py +1710 -0
  185. synth_ai/core/integrations/mcp/__init__.py +6 -0
  186. synth_ai/core/integrations/mcp/__main__.py +8 -0
  187. synth_ai/core/integrations/mcp/claude.py +36 -0
  188. synth_ai/core/integrations/mcp/main.py +254 -0
  189. synth_ai/core/integrations/mcp/setup.py +100 -0
  190. synth_ai/core/integrations/modal.py +277 -0
  191. synth_ai/core/json.py +72 -0
  192. synth_ai/core/log_filter.py +99 -0
  193. synth_ai/core/logging.py +82 -0
  194. synth_ai/core/paths.py +107 -0
  195. synth_ai/core/pricing.py +109 -0
  196. synth_ai/core/process.py +233 -0
  197. synth_ai/core/ssl.py +25 -0
  198. synth_ai/core/storage/__init__.py +71 -0
  199. synth_ai/core/task_app_state.py +318 -0
  200. synth_ai/core/telemetry.py +282 -0
  201. synth_ai/{tracing_v3 → core/tracing_v3}/__init__.py +5 -1
  202. synth_ai/{tracing_v3 → core/tracing_v3}/abstractions.py +21 -4
  203. synth_ai/core/tracing_v3/config.py +229 -0
  204. synth_ai/core/tracing_v3/constants.py +21 -0
  205. synth_ai/{tracing_v3 → core/tracing_v3}/db_config.py +42 -29
  206. synth_ai/{tracing_v3 → core/tracing_v3}/decorators.py +80 -45
  207. synth_ai/{tracing_v3 → core/tracing_v3}/examples/basic_usage.py +15 -9
  208. synth_ai/{tracing_v3 → core/tracing_v3}/hooks.py +6 -4
  209. synth_ai/{tracing_v3 → core/tracing_v3}/llm_call_record_helpers.py +161 -61
  210. synth_ai/{tracing_v3 → core/tracing_v3}/migration_helper.py +1 -2
  211. synth_ai/{tracing_v3 → core/tracing_v3}/replica_sync.py +12 -7
  212. synth_ai/core/tracing_v3/serialization.py +130 -0
  213. synth_ai/{tracing_v3 → core/tracing_v3}/session_tracer.py +88 -21
  214. synth_ai/{tracing_v3 → core/tracing_v3}/storage/base.py +99 -12
  215. synth_ai/core/tracing_v3/storage/config.py +109 -0
  216. synth_ai/{tracing_v3 → core/tracing_v3}/storage/factory.py +11 -9
  217. synth_ai/{tracing_v3 → core/tracing_v3}/storage/utils.py +15 -11
  218. synth_ai/core/tracing_v3/trace_utils.py +326 -0
  219. synth_ai/core/tracing_v3/turso/__init__.py +12 -0
  220. synth_ai/core/tracing_v3/turso/daemon.py +278 -0
  221. synth_ai/{tracing_v3 → core/tracing_v3}/turso/models.py +7 -3
  222. synth_ai/core/tracing_v3/turso/native_manager.py +1385 -0
  223. synth_ai/{tracing_v3 → core/tracing_v3}/utils.py +5 -4
  224. synth_ai/core/urls.py +18 -0
  225. synth_ai/core/user_config.py +137 -0
  226. synth_ai/core/uvicorn.py +222 -0
  227. synth_ai/data/__init__.py +83 -0
  228. synth_ai/data/enums.py +123 -0
  229. synth_ai/data/rewards.py +152 -0
  230. synth_ai/data/traces.py +35 -0
  231. synth_ai/products/__init__.py +6 -0
  232. synth_ai/products/graph_evolve/__init__.py +46 -0
  233. synth_ai/products/graph_evolve/client.py +226 -0
  234. synth_ai/products/graph_evolve/config.py +591 -0
  235. synth_ai/products/graph_evolve/converters/__init__.py +42 -0
  236. synth_ai/products/graph_evolve/converters/openai_sft.py +484 -0
  237. synth_ai/products/graph_evolve/examples/hotpotqa/config.toml +109 -0
  238. synth_ai/products/graph_evolve/run.py +222 -0
  239. synth_ai/products/graph_gepa/__init__.py +23 -0
  240. synth_ai/products/graph_gepa/converters/__init__.py +19 -0
  241. synth_ai/products/graph_gepa/converters/openai_sft.py +29 -0
  242. synth_ai/sdk/__init__.py +123 -0
  243. synth_ai/sdk/api/__init__.py +1 -0
  244. synth_ai/sdk/api/models/supported.py +514 -0
  245. synth_ai/sdk/api/research_agent/__init__.py +296 -0
  246. synth_ai/sdk/api/train/__init__.py +85 -0
  247. synth_ai/sdk/api/train/builders.py +895 -0
  248. synth_ai/sdk/api/train/cli.py +2199 -0
  249. synth_ai/sdk/api/train/config_finder.py +267 -0
  250. synth_ai/sdk/api/train/configs/__init__.py +65 -0
  251. synth_ai/sdk/api/train/configs/prompt_learning.py +1706 -0
  252. synth_ai/sdk/api/train/configs/rl.py +187 -0
  253. synth_ai/sdk/api/train/configs/sft.py +99 -0
  254. synth_ai/sdk/api/train/configs/shared.py +81 -0
  255. synth_ai/sdk/api/train/context_learning.py +312 -0
  256. synth_ai/sdk/api/train/env_resolver.py +418 -0
  257. synth_ai/sdk/api/train/graph_validators.py +216 -0
  258. synth_ai/sdk/api/train/graphgen.py +984 -0
  259. synth_ai/sdk/api/train/graphgen_models.py +823 -0
  260. synth_ai/sdk/api/train/graphgen_validators.py +109 -0
  261. synth_ai/sdk/api/train/local_api.py +10 -0
  262. synth_ai/sdk/api/train/pollers.py +124 -0
  263. synth_ai/sdk/api/train/progress/__init__.py +97 -0
  264. synth_ai/sdk/api/train/progress/dataclasses.py +569 -0
  265. synth_ai/sdk/api/train/progress/events.py +326 -0
  266. synth_ai/sdk/api/train/progress/results.py +428 -0
  267. synth_ai/sdk/api/train/progress/tracker.py +641 -0
  268. synth_ai/sdk/api/train/prompt_learning.py +469 -0
  269. synth_ai/sdk/api/train/rl.py +441 -0
  270. synth_ai/sdk/api/train/sft.py +396 -0
  271. synth_ai/sdk/api/train/summary.py +522 -0
  272. synth_ai/sdk/api/train/supported_algos.py +147 -0
  273. synth_ai/sdk/api/train/task_app.py +351 -0
  274. synth_ai/sdk/api/train/utils.py +279 -0
  275. synth_ai/sdk/api/train/validators.py +2424 -0
  276. synth_ai/sdk/graphs/__init__.py +15 -0
  277. synth_ai/sdk/graphs/completions.py +570 -0
  278. synth_ai/{inference → sdk/inference}/__init__.py +0 -1
  279. synth_ai/sdk/inference/client.py +128 -0
  280. synth_ai/sdk/jobs/__init__.py +16 -0
  281. synth_ai/sdk/jobs/client.py +371 -0
  282. synth_ai/sdk/judging/__init__.py +14 -0
  283. synth_ai/sdk/judging/base.py +24 -0
  284. synth_ai/sdk/judging/client.py +40 -0
  285. synth_ai/sdk/judging/schemas.py +222 -0
  286. synth_ai/sdk/judging/types.py +42 -0
  287. synth_ai/sdk/learning/__init__.py +99 -0
  288. synth_ai/sdk/learning/algorithms.py +14 -0
  289. synth_ai/{learning → sdk/learning}/client.py +121 -30
  290. synth_ai/sdk/learning/config.py +5 -0
  291. synth_ai/{learning → sdk/learning}/constants.py +0 -2
  292. synth_ai/sdk/learning/context_learning_client.py +531 -0
  293. synth_ai/sdk/learning/context_learning_types.py +292 -0
  294. synth_ai/sdk/learning/ft_client.py +7 -0
  295. synth_ai/{learning → sdk/learning}/health.py +15 -9
  296. synth_ai/{learning → sdk/learning}/jobs.py +44 -47
  297. synth_ai/sdk/learning/prompt_extraction.py +334 -0
  298. synth_ai/sdk/learning/prompt_learning_client.py +455 -0
  299. synth_ai/sdk/learning/prompt_learning_types.py +186 -0
  300. synth_ai/{rl → sdk/learning/rl}/__init__.py +13 -8
  301. synth_ai/{learning/rl_client.py → sdk/learning/rl/client.py} +89 -77
  302. synth_ai/sdk/learning/rl/config.py +31 -0
  303. synth_ai/{rl → sdk/learning/rl}/contracts.py +5 -14
  304. synth_ai/{rl → sdk/learning/rl}/env_keys.py +45 -16
  305. synth_ai/sdk/learning/rl/secrets.py +13 -0
  306. synth_ai/sdk/learning/rl_client.py +5 -0
  307. synth_ai/sdk/learning/sft/__init__.py +29 -0
  308. synth_ai/sdk/learning/sft/client.py +95 -0
  309. synth_ai/sdk/learning/sft/config.py +270 -0
  310. synth_ai/sdk/learning/sft/data.py +698 -0
  311. synth_ai/sdk/learning/sse.py +57 -0
  312. synth_ai/sdk/learning/validators.py +52 -0
  313. synth_ai/sdk/localapi/__init__.py +40 -0
  314. synth_ai/sdk/localapi/apps/__init__.py +28 -0
  315. synth_ai/sdk/localapi/client.py +10 -0
  316. synth_ai/sdk/localapi/contracts.py +10 -0
  317. synth_ai/sdk/localapi/helpers.py +519 -0
  318. synth_ai/sdk/localapi/rollouts.py +87 -0
  319. synth_ai/sdk/localapi/server.py +29 -0
  320. synth_ai/sdk/localapi/template.py +70 -0
  321. synth_ai/sdk/streaming/__init__.py +35 -0
  322. synth_ai/sdk/streaming/config.py +94 -0
  323. synth_ai/sdk/streaming/handlers.py +1997 -0
  324. synth_ai/sdk/streaming/streamer.py +713 -0
  325. synth_ai/sdk/streaming/types.py +112 -0
  326. synth_ai/sdk/task/__init__.py +164 -0
  327. synth_ai/sdk/task/apps/__init__.py +169 -0
  328. synth_ai/sdk/task/auth.py +165 -0
  329. synth_ai/sdk/task/client.py +175 -0
  330. synth_ai/sdk/task/config.py +257 -0
  331. synth_ai/sdk/task/contracts.py +219 -0
  332. synth_ai/sdk/task/datasets.py +108 -0
  333. synth_ai/sdk/task/errors.py +50 -0
  334. synth_ai/sdk/task/health.py +34 -0
  335. synth_ai/sdk/task/in_process.py +1190 -0
  336. synth_ai/sdk/task/in_process_runner.py +314 -0
  337. synth_ai/sdk/task/inference_api.py +299 -0
  338. synth_ai/sdk/task/json.py +111 -0
  339. synth_ai/sdk/task/proxy.py +287 -0
  340. synth_ai/sdk/task/rubrics/__init__.py +55 -0
  341. synth_ai/sdk/task/rubrics/loaders.py +156 -0
  342. synth_ai/sdk/task/rubrics/models.py +57 -0
  343. synth_ai/sdk/task/rubrics/scoring.py +116 -0
  344. synth_ai/sdk/task/rubrics/strict.py +149 -0
  345. synth_ai/sdk/task/rubrics.py +219 -0
  346. synth_ai/sdk/task/server.py +631 -0
  347. synth_ai/sdk/task/trace_correlation_helpers.py +539 -0
  348. synth_ai/sdk/task/tracing_utils.py +95 -0
  349. synth_ai/sdk/task/validators.py +441 -0
  350. synth_ai/sdk/task/vendors.py +59 -0
  351. synth_ai/sdk/training/__init__.py +102 -0
  352. synth_ai/sdk/tunnels/__init__.py +83 -0
  353. synth_ai/sdk/tunnels/cleanup.py +83 -0
  354. synth_ai/sdk/tunnels/ports.py +120 -0
  355. synth_ai/utils/__init__.py +213 -0
  356. synth_ai-0.4.3.dist-info/METADATA +262 -0
  357. synth_ai-0.4.3.dist-info/RECORD +370 -0
  358. {synth_ai-0.2.8.dev2.dist-info → synth_ai-0.4.3.dist-info}/entry_points.txt +0 -1
  359. synth_ai/cli/calc.py +0 -69
  360. synth_ai/cli/demo.py +0 -144
  361. synth_ai/cli/legacy_root_backup.py +0 -470
  362. synth_ai/cli/man.py +0 -106
  363. synth_ai/cli/rl_demo.py +0 -202
  364. synth_ai/cli/status.py +0 -133
  365. synth_ai/config/base_url.py +0 -107
  366. synth_ai/core/experiment.py +0 -15
  367. synth_ai/core/system.py +0 -15
  368. synth_ai/demos/core/__init__.py +0 -1
  369. synth_ai/demos/demo_task_apps/__init__.py +0 -1
  370. synth_ai/demos/demo_task_apps/math/config.toml +0 -129
  371. synth_ai/demos/demo_task_apps/math/deploy_task_app.sh +0 -22
  372. synth_ai/demos/demo_task_apps/math/modal_task_app.py +0 -415
  373. synth_ai/environments/__init__.py +0 -31
  374. synth_ai/environments/environment/__init__.py +0 -1
  375. synth_ai/environments/environment/artifacts/__init__.py +0 -1
  376. synth_ai/environments/environment/artifacts/base.py +0 -52
  377. synth_ai/environments/environment/core.py +0 -67
  378. synth_ai/environments/environment/db/__init__.py +0 -1
  379. synth_ai/environments/environment/db/sqlite.py +0 -45
  380. synth_ai/environments/environment/registry.py +0 -233
  381. synth_ai/environments/environment/resources/sqlite.py +0 -45
  382. synth_ai/environments/environment/results.py +0 -1
  383. synth_ai/environments/environment/rewards/__init__.py +0 -1
  384. synth_ai/environments/environment/rewards/core.py +0 -29
  385. synth_ai/environments/environment/shared_engine.py +0 -26
  386. synth_ai/environments/environment/tools/__init__.py +0 -200
  387. synth_ai/environments/examples/__init__.py +0 -1
  388. synth_ai/environments/examples/bandit/__init__.py +0 -33
  389. synth_ai/environments/examples/bandit/engine.py +0 -294
  390. synth_ai/environments/examples/bandit/environment.py +0 -194
  391. synth_ai/environments/examples/bandit/taskset.py +0 -200
  392. synth_ai/environments/examples/crafter_classic/__init__.py +0 -8
  393. synth_ai/environments/examples/crafter_classic/agent_demos/analyze_semantic_words_markdown.py +0 -250
  394. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_comprehensive_evaluation.py +0 -59
  395. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_browser.py +0 -152
  396. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_config.toml +0 -24
  397. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_evaluation_framework.py +0 -1194
  398. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/crafter_synth_config.toml +0 -56
  399. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_config_modal.toml +0 -32
  400. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/filter_traces_sft_turso.py +0 -738
  401. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/kick_off_ft_modal.py +0 -384
  402. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_action_results.py +0 -53
  403. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_agent_actions.py +0 -178
  404. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_latest_run.py +0 -222
  405. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_lm_traces.py +0 -183
  406. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_no_rewards.py +0 -210
  407. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/analyze_trace_issue.py +0 -206
  408. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_db_schema.py +0 -49
  409. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/check_latest_results.py +0 -64
  410. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/debug_agent_responses.py +0 -88
  411. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_modal_ft/old/quick_trace_check.py +0 -77
  412. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/compare_experiments.py +0 -324
  413. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/filter_traces_sft_turso.py +0 -580
  414. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/kick_off_ft_oai.py +0 -362
  415. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/multi_model_config.toml +0 -49
  416. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_enhanced_hooks.py +0 -332
  417. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_events.py +0 -97
  418. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/analyze_hook_results.py +0 -217
  419. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_hook_storage.py +0 -87
  420. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/check_seeds.py +0 -88
  421. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/compare_seed_performance.py +0 -195
  422. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/custom_eval_pipelines.py +0 -400
  423. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/plot_hook_frequency.py +0 -195
  424. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/old/seed_analysis_summary.py +0 -56
  425. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_openai_ft/run_rollouts_for_models_and_compare_v3.py +0 -858
  426. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_quick_evaluation.py +0 -52
  427. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_react_agent.py +0 -874
  428. synth_ai/environments/examples/crafter_classic/agent_demos/crafter_trace_evaluation.py +0 -1412
  429. synth_ai/environments/examples/crafter_classic/agent_demos/example_v3_usage.py +0 -216
  430. synth_ai/environments/examples/crafter_classic/agent_demos/old/compare_traces.py +0 -296
  431. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_comprehensive_evaluation.py +0 -58
  432. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_env_serialization.py +0 -464
  433. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_evaluation_browser.py +0 -152
  434. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_quick_evaluation.py +0 -51
  435. synth_ai/environments/examples/crafter_classic/agent_demos/old/crafter_trace_evaluation.py +0 -1412
  436. synth_ai/environments/examples/crafter_classic/agent_demos/old/debug_player_loss.py +0 -112
  437. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_service.py +0 -203
  438. synth_ai/environments/examples/crafter_classic/agent_demos/old/diagnose_slowness.py +0 -305
  439. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_by_difficulty.py +0 -126
  440. synth_ai/environments/examples/crafter_classic/agent_demos/old/eval_example.py +0 -94
  441. synth_ai/environments/examples/crafter_classic/agent_demos/old/explore_saved_states.py +0 -142
  442. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft.py +0 -26
  443. synth_ai/environments/examples/crafter_classic/agent_demos/old/filter_traces_sft_OLD.py +0 -984
  444. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_gemini.py +0 -724
  445. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_data_modal.py +0 -386
  446. synth_ai/environments/examples/crafter_classic/agent_demos/old/generate_ft_metadata.py +0 -205
  447. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_gemini.py +0 -150
  448. synth_ai/environments/examples/crafter_classic/agent_demos/old/kick_off_ft_modal.py +0 -283
  449. synth_ai/environments/examples/crafter_classic/agent_demos/old/prepare_vertex_ft.py +0 -280
  450. synth_ai/environments/examples/crafter_classic/agent_demos/old/profile_env_slowness.py +0 -456
  451. synth_ai/environments/examples/crafter_classic/agent_demos/old/replicate_issue.py +0 -166
  452. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_and_eval.py +0 -102
  453. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_comparison.py +0 -128
  454. synth_ai/environments/examples/crafter_classic/agent_demos/old/run_qwen_rollouts.py +0 -655
  455. synth_ai/environments/examples/crafter_classic/agent_demos/old/trace_eval_OLD.py +0 -202
  456. synth_ai/environments/examples/crafter_classic/agent_demos/old/validate_openai_format.py +0 -166
  457. synth_ai/environments/examples/crafter_classic/config_logging.py +0 -111
  458. synth_ai/environments/examples/crafter_classic/debug_translation.py +0 -0
  459. synth_ai/environments/examples/crafter_classic/engine.py +0 -579
  460. synth_ai/environments/examples/crafter_classic/engine_deterministic_patch.py +0 -64
  461. synth_ai/environments/examples/crafter_classic/engine_helpers/action_map.py +0 -6
  462. synth_ai/environments/examples/crafter_classic/engine_helpers/serialization.py +0 -75
  463. synth_ai/environments/examples/crafter_classic/engine_serialization_patch_v3.py +0 -267
  464. synth_ai/environments/examples/crafter_classic/environment.py +0 -404
  465. synth_ai/environments/examples/crafter_classic/taskset.py +0 -233
  466. synth_ai/environments/examples/crafter_classic/trace_hooks_v3.py +0 -228
  467. synth_ai/environments/examples/crafter_classic/world_config_patch_simple.py +0 -299
  468. synth_ai/environments/examples/crafter_custom/__init__.py +0 -4
  469. synth_ai/environments/examples/crafter_custom/agent_demos/__init__.py +0 -1
  470. synth_ai/environments/examples/crafter_custom/agent_demos/trace_eval.py +0 -202
  471. synth_ai/environments/examples/crafter_custom/crafter/__init__.py +0 -7
  472. synth_ai/environments/examples/crafter_custom/crafter/config.py +0 -182
  473. synth_ai/environments/examples/crafter_custom/crafter/constants.py +0 -8
  474. synth_ai/environments/examples/crafter_custom/crafter/engine.py +0 -269
  475. synth_ai/environments/examples/crafter_custom/crafter/env.py +0 -262
  476. synth_ai/environments/examples/crafter_custom/crafter/objects.py +0 -417
  477. synth_ai/environments/examples/crafter_custom/crafter/recorder.py +0 -187
  478. synth_ai/environments/examples/crafter_custom/crafter/worldgen.py +0 -118
  479. synth_ai/environments/examples/crafter_custom/dataset_builder.py +0 -373
  480. synth_ai/environments/examples/crafter_custom/environment.py +0 -312
  481. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_issue.py +0 -159
  482. synth_ai/environments/examples/crafter_custom/old/analyze_diamond_spawning.py +0 -158
  483. synth_ai/environments/examples/crafter_custom/old/compare_worlds.py +0 -71
  484. synth_ai/environments/examples/crafter_custom/old/dataset_stats.py +0 -105
  485. synth_ai/environments/examples/crafter_custom/old/diamond_spawning_summary.py +0 -119
  486. synth_ai/environments/examples/crafter_custom/old/example_dataset_usage.py +0 -52
  487. synth_ai/environments/examples/crafter_custom/run_dataset.py +0 -305
  488. synth_ai/environments/examples/enron/art_helpers/email_search_tools.py +0 -156
  489. synth_ai/environments/examples/enron/art_helpers/local_email_db.py +0 -281
  490. synth_ai/environments/examples/enron/art_helpers/types_enron.py +0 -25
  491. synth_ai/environments/examples/enron/engine.py +0 -295
  492. synth_ai/environments/examples/enron/environment.py +0 -166
  493. synth_ai/environments/examples/enron/taskset.py +0 -112
  494. synth_ai/environments/examples/enron/units/keyword_stats.py +0 -112
  495. synth_ai/environments/examples/minigrid/__init__.py +0 -48
  496. synth_ai/environments/examples/minigrid/agent_demos/minigrid_evaluation_framework.py +0 -1188
  497. synth_ai/environments/examples/minigrid/agent_demos/minigrid_quick_evaluation.py +0 -48
  498. synth_ai/environments/examples/minigrid/agent_demos/minigrid_react_agent.py +0 -562
  499. synth_ai/environments/examples/minigrid/agent_demos/minigrid_trace_evaluation.py +0 -221
  500. synth_ai/environments/examples/minigrid/engine.py +0 -589
  501. synth_ai/environments/examples/minigrid/environment.py +0 -274
  502. synth_ai/environments/examples/minigrid/environment_mapping.py +0 -242
  503. synth_ai/environments/examples/minigrid/puzzle_loader.py +0 -417
  504. synth_ai/environments/examples/minigrid/taskset.py +0 -583
  505. synth_ai/environments/examples/nethack/__init__.py +0 -7
  506. synth_ai/environments/examples/nethack/achievements.py +0 -337
  507. synth_ai/environments/examples/nethack/agent_demos/nethack_evaluation_framework.py +0 -981
  508. synth_ai/environments/examples/nethack/agent_demos/nethack_quick_evaluation.py +0 -74
  509. synth_ai/environments/examples/nethack/agent_demos/nethack_react_agent.py +0 -831
  510. synth_ai/environments/examples/nethack/engine.py +0 -739
  511. synth_ai/environments/examples/nethack/environment.py +0 -256
  512. synth_ai/environments/examples/nethack/helpers/__init__.py +0 -41
  513. synth_ai/environments/examples/nethack/helpers/action_mapping.py +0 -301
  514. synth_ai/environments/examples/nethack/helpers/nle_wrapper.py +0 -402
  515. synth_ai/environments/examples/nethack/helpers/observation_utils.py +0 -433
  516. synth_ai/environments/examples/nethack/helpers/recording_wrapper.py +0 -200
  517. synth_ai/environments/examples/nethack/helpers/trajectory_recorder.py +0 -269
  518. synth_ai/environments/examples/nethack/helpers/visualization/replay_viewer.py +0 -308
  519. synth_ai/environments/examples/nethack/helpers/visualization/visualizer.py +0 -431
  520. synth_ai/environments/examples/nethack/taskset.py +0 -323
  521. synth_ai/environments/examples/red/__init__.py +0 -7
  522. synth_ai/environments/examples/red/agent_demos/__init__.py +0 -1
  523. synth_ai/environments/examples/red/config_logging.py +0 -110
  524. synth_ai/environments/examples/red/engine.py +0 -694
  525. synth_ai/environments/examples/red/engine_helpers/__init__.py +0 -1
  526. synth_ai/environments/examples/red/engine_helpers/memory_map.py +0 -28
  527. synth_ai/environments/examples/red/engine_helpers/reward_components.py +0 -276
  528. synth_ai/environments/examples/red/engine_helpers/reward_library/__init__.py +0 -142
  529. synth_ai/environments/examples/red/engine_helpers/reward_library/adaptive_rewards.py +0 -57
  530. synth_ai/environments/examples/red/engine_helpers/reward_library/battle_rewards.py +0 -284
  531. synth_ai/environments/examples/red/engine_helpers/reward_library/composite_rewards.py +0 -150
  532. synth_ai/environments/examples/red/engine_helpers/reward_library/economy_rewards.py +0 -138
  533. synth_ai/environments/examples/red/engine_helpers/reward_library/efficiency_rewards.py +0 -57
  534. synth_ai/environments/examples/red/engine_helpers/reward_library/exploration_rewards.py +0 -331
  535. synth_ai/environments/examples/red/engine_helpers/reward_library/novelty_rewards.py +0 -121
  536. synth_ai/environments/examples/red/engine_helpers/reward_library/pallet_town_rewards.py +0 -559
  537. synth_ai/environments/examples/red/engine_helpers/reward_library/pokemon_rewards.py +0 -313
  538. synth_ai/environments/examples/red/engine_helpers/reward_library/social_rewards.py +0 -148
  539. synth_ai/environments/examples/red/engine_helpers/reward_library/story_rewards.py +0 -247
  540. synth_ai/environments/examples/red/engine_helpers/screen_analysis.py +0 -368
  541. synth_ai/environments/examples/red/engine_helpers/state_extraction.py +0 -140
  542. synth_ai/environments/examples/red/environment.py +0 -238
  543. synth_ai/environments/examples/red/taskset.py +0 -79
  544. synth_ai/environments/examples/red/units/__init__.py +0 -1
  545. synth_ai/environments/examples/sokoban/__init__.py +0 -1
  546. synth_ai/environments/examples/sokoban/agent_demos/sokoban_full_eval.py +0 -899
  547. synth_ai/environments/examples/sokoban/engine.py +0 -678
  548. synth_ai/environments/examples/sokoban/engine_helpers/__init__.py +0 -1
  549. synth_ai/environments/examples/sokoban/engine_helpers/room_utils.py +0 -657
  550. synth_ai/environments/examples/sokoban/engine_helpers/vendored/__init__.py +0 -18
  551. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/__init__.py +0 -3
  552. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/boxoban_env.py +0 -131
  553. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/render_utils.py +0 -370
  554. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/room_utils.py +0 -332
  555. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env.py +0 -306
  556. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_fixed_targets.py +0 -67
  557. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_pull.py +0 -115
  558. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_two_player.py +0 -123
  559. synth_ai/environments/examples/sokoban/engine_helpers/vendored/envs/sokoban_env_variations.py +0 -394
  560. synth_ai/environments/examples/sokoban/environment.py +0 -229
  561. synth_ai/environments/examples/sokoban/generate_verified_puzzles.py +0 -440
  562. synth_ai/environments/examples/sokoban/puzzle_loader.py +0 -312
  563. synth_ai/environments/examples/sokoban/taskset.py +0 -428
  564. synth_ai/environments/examples/sokoban/units/astar_common.py +0 -95
  565. synth_ai/environments/examples/tictactoe/__init__.py +0 -1
  566. synth_ai/environments/examples/tictactoe/engine.py +0 -368
  567. synth_ai/environments/examples/tictactoe/environment.py +0 -240
  568. synth_ai/environments/examples/tictactoe/taskset.py +0 -215
  569. synth_ai/environments/examples/verilog/__init__.py +0 -10
  570. synth_ai/environments/examples/verilog/engine.py +0 -329
  571. synth_ai/environments/examples/verilog/environment.py +0 -350
  572. synth_ai/environments/examples/verilog/taskset.py +0 -420
  573. synth_ai/environments/examples/wordle/__init__.py +0 -29
  574. synth_ai/environments/examples/wordle/engine.py +0 -398
  575. synth_ai/environments/examples/wordle/environment.py +0 -159
  576. synth_ai/environments/examples/wordle/helpers/generate_instances_wordfreq.py +0 -75
  577. synth_ai/environments/examples/wordle/taskset.py +0 -230
  578. synth_ai/environments/reproducibility/core.py +0 -42
  579. synth_ai/environments/reproducibility/helpers.py +0 -0
  580. synth_ai/environments/reproducibility/tree.py +0 -364
  581. synth_ai/environments/service/app.py +0 -98
  582. synth_ai/environments/service/core_routes.py +0 -1020
  583. synth_ai/environments/service/external_registry.py +0 -56
  584. synth_ai/environments/service/registry.py +0 -9
  585. synth_ai/environments/stateful/__init__.py +0 -1
  586. synth_ai/environments/stateful/core.py +0 -163
  587. synth_ai/environments/stateful/engine.py +0 -21
  588. synth_ai/environments/stateful/state.py +0 -7
  589. synth_ai/environments/tasks/api.py +0 -19
  590. synth_ai/environments/tasks/core.py +0 -80
  591. synth_ai/environments/tasks/filters.py +0 -41
  592. synth_ai/environments/tasks/utils.py +0 -91
  593. synth_ai/environments/v0_observability/history.py +0 -3
  594. synth_ai/environments/v0_observability/log.py +0 -2
  595. synth_ai/evals/base.py +0 -15
  596. synth_ai/experimental/synth_oss.py +0 -446
  597. synth_ai/handshake.py +0 -63
  598. synth_ai/http.py +0 -26
  599. synth_ai/http_client.py +0 -104
  600. synth_ai/inference/client.py +0 -20
  601. synth_ai/install_sqld.sh +0 -40
  602. synth_ai/jobs/client.py +0 -246
  603. synth_ai/learning/__init__.py +0 -24
  604. synth_ai/learning/config.py +0 -43
  605. synth_ai/learning/filtering.py +0 -0
  606. synth_ai/learning/ft_client.py +0 -59
  607. synth_ai/learning/offline/dpo.py +0 -0
  608. synth_ai/learning/offline/providers.py +0 -7
  609. synth_ai/learning/offline/sft.py +0 -0
  610. synth_ai/learning/offline/shared.py +0 -0
  611. synth_ai/learning/online/grpo.py +0 -0
  612. synth_ai/learning/online/irft.py +0 -0
  613. synth_ai/learning/prompts/banking77_injection_eval.py +0 -168
  614. synth_ai/learning/prompts/gepa.py +0 -0
  615. synth_ai/learning/prompts/hello_world_in_context_injection_ex.py +0 -213
  616. synth_ai/learning/prompts/mipro.py +0 -289
  617. synth_ai/learning/prompts/random_search.py +0 -246
  618. synth_ai/learning/prompts/run_mipro_banking77.py +0 -172
  619. synth_ai/learning/prompts/run_random_search_banking77.py +0 -324
  620. synth_ai/learning/sse.py +0 -58
  621. synth_ai/learning/validators.py +0 -48
  622. synth_ai/lm/__init__.py +0 -51
  623. synth_ai/lm/caching/constants.py +0 -6
  624. synth_ai/lm/caching/dbs.py +0 -0
  625. synth_ai/lm/caching/ephemeral.py +0 -102
  626. synth_ai/lm/caching/handler.py +0 -137
  627. synth_ai/lm/caching/initialize.py +0 -11
  628. synth_ai/lm/caching/persistent.py +0 -114
  629. synth_ai/lm/config.py +0 -110
  630. synth_ai/lm/constants.py +0 -32
  631. synth_ai/lm/core/__init__.py +0 -8
  632. synth_ai/lm/core/all.py +0 -73
  633. synth_ai/lm/core/exceptions.py +0 -7
  634. synth_ai/lm/core/main.py +0 -319
  635. synth_ai/lm/core/main_v3.py +0 -594
  636. synth_ai/lm/core/synth_models.py +0 -48
  637. synth_ai/lm/core/vendor_clients.py +0 -188
  638. synth_ai/lm/cost/__init__.py +0 -0
  639. synth_ai/lm/cost/monitor.py +0 -1
  640. synth_ai/lm/cost/statefulness.py +0 -1
  641. synth_ai/lm/injection.py +0 -80
  642. synth_ai/lm/overrides.py +0 -206
  643. synth_ai/lm/provider_support/__init__.py +0 -8
  644. synth_ai/lm/provider_support/anthropic.py +0 -972
  645. synth_ai/lm/provider_support/openai.py +0 -1139
  646. synth_ai/lm/provider_support/suppress_logging.py +0 -31
  647. synth_ai/lm/structured_outputs/__init__.py +0 -0
  648. synth_ai/lm/structured_outputs/handler.py +0 -440
  649. synth_ai/lm/structured_outputs/inject.py +0 -297
  650. synth_ai/lm/structured_outputs/rehabilitate.py +0 -185
  651. synth_ai/lm/tools/__init__.py +0 -3
  652. synth_ai/lm/tools/base.py +0 -172
  653. synth_ai/lm/unified_interface.py +0 -202
  654. synth_ai/lm/vendors/__init__.py +0 -0
  655. synth_ai/lm/vendors/base.py +0 -81
  656. synth_ai/lm/vendors/core/__init__.py +0 -0
  657. synth_ai/lm/vendors/core/anthropic_api.py +0 -387
  658. synth_ai/lm/vendors/core/gemini_api.py +0 -292
  659. synth_ai/lm/vendors/core/mistral_api.py +0 -322
  660. synth_ai/lm/vendors/core/openai_api.py +0 -225
  661. synth_ai/lm/vendors/core/synth_dev_api.py +0 -0
  662. synth_ai/lm/vendors/local/__init__.py +0 -0
  663. synth_ai/lm/vendors/local/ollama.py +0 -0
  664. synth_ai/lm/vendors/openai_standard.py +0 -780
  665. synth_ai/lm/vendors/openai_standard_responses.py +0 -256
  666. synth_ai/lm/vendors/retries.py +0 -22
  667. synth_ai/lm/vendors/supported/__init__.py +0 -0
  668. synth_ai/lm/vendors/supported/custom_endpoint.py +0 -417
  669. synth_ai/lm/vendors/supported/deepseek.py +0 -69
  670. synth_ai/lm/vendors/supported/grok.py +0 -75
  671. synth_ai/lm/vendors/supported/groq.py +0 -16
  672. synth_ai/lm/vendors/supported/ollama.py +0 -15
  673. synth_ai/lm/vendors/supported/openrouter.py +0 -74
  674. synth_ai/lm/vendors/supported/together.py +0 -11
  675. synth_ai/lm/vendors/synth_client.py +0 -808
  676. synth_ai/lm/warmup.py +0 -186
  677. synth_ai/rl/secrets.py +0 -19
  678. synth_ai/scripts/verify_rewards.py +0 -100
  679. synth_ai/task/__init__.py +0 -10
  680. synth_ai/task/contracts.py +0 -120
  681. synth_ai/task/health.py +0 -28
  682. synth_ai/task/validators.py +0 -12
  683. synth_ai/tracing/__init__.py +0 -30
  684. synth_ai/tracing_v1/__init__.py +0 -33
  685. synth_ai/tracing_v3/config.py +0 -84
  686. synth_ai/tracing_v3/storage/config.py +0 -62
  687. synth_ai/tracing_v3/turso/__init__.py +0 -25
  688. synth_ai/tracing_v3/turso/daemon.py +0 -144
  689. synth_ai/tracing_v3/turso/manager.py +0 -760
  690. synth_ai/v0/tracing/__init__.py +0 -0
  691. synth_ai/v0/tracing/abstractions.py +0 -224
  692. synth_ai/v0/tracing/base_client.py +0 -91
  693. synth_ai/v0/tracing/client_manager.py +0 -131
  694. synth_ai/v0/tracing/config.py +0 -142
  695. synth_ai/v0/tracing/context.py +0 -146
  696. synth_ai/v0/tracing/decorators.py +0 -682
  697. synth_ai/v0/tracing/events/__init__.py +0 -0
  698. synth_ai/v0/tracing/events/manage.py +0 -147
  699. synth_ai/v0/tracing/events/scope.py +0 -86
  700. synth_ai/v0/tracing/events/store.py +0 -228
  701. synth_ai/v0/tracing/immediate_client.py +0 -151
  702. synth_ai/v0/tracing/local.py +0 -18
  703. synth_ai/v0/tracing/log_client_base.py +0 -73
  704. synth_ai/v0/tracing/retry_queue.py +0 -186
  705. synth_ai/v0/tracing/trackers.py +0 -515
  706. synth_ai/v0/tracing/upload.py +0 -512
  707. synth_ai/v0/tracing/utils.py +0 -9
  708. synth_ai/v0/tracing_v1/__init__.py +0 -16
  709. synth_ai/v0/tracing_v1/abstractions.py +0 -224
  710. synth_ai/v0/tracing_v1/base_client.py +0 -91
  711. synth_ai/v0/tracing_v1/client_manager.py +0 -131
  712. synth_ai/v0/tracing_v1/config.py +0 -142
  713. synth_ai/v0/tracing_v1/context.py +0 -146
  714. synth_ai/v0/tracing_v1/decorators.py +0 -703
  715. synth_ai/v0/tracing_v1/events/__init__.py +0 -0
  716. synth_ai/v0/tracing_v1/events/manage.py +0 -147
  717. synth_ai/v0/tracing_v1/events/scope.py +0 -86
  718. synth_ai/v0/tracing_v1/events/store.py +0 -228
  719. synth_ai/v0/tracing_v1/immediate_client.py +0 -151
  720. synth_ai/v0/tracing_v1/local.py +0 -18
  721. synth_ai/v0/tracing_v1/log_client_base.py +0 -73
  722. synth_ai/v0/tracing_v1/retry_queue.py +0 -186
  723. synth_ai/v0/tracing_v1/trackers.py +0 -515
  724. synth_ai/v0/tracing_v1/upload.py +0 -527
  725. synth_ai/v0/tracing_v1/utils.py +0 -9
  726. synth_ai/zyk/__init__.py +0 -30
  727. synth_ai-0.2.8.dev2.dist-info/METADATA +0 -129
  728. synth_ai-0.2.8.dev2.dist-info/RECORD +0 -420
  729. /synth_ai/{demos → cli/demo_apps}/demo_task_apps/math/__init__.py +0 -0
  730. /synth_ai/{lm/caching → core/apps}/__init__.py +0 -0
  731. /synth_ai/{tracing_v3 → core/tracing_v3}/lm_call_record_abstractions.py +0 -0
  732. /synth_ai/{tracing_v3 → core/tracing_v3}/storage/__init__.py +0 -0
  733. /synth_ai/{tracing_v3 → core/tracing_v3}/storage/exceptions.py +0 -0
  734. /synth_ai/{tracing_v3 → core/tracing_v3}/storage/types.py +0 -0
  735. /synth_ai/{compound/cais.py → py.typed} +0 -0
  736. /synth_ai/{learning → sdk/learning}/core.py +0 -0
  737. /synth_ai/{learning → sdk/learning}/gateway.py +0 -0
  738. {synth_ai-0.2.8.dev2.dist-info → synth_ai-0.4.3.dist-info}/WHEEL +0 -0
  739. {synth_ai-0.2.8.dev2.dist-info → synth_ai-0.4.3.dist-info}/licenses/LICENSE +0 -0
  740. {synth_ai-0.2.8.dev2.dist-info → synth_ai-0.4.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,704 @@
1
+ """Eval runner for executing rollouts against task apps.
2
+
3
+ This module provides two execution modes:
4
+
5
+ 1. **Backend Mode (Default)**: Routes through backend interceptor for trace/usage capture
6
+ - Creates eval job via POST /api/eval/jobs
7
+ - Polls job status until completion
8
+ - Fetches detailed results with token costs and traces
9
+ - Requires backend_url and backend_api_key (or SYNTH_BASE_URL/SYNTH_API_KEY env vars)
10
+
11
+ 2. **Direct Mode**: Calls task apps directly (legacy, no usage tracking)
12
+ - Makes direct HTTP requests to task app /rollout endpoint
13
+ - No trace capture or usage tracking
14
+ - Simpler but limited functionality
15
+
16
+ Usage:
17
+ ```python
18
+ from synth_ai.cli.commands.eval.runner import run_eval
19
+ from synth_ai.cli.commands.eval.config import EvalRunConfig
20
+
21
+ config = EvalRunConfig(
22
+ app_id="banking77",
23
+ task_app_url="http://localhost:8103",
24
+ env_name="banking77",
25
+ seeds=[0, 1, 2],
26
+ policy_config={"model": "gpt-4"},
27
+ )
28
+
29
+ results = await run_eval(config)
30
+ ```
31
+
32
+ CLI Usage:
33
+ ```bash
34
+ # Direct mode (no backend)
35
+ python -m synth_ai.cli eval \
36
+ --config banking77_eval.toml \
37
+ --url http://localhost:8103
38
+
39
+ # Backend mode (with trace capture)
40
+ python -m synth_ai.cli eval \
41
+ --config banking77_eval.toml \
42
+ --url http://localhost:8103 \
43
+ --backend http://localhost:8000
44
+ ```
45
+
46
+ See Also:
47
+ - `synth_ai.cli.commands.eval.config`: Configuration loading
48
+ - `monorepo/backend/app/routes/eval/job_service.py`: Backend eval job service
49
+ """
50
+
51
+ from __future__ import annotations
52
+
53
+ import asyncio
54
+ import json
55
+ import os
56
+ import time
57
+ import uuid
58
+ from dataclasses import dataclass
59
+ from typing import Any
60
+
61
+ import httpx
62
+
63
+ from synth_ai.sdk.task.client import TaskAppClient
64
+ from synth_ai.sdk.task.contracts import (
65
+ RolloutEnvSpec,
66
+ RolloutPolicySpec,
67
+ RolloutRecordConfig,
68
+ RolloutRequest,
69
+ RolloutMode,
70
+ )
71
+
72
+ from .config import EvalRunConfig
73
+
74
+ # Default poll interval for backend job status
75
+ _POLL_INTERVAL_S = 2.0
76
+ _MAX_POLL_ATTEMPTS = 600 # 20 minutes max
77
+
78
+
79
+ @dataclass(slots=True)
80
+ class EvalResult:
81
+ seed: int
82
+ score: float | None
83
+ mean_return: float | None
84
+ outcome_score: float | None
85
+ events_score: float | None
86
+ latency_ms: float | None
87
+ verifier_score: float | None
88
+ tokens: int | None
89
+ cost_usd: float | None
90
+ error: str | None = None
91
+ trace: dict[str, Any] | None = None
92
+
93
+
94
+ def _count_tokens_from_trace(trace: dict[str, Any] | None) -> int:
95
+ """Extract total token count from trace.
96
+
97
+ Checks multiple locations:
98
+ 1. trace.usage.total_tokens (task app returns usage directly)
99
+ 2. trace.event_history[].usage (v3 trace format)
100
+ 3. trace.event_history[].response.usage (nested response)
101
+ """
102
+ if not trace:
103
+ return 0
104
+
105
+ # First check for direct usage in trace (task app format)
106
+ usage = trace.get("usage")
107
+ if isinstance(usage, dict):
108
+ total = usage.get("total_tokens", 0)
109
+ if total > 0:
110
+ return total
111
+
112
+ # Fall back to event_history (v3 trace format)
113
+ total = 0
114
+ event_history = trace.get("event_history") or []
115
+ for event in event_history:
116
+ if not isinstance(event, dict):
117
+ continue
118
+ # Check for usage in LM call events
119
+ evt_usage = event.get("usage") or {}
120
+ if isinstance(evt_usage, dict):
121
+ total += evt_usage.get("total_tokens", 0)
122
+ # Also check nested response usage
123
+ response = event.get("response") or {}
124
+ if isinstance(response, dict):
125
+ resp_usage = response.get("usage") or {}
126
+ if isinstance(resp_usage, dict):
127
+ total += resp_usage.get("total_tokens", 0)
128
+ return total
129
+
130
+
131
+ def _count_tokens_from_trajectories(trajectories: list[Any]) -> int:
132
+ """Extract token count from trajectory steps."""
133
+ total = 0
134
+ for traj in trajectories:
135
+ if not hasattr(traj, "steps"):
136
+ continue
137
+ for step in traj.steps:
138
+ if not hasattr(step, "info") or not isinstance(step.info, dict):
139
+ continue
140
+ # Check for tokens in step info
141
+ tokens = step.info.get("tokens")
142
+ if isinstance(tokens, int):
143
+ total += tokens
144
+ # Check nested usage
145
+ usage = step.info.get("usage") or {}
146
+ if isinstance(usage, dict):
147
+ total += usage.get("total_tokens", 0)
148
+ return total
149
+
150
+
151
+ def _build_run_id(config: EvalRunConfig, seed: int) -> str:
152
+ base = config.app_id or config.env_name or "eval"
153
+ suffix = uuid.uuid4().hex[:8]
154
+ return f"{base}-seed-{seed}-{suffix}"
155
+
156
+
157
def _build_rollout_request(config: EvalRunConfig, seed: int) -> RolloutRequest:
    """Assemble the ``RolloutRequest`` for a single seed.

    ``output_mode`` and ``structured_config`` are lifted out of a copy of
    ``config.policy_config`` and passed as top-level policy fields (only when
    they are not ``None``); all remaining keys stay in the policy ``config``
    mapping.  The dictionaries held by ``config`` are copied, not mutated.

    Args:
        config: Evaluation configuration.
        seed: Seed passed to the environment spec and embedded in the run id.

    Returns:
        A request that records trajectories (no logprobs, no values), with
        ``on_done="reset"`` and the mode defaulting to ``RolloutMode.EVAL``.
    """
    environment_settings = dict(config.env_config or {})
    policy_settings = dict(config.policy_config or {})

    policy_fields: dict[str, Any] = {
        "policy_name": config.policy_name,
        "config": policy_settings,
    }
    # These two keys are fields of the policy spec itself rather than part of
    # the free-form policy config.
    for field_name in ("output_mode", "structured_config"):
        lifted = policy_settings.pop(field_name, None)
        if lifted is not None:
            policy_fields[field_name] = lifted

    # Typed as Any so the configured value is accepted where the record
    # config expects a literal type.
    trace_format: Any = config.trace_format
    record_config = RolloutRecordConfig(
        trajectories=True,
        logprobs=False,
        value=False,
        return_trace=config.return_trace,
        trace_format=trace_format,
    )

    # SYNTH_API_BASE takes precedence over SYNTH_BASE_URL.
    synth_base_url = os.getenv("SYNTH_API_BASE") or os.getenv("SYNTH_BASE_URL")

    environment = RolloutEnvSpec(
        env_name=config.env_name,
        config=environment_settings,
        seed=seed,
    )
    return RolloutRequest(
        run_id=_build_run_id(config, seed),
        env=environment,
        policy=RolloutPolicySpec(**policy_fields),
        record=record_config,
        on_done="reset",
        training_session_id=None,
        synth_base_url=synth_base_url,
        mode=config.mode or RolloutMode.EVAL,
    )
195
+
196
+
197
async def _eval_seed(
    client: TaskAppClient,
    config: EvalRunConfig,
    seed: int,
    semaphore: asyncio.Semaphore,
) -> EvalResult:
    """Execute a single rollout for one seed (used in direct mode).

    Failures are reported through ``EvalResult.error`` instead of being
    raised, so the caller always receives one result per seed.

    Args:
        client: TaskAppClient instance for making HTTP requests.
        config: Evaluation configuration.
        seed: Seed/index to evaluate.
        semaphore: Semaphore for concurrency control; held for the whole
            rollout.

    Returns:
        EvalResult with score, metrics, tokens, cost, and optional trace.
        ``score`` is ``outcome_score`` when available, else ``mean_return``.
        ``tokens`` comes from ``metrics.details`` when present; otherwise it
        is counted from the trace and, if that yields nothing, from the
        response trajectories (``None`` when no count can be found).
        On failure every metric is ``None`` and ``error`` is a non-empty
        description of the exception.

    Note:
        This function is only used in direct mode. Backend mode uses the
        backend job service which handles rollouts internally.
    """
    async with semaphore:
        start = time.perf_counter()
        try:
            request = _build_rollout_request(config, seed)
            response = await client.rollout(request)
            latency_ms = (time.perf_counter() - start) * 1000.0

            metrics = response.metrics
            mean_return = metrics.mean_return
            outcome_score = metrics.outcome_score
            events_score = metrics.events_score
            score = outcome_score if outcome_score is not None else mean_return

            details = metrics.details if isinstance(metrics.details, dict) else {}
            verifier_score = details.get("verifier_score")
            tokens = details.get("tokens")
            cost_usd = details.get("cost_usd")

            # Only surface the trace when the caller asked for it.
            trace = response.trace if config.return_trace else None

            # Count tokens from trace or trajectories if not in metrics.
            if tokens is None:
                counted = _count_tokens_from_trace(trace) if trace else 0
                if not counted:
                    # A trace without usage data must not hide counts that
                    # the trajectories may still carry.
                    trajectories = getattr(response, "trajectories", None)
                    if trajectories:
                        counted = _count_tokens_from_trajectories(trajectories)
                # Zero means "unknown", which is reported as None.
                tokens = counted or None

            return EvalResult(
                seed=seed,
                score=score,
                mean_return=mean_return,
                outcome_score=outcome_score,
                events_score=events_score,
                latency_ms=latency_ms,
                verifier_score=verifier_score,
                tokens=tokens,
                cost_usd=cost_usd,
                error=None,
                trace=trace,
            )
        except Exception as exc:
            latency_ms = (time.perf_counter() - start) * 1000.0
            # Some exceptions (timeouts in particular) have an empty message;
            # fall back to the class name so a failure never yields a falsy
            # ``error`` that would read as success.
            message = str(exc) or type(exc).__name__
            return EvalResult(
                seed=seed,
                score=None,
                mean_return=None,
                outcome_score=None,
                events_score=None,
                latency_ms=latency_ms,
                verifier_score=None,
                tokens=None,
                cost_usd=None,
                error=message,
                trace=None,
            )
282
+
283
+
284
async def run_eval(config: EvalRunConfig) -> list[EvalResult]:
    """Run an evaluation, choosing the execution mode from the configuration.

    Mode selection:
    - **Backend mode** when both a backend URL and an API key can be resolved.
      The URL is taken from ``config.backend_url``, then the
      ``SYNTH_BASE_URL`` and ``BACKEND_OVERRIDE`` environment variables; the
      key from ``config.backend_api_key``, then ``SYNTH_API_KEY``.
    - **Direct mode** otherwise (the task app is called directly).

    Args:
        config: Evaluation configuration including task app URL, seeds,
            policy config, etc.

    Returns:
        List of EvalResult objects sorted by seed.

    Raises:
        ValueError: If required configuration is missing (``task_app_url``,
            ``seeds``).
        RuntimeError: If backend job creation, polling or result fetching
            fails.

    Example:
        ```python
        config = EvalRunConfig(
            app_id="banking77",
            task_app_url="http://localhost:8103",
            backend_url="http://localhost:8000",  # Enables backend mode
            backend_api_key="sk-...",
            env_name="banking77",
            seeds=[0, 1, 2],
            policy_config={"model": "gpt-4"},
        )
        results = await run_eval(config)
        ```

    See Also:
        - `run_eval_direct()`: Direct mode implementation
        - `run_eval_via_backend()`: Backend mode implementation
    """
    resolved_url = (
        config.backend_url
        or os.getenv("SYNTH_BASE_URL")
        or os.getenv("BACKEND_OVERRIDE")
    )
    resolved_key = config.backend_api_key or os.getenv("SYNTH_API_KEY")

    # Without both pieces the backend cannot be reached: go direct.
    if not (resolved_url and resolved_key):
        return await run_eval_direct(config)

    return await run_eval_via_backend(config, resolved_url, resolved_key)
329
+
330
+
331
async def run_eval_direct(config: EvalRunConfig) -> list[EvalResult]:
    """Direct mode: call the task app's rollout endpoint without the backend.

    One rollout is issued per seed through a shared ``TaskAppClient``; at
    most ``config.concurrency`` (minimum 1) rollouts run at the same time.
    Nothing is routed through the backend, so traces, token counts and costs
    are only available if the task app itself returns them.

    Args:
        config: Evaluation configuration. Must include ``task_app_url`` and
            ``seeds``. The task app API key falls back to the
            ``ENVIRONMENT_API_KEY`` environment variable.

    Returns:
        List of EvalResult objects, one per seed, sorted by seed.

    Raises:
        ValueError: If ``task_app_url`` or ``seeds`` are missing.

    Example:
        ```python
        config = EvalRunConfig(
            app_id="banking77",
            task_app_url="http://localhost:8103",
            env_name="banking77",
            seeds=[0, 1, 2],
            policy_config={"model": "gpt-4"},
        )
        results = await run_eval_direct(config)
        ```
    """
    if not config.task_app_url:
        raise ValueError("task_app_url is required for eval runs")
    if not config.seeds:
        raise ValueError("No seeds provided for evaluation")

    task_app_key = config.task_app_api_key or os.getenv("ENVIRONMENT_API_KEY")
    max_parallel = max(1, int(config.concurrency or 1))
    limiter = asyncio.Semaphore(max_parallel)

    async with TaskAppClient(base_url=config.task_app_url, api_key=task_app_key) as client:
        per_seed = await asyncio.gather(
            *(_eval_seed(client, config, seed, limiter) for seed in config.seeds)
        )

    return sorted(per_seed, key=lambda item: item.seed)
385
+
386
+
387
async def run_eval_via_backend(
    config: EvalRunConfig,
    backend_url: str,
    api_key: str,
) -> list[EvalResult]:
    """Backend mode: run the evaluation as an eval job on the backend.

    **Flow:**
    1. POST `/api/eval/jobs` - Create eval job
    2. Poll GET `/api/eval/jobs/{job_id}` - Check job status until it is
       ``completed`` or ``failed``
    3. GET `/api/eval/jobs/{job_id}/results` - Fetch detailed results

    ``/api`` is appended to ``backend_url`` unless it already ends with it.
    Polling happens every ``_POLL_INTERVAL_S`` seconds for at most
    ``_MAX_POLL_ATTEMPTS`` attempts.  A failed status check, whether a
    non-200 response or a transport-level error such as a timeout or a
    dropped connection, is logged and retried on the next attempt rather
    than aborting the wait.

    Progress messages are printed to stdout with an ``[eval]`` prefix.

    Args:
        config: Evaluation configuration including task app URL, seeds, policy config.
        backend_url: Backend API base URL (e.g., "http://localhost:8000")
        api_key: Backend API key for authentication (Bearer token)

    Returns:
        List of EvalResult objects built from the backend's result rows,
        sorted by seed.  ``trace`` is always ``None`` here; traces are
        fetched separately if needed.

    Raises:
        ValueError: If ``task_app_url`` or ``seeds`` are missing.
        RuntimeError: If job creation fails, the job reports ``failed``,
            polling times out, or result fetching fails.

    Example:
        ```python
        config = EvalRunConfig(
            app_id="banking77",
            task_app_url="http://localhost:8103",
            env_name="banking77",
            seeds=[0, 1, 2],
            policy_config={"model": "gpt-4"},
        )
        results = await run_eval_via_backend(
            config,
            backend_url="http://localhost:8000",
            api_key="sk-...",
        )
        ```

    See Also:
        - `monorepo/backend/app/routes/eval/job_service.py`: Backend job service implementation
        - `monorepo/backend/app/routes/eval/routes.py`: Backend API routes
    """
    if not config.task_app_url:
        raise ValueError("task_app_url is required for eval runs")
    if not config.seeds:
        raise ValueError("No seeds provided for evaluation")

    base = backend_url.rstrip("/")
    if not base.endswith("/api"):
        base = f"{base}/api"

    headers = {"Authorization": f"Bearer {api_key}"}

    # Build policy config for backend: the policy name travels inside it.
    policy = dict(config.policy_config or {})
    policy["policy_name"] = config.policy_name

    # Create eval job request
    job_request = {
        "task_app_url": config.task_app_url,
        "task_app_api_key": config.task_app_api_key or os.getenv("ENVIRONMENT_API_KEY"),
        "app_id": config.app_id,
        "env_name": config.env_name,
        "seeds": list(config.seeds),
        "policy": policy,
        "env_config": config.env_config,
        # Enum members are sent by value; anything else is stringified.
        "mode": config.mode.value if hasattr(config.mode, "value") else str(config.mode or "eval"),
        "max_concurrent": config.concurrency,
        "timeout": config.timeout,
    }

    async with httpx.AsyncClient(timeout=httpx.Timeout(30.0)) as client:
        # 1. Create the eval job
        print(f"[eval] Creating eval job via backend: {base}/eval/jobs", flush=True)
        resp = await client.post(f"{base}/eval/jobs", json=job_request, headers=headers)

        if resp.status_code not in (200, 201):
            raise RuntimeError(f"Failed to create eval job: {resp.status_code} {resp.text}")

        job_data = resp.json()
        job_id = job_data.get("job_id")
        if not job_id:
            raise RuntimeError(f"No job_id in response: {job_data}")

        print(f"[eval] Job created: {job_id}", flush=True)

        # 2. Poll for job completion
        for attempt in range(_MAX_POLL_ATTEMPTS):
            await asyncio.sleep(_POLL_INTERVAL_S)

            try:
                status_resp = await client.get(f"{base}/eval/jobs/{job_id}", headers=headers)
            except httpx.TransportError as exc:
                # A transient network problem is treated like a non-200
                # status check: warn and try again on the next attempt.
                print(f"[eval] Warning: status check failed: {exc!r}", flush=True)
                continue

            if status_resp.status_code != 200:
                print(f"[eval] Warning: status check failed: {status_resp.status_code}", flush=True)
                continue

            status_data = status_resp.json()
            status = status_data.get("status", "")

            if status in ("completed", "failed"):
                break

            if attempt % 10 == 0:
                print(f"[eval] Job {job_id} status: {status} (attempt {attempt})", flush=True)
        else:
            # The loop ran out of attempts without reaching a terminal status.
            raise RuntimeError(f"Eval job {job_id} timed out after {_MAX_POLL_ATTEMPTS * _POLL_INTERVAL_S}s")

        if status == "failed":
            error = status_data.get("error", "Unknown error")
            raise RuntimeError(f"Eval job {job_id} failed: {error}")

        # 3. Get detailed results
        results_resp = await client.get(f"{base}/eval/jobs/{job_id}/results", headers=headers)
        if results_resp.status_code != 200:
            raise RuntimeError(f"Failed to get results: {results_resp.status_code} {results_resp.text}")

        results_data = results_resp.json()
        result_rows = results_data.get("results", [])

        # Convert to EvalResult objects
        results: list[EvalResult] = [
            EvalResult(
                seed=int(row.get("seed", 0)),
                score=row.get("score"),
                mean_return=row.get("mean_return"),
                outcome_score=row.get("outcome_score"),
                events_score=row.get("events_score"),
                latency_ms=row.get("latency_ms"),
                verifier_score=row.get("verifier_score"),
                tokens=row.get("tokens"),
                cost_usd=row.get("cost_usd"),
                error=row.get("error"),
                trace=None,  # Traces fetched separately if needed
            )
            for row in result_rows
        ]
        results.sort(key=lambda item: item.seed)

        # Print summary from backend
        summary = results_data.get("summary", {})
        if summary:
            print(f"[eval] Backend summary: {summary}", flush=True)

        return results
544
+
545
+
546
async def fetch_traces_from_backend(
    job_id: str,
    backend_url: str,
    api_key: str,
    output_dir: str,
) -> str:
    """Fetch the traces archive of an eval job and unpack it on disk.

    A GET request is sent to ``<backend>/api/eval/jobs/<job_id>/traces`` with
    a bearer ``Authorization`` header; ``/api`` is appended to ``backend_url``
    only when the URL (ignoring trailing slashes) does not already end with it.
    The response body is treated as a zip archive and extracted into
    ``output_dir``, which is created (including parents) if it is missing.

    Returns:
        The extraction directory, as a string.

    Raises:
        RuntimeError: If the backend responds with any status other than 200.
    """
    import io
    import zipfile
    from pathlib import Path

    api_root = backend_url.rstrip("/")
    if not api_root.endswith("/api"):
        api_root = f"{api_root}/api"

    auth_headers = {"Authorization": f"Bearer {api_key}"}

    async with httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client:
        response = await client.get(
            f"{api_root}/eval/jobs/{job_id}/traces",
            headers=auth_headers,
        )

        if response.status_code != 200:
            raise RuntimeError(f"Failed to download traces: {response.status_code} {response.text}")

        # The whole archive is held in memory and unpacked from there.
        destination = Path(output_dir)
        destination.mkdir(parents=True, exist_ok=True)

        archive_bytes = io.BytesIO(response.content)
        with zipfile.ZipFile(archive_bytes) as archive:
            archive.extractall(destination)

        return str(destination)
580
+
581
+
582
def format_eval_table(results: list[EvalResult]) -> str:
    """Render eval results as a plain-text table followed by an ``avg`` row.

    One row is produced per result, in the order given, with the columns
    seed, score, mean_return, outcome, events, latency_ms, verifier, tokens,
    cost_usd and error. Missing (``None``) values are shown as ``-`` and
    floats are printed with at most four decimals, trailing zeros removed.

    The final ``avg`` row holds the mean of each numeric column, computed
    only over results whose value is an ``int``/``float`` (``int`` only for
    tokens, whose mean is truncated to an integer). A column with no numeric
    values shows ``-``.

    Error text is collapsed onto a single line (every run of whitespace,
    including newlines, becomes one space) so that a multi-line message such
    as a traceback cannot break a row across lines or distort column widths.

    Args:
        results: Objects exposing the ``EvalResult`` attributes read here
            (``seed``, ``score``, ``mean_return``, ``outcome_score``,
            ``events_score``, ``latency_ms``, ``verifier_score``, ``tokens``,
            ``cost_usd``, ``error``).

    Returns:
        The table as a newline-joined string without a trailing newline.
    """
    headers = [
        "seed",
        "score",
        "mean_return",
        "outcome",
        "events",
        "latency_ms",
        "verifier",
        "tokens",
        "cost_usd",
        "error",
    ]

    def _fmt(value: Any) -> str:
        if value is None:
            return "-"
        if isinstance(value, float):
            return f"{value:.4f}".rstrip("0").rstrip(".")
        return str(value)

    def _fmt_error(error: Any) -> str:
        # Keep each table row on exactly one physical line.
        if not error:
            return "-"
        return " ".join(str(error).split()) or "-"

    def _avg(values: list[float | int]) -> float | None:
        return sum(values) / len(values) if values else None

    def _numeric(attr: str, kinds: tuple = (int, float)) -> list:
        # Values of one attribute across all results, keeping numbers only.
        candidates = (getattr(r, attr) for r in results)
        return [v for v in candidates if isinstance(v, kinds)]

    rows = [
        [
            r.seed,
            _fmt(r.score),
            _fmt(r.mean_return),
            _fmt(r.outcome_score),
            _fmt(r.events_score),
            _fmt(r.latency_ms),
            _fmt(r.verifier_score),
            _fmt(r.tokens),
            _fmt(r.cost_usd),
            _fmt_error(r.error),
        ]
        for r in results
    ]

    tokens = _numeric("tokens", (int,))

    rows.append(
        [
            "avg",
            _fmt(_avg(_numeric("score"))),
            _fmt(_avg(_numeric("mean_return"))),
            _fmt(_avg(_numeric("outcome_score"))),
            _fmt(_avg(_numeric("events_score"))),
            _fmt(_avg(_numeric("latency_ms"))),
            _fmt(_avg(_numeric("verifier_score"))),
            _fmt(int(sum(tokens) / len(tokens)) if tokens else None),
            _fmt(_avg(_numeric("cost_usd"))),
            "-",
        ]
    )

    # Each column is as wide as its widest cell (header included).
    widths = [len(h) for h in headers]
    for row in rows:
        for idx, cell in enumerate(row):
            widths[idx] = max(widths[idx], len(str(cell)))

    def _render_row(row: list[Any]) -> str:
        return " | ".join(str(cell).ljust(widths[idx]) for idx, cell in enumerate(row))

    sep = "-+-".join("-" * width for width in widths)
    lines = [_render_row(headers), sep]
    lines.extend(_render_row(row) for row in rows)
    return "\n".join(lines)
658
+
659
+
660
def format_eval_report(config: EvalRunConfig, results: list[EvalResult]) -> str:
    """Build the full text report for an eval run.

    The report is an ``Eval config`` section holding selected ``config``
    fields as indented JSON (values that JSON cannot encode are passed
    through ``str``), followed by a ``Results`` section containing the
    table produced by ``format_eval_table``. The string ends with a newline.
    """
    reported_fields = (
        "app_id",
        "task_app_url",
        "env_name",
        "policy_name",
        "policy_config",
        "seeds",
        "concurrency",
    )
    config_json = json.dumps(
        {field: getattr(config, field) for field in reported_fields},
        indent=2,
        default=str,
    )
    results_table = format_eval_table(results)
    return "Eval config\n" + config_json + "\n\nResults\n" + results_table + "\n"
673
+
674
+
675
def save_traces(results: list[EvalResult], traces_dir: str) -> int:
    """Write every available trace to its own JSON file under ``traces_dir``.

    The directory (and any missing parents) is created first. Each result
    whose ``trace`` is not ``None`` is dumped as indented JSON to
    ``seed_<seed>_trace.json``; results without a trace are skipped.

    Returns:
        How many trace files were written.
    """
    from pathlib import Path

    out_dir = Path(traces_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    written = 0
    for item in results:
        if item.trace is None:
            continue
        serialized = json.dumps(item.trace, indent=2, default=str)
        (out_dir / f"seed_{item.seed}_trace.json").write_text(serialized)
        written += 1

    return written
693
+
694
+
695
# Explicit public API: the names exported by a star-import of this module.
__all__ = [
    "run_eval",
    "run_eval_direct",
    "run_eval_via_backend",
    "fetch_traces_from_backend",
    "format_eval_table",
    "format_eval_report",
    "save_traces",
    "EvalResult",
]