@elizaos/plugin-training 2.0.3-beta.5 → 2.0.3-beta.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (363) hide show
  1. package/dist/backends/native.d.ts +96 -0
  2. package/dist/backends/native.d.ts.map +1 -0
  3. package/dist/backends/native.js +308 -0
  4. package/dist/backends/native.js.map +1 -0
  5. package/dist/cli/train.d.ts +22 -0
  6. package/dist/cli/train.d.ts.map +1 -0
  7. package/dist/cli/train.js +219 -0
  8. package/dist/cli/train.js.map +1 -0
  9. package/dist/core/action-benchmark-runner.d.ts +55 -0
  10. package/dist/core/action-benchmark-runner.d.ts.map +1 -0
  11. package/dist/core/action-benchmark-runner.js +341 -0
  12. package/dist/core/action-benchmark-runner.js.map +1 -0
  13. package/dist/core/artifact-store.d.ts +72 -0
  14. package/dist/core/artifact-store.d.ts.map +1 -0
  15. package/dist/core/artifact-store.js +50 -0
  16. package/dist/core/artifact-store.js.map +1 -0
  17. package/dist/core/benchmark-matrix-artifact.d.ts +102 -0
  18. package/dist/core/benchmark-matrix-artifact.d.ts.map +1 -0
  19. package/dist/core/benchmark-matrix-artifact.js +381 -0
  20. package/dist/core/benchmark-matrix-artifact.js.map +1 -0
  21. package/dist/core/benchmark-vs-cerebras-runner.d.ts +37 -0
  22. package/dist/core/benchmark-vs-cerebras-runner.d.ts.map +1 -0
  23. package/dist/core/benchmark-vs-cerebras-runner.js +151 -0
  24. package/dist/core/benchmark-vs-cerebras-runner.js.map +1 -0
  25. package/dist/core/cerebras-eval-model.d.ts +54 -0
  26. package/dist/core/cerebras-eval-model.d.ts.map +1 -0
  27. package/dist/core/cerebras-eval-model.js +249 -0
  28. package/dist/core/cerebras-eval-model.js.map +1 -0
  29. package/dist/core/cli.d.ts +15 -0
  30. package/dist/core/cli.d.ts.map +1 -0
  31. package/dist/core/cli.js +1003 -0
  32. package/dist/core/cli.js.map +1 -0
  33. package/dist/core/context-audit.d.ts +51 -0
  34. package/dist/core/context-audit.d.ts.map +1 -0
  35. package/dist/core/context-audit.js +166 -0
  36. package/dist/core/context-audit.js.map +1 -0
  37. package/dist/core/context-catalog.d.ts +47 -0
  38. package/dist/core/context-catalog.d.ts.map +1 -0
  39. package/dist/core/context-catalog.js +269 -0
  40. package/dist/core/context-catalog.js.map +1 -0
  41. package/dist/core/context-types.d.ts +3 -0
  42. package/dist/core/context-types.d.ts.map +1 -0
  43. package/dist/core/context-types.js +18 -0
  44. package/dist/core/context-types.js.map +1 -0
  45. package/dist/core/dataset-generator.d.ts +135 -0
  46. package/dist/core/dataset-generator.d.ts.map +1 -0
  47. package/dist/core/dataset-generator.js +895 -0
  48. package/dist/core/dataset-generator.js.map +1 -0
  49. package/dist/core/eliza1-benchmark-recipe.d.ts +18 -0
  50. package/dist/core/eliza1-benchmark-recipe.d.ts.map +1 -0
  51. package/dist/core/eliza1-benchmark-recipe.js +64 -0
  52. package/dist/core/eliza1-benchmark-recipe.js.map +1 -0
  53. package/dist/core/eliza1-bundle-stager.d.ts +57 -0
  54. package/dist/core/eliza1-bundle-stager.d.ts.map +1 -0
  55. package/dist/core/eliza1-bundle-stager.js +149 -0
  56. package/dist/core/eliza1-bundle-stager.js.map +1 -0
  57. package/dist/core/ensure-cron-job.d.ts +53 -0
  58. package/dist/core/ensure-cron-job.d.ts.map +1 -0
  59. package/dist/core/ensure-cron-job.js +51 -0
  60. package/dist/core/ensure-cron-job.js.map +1 -0
  61. package/dist/core/eval-comparison-artifact.d.ts +72 -0
  62. package/dist/core/eval-comparison-artifact.d.ts.map +1 -0
  63. package/dist/core/eval-comparison-artifact.js +281 -0
  64. package/dist/core/eval-comparison-artifact.js.map +1 -0
  65. package/dist/core/feed-generation-runner.d.ts +37 -0
  66. package/dist/core/feed-generation-runner.d.ts.map +1 -0
  67. package/dist/core/feed-generation-runner.js +232 -0
  68. package/dist/core/feed-generation-runner.js.map +1 -0
  69. package/dist/core/html-escape.d.ts +5 -0
  70. package/dist/core/html-escape.d.ts.map +1 -0
  71. package/dist/core/html-escape.js +11 -0
  72. package/dist/core/html-escape.js.map +1 -0
  73. package/dist/core/huggingface-dataset-ingest.d.ts +52 -0
  74. package/dist/core/huggingface-dataset-ingest.d.ts.map +1 -0
  75. package/dist/core/huggingface-dataset-ingest.js +134 -0
  76. package/dist/core/huggingface-dataset-ingest.js.map +1 -0
  77. package/dist/core/index.d.ts +29 -0
  78. package/dist/core/index.d.ts.map +1 -0
  79. package/dist/core/index.js +204 -0
  80. package/dist/core/index.js.map +1 -0
  81. package/dist/core/privacy-filter.d.ts +95 -0
  82. package/dist/core/privacy-filter.d.ts.map +1 -0
  83. package/dist/core/privacy-filter.js +324 -0
  84. package/dist/core/privacy-filter.js.map +1 -0
  85. package/dist/core/promotion-gate.d.ts +117 -0
  86. package/dist/core/promotion-gate.d.ts.map +1 -0
  87. package/dist/core/promotion-gate.js +85 -0
  88. package/dist/core/promotion-gate.js.map +1 -0
  89. package/dist/core/promotion-persist.d.ts +116 -0
  90. package/dist/core/promotion-persist.d.ts.map +1 -0
  91. package/dist/core/promotion-persist.js +93 -0
  92. package/dist/core/promotion-persist.js.map +1 -0
  93. package/dist/core/prompt-compare.d.ts +99 -0
  94. package/dist/core/prompt-compare.d.ts.map +1 -0
  95. package/dist/core/prompt-compare.js +210 -0
  96. package/dist/core/prompt-compare.js.map +1 -0
  97. package/dist/core/replay-validator.d.ts +136 -0
  98. package/dist/core/replay-validator.d.ts.map +1 -0
  99. package/dist/core/replay-validator.js +312 -0
  100. package/dist/core/replay-validator.js.map +1 -0
  101. package/dist/core/roleplay-executor.d.ts +123 -0
  102. package/dist/core/roleplay-executor.d.ts.map +1 -0
  103. package/dist/core/roleplay-executor.js +675 -0
  104. package/dist/core/roleplay-executor.js.map +1 -0
  105. package/dist/core/roleplay-trajectories.d.ts +54 -0
  106. package/dist/core/roleplay-trajectories.d.ts.map +1 -0
  107. package/dist/core/roleplay-trajectories.js +88 -0
  108. package/dist/core/roleplay-trajectories.js.map +1 -0
  109. package/dist/core/scenario-blueprints.d.ts +62 -0
  110. package/dist/core/scenario-blueprints.d.ts.map +1 -0
  111. package/dist/core/scenario-blueprints.js +850 -0
  112. package/dist/core/scenario-blueprints.js.map +1 -0
  113. package/dist/core/scenario-runner.d.ts +36 -0
  114. package/dist/core/scenario-runner.d.ts.map +1 -0
  115. package/dist/core/scenario-runner.js +216 -0
  116. package/dist/core/scenario-runner.js.map +1 -0
  117. package/dist/core/skill-scoring-cron.d.ts +57 -0
  118. package/dist/core/skill-scoring-cron.d.ts.map +1 -0
  119. package/dist/core/skill-scoring-cron.js +180 -0
  120. package/dist/core/skill-scoring-cron.js.map +1 -0
  121. package/dist/core/test-trajectory-collector.d.ts +37 -0
  122. package/dist/core/test-trajectory-collector.d.ts.map +1 -0
  123. package/dist/core/test-trajectory-collector.js +225 -0
  124. package/dist/core/test-trajectory-collector.js.map +1 -0
  125. package/dist/core/track-c-queue-task.d.ts +37 -0
  126. package/dist/core/track-c-queue-task.d.ts.map +1 -0
  127. package/dist/core/track-c-queue-task.js +104 -0
  128. package/dist/core/track-c-queue-task.js.map +1 -0
  129. package/dist/core/training-analysis-index.d.ts +104 -0
  130. package/dist/core/training-analysis-index.d.ts.map +1 -0
  131. package/dist/core/training-analysis-index.js +3297 -0
  132. package/dist/core/training-analysis-index.js.map +1 -0
  133. package/dist/core/training-collection-runner.d.ts +508 -0
  134. package/dist/core/training-collection-runner.d.ts.map +1 -0
  135. package/dist/core/training-collection-runner.js +2299 -0
  136. package/dist/core/training-collection-runner.js.map +1 -0
  137. package/dist/core/training-config.d.ts +52 -0
  138. package/dist/core/training-config.d.ts.map +1 -0
  139. package/dist/core/training-config.js +117 -0
  140. package/dist/core/training-config.js.map +1 -0
  141. package/dist/core/training-orchestrator.d.ts +112 -0
  142. package/dist/core/training-orchestrator.d.ts.map +1 -0
  143. package/dist/core/training-orchestrator.js +729 -0
  144. package/dist/core/training-orchestrator.js.map +1 -0
  145. package/dist/core/training-readiness-report.d.ts +52 -0
  146. package/dist/core/training-readiness-report.d.ts.map +1 -0
  147. package/dist/core/training-readiness-report.js +765 -0
  148. package/dist/core/training-readiness-report.js.map +1 -0
  149. package/dist/core/trajectory-consumer.d.ts +15 -0
  150. package/dist/core/trajectory-consumer.d.ts.map +1 -0
  151. package/dist/core/trajectory-consumer.js +61 -0
  152. package/dist/core/trajectory-consumer.js.map +1 -0
  153. package/dist/core/trajectory-export-bundle.d.ts +95 -0
  154. package/dist/core/trajectory-export-bundle.d.ts.map +1 -0
  155. package/dist/core/trajectory-export-bundle.js +561 -0
  156. package/dist/core/trajectory-export-bundle.js.map +1 -0
  157. package/dist/core/trajectory-export-cron.d.ts +57 -0
  158. package/dist/core/trajectory-export-cron.d.ts.map +1 -0
  159. package/dist/core/trajectory-export-cron.js +170 -0
  160. package/dist/core/trajectory-export-cron.js.map +1 -0
  161. package/dist/core/trajectory-hf-upload.d.ts +50 -0
  162. package/dist/core/trajectory-hf-upload.d.ts.map +1 -0
  163. package/dist/core/trajectory-hf-upload.js +111 -0
  164. package/dist/core/trajectory-hf-upload.js.map +1 -0
  165. package/dist/core/trajectory-task-datasets.d.ts +62 -0
  166. package/dist/core/trajectory-task-datasets.d.ts.map +1 -0
  167. package/dist/core/trajectory-task-datasets.js +427 -0
  168. package/dist/core/trajectory-task-datasets.js.map +1 -0
  169. package/dist/core/wait-for-service.d.ts +25 -0
  170. package/dist/core/wait-for-service.d.ts.map +1 -0
  171. package/dist/core/wait-for-service.js +19 -0
  172. package/dist/core/wait-for-service.js.map +1 -0
  173. package/dist/core/workspace-runtime.d.ts +4 -0
  174. package/dist/core/workspace-runtime.d.ts.map +1 -0
  175. package/dist/core/workspace-runtime.js +25 -0
  176. package/dist/core/workspace-runtime.js.map +1 -0
  177. package/dist/dspy/artifact.d.ts +54 -0
  178. package/dist/dspy/artifact.d.ts.map +1 -0
  179. package/dist/dspy/artifact.js +61 -0
  180. package/dist/dspy/artifact.js.map +1 -0
  181. package/dist/dspy/chain-of-thought.d.ts +27 -0
  182. package/dist/dspy/chain-of-thought.d.ts.map +1 -0
  183. package/dist/dspy/chain-of-thought.js +43 -0
  184. package/dist/dspy/chain-of-thought.js.map +1 -0
  185. package/dist/dspy/examples.d.ts +72 -0
  186. package/dist/dspy/examples.d.ts.map +1 -0
  187. package/dist/dspy/examples.js +105 -0
  188. package/dist/dspy/examples.js.map +1 -0
  189. package/dist/dspy/index.d.ts +15 -0
  190. package/dist/dspy/index.d.ts.map +1 -0
  191. package/dist/dspy/index.js +40 -0
  192. package/dist/dspy/index.js.map +1 -0
  193. package/dist/dspy/lm-adapter.d.ts +100 -0
  194. package/dist/dspy/lm-adapter.d.ts.map +1 -0
  195. package/dist/dspy/lm-adapter.js +81 -0
  196. package/dist/dspy/lm-adapter.js.map +1 -0
  197. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.d.ts +23 -0
  198. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.d.ts.map +1 -0
  199. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.js +85 -0
  200. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.js.map +1 -0
  201. package/dist/dspy/optimizers/dspy-copro.d.ts +29 -0
  202. package/dist/dspy/optimizers/dspy-copro.d.ts.map +1 -0
  203. package/dist/dspy/optimizers/dspy-copro.js +141 -0
  204. package/dist/dspy/optimizers/dspy-copro.js.map +1 -0
  205. package/dist/dspy/optimizers/dspy-mipro.d.ts +37 -0
  206. package/dist/dspy/optimizers/dspy-mipro.d.ts.map +1 -0
  207. package/dist/dspy/optimizers/dspy-mipro.js +194 -0
  208. package/dist/dspy/optimizers/dspy-mipro.js.map +1 -0
  209. package/dist/dspy/optimizers/index.d.ts +5 -0
  210. package/dist/dspy/optimizers/index.d.ts.map +1 -0
  211. package/dist/dspy/optimizers/index.js +11 -0
  212. package/dist/dspy/optimizers/index.js.map +1 -0
  213. package/dist/dspy/optimizers/types.d.ts +39 -0
  214. package/dist/dspy/optimizers/types.d.ts.map +1 -0
  215. package/dist/dspy/optimizers/types.js +1 -0
  216. package/dist/dspy/optimizers/types.js.map +1 -0
  217. package/dist/dspy/predict.d.ts +49 -0
  218. package/dist/dspy/predict.d.ts.map +1 -0
  219. package/dist/dspy/predict.js +73 -0
  220. package/dist/dspy/predict.js.map +1 -0
  221. package/dist/dspy/signature.d.ts +88 -0
  222. package/dist/dspy/signature.d.ts.map +1 -0
  223. package/dist/dspy/signature.js +205 -0
  224. package/dist/dspy/signature.js.map +1 -0
  225. package/dist/index.d.ts +15 -0
  226. package/dist/index.d.ts.map +1 -0
  227. package/dist/index.js +15 -0
  228. package/dist/index.js.map +1 -0
  229. package/dist/optimizers/bootstrap-fewshot.d.ts +42 -0
  230. package/dist/optimizers/bootstrap-fewshot.d.ts.map +1 -0
  231. package/dist/optimizers/bootstrap-fewshot.js +92 -0
  232. package/dist/optimizers/bootstrap-fewshot.js.map +1 -0
  233. package/dist/optimizers/gepa.d.ts +63 -0
  234. package/dist/optimizers/gepa.d.ts.map +1 -0
  235. package/dist/optimizers/gepa.js +232 -0
  236. package/dist/optimizers/gepa.js.map +1 -0
  237. package/dist/optimizers/index.d.ts +7 -0
  238. package/dist/optimizers/index.d.ts.map +1 -0
  239. package/dist/optimizers/index.js +51 -0
  240. package/dist/optimizers/index.js.map +1 -0
  241. package/dist/optimizers/instruction-search.d.ts +39 -0
  242. package/dist/optimizers/instruction-search.d.ts.map +1 -0
  243. package/dist/optimizers/instruction-search.js +108 -0
  244. package/dist/optimizers/instruction-search.js.map +1 -0
  245. package/dist/optimizers/prompt-evolution.d.ts +39 -0
  246. package/dist/optimizers/prompt-evolution.d.ts.map +1 -0
  247. package/dist/optimizers/prompt-evolution.js +101 -0
  248. package/dist/optimizers/prompt-evolution.js.map +1 -0
  249. package/dist/optimizers/scoring.d.ts +139 -0
  250. package/dist/optimizers/scoring.d.ts.map +1 -0
  251. package/dist/optimizers/scoring.js +299 -0
  252. package/dist/optimizers/scoring.js.map +1 -0
  253. package/dist/optimizers/types.d.ts +105 -0
  254. package/dist/optimizers/types.d.ts.map +1 -0
  255. package/dist/optimizers/types.js +1 -0
  256. package/dist/optimizers/types.js.map +1 -0
  257. package/dist/register-runtime.d.ts +3 -0
  258. package/dist/register-runtime.d.ts.map +1 -0
  259. package/dist/register-runtime.js +60 -0
  260. package/dist/register-runtime.js.map +1 -0
  261. package/dist/register-terminal-view.d.ts +15 -0
  262. package/dist/register-terminal-view.d.ts.map +1 -0
  263. package/dist/register-terminal-view.js +31 -0
  264. package/dist/register-terminal-view.js.map +1 -0
  265. package/dist/routes/experience-routes.d.ts +21 -0
  266. package/dist/routes/experience-routes.d.ts.map +1 -0
  267. package/dist/routes/experience-routes.js +513 -0
  268. package/dist/routes/experience-routes.js.map +1 -0
  269. package/dist/routes/index.d.ts +5 -0
  270. package/dist/routes/index.d.ts.map +1 -0
  271. package/dist/routes/index.js +17 -0
  272. package/dist/routes/index.js.map +1 -0
  273. package/dist/routes/training-routes.d.ts +10 -0
  274. package/dist/routes/training-routes.d.ts.map +1 -0
  275. package/dist/routes/training-routes.js +1239 -0
  276. package/dist/routes/training-routes.js.map +1 -0
  277. package/dist/routes/training-vast-routes.d.ts +35 -0
  278. package/dist/routes/training-vast-routes.d.ts.map +1 -0
  279. package/dist/routes/training-vast-routes.js +249 -0
  280. package/dist/routes/training-vast-routes.js.map +1 -0
  281. package/dist/routes/trajectory-routes.d.ts +19 -0
  282. package/dist/routes/trajectory-routes.d.ts.map +1 -0
  283. package/dist/routes/trajectory-routes.js +1122 -0
  284. package/dist/routes/trajectory-routes.js.map +1 -0
  285. package/dist/services/index.d.ts +9 -0
  286. package/dist/services/index.d.ts.map +1 -0
  287. package/dist/services/index.js +63 -0
  288. package/dist/services/index.js.map +1 -0
  289. package/dist/services/training-backend-check.d.ts +8 -0
  290. package/dist/services/training-backend-check.d.ts.map +1 -0
  291. package/dist/services/training-backend-check.js +31 -0
  292. package/dist/services/training-backend-check.js.map +1 -0
  293. package/dist/services/training-service-like.d.ts +40 -0
  294. package/dist/services/training-service-like.d.ts.map +1 -0
  295. package/dist/services/training-service-like.js +1 -0
  296. package/dist/services/training-service-like.js.map +1 -0
  297. package/dist/services/training-service-registry.d.ts +4 -0
  298. package/dist/services/training-service-registry.d.ts.map +1 -0
  299. package/dist/services/training-service-registry.js +12 -0
  300. package/dist/services/training-service-registry.js.map +1 -0
  301. package/dist/services/training-service.d.ts +59 -0
  302. package/dist/services/training-service.d.ts.map +1 -0
  303. package/dist/services/training-service.js +154 -0
  304. package/dist/services/training-service.js.map +1 -0
  305. package/dist/services/training-trigger.d.ts +177 -0
  306. package/dist/services/training-trigger.d.ts.map +1 -0
  307. package/dist/services/training-trigger.js +300 -0
  308. package/dist/services/training-trigger.js.map +1 -0
  309. package/dist/services/training-vast-service.d.ts +149 -0
  310. package/dist/services/training-vast-service.d.ts.map +1 -0
  311. package/dist/services/training-vast-service.js +648 -0
  312. package/dist/services/training-vast-service.js.map +1 -0
  313. package/dist/services/vast-inference-stats.d.ts +37 -0
  314. package/dist/services/vast-inference-stats.d.ts.map +1 -0
  315. package/dist/services/vast-inference-stats.js +81 -0
  316. package/dist/services/vast-inference-stats.js.map +1 -0
  317. package/dist/services/vast-job-store.d.ts +74 -0
  318. package/dist/services/vast-job-store.d.ts.map +1 -0
  319. package/dist/services/vast-job-store.js +194 -0
  320. package/dist/services/vast-job-store.js.map +1 -0
  321. package/dist/services/vast-subprocess.d.ts +27 -0
  322. package/dist/services/vast-subprocess.d.ts.map +1 -0
  323. package/dist/services/vast-subprocess.js +78 -0
  324. package/dist/services/vast-subprocess.js.map +1 -0
  325. package/dist/setup-routes.d.ts +17 -0
  326. package/dist/setup-routes.d.ts.map +1 -0
  327. package/dist/setup-routes.js +319 -0
  328. package/dist/setup-routes.js.map +1 -0
  329. package/dist/ui/FineTuningSpatialView.d.ts +49 -0
  330. package/dist/ui/FineTuningSpatialView.d.ts.map +1 -0
  331. package/dist/ui/FineTuningSpatialView.js +154 -0
  332. package/dist/ui/FineTuningSpatialView.js.map +1 -0
  333. package/dist/ui/FineTuningView.d.ts +7 -0
  334. package/dist/ui/FineTuningView.d.ts.map +1 -0
  335. package/dist/ui/FineTuningView.helpers.d.ts +17 -0
  336. package/dist/ui/FineTuningView.helpers.d.ts.map +1 -0
  337. package/dist/ui/FineTuningView.helpers.js +30 -0
  338. package/dist/ui/FineTuningView.helpers.js.map +1 -0
  339. package/dist/ui/FineTuningView.interact.d.ts +2 -0
  340. package/dist/ui/FineTuningView.interact.d.ts.map +1 -0
  341. package/dist/ui/FineTuningView.interact.js +300 -0
  342. package/dist/ui/FineTuningView.interact.js.map +1 -0
  343. package/dist/ui/FineTuningView.js +4653 -0
  344. package/dist/ui/FineTuningView.js.map +1 -0
  345. package/dist/ui/fine-tuning-panels.d.ts +100 -0
  346. package/dist/ui/fine-tuning-panels.d.ts.map +1 -0
  347. package/dist/ui/fine-tuning-panels.helpers.d.ts +19 -0
  348. package/dist/ui/fine-tuning-panels.helpers.d.ts.map +1 -0
  349. package/dist/ui/fine-tuning-panels.helpers.js +77 -0
  350. package/dist/ui/fine-tuning-panels.helpers.js.map +1 -0
  351. package/dist/ui/fine-tuning-panels.js +928 -0
  352. package/dist/ui/fine-tuning-panels.js.map +1 -0
  353. package/dist/ui/index.d.ts +5 -0
  354. package/dist/ui/index.d.ts.map +1 -0
  355. package/dist/ui/index.js +5 -0
  356. package/dist/ui/index.js.map +1 -0
  357. package/dist/ui/training-view-bundle.d.ts +3 -0
  358. package/dist/ui/training-view-bundle.d.ts.map +1 -0
  359. package/dist/ui/training-view-bundle.js +7 -0
  360. package/dist/ui/training-view-bundle.js.map +1 -0
  361. package/dist/views/bundle.js +5312 -0
  362. package/dist/views/bundle.js.map +1 -0
  363. package/package.json +7 -7
@@ -0,0 +1,381 @@
1
+ import { mkdir, writeFile } from "node:fs/promises";
2
+ import { readFile } from "node:fs/promises";
3
+ import { join } from "node:path";
4
+ import {
5
+ canonicalElizaOneTierSort,
6
+ normalizeElizaOneBenchmarkTier
7
+ } from "./eliza1-benchmark-recipe.js";
8
+ import { EVAL_COMPARISON_ARTIFACT_SCHEMA } from "./eval-comparison-artifact.js";
9
+ import { trainingStateRoot } from "./training-config.js";
10
/** Schema id stamped on matrix artifacts and recognised when re-ingesting them. */
const BENCHMARK_MATRIX_ARTIFACT_SCHEMA = "eliza_benchmark_matrix_artifact";

/** Version number written alongside the matrix artifact schema id. */
const BENCHMARK_MATRIX_ARTIFACT_VERSION = 1;

/** Schema id that marks a payload as an action-selection benchmark report. */
const ACTION_BENCHMARK_REPORT_SCHEMA = "eliza_action_selection_benchmark_report";

/** Benchmark id used for action benchmark rows when nothing else names one. */
const ACTION_SELECTION_BENCHMARK_ID = "eliza_harness_action_selection";

/** Benchmark id used for eval-comparison rows when the source names none. */
const LOCAL_EVAL_COMPARISON_BENCHMARK_ID = "eliza_harness_local_eval_comparison";

/** Tier labels, listed from smallest to largest. */
const ELIZA_ONE_MATRIX_TIERS = ["0b", "2b", "4b", "9b", "27b"];
22
+ function asRecord(value) {
23
+ return value && typeof value === "object" && !Array.isArray(value) ? value : null;
24
+ }
25
+ function asString(value) {
26
+ return typeof value === "string" && value.trim().length > 0 ? value.trim() : null;
27
+ }
28
+ function asNumber(value) {
29
+ return typeof value === "number" && Number.isFinite(value) ? value : null;
30
+ }
31
+ function finiteScore(value) {
32
+ if (!Number.isFinite(value))
33
+ throw new Error(`score must be finite; got ${value}`);
34
+ return value;
35
+ }
36
+ function roundMetric(value) {
37
+ return value === null ? null : Number(value.toFixed(6));
38
+ }
39
+ function percentDelta(base, next) {
40
+ if (base === null || next === null || base === 0) return null;
41
+ return (next - base) / Math.abs(base) * 100;
42
+ }
43
+ function isDryRunRow(row) {
44
+ if (!row) return false;
45
+ const rawSource = asRecord(row.raw.source);
46
+ return row.metrics.dryRun === true || row.raw.dryRun === true || rawSource?.dryRun === true;
47
+ }
48
+ function inferTier(modelId, explicit) {
49
+ const tier = asString(explicit);
50
+ if (tier) return normalizeElizaOneBenchmarkTier(tier) ?? tier;
51
+ const normalized = modelId.toLowerCase();
52
+ if (normalized.includes("27b")) return "27b";
53
+ if (normalized.includes("9b")) return "9b";
54
+ if (normalized.includes("4b")) return "4b";
55
+ if (normalized.includes("2b")) return "2b";
56
+ if (normalized.includes("0b")) return "0b";
57
+ return null;
58
+ }
59
+ function normalizeRow(row) {
60
+ const explicitReferenceTier = asString(row.tier);
61
+ return {
62
+ modelId: row.modelId,
63
+ benchmark: row.benchmark,
64
+ score: finiteScore(row.score),
65
+ variant: row.variant,
66
+ tier: row.variant === "reference" && !explicitReferenceTier ? null : inferTier(row.modelId, row.tier),
67
+ provider: asString(row.provider),
68
+ datasetVersion: asString(row.datasetVersion),
69
+ codeCommit: asString(row.codeCommit),
70
+ ts: row.ts ?? null,
71
+ metrics: row.metrics ?? {},
72
+ raw: row.raw ?? {}
73
+ };
74
+ }
75
+ function selectReferenceModelId(rows, explicit) {
76
+ if (explicit) return explicit;
77
+ return rows.find((row) => row.variant === "reference")?.modelId ?? rows.find((row) => row.provider === "cerebras")?.modelId ?? null;
78
+ }
79
+ function scoreFor(rows, benchmark, variant, tier) {
80
+ if (variant === "reference") {
81
+ return rows.find(
82
+ (row) => row.benchmark === benchmark && row.variant === "reference" && row.tier === tier
83
+ ) ?? rows.find(
84
+ (row) => row.benchmark === benchmark && row.variant === "reference" && row.tier === null
85
+ ) ?? null;
86
+ }
87
+ return rows.find(
88
+ (row) => row.benchmark === benchmark && row.variant === variant && row.tier === tier
89
+ ) ?? null;
90
+ }
91
+ function buildComparisons(rows, referenceModelId) {
92
+ const tiers = Array.from(
93
+ new Set(
94
+ rows.map((row) => row.tier).filter((tier) => tier !== null)
95
+ )
96
+ ).sort(canonicalElizaOneTierSort);
97
+ const benchmarks = Array.from(
98
+ new Set(rows.map((row) => row.benchmark))
99
+ ).sort();
100
+ const comparisons = [];
101
+ for (const tier of tiers) {
102
+ for (const benchmark of benchmarks) {
103
+ const base = scoreFor(rows, benchmark, "base", tier);
104
+ const trained = scoreFor(rows, benchmark, "trained", tier);
105
+ const reference = scoreFor(rows, benchmark, "reference", tier);
106
+ if (!base && !trained && !reference) continue;
107
+ const dryRun = isDryRunRow(base) || isDryRunRow(trained) || isDryRunRow(reference);
108
+ comparisons.push({
109
+ tier,
110
+ benchmark,
111
+ baseModelId: base?.modelId ?? null,
112
+ trainedModelId: trained?.modelId ?? null,
113
+ referenceModelId: reference?.modelId ?? referenceModelId,
114
+ baseScore: base?.score ?? null,
115
+ trainedScore: trained?.score ?? null,
116
+ referenceScore: reference?.score ?? null,
117
+ improvementAbsolute: roundMetric(
118
+ base && trained ? trained.score - base.score : null
119
+ ),
120
+ improvementPercent: roundMetric(
121
+ percentDelta(base?.score ?? null, trained?.score ?? null)
122
+ ),
123
+ trainedVsReferenceAbsolute: roundMetric(
124
+ trained && reference ? trained.score - reference.score : null
125
+ ),
126
+ trainedVsReferencePercent: roundMetric(
127
+ percentDelta(reference?.score ?? null, trained?.score ?? null)
128
+ ),
129
+ dryRun
130
+ });
131
+ }
132
+ }
133
+ return comparisons;
134
+ }
135
+ function safeTimestamp(value) {
136
+ return value.replace(/[:.]/g, "-");
137
+ }
138
+ function rowFromActionBenchmarkArtifact(payload, source) {
139
+ const reportSource = asRecord(payload.source) ?? {};
140
+ const embeddedVariant = reportSource.variant;
141
+ const modelId = source.modelId ?? asString(reportSource.modelId) ?? void 0;
142
+ const variant = source.variant ?? (embeddedVariant === "reference" || embeddedVariant === "base" || embeddedVariant === "trained" ? embeddedVariant : void 0);
143
+ if (!modelId || !variant) {
144
+ throw new Error(
145
+ `Action benchmark artifact ${source.path} requires modelId and variant`
146
+ );
147
+ }
148
+ const summary = asRecord(payload.summary) ?? {};
149
+ const dryRun = payload.dryRun === true || reportSource.dryRun === true;
150
+ const useMocks = source.useMocks === true || reportSource.useMocks === true || payload.useMocks === true;
151
+ const score = asNumber(summary.accuracy) ?? (dryRun ? 0 : null);
152
+ if (score === null) {
153
+ throw new Error(
154
+ `Action benchmark artifact ${source.path} missing accuracy`
155
+ );
156
+ }
157
+ const caseSamples = Array.isArray(payload.results) ? payload.results.map(asRecord).filter((result) => result !== null).slice(0, 8).map((result) => ({
158
+ caseId: asString(result.caseId),
159
+ prompt: asString(result.prompt) ?? asString(result.input) ?? asString(result.userPrompt),
160
+ expectedAction: asString(result.expectedAction),
161
+ actualAction: asString(result.actualAction),
162
+ pass: result.pass === true,
163
+ response: asString(result.response) ?? asString(result.output) ?? asString(result.finalResponse) ?? asString(result.failureReason),
164
+ latencyMs: asNumber(result.latencyMs),
165
+ trajectoryPath: asString(result.trajectoryPath)
166
+ })) : [];
167
+ return [
168
+ {
169
+ modelId,
170
+ variant,
171
+ benchmark: source.benchmark ?? asString(reportSource.benchmark) ?? ACTION_SELECTION_BENCHMARK_ID,
172
+ score,
173
+ tier: source.tier ?? asString(reportSource.tier) ?? void 0,
174
+ provider: source.provider ?? asString(reportSource.provider) ?? void 0,
175
+ datasetVersion: source.datasetVersion ?? asString(reportSource.datasetVersion) ?? void 0,
176
+ codeCommit: source.codeCommit ?? asString(reportSource.codeCommit) ?? void 0,
177
+ ts: asString(payload.generatedAt) ?? void 0,
178
+ metrics: {
179
+ plannerAccuracy: summary.plannerAccuracy,
180
+ executionAccuracy: summary.executionAccuracy,
181
+ total: summary.total,
182
+ passed: summary.passed,
183
+ failed: summary.failed,
184
+ latency: summary.latency,
185
+ failureModes: payload.failureModes,
186
+ dryRun,
187
+ useMocks
188
+ },
189
+ raw: {
190
+ artifactPath: source.path,
191
+ schema: payload.schema,
192
+ source: payload.source,
193
+ caseSamples,
194
+ dryRun,
195
+ useMocks
196
+ }
197
+ }
198
+ ];
199
+ }
200
+ function rowsFromEvalComparisonArtifact(payload, source) {
201
+ const models = asRecord(payload.models) ?? {};
202
+ const metrics = asRecord(payload.metrics) ?? {};
203
+ const benchmark = source.benchmark ?? LOCAL_EVAL_COMPARISON_BENCHMARK_ID;
204
+ const baseModelId = source.variant === "base" ? source.modelId : asString(models.base);
205
+ const trainedModelId = source.variant === "trained" ? source.modelId : asString(models.trained);
206
+ const rows = [];
207
+ const baseScore = asNumber(metrics.baseScore);
208
+ if (baseModelId && baseScore !== null) {
209
+ rows.push({
210
+ modelId: baseModelId,
211
+ variant: "base",
212
+ benchmark,
213
+ score: baseScore,
214
+ tier: source.tier,
215
+ provider: source.provider,
216
+ datasetVersion: source.datasetVersion,
217
+ codeCommit: source.codeCommit,
218
+ ts: asString(payload.generatedAt) ?? void 0,
219
+ metrics: {
220
+ latencyMs: metrics.baseLatencyMs,
221
+ promptCount: metrics.promptCount
222
+ },
223
+ raw: {
224
+ artifactPath: source.path,
225
+ schema: payload.schema
226
+ }
227
+ });
228
+ }
229
+ const trainedScore = asNumber(metrics.trainedScore);
230
+ if (trainedModelId && trainedScore !== null) {
231
+ rows.push({
232
+ modelId: trainedModelId,
233
+ variant: "trained",
234
+ benchmark,
235
+ score: trainedScore,
236
+ tier: source.tier,
237
+ provider: source.provider,
238
+ datasetVersion: source.datasetVersion,
239
+ codeCommit: source.codeCommit,
240
+ ts: asString(payload.generatedAt) ?? void 0,
241
+ metrics: {
242
+ latencyMs: metrics.trainedLatencyMs,
243
+ promptCount: metrics.promptCount,
244
+ improvementAbsolute: metrics.improvementAbsolute,
245
+ improvementPercent: metrics.improvementPercent
246
+ },
247
+ raw: {
248
+ artifactPath: source.path,
249
+ schema: payload.schema
250
+ }
251
+ });
252
+ }
253
+ return rows;
254
+ }
255
+ function rowsFromBenchmarkMatrixArtifact(payload, source) {
256
+ const rows = Array.isArray(payload.rows) ? payload.rows.map(asRecord).filter((row) => row !== null) : [];
257
+ return rows.map((row) => {
258
+ const modelId = asString(row.modelId);
259
+ const benchmark = asString(row.benchmark);
260
+ const variant = row.variant;
261
+ const score = asNumber(row.score);
262
+ if (!modelId || !benchmark || score === null || variant !== "reference" && variant !== "base" && variant !== "trained") {
263
+ throw new Error(
264
+ `Benchmark matrix artifact ${source.path} has an invalid row`
265
+ );
266
+ }
267
+ return {
268
+ modelId,
269
+ benchmark: source.benchmark ?? benchmark,
270
+ score,
271
+ variant,
272
+ tier: source.tier ?? asString(row.tier) ?? void 0,
273
+ provider: source.provider ?? asString(row.provider) ?? void 0,
274
+ datasetVersion: source.datasetVersion ?? asString(row.datasetVersion) ?? void 0,
275
+ codeCommit: source.codeCommit ?? asString(row.codeCommit) ?? void 0,
276
+ ts: row.ts,
277
+ metrics: asRecord(row.metrics) ?? {},
278
+ raw: {
279
+ ...asRecord(row.raw) ?? {},
280
+ artifactPath: source.path,
281
+ schema: payload.schema
282
+ }
283
+ };
284
+ });
285
+ }
286
+ function buildBenchmarkMatrixRowsFromArtifactPayload(payload, source) {
287
+ if (payload.schema === ACTION_BENCHMARK_REPORT_SCHEMA) {
288
+ return rowFromActionBenchmarkArtifact(payload, source);
289
+ }
290
+ if (payload.schema === EVAL_COMPARISON_ARTIFACT_SCHEMA) {
291
+ return rowsFromEvalComparisonArtifact(payload, source);
292
+ }
293
+ if (payload.schema === BENCHMARK_MATRIX_ARTIFACT_SCHEMA) {
294
+ return rowsFromBenchmarkMatrixArtifact(payload, source);
295
+ }
296
+ throw new Error(`Unsupported benchmark artifact schema in ${source.path}`);
297
+ }
298
/**
 * Read every artifact file in order and collect the matrix rows it yields.
 *
 * Files are read one at a time, so the first failing artifact (in list order)
 * is the one reported.
 *
 * @param {object[]} artifacts Artifact descriptors; each needs a `path`.
 * @returns {Promise<object[]>} Rows of all artifacts, in artifact order.
 * @throws {SyntaxError} When a file is not valid JSON; the message names the
 *   file and the original parser error is attached as `cause`.
 * @throws {Error} When the parsed JSON is not an object, or when the payload
 *   converter rejects it. File-system errors from `readFile` propagate as-is.
 */
async function buildBenchmarkMatrixRowsFromArtifacts(artifacts) {
  const rows = [];
  for (const source of artifacts) {
    const text = await readFile(source.path, "utf-8");
    let parsed;
    try {
      parsed = JSON.parse(text);
    } catch (err) {
      // A bare JSON.parse error does not say which file is broken; keep the
      // SyntaxError type but add the path like every other failure here does.
      throw new SyntaxError(
        `Artifact ${source.path} is not valid JSON: ${err.message}`,
        { cause: err }
      );
    }
    const payload = asRecord(parsed);
    if (!payload) {
      throw new Error(`Artifact ${source.path} must be a JSON object`);
    }
    for (const row of buildBenchmarkMatrixRowsFromArtifactPayload(payload, source)) {
      rows.push(row);
    }
  }
  return rows;
}
308
/**
 * Assemble the in-memory benchmark-matrix artifact from raw row inputs.
 *
 * Rows are normalized first; the distinct tiers and benchmarks and the
 * per-tier comparisons are then derived from the normalized rows.
 *
 * @param {object} input Matrix input: `rows` plus optional `generatedAt`,
 *   `referenceModelId` and `source`.
 * @returns {object} The artifact payload (schema, version, counts, rows and
 *   comparisons). `generatedAt` defaults to the current time in ISO format.
 */
function buildBenchmarkMatrixArtifactPayload(input) {
  const rows = input.rows.map(normalizeRow);
  const referenceModelId = selectReferenceModelId(rows, input.referenceModelId);

  // Collect distinct values in first-seen order before sorting.
  const seenTiers = new Set();
  const seenBenchmarks = new Set();
  for (const row of rows) {
    if (row.tier) {
      seenTiers.add(row.tier);
    }
    seenBenchmarks.add(row.benchmark);
  }
  const tiers = [...seenTiers].sort(canonicalElizaOneTierSort);
  const benchmarks = [...seenBenchmarks].sort();

  const comparisons = buildComparisons(rows, referenceModelId);
  const counts = {
    rows: rows.length,
    comparisons: comparisons.length,
    tiers: tiers.length,
    benchmarks: benchmarks.length
  };
  return {
    schema: BENCHMARK_MATRIX_ARTIFACT_SCHEMA,
    version: BENCHMARK_MATRIX_ARTIFACT_VERSION,
    generatedAt: input.generatedAt ?? new Date().toISOString(),
    source: input.source ?? { kind: "training_benchmark_matrix" },
    referenceModelId,
    tiers,
    benchmarks,
    counts,
    rows,
    comparisons
  };
}
338
/**
 * Build the benchmark-matrix artifact and write it to disk as
 * `benchmark-matrix.json` (pretty-printed, with a trailing newline).
 *
 * @param {object} input Matrix input; `outputDir` is optional and defaults to
 *   `<trainingStateRoot()>/benchmarks/<safeTimestamp(generatedAt)>`.
 * @returns {Promise<{outputDir: string, artifactPath: string, artifact: object}>}
 *   The directory used, the file written and the artifact payload.
 */
async function writeBenchmarkMatrixArtifact(input) {
  const artifact = buildBenchmarkMatrixArtifactPayload(input);

  let outputDir = input.outputDir;
  if (outputDir == null) {
    // No explicit directory: derive one from the artifact's timestamp.
    outputDir = join(
      trainingStateRoot(),
      "benchmarks",
      safeTimestamp(artifact.generatedAt)
    );
  }
  await mkdir(outputDir, { recursive: true });

  const artifactPath = join(outputDir, "benchmark-matrix.json");
  const serialized = `${JSON.stringify(artifact, null, 2)}\n`;
  await writeFile(artifactPath, serialized, "utf-8");

  return { outputDir, artifactPath, artifact };
}
355
/**
 * Load rows from the given artifact files and write the combined
 * benchmark-matrix artifact.
 *
 * @param {object} input `artifacts` (descriptors with a `path`) plus optional
 *   `outputDir`, `generatedAt`, `referenceModelId` and `source`. When `source`
 *   is absent, one listing the artifact paths is recorded instead.
 * @returns {Promise<object>} The result of `writeBenchmarkMatrixArtifact`.
 */
async function writeBenchmarkMatrixArtifactFromArtifacts(input) {
  const rows = await buildBenchmarkMatrixRowsFromArtifacts(input.artifacts);
  const { outputDir, generatedAt, referenceModelId } = input;
  const source = input.source ?? {
    kind: "training_benchmark_matrix_from_artifacts",
    artifacts: input.artifacts.map(({ path }) => path)
  };
  return writeBenchmarkMatrixArtifact({
    rows,
    outputDir,
    generatedAt,
    referenceModelId,
    source
  });
}
368
+ export {
369
+ ACTION_BENCHMARK_REPORT_SCHEMA,
370
+ ACTION_SELECTION_BENCHMARK_ID,
371
+ BENCHMARK_MATRIX_ARTIFACT_SCHEMA,
372
+ BENCHMARK_MATRIX_ARTIFACT_VERSION,
373
+ ELIZA_ONE_MATRIX_TIERS,
374
+ LOCAL_EVAL_COMPARISON_BENCHMARK_ID,
375
+ buildBenchmarkMatrixArtifactPayload,
376
+ buildBenchmarkMatrixRowsFromArtifactPayload,
377
+ buildBenchmarkMatrixRowsFromArtifacts,
378
+ writeBenchmarkMatrixArtifact,
379
+ writeBenchmarkMatrixArtifactFromArtifacts
380
+ };
381
+ //# sourceMappingURL=benchmark-matrix-artifact.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/core/benchmark-matrix-artifact.ts"],"sourcesContent":["import { mkdir, writeFile } from \"node:fs/promises\";\nimport { readFile } from \"node:fs/promises\";\nimport { join } from \"node:path\";\nimport {\n canonicalElizaOneTierSort,\n normalizeElizaOneBenchmarkTier,\n} from \"./eliza1-benchmark-recipe.js\";\nimport { EVAL_COMPARISON_ARTIFACT_SCHEMA } from \"./eval-comparison-artifact.js\";\nimport { trainingStateRoot } from \"./training-config.js\";\n\nexport const BENCHMARK_MATRIX_ARTIFACT_SCHEMA =\n \"eliza_benchmark_matrix_artifact\";\nexport const BENCHMARK_MATRIX_ARTIFACT_VERSION = 1;\nexport const ACTION_BENCHMARK_REPORT_SCHEMA =\n \"eliza_action_selection_benchmark_report\";\nexport const ACTION_SELECTION_BENCHMARK_ID = \"eliza_harness_action_selection\";\nexport const LOCAL_EVAL_COMPARISON_BENCHMARK_ID =\n \"eliza_harness_local_eval_comparison\";\n\nexport const ELIZA_ONE_MATRIX_TIERS = [\n \"0b\",\n \"2b\",\n \"4b\",\n \"9b\",\n \"27b\",\n] as const;\n\nexport type ElizaOneMatrixTier = (typeof ELIZA_ONE_MATRIX_TIERS)[number];\nexport type BenchmarkMatrixVariant = \"reference\" | \"base\" | \"trained\";\n\nexport interface BenchmarkMatrixRowInput {\n modelId: string;\n benchmark: string;\n score: number;\n variant: BenchmarkMatrixVariant;\n tier?: string;\n provider?: string;\n datasetVersion?: string;\n codeCommit?: string;\n ts?: number | string;\n metrics?: Record<string, unknown>;\n raw?: Record<string, unknown>;\n}\n\nexport interface BenchmarkMatrixInput {\n rows: BenchmarkMatrixRowInput[];\n outputDir?: string;\n generatedAt?: string;\n referenceModelId?: string;\n source?: Record<string, unknown>;\n}\n\nexport interface BenchmarkMatrixArtifactSource {\n path: string;\n modelId?: string;\n benchmark?: string;\n variant?: BenchmarkMatrixVariant;\n tier?: string;\n provider?: string;\n datasetVersion?: string;\n codeCommit?: string;\n useMocks?: boolean;\n}\n\nexport interface BenchmarkMatrixFromArtifactsInput {\n 
artifacts: BenchmarkMatrixArtifactSource[];\n outputDir?: string;\n generatedAt?: string;\n referenceModelId?: string;\n source?: Record<string, unknown>;\n}\n\nexport interface BenchmarkMatrixCell {\n modelId: string;\n benchmark: string;\n score: number;\n variant: BenchmarkMatrixVariant;\n tier: string | null;\n provider: string | null;\n datasetVersion: string | null;\n codeCommit: string | null;\n ts: number | string | null;\n metrics: Record<string, unknown>;\n raw: Record<string, unknown>;\n}\n\nexport interface BenchmarkMatrixComparison {\n tier: string;\n benchmark: string;\n baseModelId: string | null;\n trainedModelId: string | null;\n referenceModelId: string | null;\n baseScore: number | null;\n trainedScore: number | null;\n referenceScore: number | null;\n improvementAbsolute: number | null;\n improvementPercent: number | null;\n trainedVsReferenceAbsolute: number | null;\n trainedVsReferencePercent: number | null;\n dryRun: boolean;\n}\n\nexport interface BenchmarkMatrixArtifact {\n schema: typeof BENCHMARK_MATRIX_ARTIFACT_SCHEMA;\n version: typeof BENCHMARK_MATRIX_ARTIFACT_VERSION;\n generatedAt: string;\n source: Record<string, unknown>;\n referenceModelId: string | null;\n tiers: string[];\n benchmarks: string[];\n counts: {\n rows: number;\n comparisons: number;\n tiers: number;\n benchmarks: number;\n };\n rows: BenchmarkMatrixCell[];\n comparisons: BenchmarkMatrixComparison[];\n}\n\nexport interface BenchmarkMatrixArtifactResult {\n outputDir: string;\n artifactPath: string;\n artifact: BenchmarkMatrixArtifact;\n}\n\nfunction asRecord(value: unknown): Record<string, unknown> | null {\n return value && typeof value === \"object\" && !Array.isArray(value)\n ? (value as Record<string, unknown>)\n : null;\n}\n\nfunction asString(value: unknown): string | null {\n return typeof value === \"string\" && value.trim().length > 0\n ? 
value.trim()\n : null;\n}\n\nfunction asNumber(value: unknown): number | null {\n return typeof value === \"number\" && Number.isFinite(value) ? value : null;\n}\n\nfunction finiteScore(value: number): number {\n if (!Number.isFinite(value))\n throw new Error(`score must be finite; got ${value}`);\n return value;\n}\n\nfunction roundMetric(value: number | null): number | null {\n return value === null ? null : Number(value.toFixed(6));\n}\n\nfunction percentDelta(base: number | null, next: number | null): number | null {\n if (base === null || next === null || base === 0) return null;\n return ((next - base) / Math.abs(base)) * 100;\n}\n\nfunction isDryRunRow(row: BenchmarkMatrixCell | null | undefined): boolean {\n if (!row) return false;\n const rawSource = asRecord(row.raw.source);\n return (\n row.metrics.dryRun === true ||\n row.raw.dryRun === true ||\n rawSource?.dryRun === true\n );\n}\n\nfunction inferTier(modelId: string, explicit?: string): string | null {\n const tier = asString(explicit);\n if (tier) return normalizeElizaOneBenchmarkTier(tier) ?? tier;\n const normalized = modelId.toLowerCase();\n if (normalized.includes(\"27b\")) return \"27b\";\n if (normalized.includes(\"9b\")) return \"9b\";\n if (normalized.includes(\"4b\")) return \"4b\";\n if (normalized.includes(\"2b\")) return \"2b\";\n if (normalized.includes(\"0b\")) return \"0b\";\n return null;\n}\n\nfunction normalizeRow(row: BenchmarkMatrixRowInput): BenchmarkMatrixCell {\n const explicitReferenceTier = asString(row.tier);\n return {\n modelId: row.modelId,\n benchmark: row.benchmark,\n score: finiteScore(row.score),\n variant: row.variant,\n tier:\n row.variant === \"reference\" && !explicitReferenceTier\n ? null\n : inferTier(row.modelId, row.tier),\n provider: asString(row.provider),\n datasetVersion: asString(row.datasetVersion),\n codeCommit: asString(row.codeCommit),\n ts: row.ts ?? null,\n metrics: row.metrics ?? {},\n raw: row.raw ?? 
{},\n };\n}\n\nfunction selectReferenceModelId(\n rows: readonly BenchmarkMatrixCell[],\n explicit?: string,\n): string | null {\n if (explicit) return explicit;\n return (\n rows.find((row) => row.variant === \"reference\")?.modelId ??\n rows.find((row) => row.provider === \"cerebras\")?.modelId ??\n null\n );\n}\n\nfunction scoreFor(\n rows: readonly BenchmarkMatrixCell[],\n benchmark: string,\n variant: BenchmarkMatrixVariant,\n tier?: string,\n): BenchmarkMatrixCell | null {\n if (variant === \"reference\") {\n return (\n rows.find(\n (row) =>\n row.benchmark === benchmark &&\n row.variant === \"reference\" &&\n row.tier === tier,\n ) ??\n rows.find(\n (row) =>\n row.benchmark === benchmark &&\n row.variant === \"reference\" &&\n row.tier === null,\n ) ??\n null\n );\n }\n return (\n rows.find(\n (row) =>\n row.benchmark === benchmark &&\n row.variant === variant &&\n row.tier === tier,\n ) ?? null\n );\n}\n\nfunction buildComparisons(\n rows: readonly BenchmarkMatrixCell[],\n referenceModelId: string | null,\n): BenchmarkMatrixComparison[] {\n const tiers = Array.from(\n new Set(\n rows\n .map((row) => row.tier)\n .filter((tier): tier is string => tier !== null),\n ),\n ).sort(canonicalElizaOneTierSort);\n const benchmarks = Array.from(\n new Set(rows.map((row) => row.benchmark)),\n ).sort();\n const comparisons: BenchmarkMatrixComparison[] = [];\n for (const tier of tiers) {\n for (const benchmark of benchmarks) {\n const base = scoreFor(rows, benchmark, \"base\", tier);\n const trained = scoreFor(rows, benchmark, \"trained\", tier);\n const reference = scoreFor(rows, benchmark, \"reference\", tier);\n if (!base && !trained && !reference) continue;\n const dryRun =\n isDryRunRow(base) || isDryRunRow(trained) || isDryRunRow(reference);\n comparisons.push({\n tier,\n benchmark,\n baseModelId: base?.modelId ?? null,\n trainedModelId: trained?.modelId ?? null,\n referenceModelId: reference?.modelId ?? referenceModelId,\n baseScore: base?.score ?? 
null,\n trainedScore: trained?.score ?? null,\n referenceScore: reference?.score ?? null,\n improvementAbsolute: roundMetric(\n base && trained ? trained.score - base.score : null,\n ),\n improvementPercent: roundMetric(\n percentDelta(base?.score ?? null, trained?.score ?? null),\n ),\n trainedVsReferenceAbsolute: roundMetric(\n trained && reference ? trained.score - reference.score : null,\n ),\n trainedVsReferencePercent: roundMetric(\n percentDelta(reference?.score ?? null, trained?.score ?? null),\n ),\n dryRun,\n });\n }\n }\n return comparisons;\n}\n\nfunction safeTimestamp(value: string): string {\n return value.replace(/[:.]/g, \"-\");\n}\n\nfunction rowFromActionBenchmarkArtifact(\n payload: Record<string, unknown>,\n source: BenchmarkMatrixArtifactSource,\n): BenchmarkMatrixRowInput[] {\n const reportSource = asRecord(payload.source) ?? {};\n const embeddedVariant = reportSource.variant;\n const modelId = source.modelId ?? asString(reportSource.modelId) ?? undefined;\n const variant =\n source.variant ??\n (embeddedVariant === \"reference\" ||\n embeddedVariant === \"base\" ||\n embeddedVariant === \"trained\"\n ? embeddedVariant\n : undefined);\n if (!modelId || !variant) {\n throw new Error(\n `Action benchmark artifact ${source.path} requires modelId and variant`,\n );\n }\n const summary = asRecord(payload.summary) ?? {};\n const dryRun = payload.dryRun === true || reportSource.dryRun === true;\n const useMocks =\n source.useMocks === true ||\n reportSource.useMocks === true ||\n payload.useMocks === true;\n const score = asNumber(summary.accuracy) ?? (dryRun ? 0 : null);\n if (score === null) {\n throw new Error(\n `Action benchmark artifact ${source.path} missing accuracy`,\n );\n }\n const caseSamples = Array.isArray(payload.results)\n ? 
payload.results\n .map(asRecord)\n .filter((result): result is Record<string, unknown> => result !== null)\n .slice(0, 8)\n .map((result) => ({\n caseId: asString(result.caseId),\n prompt:\n asString(result.prompt) ??\n asString(result.input) ??\n asString(result.userPrompt),\n expectedAction: asString(result.expectedAction),\n actualAction: asString(result.actualAction),\n pass: result.pass === true,\n response:\n asString(result.response) ??\n asString(result.output) ??\n asString(result.finalResponse) ??\n asString(result.failureReason),\n latencyMs: asNumber(result.latencyMs),\n trajectoryPath: asString(result.trajectoryPath),\n }))\n : [];\n return [\n {\n modelId,\n variant,\n benchmark:\n source.benchmark ??\n asString(reportSource.benchmark) ??\n ACTION_SELECTION_BENCHMARK_ID,\n score,\n tier: source.tier ?? asString(reportSource.tier) ?? undefined,\n provider: source.provider ?? asString(reportSource.provider) ?? undefined,\n datasetVersion:\n source.datasetVersion ??\n asString(reportSource.datasetVersion) ??\n undefined,\n codeCommit:\n source.codeCommit ?? asString(reportSource.codeCommit) ?? undefined,\n ts: asString(payload.generatedAt) ?? undefined,\n metrics: {\n plannerAccuracy: summary.plannerAccuracy,\n executionAccuracy: summary.executionAccuracy,\n total: summary.total,\n passed: summary.passed,\n failed: summary.failed,\n latency: summary.latency,\n failureModes: payload.failureModes,\n dryRun,\n useMocks,\n },\n raw: {\n artifactPath: source.path,\n schema: payload.schema,\n source: payload.source,\n caseSamples,\n dryRun,\n useMocks,\n },\n },\n ];\n}\n\nfunction rowsFromEvalComparisonArtifact(\n payload: Record<string, unknown>,\n source: BenchmarkMatrixArtifactSource,\n): BenchmarkMatrixRowInput[] {\n const models = asRecord(payload.models) ?? {};\n const metrics = asRecord(payload.metrics) ?? {};\n const benchmark = source.benchmark ?? LOCAL_EVAL_COMPARISON_BENCHMARK_ID;\n const baseModelId =\n source.variant === \"base\" ? 
source.modelId : asString(models.base);\n const trainedModelId =\n source.variant === \"trained\" ? source.modelId : asString(models.trained);\n const rows: BenchmarkMatrixRowInput[] = [];\n const baseScore = asNumber(metrics.baseScore);\n if (baseModelId && baseScore !== null) {\n rows.push({\n modelId: baseModelId,\n variant: \"base\",\n benchmark,\n score: baseScore,\n tier: source.tier,\n provider: source.provider,\n datasetVersion: source.datasetVersion,\n codeCommit: source.codeCommit,\n ts: asString(payload.generatedAt) ?? undefined,\n metrics: {\n latencyMs: metrics.baseLatencyMs,\n promptCount: metrics.promptCount,\n },\n raw: {\n artifactPath: source.path,\n schema: payload.schema,\n },\n });\n }\n const trainedScore = asNumber(metrics.trainedScore);\n if (trainedModelId && trainedScore !== null) {\n rows.push({\n modelId: trainedModelId,\n variant: \"trained\",\n benchmark,\n score: trainedScore,\n tier: source.tier,\n provider: source.provider,\n datasetVersion: source.datasetVersion,\n codeCommit: source.codeCommit,\n ts: asString(payload.generatedAt) ?? undefined,\n metrics: {\n latencyMs: metrics.trainedLatencyMs,\n promptCount: metrics.promptCount,\n improvementAbsolute: metrics.improvementAbsolute,\n improvementPercent: metrics.improvementPercent,\n },\n raw: {\n artifactPath: source.path,\n schema: payload.schema,\n },\n });\n }\n return rows;\n}\n\nfunction rowsFromBenchmarkMatrixArtifact(\n payload: Record<string, unknown>,\n source: BenchmarkMatrixArtifactSource,\n): BenchmarkMatrixRowInput[] {\n const rows = Array.isArray(payload.rows)\n ? 
payload.rows\n .map(asRecord)\n .filter((row): row is Record<string, unknown> => row !== null)\n : [];\n return rows.map((row) => {\n const modelId = asString(row.modelId);\n const benchmark = asString(row.benchmark);\n const variant = row.variant;\n const score = asNumber(row.score);\n if (\n !modelId ||\n !benchmark ||\n score === null ||\n (variant !== \"reference\" && variant !== \"base\" && variant !== \"trained\")\n ) {\n throw new Error(\n `Benchmark matrix artifact ${source.path} has an invalid row`,\n );\n }\n return {\n modelId,\n benchmark: source.benchmark ?? benchmark,\n score,\n variant,\n tier: source.tier ?? asString(row.tier) ?? undefined,\n provider: source.provider ?? asString(row.provider) ?? undefined,\n datasetVersion:\n source.datasetVersion ?? asString(row.datasetVersion) ?? undefined,\n codeCommit: source.codeCommit ?? asString(row.codeCommit) ?? undefined,\n ts: row.ts as number | string | undefined,\n metrics: asRecord(row.metrics) ?? {},\n raw: {\n ...(asRecord(row.raw) ?? 
{}),\n artifactPath: source.path,\n schema: payload.schema,\n },\n };\n });\n}\n\nexport function buildBenchmarkMatrixRowsFromArtifactPayload(\n payload: Record<string, unknown>,\n source: BenchmarkMatrixArtifactSource,\n): BenchmarkMatrixRowInput[] {\n if (payload.schema === ACTION_BENCHMARK_REPORT_SCHEMA) {\n return rowFromActionBenchmarkArtifact(payload, source);\n }\n if (payload.schema === EVAL_COMPARISON_ARTIFACT_SCHEMA) {\n return rowsFromEvalComparisonArtifact(payload, source);\n }\n if (payload.schema === BENCHMARK_MATRIX_ARTIFACT_SCHEMA) {\n return rowsFromBenchmarkMatrixArtifact(payload, source);\n }\n throw new Error(`Unsupported benchmark artifact schema in ${source.path}`);\n}\n\nexport async function buildBenchmarkMatrixRowsFromArtifacts(\n artifacts: BenchmarkMatrixArtifactSource[],\n): Promise<BenchmarkMatrixRowInput[]> {\n const rows: BenchmarkMatrixRowInput[] = [];\n for (const source of artifacts) {\n const payload = asRecord(JSON.parse(await readFile(source.path, \"utf-8\")));\n if (!payload)\n throw new Error(`Artifact ${source.path} must be a JSON object`);\n rows.push(...buildBenchmarkMatrixRowsFromArtifactPayload(payload, source));\n }\n return rows;\n}\n\nexport function buildBenchmarkMatrixArtifactPayload(\n input: BenchmarkMatrixInput,\n): BenchmarkMatrixArtifact {\n const rows = input.rows.map(normalizeRow);\n const referenceModelId = selectReferenceModelId(rows, input.referenceModelId);\n const tiers = Array.from(\n new Set(\n rows.map((row) => row.tier).filter((tier): tier is string => !!tier),\n ),\n ).sort(canonicalElizaOneTierSort);\n const benchmarks = Array.from(\n new Set(rows.map((row) => row.benchmark)),\n ).sort();\n const comparisons = buildComparisons(rows, referenceModelId);\n return {\n schema: BENCHMARK_MATRIX_ARTIFACT_SCHEMA,\n version: BENCHMARK_MATRIX_ARTIFACT_VERSION,\n generatedAt: input.generatedAt ?? new Date().toISOString(),\n source: input.source ?? 
{ kind: \"training_benchmark_matrix\" },\n referenceModelId,\n tiers,\n benchmarks,\n counts: {\n rows: rows.length,\n comparisons: comparisons.length,\n tiers: tiers.length,\n benchmarks: benchmarks.length,\n },\n rows,\n comparisons,\n };\n}\n\nexport async function writeBenchmarkMatrixArtifact(\n input: BenchmarkMatrixInput,\n): Promise<BenchmarkMatrixArtifactResult> {\n const artifact = buildBenchmarkMatrixArtifactPayload(input);\n const outputDir =\n input.outputDir ??\n join(\n trainingStateRoot(),\n \"benchmarks\",\n safeTimestamp(artifact.generatedAt),\n );\n await mkdir(outputDir, { recursive: true });\n const artifactPath = join(outputDir, \"benchmark-matrix.json\");\n await writeFile(\n artifactPath,\n `${JSON.stringify(artifact, null, 2)}\\n`,\n \"utf-8\",\n );\n return { outputDir, artifactPath, artifact };\n}\n\nexport async function writeBenchmarkMatrixArtifactFromArtifacts(\n input: BenchmarkMatrixFromArtifactsInput,\n): Promise<BenchmarkMatrixArtifactResult> {\n const rows = await buildBenchmarkMatrixRowsFromArtifacts(input.artifacts);\n return writeBenchmarkMatrixArtifact({\n rows,\n outputDir: input.outputDir,\n generatedAt: input.generatedAt,\n referenceModelId: input.referenceModelId,\n source: input.source ?? 
{\n kind: \"training_benchmark_matrix_from_artifacts\",\n artifacts: input.artifacts.map((artifact) => artifact.path),\n },\n });\n}\n"],"mappings":"AAAA,SAAS,OAAO,iBAAiB;AACjC,SAAS,gBAAgB;AACzB,SAAS,YAAY;AACrB;AAAA,EACE;AAAA,EACA;AAAA,OACK;AACP,SAAS,uCAAuC;AAChD,SAAS,yBAAyB;AAE3B,MAAM,mCACX;AACK,MAAM,oCAAoC;AAC1C,MAAM,iCACX;AACK,MAAM,gCAAgC;AACtC,MAAM,qCACX;AAEK,MAAM,yBAAyB;AAAA,EACpC;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF;AAqGA,SAAS,SAAS,OAAgD;AAChE,SAAO,SAAS,OAAO,UAAU,YAAY,CAAC,MAAM,QAAQ,KAAK,IAC5D,QACD;AACN;AAEA,SAAS,SAAS,OAA+B;AAC/C,SAAO,OAAO,UAAU,YAAY,MAAM,KAAK,EAAE,SAAS,IACtD,MAAM,KAAK,IACX;AACN;AAEA,SAAS,SAAS,OAA+B;AAC/C,SAAO,OAAO,UAAU,YAAY,OAAO,SAAS,KAAK,IAAI,QAAQ;AACvE;AAEA,SAAS,YAAY,OAAuB;AAC1C,MAAI,CAAC,OAAO,SAAS,KAAK;AACxB,UAAM,IAAI,MAAM,6BAA6B,KAAK,EAAE;AACtD,SAAO;AACT;AAEA,SAAS,YAAY,OAAqC;AACxD,SAAO,UAAU,OAAO,OAAO,OAAO,MAAM,QAAQ,CAAC,CAAC;AACxD;AAEA,SAAS,aAAa,MAAqB,MAAoC;AAC7E,MAAI,SAAS,QAAQ,SAAS,QAAQ,SAAS,EAAG,QAAO;AACzD,UAAS,OAAO,QAAQ,KAAK,IAAI,IAAI,IAAK;AAC5C;AAEA,SAAS,YAAY,KAAsD;AACzE,MAAI,CAAC,IAAK,QAAO;AACjB,QAAM,YAAY,SAAS,IAAI,IAAI,MAAM;AACzC,SACE,IAAI,QAAQ,WAAW,QACvB,IAAI,IAAI,WAAW,QACnB,WAAW,WAAW;AAE1B;AAEA,SAAS,UAAU,SAAiB,UAAkC;AACpE,QAAM,OAAO,SAAS,QAAQ;AAC9B,MAAI,KAAM,QAAO,+BAA+B,IAAI,KAAK;AACzD,QAAM,aAAa,QAAQ,YAAY;AACvC,MAAI,WAAW,SAAS,KAAK,EAAG,QAAO;AACvC,MAAI,WAAW,SAAS,IAAI,EAAG,QAAO;AACtC,MAAI,WAAW,SAAS,IAAI,EAAG,QAAO;AACtC,MAAI,WAAW,SAAS,IAAI,EAAG,QAAO;AACtC,MAAI,WAAW,SAAS,IAAI,EAAG,QAAO;AACtC,SAAO;AACT;AAEA,SAAS,aAAa,KAAmD;AACvE,QAAM,wBAAwB,SAAS,IAAI,IAAI;AAC/C,SAAO;AAAA,IACL,SAAS,IAAI;AAAA,IACb,WAAW,IAAI;AAAA,IACf,OAAO,YAAY,IAAI,KAAK;AAAA,IAC5B,SAAS,IAAI;AAAA,IACb,MACE,IAAI,YAAY,eAAe,CAAC,wBAC5B,OACA,UAAU,IAAI,SAAS,IAAI,IAAI;AAAA,IACrC,UAAU,SAAS,IAAI,QAAQ;AAAA,IAC/B,gBAAgB,SAAS,IAAI,cAAc;AAAA,IAC3C,YAAY,SAAS,IAAI,UAAU;AAAA,IACnC,IAAI,IAAI,MAAM;AAAA,IACd,SAAS,IAAI,WAAW,CAAC;AAAA,IACzB,KAAK,IAAI,OAAO,CAAC;AAAA,EACnB;AACF;AAEA,SAAS,uBACP,MACA,UACe;AACf,MAAI,SAAU,QAAO;AACrB,SACE,KAAK,KAAK,CAAC,QAAQ,IAAI,YAAY,WAAW,GAAG,WACjD,KAAK,KAA
K,CAAC,QAAQ,IAAI,aAAa,UAAU,GAAG,WACjD;AAEJ;AAEA,SAAS,SACP,MACA,WACA,SACA,MAC4B;AAC5B,MAAI,YAAY,aAAa;AAC3B,WACE,KAAK;AAAA,MACH,CAAC,QACC,IAAI,cAAc,aAClB,IAAI,YAAY,eAChB,IAAI,SAAS;AAAA,IACjB,KACA,KAAK;AAAA,MACH,CAAC,QACC,IAAI,cAAc,aAClB,IAAI,YAAY,eAChB,IAAI,SAAS;AAAA,IACjB,KACA;AAAA,EAEJ;AACA,SACE,KAAK;AAAA,IACH,CAAC,QACC,IAAI,cAAc,aAClB,IAAI,YAAY,WAChB,IAAI,SAAS;AAAA,EACjB,KAAK;AAET;AAEA,SAAS,iBACP,MACA,kBAC6B;AAC7B,QAAM,QAAQ,MAAM;AAAA,IAClB,IAAI;AAAA,MACF,KACG,IAAI,CAAC,QAAQ,IAAI,IAAI,EACrB,OAAO,CAAC,SAAyB,SAAS,IAAI;AAAA,IACnD;AAAA,EACF,EAAE,KAAK,yBAAyB;AAChC,QAAM,aAAa,MAAM;AAAA,IACvB,IAAI,IAAI,KAAK,IAAI,CAAC,QAAQ,IAAI,SAAS,CAAC;AAAA,EAC1C,EAAE,KAAK;AACP,QAAM,cAA2C,CAAC;AAClD,aAAW,QAAQ,OAAO;AACxB,eAAW,aAAa,YAAY;AAClC,YAAM,OAAO,SAAS,MAAM,WAAW,QAAQ,IAAI;AACnD,YAAM,UAAU,SAAS,MAAM,WAAW,WAAW,IAAI;AACzD,YAAM,YAAY,SAAS,MAAM,WAAW,aAAa,IAAI;AAC7D,UAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,UAAW;AACrC,YAAM,SACJ,YAAY,IAAI,KAAK,YAAY,OAAO,KAAK,YAAY,SAAS;AACpE,kBAAY,KAAK;AAAA,QACf;AAAA,QACA;AAAA,QACA,aAAa,MAAM,WAAW;AAAA,QAC9B,gBAAgB,SAAS,WAAW;AAAA,QACpC,kBAAkB,WAAW,WAAW;AAAA,QACxC,WAAW,MAAM,SAAS;AAAA,QAC1B,cAAc,SAAS,SAAS;AAAA,QAChC,gBAAgB,WAAW,SAAS;AAAA,QACpC,qBAAqB;AAAA,UACnB,QAAQ,UAAU,QAAQ,QAAQ,KAAK,QAAQ;AAAA,QACjD;AAAA,QACA,oBAAoB;AAAA,UAClB,aAAa,MAAM,SAAS,MAAM,SAAS,SAAS,IAAI;AAAA,QAC1D;AAAA,QACA,4BAA4B;AAAA,UAC1B,WAAW,YAAY,QAAQ,QAAQ,UAAU,QAAQ;AAAA,QAC3D;AAAA,QACA,2BAA2B;AAAA,UACzB,aAAa,WAAW,SAAS,MAAM,SAAS,SAAS,IAAI;AAAA,QAC/D;AAAA,QACA;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AACA,SAAO;AACT;AAEA,SAAS,cAAc,OAAuB;AAC5C,SAAO,MAAM,QAAQ,SAAS,GAAG;AACnC;AAEA,SAAS,+BACP,SACA,QAC2B;AAC3B,QAAM,eAAe,SAAS,QAAQ,MAAM,KAAK,CAAC;AAClD,QAAM,kBAAkB,aAAa;AACrC,QAAM,UAAU,OAAO,WAAW,SAAS,aAAa,OAAO,KAAK;AACpE,QAAM,UACJ,OAAO,YACN,oBAAoB,eACrB,oBAAoB,UACpB,oBAAoB,YAChB,kBACA;AACN,MAAI,CAAC,WAAW,CAAC,SAAS;AACxB,UAAM,IAAI;AAAA,MACR,6BAA6B,OAAO,IAAI;AAAA,IAC1C;AAAA,EACF;AACA,QAAM,UAAU,SAAS,QAAQ,OAAO,KAAK,CAAC;AAC9C,QAAM,SAAS,QAAQ,WAAW,QAAQ,aAAa,WAAW;AAClE,QAAM,WACJ,OAAO,aAAa,QACpB,aAAa,aAAa,QAC1B,QAAQ,aAAa;AACvB,QAAM,QAAQ
,SAAS,QAAQ,QAAQ,MAAM,SAAS,IAAI;AAC1D,MAAI,UAAU,MAAM;AAClB,UAAM,IAAI;AAAA,MACR,6BAA6B,OAAO,IAAI;AAAA,IAC1C;AAAA,EACF;AACA,QAAM,cAAc,MAAM,QAAQ,QAAQ,OAAO,IAC7C,QAAQ,QACL,IAAI,QAAQ,EACZ,OAAO,CAAC,WAA8C,WAAW,IAAI,EACrE,MAAM,GAAG,CAAC,EACV,IAAI,CAAC,YAAY;AAAA,IAChB,QAAQ,SAAS,OAAO,MAAM;AAAA,IAC9B,QACE,SAAS,OAAO,MAAM,KACtB,SAAS,OAAO,KAAK,KACrB,SAAS,OAAO,UAAU;AAAA,IAC5B,gBAAgB,SAAS,OAAO,cAAc;AAAA,IAC9C,cAAc,SAAS,OAAO,YAAY;AAAA,IAC1C,MAAM,OAAO,SAAS;AAAA,IACtB,UACE,SAAS,OAAO,QAAQ,KACxB,SAAS,OAAO,MAAM,KACtB,SAAS,OAAO,aAAa,KAC7B,SAAS,OAAO,aAAa;AAAA,IAC/B,WAAW,SAAS,OAAO,SAAS;AAAA,IACpC,gBAAgB,SAAS,OAAO,cAAc;AAAA,EAChD,EAAE,IACJ,CAAC;AACL,SAAO;AAAA,IACL;AAAA,MACE;AAAA,MACA;AAAA,MACA,WACE,OAAO,aACP,SAAS,aAAa,SAAS,KAC/B;AAAA,MACF;AAAA,MACA,MAAM,OAAO,QAAQ,SAAS,aAAa,IAAI,KAAK;AAAA,MACpD,UAAU,OAAO,YAAY,SAAS,aAAa,QAAQ,KAAK;AAAA,MAChE,gBACE,OAAO,kBACP,SAAS,aAAa,cAAc,KACpC;AAAA,MACF,YACE,OAAO,cAAc,SAAS,aAAa,UAAU,KAAK;AAAA,MAC5D,IAAI,SAAS,QAAQ,WAAW,KAAK;AAAA,MACrC,SAAS;AAAA,QACP,iBAAiB,QAAQ;AAAA,QACzB,mBAAmB,QAAQ;AAAA,QAC3B,OAAO,QAAQ;AAAA,QACf,QAAQ,QAAQ;AAAA,QAChB,QAAQ,QAAQ;AAAA,QAChB,SAAS,QAAQ;AAAA,QACjB,cAAc,QAAQ;AAAA,QACtB;AAAA,QACA;AAAA,MACF;AAAA,MACA,KAAK;AAAA,QACH,cAAc,OAAO;AAAA,QACrB,QAAQ,QAAQ;AAAA,QAChB,QAAQ,QAAQ;AAAA,QAChB;AAAA,QACA;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,+BACP,SACA,QAC2B;AAC3B,QAAM,SAAS,SAAS,QAAQ,MAAM,KAAK,CAAC;AAC5C,QAAM,UAAU,SAAS,QAAQ,OAAO,KAAK,CAAC;AAC9C,QAAM,YAAY,OAAO,aAAa;AACtC,QAAM,cACJ,OAAO,YAAY,SAAS,OAAO,UAAU,SAAS,OAAO,IAAI;AACnE,QAAM,iBACJ,OAAO,YAAY,YAAY,OAAO,UAAU,SAAS,OAAO,OAAO;AACzE,QAAM,OAAkC,CAAC;AACzC,QAAM,YAAY,SAAS,QAAQ,SAAS;AAC5C,MAAI,eAAe,cAAc,MAAM;AACrC,SAAK,KAAK;AAAA,MACR,SAAS;AAAA,MACT,SAAS;AAAA,MACT;AAAA,MACA,OAAO;AAAA,MACP,MAAM,OAAO;AAAA,MACb,UAAU,OAAO;AAAA,MACjB,gBAAgB,OAAO;AAAA,MACvB,YAAY,OAAO;AAAA,MACnB,IAAI,SAAS,QAAQ,WAAW,KAAK;AAAA,MACrC,SAAS;AAAA,QACP,WAAW,QAAQ;AAAA,QACnB,aAAa,QAAQ;AAAA,MACvB;AAAA,MACA,KAAK;AAAA,QACH,cAAc,OAAO;AAAA,QACrB,QAAQ,QAAQ;AAAA,MAClB;AAAA,IACF,CAAC;AAAA,EACH;AACA,QAAM,eAAe,SAAS,QAAQ,YAAY;AAClD,MAA
I,kBAAkB,iBAAiB,MAAM;AAC3C,SAAK,KAAK;AAAA,MACR,SAAS;AAAA,MACT,SAAS;AAAA,MACT;AAAA,MACA,OAAO;AAAA,MACP,MAAM,OAAO;AAAA,MACb,UAAU,OAAO;AAAA,MACjB,gBAAgB,OAAO;AAAA,MACvB,YAAY,OAAO;AAAA,MACnB,IAAI,SAAS,QAAQ,WAAW,KAAK;AAAA,MACrC,SAAS;AAAA,QACP,WAAW,QAAQ;AAAA,QACnB,aAAa,QAAQ;AAAA,QACrB,qBAAqB,QAAQ;AAAA,QAC7B,oBAAoB,QAAQ;AAAA,MAC9B;AAAA,MACA,KAAK;AAAA,QACH,cAAc,OAAO;AAAA,QACrB,QAAQ,QAAQ;AAAA,MAClB;AAAA,IACF,CAAC;AAAA,EACH;AACA,SAAO;AACT;AAEA,SAAS,gCACP,SACA,QAC2B;AAC3B,QAAM,OAAO,MAAM,QAAQ,QAAQ,IAAI,IACnC,QAAQ,KACL,IAAI,QAAQ,EACZ,OAAO,CAAC,QAAwC,QAAQ,IAAI,IAC/D,CAAC;AACL,SAAO,KAAK,IAAI,CAAC,QAAQ;AACvB,UAAM,UAAU,SAAS,IAAI,OAAO;AACpC,UAAM,YAAY,SAAS,IAAI,SAAS;AACxC,UAAM,UAAU,IAAI;AACpB,UAAM,QAAQ,SAAS,IAAI,KAAK;AAChC,QACE,CAAC,WACD,CAAC,aACD,UAAU,QACT,YAAY,eAAe,YAAY,UAAU,YAAY,WAC9D;AACA,YAAM,IAAI;AAAA,QACR,6BAA6B,OAAO,IAAI;AAAA,MAC1C;AAAA,IACF;AACA,WAAO;AAAA,MACL;AAAA,MACA,WAAW,OAAO,aAAa;AAAA,MAC/B;AAAA,MACA;AAAA,MACA,MAAM,OAAO,QAAQ,SAAS,IAAI,IAAI,KAAK;AAAA,MAC3C,UAAU,OAAO,YAAY,SAAS,IAAI,QAAQ,KAAK;AAAA,MACvD,gBACE,OAAO,kBAAkB,SAAS,IAAI,cAAc,KAAK;AAAA,MAC3D,YAAY,OAAO,cAAc,SAAS,IAAI,UAAU,KAAK;AAAA,MAC7D,IAAI,IAAI;AAAA,MACR,SAAS,SAAS,IAAI,OAAO,KAAK,CAAC;AAAA,MACnC,KAAK;AAAA,QACH,GAAI,SAAS,IAAI,GAAG,KAAK,CAAC;AAAA,QAC1B,cAAc,OAAO;AAAA,QACrB,QAAQ,QAAQ;AAAA,MAClB;AAAA,IACF;AAAA,EACF,CAAC;AACH;AAEO,SAAS,4CACd,SACA,QAC2B;AAC3B,MAAI,QAAQ,WAAW,gCAAgC;AACrD,WAAO,+BAA+B,SAAS,MAAM;AAAA,EACvD;AACA,MAAI,QAAQ,WAAW,iCAAiC;AACtD,WAAO,+BAA+B,SAAS,MAAM;AAAA,EACvD;AACA,MAAI,QAAQ,WAAW,kCAAkC;AACvD,WAAO,gCAAgC,SAAS,MAAM;AAAA,EACxD;AACA,QAAM,IAAI,MAAM,4CAA4C,OAAO,IAAI,EAAE;AAC3E;AAEA,eAAsB,sCACpB,WACoC;AACpC,QAAM,OAAkC,CAAC;AACzC,aAAW,UAAU,WAAW;AAC9B,UAAM,UAAU,SAAS,KAAK,MAAM,MAAM,SAAS,OAAO,MAAM,OAAO,CAAC,CAAC;AACzE,QAAI,CAAC;AACH,YAAM,IAAI,MAAM,YAAY,OAAO,IAAI,wBAAwB;AACjE,SAAK,KAAK,GAAG,4CAA4C,SAAS,MAAM,CAAC;AAAA,EAC3E;AACA,SAAO;AACT;AAEO,SAAS,oCACd,OACyB;AACzB,QAAM,OAAO,MAAM,KAAK,IAAI,YAAY;AACxC,QAAM,mBAAmB,uBAAuB,MAAM,MAAM,gBAAgB;AAC5E,QAAM,QAAQ,MAAM;AAAA,IAClB,IAAI;AAAA,MACF,KAAK,IAAI,CAAC,QAAQ,IAAI
,IAAI,EAAE,OAAO,CAAC,SAAyB,CAAC,CAAC,IAAI;AAAA,IACrE;AAAA,EACF,EAAE,KAAK,yBAAyB;AAChC,QAAM,aAAa,MAAM;AAAA,IACvB,IAAI,IAAI,KAAK,IAAI,CAAC,QAAQ,IAAI,SAAS,CAAC;AAAA,EAC1C,EAAE,KAAK;AACP,QAAM,cAAc,iBAAiB,MAAM,gBAAgB;AAC3D,SAAO;AAAA,IACL,QAAQ;AAAA,IACR,SAAS;AAAA,IACT,aAAa,MAAM,gBAAe,oBAAI,KAAK,GAAE,YAAY;AAAA,IACzD,QAAQ,MAAM,UAAU,EAAE,MAAM,4BAA4B;AAAA,IAC5D;AAAA,IACA;AAAA,IACA;AAAA,IACA,QAAQ;AAAA,MACN,MAAM,KAAK;AAAA,MACX,aAAa,YAAY;AAAA,MACzB,OAAO,MAAM;AAAA,MACb,YAAY,WAAW;AAAA,IACzB;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAEA,eAAsB,6BACpB,OACwC;AACxC,QAAM,WAAW,oCAAoC,KAAK;AAC1D,QAAM,YACJ,MAAM,aACN;AAAA,IACE,kBAAkB;AAAA,IAClB;AAAA,IACA,cAAc,SAAS,WAAW;AAAA,EACpC;AACF,QAAM,MAAM,WAAW,EAAE,WAAW,KAAK,CAAC;AAC1C,QAAM,eAAe,KAAK,WAAW,uBAAuB;AAC5D,QAAM;AAAA,IACJ;AAAA,IACA,GAAG,KAAK,UAAU,UAAU,MAAM,CAAC,CAAC;AAAA;AAAA,IACpC;AAAA,EACF;AACA,SAAO,EAAE,WAAW,cAAc,SAAS;AAC7C;AAEA,eAAsB,0CACpB,OACwC;AACxC,QAAM,OAAO,MAAM,sCAAsC,MAAM,SAAS;AACxE,SAAO,6BAA6B;AAAA,IAClC;AAAA,IACA,WAAW,MAAM;AAAA,IACjB,aAAa,MAAM;AAAA,IACnB,kBAAkB,MAAM;AAAA,IACxB,QAAQ,MAAM,UAAU;AAAA,MACtB,MAAM;AAAA,MACN,WAAW,MAAM,UAAU,IAAI,CAAC,aAAa,SAAS,IAAI;AAAA,IAC5D;AAAA,EACF,CAAC;AACH;","names":[]}
@@ -0,0 +1,37 @@
1
+ export type BenchmarkVsCerebrasBenchmark = "eliza_harness_action_selection" | "clawbench" | "hermes" | "all";
2
/**
 * Options accepted by `runBenchmarkVsCerebras` and
 * `buildBenchmarkVsCerebrasArgs`. Every field is optional.
 */
+ export interface BenchmarkVsCerebrasRunOptions {
3
+ trainingRoot?: string;
4
+ python?: string;
5
// NOTE(review): a single string, presumably a comma-separated tier list or
// "all" as handled by benchmarkVsCerebrasTierList — confirm against the caller.
+ tiers?: string;
6
+ benchmark?: BenchmarkVsCerebrasBenchmark;
7
+ variants?: "trained" | "base" | "both";
8
+ cerebrasModel?: string;
9
+ maxSamples?: number;
10
+ outputDir?: string;
11
+ checkpointsDir?: string;
12
+ trainedModelPath?: string;
13
+ dryRun?: boolean;
14
+ resultsDb?: string;
15
+ datasetVersion?: string;
16
+ codeCommit?: string;
17
+ matrixOutputDir?: string;
18
+ }
19
/**
 * Value resolved by `runBenchmarkVsCerebras`.
 * `matrixOutputDir`, `matrixArtifactPath` and `resultsDb` may be `null`.
 */
+ export interface BenchmarkVsCerebrasRunResult {
20
+ trainingRoot: string;
21
+ outputDir: string;
22
+ matrixOutputDir: string | null;
23
+ matrixArtifactPath: string | null;
24
+ resultsDb: string | null;
25
// NOTE(review): command/stdout/stderr/exitCode presumably describe the child
// process started by the runner (its implementation imports `spawn`) — confirm.
+ command: string[];
26
+ stdout: string;
27
+ stderr: string;
28
+ exitCode: number;
29
+ }
30
+ export declare function benchmarkVsCerebrasTierList(value: string | undefined): string;
31
+ export declare function buildBenchmarkVsCerebrasArgs(options: BenchmarkVsCerebrasRunOptions, resolved: {
32
+ trainingRoot: string;
33
+ outputDir: string;
34
+ matrixOutputDir?: string;
35
+ }): string[];
36
+ export declare function runBenchmarkVsCerebras(options: BenchmarkVsCerebrasRunOptions): Promise<BenchmarkVsCerebrasRunResult>;
37
+ //# sourceMappingURL=benchmark-vs-cerebras-runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"benchmark-vs-cerebras-runner.d.ts","sourceRoot":"","sources":["../../src/core/benchmark-vs-cerebras-runner.ts"],"names":[],"mappings":"AAMA,MAAM,MAAM,4BAA4B,GACpC,gCAAgC,GAChC,WAAW,GACX,QAAQ,GACR,KAAK,CAAC;AAEV,MAAM,WAAW,6BAA6B;IAC5C,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,SAAS,CAAC,EAAE,4BAA4B,CAAC;IACzC,QAAQ,CAAC,EAAE,SAAS,GAAG,MAAM,GAAG,MAAM,CAAC;IACvC,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,eAAe,CAAC,EAAE,MAAM,CAAC;CAC1B;AAED,MAAM,WAAW,4BAA4B;IAC3C,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,OAAO,EAAE,MAAM,EAAE,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;CAClB;AAkDD,wBAAgB,2BAA2B,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS,GAAG,MAAM,CAI7E;AA6BD,wBAAgB,4BAA4B,CAC1C,OAAO,EAAE,6BAA6B,EACtC,QAAQ,EAAE;IACR,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,eAAe,CAAC,EAAE,MAAM,CAAC;CAC1B,GACA,MAAM,EAAE,CAsCV;AAED,wBAAsB,sBAAsB,CAC1C,OAAO,EAAE,6BAA6B,GACrC,OAAO,CAAC,4BAA4B,CAAC,CAsCvC"}
@@ -0,0 +1,151 @@
1
+ import { spawn } from "node:child_process";
2
+ import { mkdir } from "node:fs/promises";
3
+ import { join, resolve } from "node:path";
4
+ import { ELIZA_ONE_BENCHMARK_TIER_LIST } from "./eliza1-benchmark-recipe.js";
5
+ import { trainingStateRoot } from "./training-config.js";
6
/**
 * Make a timestamp string usable as a directory name.
 *
 * Every colon and period is swapped for a hyphen, e.g.
 * "2024-01-02T03:04:05.678Z" becomes "2024-01-02T03-04-05-678Z".
 *
 * @param {string} value - Timestamp text (the caller passes `Date#toISOString()` output).
 * @returns {string} The same text with each ":" and "." replaced by "-".
 */
function safeTimestamp(value) {
  const unsafeSeparators = /[:.]/g;
  const sanitized = value.replace(unsafeSeparators, "-");
  return sanitized;
}
9
/**
 * Accepted tier spellings (lower-case, hyphen-separated) mapped to the
 * canonical training tier key.
 *
 * The table has a null prototype so that a lookup with an arbitrary
 * user-supplied key (for example "constructor") can never resolve to a member
 * inherited from `Object.prototype`; it is frozen because it is shared,
 * read-only configuration.
 */
const TRAINING_TIER_KEYS = Object.freeze(
  Object.assign(Object.create(null), {
    "2b": "gemma4-e2b",
    "4b": "gemma4-e4b",
    "9b": "gemma4-12b",
    "27b": "gemma4-31b",
    "eliza-1-2b": "gemma4-e2b",
    "eliza-1-4b": "gemma4-e4b",
    "eliza-1-9b": "gemma4-12b",
    "eliza-1-27b": "gemma4-31b",
    "gemma4-e2b": "gemma4-e2b",
    "gemma4-e4b": "gemma4-e4b",
    "gemma4-12b": "gemma4-12b",
    "gemma4-31b": "gemma4-31b",
    "gemma-4-e2b": "gemma4-e2b",
    "gemma-4-e4b": "gemma4-e4b",
    "gemma-4-12b": "gemma4-12b",
    "gemma-4-31b": "gemma4-31b",
    "google/gemma-4-e2b": "gemma4-e2b",
    "google/gemma-4-e4b": "gemma4-e4b",
    "google/gemma-4-12b": "gemma4-12b",
    "google/gemma-4-31b": "gemma4-31b",
    "google-gemma-4-e2b": "gemma4-e2b",
    "google-gemma-4-e4b": "gemma4-e4b",
    "google-gemma-4-12b": "gemma4-12b",
    "google-gemma-4-31b": "gemma4-31b"
  })
);

/** Matches the word "qwen", optionally followed by a version such as "3" or "2.5". */
const RETIRED_QWEN_TIER_ALIAS_RE = /\bqwen(?:\d+(?:\.\d+)?)?\b/i;

/**
 * Resolves one user-supplied tier name to its canonical training tier key.
 *
 * The lookup key is the trimmed input, lower-cased, with underscores turned
 * into hyphens. If that key is unknown, a second lookup is attempted with
 * slashes turned into hyphens as well. Names that match neither are returned
 * trimmed but otherwise untouched (original casing preserved).
 *
 * @param {string} value - A single tier name, e.g. " 9B " or "google/gemma_4_e2b".
 * @returns {string} The canonical tier key, or the trimmed input when unknown.
 * @throws {Error} When the name contains a retired Qwen alias.
 */
function normalizeTrainingTierKey(value) {
  const trimmed = value.trim();
  const key = trimmed.toLowerCase().replace(/_/g, "-");
  if (RETIRED_QWEN_TIER_ALIAS_RE.test(key)) {
    throw new Error(
      `Qwen tier aliases are retired; use an active Gemma 4 tier key instead (${ELIZA_ONE_BENCHMARK_TIER_LIST}).`
    );
  }
  return TRAINING_TIER_KEYS[key] ?? TRAINING_TIER_KEYS[key.replace(/\//g, "-")] ?? trimmed;
}
46
/**
 * Produce the value for the benchmark script's `--tiers` flag.
 *
 * A missing or blank `value` falls back to `ELIZA_ONE_BENCHMARK_TIER_LIST`.
 * The word "all" (in any casing) is passed through as "all". Any other input
 * is treated as a comma-separated list: each entry is normalized, entries that
 * normalize to an empty string are discarded, and the rest are comma-joined.
 *
 * @param {string | undefined} value - Raw tier list from the caller.
 * @returns {string} Normalized tier list, or "all".
 * @throws {Error} Propagated from `normalizeTrainingTierKey` for retired aliases.
 */
function benchmarkVsCerebrasTierList(value) {
  // `||` (not `??`) is intentional: an empty string must also use the default.
  const requested = value?.trim() || ELIZA_ONE_BENCHMARK_TIER_LIST;
  if (requested.toLowerCase() === "all") {
    return "all";
  }
  const tiers = [];
  for (const entry of requested.split(",")) {
    const tier = normalizeTrainingTierKey(entry);
    if (tier) {
      tiers.push(tier);
    }
  }
  return tiers.join(",");
}
51
/**
 * Run a command to completion and buffer everything it prints.
 *
 * stdin is not connected; stdout and stderr are decoded as UTF-8 and
 * accumulated in memory until the child's streams close.
 *
 * @param {string} command - Executable to spawn.
 * @param {string[]} args - Arguments for the executable.
 * @param {string} cwd - Working directory for the child process.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number }>}
 *   Resolves on "close" regardless of exit status. `exitCode` is 1 when the
 *   child reports no numeric code. Rejects if the child emits "error".
 */
function collectProcess(command, args, cwd) {
  return new Promise((resolvePromise, reject) => {
    const captured = { stdout: "", stderr: "" };
    const child = spawn(command, args, {
      cwd,
      stdio: ["ignore", "pipe", "pipe"]
    });
    for (const streamName of ["stdout", "stderr"]) {
      const stream = child[streamName];
      stream.setEncoding("utf-8");
      stream.on("data", (chunk) => {
        captured[streamName] += chunk;
      });
    }
    child.on("error", (error) => {
      reject(error);
    });
    child.on("close", (code) => {
      const exitCode = code === null || code === undefined ? 1 : code;
      resolvePromise({
        stdout: captured.stdout,
        stderr: captured.stderr,
        exitCode
      });
    });
  });
}
73
/**
 * Build the argument vector for `scripts/benchmark_vs_cerebras.py`.
 *
 * The interpreter is not included; the first element is the script path under
 * `resolved.trainingRoot`.
 *
 * Defaults applied when an option is null/undefined:
 * benchmark "eliza_harness_action_selection", variants "trained",
 * cerebras model "gpt-oss-120b", max samples 50.
 *
 * `maxSamples` is floored and clamped to at least 1. A value that is not a
 * finite number (NaN, ±Infinity, or a non-number) falls back to 50 so that the
 * script never receives "--max-samples NaN" or "Infinity".
 *
 * @param {object} options - Caller-supplied run options.
 * @param {{ trainingRoot: string, outputDir: string, matrixOutputDir?: string }} resolved
 *   Paths already decided by the caller.
 * @returns {string[]} Arguments to pass to the Python interpreter.
 * @throws {Error} Propagated from tier-list normalization for retired aliases.
 */
function buildBenchmarkVsCerebrasArgs(options, resolved) {
  const scriptPath = join(
    resolved.trainingRoot,
    "scripts",
    "benchmark_vs_cerebras.py"
  );
  // `typeof NaN === "number"`, so finiteness has to be checked explicitly.
  const hasUsableMaxSamples =
    typeof options.maxSamples === "number" && Number.isFinite(options.maxSamples);
  const maxSamples = hasUsableMaxSamples
    ? Math.max(1, Math.floor(options.maxSamples))
    : 50;
  const args = [
    scriptPath,
    "--tiers",
    benchmarkVsCerebrasTierList(options.tiers),
    "--benchmark",
    options.benchmark ?? "eliza_harness_action_selection",
    "--variants",
    options.variants ?? "trained",
    "--cerebras-model",
    options.cerebrasModel ?? "gpt-oss-120b",
    "--max-samples",
    String(maxSamples),
    "--output-dir",
    resolved.outputDir
  ];
  // Optional flags are emitted only when set; the order is kept stable.
  if (options.checkpointsDir) {
    args.push("--checkpoints-dir", options.checkpointsDir);
  }
  if (options.trainedModelPath) {
    args.push("--trained-model-path", options.trainedModelPath);
  }
  if (options.dryRun) {
    args.push("--dry-run");
  }
  if (options.resultsDb) {
    args.push("--results-db", options.resultsDb);
  }
  if (options.datasetVersion) {
    args.push("--dataset-version", options.datasetVersion);
  }
  if (options.codeCommit) {
    args.push("--code-commit", options.codeCommit);
  }
  if (resolved.matrixOutputDir) {
    args.push("--matrix-output-dir", resolved.matrixOutputDir);
  }
  return args;
}
110
/**
 * Run `benchmark_vs_cerebras.py` as a child process and capture its output.
 *
 * Steps: resolve the training root (default `<cwd>/packages/training`), pick
 * the output and matrix directories (timestamped defaults under the training
 * state root), create both, then spawn the interpreter with the training root
 * as its working directory.
 *
 * @param {object} options - Run options; every field is optional.
 * @returns {Promise<object>} The directories used (absolute), the full command
 *   line, captured stdout/stderr and the exit code. `matrixArtifactPath` is the
 *   expected location of `benchmark-matrix.json`; its existence is not checked.
 * @throws {Error} When the script exits with a non-zero code; the message
 *   carries stderr, or stdout when stderr is empty.
 */
async function runBenchmarkVsCerebras(options) {
  const trainingRoot = resolve(
    options.trainingRoot ?? join(process.cwd(), "packages", "training")
  );
  const stamp = safeTimestamp(new Date().toISOString());
  // The child runs with cwd = trainingRoot. A relative directory would be
  // created here relative to *this* process's cwd but interpreted by the
  // script relative to trainingRoot, so both are made absolute up front.
  const outputDir = resolve(
    options.outputDir ?? join(trainingStateRoot(), "benchmarks", "runs", stamp)
  );
  const matrixOutputDir = resolve(
    options.matrixOutputDir ?? join(trainingStateRoot(), "benchmarks", "matrices", stamp)
  );
  await mkdir(outputDir, { recursive: true });
  await mkdir(matrixOutputDir, { recursive: true });
  const args = buildBenchmarkVsCerebrasArgs(options, {
    trainingRoot,
    outputDir,
    matrixOutputDir
  });
  const python = options.python ?? "python3";
  const proc = await collectProcess(python, args, trainingRoot);
  if (proc.exitCode !== 0) {
    throw new Error(
      `benchmark_vs_cerebras.py exited with code ${proc.exitCode}: ${proc.stderr || proc.stdout}`
    );
  }
  return {
    trainingRoot,
    outputDir,
    matrixOutputDir,
    matrixArtifactPath: join(matrixOutputDir, "benchmark-matrix.json"),
    resultsDb: options.resultsDb ?? null,
    command: [python, ...args],
    stdout: proc.stdout,
    stderr: proc.stderr,
    exitCode: proc.exitCode
  };
}
146
+ export {
147
+ benchmarkVsCerebrasTierList,
148
+ buildBenchmarkVsCerebrasArgs,
149
+ runBenchmarkVsCerebras
150
+ };
151
+ //# sourceMappingURL=benchmark-vs-cerebras-runner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/core/benchmark-vs-cerebras-runner.ts"],"sourcesContent":["import { spawn } from \"node:child_process\";\nimport { mkdir } from \"node:fs/promises\";\nimport { join, resolve } from \"node:path\";\nimport { ELIZA_ONE_BENCHMARK_TIER_LIST } from \"./eliza1-benchmark-recipe.js\";\nimport { trainingStateRoot } from \"./training-config.js\";\n\nexport type BenchmarkVsCerebrasBenchmark =\n | \"eliza_harness_action_selection\"\n | \"clawbench\"\n | \"hermes\"\n | \"all\";\n\nexport interface BenchmarkVsCerebrasRunOptions {\n trainingRoot?: string;\n python?: string;\n tiers?: string;\n benchmark?: BenchmarkVsCerebrasBenchmark;\n variants?: \"trained\" | \"base\" | \"both\";\n cerebrasModel?: string;\n maxSamples?: number;\n outputDir?: string;\n checkpointsDir?: string;\n trainedModelPath?: string;\n dryRun?: boolean;\n resultsDb?: string;\n datasetVersion?: string;\n codeCommit?: string;\n matrixOutputDir?: string;\n}\n\nexport interface BenchmarkVsCerebrasRunResult {\n trainingRoot: string;\n outputDir: string;\n matrixOutputDir: string | null;\n matrixArtifactPath: string | null;\n resultsDb: string | null;\n command: string[];\n stdout: string;\n stderr: string;\n exitCode: number;\n}\n\nfunction safeTimestamp(value: string): string {\n return value.replace(/[:.]/g, \"-\");\n}\n\nconst TRAINING_TIER_KEYS: Record<string, string> = {\n \"2b\": \"gemma4-e2b\",\n \"4b\": \"gemma4-e4b\",\n \"9b\": \"gemma4-12b\",\n \"27b\": \"gemma4-31b\",\n \"eliza-1-2b\": \"gemma4-e2b\",\n \"eliza-1-4b\": \"gemma4-e4b\",\n \"eliza-1-9b\": \"gemma4-12b\",\n \"eliza-1-27b\": \"gemma4-31b\",\n \"gemma4-e2b\": \"gemma4-e2b\",\n \"gemma4-e4b\": \"gemma4-e4b\",\n \"gemma4-12b\": \"gemma4-12b\",\n \"gemma4-31b\": \"gemma4-31b\",\n \"gemma-4-e2b\": \"gemma4-e2b\",\n \"gemma-4-e4b\": \"gemma4-e4b\",\n \"gemma-4-12b\": \"gemma4-12b\",\n \"gemma-4-31b\": \"gemma4-31b\",\n \"google/gemma-4-e2b\": \"gemma4-e2b\",\n \"google/gemma-4-e4b\": \"gemma4-e4b\",\n 
\"google/gemma-4-12b\": \"gemma4-12b\",\n \"google/gemma-4-31b\": \"gemma4-31b\",\n \"google-gemma-4-e2b\": \"gemma4-e2b\",\n \"google-gemma-4-e4b\": \"gemma4-e4b\",\n \"google-gemma-4-12b\": \"gemma4-12b\",\n \"google-gemma-4-31b\": \"gemma4-31b\",\n};\n\nconst RETIRED_QWEN_TIER_ALIAS_RE = /\\bqwen(?:\\d+(?:\\.\\d+)?)?\\b/i;\n\nfunction normalizeTrainingTierKey(value: string): string {\n const trimmed = value.trim();\n const key = trimmed.toLowerCase().replace(/_/g, \"-\");\n if (RETIRED_QWEN_TIER_ALIAS_RE.test(key)) {\n throw new Error(\n `Qwen tier aliases are retired; use an active Gemma 4 tier key instead (${ELIZA_ONE_BENCHMARK_TIER_LIST}).`,\n );\n }\n return (\n TRAINING_TIER_KEYS[key] ??\n TRAINING_TIER_KEYS[key.replace(/\\//g, \"-\")] ??\n trimmed\n );\n}\n\nexport function benchmarkVsCerebrasTierList(value: string | undefined): string {\n const raw = value?.trim() || ELIZA_ONE_BENCHMARK_TIER_LIST;\n if (raw.toLowerCase() === \"all\") return \"all\";\n return raw.split(\",\").map(normalizeTrainingTierKey).filter(Boolean).join(\",\");\n}\n\nfunction collectProcess(\n command: string,\n args: string[],\n cwd: string,\n): Promise<{ stdout: string; stderr: string; exitCode: number }> {\n return new Promise((resolvePromise, reject) => {\n const child = spawn(command, args, {\n cwd,\n stdio: [\"ignore\", \"pipe\", \"pipe\"],\n });\n let stdout = \"\";\n let stderr = \"\";\n child.stdout.setEncoding(\"utf-8\");\n child.stderr.setEncoding(\"utf-8\");\n child.stdout.on(\"data\", (chunk) => {\n stdout += chunk;\n });\n child.stderr.on(\"data\", (chunk) => {\n stderr += chunk;\n });\n child.on(\"error\", reject);\n child.on(\"close\", (code) => {\n resolvePromise({ stdout, stderr, exitCode: code ?? 
1 });\n });\n });\n}\n\nexport function buildBenchmarkVsCerebrasArgs(\n options: BenchmarkVsCerebrasRunOptions,\n resolved: {\n trainingRoot: string;\n outputDir: string;\n matrixOutputDir?: string;\n },\n): string[] {\n const scriptPath = join(\n resolved.trainingRoot,\n \"scripts\",\n \"benchmark_vs_cerebras.py\",\n );\n const args = [\n scriptPath,\n \"--tiers\",\n benchmarkVsCerebrasTierList(options.tiers),\n \"--benchmark\",\n options.benchmark ?? \"eliza_harness_action_selection\",\n \"--variants\",\n options.variants ?? \"trained\",\n \"--cerebras-model\",\n options.cerebrasModel ?? \"gpt-oss-120b\",\n \"--max-samples\",\n String(\n typeof options.maxSamples === \"number\"\n ? Math.max(1, Math.floor(options.maxSamples))\n : 50,\n ),\n \"--output-dir\",\n resolved.outputDir,\n ];\n if (options.checkpointsDir)\n args.push(\"--checkpoints-dir\", options.checkpointsDir);\n if (options.trainedModelPath)\n args.push(\"--trained-model-path\", options.trainedModelPath);\n if (options.dryRun) args.push(\"--dry-run\");\n if (options.resultsDb) args.push(\"--results-db\", options.resultsDb);\n if (options.datasetVersion)\n args.push(\"--dataset-version\", options.datasetVersion);\n if (options.codeCommit) args.push(\"--code-commit\", options.codeCommit);\n if (resolved.matrixOutputDir) {\n args.push(\"--matrix-output-dir\", resolved.matrixOutputDir);\n }\n return args;\n}\n\nexport async function runBenchmarkVsCerebras(\n options: BenchmarkVsCerebrasRunOptions,\n): Promise<BenchmarkVsCerebrasRunResult> {\n const trainingRoot = resolve(\n options.trainingRoot ?? join(process.cwd(), \"packages\", \"training\"),\n );\n const stamp = safeTimestamp(new Date().toISOString());\n const outputDir =\n options.outputDir ?? 
join(trainingStateRoot(), \"benchmarks\", \"runs\", stamp);\n const matrixOutputDir =\n options.matrixOutputDir ??\n join(trainingStateRoot(), \"benchmarks\", \"matrices\", stamp);\n await mkdir(outputDir, { recursive: true });\n await mkdir(matrixOutputDir, { recursive: true });\n const args = buildBenchmarkVsCerebrasArgs(options, {\n trainingRoot,\n outputDir,\n matrixOutputDir,\n });\n const proc = await collectProcess(\n options.python ?? \"python3\",\n args,\n trainingRoot,\n );\n if (proc.exitCode !== 0) {\n throw new Error(\n `benchmark_vs_cerebras.py exited with code ${proc.exitCode}: ${proc.stderr || proc.stdout}`,\n );\n }\n return {\n trainingRoot,\n outputDir,\n matrixOutputDir,\n matrixArtifactPath: join(matrixOutputDir, \"benchmark-matrix.json\"),\n resultsDb: options.resultsDb ?? null,\n command: [options.python ?? \"python3\", ...args],\n stdout: proc.stdout,\n stderr: proc.stderr,\n exitCode: proc.exitCode,\n };\n}\n"],"mappings":"AAAA,SAAS,aAAa;AACtB,SAAS,aAAa;AACtB,SAAS,MAAM,eAAe;AAC9B,SAAS,qCAAqC;AAC9C,SAAS,yBAAyB;AAsClC,SAAS,cAAc,OAAuB;AAC5C,SAAO,MAAM,QAAQ,SAAS,GAAG;AACnC;AAEA,MAAM,qBAA6C;AAAA,EACjD,MAAM;AAAA,EACN,MAAM;AAAA,EACN,MAAM;AAAA,EACN,OAAO;AAAA,EACP,cAAc;AAAA,EACd,cAAc;AAAA,EACd,cAAc;AAAA,EACd,eAAe;AAAA,EACf,cAAc;AAAA,EACd,cAAc;AAAA,EACd,cAAc;AAAA,EACd,cAAc;AAAA,EACd,eAAe;AAAA,EACf,eAAe;AAAA,EACf,eAAe;AAAA,EACf,eAAe;AAAA,EACf,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AAAA,EACtB,sBAAsB;AACxB;AAEA,MAAM,6BAA6B;AAEnC,SAAS,yBAAyB,OAAuB;AACvD,QAAM,UAAU,MAAM,KAAK;AAC3B,QAAM,MAAM,QAAQ,YAAY,EAAE,QAAQ,MAAM,GAAG;AACnD,MAAI,2BAA2B,KAAK,GAAG,GAAG;AACxC,UAAM,IAAI;AAAA,MACR,0EAA0E,6BAA6B;AAAA,IACzG;AAAA,EACF;AACA,SACE,mBAAmB,GAAG,KACtB,mBAAmB,IAAI,QAAQ,OAAO,GAAG,CAAC,KAC1C;AAEJ;AAEO,SAAS,4BAA4B,OAAmC;AAC7E,QAAM,MAAM,OAAO,KAAK,KAAK;AAC7B,MAAI,IAAI,YAAY,MAAM,MAAO,QAAO;AACxC,SAAO,IAAI,MAAM,GAAG,EAAE,IAAI,wBAAwB,EAAE,OAAO,OAAO,EAAE,KAAK,GAAG;AAC9E;AAEA,SAAS,eACP,SACA,MACA,KAC+D;AAC/D
,SAAO,IAAI,QAAQ,CAAC,gBAAgB,WAAW;AAC7C,UAAM,QAAQ,MAAM,SAAS,MAAM;AAAA,MACjC;AAAA,MACA,OAAO,CAAC,UAAU,QAAQ,MAAM;AAAA,IAClC,CAAC;AACD,QAAI,SAAS;AACb,QAAI,SAAS;AACb,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,GAAG,QAAQ,CAAC,UAAU;AACjC,gBAAU;AAAA,IACZ,CAAC;AACD,UAAM,OAAO,GAAG,QAAQ,CAAC,UAAU;AACjC,gBAAU;AAAA,IACZ,CAAC;AACD,UAAM,GAAG,SAAS,MAAM;AACxB,UAAM,GAAG,SAAS,CAAC,SAAS;AAC1B,qBAAe,EAAE,QAAQ,QAAQ,UAAU,QAAQ,EAAE,CAAC;AAAA,IACxD,CAAC;AAAA,EACH,CAAC;AACH;AAEO,SAAS,6BACd,SACA,UAKU;AACV,QAAM,aAAa;AAAA,IACjB,SAAS;AAAA,IACT;AAAA,IACA;AAAA,EACF;AACA,QAAM,OAAO;AAAA,IACX;AAAA,IACA;AAAA,IACA,4BAA4B,QAAQ,KAAK;AAAA,IACzC;AAAA,IACA,QAAQ,aAAa;AAAA,IACrB;AAAA,IACA,QAAQ,YAAY;AAAA,IACpB;AAAA,IACA,QAAQ,iBAAiB;AAAA,IACzB;AAAA,IACA;AAAA,MACE,OAAO,QAAQ,eAAe,WAC1B,KAAK,IAAI,GAAG,KAAK,MAAM,QAAQ,UAAU,CAAC,IAC1C;AAAA,IACN;AAAA,IACA;AAAA,IACA,SAAS;AAAA,EACX;AACA,MAAI,QAAQ;AACV,SAAK,KAAK,qBAAqB,QAAQ,cAAc;AACvD,MAAI,QAAQ;AACV,SAAK,KAAK,wBAAwB,QAAQ,gBAAgB;AAC5D,MAAI,QAAQ,OAAQ,MAAK,KAAK,WAAW;AACzC,MAAI,QAAQ,UAAW,MAAK,KAAK,gBAAgB,QAAQ,SAAS;AAClE,MAAI,QAAQ;AACV,SAAK,KAAK,qBAAqB,QAAQ,cAAc;AACvD,MAAI,QAAQ,WAAY,MAAK,KAAK,iBAAiB,QAAQ,UAAU;AACrE,MAAI,SAAS,iBAAiB;AAC5B,SAAK,KAAK,uBAAuB,SAAS,eAAe;AAAA,EAC3D;AACA,SAAO;AACT;AAEA,eAAsB,uBACpB,SACuC;AACvC,QAAM,eAAe;AAAA,IACnB,QAAQ,gBAAgB,KAAK,QAAQ,IAAI,GAAG,YAAY,UAAU;AAAA,EACpE;AACA,QAAM,QAAQ,eAAc,oBAAI,KAAK,GAAE,YAAY,CAAC;AACpD,QAAM,YACJ,QAAQ,aAAa,KAAK,kBAAkB,GAAG,cAAc,QAAQ,KAAK;AAC5E,QAAM,kBACJ,QAAQ,mBACR,KAAK,kBAAkB,GAAG,cAAc,YAAY,KAAK;AAC3D,QAAM,MAAM,WAAW,EAAE,WAAW,KAAK,CAAC;AAC1C,QAAM,MAAM,iBAAiB,EAAE,WAAW,KAAK,CAAC;AAChD,QAAM,OAAO,6BAA6B,SAAS;AAAA,IACjD;AAAA,IACA;AAAA,IACA;AAAA,EACF,CAAC;AACD,QAAM,OAAO,MAAM;AAAA,IACjB,QAAQ,UAAU;AAAA,IAClB;AAAA,IACA;AAAA,EACF;AACA,MAAI,KAAK,aAAa,GAAG;AACvB,UAAM,IAAI;AAAA,MACR,6CAA6C,KAAK,QAAQ,KAAK,KAAK,UAAU,KAAK,MAAM;AAAA,IAC3F;AAAA,EACF;AACA,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA,oBAAoB,KAAK,iBAAiB,uBAAuB;AAAA,IACjE,WAAW,QAAQ,aAAa;AAAA,IAChC,SAAS,CAAC,QAAQ,UAAU,WAAW,GAAG,IAAI;AAAA,IAC9C,QAAQ,KAAK;AA
AA,IACb,QAAQ,KAAK;AAAA,IACb,UAAU,KAAK;AAAA,EACjB;AACF;","names":[]}