@elizaos/plugin-training 2.0.3-beta.5 → 2.0.3-beta.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (363) hide show
  1. package/dist/backends/native.d.ts +96 -0
  2. package/dist/backends/native.d.ts.map +1 -0
  3. package/dist/backends/native.js +308 -0
  4. package/dist/backends/native.js.map +1 -0
  5. package/dist/cli/train.d.ts +22 -0
  6. package/dist/cli/train.d.ts.map +1 -0
  7. package/dist/cli/train.js +219 -0
  8. package/dist/cli/train.js.map +1 -0
  9. package/dist/core/action-benchmark-runner.d.ts +55 -0
  10. package/dist/core/action-benchmark-runner.d.ts.map +1 -0
  11. package/dist/core/action-benchmark-runner.js +341 -0
  12. package/dist/core/action-benchmark-runner.js.map +1 -0
  13. package/dist/core/artifact-store.d.ts +72 -0
  14. package/dist/core/artifact-store.d.ts.map +1 -0
  15. package/dist/core/artifact-store.js +50 -0
  16. package/dist/core/artifact-store.js.map +1 -0
  17. package/dist/core/benchmark-matrix-artifact.d.ts +102 -0
  18. package/dist/core/benchmark-matrix-artifact.d.ts.map +1 -0
  19. package/dist/core/benchmark-matrix-artifact.js +381 -0
  20. package/dist/core/benchmark-matrix-artifact.js.map +1 -0
  21. package/dist/core/benchmark-vs-cerebras-runner.d.ts +37 -0
  22. package/dist/core/benchmark-vs-cerebras-runner.d.ts.map +1 -0
  23. package/dist/core/benchmark-vs-cerebras-runner.js +151 -0
  24. package/dist/core/benchmark-vs-cerebras-runner.js.map +1 -0
  25. package/dist/core/cerebras-eval-model.d.ts +54 -0
  26. package/dist/core/cerebras-eval-model.d.ts.map +1 -0
  27. package/dist/core/cerebras-eval-model.js +249 -0
  28. package/dist/core/cerebras-eval-model.js.map +1 -0
  29. package/dist/core/cli.d.ts +15 -0
  30. package/dist/core/cli.d.ts.map +1 -0
  31. package/dist/core/cli.js +1003 -0
  32. package/dist/core/cli.js.map +1 -0
  33. package/dist/core/context-audit.d.ts +51 -0
  34. package/dist/core/context-audit.d.ts.map +1 -0
  35. package/dist/core/context-audit.js +166 -0
  36. package/dist/core/context-audit.js.map +1 -0
  37. package/dist/core/context-catalog.d.ts +47 -0
  38. package/dist/core/context-catalog.d.ts.map +1 -0
  39. package/dist/core/context-catalog.js +269 -0
  40. package/dist/core/context-catalog.js.map +1 -0
  41. package/dist/core/context-types.d.ts +3 -0
  42. package/dist/core/context-types.d.ts.map +1 -0
  43. package/dist/core/context-types.js +18 -0
  44. package/dist/core/context-types.js.map +1 -0
  45. package/dist/core/dataset-generator.d.ts +135 -0
  46. package/dist/core/dataset-generator.d.ts.map +1 -0
  47. package/dist/core/dataset-generator.js +895 -0
  48. package/dist/core/dataset-generator.js.map +1 -0
  49. package/dist/core/eliza1-benchmark-recipe.d.ts +18 -0
  50. package/dist/core/eliza1-benchmark-recipe.d.ts.map +1 -0
  51. package/dist/core/eliza1-benchmark-recipe.js +64 -0
  52. package/dist/core/eliza1-benchmark-recipe.js.map +1 -0
  53. package/dist/core/eliza1-bundle-stager.d.ts +57 -0
  54. package/dist/core/eliza1-bundle-stager.d.ts.map +1 -0
  55. package/dist/core/eliza1-bundle-stager.js +149 -0
  56. package/dist/core/eliza1-bundle-stager.js.map +1 -0
  57. package/dist/core/ensure-cron-job.d.ts +53 -0
  58. package/dist/core/ensure-cron-job.d.ts.map +1 -0
  59. package/dist/core/ensure-cron-job.js +51 -0
  60. package/dist/core/ensure-cron-job.js.map +1 -0
  61. package/dist/core/eval-comparison-artifact.d.ts +72 -0
  62. package/dist/core/eval-comparison-artifact.d.ts.map +1 -0
  63. package/dist/core/eval-comparison-artifact.js +281 -0
  64. package/dist/core/eval-comparison-artifact.js.map +1 -0
  65. package/dist/core/feed-generation-runner.d.ts +37 -0
  66. package/dist/core/feed-generation-runner.d.ts.map +1 -0
  67. package/dist/core/feed-generation-runner.js +232 -0
  68. package/dist/core/feed-generation-runner.js.map +1 -0
  69. package/dist/core/html-escape.d.ts +5 -0
  70. package/dist/core/html-escape.d.ts.map +1 -0
  71. package/dist/core/html-escape.js +11 -0
  72. package/dist/core/html-escape.js.map +1 -0
  73. package/dist/core/huggingface-dataset-ingest.d.ts +52 -0
  74. package/dist/core/huggingface-dataset-ingest.d.ts.map +1 -0
  75. package/dist/core/huggingface-dataset-ingest.js +134 -0
  76. package/dist/core/huggingface-dataset-ingest.js.map +1 -0
  77. package/dist/core/index.d.ts +29 -0
  78. package/dist/core/index.d.ts.map +1 -0
  79. package/dist/core/index.js +204 -0
  80. package/dist/core/index.js.map +1 -0
  81. package/dist/core/privacy-filter.d.ts +95 -0
  82. package/dist/core/privacy-filter.d.ts.map +1 -0
  83. package/dist/core/privacy-filter.js +324 -0
  84. package/dist/core/privacy-filter.js.map +1 -0
  85. package/dist/core/promotion-gate.d.ts +117 -0
  86. package/dist/core/promotion-gate.d.ts.map +1 -0
  87. package/dist/core/promotion-gate.js +85 -0
  88. package/dist/core/promotion-gate.js.map +1 -0
  89. package/dist/core/promotion-persist.d.ts +116 -0
  90. package/dist/core/promotion-persist.d.ts.map +1 -0
  91. package/dist/core/promotion-persist.js +93 -0
  92. package/dist/core/promotion-persist.js.map +1 -0
  93. package/dist/core/prompt-compare.d.ts +99 -0
  94. package/dist/core/prompt-compare.d.ts.map +1 -0
  95. package/dist/core/prompt-compare.js +210 -0
  96. package/dist/core/prompt-compare.js.map +1 -0
  97. package/dist/core/replay-validator.d.ts +136 -0
  98. package/dist/core/replay-validator.d.ts.map +1 -0
  99. package/dist/core/replay-validator.js +312 -0
  100. package/dist/core/replay-validator.js.map +1 -0
  101. package/dist/core/roleplay-executor.d.ts +123 -0
  102. package/dist/core/roleplay-executor.d.ts.map +1 -0
  103. package/dist/core/roleplay-executor.js +675 -0
  104. package/dist/core/roleplay-executor.js.map +1 -0
  105. package/dist/core/roleplay-trajectories.d.ts +54 -0
  106. package/dist/core/roleplay-trajectories.d.ts.map +1 -0
  107. package/dist/core/roleplay-trajectories.js +88 -0
  108. package/dist/core/roleplay-trajectories.js.map +1 -0
  109. package/dist/core/scenario-blueprints.d.ts +62 -0
  110. package/dist/core/scenario-blueprints.d.ts.map +1 -0
  111. package/dist/core/scenario-blueprints.js +850 -0
  112. package/dist/core/scenario-blueprints.js.map +1 -0
  113. package/dist/core/scenario-runner.d.ts +36 -0
  114. package/dist/core/scenario-runner.d.ts.map +1 -0
  115. package/dist/core/scenario-runner.js +216 -0
  116. package/dist/core/scenario-runner.js.map +1 -0
  117. package/dist/core/skill-scoring-cron.d.ts +57 -0
  118. package/dist/core/skill-scoring-cron.d.ts.map +1 -0
  119. package/dist/core/skill-scoring-cron.js +180 -0
  120. package/dist/core/skill-scoring-cron.js.map +1 -0
  121. package/dist/core/test-trajectory-collector.d.ts +37 -0
  122. package/dist/core/test-trajectory-collector.d.ts.map +1 -0
  123. package/dist/core/test-trajectory-collector.js +225 -0
  124. package/dist/core/test-trajectory-collector.js.map +1 -0
  125. package/dist/core/track-c-queue-task.d.ts +37 -0
  126. package/dist/core/track-c-queue-task.d.ts.map +1 -0
  127. package/dist/core/track-c-queue-task.js +104 -0
  128. package/dist/core/track-c-queue-task.js.map +1 -0
  129. package/dist/core/training-analysis-index.d.ts +104 -0
  130. package/dist/core/training-analysis-index.d.ts.map +1 -0
  131. package/dist/core/training-analysis-index.js +3297 -0
  132. package/dist/core/training-analysis-index.js.map +1 -0
  133. package/dist/core/training-collection-runner.d.ts +508 -0
  134. package/dist/core/training-collection-runner.d.ts.map +1 -0
  135. package/dist/core/training-collection-runner.js +2299 -0
  136. package/dist/core/training-collection-runner.js.map +1 -0
  137. package/dist/core/training-config.d.ts +52 -0
  138. package/dist/core/training-config.d.ts.map +1 -0
  139. package/dist/core/training-config.js +117 -0
  140. package/dist/core/training-config.js.map +1 -0
  141. package/dist/core/training-orchestrator.d.ts +112 -0
  142. package/dist/core/training-orchestrator.d.ts.map +1 -0
  143. package/dist/core/training-orchestrator.js +729 -0
  144. package/dist/core/training-orchestrator.js.map +1 -0
  145. package/dist/core/training-readiness-report.d.ts +52 -0
  146. package/dist/core/training-readiness-report.d.ts.map +1 -0
  147. package/dist/core/training-readiness-report.js +765 -0
  148. package/dist/core/training-readiness-report.js.map +1 -0
  149. package/dist/core/trajectory-consumer.d.ts +15 -0
  150. package/dist/core/trajectory-consumer.d.ts.map +1 -0
  151. package/dist/core/trajectory-consumer.js +61 -0
  152. package/dist/core/trajectory-consumer.js.map +1 -0
  153. package/dist/core/trajectory-export-bundle.d.ts +95 -0
  154. package/dist/core/trajectory-export-bundle.d.ts.map +1 -0
  155. package/dist/core/trajectory-export-bundle.js +561 -0
  156. package/dist/core/trajectory-export-bundle.js.map +1 -0
  157. package/dist/core/trajectory-export-cron.d.ts +57 -0
  158. package/dist/core/trajectory-export-cron.d.ts.map +1 -0
  159. package/dist/core/trajectory-export-cron.js +170 -0
  160. package/dist/core/trajectory-export-cron.js.map +1 -0
  161. package/dist/core/trajectory-hf-upload.d.ts +50 -0
  162. package/dist/core/trajectory-hf-upload.d.ts.map +1 -0
  163. package/dist/core/trajectory-hf-upload.js +111 -0
  164. package/dist/core/trajectory-hf-upload.js.map +1 -0
  165. package/dist/core/trajectory-task-datasets.d.ts +62 -0
  166. package/dist/core/trajectory-task-datasets.d.ts.map +1 -0
  167. package/dist/core/trajectory-task-datasets.js +427 -0
  168. package/dist/core/trajectory-task-datasets.js.map +1 -0
  169. package/dist/core/wait-for-service.d.ts +25 -0
  170. package/dist/core/wait-for-service.d.ts.map +1 -0
  171. package/dist/core/wait-for-service.js +19 -0
  172. package/dist/core/wait-for-service.js.map +1 -0
  173. package/dist/core/workspace-runtime.d.ts +4 -0
  174. package/dist/core/workspace-runtime.d.ts.map +1 -0
  175. package/dist/core/workspace-runtime.js +25 -0
  176. package/dist/core/workspace-runtime.js.map +1 -0
  177. package/dist/dspy/artifact.d.ts +54 -0
  178. package/dist/dspy/artifact.d.ts.map +1 -0
  179. package/dist/dspy/artifact.js +61 -0
  180. package/dist/dspy/artifact.js.map +1 -0
  181. package/dist/dspy/chain-of-thought.d.ts +27 -0
  182. package/dist/dspy/chain-of-thought.d.ts.map +1 -0
  183. package/dist/dspy/chain-of-thought.js +43 -0
  184. package/dist/dspy/chain-of-thought.js.map +1 -0
  185. package/dist/dspy/examples.d.ts +72 -0
  186. package/dist/dspy/examples.d.ts.map +1 -0
  187. package/dist/dspy/examples.js +105 -0
  188. package/dist/dspy/examples.js.map +1 -0
  189. package/dist/dspy/index.d.ts +15 -0
  190. package/dist/dspy/index.d.ts.map +1 -0
  191. package/dist/dspy/index.js +40 -0
  192. package/dist/dspy/index.js.map +1 -0
  193. package/dist/dspy/lm-adapter.d.ts +100 -0
  194. package/dist/dspy/lm-adapter.d.ts.map +1 -0
  195. package/dist/dspy/lm-adapter.js +81 -0
  196. package/dist/dspy/lm-adapter.js.map +1 -0
  197. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.d.ts +23 -0
  198. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.d.ts.map +1 -0
  199. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.js +85 -0
  200. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.js.map +1 -0
  201. package/dist/dspy/optimizers/dspy-copro.d.ts +29 -0
  202. package/dist/dspy/optimizers/dspy-copro.d.ts.map +1 -0
  203. package/dist/dspy/optimizers/dspy-copro.js +141 -0
  204. package/dist/dspy/optimizers/dspy-copro.js.map +1 -0
  205. package/dist/dspy/optimizers/dspy-mipro.d.ts +37 -0
  206. package/dist/dspy/optimizers/dspy-mipro.d.ts.map +1 -0
  207. package/dist/dspy/optimizers/dspy-mipro.js +194 -0
  208. package/dist/dspy/optimizers/dspy-mipro.js.map +1 -0
  209. package/dist/dspy/optimizers/index.d.ts +5 -0
  210. package/dist/dspy/optimizers/index.d.ts.map +1 -0
  211. package/dist/dspy/optimizers/index.js +11 -0
  212. package/dist/dspy/optimizers/index.js.map +1 -0
  213. package/dist/dspy/optimizers/types.d.ts +39 -0
  214. package/dist/dspy/optimizers/types.d.ts.map +1 -0
  215. package/dist/dspy/optimizers/types.js +1 -0
  216. package/dist/dspy/optimizers/types.js.map +1 -0
  217. package/dist/dspy/predict.d.ts +49 -0
  218. package/dist/dspy/predict.d.ts.map +1 -0
  219. package/dist/dspy/predict.js +73 -0
  220. package/dist/dspy/predict.js.map +1 -0
  221. package/dist/dspy/signature.d.ts +88 -0
  222. package/dist/dspy/signature.d.ts.map +1 -0
  223. package/dist/dspy/signature.js +205 -0
  224. package/dist/dspy/signature.js.map +1 -0
  225. package/dist/index.d.ts +15 -0
  226. package/dist/index.d.ts.map +1 -0
  227. package/dist/index.js +15 -0
  228. package/dist/index.js.map +1 -0
  229. package/dist/optimizers/bootstrap-fewshot.d.ts +42 -0
  230. package/dist/optimizers/bootstrap-fewshot.d.ts.map +1 -0
  231. package/dist/optimizers/bootstrap-fewshot.js +92 -0
  232. package/dist/optimizers/bootstrap-fewshot.js.map +1 -0
  233. package/dist/optimizers/gepa.d.ts +63 -0
  234. package/dist/optimizers/gepa.d.ts.map +1 -0
  235. package/dist/optimizers/gepa.js +232 -0
  236. package/dist/optimizers/gepa.js.map +1 -0
  237. package/dist/optimizers/index.d.ts +7 -0
  238. package/dist/optimizers/index.d.ts.map +1 -0
  239. package/dist/optimizers/index.js +51 -0
  240. package/dist/optimizers/index.js.map +1 -0
  241. package/dist/optimizers/instruction-search.d.ts +39 -0
  242. package/dist/optimizers/instruction-search.d.ts.map +1 -0
  243. package/dist/optimizers/instruction-search.js +108 -0
  244. package/dist/optimizers/instruction-search.js.map +1 -0
  245. package/dist/optimizers/prompt-evolution.d.ts +39 -0
  246. package/dist/optimizers/prompt-evolution.d.ts.map +1 -0
  247. package/dist/optimizers/prompt-evolution.js +101 -0
  248. package/dist/optimizers/prompt-evolution.js.map +1 -0
  249. package/dist/optimizers/scoring.d.ts +139 -0
  250. package/dist/optimizers/scoring.d.ts.map +1 -0
  251. package/dist/optimizers/scoring.js +299 -0
  252. package/dist/optimizers/scoring.js.map +1 -0
  253. package/dist/optimizers/types.d.ts +105 -0
  254. package/dist/optimizers/types.d.ts.map +1 -0
  255. package/dist/optimizers/types.js +1 -0
  256. package/dist/optimizers/types.js.map +1 -0
  257. package/dist/register-runtime.d.ts +3 -0
  258. package/dist/register-runtime.d.ts.map +1 -0
  259. package/dist/register-runtime.js +60 -0
  260. package/dist/register-runtime.js.map +1 -0
  261. package/dist/register-terminal-view.d.ts +15 -0
  262. package/dist/register-terminal-view.d.ts.map +1 -0
  263. package/dist/register-terminal-view.js +31 -0
  264. package/dist/register-terminal-view.js.map +1 -0
  265. package/dist/routes/experience-routes.d.ts +21 -0
  266. package/dist/routes/experience-routes.d.ts.map +1 -0
  267. package/dist/routes/experience-routes.js +513 -0
  268. package/dist/routes/experience-routes.js.map +1 -0
  269. package/dist/routes/index.d.ts +5 -0
  270. package/dist/routes/index.d.ts.map +1 -0
  271. package/dist/routes/index.js +17 -0
  272. package/dist/routes/index.js.map +1 -0
  273. package/dist/routes/training-routes.d.ts +10 -0
  274. package/dist/routes/training-routes.d.ts.map +1 -0
  275. package/dist/routes/training-routes.js +1239 -0
  276. package/dist/routes/training-routes.js.map +1 -0
  277. package/dist/routes/training-vast-routes.d.ts +35 -0
  278. package/dist/routes/training-vast-routes.d.ts.map +1 -0
  279. package/dist/routes/training-vast-routes.js +249 -0
  280. package/dist/routes/training-vast-routes.js.map +1 -0
  281. package/dist/routes/trajectory-routes.d.ts +19 -0
  282. package/dist/routes/trajectory-routes.d.ts.map +1 -0
  283. package/dist/routes/trajectory-routes.js +1122 -0
  284. package/dist/routes/trajectory-routes.js.map +1 -0
  285. package/dist/services/index.d.ts +9 -0
  286. package/dist/services/index.d.ts.map +1 -0
  287. package/dist/services/index.js +63 -0
  288. package/dist/services/index.js.map +1 -0
  289. package/dist/services/training-backend-check.d.ts +8 -0
  290. package/dist/services/training-backend-check.d.ts.map +1 -0
  291. package/dist/services/training-backend-check.js +31 -0
  292. package/dist/services/training-backend-check.js.map +1 -0
  293. package/dist/services/training-service-like.d.ts +40 -0
  294. package/dist/services/training-service-like.d.ts.map +1 -0
  295. package/dist/services/training-service-like.js +1 -0
  296. package/dist/services/training-service-like.js.map +1 -0
  297. package/dist/services/training-service-registry.d.ts +4 -0
  298. package/dist/services/training-service-registry.d.ts.map +1 -0
  299. package/dist/services/training-service-registry.js +12 -0
  300. package/dist/services/training-service-registry.js.map +1 -0
  301. package/dist/services/training-service.d.ts +59 -0
  302. package/dist/services/training-service.d.ts.map +1 -0
  303. package/dist/services/training-service.js +154 -0
  304. package/dist/services/training-service.js.map +1 -0
  305. package/dist/services/training-trigger.d.ts +177 -0
  306. package/dist/services/training-trigger.d.ts.map +1 -0
  307. package/dist/services/training-trigger.js +300 -0
  308. package/dist/services/training-trigger.js.map +1 -0
  309. package/dist/services/training-vast-service.d.ts +149 -0
  310. package/dist/services/training-vast-service.d.ts.map +1 -0
  311. package/dist/services/training-vast-service.js +648 -0
  312. package/dist/services/training-vast-service.js.map +1 -0
  313. package/dist/services/vast-inference-stats.d.ts +37 -0
  314. package/dist/services/vast-inference-stats.d.ts.map +1 -0
  315. package/dist/services/vast-inference-stats.js +81 -0
  316. package/dist/services/vast-inference-stats.js.map +1 -0
  317. package/dist/services/vast-job-store.d.ts +74 -0
  318. package/dist/services/vast-job-store.d.ts.map +1 -0
  319. package/dist/services/vast-job-store.js +194 -0
  320. package/dist/services/vast-job-store.js.map +1 -0
  321. package/dist/services/vast-subprocess.d.ts +27 -0
  322. package/dist/services/vast-subprocess.d.ts.map +1 -0
  323. package/dist/services/vast-subprocess.js +78 -0
  324. package/dist/services/vast-subprocess.js.map +1 -0
  325. package/dist/setup-routes.d.ts +17 -0
  326. package/dist/setup-routes.d.ts.map +1 -0
  327. package/dist/setup-routes.js +319 -0
  328. package/dist/setup-routes.js.map +1 -0
  329. package/dist/ui/FineTuningSpatialView.d.ts +49 -0
  330. package/dist/ui/FineTuningSpatialView.d.ts.map +1 -0
  331. package/dist/ui/FineTuningSpatialView.js +154 -0
  332. package/dist/ui/FineTuningSpatialView.js.map +1 -0
  333. package/dist/ui/FineTuningView.d.ts +7 -0
  334. package/dist/ui/FineTuningView.d.ts.map +1 -0
  335. package/dist/ui/FineTuningView.helpers.d.ts +17 -0
  336. package/dist/ui/FineTuningView.helpers.d.ts.map +1 -0
  337. package/dist/ui/FineTuningView.helpers.js +30 -0
  338. package/dist/ui/FineTuningView.helpers.js.map +1 -0
  339. package/dist/ui/FineTuningView.interact.d.ts +2 -0
  340. package/dist/ui/FineTuningView.interact.d.ts.map +1 -0
  341. package/dist/ui/FineTuningView.interact.js +300 -0
  342. package/dist/ui/FineTuningView.interact.js.map +1 -0
  343. package/dist/ui/FineTuningView.js +4653 -0
  344. package/dist/ui/FineTuningView.js.map +1 -0
  345. package/dist/ui/fine-tuning-panels.d.ts +100 -0
  346. package/dist/ui/fine-tuning-panels.d.ts.map +1 -0
  347. package/dist/ui/fine-tuning-panels.helpers.d.ts +19 -0
  348. package/dist/ui/fine-tuning-panels.helpers.d.ts.map +1 -0
  349. package/dist/ui/fine-tuning-panels.helpers.js +77 -0
  350. package/dist/ui/fine-tuning-panels.helpers.js.map +1 -0
  351. package/dist/ui/fine-tuning-panels.js +928 -0
  352. package/dist/ui/fine-tuning-panels.js.map +1 -0
  353. package/dist/ui/index.d.ts +5 -0
  354. package/dist/ui/index.d.ts.map +1 -0
  355. package/dist/ui/index.js +5 -0
  356. package/dist/ui/index.js.map +1 -0
  357. package/dist/ui/training-view-bundle.d.ts +3 -0
  358. package/dist/ui/training-view-bundle.d.ts.map +1 -0
  359. package/dist/ui/training-view-bundle.js +7 -0
  360. package/dist/ui/training-view-bundle.js.map +1 -0
  361. package/dist/views/bundle.js +5312 -0
  362. package/dist/views/bundle.js.map +1 -0
  363. package/package.json +7 -7
@@ -0,0 +1,2299 @@
1
+ import { existsSync } from "node:fs";
2
+ import { mkdir, readdir, readFile, stat, writeFile } from "node:fs/promises";
3
+ import { basename, dirname, join, resolve } from "node:path";
4
+ import {
5
+ runActionBenchmark
6
+ } from "./action-benchmark-runner.js";
7
+ import {
8
+ writeBenchmarkMatrixArtifactFromArtifacts
9
+ } from "./benchmark-matrix-artifact.js";
10
+ import {
11
+ runBenchmarkVsCerebras
12
+ } from "./benchmark-vs-cerebras-runner.js";
13
+ import {
14
+ canonicalElizaOneTierSort,
15
+ ELIZA_ONE_BENCHMARK_TIERS,
16
+ elizaOneActionBenchmarkPairs,
17
+ elizaOneBenchmarkModelId,
18
+ parseElizaOneBenchmarkTiers
19
+ } from "./eliza1-benchmark-recipe.js";
20
+ import {
21
+ stageEliza1Bundle
22
+ } from "./eliza1-bundle-stager.js";
23
+ import {
24
+ EVAL_COMPARISON_ARTIFACT_SCHEMA,
25
+ runLocalEvalComparison
26
+ } from "./eval-comparison-artifact.js";
27
+ import {
28
+ runFeedGeneration
29
+ } from "./feed-generation-runner.js";
30
+ import {
31
+ ingestHuggingFaceDataset
32
+ } from "./huggingface-dataset-ingest.js";
33
+ import {
34
+ runScenarios
35
+ } from "./scenario-runner.js";
36
+ import {
37
+ collectTestTrajectories
38
+ } from "./test-trajectory-collector.js";
39
+ import {
40
+ buildTrainingAnalysisIndex
41
+ } from "./training-analysis-index.js";
42
+ import { trainingStateRoot } from "./training-config.js";
43
+ import {
44
+ writeTrainingReadinessReport
45
+ } from "./training-readiness-report.js";
46
+ import {
47
+ buildTrajectoryExportBundle
48
+ } from "./trajectory-export-bundle.js";
49
+ import { discoverWorkspaceRoot } from "./workspace-runtime.js";
50
+ const TRAINING_COLLECTION_RUN_SCHEMA = "eliza_training_collection_run";
51
+ const TRAINING_COLLECTION_RUN_VERSION = 1;
52
+ const TRAINING_COLLECTION_INDEX_SCHEMA = "eliza_training_collection_index";
53
+ const TRAINING_COLLECTION_INDEX_VERSION = 1;
54
+ const DEFAULT_ACTION_BENCHMARK_PAIR_TIER = "2b";
55
/**
 * Returns `step.result` when it is a non-array object; otherwise returns a
 * fresh empty object so callers can read properties without extra guards.
 *
 * @param {{ result?: unknown }} step - Collection step whose result is inspected.
 * @returns {object} The step's result object, or `{}`.
 */
function resultRecord(step) {
  const candidate = step.result;
  if (!candidate || typeof candidate !== "object" || Array.isArray(candidate)) {
    return {};
  }
  return candidate;
}
58
/**
 * Builds the list of benchmark-matrix source descriptors for a run.
 *
 * Starts from the `explicit` sources, then appends sources reported by steps
 * whose status is "succeeded":
 *  - `eval_comparison`: `{ path: result.artifactPath }` when it is a string;
 *  - `benchmark_vs_cerebras`: `{ path: result.matrixArtifactPath }` when it is
 *    a string and the file exists on disk;
 *  - `action_benchmark`: every entry of `result.matrixSources`, then
 *    `result.matrixSource`, when the entry is an object whose string `path`
 *    exists on disk and is not already collected.
 * The final list keeps only the first entry seen for each `path`.
 *
 * @param {Array<object>} steps - Collection steps (`id`, `status`, `result`).
 * @param {Array<object>} [explicit=[]] - Caller-supplied sources, kept first.
 * @returns {Array<object>} Source descriptors, de-duplicated by `path`.
 */
function autoBenchmarkMatrixSources(steps, explicit = []) {
  const collected = [...explicit];

  // True when `candidate` is an object descriptor with an on-disk path that
  // has not been collected yet.
  const isNewMatrixSource = (candidate) =>
    Boolean(candidate) &&
    typeof candidate === "object" &&
    !Array.isArray(candidate) &&
    typeof candidate.path === "string" &&
    existsSync(candidate.path) &&
    !collected.some((entry) => entry.path === candidate.path);

  for (const step of steps) {
    if (step.status !== "succeeded") continue;
    const result = resultRecord(step);
    switch (step.id) {
      case "eval_comparison": {
        if (typeof result.artifactPath === "string") {
          collected.push({ path: result.artifactPath });
        }
        break;
      }
      case "benchmark_vs_cerebras": {
        const artifact = result.matrixArtifactPath;
        if (typeof artifact === "string" && existsSync(artifact)) {
          collected.push({ path: artifact });
        }
        break;
      }
      case "action_benchmark": {
        if (Array.isArray(result.matrixSources)) {
          for (const candidate of result.matrixSources) {
            if (isNewMatrixSource(candidate)) {
              collected.push(candidate);
            }
          }
        }
        if (isNewMatrixSource(result.matrixSource)) {
          collected.push(result.matrixSource);
        }
        break;
      }
      default:
        break;
    }
  }

  // Keep only the first descriptor for each path.
  return collected.filter(
    (entry, position, all) =>
      all.findIndex((other) => other.path === entry.path) === position
  );
}
92
/**
 * Converts a string into a path-safe fragment: every run of characters
 * outside `a-z`, `A-Z`, `0-9`, `.`, `_`, `-` becomes a single `-`, and
 * leading/trailing dashes are stripped.
 *
 * @param {string} value - Raw text.
 * @returns {string} Sanitised fragment (may be empty).
 */
function safePathPart(value) {
  const dashed = value.replace(/[^a-zA-Z0-9._-]+/g, "-");
  return dashed.replace(/^-+|-+$/g, "");
}
95
/**
 * Derives a path-safe label for a benchmark pair.
 *
 * Uses the trimmed `pair.label`, else the trimmed `pair.tier`, passed through
 * `safePathPart`; when that yields an empty string the label falls back to
 * `pair-<index + 1>`.
 *
 * @param {{ label?: string, tier?: string }} pair - Pair being labelled.
 * @param {number} index - Zero-based position of the pair.
 * @returns {string} Non-empty label.
 */
function actionBenchmarkPairLabel(pair, index) {
  const preferred = pair.label?.trim() || pair.tier?.trim() || "";
  const sanitised = safePathPart(preferred);
  if (sanitised) {
    return sanitised;
  }
  return `pair-${index + 1}`;
}
98
/**
 * Returns the benchmark pairs the caller asked for explicitly.
 *
 * `options.actionBenchmarkPairs` (resolved through
 * `actionBenchmarkPairsOption`) takes precedence; otherwise a truthy
 * `options.actionBenchmarkPair` is wrapped in a one-element array.
 *
 * @param {object} options - Collection options.
 * @returns {Array<object>} Explicit pairs, possibly empty.
 */
function explicitActionBenchmarkPairs(options) {
  const listed = actionBenchmarkPairsOption(options.actionBenchmarkPairs);
  if (listed.length > 0) {
    return listed;
  }
  const single = options.actionBenchmarkPair;
  if (single) {
    return [single];
  }
  return [];
}
103
/**
 * Normalises the `actionBenchmarkPairs` option into a list of pairs.
 *
 * A string is parsed with `parseElizaOneBenchmarkTiers(value, [])` and the
 * result handed to `elizaOneActionBenchmarkPairs`. A non-empty list is
 * returned unchanged. Anything else yields an empty array.
 *
 * @param {string | Array<object> | null | undefined} value - Raw option value.
 * @returns {Array<object>} Benchmark pairs.
 */
function actionBenchmarkPairsOption(value) {
  if (typeof value === "string") {
    const tiers = parseElizaOneBenchmarkTiers(value, []);
    return elizaOneActionBenchmarkPairs(tiers);
  }
  return value && value.length > 0 ? value : [];
}
112
/**
 * Decides whether the default base/trained pair should be benchmarked.
 *
 * Returns `true` only when the action benchmark and the benchmark matrix are
 * both enabled (each defaults to enabled), no explicit pairs were supplied,
 * and `options.actionBenchmark` sets none of `variant`, a non-blank
 * `modelId`, or a non-blank `runtimeModel`.
 *
 * @param {object} options - Collection options.
 * @returns {boolean} Whether to fall back to the default pair.
 */
function shouldUseDefaultActionBenchmarkPair(options) {
  const bothStepsEnabled =
    boolWithDefault(options.includeActionBenchmark, true) &&
    boolWithDefault(options.includeBenchmarkMatrix, true);
  if (!bothStepsEnabled) {
    return false;
  }
  if (explicitActionBenchmarkPairs(options).length > 0) {
    return false;
  }
  const target = options.actionBenchmark;
  return (
    !target?.variant &&
    !target?.modelId?.trim() &&
    !target?.runtimeModel?.trim()
  );
}
119
/**
 * Builds the default base-vs-trained benchmark pair.
 *
 * The tier is the trimmed `options.actionBenchmark.tier` when non-empty,
 * otherwise `DEFAULT_ACTION_BENCHMARK_PAIR_TIER`.
 *
 * @param {object} options - Collection options.
 * @returns {{ tier: string, base: { variant: string }, trained: { variant: string } }}
 */
function defaultActionBenchmarkPair(options) {
  const requestedTier = options.actionBenchmark?.tier?.trim();
  return {
    tier: requestedTier || DEFAULT_ACTION_BENCHMARK_PAIR_TIER,
    base: { variant: "base" },
    trained: { variant: "trained" }
  };
}
127
/**
 * Resolves the pairs that will actually be benchmarked: the explicit pairs
 * when any were given, else the default pair when
 * `shouldUseDefaultActionBenchmarkPair` allows it, else nothing.
 *
 * @param {object} options - Collection options.
 * @returns {Array<object>} Pairs to run, possibly empty.
 */
function effectiveActionBenchmarkPairs(options) {
  const requested = explicitActionBenchmarkPairs(options);
  if (requested.length > 0) {
    return requested;
  }
  if (!shouldUseDefaultActionBenchmarkPair(options)) {
    return [];
  }
  return [defaultActionBenchmarkPair(options)];
}
132
/**
 * Runs the base and trained action benchmarks for one pair, base first.
 *
 * Each variant is run only when `pair.base` / `pair.trained` is present.
 * Per-variant settings override `common`; `workspaceRoot`, `modelId`,
 * `runtimeModel`, `tier` and `variant` fall back through the pair, the
 * tier-derived model id, and `common` in that order. Output goes to the
 * variant's own `outputDir`, else `<outputDir>/<variant>` when
 * `input.preserveSinglePairLayout` is set, else
 * `<outputDir>/<label>/<variant>`.
 *
 * @param {object} input - `{ common, outputDir, workspaceRoot, pair, label,
 *   preserveSinglePairLayout }`.
 * @returns {Promise<{ label: string, tier: (string|null), runs: object,
 *   matrixSources: Array<object> }>} Both run results (or `null`) plus the
 *   non-null `matrixSource` values they reported.
 */
async function runActionBenchmarkPair(input) {
  const { common, outputDir, workspaceRoot, pair, label } = input;
  const tier = pair.tier?.trim() || common?.tier;

  const defaultDirFor = (variant) =>
    input.preserveSinglePairLayout
      ? join(outputDir, variant)
      : join(outputDir, label, variant);
  const baseOutputDir = pair.base?.outputDir ?? defaultDirFor("base");
  const trainedOutputDir = pair.trained?.outputDir ?? defaultDirFor("trained");

  // Shared invocation for both variants; only the variant name, its
  // overrides and its output directory differ.
  const runVariant = (variant, overrides, variantOutputDir) =>
    runActionBenchmark({
      ...(common ?? {}),
      ...overrides,
      workspaceRoot:
        overrides.workspaceRoot ?? common?.workspaceRoot ?? workspaceRoot,
      outputDir: variantOutputDir,
      modelId:
        overrides.modelId ??
        elizaOneBenchmarkModelId(tier, variant) ??
        common?.modelId,
      runtimeModel:
        overrides.runtimeModel ??
        elizaOneBenchmarkModelId(tier, variant) ??
        common?.runtimeModel,
      tier: overrides.tier ?? tier,
      variant: overrides.variant ?? variant
    });

  let base = null;
  if (pair.base) {
    base = await runVariant("base", pair.base, baseOutputDir);
  }
  let trained = null;
  if (pair.trained) {
    trained = await runVariant("trained", pair.trained, trainedOutputDir);
  }

  const matrixSources = [base?.matrixSource, trained?.matrixSource].filter(
    (source) => source != null
  );
  return {
    label,
    tier: tier ?? null,
    runs: { base, trained },
    matrixSources
  };
}
166
/**
 * Executes the action-benchmark step of a collection run.
 *
 * Three modes, checked in order:
 *  1. `options.actionBenchmarkPairs` resolves to one or more pairs: each pair
 *     is run in sequence under its own label; `runs` mirrors the first pair.
 *  2. Exactly one effective pair (from `options.actionBenchmarkPair` or the
 *     default pair): it is run with `preserveSinglePairLayout` enabled.
 *  3. Otherwise a single `runActionBenchmark` call is made with
 *     `options.actionBenchmark`, defaulting `workspaceRoot` and `outputDir`.
 *
 * In the pair modes a missing `base` or `trained` side is replaced by `{}`
 * so both variants always run.
 *
 * @param {{ outputDir: string, workspaceRoot: string, options: object }} input
 * @returns {Promise<object>} `{ outputDir, pairs, runs, matrixSources }` for
 *   the pair modes, or whatever `runActionBenchmark` resolves to.
 */
async function runActionBenchmarkCollectionStep(input) {
  const { outputDir, workspaceRoot, options } = input;
  const common = options.actionBenchmark;
  const withBothVariants = (pair) => ({
    ...pair,
    base: pair.base ?? {},
    trained: pair.trained ?? {}
  });

  const listedPairs = actionBenchmarkPairsOption(options.actionBenchmarkPairs);
  if (listedPairs.length > 0) {
    const pairs = [];
    for (let index = 0; index < listedPairs.length; index += 1) {
      const listed = listedPairs[index];
      const outcome = await runActionBenchmarkPair({
        outputDir,
        workspaceRoot,
        common,
        pair: withBothVariants(listed),
        label: actionBenchmarkPairLabel(listed, index)
      });
      pairs.push(outcome);
    }
    return {
      outputDir,
      pairs,
      runs: pairs[0]?.runs ?? { base: null, trained: null },
      matrixSources: pairs.flatMap((outcome) => outcome.matrixSources)
    };
  }

  const effectivePairs = effectiveActionBenchmarkPairs(options);
  if (effectivePairs.length === 1) {
    const [onlyPair] = effectivePairs;
    const outcome = await runActionBenchmarkPair({
      outputDir,
      workspaceRoot,
      common,
      pair: withBothVariants(onlyPair),
      label: actionBenchmarkPairLabel(onlyPair, 0),
      preserveSinglePairLayout: true
    });
    return {
      outputDir,
      pairs: [outcome],
      runs: outcome.runs,
      matrixSources: outcome.matrixSources
    };
  }

  return runActionBenchmark({
    ...(common ?? {}),
    workspaceRoot: common?.workspaceRoot ?? workspaceRoot,
    outputDir: common?.outputDir ?? outputDir
  });
}
223
/**
 * Replaces every `:` and `.` in a timestamp string with `-`.
 *
 * @param {string} value - Timestamp text.
 * @returns {string} The same text with `:` and `.` turned into `-`.
 */
function safeTimestamp(value) {
  const separators = /[:.]/g;
  return value.replace(separators, "-");
}
226
/**
 * Returns `value` when it is a primitive boolean, otherwise `fallback`.
 *
 * @param {unknown} value - Possibly-unset flag.
 * @param {boolean} fallback - Value used when `value` is not a boolean.
 * @returns {boolean} The resolved flag.
 */
function boolWithDefault(value, fallback) {
  if (value === true || value === false) {
    return value;
  }
  return fallback;
}
229
/**
 * True when the action benchmark is enabled (default: enabled) and
 * `options.actionBenchmark.dryRun` is exactly `false`.
 *
 * @param {object} options - Collection options.
 * @returns {boolean}
 */
function liveActionBenchmarkRequested(options) {
  if (!boolWithDefault(options.includeActionBenchmark, true)) {
    return false;
  }
  return options.actionBenchmark?.dryRun === false;
}
232
/**
 * True when the eval comparison is enabled (default: disabled) and
 * `options.evalComparison.dryRun` is exactly `false`.
 *
 * @param {object} options - Collection options.
 * @returns {boolean}
 */
function liveEvalComparisonRequested(options) {
  if (!boolWithDefault(options.includeEvalComparison, false)) {
    return false;
  }
  return options.evalComparison?.dryRun === false;
}
235
/**
 * True when feed generation is enabled (default: enabled) and
 * `options.feed.dryRun` is exactly `false`.
 *
 * @param {object} options - Collection options.
 * @returns {boolean}
 */
function liveFeedGenerationRequested(options) {
  if (!boolWithDefault(options.includeFeed, true)) {
    return false;
  }
  return options.feed?.dryRun === false;
}
238
/**
 * True when the benchmark-vs-Cerebras step is enabled (default: disabled)
 * and `options.benchmarkVsCerebras.dryRun` is exactly `false`.
 *
 * @param {object} options - Collection options.
 * @returns {boolean}
 */
function liveBenchmarkVsCerebrasRequested(options) {
  if (!boolWithDefault(options.includeBenchmarkVsCerebras, false)) {
    return false;
  }
  return options.benchmarkVsCerebras?.dryRun === false;
}
241
/**
 * Produces a preflight check entry describing whether `path` exists on disk.
 *
 * @param {string} id - Check identifier.
 * @param {string} label - Human-readable check name.
 * @param {string} path - File path to test with `existsSync`.
 * @returns {{ id: string, label: string, status: string, detail: string, path: string }}
 *   `status` is "ok" when the path exists, otherwise "missing".
 */
function fileCheck(id, label, path) {
  const found = existsSync(path);
  return {
    id,
    label,
    status: found ? "ok" : "missing",
    detail: found ? "found" : "required file was not found",
    path
  };
}
256
/**
 * Builds the URL used to probe an endpoint: the base URL with trailing
 * slashes removed from its path and `/models` appended. Query string and
 * fragment of the base URL are left in place.
 *
 * @param {string} baseUrl - Absolute base URL of the endpoint.
 * @returns {string} Serialised probe URL.
 * @throws {TypeError} When `baseUrl` cannot be parsed by `new URL`.
 */
function endpointProbeUrl(baseUrl) {
  const probe = new URL(baseUrl);
  const trimmedPath = probe.pathname.replace(/\/+$/, "");
  probe.pathname = `${trimmedPath}/models`;
  return probe.href;
}
262
/**
 * Probes `<baseUrl>/models` with a GET request and reports the outcome as a
 * preflight check entry. The request is aborted after 2000 ms.
 *
 * Every failure mode is reported through the returned entry rather than by
 * rejecting: a base URL that cannot be parsed, a timeout, and a network
 * failure all yield `status: "missing"`; a non-2xx response yields
 * `status: "warning"`.
 *
 * @param {string} baseUrl - Base URL of the OpenAI-compatible endpoint.
 * @returns {Promise<{ id: string, label: string, status: string, detail: string }>}
 */
async function probeOpenAICompatibleEndpoint(baseUrl) {
  const id = "action_benchmark_endpoint";
  const label = "Action benchmark endpoint";

  let url;
  try {
    url = endpointProbeUrl(baseUrl);
  } catch (err) {
    // `new URL` signals an unparseable input with TypeError; anything else is
    // unexpected and must not be masked as a configuration problem.
    if (!(err instanceof TypeError)) {
      throw err;
    }
    return {
      id,
      label,
      status: "missing",
      detail: `invalid OpenAI-compatible endpoint URL: ${String(baseUrl)}`
    };
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 2e3);
  try {
    const response = await fetch(url, {
      method: "GET",
      signal: controller.signal
    });
    return {
      id,
      label,
      status: response.ok ? "ok" : "warning",
      detail: response.ok
        ? `OpenAI-compatible endpoint responded at ${url}`
        : `endpoint responded with HTTP ${response.status} at ${url}`
    };
  } catch (err) {
    const timedOut = err instanceof Error && err.name === "AbortError";
    return {
      id,
      label,
      status: "missing",
      detail: timedOut
        ? `timed out probing OpenAI-compatible endpoint at ${url}`
        : `could not reach OpenAI-compatible endpoint at ${url}`
    };
  } finally {
    // Always release the timer so it cannot fire after the probe settles.
    clearTimeout(timeout);
  }
}
288
/**
 * Builds the synchronous preflight report for a training collection run.
 *
 * Three file checks are always emitted. After them come four checks that are
 * either evaluated (when the matching live step was requested) or reported as
 * `"skipped"`, in this order: action benchmark provider, feed database URL,
 * Cerebras API key, eval comparison model inputs.
 *
 * @param {{options: object, workspaceRoot?: string, trainingRoot?: string}} input
 *   Collection options plus optional root overrides. The workspace root falls
 *   back to `discoverWorkspaceRoot()` and then `process.cwd()`; the training
 *   root falls back to `<workspaceRoot>/packages/training`.
 * @returns {{liveRequired: boolean, checks: object[]}} Whether any live step
 *   was requested, and the ordered list of checks.
 */
function buildTrainingCollectionPreflight(input) {
  const { options, workspaceRoot, trainingRoot } = input;
  const actionLive = liveActionBenchmarkRequested(options);
  const evalLive = liveEvalComparisonRequested(options);
  const feedLive = liveFeedGenerationRequested(options);
  const cerebrasLive = liveBenchmarkVsCerebrasRequested(options);
  const liveRequired = actionLive || evalLive || feedLive || cerebrasLive;
  const resolvedWorkspaceRoot = workspaceRoot ?? discoverWorkspaceRoot() ?? process.cwd();
  const resolvedTrainingRoot = trainingRoot ?? join(resolvedWorkspaceRoot, "packages", "training");

  // Emits the evaluated check when the step is live, a "skipped" check otherwise.
  // `evaluate` only runs for live steps, so env vars are not read needlessly.
  const liveOrSkipped = (isLive, id, label, evaluate, skippedDetail) =>
    isLive
      ? { id, label, ...evaluate() }
      : { id, label, status: "skipped", detail: skippedDetail };

  const describeProvider = () => {
    const provider = options.actionBenchmark?.provider ?? "local-llama-cpp";
    const baseUrl = options.actionBenchmark?.baseUrl ?? "http://localhost:11434/v1";
    if (provider === "local-llama-cpp") {
      return {
        status: "warning",
        detail: `local provider selected; verify OpenAI-compatible endpoint is serving at ${baseUrl}`
      };
    }
    return { status: "ok", detail: `provider ${provider} selected` };
  };

  const describeFeedDatabase = () => {
    const databaseUrl = process.env.DATABASE_URL;
    return {
      status: databaseUrl ? "ok" : "missing",
      detail: databaseUrl
        ? "DATABASE_URL is set for live feed generation"
        : "DATABASE_URL is required for live packages/feed train parallel generation"
    };
  };

  const describeCerebrasKey = () => {
    const apiKey = process.env.CEREBRAS_API_KEY;
    return {
      status: apiKey ? "ok" : "missing",
      detail: apiKey
        ? "CEREBRAS_API_KEY is set"
        : "CEREBRAS_API_KEY is required for live Cerebras reference runs"
    };
  };

  const describeEvalInputs = () => {
    const evalOptions = options.evalComparison;
    const hasManifest = Boolean(evalOptions?.manifestPath);
    const hasExplicitModels = Boolean(
      evalOptions?.model && evalOptions?.trainedModelPath && evalOptions?.backend
    );
    return {
      status: hasManifest || hasExplicitModels ? "ok" : "missing",
      detail: "requires manifestPath or model, trainedModelPath, and backend for live eval comparison"
    };
  };

  const checks = [
    fileCheck(
      "app_core_action_benchmark",
      "App-core Eliza harness benchmark",
      join(
        resolvedWorkspaceRoot,
        "packages",
        "app-core",
        "test",
        "benchmarks",
        "action-selection.real.test.ts"
      )
    ),
    fileCheck(
      "local_eval_compare_script",
      "Local base-vs-trained eval script",
      join(resolvedTrainingRoot, "scripts", "rl", "compare_local_models.py")
    ),
    fileCheck(
      "benchmark_vs_cerebras_script",
      "Benchmark-vs-Cerebras script",
      join(resolvedTrainingRoot, "scripts", "benchmark_vs_cerebras.py")
    ),
    liveOrSkipped(
      actionLive,
      "action_benchmark_provider",
      "Action benchmark provider",
      describeProvider,
      "live action benchmark not requested"
    ),
    liveOrSkipped(
      feedLive,
      "feed_database_url",
      "Feed database URL",
      describeFeedDatabase,
      "live feed generation not requested"
    ),
    liveOrSkipped(
      cerebrasLive,
      "cerebras_api_key",
      "Cerebras API key",
      describeCerebrasKey,
      "live Cerebras reference run not requested"
    ),
    liveOrSkipped(
      evalLive,
      "eval_model_inputs",
      "Eval comparison model inputs",
      describeEvalInputs,
      "live local eval comparison not requested"
    )
  ];

  return { liveRequired, checks };
}
390
/**
 * Builds the preflight report and, when asked to, appends a live endpoint probe.
 *
 * The probe only runs when `options.preflightProbe` is truthy, a live action
 * benchmark is requested, and the provider is `"local-llama-cpp"` (the default
 * when none is configured). An error thrown by the probe is recorded as a
 * `"warning"` check rather than propagated.
 *
 * @param {{options: object, workspaceRoot?: string, trainingRoot?: string}} input
 *   Same input as `buildTrainingCollectionPreflight`.
 * @returns {Promise<{liveRequired: boolean, checks: object[]}>} The preflight
 *   report, possibly extended with an `action_benchmark_endpoint` check.
 */
async function buildTrainingCollectionPreflightWithProbes(input) {
  const preflight = buildTrainingCollectionPreflight(input);
  const { options } = input;
  if (!(options.preflightProbe && liveActionBenchmarkRequested(options))) {
    return preflight;
  }
  const provider = options.actionBenchmark?.provider ?? "local-llama-cpp";
  if (provider !== "local-llama-cpp") {
    return preflight;
  }
  const baseUrl = options.actionBenchmark?.baseUrl ?? "http://localhost:11434/v1";
  let endpointCheck;
  try {
    endpointCheck = await probeOpenAICompatibleEndpoint(baseUrl);
  } catch (err) {
    // Reached when the probe throws before issuing the request (e.g. bad URL).
    endpointCheck = {
      id: "action_benchmark_endpoint",
      label: "Action benchmark endpoint",
      status: "warning",
      detail: `endpoint probe failed before request: ${String(err)}`
    };
  }
  preflight.checks.push(endpointCheck);
  return preflight;
}
412
/**
 * Resolves the directory a collection step writes into.
 *
 * @param {string} outputDir - Root output directory of the run.
 * @param {string} step - Step name, used as the sub-directory name.
 * @returns {string} `outputDir` joined with `step`.
 */
function stepOutputDir(outputDir, step) {
  const segments = [outputDir, step];
  return join(...segments);
}
415
/** Schema identifier stamped on every generated model registry manifest. */
const ELIZA1_MODEL_REGISTRY_ENTRY_SCHEMA = "eliza1_model_registry_entry";

/**
 * Writes one registry manifest JSON file per (tier, variant) pair that has a
 * known model id.
 *
 * Tiers come from `ELIZA_ONE_BENCHMARK_TIERS`; variants are `"base"` and
 * `"trained"`. Pairs for which `elizaOneBenchmarkModelId` returns a falsy id
 * are skipped. Files are written one after another as
 * `<tier>-<variant>-model-manifest.json` inside `input.outputDir`, which is
 * created if needed.
 *
 * @param {{outputDir: string, generatedAt: *}} input - Target directory and
 *   the timestamp value copied verbatim into each manifest.
 * @returns {Promise<{outputDir: string, generatedAt: *, manifests: object[]}>}
 *   Summary of the manifests that were written.
 */
async function writeEliza1ModelRegistryArtifacts(input) {
  const { outputDir, generatedAt } = input;
  await mkdir(outputDir, { recursive: true });
  const written = [];
  for (const tier of ELIZA_ONE_BENCHMARK_TIERS) {
    for (const variant of ["base", "trained"]) {
      const modelId = elizaOneBenchmarkModelId(tier, variant);
      if (modelId) {
        // Only trained variants point back at a base model.
        let baseModel = null;
        if (variant === "trained") {
          baseModel = elizaOneBenchmarkModelId(tier, "base") ?? null;
        }
        const repoId = `elizaos/${modelId}`;
        const outputPath = `hf://${repoId}`;
        const manifestPath = join(outputDir, `${tier}-${variant}-model-manifest.json`);
        const manifest = {
          schema: ELIZA1_MODEL_REGISTRY_ENTRY_SCHEMA,
          schemaVersion: 1,
          generatedAt,
          source: { kind: "eliza1_model_registry" },
          modelId,
          model_name: modelId,
          output_path: outputPath,
          baseModel,
          tier,
          variant,
          family: "eliza-1",
          repoId,
          registry: { provider: "huggingface", repoId }
        };
        const serialized = `${JSON.stringify(manifest, null, 2)}\n`;
        await writeFile(manifestPath, serialized);
        written.push({ tier, variant, modelId, manifestPath, outputPath, baseModel });
      }
    }
  }
  return { outputDir, generatedAt, manifests: written };
}
466
/**
 * Reads the schema name recorded in an artifact's summary.
 *
 * @param {{summary: object}} artifact - Artifact with a `summary` record.
 * @returns {string|undefined} `summary.schema` when it is a string, otherwise
 *   `undefined`.
 */
function schemaOfArtifact(artifact) {
  const { schema } = artifact.summary;
  if (typeof schema !== "string") return undefined;
  return schema;
}
470
/**
 * Reads the source kind recorded in an artifact's summary.
 *
 * `summary.source` may be a plain string (returned as is) or a record whose
 * `kind` property holds the string.
 *
 * @param {{summary: object}} artifact - Artifact with a `summary` record.
 * @returns {string|undefined} The source kind, or `undefined` when none is
 *   recorded in either form.
 */
function sourceKindOfArtifact(artifact) {
  const { source } = artifact.summary;
  if (typeof source === "string") return source;
  const isRecord = Boolean(source) && typeof source === "object" && !Array.isArray(source);
  if (!isRecord) return undefined;
  const { kind } = source;
  return typeof kind === "string" ? kind : undefined;
}
479
/**
 * Classifies an artifact into an evidence category for the summary links.
 *
 * Rules are evaluated top to bottom and the first match wins, so an artifact
 * that satisfies several rules gets the earliest listed category.
 *
 * @param {{kind: string, summary: object}} artifact - Artifact to classify.
 * @returns {string} One of `"huggingface"`, `"feed"`, `"natural"`,
 *   `"scenario"`, `"test"`, `"training_jsonl"`, `"benchmark"`, `"model"`,
 *   `"eval"`, or `"other"`.
 */
function artifactEvidenceCategory(artifact) {
  const schema = schemaOfArtifact(artifact);
  const sourceKind = sourceKindOfArtifact(artifact);
  const { kind } = artifact;
  const rules = [
    [schema === "eliza_huggingface_dataset_ingest", "huggingface"],
    [schema === "feed_training_trajectory_export" || schema === "feed_parallel_generation", "feed"],
    [kind === "trajectory_bundle" && sourceKind === "training_collection_natural_trajectories", "natural"],
    [kind === "scenario_run" || schema === "eliza_scenario_native_export", "scenario"],
    [sourceKind === "app_core_test_trajectory", "test"],
    [schema === "eliza_training_jsonl_dataset", "training_jsonl"],
    [kind === "benchmark_matrix", "benchmark"],
    [kind === "model", "model"],
    [kind === "eval", "eval"]
  ];
  const firstMatch = rules.find(([applies]) => applies);
  return firstMatch ? firstMatch[1] : "other";
}
499
/**
 * Counts the manifest artifacts accepted by a predicate.
 *
 * @param {{manifest: {artifacts: Array}}} analysis - Analysis result.
 * @param {Function} predicate - Called with `(artifact, index, artifacts)`.
 * @returns {number} Number of artifacts for which `predicate` is truthy.
 */
function countArtifacts(analysis, predicate) {
  return analysis.manifest.artifacts.reduce(
    (total, artifact, index, all) => (predicate(artifact, index, all) ? total + 1 : total),
    0
  );
}
502
/**
 * Looks up the status of a readiness check by id.
 *
 * @param {{checks: Array<{id: string, status?: string}>}} readiness - Report.
 * @param {string} id - Id of the check to find.
 * @returns {string} Status of the first check with that id, or `"missing"`
 *   when there is no such check or it has no status.
 */
function readinessStatus(readiness, id) {
  for (const check of readiness.checks) {
    if (check.id === id) return check.status ?? "missing";
  }
  return "missing";
}
505
/**
 * Tallies collection steps by outcome.
 *
 * @param {Array<{status: string}>} steps - Executed steps.
 * @returns {{skipped: number, succeeded: number, failed: number}} Counts per
 *   status; steps with any other status are not counted.
 */
function summarizeStepCounts(steps) {
  const tally = { skipped: 0, succeeded: 0, failed: 0 };
  steps.forEach((step) => {
    switch (step.status) {
      case "skipped":
        tally.skipped += 1;
        break;
      case "succeeded":
        tally.succeeded += 1;
        break;
      case "failed":
        tally.failed += 1;
        break;
      default:
        break;
    }
  });
  return tally;
}
512
/**
 * Narrows a value to a non-array object.
 *
 * @param {*} value - Any value.
 * @returns {object|null} The same object when `value` is a truthy non-array
 *   object, otherwise `null`.
 */
function recordValue(value) {
  if (!value) return null;
  if (typeof value !== "object") return null;
  if (Array.isArray(value)) return null;
  return value;
}
515
/**
 * Normalizes a value to a trimmed, non-empty string.
 *
 * @param {*} value - Any value.
 * @returns {string|null} The trimmed string, or `null` when `value` is not a
 *   string or is blank after trimming.
 */
function stringOrNull(value) {
  if (typeof value !== "string") return null;
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}
518
/**
 * Normalizes a value to a finite number.
 *
 * @param {*} value - Any value.
 * @returns {number|null} `value` when it is a finite number, otherwise `null`
 *   (non-numbers, `NaN`, and infinities all yield `null`).
 */
function numberOrNull(value) {
  // Number.isFinite does not coerce, so non-number inputs are rejected too.
  if (Number.isFinite(value)) return value;
  return null;
}
521
/**
 * Checks whether any of the given records sets a flag to exactly `true`.
 *
 * @param {string} key - Property name of the flag.
 * @param {...(object|null|undefined)} records - Records to inspect; nullish
 *   entries are ignored.
 * @returns {boolean} `true` when at least one record has `record[key] === true`.
 */
function booleanFlagFromRecords(key, ...records) {
  for (const record of records) {
    if (record?.[key] === true) return true;
  }
  return false;
}
524
/**
 * Tells whether a benchmark record is flagged as a dry run.
 *
 * The `dryRun` flag is looked for on the record itself and on its `source`,
 * `metrics`, `raw`, and `raw.source` sub-records.
 *
 * @param {object} record - Benchmark row or report record.
 * @returns {boolean} `true` when any of those places has `dryRun === true`.
 */
function benchmarkRecordIsDryRun(record) {
  const raw = recordValue(record.raw);
  const places = [
    record,
    recordValue(record.source),
    recordValue(record.metrics),
    raw,
    recordValue(raw?.source)
  ];
  return booleanFlagFromRecords("dryRun", ...places);
}
538
/**
 * Tells whether a benchmark record is flagged as produced with mocks.
 *
 * The `useMocks` flag is looked for on the record itself and on its `source`,
 * `metrics`, `raw`, and `raw.source` sub-records.
 *
 * @param {object} record - Benchmark row or report record.
 * @returns {boolean} `true` when any of those places has `useMocks === true`.
 */
function benchmarkRecordUsesMocks(record) {
  const raw = recordValue(record.raw);
  const places = [
    record,
    recordValue(record.source),
    recordValue(record.metrics),
    raw,
    recordValue(raw?.source)
  ];
  return booleanFlagFromRecords("useMocks", ...places);
}
552
/**
 * Normalizes a benchmark tier value.
 *
 * @param {*} value - Raw tier value.
 * @returns {string|null} The tier as normalized by `stringOrNull`, or `null`
 *   when that yields nothing usable.
 */
function normalizeBenchmarkTier(value) {
  return stringOrNull(value) || null;
}
557
/**
 * Tells whether a comparison is backed by real model runs for both variants.
 *
 * A row qualifies when it matches the comparison's tier and benchmark, carries
 * a finite `score`, and is flagged neither as a dry run nor as mocked. The
 * comparison is model-backed when qualifying rows exist for both the `"base"`
 * and the `"trained"` variant.
 *
 * @param {object} comparison - Comparison record with `tier` and `benchmark`.
 * @param {Array} rows - Benchmark matrix rows; non-record entries are ignored.
 * @returns {boolean} `false` when the comparison lacks a tier or benchmark,
 *   or when either variant has no qualifying row.
 */
function benchmarkComparisonHasModelBackedRows(comparison, rows) {
  const tier = normalizeBenchmarkTier(comparison.tier);
  const benchmark = stringOrNull(comparison.benchmark);
  if (!(tier && benchmark)) return false;
  const qualifyingVariants = new Set(
    rows
      .map(recordValue)
      .filter(
        (row) =>
          row !== null &&
          normalizeBenchmarkTier(row.tier) === tier &&
          stringOrNull(row.benchmark) === benchmark &&
          numberOrNull(row.score) !== null &&
          !benchmarkRecordIsDryRun(row) &&
          !benchmarkRecordUsesMocks(row)
      )
      .map((row) => row.variant)
  );
  return qualifyingVariants.has("base") && qualifyingVariants.has("trained");
}
566
/**
 * Tells whether any base/trained row behind a comparison was produced with mocks.
 *
 * @param {object} comparison - Comparison record with `tier` and `benchmark`.
 * @param {Array} rows - Benchmark matrix rows; non-record entries are ignored.
 * @returns {boolean} `true` when a `"base"` or `"trained"` row matching the
 *   comparison's tier and benchmark is flagged as mocked; `false` when the
 *   comparison lacks a tier or benchmark.
 */
function benchmarkComparisonUsesMocks(comparison, rows) {
  const tier = normalizeBenchmarkTier(comparison.tier);
  const benchmark = stringOrNull(comparison.benchmark);
  if (!(tier && benchmark)) return false;
  for (const entry of rows) {
    const row = recordValue(entry);
    if (!row) continue;
    if (row.variant !== "base" && row.variant !== "trained") continue;
    if (normalizeBenchmarkTier(row.tier) !== tier) continue;
    if (stringOrNull(row.benchmark) !== benchmark) continue;
    if (benchmarkRecordUsesMocks(row)) return true;
  }
  return false;
}
574
/**
 * Produces a copy of an options value that is safe to store in a recipe.
 *
 * Arrays and objects are copied recursively. Object keys whose name contains
 * `token`, `secret`, `password`, or `api key` (with optional `_`/`-`,
 * case-insensitive) are dropped. A `trajectories` key is replaced by a
 * `trajectoryCount` number (0 when the value is not an array). Primitives are
 * returned unchanged.
 *
 * @param {*} value - Value to sanitize.
 * @returns {*} The sanitized copy.
 */
function sanitizeRecipeValue(value) {
  if (Array.isArray(value)) {
    return value.map((element) => sanitizeRecipeValue(element));
  }
  const isObject = Boolean(value) && typeof value === "object";
  if (!isObject) return value;
  const sensitiveKey = /token|secret|password|api[_-]?key/i;
  const sanitized = {};
  Object.entries(value).forEach(([key, item]) => {
    if (sensitiveKey.test(key)) return;
    if (key === "trajectories") {
      // Keep only the size: trajectory payloads do not belong in a recipe.
      sanitized.trajectoryCount = Array.isArray(item) ? item.length : 0;
      return;
    }
    sanitized[key] = sanitizeRecipeValue(item);
  });
  return sanitized;
}
588
/**
 * Sanitizes an options value and guarantees a plain record as result.
 *
 * @param {*} value - Options value, possibly undefined.
 * @returns {object} The sanitized record, or `{}` when sanitizing does not
 *   produce a non-array object.
 */
function sanitizeRecipeRecord(value) {
  const sanitized = sanitizeRecipeValue(value);
  const isRecord = Boolean(sanitized) && typeof sanitized === "object" && !Array.isArray(sanitized);
  if (!isRecord) return {};
  return sanitized;
}
592
/**
 * Tells whether a property name looks like it holds a filesystem path.
 *
 * @param {string} key - Property name.
 * @returns {boolean} `true` for names ending in `path` or `dir`
 *   (case-insensitive) and for the explicitly listed directory keys.
 */
function isPathLikeKey(key) {
  const explicitKeys = ["outputDir", "matrixOutputDir", "trajectoryDir"];
  return explicitKeys.includes(key) || /(?:path|dir)$/i.test(key);
}
595
/**
 * Finds the first command line recorded anywhere inside a result value.
 *
 * The search is depth-first. On a record, a `command` property that is an
 * array of strings is returned immediately; otherwise the record's values (or
 * the array's elements) are searched in order.
 *
 * @param {*} value - Step result or any nested part of it.
 * @returns {string[]|null} The first all-string `command` array found, or
 *   `null` when there is none.
 */
function collectCommand(value) {
  if (!value || typeof value !== "object") return null;
  const isList = Array.isArray(value);
  if (!isList) {
    const { command } = value;
    if (Array.isArray(command) && command.every((part) => typeof part === "string")) {
      return command;
    }
  }
  const children = isList ? value : Object.values(value);
  for (const child of children) {
    const found = collectCommand(child);
    if (found) return found;
  }
  return null;
}
614
/**
 * Collects path-like string properties from a nested result value.
 *
 * Each hit is labelled with its dotted/indexed location (e.g. `a.b[0].outputDir`).
 * A non-blank string stored under a key accepted by `isPathLikeKey` is
 * recorded (trimmed); every other value is searched recursively. Values
 * nested deeper than level 4 are not inspected.
 *
 * @param {*} value - Value to search.
 * @param {string} [prefix=""] - Label of `value` within its parent.
 * @param {number} [depth=0] - Current nesting level.
 * @returns {Array<{label: string, path: string}>} Hits in traversal order.
 */
function collectStepPaths(value, prefix = "", depth = 0) {
  if (depth > 4 || !value || typeof value !== "object") return [];
  if (Array.isArray(value)) {
    const collected = [];
    value.forEach((element, index) => {
      collected.push(...collectStepPaths(element, `${prefix}[${index}]`, depth + 1));
    });
    return collected;
  }
  return Object.entries(value).flatMap(([key, item]) => {
    const label = prefix ? `${prefix}.${key}` : key;
    const isPathString = typeof item === "string" && item.trim() && isPathLikeKey(key);
    if (isPathString) return [{ label, path: item.trim() }];
    return collectStepPaths(item, label, depth + 1);
  });
}
632
/**
 * Turns captured process output into a bounded excerpt.
 *
 * @param {*} value - Candidate output value.
 * @returns {string|null} The trimmed text, cut to its first 2000 characters
 *   plus `...` when longer; `null` when `value` is not a string or is blank.
 */
function outputExcerpt(value) {
  if (typeof value !== "string") return null;
  const trimmed = value.trim();
  if (!trimmed) return null;
  return trimmed.length > 2e3 ? `${trimmed.slice(0, 2e3)}...` : trimmed;
}

/**
 * Gathers up to four distinct output excerpts stored under `key` anywhere
 * inside a nested result value.
 *
 * The search is depth-first and ignores values nested deeper than level 5.
 * The four-excerpt cap is checked before every visit, including between the
 * elements of an array. Previously the array branch kept visiting siblings
 * after the cap was reached, so an array of records could yield more than
 * four excerpts.
 *
 * @param {*} value - Step result or any nested part of it.
 * @param {string} key - Property name holding the output (e.g. `"stdout"`).
 * @param {number} [depth=0] - Nesting level of `value`.
 * @returns {string|null} Excerpts joined by a `---` separator line, or `null`
 *   when nothing was found.
 */
function collectOutputText(value, key, depth = 0) {
  if (!value || typeof value !== "object" || depth > 5) return null;
  const maxExcerpts = 4;
  const excerpts = [];
  const visit = (item, itemDepth) => {
    if (excerpts.length >= maxExcerpts) return;
    if (!item || typeof item !== "object" || itemDepth > 5) return;
    if (Array.isArray(item)) {
      for (const child of item) {
        if (excerpts.length >= maxExcerpts) return;
        visit(child, itemDepth + 1);
      }
      return;
    }
    const excerpt = outputExcerpt(item[key]);
    if (excerpt && !excerpts.includes(excerpt)) excerpts.push(excerpt);
    for (const child of Object.values(item)) {
      if (excerpts.length >= maxExcerpts) return;
      visit(child, itemDepth + 1);
    }
  };
  visit(value, depth);
  return excerpts.length > 0 ? excerpts.join("\n---\n") : null;
}
659
/**
 * Summarizes what each collection step ran and produced.
 *
 * For every step the result record (as returned by `resultRecord`) is mined
 * for the command line, exit code, stdout/stderr excerpts, and up to 32
 * distinct path-like entries (distinct by label and path).
 *
 * @param {Array<{id: string, status: string, outputDir: *}>} steps - Steps.
 * @returns {object[]} One summary per step, in input order.
 */
function summarizeStepArtifacts(steps) {
  return steps.map((step) => {
    const result = resultRecord(step);
    const distinctPaths = new Map();
    for (const entry of collectStepPaths(result)) {
      distinctPaths.set(`${entry.label}:${entry.path}`, entry);
    }
    const paths = [...distinctPaths.values()].slice(0, 32);
    // Prefer output stored directly on the result; fall back to nested output.
    const excerptFor = (key) => outputExcerpt(result[key]) ?? collectOutputText(result, key);
    return {
      stepId: step.id,
      status: step.status,
      outputDir: step.outputDir,
      command: collectCommand(result),
      exitCode: numberOrNull(result.exitCode),
      stdout: excerptFor("stdout"),
      stderr: excerptFor("stderr"),
      paths
    };
  });
}
682
/**
 * Builds the recipe section describing how a collection run was configured.
 *
 * The recipe records which steps are included (with each step's fallback
 * value passed to `boolWithDefault`) and a sanitized copy of each step's
 * options, as produced by `sanitizeRecipeRecord`.
 *
 * @param {object} options - Collection options.
 * @returns {{include: object, sources: object, evals: object, training: object}}
 *   The recipe.
 */
function buildCollectionRecipe(options) {
  const fallbackPair = shouldUseDefaultActionBenchmarkPair(options)
    ? defaultActionBenchmarkPair(options)
    : undefined;
  const actionBenchmarkPair = options.actionBenchmarkPair ?? fallbackPair;

  const include = {
    huggingFace: boolWithDefault(options.includeHuggingFace, true),
    feed: boolWithDefault(options.includeFeed, true),
    naturalTrajectories: boolWithDefault(options.includeNaturalTrajectories, false),
    testTrajectories: boolWithDefault(options.includeTestTrajectories, false),
    scenarios: boolWithDefault(options.includeScenarios, true),
    evalComparison: boolWithDefault(options.includeEvalComparison, false),
    actionBenchmark: boolWithDefault(options.includeActionBenchmark, true),
    benchmarkVsCerebras: boolWithDefault(options.includeBenchmarkVsCerebras, false),
    eliza1ModelRegistry: boolWithDefault(options.includeEliza1ModelRegistry, true),
    eliza1BundleStage: boolWithDefault(options.includeEliza1BundleStage, false),
    benchmarkMatrix: boolWithDefault(options.includeBenchmarkMatrix, true)
  };

  const sources = {
    huggingFace: sanitizeRecipeRecord(options.huggingFace),
    feed: sanitizeRecipeRecord(options.feed),
    naturalTrajectories: sanitizeRecipeRecord(options.naturalTrajectories),
    testTrajectories: sanitizeRecipeRecord(options.testTrajectories),
    scenarios: sanitizeRecipeRecord(options.scenarios)
  };

  const evals = {
    evalComparison: sanitizeRecipeRecord(options.evalComparison),
    actionBenchmark: sanitizeRecipeRecord(options.actionBenchmark),
    actionBenchmarkPair: actionBenchmarkPair ? sanitizeRecipeRecord(actionBenchmarkPair) : null,
    actionBenchmarkPairs: actionBenchmarkPairsOption(options.actionBenchmarkPairs).map(
      (pair) => sanitizeRecipeRecord(pair)
    ),
    benchmarkVsCerebras: sanitizeRecipeRecord(options.benchmarkVsCerebras),
    benchmarkMatrix: sanitizeRecipeRecord(options.benchmarkMatrix)
  };

  const training = {
    eliza1ModelRegistry: {},
    eliza1BundleStage: sanitizeRecipeRecord(options.eliza1BundleStage)
  };

  return { include, sources, evals, training };
}
734
/**
 * Summarizes benchmark evidence gathered by a collection run.
 *
 * Combines the `action_benchmark` step result (pair and matrix-source counts)
 * with the `rows` and `comparisons` of every `benchmark_matrix` artifact.
 * A comparison counts as an improvement comparison when it is not a dry run,
 * is model-backed, and has at least one of base score, trained score, or
 * improvement percent. Tiers with such a comparison are "established".
 *
 * @param {{analysis: object, steps: object[]}} input - Analysis result and
 *   the executed steps.
 * @returns {object} Counts, the tier list, the comparison inventory,
 *   improvement comparisons, baseline progress across
 *   `ELIZA_ONE_BENCHMARK_TIERS`, and up to 24 case samples.
 */
function summarizeBenchmarkEvidence(input) {
  const { steps, analysis } = input;
  const lengthIfArray = (candidate) => (Array.isArray(candidate) ? candidate.length : 0);

  const actionStep = steps.find((step) => step.id === "action_benchmark");
  const actionResult = recordValue(actionStep?.result);
  const actionBenchmarkPairs = lengthIfArray(actionResult?.pairs);
  const actionBenchmarkMatrixSources = lengthIfArray(actionResult?.matrixSources);

  const matrixArtifacts = analysis.manifest.artifacts.filter(
    (artifact) => artifact.kind === "benchmark_matrix"
  );
  // Concatenates one array-valued payload field across all matrix artifacts.
  const payloadList = (field) =>
    matrixArtifacts.flatMap((artifact) => {
      const list = recordValue(artifact.payload)?.[field];
      return Array.isArray(list) ? list : [];
    });
  const rows = payloadList("rows");
  const comparisons = payloadList("comparisons");

  const tierNames = new Set();
  for (const entry of comparisons) {
    const tierName = stringOrNull(recordValue(entry)?.tier);
    if (tierName !== null) tierNames.add(tierName);
  }
  const tiers = [...tierNames].sort(canonicalElizaOneTierSort);

  const comparisonInventory = [];
  for (const entry of comparisons) {
    const comparison = recordValue(entry);
    if (!comparison) continue;
    const modelBacked = benchmarkComparisonHasModelBackedRows(comparison, rows);
    comparisonInventory.push({
      tier: stringOrNull(comparison.tier),
      benchmark: stringOrNull(comparison.benchmark),
      baseModelId: stringOrNull(comparison.baseModelId),
      trainedModelId: stringOrNull(comparison.trainedModelId),
      referenceModelId: stringOrNull(comparison.referenceModelId),
      baseScore: numberOrNull(comparison.baseScore),
      trainedScore: numberOrNull(comparison.trainedScore),
      improvementPercent: numberOrNull(comparison.improvementPercent),
      referenceScore: numberOrNull(comparison.referenceScore),
      trainedVsReferencePercent: numberOrNull(comparison.trainedVsReferencePercent),
      dryRun: comparison.dryRun === true,
      useMocks: benchmarkComparisonUsesMocks(comparison, rows),
      modelBacked
    });
  }

  const hasAnyScore = (item) =>
    item.baseScore !== null || item.trainedScore !== null || item.improvementPercent !== null;
  const improvementComparisons = comparisonInventory
    .filter((item) => !item.dryRun && item.modelBacked && hasAnyScore(item))
    .map(
      ({
        tier,
        benchmark,
        baseScore,
        trainedScore,
        improvementPercent,
        referenceScore,
        trainedVsReferencePercent
      }) => ({
        tier,
        benchmark,
        baseScore,
        trainedScore,
        improvementPercent,
        referenceScore,
        trainedVsReferencePercent,
        modelBacked: true
      })
    );

  const establishedSet = new Set();
  for (const { tier } of improvementComparisons) {
    const normalized = normalizeBenchmarkTier(tier);
    if (normalized !== null && ELIZA_ONE_BENCHMARK_TIERS.includes(normalized)) {
      establishedSet.add(normalized);
    }
  }
  const establishedTiers = [...establishedSet].sort(canonicalElizaOneTierSort);
  const remainingTiers = ELIZA_ONE_BENCHMARK_TIERS.filter((tier) => !establishedSet.has(tier));

  const caseSamples = [];
  for (const entry of rows) {
    const row = recordValue(entry);
    if (!row) continue;
    const raw = recordValue(row.raw);
    const rawSamples = Array.isArray(raw?.caseSamples) ? raw.caseSamples : [];
    for (const rawSample of rawSamples) {
      const sample = recordValue(rawSample);
      if (!sample) continue;
      caseSamples.push({
        tier: stringOrNull(row.tier),
        variant: stringOrNull(row.variant),
        modelId: stringOrNull(row.modelId),
        benchmark: stringOrNull(row.benchmark),
        score: numberOrNull(row.score),
        caseId: stringOrNull(sample.caseId),
        prompt: stringOrNull(sample.prompt),
        expectedAction: stringOrNull(sample.expectedAction),
        actualAction: stringOrNull(sample.actualAction),
        pass: sample.pass === true,
        response: stringOrNull(sample.response),
        latencyMs: numberOrNull(sample.latencyMs),
        trajectoryPath: stringOrNull(sample.trajectoryPath),
        useMocks: benchmarkRecordUsesMocks(row)
      });
    }
  }

  return {
    actionBenchmarkPairs,
    actionBenchmarkMatrixSources,
    benchmarkRows: rows.length,
    benchmarkComparisons: comparisons.length,
    tiers,
    comparisonInventory,
    improvementComparisons,
    baselineProgress: {
      tierOrder: [...ELIZA_ONE_BENCHMARK_TIERS],
      establishedTiers,
      remainingTiers,
      nextTier: remainingTiers[0] ?? null,
      smallestTierEstablished: establishedTiers.includes("2b"),
      allTiersEstablished: remainingTiers.length === 0
    },
    caseSamples: caseSamples.slice(0, 24)
  };
}
843
/**
 * Lists the model artifacts of an analysis as a sorted inventory.
 *
 * Only `model` artifacts whose summary names a `model` or an `outputPath` are
 * listed. Entries are ordered by tier (via `canonicalElizaOneTierSort`, with a
 * missing tier treated as `""`), then non-trained variants before trained
 * ones, then by title.
 *
 * @param {{manifest: {artifacts: object[]}}} analysis - Analysis result.
 * @returns {object[]} Inventory entries.
 */
function summarizeModelInventory(analysis) {
  const inventory = [];
  for (const artifact of analysis.manifest.artifacts) {
    if (artifact.kind !== "model") continue;
    const { summary } = artifact;
    const namesNothing =
      stringOrNull(summary.model) === null && stringOrNull(summary.outputPath) === null;
    if (namesNothing) continue;
    inventory.push({
      title: artifact.title,
      path: artifact.path,
      schema: schemaOfArtifact(artifact) ?? null,
      model: stringOrNull(summary.model),
      tier: stringOrNull(summary.tier),
      variant: stringOrNull(summary.variant),
      outputPath: stringOrNull(summary.outputPath),
      baseModel: stringOrNull(summary.baseModel),
      repoId: stringOrNull(summary.repoId),
      baseEvalScore: numberOrNull(summary.baseEvalScore),
      trainedEvalScore: numberOrNull(summary.trainedEvalScore),
      evalImprovementPercent: numberOrNull(summary.evalImprovementPercent)
    });
  }
  const trainedRank = (entry) => Number(entry.variant === "trained");
  return inventory.sort((left, right) => {
    const byTier = canonicalElizaOneTierSort(left.tier ?? "", right.tier ?? "");
    if (byTier !== 0) return byTier;
    const byVariant = trainedRank(left) - trainedRank(right);
    if (byVariant !== 0) return byVariant;
    return left.title.localeCompare(right.title);
  });
}
874
/**
 * Lists the eval comparison artifacts of an analysis, sorted by title.
 *
 * An artifact is listed when its kind is `eval` and its schema equals
 * `EVAL_COMPARISON_ARTIFACT_SCHEMA`. Summary fields are normalized to
 * strings/numbers, with `null` for anything missing or malformed.
 *
 * @param {{manifest: {artifacts: object[]}}} analysis - Analysis result.
 * @returns {object[]} Inventory entries.
 */
function summarizeEvalComparisonInventory(analysis) {
  const isEvalComparison = (artifact) =>
    artifact.kind === "eval" && schemaOfArtifact(artifact) === EVAL_COMPARISON_ARTIFACT_SCHEMA;
  const toEntry = ({ title, path, summary }) => ({
    title,
    path,
    baseModel: stringOrNull(summary.baseModel),
    trainedModel: stringOrNull(summary.trainedModel),
    backend: stringOrNull(summary.backend),
    baseScore: numberOrNull(summary.baseScore),
    trainedScore: numberOrNull(summary.trainedScore),
    improvementAbsolute: numberOrNull(summary.improvementAbsolute),
    improvementPercent: numberOrNull(summary.improvementPercent),
    baseLatencyMs: numberOrNull(summary.baseLatencyMs),
    trainedLatencyMs: numberOrNull(summary.trainedLatencyMs),
    latencyDeltaMs: numberOrNull(summary.latencyDeltaMs),
    promptCount: numberOrNull(summary.promptCount),
    distinctResponseCount: numberOrNull(summary.distinctResponseCount),
    reportPath: stringOrNull(summary.reportPath)
  });
  const entries = analysis.manifest.artifacts.filter(isEvalComparison).map(toEntry);
  entries.sort((left, right) => left.title.localeCompare(right.title));
  return entries;
}
898
/**
 * Summarizes feed-generated trajectory datasets.
 *
 * Considers `trajectory_dataset` artifacts whose schema is
 * `feed_training_trajectory_export` or `feed_parallel_generation`. For each
 * one it records a run entry, one entry per key of `summary.archetypeStats`,
 * and one entry per record in `summary.feedSamplePreviews`.
 *
 * @param {{manifest: {artifacts: object[]}}} analysis - Analysis result.
 * @returns {{runs: object[], archetypeStats: object[], trajectorySamples: object[]}}
 *   The three lists, in artifact order.
 */
function summarizeFeedEvidence(analysis) {
  const feedSchemas = ["feed_training_trajectory_export", "feed_parallel_generation"];
  const isFeedDataset = (artifact) =>
    artifact.kind === "trajectory_dataset" && feedSchemas.includes(schemaOfArtifact(artifact));

  const runs = [];
  const archetypeStats = [];
  const trajectorySamples = [];

  analysis.manifest.artifacts.filter(isFeedDataset).forEach((artifact) => {
    const { title, path, summary } = artifact;
    const source = recordValue(summary.source) ?? {};

    runs.push({
      title,
      path,
      schema: schemaOfArtifact(artifact) ?? null,
      sourceKind: stringOrNull(source.kind),
      archetype: stringOrNull(source.archetype),
      archetypes: source.archetypes ?? null,
      trajectories: numberOrNull(summary.trajectories),
      totalTicks: numberOrNull(summary.totalTicks),
      durationMs: numberOrNull(summary.durationMs),
      errors: numberOrNull(summary.errors),
      exportPath: stringOrNull(summary.exportPath),
      outputDir: stringOrNull(summary.outputDir)
    });

    const statsByArchetype = recordValue(summary.archetypeStats);
    Object.entries(statsByArchetype ?? {}).forEach(([archetype, value]) => {
      const stat = recordValue(value) ?? {};
      archetypeStats.push({
        title,
        path,
        archetype,
        agents: numberOrNull(stat.agents),
        trajectories: numberOrNull(stat.trajectories),
        avgTicksPerAgent: numberOrNull(stat.avgTicksPerAgent)
      });
    });

    const previews = Array.isArray(summary.feedSamplePreviews) ? summary.feedSamplePreviews : [];
    for (const preview of previews) {
      const sample = recordValue(preview);
      if (!sample) continue;
      trajectorySamples.push({
        title,
        path,
        trajectoryId: stringOrNull(sample.trajectoryId),
        agentId: stringOrNull(sample.agentId),
        archetype: stringOrNull(sample.archetype),
        scenarioId: stringOrNull(sample.scenarioId),
        score: numberOrNull(sample.score),
        finalPnl: numberOrNull(sample.finalPnl),
        steps: numberOrNull(sample.steps),
        firstStep: sample.firstStep ?? null,
        firstInput: sample.firstInput ?? null,
        firstOutput: sample.firstOutput ?? null,
        reasoning: sample.reasoning ?? null
      });
    }
  });

  return { runs, archetypeStats, trajectorySamples };
}
959
/**
 * Converts one preview row of an artifact into a uniform source sample.
 *
 * Several row shapes are accepted: each output field takes the first
 * non-nullish value from a list of alternative property names (for example
 * `input`, then `llmInput`, `firstInput`, `firstStep`).
 *
 * @param {{title: *, path: *, summary: object}} artifact - Owning artifact.
 * @param {object} row - Preview record taken from the artifact's summary.
 * @returns {object} The normalized sample; fields without a value are `null`.
 */
function collectionSourceSample(artifact, row) {
  const { title, path, summary } = artifact;
  const sourceRecord = recordValue(summary.source) ?? {};
  // The source kind lives in source.kind, or in source itself when it is a string.
  const sourceKind = stringOrNull(sourceRecord.kind) ?? stringOrNull(summary.source);
  const task = stringOrNull(row.task) ?? stringOrNull(row.taskType) ?? stringOrNull(row.purpose);
  const input = row.input ?? row.llmInput ?? row.firstInput ?? row.firstStep ?? null;
  const output = row.output ?? row.llmOutput ?? row.firstOutput ?? row.reasoning ?? null;
  const model = stringOrNull(row.model) ?? stringOrNull(row.provider);
  return {
    title,
    path,
    schema: schemaOfArtifact(artifact) ?? null,
    sourceKind,
    trajectoryId: stringOrNull(row.trajectoryId),
    scenarioId: stringOrNull(row.scenarioId),
    task,
    input,
    output,
    model,
    systemPrompt: row.systemPrompt ?? null,
    callId: stringOrNull(row.callId)
  };
}
976
/**
 * Appends the preview rows stored under one summary key to a sample list.
 *
 * Nothing is appended when `artifact.summary[key]` is not an array; entries
 * of the array that are not records are skipped.
 *
 * @param {object[]} target - List that receives the samples (mutated).
 * @param {{summary: object}} artifact - Artifact owning the previews.
 * @param {string} key - Summary property holding the preview array.
 * @returns {void}
 */
function appendSamplesFromSummary(target, artifact, key) {
  const previews = artifact.summary[key];
  if (!Array.isArray(previews)) return;
  for (const preview of previews) {
    const row = recordValue(preview);
    if (row) target.push(collectionSourceSample(artifact, row));
  }
}
984
/**
 * Collects readable sample previews per data source.
 *
 * Each artifact is routed to at most one bucket by the first matching rule
 * (schema, kind, and source kind), and its previews are read from the summary
 * key that belongs to that rule. Every bucket is capped at 12 samples.
 *
 * @param {{manifest: {artifacts: object[]}}} analysis - Analysis result.
 * @returns {{huggingFace: object[], feed: object[], natural: object[],
 *   scenarios: object[], tests: object[], trainingJsonl: object[]}}
 *   Samples per source.
 */
function summarizeSourceSamples(analysis) {
  const buckets = {
    huggingFace: [],
    feed: [],
    natural: [],
    scenarios: [],
    tests: [],
    trainingJsonl: []
  };

  // Returns [bucketName, summaryKey] for an artifact, or null when it has no bucket.
  const routeFor = (artifact) => {
    const schema = schemaOfArtifact(artifact);
    const { kind, summary } = artifact;
    if (schema === "eliza_huggingface_dataset_ingest") {
      return ["huggingFace", "hfSamplePreviews"];
    }
    const isFeedSchema =
      schema === "feed_training_trajectory_export" || schema === "feed_parallel_generation";
    if (kind === "trajectory_dataset" && isFeedSchema) {
      return ["feed", "feedSamplePreviews"];
    }
    if (
      kind === "trajectory_bundle" &&
      sourceKindOfArtifact(artifact) === "training_collection_natural_trajectories"
    ) {
      // LLM call previews win over generic sample previews when any exist.
      const calls = summary.llmCallPreviews;
      const hasCalls = Array.isArray(calls) && calls.length > 0;
      return ["natural", hasCalls ? "llmCallPreviews" : "samplePreviews"];
    }
    if (kind === "scenario_run") {
      return ["scenarios", "turnPreviews"];
    }
    if (schema === "eliza_scenario_native_export") {
      return ["scenarios", "scenarioNativeSamplePreviews"];
    }
    if (
      schema === "eliza_test_trajectory_record" &&
      sourceKindOfArtifact(artifact) === "app_core_test_trajectory"
    ) {
      return ["tests", "testSamplePreviews"];
    }
    if (schema === "eliza_training_jsonl_dataset") {
      return ["trainingJsonl", "samplePreviews"];
    }
    return null;
  };

  for (const artifact of analysis.manifest.artifacts) {
    const route = routeFor(artifact);
    if (route === null) continue;
    const [bucketName, summaryKey] = route;
    appendSamplesFromSummary(buckets[bucketName], artifact, summaryKey);
  }

  return {
    huggingFace: buckets.huggingFace.slice(0, 12),
    feed: buckets.feed.slice(0, 12),
    natural: buckets.natural.slice(0, 12),
    scenarios: buckets.scenarios.slice(0, 12),
    tests: buckets.tests.slice(0, 12),
    trainingJsonl: buckets.trainingJsonl.slice(0, 12)
  };
}
1037
/**
 * Assemble the evidence summary for a collection run from the analysis
 * result, the readiness report, the executed steps and the preflight result.
 *
 * @param {object} input - `{ analysis, readiness, steps, preflight }`.
 * @returns {object} Paths, artifact counts, coverage, per-source counts,
 *   training/eval/benchmark summaries, artifact links and readiness gaps.
 */
function buildCollectionEvidenceSummary(input) {
  const { analysis, readiness, steps, preflight } = input;

  // Small local wrappers so each count below states only what it matches.
  const countWhere = (predicate) => countArtifacts(analysis, predicate);
  const countByKindAndSchema = (kind, ...schemas) =>
    countWhere(
      (artifact) =>
        artifact.kind === kind &&
        schemas.some((schema) => schemaOfArtifact(artifact) === schema)
    );
  const countByKindAndSource = (kind, sourceKind) =>
    countWhere(
      (artifact) =>
        artifact.kind === kind && sourceKindOfArtifact(artifact) === sourceKind
    );
  const statusOf = (checkId) => readinessStatus(readiness, checkId);

  return {
    preflight,
    viewerHtmlPath: analysis.indexHtmlPath,
    analysisManifestPath: analysis.manifestPath,
    readinessReportPath: readiness.reportPath,
    artifactCounts: analysis.manifest.counts,
    coverage: {
      dataSources: analysis.manifest.coverage.dataSources,
      readableSamples: analysis.manifest.coverage.readableSamples,
      evals: analysis.manifest.coverage.evals,
      benchmarks: analysis.manifest.coverage.benchmarks,
      models: {
        artifacts: analysis.manifest.coverage.models.artifacts,
        stagedBundles: analysis.manifest.coverage.models.stagedBundles,
        inventoryCount: analysis.manifest.coverage.models.inventory.length
      }
    },
    stepCounts: summarizeStepCounts(steps),
    stepArtifacts: summarizeStepArtifacts(steps),
    dataSources: {
      huggingFaceDatasets: countByKindAndSchema(
        "trajectory_dataset",
        "eliza_huggingface_dataset_ingest"
      ),
      feedDatasets: countByKindAndSchema(
        "trajectory_dataset",
        "feed_training_trajectory_export",
        "feed_parallel_generation"
      ),
      naturalTrajectoryBundles: countByKindAndSource(
        "trajectory_bundle",
        "training_collection_natural_trajectories"
      ),
      scenarioRuns: countWhere((artifact) => artifact.kind === "scenario_run"),
      scenarioNativeDatasets: countByKindAndSchema(
        "trajectory_dataset",
        "eliza_scenario_native_export"
      ),
      testTrajectories: countByKindAndSource(
        "trajectory_dataset",
        "app_core_test_trajectory"
      ),
      trainingJsonlDatasets: countByKindAndSchema(
        "trajectory_dataset",
        "eliza_training_jsonl_dataset"
      )
    },
    feed: summarizeFeedEvidence(analysis),
    sourceSamples: summarizeSourceSamples(analysis),
    training: {
      trainingRuns: analysis.manifest.counts.trainingRuns,
      models: analysis.manifest.counts.models,
      modelInventory: summarizeModelInventory(analysis)
    },
    evals: {
      evalArtifacts: analysis.manifest.counts.evals,
      actionBenchmarks: countByKindAndSchema(
        "eval",
        "eliza_action_selection_benchmark_report"
      ),
      evalComparisons: countByKindAndSchema(
        "eval",
        EVAL_COMPARISON_ARTIFACT_SCHEMA
      ),
      benchmarkMatrices: analysis.manifest.counts.benchmarkMatrices,
      comparisonInventory: summarizeEvalComparisonInventory(analysis)
    },
    artifactLinks: analysis.manifest.artifacts.map((artifact) => {
      const { kind, title, path } = artifact;
      return {
        category: artifactEvidenceCategory(artifact),
        kind,
        title,
        path,
        schema: schemaOfArtifact(artifact) ?? null
      };
    }),
    benchmarks: summarizeBenchmarkEvidence({ analysis, steps }),
    benchmarkReadiness: {
      smallestTier: statusOf("smallest_model_benchmark"),
      allEliza1Tiers: statusOf("all_eliza1_tiers_benchmark"),
      allEliza1TierImprovements: statusOf("all_eliza1_tier_improvements"),
      cerebrasReference: statusOf("cerebras_reference"),
      baseTrainedImprovement: statusOf("base_trained_improvement")
    },
    // Only checks that are not "ready" are reported as gaps.
    readinessGaps: readiness.checks.flatMap((check) =>
      check.status === "ready"
        ? []
        : [
            {
              id: check.id,
              label: check.label,
              status: check.status,
              note: check.note,
              recommendedCapability:
                check.recommendedAction?.capability ?? null,
              recommendedParams: check.recommendedAction?.params ?? null
            }
          ]
    )
  };
}
1139
/**
 * Render a value as single-line text that is safe inside a Markdown table cell.
 *
 * - `null`, `undefined` and `""` become `"n/a"`.
 * - Strings are used as-is; numbers, booleans and bigints go through `String`.
 * - Anything else is serialized with `JSON.stringify`; values that JSON cannot
 *   represent (functions, symbols) become `"n/a"` instead of throwing.
 * - Line breaks (`\r\n`, `\n`, lone `\r`) become a space and `|` is escaped.
 *
 * @param {unknown} value - Cell value.
 * @returns {string} Single-line, pipe-escaped text.
 */
function markdownInline(value) {
  if (value === null || value === undefined || value === "") return "n/a";

  let text;
  switch (typeof value) {
    case "string":
      text = value;
      break;
    case "number":
    case "boolean":
    case "bigint":
      // JSON.stringify throws on a bare BigInt, so stringify it directly.
      text = String(value);
      break;
    default:
      // JSON.stringify yields undefined for functions and symbols.
      text = JSON.stringify(value);
      break;
  }
  if (text === undefined) return "n/a";

  return text.replace(/\r\n?|\n/g, " ").replace(/\|/g, "\\|");
}
1149
/**
 * Render a path (or URL) as a Markdown link whose label is the base name.
 *
 * Non-string or blank input yields `"n/a"`. Values that already start with a
 * `scheme://` prefix are linked as-is; everything else goes through `fileHref`.
 *
 * @param {unknown} value - Filesystem path or URL.
 * @returns {string} Markdown link text, or `"n/a"`.
 */
function markdownPathLink(value) {
  const path = typeof value === "string" ? value.trim() : "";
  if (!path) return "n/a";

  const label = basename(path) || path;
  const href = /^[a-z][a-z0-9+.-]*:\/\//i.test(path) ? path : fileHref(path);

  // Escape backslashes and both brackets so the label cannot end the link
  // text early or open a nested one.
  const safeLabel = label.replace(/[\\[\]]/g, "\\$&");
  // Encode both parentheses: encoding only ")" leaves "(" unbalanced, which
  // breaks a Markdown link destination.
  const safeHref = href.replace(/\(/g, "%28").replace(/\)/g, "%29");

  return `[${safeLabel}](${safeHref})`;
}
1156
/**
 * Render a Markdown table, or the `_None._` placeholder when there are no rows.
 *
 * @param {unknown[]} headers - Column headings.
 * @param {unknown[][]} rows - One array of cell values per row.
 * @returns {string} Table text ending in a newline.
 */
function markdownTable(headers, rows) {
  if (rows.length === 0) return "_None._\n";

  const renderLine = (cells) => `| ${cells.join(" | ")} |`;

  const lines = [];
  lines.push(renderLine(headers.map(markdownInline)));
  lines.push(renderLine(headers.map(() => "---")));
  for (const row of rows) {
    lines.push(renderLine(row.map(markdownInline)));
  }
  return `${lines.join("\n")}\n`;
}
1165
/**
 * Escape the five HTML-significant characters (`& < > " '`).
 *
 * @param {unknown} value - Any value; `null`/`undefined` become `""`, the rest
 *   is converted with `String`.
 * @returns {string} Text safe for element content and quoted attributes.
 */
function escapeHtml(value) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;"
  };
  const text = String(value ?? "");
  // One pass over the source characters gives the same result as replacing
  // "&" first and the remaining characters afterwards.
  return text.replace(/[&<>"']/g, (character) => entities[character]);
}
1168
/**
 * Build a `file://` URL string for a filesystem path.
 *
 * `encodeURI` leaves `#` and `?` untouched because they are URL delimiters;
 * inside a file name they would cut the path into a fragment or query, so they
 * are percent-encoded here as well.
 *
 * @param {string} path - Filesystem path.
 * @returns {string} Encoded `file://` URL.
 */
function fileHref(path) {
  const delimiterEscapes = { "#": "%23", "?": "%3F" };
  return encodeURI(`file://${path}`).replace(
    /[#?]/g,
    (character) => delimiterEscapes[character]
  );
}
1171
/**
 * Produce a compact text form of a value for the collection index, capped at
 * 180 characters (177 plus `"..."` when truncated).
 *
 * Strings are used as-is, `null`/`undefined` become `""`, and everything else
 * is serialized with `JSON.stringify`. Values JSON cannot represent (functions,
 * symbols) become `""` instead of throwing.
 *
 * @param {unknown} value - Value to display.
 * @returns {string} Possibly truncated text.
 */
function compactCollectionIndexValue(value) {
  let raw;
  if (typeof value === "string") {
    raw = value;
  } else if (value === null || value === undefined) {
    raw = "";
  } else {
    // JSON.stringify yields undefined for functions and symbols.
    raw = JSON.stringify(value) ?? "";
  }
  if (raw.length <= 180) return raw;

  // Do not cut between the two halves of a surrogate pair; a dangling high
  // surrogate is not a valid character on its own.
  let cut = 177;
  const lastUnit = raw.charCodeAt(cut - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1;
  return `${raw.slice(0, cut)}...`;
}
1175
/**
 * Structural check for a parsed collection manifest.
 *
 * Requires the expected schema id and version, string values for
 * `generatedAt`, `outputDir`, `manifestPath` and `readmePath`, and non-array
 * objects for `analysis` (with a string `indexHtmlPath`), `readiness` and
 * `evidence`.
 *
 * @param {unknown} value - Parsed JSON value.
 * @returns {boolean} True when the value has the expected manifest shape.
 */
function isTrainingCollectionManifest(value) {
  const isRecord = (candidate) =>
    candidate !== null &&
    typeof candidate === "object" &&
    !Array.isArray(candidate);

  if (!isRecord(value)) {
    return false;
  }
  const record = value;
  if (
    record.schema !== TRAINING_COLLECTION_RUN_SCHEMA ||
    record.schemaVersion !== TRAINING_COLLECTION_RUN_VERSION
  ) {
    return false;
  }
  const stringFields = ["generatedAt", "outputDir", "manifestPath", "readmePath"];
  if (!stringFields.every((field) => typeof record[field] === "string")) {
    return false;
  }
  return (
    isRecord(record.analysis) &&
    typeof record.analysis.indexHtmlPath === "string" &&
    isRecord(record.readiness) &&
    isRecord(record.evidence)
  );
}
1182
/**
 * Create a coverage record with every counter at zero, every list empty and
 * `allEliza1TiersCovered` set to false.
 *
 * @returns {object} A fresh coverage object on every call.
 */
function emptyCollectionCoverage() {
  const zeroPerSource = () => ({
    huggingFace: 0,
    feed: 0,
    natural: 0,
    scenarios: 0,
    tests: 0,
    trainingJsonl: 0
  });

  const dataSources = zeroPerSource();
  const readableSamples = { ...zeroPerSource(), total: 0 };
  const evals = { artifacts: 0, comparisons: 0, scoredComparisons: 0 };
  const benchmarks = {
    matrices: 0,
    comparisons: 0,
    scoredComparisons: 0,
    caseSamples: 0,
    tiers: [],
    allEliza1TiersCovered: false,
    tierCoverage: []
  };
  const models = { artifacts: 0, stagedBundles: 0, inventoryCount: 0 };

  return { dataSources, readableSamples, evals, benchmarks, models };
}
1222
/**
 * Return the coverage recorded on an evidence object, falling back to an
 * all-zero coverage record when it is `null` or `undefined`.
 *
 * @param {object} evidence - Evidence section of a collection manifest.
 * @returns {object} The recorded coverage or an empty one.
 */
function collectionCoverage(evidence) {
  const recorded = evidence.coverage;
  if (recorded === null || recorded === undefined) {
    return emptyCollectionCoverage();
  }
  return recorded;
}
1225
/**
 * Resolve baseline progress for a benchmark evidence record.
 *
 * When the record already carries a `baselineProgress` object with array
 * `tierOrder`, `establishedTiers` and `remainingTiers`, that data is returned
 * in normalized form. Otherwise progress is derived from
 * `improvementComparisons`: tiers that normalize to a member of
 * `ELIZA_ONE_BENCHMARK_TIERS` count as established.
 *
 * @param {object} benchmarks - Benchmark evidence section.
 * @returns {object} `{ tierOrder, establishedTiers, remainingTiers, nextTier,
 *   smallestTierEstablished, allTiersEstablished }`.
 */
function collectionBaselineProgress(benchmarks) {
  const recorded = benchmarks.baselineProgress;
  const recordedIsUsable =
    Boolean(recorded) &&
    typeof recorded === "object" &&
    ["tierOrder", "establishedTiers", "remainingTiers"].every((key) =>
      Array.isArray(recorded[key])
    );

  if (recordedIsUsable) {
    const { tierOrder, establishedTiers, remainingTiers } = recorded;
    return {
      tierOrder,
      establishedTiers,
      remainingTiers,
      nextTier: recorded.nextTier ?? null,
      // Only a literal `true` counts; anything else is reported as false.
      smallestTierEstablished: recorded.smallestTierEstablished === true,
      allTiersEstablished: recorded.allTiersEstablished === true
    };
  }

  // Derive progress from comparisons, de-duplicating tiers via a Set.
  const seenTiers = new Set();
  const normalizedTiers = (benchmarks.improvementComparisons ?? []).map(
    (comparison) => normalizeBenchmarkTier(comparison.tier)
  );
  for (const tier of normalizedTiers) {
    if (tier !== null && ELIZA_ONE_BENCHMARK_TIERS.includes(tier)) {
      seenTiers.add(tier);
    }
  }
  const establishedTiers = [...seenTiers].sort(canonicalElizaOneTierSort);
  const remainingTiers = ELIZA_ONE_BENCHMARK_TIERS.filter(
    (tier) => !establishedTiers.includes(tier)
  );
  const [firstRemaining] = remainingTiers;

  return {
    tierOrder: [...ELIZA_ONE_BENCHMARK_TIERS],
    establishedTiers,
    remainingTiers,
    nextTier: firstRemaining ?? null,
    smallestTierEstablished: establishedTiers.includes("2b"),
    allTiersEstablished: remainingTiers.length === 0
  };
}
1256
/**
 * Condense a validated collection manifest into the summary shape used by the
 * collection index.
 *
 * Lists are capped: 12 source artifacts, 12 evidence artifacts, 3 samples per
 * source, 8 readiness gaps and 5 entries for each inventory. Missing
 * `training`, `sourceSamples`, `artifactLinks` and benchmark
 * `comparisonInventory` fall back to empty/derived values.
 *
 * @param {object} manifest - Collection manifest.
 * @returns {object} Summary record for one collection run.
 */
function summarizeCollectionManifest(manifest) {
  const { evidence } = manifest;
  const coverage = collectionCoverage(evidence);
  const trainingEvidence = evidence.training ?? {
    trainingRuns: 0,
    models: 0,
    modelInventory: []
  };
  const baselineProgress = collectionBaselineProgress(evidence.benchmarks);

  // Prefer the recorded inventory; otherwise rebuild rows from the
  // improvement comparisons, which carry no model ids or run-mode flags.
  let benchmarkComparisonInventory;
  if (evidence.benchmarks.comparisonInventory?.length > 0) {
    benchmarkComparisonInventory = evidence.benchmarks.comparisonInventory;
  } else {
    benchmarkComparisonInventory = (
      evidence.benchmarks.improvementComparisons ?? []
    ).map((comparison) => ({
      tier: comparison.tier,
      benchmark: comparison.benchmark,
      baseModelId: null,
      trainedModelId: null,
      referenceModelId: null,
      baseScore: comparison.baseScore,
      trainedScore: comparison.trainedScore,
      improvementPercent: comparison.improvementPercent,
      referenceScore: comparison.referenceScore,
      trainedVsReferencePercent: comparison.trainedVsReferencePercent,
      dryRun: false,
      useMocks: false,
      modelBacked: comparison.modelBacked
    }));
  }

  // First 12 artifact links whose category is in the given list.
  const linksInCategories = (categories) =>
    (evidence.artifactLinks ?? [])
      .filter((artifact) => categories.includes(artifact.category))
      .slice(0, 12)
      .map((artifact) => ({
        category: artifact.category,
        title: artifact.title,
        path: artifact.path,
        schema: artifact.schema
      }));

  const sourceArtifacts = linksInCategories([
    "huggingface",
    "feed",
    "natural",
    "scenario",
    "test",
    "training_jsonl"
  ]);

  const recordedSamples = evidence.sourceSamples ?? {};
  const sourceSamples = Object.fromEntries(
    ["huggingFace", "feed", "natural", "scenarios", "tests", "trainingJsonl"].map(
      (source) => [source, (recordedSamples[source] ?? []).slice(0, 3)]
    )
  );

  const evidenceArtifacts = linksInCategories(["eval", "benchmark", "model"]);

  const { readiness, analysis } = manifest;
  return {
    generatedAt: manifest.generatedAt,
    outputDir: manifest.outputDir,
    manifestPath: manifest.manifestPath,
    readmePath: manifest.readmePath,
    analysisIndexHtmlPath: analysis.indexHtmlPath,
    readinessStatus: readiness.status,
    readiness: {
      ready: readiness.ready,
      partial: readiness.partial,
      missing: readiness.missing
    },
    readinessGaps: (evidence.readinessGaps ?? []).slice(0, 8),
    artifactCount: analysis.artifactCount,
    stepCounts: evidence.stepCounts,
    dataSources: evidence.dataSources,
    sourceSamples,
    sourceArtifacts,
    evidenceArtifacts,
    training: {
      trainingRuns: trainingEvidence.trainingRuns,
      models: trainingEvidence.models,
      modelInventory: (trainingEvidence.modelInventory ?? []).slice(0, 5)
    },
    benchmarks: {
      actionBenchmarkPairs: evidence.benchmarks.actionBenchmarkPairs,
      benchmarkComparisons: evidence.benchmarks.benchmarkComparisons,
      caseSamples: evidence.benchmarks.caseSamples?.length ?? 0,
      tiers: evidence.benchmarks.tiers,
      comparisonInventory: benchmarkComparisonInventory.slice(0, 5),
      baselineProgress
    },
    evals: {
      evalArtifacts: evidence.evals.evalArtifacts,
      evalComparisons: evidence.evals.evalComparisons,
      actionBenchmarks: evidence.evals.actionBenchmarks,
      benchmarkMatrices: evidence.evals.benchmarkMatrices,
      comparisonInventory: evidence.evals.comparisonInventory?.slice(0, 5) ?? []
    },
    coverage
  };
}
1357
/**
 * Read a collection manifest from disk and summarize it.
 *
 * Best-effort: a read error, invalid JSON, a value that fails the manifest
 * shape check, or an error while summarizing all resolve to `null`.
 *
 * @param {string} manifestPath - Path to a `collection-manifest.json` file.
 * @returns {Promise<object|null>} Summary, or `null` when unusable.
 */
async function readCollectionManifestSummary(manifestPath) {
  try {
    const rawText = await readFile(manifestPath, "utf8");
    const candidate = JSON.parse(rawText);
    return isTrainingCollectionManifest(candidate)
      ? summarizeCollectionManifest(candidate)
      : null;
  } catch {
    // Deliberately swallowed: unreadable manifests are left out of the index.
    return null;
  }
}
1366
/**
 * Find `collection-manifest.json` files directly in `root` and in each of its
 * immediate subdirectories.
 *
 * @param {string} root - Directory to scan.
 * @returns {Promise<string[]>} Manifest paths (root manifest first); empty
 *   when `root` cannot be stat'ed or is not a directory.
 */
async function discoverCollectionManifestPaths(root) {
  const manifestName = "collection-manifest.json";

  let rootIsDirectory = false;
  try {
    rootIsDirectory = (await stat(root)).isDirectory();
  } catch {
    rootIsDirectory = false;
  }
  if (!rootIsDirectory) return [];

  const found = /* @__PURE__ */ new Set();
  const keepIfPresent = (candidate) => {
    if (existsSync(candidate)) {
      found.add(candidate);
    }
  };

  keepIfPresent(join(root, manifestName));
  const entries = await readdir(root, { withFileTypes: true });
  entries
    .filter((entry) => entry.isDirectory())
    .forEach((entry) => keepIfPresent(join(root, entry.name, manifestName)));

  return Array.from(found);
}
1387
/**
 * List summaries of the collection runs found under a collections root,
 * newest first.
 *
 * @param {object} [options]
 * @param {string} [options.root] - Collections directory; defaults to
 *   `<trainingStateRoot()>/collections`.
 * @param {number} [options.limit] - Maximum number of summaries (floored, at
 *   least 1); defaults to 20 when not a finite number.
 * @returns {Promise<object>} `{ root, indexJsonPath, indexHtmlPath, collections }`.
 */
async function listTrainingCollections(options = {}) {
  const root = resolve(options.root ?? join(trainingStateRoot(), "collections"));
  const indexJsonPath = join(root, "collection-index.json");
  const indexHtmlPath = join(root, "collection-index.html");

  // Number.isFinite is false for every non-number, so no typeof check is needed.
  const limit = Number.isFinite(options.limit)
    ? Math.max(1, Math.floor(options.limit))
    : 20;

  const manifestPaths = await discoverCollectionManifestPaths(root);
  const loaded = await Promise.all(
    manifestPaths.map((manifestPath) =>
      readCollectionManifestSummary(manifestPath)
    )
  );
  const summaries = loaded.filter(Boolean);

  // Newest first; equal or unparseable timestamps fall back to descending
  // output directory name.
  const byNewest = (left, right) => {
    const generatedDelta =
      Date.parse(right.generatedAt) - Date.parse(left.generatedAt);
    if (Number.isFinite(generatedDelta) && generatedDelta !== 0) {
      return generatedDelta;
    }
    return right.outputDir.localeCompare(left.outputDir);
  };
  summaries.sort(byNewest);

  return {
    root,
    indexJsonPath,
    indexHtmlPath,
    collections: summaries.slice(0, limit)
  };
}
1411
/**
 * Render the collection index as a standalone HTML page with one table row
 * per collection run.
 *
 * All dynamic text goes through `escapeHtml`; links are built with `fileHref`.
 *
 * @param {object} index - Index record with `generatedAt`, `root`,
 *   `indexJsonPath` and `collections`.
 * @returns {string} Complete HTML document ending in a newline.
 */
function buildCollectionIndexHtml(index) {
  const orNone = (text) => text || "none";

  // How a benchmark comparison was produced, in priority order.
  const comparisonMode = (comparison) => {
    if (comparison.dryRun) return "dry-run";
    if (comparison.modelBacked) return "model-backed";
    if (comparison.useMocks) return "mocked";
    return "unverified";
  };

  const artifactAnchor = (artifact) =>
    `<a href="${escapeHtml(fileHref(artifact.path))}">${escapeHtml(`${artifact.category}:${artifact.title}`)}</a>`;

  const anchorsOrPlaceholder = (artifacts, limit, placeholder) =>
    artifacts.length > 0
      ? artifacts
          .slice(0, limit)
          .map((artifact) => artifactAnchor(artifact))
          .join(" ")
      : `<span>${placeholder}</span>`;

  const evidenceLinks = (collection, category, placeholder) =>
    anchorsOrPlaceholder(
      collection.evidenceArtifacts.filter(
        (artifact) => artifact.category === category
      ),
      4,
      placeholder
    );

  const describeGap = (gap) => {
    let text = `${gap.id}:${gap.status}`;
    if (gap.recommendedCapability) {
      text += `->${gap.recommendedCapability}`;
    }
    if (gap.recommendedParams) {
      text += ` params=${JSON.stringify(gap.recommendedParams)}`;
    }
    return text;
  };

  const renderRow = (collection) => {
    const sourceSummary = `hf:${collection.dataSources.huggingFaceDatasets} feed:${collection.dataSources.feedDatasets} natural:${collection.dataSources.naturalTrajectoryBundles} scenarios:${collection.dataSources.scenarioRuns} native:${collection.dataSources.scenarioNativeDatasets} tests:${collection.dataSources.testTrajectories} jsonl:${collection.dataSources.trainingJsonlDatasets}`;

    const sourceLinks = anchorsOrPlaceholder(
      collection.sourceArtifacts,
      6,
      "no source artifacts"
    );

    // Up to two samples per source, each flattened to one descriptive string.
    const samplesBySource = [
      ["hf", collection.sourceSamples.huggingFace],
      ["feed", collection.sourceSamples.feed],
      ["natural", collection.sourceSamples.natural],
      ["scenarios", collection.sourceSamples.scenarios],
      ["tests", collection.sourceSamples.tests],
      ["jsonl", collection.sourceSamples.trainingJsonl]
    ];
    const sampleDescriptions = [];
    for (const [category, samples] of samplesBySource) {
      for (const sample of samples.slice(0, 2)) {
        sampleDescriptions.push(
          [
            category,
            sample.trajectoryId ?? sample.scenarioId ?? sample.title,
            sample.task ?? sample.sourceKind ?? sample.schema ?? "sample",
            `input:${compactCollectionIndexValue(sample.input) || "n/a"}`,
            `output:${compactCollectionIndexValue(sample.output) || "n/a"}`
          ].join(" ")
        );
      }
    }
    const sourceSampleSummary =
      sampleDescriptions.length > 0 ? sampleDescriptions.join(" | ") : "none";

    const gapSummary =
      collection.readinessGaps.length > 0
        ? collection.readinessGaps.slice(0, 4).map(describeGap).join(" | ")
        : "none";

    const benchmarkSummary = `pairs:${collection.benchmarks.actionBenchmarkPairs} comparisons:${collection.benchmarks.benchmarkComparisons} cases:${collection.benchmarks.caseSamples} tiers:${orNone(collection.benchmarks.tiers.join(","))}`;

    const progress = collection.benchmarks.baselineProgress;
    const baselineSummary = `established:${orNone(progress.establishedTiers.join(","))} next:${progress.nextTier ?? "none"} remaining:${orNone(progress.remainingTiers.join(","))}`;

    const benchmarkHighlights =
      collection.benchmarks.comparisonInventory.length > 0
        ? collection.benchmarks.comparisonInventory
            .slice(0, 3)
            .map((comparison) =>
              [
                comparison.tier ?? "tier",
                comparison.benchmark ?? "benchmark",
                `base:${comparison.baseScore ?? "n/a"}`,
                `trained:${comparison.trainedScore ?? "n/a"}`,
                `reference:${comparison.referenceScore ?? "n/a"}`,
                `improvement:${comparison.improvementPercent ?? "n/a"}%`,
                `vs-reference:${comparison.trainedVsReferencePercent ?? "n/a"}%`,
                comparisonMode(comparison)
              ].join(" ")
            )
            .join(" | ")
        : "none";
    const benchmarkLinks = evidenceLinks(
      collection,
      "benchmark",
      "no benchmark artifacts"
    );

    const evalSummary = `evals:${collection.evals.evalArtifacts} comparisons:${collection.evals.evalComparisons} action:${collection.evals.actionBenchmarks} matrices:${collection.evals.benchmarkMatrices}`;
    const evalLinks = evidenceLinks(collection, "eval", "no eval artifacts");

    const modelSummary = `runs:${collection.training.trainingRuns} models:${collection.training.models} inventory:${collection.training.modelInventory.length} tracked:${collection.coverage.models.inventoryCount}`;
    const modelHighlights =
      collection.training.modelInventory.length > 0
        ? collection.training.modelInventory
            .slice(0, 3)
            .map((model) =>
              [
                model.tier ?? "tier",
                model.variant ?? "variant",
                model.model ?? "model",
                `base:${model.baseModel ?? "n/a"}`,
                `output:${model.outputPath ?? "n/a"}`,
                `improvement:${model.evalImprovementPercent ?? "n/a"}%`
              ].join(" ")
            )
            .join(" | ")
        : "none";
    const modelLinks = evidenceLinks(collection, "model", "no model artifacts");

    const coverageSummary = `samples:${collection.coverage.readableSamples.total} scored-evals:${collection.coverage.evals.scoredComparisons}/${collection.coverage.evals.comparisons} scored-bench:${collection.coverage.benchmarks.scoredComparisons}/${collection.coverage.benchmarks.comparisons} all-tiers:${collection.coverage.benchmarks.allEliza1TiersCovered ? "yes" : "no"}`;

    const cells = [
      escapeHtml(collection.generatedAt),
      `${escapeHtml(collection.readinessStatus)}<br><span>${escapeHtml(`ready:${collection.readiness.ready} partial:${collection.readiness.partial} missing:${collection.readiness.missing}`)}</span>`,
      escapeHtml(gapSummary),
      `${escapeHtml(sourceSummary)}<br><span>${escapeHtml(sourceSampleSummary)}</span><br>${sourceLinks}`,
      `${escapeHtml(`${benchmarkSummary} ${baselineSummary}`)}<br><span>${escapeHtml(benchmarkHighlights)}</span><br>${benchmarkLinks}`,
      `${escapeHtml(evalSummary)}<br>${evalLinks}`,
      `${escapeHtml(modelSummary)}<br><span>${escapeHtml(modelHighlights)}</span><br>${modelLinks}`,
      escapeHtml(coverageSummary),
      escapeHtml(collection.artifactCount),
      `<a href="${escapeHtml(fileHref(collection.analysisIndexHtmlPath))}">viewer</a> <a href="${escapeHtml(fileHref(collection.readmePath))}">readme</a> <a href="${escapeHtml(fileHref(collection.manifestPath))}">manifest</a>`,
      `<code>${escapeHtml(collection.outputDir)}</code>`
    ];
    return ["<tr>", ...cells.map((cell) => `<td>${cell}</td>`), "</tr>"].join("\n");
  };

  const rows = index.collections
    .map((collection) => renderRow(collection))
    .join("\n");

  const columnHeadings = [
    "Generated",
    "Readiness",
    "Gaps",
    "Sources",
    "Benchmarks",
    "Evals",
    "Models",
    "Coverage",
    "Artifacts",
    "Links",
    "Output"
  ];

  const pageLines = [
    "<!doctype html>",
    '<html lang="en">',
    "<head>",
    '<meta charset="utf-8">',
    '<meta name="viewport" content="width=device-width, initial-scale=1">',
    "<title>Eliza Training Collections</title>",
    "<style>",
    'body { font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; margin: 0; color: #111827; background: #f8fafc; }',
    "main { max-width: 1200px; margin: 0 auto; padding: 32px 20px; }",
    "h1 { margin: 0 0 6px; font-size: 28px; }",
    ".meta { color: #64748b; font-size: 13px; margin-bottom: 20px; }",
    "table { width: 100%; border-collapse: collapse; background: #fff; border: 1px solid #dbe3ef; }",
    "th, td { border-bottom: 1px solid #e5edf7; padding: 10px; text-align: left; vertical-align: top; font-size: 13px; }",
    "th { background: #eef3f8; font-size: 11px; text-transform: uppercase; letter-spacing: .08em; color: #475569; }",
    "tr:hover { background: #f8fbff; }",
    "code { font-family: ui-monospace, SFMono-Regular, Menlo, monospace; font-size: 12px; word-break: break-all; }",
    "a { color: #0f766e; margin-right: 8px; }",
    "span { color: #64748b; }",
    "</style>",
    "</head>",
    "<body>",
    "<main>",
    "<h1>Eliza Training Collections</h1>",
    `<div class="meta">Generated ${escapeHtml(index.generatedAt)} \xB7 root <code>${escapeHtml(index.root)}</code> \xB7 ${index.collections.length} runs \xB7 <a href="${escapeHtml(fileHref(index.indexJsonPath))}">JSON index</a></div>`,
    "<table>",
    "<thead>",
    "<tr>",
    ...columnHeadings.map((heading) => `<th>${heading}</th>`),
    "</tr>",
    "</thead>",
    "<tbody>",
    rows || '<tr><td colspan="11">No collection runs found.</td></tr>',
    "</tbody>",
    "</table>",
    "</main>",
    "</body>",
    "</html>",
    ""
  ];
  return pageLines.join("\n");
}
1580
/**
 * List the collection runs and write both index files (JSON and HTML) into
 * the collections root, creating the directory if needed.
 *
 * @param {object} [options] - Passed through to `listTrainingCollections`;
 *   `options.generatedAt` overrides the timestamp (defaults to now, ISO 8601).
 * @returns {Promise<object>} The index record that was written.
 */
async function writeTrainingCollectionIndex(options = {}) {
  const { root, indexJsonPath, indexHtmlPath, collections } =
    await listTrainingCollections(options);
  await mkdir(root, { recursive: true });

  const index = {
    schema: TRAINING_COLLECTION_INDEX_SCHEMA,
    schemaVersion: TRAINING_COLLECTION_INDEX_VERSION,
    generatedAt: options.generatedAt ?? (/* @__PURE__ */ new Date()).toISOString(),
    root,
    indexJsonPath,
    indexHtmlPath,
    collections
  };

  // JSON first, then HTML; the writes stay sequential.
  const serialized = `${JSON.stringify(index, null, 2)}\n`;
  await writeFile(indexJsonPath, serialized, "utf8");
  await writeFile(indexHtmlPath, buildCollectionIndexHtml(index), "utf8");
  return index;
}
1601
+ function buildCollectionReadme(manifest) {
1602
+ const evidence = manifest.evidence;
1603
+ const sourceSamples = evidence.sourceSamples;
1604
+ const coverage = collectionCoverage(evidence);
1605
+ const sampleCounts = [
1606
+ ["Hugging Face", sourceSamples.huggingFace.length],
1607
+ ["Feed", sourceSamples.feed.length],
1608
+ ["Natural", sourceSamples.natural.length],
1609
+ ["Scenarios", sourceSamples.scenarios.length],
1610
+ ["Tests", sourceSamples.tests.length],
1611
+ ["Training JSONL", sourceSamples.trainingJsonl.length]
1612
+ ];
1613
+ const sampleRows = [
1614
+ ["Hugging Face", sourceSamples.huggingFace],
1615
+ ["Feed", sourceSamples.feed],
1616
+ ["Natural", sourceSamples.natural],
1617
+ ["Scenarios", sourceSamples.scenarios],
1618
+ ["Tests", sourceSamples.tests],
1619
+ ["Training JSONL", sourceSamples.trainingJsonl]
1620
+ ].flatMap(
1621
+ ([source, samples]) => samples.slice(0, 6).map((sample) => [
1622
+ source,
1623
+ sample.title,
1624
+ sample.task ?? sample.scenarioId ?? sample.sourceKind,
1625
+ sample.trajectoryId,
1626
+ sample.model,
1627
+ sample.input,
1628
+ sample.output,
1629
+ markdownPathLink(sample.path)
1630
+ ])
1631
+ );
1632
+ const benchmarkReadiness = evidence.benchmarkReadiness;
1633
+ const baselineProgress = evidence.benchmarks.baselineProgress;
1634
+ const preflightRows = evidence.preflight.checks.map((check) => [
1635
+ check.status,
1636
+ check.id,
1637
+ check.label,
1638
+ check.detail,
1639
+ markdownPathLink(check.path)
1640
+ ]);
1641
+ const modelRows = evidence.training.modelInventory.slice(0, 12).map((model) => [
1642
+ model.tier,
1643
+ model.variant,
1644
+ model.model,
1645
+ model.baseModel,
1646
+ model.baseEvalScore,
1647
+ model.trainedEvalScore,
1648
+ markdownPathLink(model.outputPath),
1649
+ model.evalImprovementPercent
1650
+ ]);
1651
+ const comparisonRows = evidence.benchmarks.comparisonInventory.slice(0, 12).map((comparison) => [
1652
+ comparison.tier,
1653
+ comparison.benchmark,
1654
+ comparison.baseScore,
1655
+ comparison.trainedScore,
1656
+ comparison.referenceScore,
1657
+ comparison.improvementPercent,
1658
+ comparison.trainedVsReferencePercent,
1659
+ comparison.dryRun ? "dry-run" : comparison.modelBacked ? "model-backed" : comparison.useMocks ? "mocked" : "unverified"
1660
+ ]);
1661
+ const evalComparisonRows = evidence.evals.comparisonInventory.slice(0, 12).map((comparison) => [
1662
+ comparison.baseModel,
1663
+ comparison.trainedModel,
1664
+ comparison.backend,
1665
+ comparison.baseScore,
1666
+ comparison.trainedScore,
1667
+ comparison.improvementPercent,
1668
+ comparison.baseLatencyMs,
1669
+ comparison.trainedLatencyMs,
1670
+ markdownPathLink(comparison.reportPath)
1671
+ ]);
1672
+ const caseRows = evidence.benchmarks.caseSamples.slice(0, 12).map((sample) => [
1673
+ sample.tier,
1674
+ sample.variant,
1675
+ sample.caseId,
1676
+ sample.pass,
1677
+ sample.prompt,
1678
+ sample.expectedAction,
1679
+ sample.actualAction,
1680
+ markdownPathLink(sample.trajectoryPath)
1681
+ ]);
1682
+ const gapRows = evidence.readinessGaps.map((gap) => [
1683
+ gap.status,
1684
+ gap.id,
1685
+ gap.note,
1686
+ gap.recommendedCapability,
1687
+ gap.recommendedParams
1688
+ ]);
1689
+ const artifactRows = evidence.artifactLinks.slice(0, 24).map((artifact) => [
1690
+ artifact.category,
1691
+ artifact.kind,
1692
+ artifact.schema,
1693
+ artifact.title,
1694
+ markdownPathLink(artifact.path)
1695
+ ]);
1696
+ const stepArtifactRows = evidence.stepArtifacts.flatMap((step) => {
1697
+ const command = step.command?.join(" ") ?? null;
1698
+ if (step.paths.length === 0) {
1699
+ return [
1700
+ [
1701
+ step.stepId,
1702
+ step.status,
1703
+ command,
1704
+ step.exitCode,
1705
+ step.stdout,
1706
+ step.stderr,
1707
+ "n/a",
1708
+ markdownPathLink(step.outputDir)
1709
+ ]
1710
+ ];
1711
+ }
1712
+ return step.paths.slice(0, 8).map((path) => [
1713
+ step.stepId,
1714
+ step.status,
1715
+ command,
1716
+ step.exitCode,
1717
+ step.stdout,
1718
+ step.stderr,
1719
+ path.label,
1720
+ markdownPathLink(path.path)
1721
+ ]);
1722
+ });
1723
+ return `# Eliza Training Collection
1724
+
1725
+ Generated: ${manifest.generatedAt}
1726
+
1727
+ ## Entry Points
1728
+
1729
+ - Output directory: ${markdownPathLink(manifest.outputDir)}
1730
+ - Collection manifest: ${markdownPathLink(manifest.manifestPath)}
1731
+ - Run summary: ${markdownPathLink(manifest.readmePath)}
1732
+ - Analysis viewer: ${markdownPathLink(manifest.analysis.indexHtmlPath)}
1733
+ - Analysis manifest: ${markdownPathLink(manifest.analysis.manifestPath)}
1734
+ - Readiness report: ${markdownPathLink(manifest.readiness.reportPath)}
1735
+
1736
+ ## Provenance
1737
+
1738
+ - Generated by: ${manifest.provenance.generatedBy}
1739
+ - Workspace root: ${manifest.provenance.workspaceRoot ?? "n/a"}
1740
+ - Training state root: ${manifest.provenance.trainingStateRoot}
1741
+ - Analysis roots: ${manifest.provenance.analysisRoots.join(", ")}
1742
+ - Output layout: collection=${manifest.provenance.outputLayout.collection} analysis=${manifest.provenance.outputLayout.analysis} steps=${manifest.provenance.outputLayout.steps}
1743
+
1744
+ ## Readiness
1745
+
1746
+ - Status: ${manifest.readiness.status}
1747
+ - Checks: ready=${manifest.readiness.ready} partial=${manifest.readiness.partial} missing=${manifest.readiness.missing}
1748
+ - Benchmark readiness: smallest=${benchmarkReadiness.smallestTier} all-tiers=${benchmarkReadiness.allEliza1Tiers} improvement=${benchmarkReadiness.baseTrainedImprovement} all-tier-improvements=${benchmarkReadiness.allEliza1TierImprovements} cerebras=${benchmarkReadiness.cerebrasReference}
1749
+
1750
+ ## Live Preflight
1751
+
1752
+ - Live work requested: ${evidence.preflight.liveRequired ? "yes" : "no"}
1753
+
1754
+ ${markdownTable(["Status", "Check", "Label", "Detail", "Path"], preflightRows)}
1755
+
1756
+ ## Coverage
1757
+
1758
+ - Data sources: hf=${coverage.dataSources.huggingFace} feed=${coverage.dataSources.feed} natural=${coverage.dataSources.natural} scenarios=${coverage.dataSources.scenarios} tests=${coverage.dataSources.tests} jsonl=${coverage.dataSources.trainingJsonl}
1759
+ - Readable samples: total=${coverage.readableSamples.total} hf=${coverage.readableSamples.huggingFace} feed=${coverage.readableSamples.feed} natural=${coverage.readableSamples.natural} scenarios=${coverage.readableSamples.scenarios} tests=${coverage.readableSamples.tests} jsonl=${coverage.readableSamples.trainingJsonl}
1760
+ - Eval comparisons: scored=${coverage.evals.scoredComparisons}/${coverage.evals.comparisons} artifacts=${coverage.evals.artifacts}
1761
+ - Benchmark comparisons: scored=${coverage.benchmarks.scoredComparisons}/${coverage.benchmarks.comparisons} matrices=${coverage.benchmarks.matrices} case-samples=${coverage.benchmarks.caseSamples} all-tiers=${coverage.benchmarks.allEliza1TiersCovered ? "yes" : "no"}
1762
+ - Benchmark tiers: ${coverage.benchmarks.tiers.join(", ") || "none"}
1763
+ - Model inventory: artifacts=${coverage.models.artifacts} inventory=${coverage.models.inventoryCount} staged-bundles=${coverage.models.stagedBundles}
1764
+
1765
+ ## Baseline Progression
1766
+
1767
+ - Tier order: ${baselineProgress.tierOrder.join(" -> ")}
1768
+ - Established tiers: ${baselineProgress.establishedTiers.join(", ") || "none"}
1769
+ - Remaining tiers: ${baselineProgress.remainingTiers.join(", ") || "none"}
1770
+ - Next tier: ${baselineProgress.nextTier ?? "none"}
1771
+ - Smallest tier established: ${baselineProgress.smallestTierEstablished ? "yes" : "no"}
1772
+ - All tiers established: ${baselineProgress.allTiersEstablished ? "yes" : "no"}
1773
+
1774
+ ## Steps
1775
+
1776
+ ${markdownTable(
1777
+ ["Step", "Status", "Output", "Error"],
1778
+ manifest.steps.map((step) => [
1779
+ step.id,
1780
+ step.status,
1781
+ markdownPathLink(step.outputDir),
1782
+ step.error
1783
+ ])
1784
+ )}
1785
+ ## Step Artifacts
1786
+
1787
+ ${markdownTable(
1788
+ [
1789
+ "Step",
1790
+ "Status",
1791
+ "Command",
1792
+ "Exit",
1793
+ "Stdout",
1794
+ "Stderr",
1795
+ "Path Label",
1796
+ "Path"
1797
+ ],
1798
+ stepArtifactRows
1799
+ )}
1800
+ ## Data Sources
1801
+
1802
+ ${markdownTable(
1803
+ ["Source", "Count"],
1804
+ [
1805
+ ["Hugging Face datasets", evidence.dataSources.huggingFaceDatasets],
1806
+ ["Feed datasets", evidence.dataSources.feedDatasets],
1807
+ [
1808
+ "Natural trajectory bundles",
1809
+ evidence.dataSources.naturalTrajectoryBundles
1810
+ ],
1811
+ ["Scenario runs", evidence.dataSources.scenarioRuns],
1812
+ ["Scenario native datasets", evidence.dataSources.scenarioNativeDatasets],
1813
+ ["Test trajectories", evidence.dataSources.testTrajectories],
1814
+ ["Training JSONL datasets", evidence.dataSources.trainingJsonlDatasets]
1815
+ ]
1816
+ )}
1817
+ ## Source Samples
1818
+
1819
+ ${markdownTable(["Source", "Samples"], sampleCounts)}
1820
+ ## Source Sample Preview
1821
+
1822
+ ${markdownTable(
1823
+ ["Source", "Title", "Task", "Trajectory", "Model", "Input", "Output", "Path"],
1824
+ sampleRows
1825
+ )}
1826
+ ## Model Inventory
1827
+
1828
+ ${markdownTable(
1829
+ [
1830
+ "Tier",
1831
+ "Variant",
1832
+ "Model",
1833
+ "Base Model",
1834
+ "Base Score",
1835
+ "Trained Score",
1836
+ "Output",
1837
+ "Eval Improvement %"
1838
+ ],
1839
+ modelRows
1840
+ )}
1841
+ ## Benchmark Comparisons
1842
+
1843
+ ${markdownTable(
1844
+ [
1845
+ "Tier",
1846
+ "Benchmark",
1847
+ "Base",
1848
+ "Trained",
1849
+ "Reference",
1850
+ "Improvement %",
1851
+ "Vs Reference %",
1852
+ "Evidence"
1853
+ ],
1854
+ comparisonRows
1855
+ )}
1856
+ ## Eval Comparisons
1857
+
1858
+ ${markdownTable(
1859
+ [
1860
+ "Base Model",
1861
+ "Trained Model",
1862
+ "Backend",
1863
+ "Base Score",
1864
+ "Trained Score",
1865
+ "Improvement %",
1866
+ "Base Latency",
1867
+ "Trained Latency",
1868
+ "Report"
1869
+ ],
1870
+ evalComparisonRows
1871
+ )}
1872
+ ## Benchmark Case Samples
1873
+
1874
+ ${markdownTable(
1875
+ [
1876
+ "Tier",
1877
+ "Variant",
1878
+ "Case",
1879
+ "Pass",
1880
+ "Input",
1881
+ "Expected",
1882
+ "Actual",
1883
+ "Trajectory"
1884
+ ],
1885
+ caseRows
1886
+ )}
1887
+ ## Readiness Gaps
1888
+
1889
+ ${markdownTable(["Status", "Check", "Note", "Recommended Capability", "Recommended Params"], gapRows)}
1890
+ ## Evidence Artifacts
1891
+
1892
+ ${markdownTable(["Category", "Kind", "Schema", "Title", "Path"], artifactRows)}
1893
+ `;
1894
+ }
1895
/**
 * Renders the README for a collection run and writes it next to the manifest.
 *
 * @param {object} manifest - Collection manifest; its `outputDir` decides where
 *   the file goes and the whole object is handed to `buildCollectionReadme`.
 * @returns {Promise<string>} Path of the `README.md` that was written.
 */
async function writeCollectionReadme(manifest) {
  const target = join(manifest.outputDir, "README.md");
  const markdown = buildCollectionReadme(manifest);
  await writeFile(target, markdown, "utf8");
  return target;
}
1900
/**
 * Executes one collection step and converts its outcome into a step record.
 *
 * @param {string} id - Step identifier; also used to derive the step directory.
 * @param {boolean} enabled - When falsy the step is not run and is recorded as "skipped".
 * @param {string} outputDir - Collection output directory passed to `stepOutputDir`.
 * @param {(dir: string) => any} run - Step body; receives the step directory and is awaited.
 * @returns {Promise<{id: string, status: string, outputDir: (string|null), error: (string|null), result: any}>}
 *   A record with status "skipped", "succeeded" or "failed". This function does
 *   not rethrow: anything thrown by `run` is reported through `error`.
 */
async function runStep(id, enabled, outputDir, run) {
  // Single place that fixes the key order of every record this function emits.
  const record = (status, stepDir, error, result) => ({
    id,
    status,
    outputDir: stepDir,
    error,
    result
  });

  if (!enabled) {
    return record("skipped", null, null, null);
  }

  const dir = stepOutputDir(outputDir, id);
  try {
    const result = await run(dir);
    // A step may report its own output directory; otherwise the derived one is used.
    // Reading `result.outputDir` stays inside the try so a nullish result is
    // reported as a failed step rather than escaping as an exception.
    return record("succeeded", result.outputDir ?? dir, null, result);
  } catch (err) {
    let message;
    if (err instanceof Error) {
      message = err.message;
    } else {
      message = String(err);
    }
    return record("failed", dir, message, null);
  }
}
1930
/**
 * Runs the training collection pipeline: executes each enabled step in a fixed
 * order, indexes the resulting artifacts, and writes the collection manifest,
 * readiness report, README and collection index.
 *
 * Steps run one after another through `runStep`, which records a throwing step
 * as "failed" instead of aborting the collection.
 *
 * The analysis index is built three times and the manifest is written twice.
 * NOTE(review): presumably the later passes exist so the index can pick up the
 * manifest and readiness report written in between — confirm against
 * `buildTrainingAnalysisIndex`.
 *
 * @param {object} [options] - Collection options. Read here: `now`, `outputDir`,
 *   `workspaceRoot`, the `include*` step toggles, the per-step override objects
 *   (`huggingFace`, `feed`, `naturalTrajectories`, `testTrajectories`,
 *   `scenarios`, `evalComparison`, `benchmarkVsCerebras`, `eliza1BundleStage`,
 *   `benchmarkMatrix`) and `analysis`.
 * @returns {Promise<{outputDir: string, manifestPath: string, readmePath: string, collectionIndex: any, manifest: object, analysis: any}>}
 */
async function runTrainingCollection(options = {}) {
  const startedAt = options.now?.() ?? /* @__PURE__ */ new Date();
  const generatedAt = startedAt.toISOString();
  const stateRoot = trainingStateRoot();
  const outputDir = options.outputDir ?? join(stateRoot, "collections", safeTimestamp(generatedAt));

  let workspaceRoot;
  if (options.workspaceRoot) {
    workspaceRoot = resolve(options.workspaceRoot);
  } else {
    workspaceRoot = discoverWorkspaceRoot();
  }
  const trainingRoot = workspaceRoot ? join(workspaceRoot, "packages", "training") : undefined;

  await mkdir(outputDir, { recursive: true });

  const steps = [];

  // Each entry: [step id, name of the options toggle, default for the toggle, step body].
  // Toggles and per-step overrides are read from `options` only when the step
  // is reached, matching the sequential evaluation of the pipeline.
  const stepPlan = [
    [
      "huggingface",
      "includeHuggingFace",
      true,
      (dir) => {
        const overrides = options.huggingFace;
        return ingestHuggingFaceDataset({
          ...overrides,
          outputDir: overrides?.outputDir ?? dir
        });
      }
    ],
    [
      "feed",
      "includeFeed",
      true,
      (dir) => {
        const overrides = options.feed;
        return runFeedGeneration({
          ...overrides,
          workspaceRoot: overrides?.workspaceRoot ?? workspaceRoot,
          outputDir: overrides?.outputDir ?? dir
        });
      }
    ],
    [
      "natural_trajectories",
      "includeNaturalTrajectories",
      false,
      (dir) => {
        const overrides = options.naturalTrajectories;
        return buildTrajectoryExportBundle({
          ...overrides,
          outputDir: overrides?.outputDir ?? dir,
          source: {
            kind: "training_collection_natural_trajectories",
            ...overrides?.source
          }
        });
      }
    ],
    [
      "test_trajectories",
      "includeTestTrajectories",
      false,
      (dir) => {
        const overrides = options.testTrajectories;
        return collectTestTrajectories({
          ...overrides,
          workspaceRoot: overrides?.workspaceRoot ?? workspaceRoot,
          outputDir: overrides?.outputDir ?? dir,
          generatedAt: overrides?.generatedAt ?? generatedAt,
          syntheticFallback: overrides?.syntheticFallback ?? true
        });
      }
    ],
    [
      "scenarios",
      "includeScenarios",
      true,
      (dir) => {
        const overrides = options.scenarios;
        return runScenarios({
          ...overrides,
          workspaceRoot: overrides?.workspaceRoot ?? workspaceRoot,
          outputDir: overrides?.outputDir ?? dir
        });
      }
    ],
    [
      "eval_comparison",
      "includeEvalComparison",
      false,
      (dir) => {
        const overrides = options.evalComparison;
        return runLocalEvalComparison({
          ...overrides,
          trainingRoot: overrides?.trainingRoot ?? trainingRoot,
          outputDir: overrides?.outputDir ?? dir
        });
      }
    ],
    [
      "action_benchmark",
      "includeActionBenchmark",
      true,
      (dir) => runActionBenchmarkCollectionStep({ outputDir: dir, workspaceRoot, options })
    ],
    [
      "benchmark_vs_cerebras",
      "includeBenchmarkVsCerebras",
      false,
      (dir) => {
        const overrides = options.benchmarkVsCerebras;
        return runBenchmarkVsCerebras({
          ...overrides,
          trainingRoot: overrides?.trainingRoot ?? trainingRoot,
          outputDir: overrides?.outputDir ?? dir,
          matrixOutputDir: overrides?.matrixOutputDir ?? join(dir, "matrix")
        });
      }
    ],
    [
      "eliza1_model_registry",
      "includeEliza1ModelRegistry",
      true,
      (dir) => writeEliza1ModelRegistryArtifacts({ outputDir: dir, generatedAt })
    ],
    [
      "eliza1_bundle_stage",
      "includeEliza1BundleStage",
      false,
      (dir) => {
        const overrides = options.eliza1BundleStage;
        return stageEliza1Bundle({
          ...overrides,
          trainingRoot: overrides?.trainingRoot ?? trainingRoot,
          outputDir: overrides?.outputDir ?? dir
        });
      }
    ],
    [
      "benchmark_matrix",
      "includeBenchmarkMatrix",
      true,
      async (dir) => {
        const overrides = options.benchmarkMatrix;
        // `steps` already holds the records of every earlier step at this point.
        const artifacts = autoBenchmarkMatrixSources(steps, overrides?.artifacts);
        if (artifacts.length === 0) {
          throw new Error(
            "No benchmark artifacts available for benchmark matrix generation"
          );
        }
        return writeBenchmarkMatrixArtifactFromArtifacts({
          artifacts,
          outputDir: overrides?.outputDir ?? dir,
          generatedAt: overrides?.generatedAt ?? generatedAt,
          referenceModelId: overrides?.referenceModelId,
          source: overrides?.source ?? {
            kind: "training_collection_benchmark_matrix",
            collectionOutputDir: outputDir
          }
        });
      }
    ]
  ];

  for (const [id, toggleName, enabledByDefault, body] of stepPlan) {
    steps.push(
      await runStep(
        id,
        boolWithDefault(options[toggleName], enabledByDefault),
        outputDir,
        body
      )
    );
  }

  const analysisRoots = [outputDir, ...(options.analysis?.roots ?? [])];
  const resolveAnalysisDir = () => options.analysis?.outputDir ?? join(outputDir, "analysis");
  // Builds the analysis index with a fresh options object on every call.
  const indexArtifacts = () =>
    buildTrainingAnalysisIndex({
      ...options.analysis,
      roots: analysisRoots,
      outputDir: resolveAnalysisDir()
    });
  // Projection of an analysis result stored under `manifest.analysis`.
  const describeAnalysis = (index) => ({
    outputDir: index.outputDir,
    indexHtmlPath: index.indexHtmlPath,
    manifestPath: index.manifestPath,
    artifactCount: index.manifest.counts.artifacts
  });

  let analysis = await indexArtifacts();

  const manifestPath = join(outputDir, "collection-manifest.json");
  const readmePath = join(outputDir, "README.md");
  const preflight = await buildTrainingCollectionPreflightWithProbes({
    options,
    workspaceRoot,
    trainingRoot
  });

  const provenance = {
    generatedBy: "plugin-training",
    workspaceRoot: workspaceRoot ?? null,
    trainingStateRoot: stateRoot,
    analysisRoots,
    outputLayout: {
      collection: outputDir,
      analysis: resolveAnalysisDir(),
      steps: outputDir
    }
  };
  const recipe = buildCollectionRecipe(options);

  // Placeholder readiness section; replaced once the readiness report is written.
  // It always points below `outputDir` and ignores `options.analysis.outputDir`.
  const placeholderReadiness = {
    outputDir: join(outputDir, "analysis"),
    reportPath: join(outputDir, "analysis", "training-readiness-report.json"),
    status: "missing",
    ready: 0,
    partial: 0,
    missing: 0
  };

  // Placeholder evidence section; replaced by `buildCollectionEvidenceSummary` below.
  const indexedCoverage = analysis.manifest.coverage;
  const placeholderEvidence = {
    preflight,
    viewerHtmlPath: analysis.indexHtmlPath,
    analysisManifestPath: analysis.manifestPath,
    readinessReportPath: join(
      outputDir,
      "analysis",
      "training-readiness-report.json"
    ),
    artifactCounts: analysis.manifest.counts,
    coverage: {
      dataSources: indexedCoverage.dataSources,
      readableSamples: indexedCoverage.readableSamples,
      evals: indexedCoverage.evals,
      benchmarks: indexedCoverage.benchmarks,
      models: {
        artifacts: indexedCoverage.models.artifacts,
        stagedBundles: indexedCoverage.models.stagedBundles,
        inventoryCount: indexedCoverage.models.inventory.length
      }
    },
    stepCounts: summarizeStepCounts(steps),
    stepArtifacts: summarizeStepArtifacts(steps),
    dataSources: {
      huggingFaceDatasets: 0,
      feedDatasets: 0,
      naturalTrajectoryBundles: 0,
      scenarioRuns: 0,
      scenarioNativeDatasets: 0,
      testTrajectories: 0,
      trainingJsonlDatasets: 0
    },
    feed: { runs: [], archetypeStats: [], trajectorySamples: [] },
    sourceSamples: {
      huggingFace: [],
      feed: [],
      natural: [],
      scenarios: [],
      tests: [],
      trainingJsonl: []
    },
    training: { trainingRuns: 0, models: 0, modelInventory: [] },
    evals: {
      evalArtifacts: 0,
      actionBenchmarks: 0,
      evalComparisons: 0,
      benchmarkMatrices: 0,
      comparisonInventory: []
    },
    artifactLinks: [],
    benchmarks: {
      actionBenchmarkPairs: 0,
      actionBenchmarkMatrixSources: 0,
      benchmarkRows: 0,
      benchmarkComparisons: 0,
      tiers: [],
      comparisonInventory: [],
      improvementComparisons: [],
      baselineProgress: {
        tierOrder: [...ELIZA_ONE_BENCHMARK_TIERS],
        establishedTiers: [],
        remainingTiers: [...ELIZA_ONE_BENCHMARK_TIERS],
        nextTier: ELIZA_ONE_BENCHMARK_TIERS[0] ?? null,
        smallestTierEstablished: false,
        allTiersEstablished: false
      },
      caseSamples: []
    },
    benchmarkReadiness: {
      smallestTier: "missing",
      allEliza1Tiers: "missing",
      allEliza1TierImprovements: "missing",
      cerebrasReference: "missing",
      baseTrainedImprovement: "missing"
    },
    readinessGaps: []
  };

  const manifest = {
    schema: TRAINING_COLLECTION_RUN_SCHEMA,
    schemaVersion: TRAINING_COLLECTION_RUN_VERSION,
    generatedAt,
    outputDir,
    manifestPath,
    readmePath,
    provenance,
    recipe,
    analysis: describeAnalysis(analysis),
    readiness: placeholderReadiness,
    evidence: placeholderEvidence,
    steps
  };

  // Serializes the current manifest state as pretty-printed JSON plus a trailing newline.
  const persistManifest = () =>
    writeFile(manifestPath, `${JSON.stringify(manifest, null, 2)}\n`, "utf8");
  // Rebuilds the evidence section from the current `analysis`, carrying the
  // preflight of whatever evidence section the manifest holds at call time.
  const summarizeEvidence = (report) =>
    buildCollectionEvidenceSummary({
      analysis,
      readiness: report,
      steps,
      preflight: manifest.evidence.preflight
    });

  await persistManifest();

  // Second index pass, run after the manifest file exists on disk.
  analysis = await indexArtifacts();
  manifest.analysis = describeAnalysis(analysis);

  const readiness = await writeTrainingReadinessReport(analysis, {
    outputDir: analysis.outputDir,
    generatedAt
  });
  const readinessCounts = readiness.report.counts;
  manifest.readiness = {
    outputDir: readiness.outputDir,
    reportPath: readiness.reportPath,
    status: readiness.report.status,
    ready: readinessCounts.ready,
    partial: readinessCounts.partial,
    missing: readinessCounts.missing
  };
  manifest.evidence = summarizeEvidence(readiness.report);

  // Third index pass, run after the readiness report has been written.
  analysis = await indexArtifacts();
  manifest.analysis = describeAnalysis(analysis);
  manifest.evidence = summarizeEvidence(readiness.report);

  await persistManifest();
  await writeCollectionReadme(manifest);

  const collectionIndex = await writeTrainingCollectionIndex({
    root: dirname(outputDir),
    generatedAt
  });

  return {
    outputDir,
    manifestPath,
    readmePath,
    collectionIndex,
    manifest,
    analysis
  };
}
2287
+ export {
2288
+ ELIZA1_MODEL_REGISTRY_ENTRY_SCHEMA,
2289
+ TRAINING_COLLECTION_INDEX_SCHEMA,
2290
+ TRAINING_COLLECTION_INDEX_VERSION,
2291
+ TRAINING_COLLECTION_RUN_SCHEMA,
2292
+ TRAINING_COLLECTION_RUN_VERSION,
2293
+ buildTrainingCollectionPreflight,
2294
+ buildTrainingCollectionPreflightWithProbes,
2295
+ listTrainingCollections,
2296
+ runTrainingCollection,
2297
+ writeTrainingCollectionIndex
2298
+ };
2299
+ //# sourceMappingURL=training-collection-runner.js.map