@elizaos/plugin-training 2.0.3-beta.5 → 2.0.3-beta.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (363) hide show
  1. package/dist/backends/native.d.ts +96 -0
  2. package/dist/backends/native.d.ts.map +1 -0
  3. package/dist/backends/native.js +308 -0
  4. package/dist/backends/native.js.map +1 -0
  5. package/dist/cli/train.d.ts +22 -0
  6. package/dist/cli/train.d.ts.map +1 -0
  7. package/dist/cli/train.js +219 -0
  8. package/dist/cli/train.js.map +1 -0
  9. package/dist/core/action-benchmark-runner.d.ts +55 -0
  10. package/dist/core/action-benchmark-runner.d.ts.map +1 -0
  11. package/dist/core/action-benchmark-runner.js +341 -0
  12. package/dist/core/action-benchmark-runner.js.map +1 -0
  13. package/dist/core/artifact-store.d.ts +72 -0
  14. package/dist/core/artifact-store.d.ts.map +1 -0
  15. package/dist/core/artifact-store.js +50 -0
  16. package/dist/core/artifact-store.js.map +1 -0
  17. package/dist/core/benchmark-matrix-artifact.d.ts +102 -0
  18. package/dist/core/benchmark-matrix-artifact.d.ts.map +1 -0
  19. package/dist/core/benchmark-matrix-artifact.js +381 -0
  20. package/dist/core/benchmark-matrix-artifact.js.map +1 -0
  21. package/dist/core/benchmark-vs-cerebras-runner.d.ts +37 -0
  22. package/dist/core/benchmark-vs-cerebras-runner.d.ts.map +1 -0
  23. package/dist/core/benchmark-vs-cerebras-runner.js +151 -0
  24. package/dist/core/benchmark-vs-cerebras-runner.js.map +1 -0
  25. package/dist/core/cerebras-eval-model.d.ts +54 -0
  26. package/dist/core/cerebras-eval-model.d.ts.map +1 -0
  27. package/dist/core/cerebras-eval-model.js +249 -0
  28. package/dist/core/cerebras-eval-model.js.map +1 -0
  29. package/dist/core/cli.d.ts +15 -0
  30. package/dist/core/cli.d.ts.map +1 -0
  31. package/dist/core/cli.js +1003 -0
  32. package/dist/core/cli.js.map +1 -0
  33. package/dist/core/context-audit.d.ts +51 -0
  34. package/dist/core/context-audit.d.ts.map +1 -0
  35. package/dist/core/context-audit.js +166 -0
  36. package/dist/core/context-audit.js.map +1 -0
  37. package/dist/core/context-catalog.d.ts +47 -0
  38. package/dist/core/context-catalog.d.ts.map +1 -0
  39. package/dist/core/context-catalog.js +269 -0
  40. package/dist/core/context-catalog.js.map +1 -0
  41. package/dist/core/context-types.d.ts +3 -0
  42. package/dist/core/context-types.d.ts.map +1 -0
  43. package/dist/core/context-types.js +18 -0
  44. package/dist/core/context-types.js.map +1 -0
  45. package/dist/core/dataset-generator.d.ts +135 -0
  46. package/dist/core/dataset-generator.d.ts.map +1 -0
  47. package/dist/core/dataset-generator.js +895 -0
  48. package/dist/core/dataset-generator.js.map +1 -0
  49. package/dist/core/eliza1-benchmark-recipe.d.ts +18 -0
  50. package/dist/core/eliza1-benchmark-recipe.d.ts.map +1 -0
  51. package/dist/core/eliza1-benchmark-recipe.js +64 -0
  52. package/dist/core/eliza1-benchmark-recipe.js.map +1 -0
  53. package/dist/core/eliza1-bundle-stager.d.ts +57 -0
  54. package/dist/core/eliza1-bundle-stager.d.ts.map +1 -0
  55. package/dist/core/eliza1-bundle-stager.js +149 -0
  56. package/dist/core/eliza1-bundle-stager.js.map +1 -0
  57. package/dist/core/ensure-cron-job.d.ts +53 -0
  58. package/dist/core/ensure-cron-job.d.ts.map +1 -0
  59. package/dist/core/ensure-cron-job.js +51 -0
  60. package/dist/core/ensure-cron-job.js.map +1 -0
  61. package/dist/core/eval-comparison-artifact.d.ts +72 -0
  62. package/dist/core/eval-comparison-artifact.d.ts.map +1 -0
  63. package/dist/core/eval-comparison-artifact.js +281 -0
  64. package/dist/core/eval-comparison-artifact.js.map +1 -0
  65. package/dist/core/feed-generation-runner.d.ts +37 -0
  66. package/dist/core/feed-generation-runner.d.ts.map +1 -0
  67. package/dist/core/feed-generation-runner.js +232 -0
  68. package/dist/core/feed-generation-runner.js.map +1 -0
  69. package/dist/core/html-escape.d.ts +5 -0
  70. package/dist/core/html-escape.d.ts.map +1 -0
  71. package/dist/core/html-escape.js +11 -0
  72. package/dist/core/html-escape.js.map +1 -0
  73. package/dist/core/huggingface-dataset-ingest.d.ts +52 -0
  74. package/dist/core/huggingface-dataset-ingest.d.ts.map +1 -0
  75. package/dist/core/huggingface-dataset-ingest.js +134 -0
  76. package/dist/core/huggingface-dataset-ingest.js.map +1 -0
  77. package/dist/core/index.d.ts +29 -0
  78. package/dist/core/index.d.ts.map +1 -0
  79. package/dist/core/index.js +204 -0
  80. package/dist/core/index.js.map +1 -0
  81. package/dist/core/privacy-filter.d.ts +95 -0
  82. package/dist/core/privacy-filter.d.ts.map +1 -0
  83. package/dist/core/privacy-filter.js +324 -0
  84. package/dist/core/privacy-filter.js.map +1 -0
  85. package/dist/core/promotion-gate.d.ts +117 -0
  86. package/dist/core/promotion-gate.d.ts.map +1 -0
  87. package/dist/core/promotion-gate.js +85 -0
  88. package/dist/core/promotion-gate.js.map +1 -0
  89. package/dist/core/promotion-persist.d.ts +116 -0
  90. package/dist/core/promotion-persist.d.ts.map +1 -0
  91. package/dist/core/promotion-persist.js +93 -0
  92. package/dist/core/promotion-persist.js.map +1 -0
  93. package/dist/core/prompt-compare.d.ts +99 -0
  94. package/dist/core/prompt-compare.d.ts.map +1 -0
  95. package/dist/core/prompt-compare.js +210 -0
  96. package/dist/core/prompt-compare.js.map +1 -0
  97. package/dist/core/replay-validator.d.ts +136 -0
  98. package/dist/core/replay-validator.d.ts.map +1 -0
  99. package/dist/core/replay-validator.js +312 -0
  100. package/dist/core/replay-validator.js.map +1 -0
  101. package/dist/core/roleplay-executor.d.ts +123 -0
  102. package/dist/core/roleplay-executor.d.ts.map +1 -0
  103. package/dist/core/roleplay-executor.js +675 -0
  104. package/dist/core/roleplay-executor.js.map +1 -0
  105. package/dist/core/roleplay-trajectories.d.ts +54 -0
  106. package/dist/core/roleplay-trajectories.d.ts.map +1 -0
  107. package/dist/core/roleplay-trajectories.js +88 -0
  108. package/dist/core/roleplay-trajectories.js.map +1 -0
  109. package/dist/core/scenario-blueprints.d.ts +62 -0
  110. package/dist/core/scenario-blueprints.d.ts.map +1 -0
  111. package/dist/core/scenario-blueprints.js +850 -0
  112. package/dist/core/scenario-blueprints.js.map +1 -0
  113. package/dist/core/scenario-runner.d.ts +36 -0
  114. package/dist/core/scenario-runner.d.ts.map +1 -0
  115. package/dist/core/scenario-runner.js +216 -0
  116. package/dist/core/scenario-runner.js.map +1 -0
  117. package/dist/core/skill-scoring-cron.d.ts +57 -0
  118. package/dist/core/skill-scoring-cron.d.ts.map +1 -0
  119. package/dist/core/skill-scoring-cron.js +180 -0
  120. package/dist/core/skill-scoring-cron.js.map +1 -0
  121. package/dist/core/test-trajectory-collector.d.ts +37 -0
  122. package/dist/core/test-trajectory-collector.d.ts.map +1 -0
  123. package/dist/core/test-trajectory-collector.js +225 -0
  124. package/dist/core/test-trajectory-collector.js.map +1 -0
  125. package/dist/core/track-c-queue-task.d.ts +37 -0
  126. package/dist/core/track-c-queue-task.d.ts.map +1 -0
  127. package/dist/core/track-c-queue-task.js +104 -0
  128. package/dist/core/track-c-queue-task.js.map +1 -0
  129. package/dist/core/training-analysis-index.d.ts +104 -0
  130. package/dist/core/training-analysis-index.d.ts.map +1 -0
  131. package/dist/core/training-analysis-index.js +3297 -0
  132. package/dist/core/training-analysis-index.js.map +1 -0
  133. package/dist/core/training-collection-runner.d.ts +508 -0
  134. package/dist/core/training-collection-runner.d.ts.map +1 -0
  135. package/dist/core/training-collection-runner.js +2299 -0
  136. package/dist/core/training-collection-runner.js.map +1 -0
  137. package/dist/core/training-config.d.ts +52 -0
  138. package/dist/core/training-config.d.ts.map +1 -0
  139. package/dist/core/training-config.js +117 -0
  140. package/dist/core/training-config.js.map +1 -0
  141. package/dist/core/training-orchestrator.d.ts +112 -0
  142. package/dist/core/training-orchestrator.d.ts.map +1 -0
  143. package/dist/core/training-orchestrator.js +729 -0
  144. package/dist/core/training-orchestrator.js.map +1 -0
  145. package/dist/core/training-readiness-report.d.ts +52 -0
  146. package/dist/core/training-readiness-report.d.ts.map +1 -0
  147. package/dist/core/training-readiness-report.js +765 -0
  148. package/dist/core/training-readiness-report.js.map +1 -0
  149. package/dist/core/trajectory-consumer.d.ts +15 -0
  150. package/dist/core/trajectory-consumer.d.ts.map +1 -0
  151. package/dist/core/trajectory-consumer.js +61 -0
  152. package/dist/core/trajectory-consumer.js.map +1 -0
  153. package/dist/core/trajectory-export-bundle.d.ts +95 -0
  154. package/dist/core/trajectory-export-bundle.d.ts.map +1 -0
  155. package/dist/core/trajectory-export-bundle.js +561 -0
  156. package/dist/core/trajectory-export-bundle.js.map +1 -0
  157. package/dist/core/trajectory-export-cron.d.ts +57 -0
  158. package/dist/core/trajectory-export-cron.d.ts.map +1 -0
  159. package/dist/core/trajectory-export-cron.js +170 -0
  160. package/dist/core/trajectory-export-cron.js.map +1 -0
  161. package/dist/core/trajectory-hf-upload.d.ts +50 -0
  162. package/dist/core/trajectory-hf-upload.d.ts.map +1 -0
  163. package/dist/core/trajectory-hf-upload.js +111 -0
  164. package/dist/core/trajectory-hf-upload.js.map +1 -0
  165. package/dist/core/trajectory-task-datasets.d.ts +62 -0
  166. package/dist/core/trajectory-task-datasets.d.ts.map +1 -0
  167. package/dist/core/trajectory-task-datasets.js +427 -0
  168. package/dist/core/trajectory-task-datasets.js.map +1 -0
  169. package/dist/core/wait-for-service.d.ts +25 -0
  170. package/dist/core/wait-for-service.d.ts.map +1 -0
  171. package/dist/core/wait-for-service.js +19 -0
  172. package/dist/core/wait-for-service.js.map +1 -0
  173. package/dist/core/workspace-runtime.d.ts +4 -0
  174. package/dist/core/workspace-runtime.d.ts.map +1 -0
  175. package/dist/core/workspace-runtime.js +25 -0
  176. package/dist/core/workspace-runtime.js.map +1 -0
  177. package/dist/dspy/artifact.d.ts +54 -0
  178. package/dist/dspy/artifact.d.ts.map +1 -0
  179. package/dist/dspy/artifact.js +61 -0
  180. package/dist/dspy/artifact.js.map +1 -0
  181. package/dist/dspy/chain-of-thought.d.ts +27 -0
  182. package/dist/dspy/chain-of-thought.d.ts.map +1 -0
  183. package/dist/dspy/chain-of-thought.js +43 -0
  184. package/dist/dspy/chain-of-thought.js.map +1 -0
  185. package/dist/dspy/examples.d.ts +72 -0
  186. package/dist/dspy/examples.d.ts.map +1 -0
  187. package/dist/dspy/examples.js +105 -0
  188. package/dist/dspy/examples.js.map +1 -0
  189. package/dist/dspy/index.d.ts +15 -0
  190. package/dist/dspy/index.d.ts.map +1 -0
  191. package/dist/dspy/index.js +40 -0
  192. package/dist/dspy/index.js.map +1 -0
  193. package/dist/dspy/lm-adapter.d.ts +100 -0
  194. package/dist/dspy/lm-adapter.d.ts.map +1 -0
  195. package/dist/dspy/lm-adapter.js +81 -0
  196. package/dist/dspy/lm-adapter.js.map +1 -0
  197. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.d.ts +23 -0
  198. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.d.ts.map +1 -0
  199. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.js +85 -0
  200. package/dist/dspy/optimizers/dspy-bootstrap-fewshot.js.map +1 -0
  201. package/dist/dspy/optimizers/dspy-copro.d.ts +29 -0
  202. package/dist/dspy/optimizers/dspy-copro.d.ts.map +1 -0
  203. package/dist/dspy/optimizers/dspy-copro.js +141 -0
  204. package/dist/dspy/optimizers/dspy-copro.js.map +1 -0
  205. package/dist/dspy/optimizers/dspy-mipro.d.ts +37 -0
  206. package/dist/dspy/optimizers/dspy-mipro.d.ts.map +1 -0
  207. package/dist/dspy/optimizers/dspy-mipro.js +194 -0
  208. package/dist/dspy/optimizers/dspy-mipro.js.map +1 -0
  209. package/dist/dspy/optimizers/index.d.ts +5 -0
  210. package/dist/dspy/optimizers/index.d.ts.map +1 -0
  211. package/dist/dspy/optimizers/index.js +11 -0
  212. package/dist/dspy/optimizers/index.js.map +1 -0
  213. package/dist/dspy/optimizers/types.d.ts +39 -0
  214. package/dist/dspy/optimizers/types.d.ts.map +1 -0
  215. package/dist/dspy/optimizers/types.js +1 -0
  216. package/dist/dspy/optimizers/types.js.map +1 -0
  217. package/dist/dspy/predict.d.ts +49 -0
  218. package/dist/dspy/predict.d.ts.map +1 -0
  219. package/dist/dspy/predict.js +73 -0
  220. package/dist/dspy/predict.js.map +1 -0
  221. package/dist/dspy/signature.d.ts +88 -0
  222. package/dist/dspy/signature.d.ts.map +1 -0
  223. package/dist/dspy/signature.js +205 -0
  224. package/dist/dspy/signature.js.map +1 -0
  225. package/dist/index.d.ts +15 -0
  226. package/dist/index.d.ts.map +1 -0
  227. package/dist/index.js +15 -0
  228. package/dist/index.js.map +1 -0
  229. package/dist/optimizers/bootstrap-fewshot.d.ts +42 -0
  230. package/dist/optimizers/bootstrap-fewshot.d.ts.map +1 -0
  231. package/dist/optimizers/bootstrap-fewshot.js +92 -0
  232. package/dist/optimizers/bootstrap-fewshot.js.map +1 -0
  233. package/dist/optimizers/gepa.d.ts +63 -0
  234. package/dist/optimizers/gepa.d.ts.map +1 -0
  235. package/dist/optimizers/gepa.js +232 -0
  236. package/dist/optimizers/gepa.js.map +1 -0
  237. package/dist/optimizers/index.d.ts +7 -0
  238. package/dist/optimizers/index.d.ts.map +1 -0
  239. package/dist/optimizers/index.js +51 -0
  240. package/dist/optimizers/index.js.map +1 -0
  241. package/dist/optimizers/instruction-search.d.ts +39 -0
  242. package/dist/optimizers/instruction-search.d.ts.map +1 -0
  243. package/dist/optimizers/instruction-search.js +108 -0
  244. package/dist/optimizers/instruction-search.js.map +1 -0
  245. package/dist/optimizers/prompt-evolution.d.ts +39 -0
  246. package/dist/optimizers/prompt-evolution.d.ts.map +1 -0
  247. package/dist/optimizers/prompt-evolution.js +101 -0
  248. package/dist/optimizers/prompt-evolution.js.map +1 -0
  249. package/dist/optimizers/scoring.d.ts +139 -0
  250. package/dist/optimizers/scoring.d.ts.map +1 -0
  251. package/dist/optimizers/scoring.js +299 -0
  252. package/dist/optimizers/scoring.js.map +1 -0
  253. package/dist/optimizers/types.d.ts +105 -0
  254. package/dist/optimizers/types.d.ts.map +1 -0
  255. package/dist/optimizers/types.js +1 -0
  256. package/dist/optimizers/types.js.map +1 -0
  257. package/dist/register-runtime.d.ts +3 -0
  258. package/dist/register-runtime.d.ts.map +1 -0
  259. package/dist/register-runtime.js +60 -0
  260. package/dist/register-runtime.js.map +1 -0
  261. package/dist/register-terminal-view.d.ts +15 -0
  262. package/dist/register-terminal-view.d.ts.map +1 -0
  263. package/dist/register-terminal-view.js +31 -0
  264. package/dist/register-terminal-view.js.map +1 -0
  265. package/dist/routes/experience-routes.d.ts +21 -0
  266. package/dist/routes/experience-routes.d.ts.map +1 -0
  267. package/dist/routes/experience-routes.js +513 -0
  268. package/dist/routes/experience-routes.js.map +1 -0
  269. package/dist/routes/index.d.ts +5 -0
  270. package/dist/routes/index.d.ts.map +1 -0
  271. package/dist/routes/index.js +17 -0
  272. package/dist/routes/index.js.map +1 -0
  273. package/dist/routes/training-routes.d.ts +10 -0
  274. package/dist/routes/training-routes.d.ts.map +1 -0
  275. package/dist/routes/training-routes.js +1239 -0
  276. package/dist/routes/training-routes.js.map +1 -0
  277. package/dist/routes/training-vast-routes.d.ts +35 -0
  278. package/dist/routes/training-vast-routes.d.ts.map +1 -0
  279. package/dist/routes/training-vast-routes.js +249 -0
  280. package/dist/routes/training-vast-routes.js.map +1 -0
  281. package/dist/routes/trajectory-routes.d.ts +19 -0
  282. package/dist/routes/trajectory-routes.d.ts.map +1 -0
  283. package/dist/routes/trajectory-routes.js +1122 -0
  284. package/dist/routes/trajectory-routes.js.map +1 -0
  285. package/dist/services/index.d.ts +9 -0
  286. package/dist/services/index.d.ts.map +1 -0
  287. package/dist/services/index.js +63 -0
  288. package/dist/services/index.js.map +1 -0
  289. package/dist/services/training-backend-check.d.ts +8 -0
  290. package/dist/services/training-backend-check.d.ts.map +1 -0
  291. package/dist/services/training-backend-check.js +31 -0
  292. package/dist/services/training-backend-check.js.map +1 -0
  293. package/dist/services/training-service-like.d.ts +40 -0
  294. package/dist/services/training-service-like.d.ts.map +1 -0
  295. package/dist/services/training-service-like.js +1 -0
  296. package/dist/services/training-service-like.js.map +1 -0
  297. package/dist/services/training-service-registry.d.ts +4 -0
  298. package/dist/services/training-service-registry.d.ts.map +1 -0
  299. package/dist/services/training-service-registry.js +12 -0
  300. package/dist/services/training-service-registry.js.map +1 -0
  301. package/dist/services/training-service.d.ts +59 -0
  302. package/dist/services/training-service.d.ts.map +1 -0
  303. package/dist/services/training-service.js +154 -0
  304. package/dist/services/training-service.js.map +1 -0
  305. package/dist/services/training-trigger.d.ts +177 -0
  306. package/dist/services/training-trigger.d.ts.map +1 -0
  307. package/dist/services/training-trigger.js +300 -0
  308. package/dist/services/training-trigger.js.map +1 -0
  309. package/dist/services/training-vast-service.d.ts +149 -0
  310. package/dist/services/training-vast-service.d.ts.map +1 -0
  311. package/dist/services/training-vast-service.js +648 -0
  312. package/dist/services/training-vast-service.js.map +1 -0
  313. package/dist/services/vast-inference-stats.d.ts +37 -0
  314. package/dist/services/vast-inference-stats.d.ts.map +1 -0
  315. package/dist/services/vast-inference-stats.js +81 -0
  316. package/dist/services/vast-inference-stats.js.map +1 -0
  317. package/dist/services/vast-job-store.d.ts +74 -0
  318. package/dist/services/vast-job-store.d.ts.map +1 -0
  319. package/dist/services/vast-job-store.js +194 -0
  320. package/dist/services/vast-job-store.js.map +1 -0
  321. package/dist/services/vast-subprocess.d.ts +27 -0
  322. package/dist/services/vast-subprocess.d.ts.map +1 -0
  323. package/dist/services/vast-subprocess.js +78 -0
  324. package/dist/services/vast-subprocess.js.map +1 -0
  325. package/dist/setup-routes.d.ts +17 -0
  326. package/dist/setup-routes.d.ts.map +1 -0
  327. package/dist/setup-routes.js +319 -0
  328. package/dist/setup-routes.js.map +1 -0
  329. package/dist/ui/FineTuningSpatialView.d.ts +49 -0
  330. package/dist/ui/FineTuningSpatialView.d.ts.map +1 -0
  331. package/dist/ui/FineTuningSpatialView.js +154 -0
  332. package/dist/ui/FineTuningSpatialView.js.map +1 -0
  333. package/dist/ui/FineTuningView.d.ts +7 -0
  334. package/dist/ui/FineTuningView.d.ts.map +1 -0
  335. package/dist/ui/FineTuningView.helpers.d.ts +17 -0
  336. package/dist/ui/FineTuningView.helpers.d.ts.map +1 -0
  337. package/dist/ui/FineTuningView.helpers.js +30 -0
  338. package/dist/ui/FineTuningView.helpers.js.map +1 -0
  339. package/dist/ui/FineTuningView.interact.d.ts +2 -0
  340. package/dist/ui/FineTuningView.interact.d.ts.map +1 -0
  341. package/dist/ui/FineTuningView.interact.js +300 -0
  342. package/dist/ui/FineTuningView.interact.js.map +1 -0
  343. package/dist/ui/FineTuningView.js +4653 -0
  344. package/dist/ui/FineTuningView.js.map +1 -0
  345. package/dist/ui/fine-tuning-panels.d.ts +100 -0
  346. package/dist/ui/fine-tuning-panels.d.ts.map +1 -0
  347. package/dist/ui/fine-tuning-panels.helpers.d.ts +19 -0
  348. package/dist/ui/fine-tuning-panels.helpers.d.ts.map +1 -0
  349. package/dist/ui/fine-tuning-panels.helpers.js +77 -0
  350. package/dist/ui/fine-tuning-panels.helpers.js.map +1 -0
  351. package/dist/ui/fine-tuning-panels.js +928 -0
  352. package/dist/ui/fine-tuning-panels.js.map +1 -0
  353. package/dist/ui/index.d.ts +5 -0
  354. package/dist/ui/index.d.ts.map +1 -0
  355. package/dist/ui/index.js +5 -0
  356. package/dist/ui/index.js.map +1 -0
  357. package/dist/ui/training-view-bundle.d.ts +3 -0
  358. package/dist/ui/training-view-bundle.d.ts.map +1 -0
  359. package/dist/ui/training-view-bundle.js +7 -0
  360. package/dist/ui/training-view-bundle.js.map +1 -0
  361. package/dist/views/bundle.js +5312 -0
  362. package/dist/views/bundle.js.map +1 -0
  363. package/package.json +7 -7
@@ -0,0 +1,116 @@
1
+ /**
2
+ * Persistence wiring for the A/B promotion gate.
3
+ *
4
+ * Pulled out of `training-orchestrator.ts` so it can be unit-tested without
5
+ * dragging in `@elizaos/agent` (which transitively imports the AI SDK gateway
6
+ * and breaks bare vitest runs). The orchestrator imports and calls
7
+ * `gatedPersistNativeResult` from here.
8
+ *
9
+ * Contract:
10
+ * 1. Resolve the incumbent prompt — current artifact via
11
+ * `service.getPrompt(task)`, falling back to the baseline template.
12
+ * 2. Run the promotion gate on (incumbent, candidate, dataset, scorer).
13
+ * 3. On promote → write via `service.setPrompt(...)` and prune the per-task
14
+ * store to the configured retention budget.
15
+ * 4. On reject → write `candidate_rejected_<timestamp>.json` under
16
+ * `<store-root>/<task>/rejected/` and leave the incumbent in place.
17
+ *
18
+ * No fallbacks on failure: missing services / missing store roots return
19
+ * structured notes so the orchestrator can surface them in the run record.
20
+ */
21
+ import type { OptimizationExample, PromptScorer } from "../optimizers/index.js";
22
+ import type { TrajectoryTrainingTask } from "./trajectory-task-datasets.js";
23
/**
 * Names of the optimizers that may appear as the `optimizer` of a promotion
 * artifact or backend result, or as the `optimizerSource` of an incumbent.
 */
export type PromotionOptimizerName =
    | "instruction-search"
    | "prompt-evolution"
    | "gepa"
    | "bootstrap-fewshot"
    | "dspy-bootstrap-fewshot"
    | "dspy-copro"
    | "dspy-mipro";
24
/**
 * One few-shot demonstration that can travel with an optimized prompt (see
 * the `fewShotExamples` members of the artifact and backend-result shapes).
 */
export interface PromotionFewShotExample {
    /** Optional identifier for this example. */
    id?: string;
    /** Prompt-side content: a required `user` string plus an optional `system` string. */
    input: { user: string; system?: string };
    /** Output string paired with `input`. */
    expectedOutput: string;
    /** Optional numeric reward; how it is produced is not defined in this file. */
    reward?: number;
    /** Optional free-form metadata. */
    metadata?: Record<string, unknown>;
}
34
/**
 * Payload handed to `PromotionServiceLike.setPrompt` when a candidate prompt
 * is promoted.
 */
export interface PromotionArtifactInput {
    /** Task the prompt is stored for. */
    task: TrajectoryTrainingTask;
    /** Optimizer that produced `prompt`. */
    optimizer: PromotionOptimizerName;
    /** Baseline prompt text; `gatedPersistNativeResult` passes its `baselinePrompt` here. */
    baseline: string;
    /** Prompt text being stored; `gatedPersistNativeResult` passes the optimized prompt. */
    prompt: string;
    /** Candidate score, copied from the backend result. */
    score: number;
    /** Baseline score, copied from the backend result. */
    baselineScore: number;
    /** Dataset identifier; `gatedPersistNativeResult` passes the dataset path. */
    datasetId: string;
    /** Dataset size, copied from the backend result. */
    datasetSize: number;
    /** Timestamp string; `gatedPersistNativeResult` supplies `new Date().toISOString()`. */
    generatedAt: string;
    /** Per-variant history entries copied from the optimizer result. */
    lineage: {
        round: number;
        variant: number;
        score: number;
        notes?: string;
    }[];
    /** Few-shot examples copied from the optimizer result, when it has any. */
    fewShotExamples?: PromotionFewShotExample[];
}
52
/**
 * The slice of the prompt store that `gatedPersistNativeResult` depends on.
 * Only `setPrompt` is mandatory.
 */
export interface PromotionServiceLike {
    /**
     * Stores `artifact` for `task`. The resolved string is what the gate
     * reports as the written artifact path.
     */
    setPrompt: (
        task: TrajectoryTrainingTask,
        artifact: PromotionArtifactInput,
    ) => Promise<string>;
    /**
     * Synchronous lookup of the incumbent prompt. It is optional because
     * older builds may not provide it. When it is absent, or when it returns
     * `null`, the gate uses the baseline template as the incumbent.
     */
    getPrompt?: (
        task: TrajectoryTrainingTask,
    ) => { prompt: string; optimizerSource: PromotionOptimizerName } | null;
    /**
     * On-disk root under which artifacts are stored. Needed for the rejected
     * and pruned bookkeeping. Without it the gate still runs, but rejected
     * candidates are not written to disk (a note is recorded instead).
     */
    getStoreRoot?: () => string;
}
70
/**
 * Shape of an optimizer run's output as consumed by
 * `gatedPersistNativeResult`.
 */
export interface PromotionNativeBackendResultLike {
    /** Optimizer that produced this result. */
    optimizer: PromotionOptimizerName;
    /** Dataset size; forwarded into the promoted artifact. */
    datasetSize: number;
    /** Candidate score; forwarded into the promoted artifact. */
    score: number;
    /** Baseline score; forwarded into the promoted artifact. */
    baselineScore: number;
    /** Optimizer output proper. */
    result: {
        /** Candidate prompt evaluated by the promotion gate. */
        optimizedPrompt: string;
        /** Per-variant history entries. */
        lineage: {
            round: number;
            variant: number;
            score: number;
            notes?: string;
        }[];
        /** Optional few-shot examples accompanying the prompt. */
        fewShotExamples?: PromotionFewShotExample[];
    };
    /** Full parsed dataset. The gate scores against it when no holdout exists. */
    dataset: OptimizationExample[];
    /**
     * Optional held-out subset that the optimizer never saw. When it is
     * present and non-empty, the promotion gate scores against it instead of
     * `dataset`, which avoids train-on-test contamination.
     */
    holdoutSet?: OptimizationExample[];
    /** Scorer handed to the promotion gate. */
    scorer: PromptScorer;
}
95
/** Arguments accepted by `gatedPersistNativeResult`. */
export interface GatedPersistInput {
    /** Task whose prompt is being gated. */
    task: TrajectoryTrainingTask;
    /** Dataset path; recorded as `datasetId` in promoted and rejected records. */
    datasetPath: string;
    /** Run identifier; recorded in the rejected-candidate record. */
    runId: string;
    /**
     * Baseline prompt template. It serves as the incumbent when the service
     * yields none, and is stored as `baseline` in a promoted artifact.
     */
    baselinePrompt: string;
    /** Optimizer output to gate. */
    result: PromotionNativeBackendResultLike;
    /** Store used to read the incumbent and to persist the outcome. */
    service: PromotionServiceLike;
    /** Notes the dispatcher has already accumulated; new lines are appended to a copy. */
    notesPrefix: string[];
}
105
/** Outcome reported by `gatedPersistNativeResult`. */
export interface GatedPersistResult {
    /** Whether the gate was invoked. */
    invoked: boolean;
    /** Path returned by `setPrompt`; present only when the candidate was promoted. */
    artifactPath?: string;
    /** The incoming `notesPrefix` lines followed by the lines added during gating. */
    notes: string[];
}
110
/**
 * Runs the promotion gate on an optimizer result and persists the outcome.
 * This step was split out of the orchestrator's native dispatcher so that it
 * can be exercised without a real optimizer or runtime.
 *
 * @param input Task, optimizer result, service and previously collected notes.
 * @returns A promise for the same result shape the dispatcher emits.
 */
export declare function gatedPersistNativeResult(
    input: GatedPersistInput,
): Promise<GatedPersistResult>;
116
+ //# sourceMappingURL=promotion-persist.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"promotion-persist.d.ts","sourceRoot":"","sources":["../../src/core/promotion-persist.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH,OAAO,KAAK,EAAE,mBAAmB,EAAE,YAAY,EAAE,MAAM,wBAAwB,CAAC;AAOhF,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,+BAA+B,CAAC;AAE5E,MAAM,MAAM,sBAAsB,GAC9B,oBAAoB,GACpB,kBAAkB,GAClB,MAAM,GACN,mBAAmB,GACnB,wBAAwB,GACxB,YAAY,GACZ,YAAY,CAAC;AAEjB,MAAM,WAAW,uBAAuB;IACtC,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IACzC,cAAc,EAAE,MAAM,CAAC;IACvB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACpC;AAED,MAAM,WAAW,sBAAsB;IACrC,IAAI,EAAE,sBAAsB,CAAC;IAC7B,SAAS,EAAE,sBAAsB,CAAC;IAClC,QAAQ,EAAE,MAAM,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,EAAE,KAAK,CAAC;QACb,KAAK,EAAE,MAAM,CAAC;QACd,OAAO,EAAE,MAAM,CAAC;QAChB,KAAK,EAAE,MAAM,CAAC;QACd,KAAK,CAAC,EAAE,MAAM,CAAC;KAChB,CAAC,CAAC;IACH,eAAe,CAAC,EAAE,uBAAuB,EAAE,CAAC;CAC7C;AAED,MAAM,WAAW,oBAAoB;IACnC,SAAS,EAAE,CACT,IAAI,EAAE,sBAAsB,EAC5B,QAAQ,EAAE,sBAAsB,KAC7B,OAAO,CAAC,MAAM,CAAC,CAAC;IACrB;;;;OAIG;IACH,SAAS,CAAC,EAAE,CACV,IAAI,EAAE,sBAAsB,KACzB;QAAE,MAAM,EAAE,MAAM,CAAC;QAAC,eAAe,EAAE,sBAAsB,CAAA;KAAE,GAAG,IAAI,CAAC;IACxE;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,MAAM,CAAC;CAC7B;AAED,MAAM,WAAW,gCAAgC;IAC/C,SAAS,EAAE,sBAAsB,CAAC;IAClC,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,aAAa,EAAE,MAAM,CAAC;IACtB,MAAM,EAAE;QACN,eAAe,EAAE,MAAM,CAAC;QACxB,OAAO,EAAE,KAAK,CAAC;YACb,KAAK,EAAE,MAAM,CAAC;YACd,OAAO,EAAE,MAAM,CAAC;YAChB,KAAK,EAAE,MAAM,CAAC;YACd,KAAK,CAAC,EAAE,MAAM,CAAC;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,EAAE,uBAAuB,EAAE,CAAC;KAC7C,CAAC;IACF,gFAAgF;IAChF,OAAO,EAAE,mBAAmB,EAAE,CAAC;IAC/B;;;;OAIG;IACH,UAAU,CAAC,EAAE,mBAAmB,EAAE,CAAC;IACnC,MAAM,EAAE,YAAY,CAAC;CACtB;AAED,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,sBAAsB,CAAC;IAC7B,WAAW,EAAE,MAAM,CAAC;IACpB,KAAK,EAAE,MAAM,CAAC;IACd,cAAc,EAAE,M
AAM,CAAC;IACvB,MAAM,EAAE,gCAAgC,CAAC;IACzC,OAAO,EAAE,oBAAoB,CAAC;IAC9B,2EAA2E;IAC3E,WAAW,EAAE,MAAM,EAAE,CAAC;CACvB;AAED,MAAM,WAAW,kBAAkB;IACjC,OAAO,EAAE,OAAO,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,EAAE,MAAM,EAAE,CAAC;CACjB;AAED;;;;GAIG;AACH,wBAAsB,wBAAwB,CAC5C,KAAK,EAAE,iBAAiB,GACvB,OAAO,CAAC,kBAAkB,CAAC,CAmG7B"}
@@ -0,0 +1,93 @@
1
+ import {
2
+ DEFAULT_PROMOTED_ARTIFACT_RETENTION,
3
+ prunePromotedArtifacts,
4
+ writeRejectedCandidate
5
+ } from "./artifact-store.js";
6
+ import { evaluatePromotion } from "./promotion-gate.js";
7
/**
 * Gate an optimizer's candidate prompt against the incumbent and persist the
 * outcome.
 *
 * Flow:
 *  1. The incumbent is whatever `input.service.getPrompt(task)` returns when
 *     that method exists; if it is missing or yields no prompt, the baseline
 *     prompt is used instead.
 *  2. `evaluatePromotion` scores incumbent vs. candidate on the holdout set
 *     when one is present and non-empty, otherwise on the full dataset.
 *  3. On promote, the artifact is written through `service.setPrompt` and,
 *     if the service exposes a store root, older artifacts are pruned down
 *     to `DEFAULT_PROMOTED_ARTIFACT_RETENTION`.
 *  4. On reject, a rejected-candidate record is written when a store root is
 *     available; otherwise only a note is recorded.
 *
 * @param {object} input - Task, dataset path, run id, baseline prompt,
 *   optimizer result, service and the notes accumulated so far.
 * @returns {Promise<{invoked: boolean, artifactPath?: string, notes: string[]}>}
 *   `artifactPath` is set only when the candidate was promoted.
 */
async function gatedPersistNativeResult(input) {
  const { service, task } = input;
  // Work on a copy so the caller's notes array is left untouched.
  const notes = [...input.notesPrefix];

  // Incumbent: the service's current prompt if it can provide one.
  let incumbent = null;
  if (typeof service.getPrompt === "function") {
    incumbent = service.getPrompt(task);
  }
  const incumbentPrompt = incumbent?.prompt ?? input.baselinePrompt;
  const incumbentSource = incumbent ? "current" : "baseline";

  // Gate dataset: prefer a non-empty holdout over the full dataset.
  const backend = input.result;
  const holdout = backend.holdoutSet;
  const hasHoldout = Boolean(holdout && holdout.length > 0);
  const gateDataset = hasHoldout ? holdout : backend.dataset;
  const gateSource = hasHoldout
    ? `holdout(n=${holdout.length})`
    : `full-dataset(n=${backend.dataset.length}) [no holdout available]`;

  const candidate = backend.result;
  const candidatePrompt = candidate.optimizedPrompt;

  const decision = await evaluatePromotion({
    incumbentPrompt,
    candidatePrompt,
    dataset: gateDataset,
    scorer: backend.scorer,
  });
  const verdict = decision.promote ? "PROMOTE" : "REJECT";
  notes.push(
    `promotion-gate ${verdict} incumbent_source=${incumbentSource} gate_dataset=${gateSource} ${decision.reason}`
  );

  // One timestamp, taken after the gate ran, is shared by either record.
  const generatedAt = new Date().toISOString();

  if (decision.promote) {
    const artifactPath = await service.setPrompt(task, {
      task,
      optimizer: backend.optimizer,
      baseline: input.baselinePrompt,
      prompt: candidatePrompt,
      score: backend.score,
      baselineScore: backend.baselineScore,
      datasetId: input.datasetPath,
      datasetSize: backend.datasetSize,
      generatedAt,
      lineage: candidate.lineage,
      fewShotExamples: candidate.fewShotExamples,
    });
    notes.push(`artifact written to ${artifactPath}`);

    // Pruning needs the on-disk root; skip it when the service has none.
    const promotedRoot = service.getStoreRoot?.();
    if (promotedRoot) {
      const removed = await prunePromotedArtifacts(
        promotedRoot,
        task,
        DEFAULT_PROMOTED_ARTIFACT_RETENTION
      );
      if (removed.length > 0) {
        notes.push(
          `pruned ${removed.length} stale artifact(s); retained ${DEFAULT_PROMOTED_ARTIFACT_RETENTION} most recent`
        );
      }
    }
    return { invoked: true, artifactPath, notes };
  }

  // Rejected: the incumbent stays in place; record the candidate if possible.
  const rejectedRoot = service.getStoreRoot?.();
  if (!rejectedRoot) {
    notes.push(
      "OptimizedPromptService does not expose getStoreRoot; rejected candidate not persisted"
    );
    return { invoked: true, notes };
  }

  // Snapshot of the gate's numbers, in the order they are serialized.
  const scoreFields = [
    "incumbentMeanScore",
    "incumbentStdDev",
    "candidateScore",
    "delta",
    "promotionMargin",
    "noiseThreshold",
    "incumbentReseeds",
    "examplesPerPass",
    "incumbentScores",
  ];
  const scores = Object.fromEntries(
    scoreFields.map((field) => [field, decision[field]])
  );

  const rejectedPath = await writeRejectedCandidate(rejectedRoot, task, {
    rejectedAt: generatedAt,
    task,
    optimizer: backend.optimizer,
    candidatePrompt,
    incumbentPrompt,
    scores,
    reason: decision.reason,
    datasetId: input.datasetPath,
    runId: input.runId,
  });
  notes.push(`rejected candidate written to ${rejectedPath}`);
  return { invoked: true, notes };
}
90
+ export {
91
+ gatedPersistNativeResult
92
+ };
93
+ //# sourceMappingURL=promotion-persist.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/core/promotion-persist.ts"],"sourcesContent":["/**\n * Persistence wiring for the A/B promotion gate.\n *\n * Pulled out of `training-orchestrator.ts` so it can be unit-tested without\n * dragging in `@elizaos/agent` (which transitively imports the AI SDK gateway\n * and breaks bare vitest runs). The orchestrator imports and calls\n * `gatedPersistNativeResult` from here.\n *\n * Contract:\n * 1. Resolve the incumbent prompt — current artifact via\n * `service.getPrompt(task)`, falling back to the baseline template.\n * 2. Run the promotion gate on (incumbent, candidate, dataset, scorer).\n * 3. On promote → write via `service.setPrompt(...)` and prune the per-task\n * store to the configured retention budget.\n * 4. On reject → write `candidate_rejected_<timestamp>.json` under\n * `<store-root>/<task>/rejected/` and leave the incumbent in place.\n *\n * No fallbacks on failure: missing services / missing store roots return\n * structured notes so the orchestrator can surface them in the run record.\n */\n\nimport type { OptimizationExample, PromptScorer } from \"../optimizers/index.js\";\nimport {\n DEFAULT_PROMOTED_ARTIFACT_RETENTION,\n prunePromotedArtifacts,\n writeRejectedCandidate,\n} from \"./artifact-store.js\";\nimport { evaluatePromotion } from \"./promotion-gate.js\";\nimport type { TrajectoryTrainingTask } from \"./trajectory-task-datasets.js\";\n\nexport type PromotionOptimizerName =\n | \"instruction-search\"\n | \"prompt-evolution\"\n | \"gepa\"\n | \"bootstrap-fewshot\"\n | \"dspy-bootstrap-fewshot\"\n | \"dspy-copro\"\n | \"dspy-mipro\";\n\nexport interface PromotionFewShotExample {\n id?: string;\n input: { user: string; system?: string };\n expectedOutput: string;\n reward?: number;\n metadata?: Record<string, unknown>;\n}\n\nexport interface PromotionArtifactInput {\n task: TrajectoryTrainingTask;\n optimizer: PromotionOptimizerName;\n baseline: string;\n prompt: string;\n score: number;\n baselineScore: 
number;\n datasetId: string;\n datasetSize: number;\n generatedAt: string;\n lineage: Array<{\n round: number;\n variant: number;\n score: number;\n notes?: string;\n }>;\n fewShotExamples?: PromotionFewShotExample[];\n}\n\nexport interface PromotionServiceLike {\n setPrompt: (\n task: TrajectoryTrainingTask,\n artifact: PromotionArtifactInput,\n ) => Promise<string>;\n /**\n * Synchronous accessor for the incumbent prompt. Optional because older\n * builds may not expose it; the gate falls back to the baseline template\n * when missing.\n */\n getPrompt?: (\n task: TrajectoryTrainingTask,\n ) => { prompt: string; optimizerSource: PromotionOptimizerName } | null;\n /**\n * Returns the on-disk root used to store artifacts. Required for rejected /\n * pruned bookkeeping; missing → gate still runs but rejected files are not\n * persisted (logged via notes).\n */\n getStoreRoot?: () => string;\n}\n\nexport interface PromotionNativeBackendResultLike {\n optimizer: PromotionOptimizerName;\n datasetSize: number;\n score: number;\n baselineScore: number;\n result: {\n optimizedPrompt: string;\n lineage: Array<{\n round: number;\n variant: number;\n score: number;\n notes?: string;\n }>;\n fewShotExamples?: PromotionFewShotExample[];\n };\n /** Full parsed dataset. Fallback target for the gate when no holdout exists. */\n dataset: OptimizationExample[];\n /**\n * Optional held-out subset the optimizer never saw. When present and\n * non-empty the promotion gate scores against this set instead of\n * `dataset`, eliminating train-on-test contamination.\n */\n holdoutSet?: OptimizationExample[];\n scorer: PromptScorer;\n}\n\nexport interface GatedPersistInput {\n task: TrajectoryTrainingTask;\n datasetPath: string;\n runId: string;\n baselinePrompt: string;\n result: PromotionNativeBackendResultLike;\n service: PromotionServiceLike;\n /** Notes already accumulated by the dispatcher; new lines are appended. 
*/\n notesPrefix: string[];\n}\n\nexport interface GatedPersistResult {\n invoked: boolean;\n artifactPath?: string;\n notes: string[];\n}\n\n/**\n * Gate + persist step extracted from the orchestrator's native dispatcher so\n * it can be tested without spinning up a real optimizer or runtime. Returns\n * the same shape the dispatcher emits.\n */\nexport async function gatedPersistNativeResult(\n input: GatedPersistInput,\n): Promise<GatedPersistResult> {\n const notes = [...input.notesPrefix];\n\n const incumbentResolved =\n typeof input.service.getPrompt === \"function\"\n ? input.service.getPrompt(input.task)\n : null;\n const incumbentPrompt = incumbentResolved?.prompt ?? input.baselinePrompt;\n const incumbentSource = incumbentResolved ? \"current\" : \"baseline\";\n\n // Prefer the held-out subset (the optimizer never saw it) so the gate is\n // not a train-on-test pass. Fall back to the full dataset for back-compat\n // and for tiny datasets where the deterministic split produced no holdout.\n const holdoutSet = input.result.holdoutSet;\n const gateDataset =\n holdoutSet && holdoutSet.length > 0 ? holdoutSet : input.result.dataset;\n const gateSource =\n holdoutSet && holdoutSet.length > 0\n ? `holdout(n=${holdoutSet.length})`\n : `full-dataset(n=${input.result.dataset.length}) [no holdout available]`;\n\n const decision = await evaluatePromotion({\n incumbentPrompt,\n candidatePrompt: input.result.result.optimizedPrompt,\n dataset: gateDataset,\n scorer: input.result.scorer,\n });\n notes.push(\n `promotion-gate ${decision.promote ? 
\"PROMOTE\" : \"REJECT\"} incumbent_source=${incumbentSource} gate_dataset=${gateSource} ${decision.reason}`,\n );\n\n const generatedAt = new Date().toISOString();\n if (!decision.promote) {\n const storeRoot = input.service.getStoreRoot?.();\n if (!storeRoot) {\n notes.push(\n \"OptimizedPromptService does not expose getStoreRoot; rejected candidate not persisted\",\n );\n return { invoked: true, notes };\n }\n const rejectedPath = await writeRejectedCandidate(storeRoot, input.task, {\n rejectedAt: generatedAt,\n task: input.task,\n optimizer: input.result.optimizer,\n candidatePrompt: input.result.result.optimizedPrompt,\n incumbentPrompt,\n scores: {\n incumbentMeanScore: decision.incumbentMeanScore,\n incumbentStdDev: decision.incumbentStdDev,\n candidateScore: decision.candidateScore,\n delta: decision.delta,\n promotionMargin: decision.promotionMargin,\n noiseThreshold: decision.noiseThreshold,\n incumbentReseeds: decision.incumbentReseeds,\n examplesPerPass: decision.examplesPerPass,\n incumbentScores: decision.incumbentScores,\n },\n reason: decision.reason,\n datasetId: input.datasetPath,\n runId: input.runId,\n });\n notes.push(`rejected candidate written to ${rejectedPath}`);\n return { invoked: true, notes };\n }\n\n const writePath = await input.service.setPrompt(input.task, {\n task: input.task,\n optimizer: input.result.optimizer,\n baseline: input.baselinePrompt,\n prompt: input.result.result.optimizedPrompt,\n score: input.result.score,\n baselineScore: input.result.baselineScore,\n datasetId: input.datasetPath,\n datasetSize: input.result.datasetSize,\n generatedAt,\n lineage: input.result.result.lineage,\n fewShotExamples: input.result.result.fewShotExamples,\n });\n notes.push(`artifact written to ${writePath}`);\n\n const storeRoot = input.service.getStoreRoot?.();\n if (storeRoot) {\n const removed = await prunePromotedArtifacts(\n storeRoot,\n input.task,\n DEFAULT_PROMOTED_ARTIFACT_RETENTION,\n );\n if (removed.length > 0) {\n notes.push(\n 
`pruned ${removed.length} stale artifact(s); retained ${DEFAULT_PROMOTED_ARTIFACT_RETENTION} most recent`,\n );\n }\n }\n\n return {\n invoked: true,\n artifactPath: writePath,\n notes,\n };\n}\n"],"mappings":"AAsBA;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,OACK;AACP,SAAS,yBAAyB;AA4GlC,eAAsB,yBACpB,OAC6B;AAC7B,QAAM,QAAQ,CAAC,GAAG,MAAM,WAAW;AAEnC,QAAM,oBACJ,OAAO,MAAM,QAAQ,cAAc,aAC/B,MAAM,QAAQ,UAAU,MAAM,IAAI,IAClC;AACN,QAAM,kBAAkB,mBAAmB,UAAU,MAAM;AAC3D,QAAM,kBAAkB,oBAAoB,YAAY;AAKxD,QAAM,aAAa,MAAM,OAAO;AAChC,QAAM,cACJ,cAAc,WAAW,SAAS,IAAI,aAAa,MAAM,OAAO;AAClE,QAAM,aACJ,cAAc,WAAW,SAAS,IAC9B,aAAa,WAAW,MAAM,MAC9B,kBAAkB,MAAM,OAAO,QAAQ,MAAM;AAEnD,QAAM,WAAW,MAAM,kBAAkB;AAAA,IACvC;AAAA,IACA,iBAAiB,MAAM,OAAO,OAAO;AAAA,IACrC,SAAS;AAAA,IACT,QAAQ,MAAM,OAAO;AAAA,EACvB,CAAC;AACD,QAAM;AAAA,IACJ,kBAAkB,SAAS,UAAU,YAAY,QAAQ,qBAAqB,eAAe,iBAAiB,UAAU,IAAI,SAAS,MAAM;AAAA,EAC7I;AAEA,QAAM,eAAc,oBAAI,KAAK,GAAE,YAAY;AAC3C,MAAI,CAAC,SAAS,SAAS;AACrB,UAAMA,aAAY,MAAM,QAAQ,eAAe;AAC/C,QAAI,CAACA,YAAW;AACd,YAAM;AAAA,QACJ;AAAA,MACF;AACA,aAAO,EAAE,SAAS,MAAM,MAAM;AAAA,IAChC;AACA,UAAM,eAAe,MAAM,uBAAuBA,YAAW,MAAM,MAAM;AAAA,MACvE,YAAY;AAAA,MACZ,MAAM,MAAM;AAAA,MACZ,WAAW,MAAM,OAAO;AAAA,MACxB,iBAAiB,MAAM,OAAO,OAAO;AAAA,MACrC;AAAA,MACA,QAAQ;AAAA,QACN,oBAAoB,SAAS;AAAA,QAC7B,iBAAiB,SAAS;AAAA,QAC1B,gBAAgB,SAAS;AAAA,QACzB,OAAO,SAAS;AAAA,QAChB,iBAAiB,SAAS;AAAA,QAC1B,gBAAgB,SAAS;AAAA,QACzB,kBAAkB,SAAS;AAAA,QAC3B,iBAAiB,SAAS;AAAA,QAC1B,iBAAiB,SAAS;AAAA,MAC5B;AAAA,MACA,QAAQ,SAAS;AAAA,MACjB,WAAW,MAAM;AAAA,MACjB,OAAO,MAAM;AAAA,IACf,CAAC;AACD,UAAM,KAAK,iCAAiC,YAAY,EAAE;AAC1D,WAAO,EAAE,SAAS,MAAM,MAAM;AAAA,EAChC;AAEA,QAAM,YAAY,MAAM,MAAM,QAAQ,UAAU,MAAM,MAAM;AAAA,IAC1D,MAAM,MAAM;AAAA,IACZ,WAAW,MAAM,OAAO;AAAA,IACxB,UAAU,MAAM;AAAA,IAChB,QAAQ,MAAM,OAAO,OAAO;AAAA,IAC5B,OAAO,MAAM,OAAO;AAAA,IACpB,eAAe,MAAM,OAAO;AAAA,IAC5B,WAAW,MAAM;AAAA,IACjB,aAAa,MAAM,OAAO;AAAA,IAC1B;AAAA,IACA,SAAS,MAAM,OAAO,OAAO;AAAA,IAC7B,iBAAiB,MAAM,OAAO,OAAO;AAAA,EACvC,CAAC;AACD,QAAM,KAAK,uBAAuB,SAAS,EAAE;AAE7C,QAAM,YAAY,MAAM,QAAQ,eAAe;AAC/C,MAAI,WAAW;AACb,UAAM,UAA
U,MAAM;AAAA,MACpB;AAAA,MACA,MAAM;AAAA,MACN;AAAA,IACF;AACA,QAAI,QAAQ,SAAS,GAAG;AACtB,YAAM;AAAA,QACJ,UAAU,QAAQ,MAAM,gCAAgC,mCAAmC;AAAA,MAC7F;AAAA,IACF;AAAA,EACF;AAEA,SAAO;AAAA,IACL,SAAS;AAAA,IACT,cAAc;AAAA,IACd;AAAA,EACF;AACF;","names":["storeRoot"]}
@@ -0,0 +1,99 @@
1
+ /**
2
+ * Prompt A/B comparison harness.
3
+ *
4
+ * Given two prompts (baseline + variant) and a dataset of historical
5
+ * input/expected pairs, run each prompt through the same model and
6
+ * report mean scores plus the aggregate delta. Used to gate prompt
7
+ * cleanup / compression changes before they ship: a variant that
8
+ * regresses against the historical reference indicates a behavioral
9
+ * change, not a pure cosmetic edit.
10
+ *
11
+ * The harness is a thin wrapper over the native optimizer scorer
12
+ * (`createPromptScorer` from optimizers/scoring.ts) — that scorer is
13
+ * already designed for prompt-vs-dataset evaluation, just with a
14
+ * single prompt at a time. We invoke it twice and diff the results.
15
+ *
16
+ * Two scoring modes are supported:
17
+ *
18
+ * - `vs_historical` (default): each prompt is scored against the
19
+ * recorded `expectedOutput` (Jaccard token overlap by default,
20
+ * action-name match for the action_planner task). Cheap and
21
+ * deterministic. Both prompts are scored independently; the delta
22
+ * tells you whether the variant reproduces the historical output
23
+ * as well as the baseline does.
24
+ *
25
+ * - `pairwise`: run baseline on every example to capture v1 outputs,
26
+ * then run variant on the same inputs and compare v2 outputs to v1
27
+ * outputs directly (pairwise Jaccard). This answers "did the
28
+ * variant produce semantically equivalent output?", which is a
29
+ * stricter regression test than `vs_historical` because the latter
30
+ * is biased — historical outputs were likely produced by a prompt
31
+ * close to the baseline.
32
+ *
33
+ * No new model abstractions are introduced. Reuses:
34
+ * - `parseJsonlDataset()` from backends/native.ts (private — mirrored inline here to avoid exporting the training-backend parser)
35
+ * - `createRuntimeAdapter()` from optimizers/scoring.ts
36
+ * - `createPromptScorer()` from optimizers/scoring.ts
37
+ * - `scoreAgreement()` / `scorePlannerAction()` from optimizers/scoring.ts
38
+ *
39
+ * Cost note: N examples × 2 prompts = 2N model calls per run in
40
+ * `vs_historical` mode; same in `pairwise` mode (baseline outputs are
41
+ * captured once, variant once). Default temperature 0 for determinism.
42
+ */
43
+ import { type LlmAdapter, type OptimizationExample, type UseModelHandler } from "../optimizers/index.js";
44
+ import type { TrajectoryTrainingTask } from "./trajectory-task-datasets.js";
45
+ export type ScorerKind = "agreement" | "planner_action";
46
+ export type CompareMode = "vs_historical" | "pairwise";
47
+ export interface PromptComparisonInput {
48
+ /** System prompt under test as the baseline (often the current canonical prompt). */
49
+ baselinePrompt: string;
50
+ /** System prompt under test as the variant (e.g. caveman-compressed). */
51
+ variantPrompt: string;
52
+ /** Dataset of `(input, expectedOutput)` rows. Path to a JSONL file produced by `exportTrajectoryTaskDatasets`, or an in-memory array. */
53
+ dataset: string | OptimizationExample[];
54
+ /** Task hint — when `scorer` is omitted, `action_planner` selects `planner_action`; any other value (or none) selects `agreement`. */
55
+ task?: TrajectoryTrainingTask;
56
+ /** Force a specific scorer regardless of task. */
57
+ scorer?: ScorerKind;
58
+ /** Cap how many examples to score (handy for cheap previews): only the first `maxExamples` rows are used. A non-positive or non-numeric value means no cap. */
59
+ maxExamples?: number;
60
+ /** Compare mode: `vs_historical` (default) or `pairwise`. */
61
+ mode?: CompareMode;
62
+ /** Temperature passed to the adapter. Defaults to 0 for determinism. */
63
+ temperature?: number;
64
+ /** Max tokens per completion. Defaults to 512. */
65
+ maxTokens?: number;
66
+ /** Loose runtime shape — only `useModel` is required. Ignored when `adapter` is provided; otherwise required unless the `cerebras` training provider is selected via environment. */
67
+ runtime?: {
68
+ useModel: UseModelHandler;
69
+ };
70
+ /** Pre-built LLM adapter (tests, alternative providers). Takes precedence over `runtime`. */
71
+ adapter?: LlmAdapter;
72
+ }
73
+ export interface PromptComparisonResult {
74
+ baselineScore: number;
75
+ variantScore: number;
76
+ /** In `vs_historical` mode: `variantScore - baselineScore` (positive means variant is closer to reference). In `pairwise` mode: mean variant-vs-baseline agreement minus 1. */
77
+ delta: number;
78
+ /** In `vs_historical` mode: delta as a percentage of `baselineScore`, where 0 baseline collapses to 0 to avoid divide-by-zero. In `pairwise` mode: `delta * 100`. */
79
+ deltaPercent: number;
80
+ examplesScored: number;
81
+ scorer: ScorerKind;
82
+ mode: CompareMode;
83
+ /** True when the variant did not measurably regress (delta ≥ -tolerance). */
84
+ passed: boolean;
85
+ /** Tolerance applied to `passed`. Defaults to 0.02 (2 percentage points). */
86
+ tolerance: number;
87
+ }
88
+ /** Default tolerance: a variant is considered safe if its score is within
89
+ * 2 percentage points of the baseline. Tunable per call. */
90
+ export declare const DEFAULT_REGRESSION_TOLERANCE = 0.02;
91
+ /**
92
+ * Compare two prompts on the same dataset and report mean scores plus
93
+ * delta. Throws on dataset I/O errors; never throws for "variant is
94
+ * worse" — read `result.passed` for the gate decision.
95
+ */
96
+ export declare function comparePrompts(input: PromptComparisonInput): Promise<PromptComparisonResult>;
97
+ /** Render a result as a single-line summary suitable for CLI output. */
98
+ export declare function formatComparisonSummary(result: PromptComparisonResult): string;
99
+ //# sourceMappingURL=prompt-compare.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompt-compare.d.ts","sourceRoot":"","sources":["../../src/core/prompt-compare.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAyCG;AAGH,OAAO,EAGL,KAAK,UAAU,EACf,KAAK,mBAAmB,EAGxB,KAAK,eAAe,EACrB,MAAM,wBAAwB,CAAC;AAChC,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,+BAA+B,CAAC;AAE5E,MAAM,MAAM,UAAU,GAAG,WAAW,GAAG,gBAAgB,CAAC;AACxD,MAAM,MAAM,WAAW,GAAG,eAAe,GAAG,UAAU,CAAC;AAEvD,MAAM,WAAW,qBAAqB;IACpC,qFAAqF;IACrF,cAAc,EAAE,MAAM,CAAC;IACvB,yEAAyE;IACzE,aAAa,EAAE,MAAM,CAAC;IACtB,yIAAyI;IACzI,OAAO,EAAE,MAAM,GAAG,mBAAmB,EAAE,CAAC;IACxC,8FAA8F;IAC9F,IAAI,CAAC,EAAE,sBAAsB,CAAC;IAC9B,kDAAkD;IAClD,MAAM,CAAC,EAAE,UAAU,CAAC;IACpB,iEAAiE;IACjE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,6DAA6D;IAC7D,IAAI,CAAC,EAAE,WAAW,CAAC;IACnB,wEAAwE;IACxE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,kDAAkD;IAClD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,4FAA4F;IAC5F,OAAO,CAAC,EAAE;QAAE,QAAQ,EAAE,eAAe,CAAA;KAAE,CAAC;IACxC,4DAA4D;IAC5D,OAAO,CAAC,EAAE,UAAU,CAAC;CACtB;AAED,MAAM,WAAW,sBAAsB;IACrC,aAAa,EAAE,MAAM,CAAC;IACtB,YAAY,EAAE,MAAM,CAAC;IACrB,qFAAqF;IACrF,KAAK,EAAE,MAAM,CAAC;IACd,iFAAiF;IACjF,YAAY,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,MAAM,CAAC;IACvB,MAAM,EAAE,UAAU,CAAC;IACnB,IAAI,EAAE,WAAW,CAAC;IAClB,6EAA6E;IAC7E,MAAM,EAAE,OAAO,CAAC;IAChB,6EAA6E;IAC7E,SAAS,EAAE,MAAM,CAAC;CACnB;AAED;6DAC6D;AAC7D,eAAO,MAAM,4BAA4B,OAAO,CAAC;AAEjD;;;;GAIG;AACH,wBAAsB,cAAc,CAClC,KAAK,EAAE,qBAAqB,GAC3B,OAAO,CAAC,sBAAsB,CAAC,CAkDjC;AAsND,wEAAwE;AACxE,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,sBAAsB,GAC7B,MAAM,CAWR"}
@@ -0,0 +1,210 @@
1
+ import { existsSync, readFileSync } from "node:fs";
2
+ import {
3
+ createPromptScorer,
4
+ createRuntimeAdapter,
5
+ scoreAgreement,
6
+ scorePlannerAction
7
+ } from "../optimizers/index.js";
8
+ const DEFAULT_REGRESSION_TOLERANCE = 0.02;
9
/**
 * Compare a baseline prompt against a variant prompt on the same dataset.
 *
 * Flow:
 *  1. Load the dataset (path or in-memory array) and keep only the first
 *     `maxExamples` rows when a positive numeric cap is supplied.
 *  2. With zero usable rows, return the zeroed "nothing to compare" result
 *     without resolving an adapter (so no model is required).
 *  3. Pick the comparison function: `planner_action` when forced via
 *     `input.scorer` or implied by `task === "action_planner"`, otherwise
 *     `agreement`.
 *  4. In `pairwise` mode delegate to `runPairwise`; otherwise score each
 *     prompt against the recorded expected outputs and diff the two means.
 *
 * @param {object} input Comparison request (see `PromptComparisonInput`).
 * @param {number} [input.tolerance] Optional per-call regression tolerance.
 *   Must be a finite, non-negative number; anything else (including
 *   omission) falls back to `DEFAULT_REGRESSION_TOLERANCE`.
 * @returns {Promise<object>} Comparison result; read `passed` for the gate
 *   decision.
 * @throws {Error} When the dataset path does not exist, or when no adapter
 *   can be resolved for a non-empty dataset.
 */
async function comparePrompts(input) {
  const examples = loadDataset(input.dataset);
  const cap =
    typeof input.maxExamples === "number" && input.maxExamples > 0
      ? Math.min(input.maxExamples, examples.length)
      : examples.length;
  const limited = examples.slice(0, cap);

  // Previously hard-coded; now tunable per call while keeping the same
  // default for every existing caller.
  const tolerance =
    typeof input.tolerance === "number" &&
    Number.isFinite(input.tolerance) &&
    input.tolerance >= 0
      ? input.tolerance
      : DEFAULT_REGRESSION_TOLERANCE;

  if (limited.length === 0) {
    // Report the tolerance that was actually requested for this call.
    return { ...emptyResult(input), tolerance };
  }

  const adapter = await resolveAdapter(input);
  const scorerKind =
    input.scorer ??
    (input.task === "action_planner" ? "planner_action" : "agreement");
  const compare =
    scorerKind === "planner_action" ? scorePlannerAction : scoreAgreement;
  const mode = input.mode ?? "vs_historical";
  const temperature = input.temperature ?? 0;
  const maxTokens = input.maxTokens ?? 512;

  if (mode === "pairwise") {
    return runPairwise({
      adapter,
      baselinePrompt: input.baselinePrompt,
      variantPrompt: input.variantPrompt,
      examples: limited,
      compare,
      scorerKind,
      temperature,
      maxTokens,
      tolerance,
    });
  }

  const scorer = createPromptScorer(adapter, {
    compare,
    temperature,
    maxTokens,
  });
  // Baseline is scored first, then the variant, on the identical example set.
  const baselineScore = await scorer(input.baselinePrompt, limited);
  const variantScore = await scorer(input.variantPrompt, limited);
  return finalize({
    baselineScore,
    variantScore,
    examplesScored: limited.length,
    scorerKind,
    mode,
    tolerance,
  });
}
49
/**
 * Turn the `dataset` argument into an array of examples.
 *
 * A non-string argument is handed back untouched (same reference). A string
 * is treated as a path to a JSONL file: blank lines are ignored and every
 * remaining line is converted with `jsonlLineToExample`; lines that do not
 * convert are dropped.
 *
 * @param {string | object[]} dataset File path or in-memory examples.
 * @returns {object[]} The examples to score.
 * @throws {Error} When `dataset` is a path that does not exist.
 */
function loadDataset(dataset) {
  const isPath = typeof dataset === "string";
  if (!isPath) {
    return dataset;
  }
  if (!existsSync(dataset)) {
    throw new Error(`[prompt-compare] dataset not found at ${dataset}`);
  }

  const nonBlankLines = readFileSync(dataset, "utf-8")
    .split("\n")
    .filter((text) => text.trim().length > 0);

  // The position passed along counts every non-blank line, including the
  // ones that fail to convert, so row ids track the file's non-blank lines.
  return nonBlankLines.reduce((collected, text, position) => {
    const converted = jsonlLineToExample(text, position);
    if (converted) {
      collected.push(converted);
    }
    return collected;
  }, []);
}
65
/**
 * Convert one JSONL line into an example, or `null` when the line cannot be
 * used.
 *
 * Extraction rules:
 *  - system: `request.system` when it is a non-empty string, otherwise the
 *    first `system` message with string content.
 *  - user: all `user` messages with string content joined by newlines,
 *    falling back to `request.prompt` when there are none.
 *  - expected: the last `assistant` message with string content, overridden
 *    by `response.text` (non-empty string) or, failing that, by a JSON
 *    rendering of `response.toolCalls` when that is an array.
 *
 * Malformed rows are skipped rather than thrown: invalid JSON, JSON that is
 * not an object (e.g. a bare `null`), a `messages` value that is not an
 * array, and message entries that are not objects all resolve gracefully so
 * that one bad row cannot abort loading the whole dataset.
 *
 * @param {string} line Raw JSONL line.
 * @param {number} index Position used to build the example id (`row-<index>`).
 * @returns {{id: string, input: {system: (string|undefined), user: string}, expectedOutput: string} | null}
 */
function jsonlLineToExample(line, index) {
  let parsed;
  try {
    parsed = JSON.parse(line);
  } catch {
    // Deliberate best-effort: unparseable lines are skipped by the caller.
    return null;
  }
  // JSON.parse accepts `null` and primitives; only an object can carry a row.
  if (parsed === null || typeof parsed !== "object") return null;

  const request = parsed.request;
  // Guard against a non-iterable `messages` value, which would otherwise throw.
  const messages = Array.isArray(request?.messages) ? request.messages : [];

  let system;
  let user;
  let expected;
  if (typeof request?.system === "string" && request.system) {
    system = request.system;
  }
  for (const msg of messages) {
    // Every branch below needs string content; skip anything else up front
    // (this also covers `null` entries).
    if (msg === null || typeof msg !== "object") continue;
    if (typeof msg.content !== "string") continue;
    if (!system && msg.role === "system") {
      system = msg.content;
    }
    if (msg.role === "user") {
      user = user ? `${user}\n${msg.content}` : msg.content;
    }
    if (msg.role === "assistant") {
      expected = msg.content;
    }
  }
  if (!user && typeof request?.prompt === "string") {
    user = request.prompt;
  }
  if (parsed.response) {
    if (typeof parsed.response.text === "string" && parsed.response.text) {
      expected = parsed.response.text;
    } else if (Array.isArray(parsed.response.toolCalls)) {
      expected = JSON.stringify({ toolCalls: parsed.response.toolCalls });
    }
  }
  if (!user || !expected) return null;
  return {
    id: `row-${index}`,
    input: { system, user },
    expectedOutput: expected,
  };
}
107
/**
 * Choose the LLM adapter used for a comparison run.
 *
 * Precedence:
 *  1. `input.adapter` when provided (returned as-is).
 *  2. The Cerebras training adapter when the training provider environment
 *     setting is exactly `cerebras` (`TRAIN_MODEL_PROVIDER`, falling back to
 *     `TRAINING_PROVIDER`); the module is imported lazily.
 *  3. An adapter built from `input.runtime.useModel`.
 *
 * @param {object} input Comparison request.
 * @returns {Promise<object>} The adapter to run completions through.
 * @throws {Error} When none of the three sources is available.
 */
async function resolveAdapter(input) {
  if (input.adapter) return input.adapter;

  // `||` rather than `??`: after trimming, an empty or whitespace-only
  // TRAIN_MODEL_PROVIDER is "" (not nullish) and would otherwise mask a
  // valid TRAINING_PROVIDER fallback.
  const trainProvider =
    process.env.TRAIN_MODEL_PROVIDER?.trim() ||
    process.env.TRAINING_PROVIDER?.trim();

  if (trainProvider === "cerebras") {
    const { getTrainingUseModelAdapter } = await import(
      "./cerebras-eval-model.js"
    );
    return createRuntimeAdapter(getTrainingUseModelAdapter());
  }
  if (!input.runtime) {
    throw new Error(
      "[prompt-compare] either `runtime` or `adapter` must be provided",
    );
  }
  return createRuntimeAdapter(input.runtime.useModel);
}
121
/**
 * Pairwise comparison: for every example, generate one output with the
 * baseline prompt and one with the variant prompt (in that order), then
 * accumulate three comparisons — baseline vs. expected, variant vs.
 * expected, and variant vs. baseline.
 *
 * The returned `baselineScore` / `variantScore` are the mean scores against
 * the expected output, while `delta`, `deltaPercent` and `passed` are
 * derived from the mean variant-vs-baseline agreement: `delta` is that
 * agreement minus 1, and the run passes when agreement plus the tolerance
 * reaches 1.
 *
 * @param {object} input Adapter, both prompts, examples, compare function,
 *   scorer kind, sampling settings and tolerance.
 * @returns {Promise<object>} Comparison result with `mode: "pairwise"`.
 */
async function runPairwise(input) {
  const totals = {
    baselineVsReference: 0,
    variantVsReference: 0,
    variantVsBaseline: 0,
  };

  // One completion for the given system prompt on the example's user input.
  const generate = (systemPrompt, example) =>
    input.adapter.complete({
      system: systemPrompt,
      user: example.input.user,
      temperature: input.temperature,
      maxTokens: input.maxTokens,
    });

  for (const example of input.examples) {
    const fromBaseline = await generate(input.baselinePrompt, example);
    const fromVariant = await generate(input.variantPrompt, example);
    totals.baselineVsReference += input.compare(
      fromBaseline,
      example.expectedOutput,
    );
    totals.variantVsReference += input.compare(
      fromVariant,
      example.expectedOutput,
    );
    totals.variantVsBaseline += input.compare(fromVariant, fromBaseline);
  }

  const count = input.examples.length;
  const againstReference = finalize({
    baselineScore: totals.baselineVsReference / count,
    variantScore: totals.variantVsReference / count,
    examplesScored: count,
    scorerKind: input.scorerKind,
    mode: "pairwise",
    tolerance: input.tolerance,
  });

  // Perfect agreement is 1, so the shortfall from 1 is reported as the delta.
  const agreement = totals.variantVsBaseline / count;
  return {
    ...againstReference,
    delta: agreement - 1,
    deltaPercent: (agreement - 1) * 100,
    passed: agreement + input.tolerance >= 1,
  };
}
164
/**
 * Build the comparison result from two mean scores.
 *
 * `delta` is `variantScore - baselineScore`; `deltaPercent` expresses it
 * relative to the baseline and collapses to 0 when the baseline is 0 to
 * avoid dividing by zero. `passed` is true when the variant is no more than
 * `tolerance` below the baseline (`delta >= -tolerance`).
 *
 * @param {object} input
 * @param {number} input.baselineScore Mean score of the baseline prompt.
 * @param {number} input.variantScore Mean score of the variant prompt.
 * @param {number} input.examplesScored Number of examples behind the means.
 * @param {string} input.scorerKind Scorer label, reported as `scorer`.
 * @param {string} input.mode Compare mode label.
 * @param {number} input.tolerance Allowed regression before failing.
 * @returns {object} Result with scores, deltas, gate decision and tolerance.
 */
function finalize(input) {
  // Binary floating point cannot represent most decimal fractions exactly:
  // 0.48 - 0.5 + 0.02 evaluates to about -1.7e-17 rather than 0, which would
  // fail a variant sitting exactly on the tolerance boundary. A tiny slack
  // keeps the documented `delta >= -tolerance` contract at that boundary.
  const roundingSlack = 1e-12;

  const delta = input.variantScore - input.baselineScore;
  const deltaPercent =
    input.baselineScore === 0 ? 0 : (delta / input.baselineScore) * 100;
  return {
    baselineScore: input.baselineScore,
    variantScore: input.variantScore,
    delta,
    deltaPercent,
    examplesScored: input.examplesScored,
    scorer: input.scorerKind,
    mode: input.mode,
    passed: delta + input.tolerance >= -roundingSlack,
    tolerance: input.tolerance,
  };
}
179
/**
 * Result returned when there is nothing to score: every numeric field is 0,
 * `passed` is true, and the scorer / mode labels are resolved from the input
 * the same way a real run would resolve them.
 *
 * @param {object} input Comparison request (only `scorer`, `task` and `mode`
 *   are read).
 * @returns {object} Zeroed comparison result using the default tolerance.
 */
function emptyResult(input) {
  const taskDerivedScorer =
    input.task === "action_planner" ? "planner_action" : "agreement";
  const zeroedScores = {
    baselineScore: 0,
    variantScore: 0,
    delta: 0,
    deltaPercent: 0,
    examplesScored: 0,
  };
  return {
    ...zeroedScores,
    // An explicit scorer wins over the task-derived one.
    scorer: input.scorer ?? taskDerivedScorer,
    mode: input.mode ?? "vs_historical",
    passed: true,
    tolerance: DEFAULT_REGRESSION_TOLERANCE,
  };
}
193
/**
 * Render a comparison result as one space-separated line for CLI output:
 * verdict, mode, scorer, example count, both scores (4 decimals), the delta
 * (4 decimals) with its percentage (2 decimals), and the tolerance.
 *
 * Non-negative deltas get an explicit `+`; negative values rely on the minus
 * sign produced by number formatting. The same prefix is applied to the
 * percentage.
 *
 * @param {object} result Comparison result.
 * @returns {string} Single-line summary.
 */
function formatComparisonSummary(result) {
  let sign = "";
  if (result.delta >= 0) {
    sign = "+";
  }
  const verdict = result.passed ? "PASS" : "FAIL";
  const absoluteDelta = `${sign}${result.delta.toFixed(4)}`;
  const relativeDelta = `${sign}${result.deltaPercent.toFixed(2)}%`;

  const header = `[prompt-compare] ${verdict} mode=${result.mode} scorer=${result.scorer}`;
  const counts = `n=${result.examplesScored}`;
  const scores = `baseline=${result.baselineScore.toFixed(4)} variant=${result.variantScore.toFixed(4)}`;
  const change = `delta=${absoluteDelta} (${relativeDelta})`;
  const gate = `tolerance=${result.tolerance}`;

  return `${header} ${counts} ${scores} ${change} ${gate}`;
}
205
+ export {
206
+ DEFAULT_REGRESSION_TOLERANCE,
207
+ comparePrompts,
208
+ formatComparisonSummary
209
+ };
210
+ //# sourceMappingURL=prompt-compare.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/core/prompt-compare.ts"],"sourcesContent":["/**\n * Prompt A/B comparison harness.\n *\n * Given two prompts (baseline + variant) and a dataset of historical\n * input/expected pairs, run each prompt through the same model and\n * report mean scores plus per-example deltas. Used to gate prompt\n * cleanup / compression changes before they ship: a variant that\n * regresses against the historical reference indicates a behavioral\n * change, not a pure cosmetic edit.\n *\n * The harness is a thin wrapper over the native optimizer scorer\n * (`createPromptScorer` from optimizers/scoring.ts) — that scorer is\n * already designed for prompt-vs-dataset evaluation, just with a\n * single prompt at a time. We invoke it twice and diff the results.\n *\n * Two scoring modes are supported:\n *\n * - `vs_historical` (default): each prompt is scored against the\n * recorded `expectedOutput` (Jaccard token overlap by default,\n * action-name match for the action_planner task). Cheap and\n * deterministic. Both prompts are scored independently; the delta\n * tells you whether the variant reproduces the historical output\n * as well as the baseline does.\n *\n * - `pairwise`: run baseline on every example to capture v1 outputs,\n * then run variant on the same inputs and compare v2 outputs to v1\n * outputs directly (pairwise Jaccard). This answers \"did the\n * variant produce semantically equivalent output?\", which is a\n * stricter regression test than `vs_historical` because the latter\n * is biased — historical outputs were likely produced by a prompt\n * close to the baseline.\n *\n * No new model abstractions are introduced. 
Reuses:\n * - `parseJsonlDataset()` from backends/native.ts (private — mirrored inline here to avoid exporting the training-backend parser)\n * - `createRuntimeAdapter()` from optimizers/scoring.ts\n * - `createPromptScorer()` from optimizers/scoring.ts\n * - `scoreAgreement()` / `scorePlannerAction()` from optimizers/scoring.ts\n *\n * Cost note: N examples × 2 prompts = 2N model calls per run in\n * `vs_historical` mode; same in `pairwise` mode (baseline outputs are\n * captured once, variant once). Default temperature 0 for determinism.\n */\n\nimport { existsSync, readFileSync } from \"node:fs\";\nimport {\n createPromptScorer,\n createRuntimeAdapter,\n type LlmAdapter,\n type OptimizationExample,\n scoreAgreement,\n scorePlannerAction,\n type UseModelHandler,\n} from \"../optimizers/index.js\";\nimport type { TrajectoryTrainingTask } from \"./trajectory-task-datasets.js\";\n\nexport type ScorerKind = \"agreement\" | \"planner_action\";\nexport type CompareMode = \"vs_historical\" | \"pairwise\";\n\nexport interface PromptComparisonInput {\n /** System prompt under test as the baseline (often the current canonical prompt). */\n baselinePrompt: string;\n /** System prompt under test as the variant (e.g. caveman-compressed). */\n variantPrompt: string;\n /** Dataset of `(input, expectedOutput)` rows. Path to a JSONL file produced by `exportTrajectoryTaskDatasets`, or an in-memory array. */\n dataset: string | OptimizationExample[];\n /** Task hint — selects the right scorer when `scorer` is omitted. Defaults to `agreement`. */\n task?: TrajectoryTrainingTask;\n /** Force a specific scorer regardless of task. */\n scorer?: ScorerKind;\n /** Cap how many examples to score (handy for cheap previews). */\n maxExamples?: number;\n /** Compare mode: `vs_historical` (default) or `pairwise`. */\n mode?: CompareMode;\n /** Temperature passed to the adapter. Defaults to 0 for determinism. */\n temperature?: number;\n /** Max tokens per completion. Defaults to 512. 
*/\n maxTokens?: number;\n /** Loose runtime shape — only `useModel` is required. Mutually exclusive with `adapter`. */\n runtime?: { useModel: UseModelHandler };\n /** Pre-built LLM adapter (tests, alternative providers). */\n adapter?: LlmAdapter;\n}\n\nexport interface PromptComparisonResult {\n baselineScore: number;\n variantScore: number;\n /** `variantScore - baselineScore`. Positive means variant is closer to reference. */\n delta: number;\n /** Percentage delta, where 0 baseline collapses to 0 to avoid divide-by-zero. */\n deltaPercent: number;\n examplesScored: number;\n scorer: ScorerKind;\n mode: CompareMode;\n /** True when the variant did not measurably regress (delta ≥ -tolerance). */\n passed: boolean;\n /** Tolerance applied to `passed`. Defaults to 0.02 (2 percentage points). */\n tolerance: number;\n}\n\n/** Default tolerance: a variant is considered safe if its score is within\n * 2 percentage points of the baseline. Tunable per call. */\nexport const DEFAULT_REGRESSION_TOLERANCE = 0.02;\n\n/**\n * Compare two prompts on the same dataset and report mean scores plus\n * delta. Throws on dataset I/O errors; never throws for \"variant is\n * worse\" — read `result.passed` for the gate decision.\n */\nexport async function comparePrompts(\n input: PromptComparisonInput,\n): Promise<PromptComparisonResult> {\n const examples = loadDataset(input.dataset);\n const cap =\n typeof input.maxExamples === \"number\" && input.maxExamples > 0\n ? Math.min(input.maxExamples, examples.length)\n : examples.length;\n const limited = examples.slice(0, cap);\n\n if (limited.length === 0) {\n return emptyResult(input);\n }\n\n const adapter = await resolveAdapter(input);\n const scorerKind: ScorerKind =\n input.scorer ??\n (input.task === \"action_planner\" ? \"planner_action\" : \"agreement\");\n const compare =\n scorerKind === \"planner_action\" ? scorePlannerAction : scoreAgreement;\n const mode: CompareMode = input.mode ?? 
\"vs_historical\";\n\n if (mode === \"pairwise\") {\n return runPairwise({\n adapter,\n baselinePrompt: input.baselinePrompt,\n variantPrompt: input.variantPrompt,\n examples: limited,\n compare,\n scorerKind,\n temperature: input.temperature ?? 0,\n maxTokens: input.maxTokens ?? 512,\n tolerance: DEFAULT_REGRESSION_TOLERANCE,\n });\n }\n\n const scorer = createPromptScorer(adapter, {\n compare,\n temperature: input.temperature ?? 0,\n maxTokens: input.maxTokens ?? 512,\n });\n const baselineScore = await scorer(input.baselinePrompt, limited);\n const variantScore = await scorer(input.variantPrompt, limited);\n\n return finalize({\n baselineScore,\n variantScore,\n examplesScored: limited.length,\n scorerKind,\n mode,\n tolerance: DEFAULT_REGRESSION_TOLERANCE,\n });\n}\n\nfunction loadDataset(\n dataset: string | OptimizationExample[],\n): OptimizationExample[] {\n if (typeof dataset !== \"string\") return dataset;\n if (!existsSync(dataset)) {\n throw new Error(`[prompt-compare] dataset not found at ${dataset}`);\n }\n const raw = readFileSync(dataset, \"utf-8\");\n const lines = raw.split(\"\\n\").filter((line) => line.trim().length > 0);\n const examples: OptimizationExample[] = [];\n let index = 0;\n for (const line of lines) {\n const example = jsonlLineToExample(line, index);\n if (example) examples.push(example);\n index += 1;\n }\n return examples;\n}\n\ninterface JsonlMessage {\n role: \"system\" | \"developer\" | \"user\" | \"assistant\" | \"tool\";\n content: string;\n}\n\ninterface JsonlRow {\n format?: string;\n request?: { system?: string; prompt?: string; messages?: JsonlMessage[] };\n response?: { text?: string; toolCalls?: unknown[] };\n}\n\n/** Parse one `eliza_native_v1` row to an OptimizationExample. Mirrors\n * `rowToExample()` in backends/native.ts; copied here to avoid an\n * import cycle and to accept rows that don't carry the `boundary`\n * field (older exports). 
*/\nfunction jsonlLineToExample(\n line: string,\n index: number,\n): OptimizationExample | null {\n let parsed: JsonlRow;\n try {\n parsed = JSON.parse(line) as JsonlRow;\n } catch {\n return null;\n }\n let system: string | undefined;\n let user: string | undefined;\n let expected: string | undefined;\n if (typeof parsed.request?.system === \"string\" && parsed.request.system) {\n system = parsed.request.system;\n }\n for (const msg of parsed.request?.messages ?? []) {\n if (!system && msg.role === \"system\" && typeof msg.content === \"string\") {\n system = msg.content;\n }\n if (msg.role === \"user\" && typeof msg.content === \"string\") {\n user = user ? `${user}\\n${msg.content}` : msg.content;\n }\n if (msg.role === \"assistant\" && typeof msg.content === \"string\") {\n expected = msg.content;\n }\n }\n if (!user && typeof parsed.request?.prompt === \"string\") {\n user = parsed.request.prompt;\n }\n if (parsed.response) {\n if (typeof parsed.response.text === \"string\" && parsed.response.text) {\n expected = parsed.response.text;\n } else if (Array.isArray(parsed.response.toolCalls)) {\n expected = JSON.stringify({ toolCalls: parsed.response.toolCalls });\n }\n }\n if (!user || !expected) return null;\n return {\n id: `row-${index}`,\n input: { system, user },\n expectedOutput: expected,\n };\n}\n\nasync function resolveAdapter(\n input: PromptComparisonInput,\n): Promise<LlmAdapter> {\n if (input.adapter) return input.adapter;\n // Standing direction: training-side comparison runs on Cerebras\n // gpt-oss-120b unless the operator passes their own adapter.\n const trainProvider =\n process.env.TRAIN_MODEL_PROVIDER?.trim() ??\n process.env.TRAINING_PROVIDER?.trim();\n if (trainProvider === \"cerebras\") {\n const { getTrainingUseModelAdapter } = await import(\n \"./cerebras-eval-model.js\"\n );\n return createRuntimeAdapter(getTrainingUseModelAdapter());\n }\n if (!input.runtime) {\n throw new Error(\n \"[prompt-compare] either `runtime` or `adapter` must 
be provided\",\n );\n }\n return createRuntimeAdapter(input.runtime.useModel);\n}\n\ninterface PairwiseInput {\n adapter: LlmAdapter;\n baselinePrompt: string;\n variantPrompt: string;\n examples: OptimizationExample[];\n compare: (actual: string, expected: string) => number;\n scorerKind: ScorerKind;\n temperature: number;\n maxTokens: number;\n tolerance: number;\n}\n\n/** Pairwise mode: capture baseline outputs, then compare variant\n * outputs to those captured baselines. Both `baselineScore` and\n * `variantScore` are reported as similarity-to-historical (same as\n * vs_historical mode) so the two modes report a comparable axis;\n * `delta` here additionally reflects mean pairwise self-similarity\n * via the same compare function, which is its strength as a\n * regression test. */\nasync function runPairwise(\n input: PairwiseInput,\n): Promise<PromptComparisonResult> {\n let baselineToReference = 0;\n let variantToReference = 0;\n let variantToBaseline = 0;\n for (const example of input.examples) {\n const baselineOutput = await input.adapter.complete({\n system: input.baselinePrompt,\n user: example.input.user,\n temperature: input.temperature,\n maxTokens: input.maxTokens,\n });\n const variantOutput = await input.adapter.complete({\n system: input.variantPrompt,\n user: example.input.user,\n temperature: input.temperature,\n maxTokens: input.maxTokens,\n });\n baselineToReference += input.compare(\n baselineOutput,\n example.expectedOutput,\n );\n variantToReference += input.compare(variantOutput, example.expectedOutput);\n variantToBaseline += input.compare(variantOutput, baselineOutput);\n }\n const n = input.examples.length;\n const baselineScore = baselineToReference / n;\n const variantScore = variantToReference / n;\n const result = finalize({\n baselineScore,\n variantScore,\n examplesScored: n,\n scorerKind: input.scorerKind,\n mode: \"pairwise\",\n tolerance: input.tolerance,\n });\n // Replace delta with the pairwise self-similarity signal; 
deltaPercent\n // becomes the gap between variant→baseline similarity and 1.0.\n const pairwise = variantToBaseline / n;\n return {\n ...result,\n delta: pairwise - 1,\n deltaPercent: (pairwise - 1) * 100,\n passed: pairwise + input.tolerance >= 1,\n };\n}\n\ninterface FinalizeInput {\n baselineScore: number;\n variantScore: number;\n examplesScored: number;\n scorerKind: ScorerKind;\n mode: CompareMode;\n tolerance: number;\n}\n\nfunction finalize(input: FinalizeInput): PromptComparisonResult {\n const delta = input.variantScore - input.baselineScore;\n const deltaPercent =\n input.baselineScore === 0 ? 0 : (delta / input.baselineScore) * 100;\n return {\n baselineScore: input.baselineScore,\n variantScore: input.variantScore,\n delta,\n deltaPercent,\n examplesScored: input.examplesScored,\n scorer: input.scorerKind,\n mode: input.mode,\n passed: delta + input.tolerance >= 0,\n tolerance: input.tolerance,\n };\n}\n\nfunction emptyResult(input: PromptComparisonInput): PromptComparisonResult {\n const scorer: ScorerKind =\n input.scorer ??\n (input.task === \"action_planner\" ? \"planner_action\" : \"agreement\");\n return {\n baselineScore: 0,\n variantScore: 0,\n delta: 0,\n deltaPercent: 0,\n examplesScored: 0,\n scorer,\n mode: input.mode ?? \"vs_historical\",\n passed: true,\n tolerance: DEFAULT_REGRESSION_TOLERANCE,\n };\n}\n\n/** Render a result as a single-line summary suitable for CLI output. */\nexport function formatComparisonSummary(\n result: PromptComparisonResult,\n): string {\n const sign = result.delta >= 0 ? \"+\" : \"\";\n const verdict = result.passed ? 
\"PASS\" : \"FAIL\";\n return [\n `[prompt-compare] ${verdict} mode=${result.mode} scorer=${result.scorer}`,\n `n=${result.examplesScored}`,\n `baseline=${result.baselineScore.toFixed(4)}`,\n `variant=${result.variantScore.toFixed(4)}`,\n `delta=${sign}${result.delta.toFixed(4)} (${sign}${result.deltaPercent.toFixed(2)}%)`,\n `tolerance=${result.tolerance}`,\n ].join(\" \");\n}\n"],"mappings":"AA2CA,SAAS,YAAY,oBAAoB;AACzC;AAAA,EACE;AAAA,EACA;AAAA,EAGA;AAAA,EACA;AAAA,OAEK;AAiDA,MAAM,+BAA+B;AAO5C,eAAsB,eACpB,OACiC;AACjC,QAAM,WAAW,YAAY,MAAM,OAAO;AAC1C,QAAM,MACJ,OAAO,MAAM,gBAAgB,YAAY,MAAM,cAAc,IACzD,KAAK,IAAI,MAAM,aAAa,SAAS,MAAM,IAC3C,SAAS;AACf,QAAM,UAAU,SAAS,MAAM,GAAG,GAAG;AAErC,MAAI,QAAQ,WAAW,GAAG;AACxB,WAAO,YAAY,KAAK;AAAA,EAC1B;AAEA,QAAM,UAAU,MAAM,eAAe,KAAK;AAC1C,QAAM,aACJ,MAAM,WACL,MAAM,SAAS,mBAAmB,mBAAmB;AACxD,QAAM,UACJ,eAAe,mBAAmB,qBAAqB;AACzD,QAAM,OAAoB,MAAM,QAAQ;AAExC,MAAI,SAAS,YAAY;AACvB,WAAO,YAAY;AAAA,MACjB;AAAA,MACA,gBAAgB,MAAM;AAAA,MACtB,eAAe,MAAM;AAAA,MACrB,UAAU;AAAA,MACV;AAAA,MACA;AAAA,MACA,aAAa,MAAM,eAAe;AAAA,MAClC,WAAW,MAAM,aAAa;AAAA,MAC9B,WAAW;AAAA,IACb,CAAC;AAAA,EACH;AAEA,QAAM,SAAS,mBAAmB,SAAS;AAAA,IACzC;AAAA,IACA,aAAa,MAAM,eAAe;AAAA,IAClC,WAAW,MAAM,aAAa;AAAA,EAChC,CAAC;AACD,QAAM,gBAAgB,MAAM,OAAO,MAAM,gBAAgB,OAAO;AAChE,QAAM,eAAe,MAAM,OAAO,MAAM,eAAe,OAAO;AAE9D,SAAO,SAAS;AAAA,IACd;AAAA,IACA;AAAA,IACA,gBAAgB,QAAQ;AAAA,IACxB;AAAA,IACA;AAAA,IACA,WAAW;AAAA,EACb,CAAC;AACH;AAEA,SAAS,YACP,SACuB;AACvB,MAAI,OAAO,YAAY,SAAU,QAAO;AACxC,MAAI,CAAC,WAAW,OAAO,GAAG;AACxB,UAAM,IAAI,MAAM,yCAAyC,OAAO,EAAE;AAAA,EACpE;AACA,QAAM,MAAM,aAAa,SAAS,OAAO;AACzC,QAAM,QAAQ,IAAI,MAAM,IAAI,EAAE,OAAO,CAAC,SAAS,KAAK,KAAK,EAAE,SAAS,CAAC;AACrE,QAAM,WAAkC,CAAC;AACzC,MAAI,QAAQ;AACZ,aAAW,QAAQ,OAAO;AACxB,UAAM,UAAU,mBAAmB,MAAM,KAAK;AAC9C,QAAI,QAAS,UAAS,KAAK,OAAO;AAClC,aAAS;AAAA,EACX;AACA,SAAO;AACT;AAiBA,SAAS,mBACP,MACA,OAC4B;AAC5B,MAAI;AACJ,MAAI;AACF,aAAS,KAAK,MAAM,IAAI;AAAA,EAC1B,QAAQ;AACN,WAAO;AAAA,EACT;AACA,MAAI;AACJ,MAAI;AACJ,MAAI;AACJ,MAAI,OAAO,OAAO,SAAS,WAAW,YAAY,OAAO,QAAQ,QAAQ;AACvE,aAAS,O
AAO,QAAQ;AAAA,EAC1B;AACA,aAAW,OAAO,OAAO,SAAS,YAAY,CAAC,GAAG;AAChD,QAAI,CAAC,UAAU,IAAI,SAAS,YAAY,OAAO,IAAI,YAAY,UAAU;AACvE,eAAS,IAAI;AAAA,IACf;AACA,QAAI,IAAI,SAAS,UAAU,OAAO,IAAI,YAAY,UAAU;AAC1D,aAAO,OAAO,GAAG,IAAI;AAAA,EAAK,IAAI,OAAO,KAAK,IAAI;AAAA,IAChD;AACA,QAAI,IAAI,SAAS,eAAe,OAAO,IAAI,YAAY,UAAU;AAC/D,iBAAW,IAAI;AAAA,IACjB;AAAA,EACF;AACA,MAAI,CAAC,QAAQ,OAAO,OAAO,SAAS,WAAW,UAAU;AACvD,WAAO,OAAO,QAAQ;AAAA,EACxB;AACA,MAAI,OAAO,UAAU;AACnB,QAAI,OAAO,OAAO,SAAS,SAAS,YAAY,OAAO,SAAS,MAAM;AACpE,iBAAW,OAAO,SAAS;AAAA,IAC7B,WAAW,MAAM,QAAQ,OAAO,SAAS,SAAS,GAAG;AACnD,iBAAW,KAAK,UAAU,EAAE,WAAW,OAAO,SAAS,UAAU,CAAC;AAAA,IACpE;AAAA,EACF;AACA,MAAI,CAAC,QAAQ,CAAC,SAAU,QAAO;AAC/B,SAAO;AAAA,IACL,IAAI,OAAO,KAAK;AAAA,IAChB,OAAO,EAAE,QAAQ,KAAK;AAAA,IACtB,gBAAgB;AAAA,EAClB;AACF;AAEA,eAAe,eACb,OACqB;AACrB,MAAI,MAAM,QAAS,QAAO,MAAM;AAGhC,QAAM,gBACJ,QAAQ,IAAI,sBAAsB,KAAK,KACvC,QAAQ,IAAI,mBAAmB,KAAK;AACtC,MAAI,kBAAkB,YAAY;AAChC,UAAM,EAAE,2BAA2B,IAAI,MAAM,OAC3C,0BACF;AACA,WAAO,qBAAqB,2BAA2B,CAAC;AAAA,EAC1D;AACA,MAAI,CAAC,MAAM,SAAS;AAClB,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,SAAO,qBAAqB,MAAM,QAAQ,QAAQ;AACpD;AAqBA,eAAe,YACb,OACiC;AACjC,MAAI,sBAAsB;AAC1B,MAAI,qBAAqB;AACzB,MAAI,oBAAoB;AACxB,aAAW,WAAW,MAAM,UAAU;AACpC,UAAM,iBAAiB,MAAM,MAAM,QAAQ,SAAS;AAAA,MAClD,QAAQ,MAAM;AAAA,MACd,MAAM,QAAQ,MAAM;AAAA,MACpB,aAAa,MAAM;AAAA,MACnB,WAAW,MAAM;AAAA,IACnB,CAAC;AACD,UAAM,gBAAgB,MAAM,MAAM,QAAQ,SAAS;AAAA,MACjD,QAAQ,MAAM;AAAA,MACd,MAAM,QAAQ,MAAM;AAAA,MACpB,aAAa,MAAM;AAAA,MACnB,WAAW,MAAM;AAAA,IACnB,CAAC;AACD,2BAAuB,MAAM;AAAA,MAC3B;AAAA,MACA,QAAQ;AAAA,IACV;AACA,0BAAsB,MAAM,QAAQ,eAAe,QAAQ,cAAc;AACzE,yBAAqB,MAAM,QAAQ,eAAe,cAAc;AAAA,EAClE;AACA,QAAM,IAAI,MAAM,SAAS;AACzB,QAAM,gBAAgB,sBAAsB;AAC5C,QAAM,eAAe,qBAAqB;AAC1C,QAAM,SAAS,SAAS;AAAA,IACtB;AAAA,IACA;AAAA,IACA,gBAAgB;AAAA,IAChB,YAAY,MAAM;AAAA,IAClB,MAAM;AAAA,IACN,WAAW,MAAM;AAAA,EACnB,CAAC;AAGD,QAAM,WAAW,oBAAoB;AACrC,SAAO;AAAA,IACL,GAAG;AAAA,IACH,OAAO,WAAW;AAAA,IAClB,eAAe,WAAW,KAAK;AAAA,IAC/B,QAAQ,WAAW,MAAM,aAAa;AAAA,EACxC;AACF;AAWA,SAAS,SAAS,OAA8C;AAC9D,QAAM,QAAQ,
MAAM,eAAe,MAAM;AACzC,QAAM,eACJ,MAAM,kBAAkB,IAAI,IAAK,QAAQ,MAAM,gBAAiB;AAClE,SAAO;AAAA,IACL,eAAe,MAAM;AAAA,IACrB,cAAc,MAAM;AAAA,IACpB;AAAA,IACA;AAAA,IACA,gBAAgB,MAAM;AAAA,IACtB,QAAQ,MAAM;AAAA,IACd,MAAM,MAAM;AAAA,IACZ,QAAQ,QAAQ,MAAM,aAAa;AAAA,IACnC,WAAW,MAAM;AAAA,EACnB;AACF;AAEA,SAAS,YAAY,OAAsD;AACzE,QAAM,SACJ,MAAM,WACL,MAAM,SAAS,mBAAmB,mBAAmB;AACxD,SAAO;AAAA,IACL,eAAe;AAAA,IACf,cAAc;AAAA,IACd,OAAO;AAAA,IACP,cAAc;AAAA,IACd,gBAAgB;AAAA,IAChB;AAAA,IACA,MAAM,MAAM,QAAQ;AAAA,IACpB,QAAQ;AAAA,IACR,WAAW;AAAA,EACb;AACF;AAGO,SAAS,wBACd,QACQ;AACR,QAAM,OAAO,OAAO,SAAS,IAAI,MAAM;AACvC,QAAM,UAAU,OAAO,SAAS,SAAS;AACzC,SAAO;AAAA,IACL,oBAAoB,OAAO,SAAS,OAAO,IAAI,WAAW,OAAO,MAAM;AAAA,IACvE,KAAK,OAAO,cAAc;AAAA,IAC1B,YAAY,OAAO,cAAc,QAAQ,CAAC,CAAC;AAAA,IAC3C,WAAW,OAAO,aAAa,QAAQ,CAAC,CAAC;AAAA,IACzC,SAAS,IAAI,GAAG,OAAO,MAAM,QAAQ,CAAC,CAAC,KAAK,IAAI,GAAG,OAAO,aAAa,QAAQ,CAAC,CAAC;AAAA,IACjF,aAAa,OAAO,SAAS;AAAA,EAC/B,EAAE,KAAK,GAAG;AACZ;","names":[]}