voratiq 0.1.0-beta.2 → 0.1.0-beta.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (527) hide show
  1. package/README.md +41 -29
  2. package/dist/agents/launch/chat.d.ts +23 -0
  3. package/dist/agents/launch/chat.js +44 -0
  4. package/dist/agents/launch/environment.d.ts +8 -0
  5. package/dist/{commands/run/agents/workspace-prep.js → agents/launch/environment.js} +5 -27
  6. package/dist/agents/launch/prompt.d.ts +6 -0
  7. package/dist/agents/launch/prompt.js +12 -0
  8. package/dist/agents/launch/provider-state.d.ts +39 -0
  9. package/dist/agents/launch/provider-state.js +103 -0
  10. package/dist/agents/runtime/auth.d.ts +27 -0
  11. package/dist/agents/runtime/auth.js +72 -0
  12. package/dist/agents/runtime/chat.d.ts +5 -0
  13. package/dist/agents/runtime/chat.js +7 -0
  14. package/dist/agents/runtime/errors.d.ts +27 -0
  15. package/dist/agents/runtime/errors.js +51 -0
  16. package/dist/{commands/run/agents → agents/runtime}/failures.d.ts +0 -1
  17. package/dist/agents/runtime/failures.js +136 -0
  18. package/dist/agents/runtime/harness.d.ts +2 -0
  19. package/dist/agents/runtime/harness.js +119 -0
  20. package/dist/{commands/run/agents/sandbox-launcher.d.ts → agents/runtime/launcher.d.ts} +18 -6
  21. package/dist/{commands/run/agents/sandbox-launcher.js → agents/runtime/launcher.js} +17 -39
  22. package/dist/{commands/run/agents/workspace-prep.d.ts → agents/runtime/manifest.d.ts} +6 -6
  23. package/dist/agents/runtime/manifest.js +34 -0
  24. package/dist/agents/runtime/policy.d.ts +32 -0
  25. package/dist/agents/runtime/policy.js +240 -0
  26. package/dist/agents/runtime/registry.d.ts +4 -0
  27. package/dist/agents/runtime/registry.js +54 -0
  28. package/dist/{commands/run → agents/runtime}/sandbox.d.ts +8 -2
  29. package/dist/{commands/run → agents/runtime}/sandbox.js +28 -67
  30. package/dist/agents/runtime/shim/run-agent-shim.d.ts +1 -0
  31. package/dist/agents/runtime/shim/run-agent-shim.js +276 -0
  32. package/dist/agents/runtime/types.d.ts +91 -0
  33. package/dist/{commands/run/agents → agents/runtime}/watchdog.d.ts +4 -3
  34. package/dist/{commands/run/agents → agents/runtime}/watchdog.js +155 -26
  35. package/dist/auth/providers/codex.js +7 -2
  36. package/dist/auth/providers/gemini.js +14 -6
  37. package/dist/auth/providers/types.d.ts +1 -0
  38. package/dist/auth/providers/utils.d.ts +0 -1
  39. package/dist/auth/providers/utils.js +1 -49
  40. package/dist/bin.js +369 -71
  41. package/dist/cli/apply.d.ts +4 -0
  42. package/dist/cli/apply.js +28 -9
  43. package/dist/cli/auto.d.ts +32 -0
  44. package/dist/cli/auto.js +232 -0
  45. package/dist/cli/contract.d.ts +328 -0
  46. package/dist/cli/contract.js +480 -0
  47. package/dist/cli/errors.d.ts +3 -0
  48. package/dist/cli/errors.js +21 -3
  49. package/dist/cli/init.d.ts +5 -0
  50. package/dist/cli/init.js +34 -6
  51. package/dist/cli/list.d.ts +6 -4
  52. package/dist/cli/list.js +39 -16
  53. package/dist/cli/mcp.d.ts +2 -0
  54. package/dist/cli/mcp.js +16 -0
  55. package/dist/cli/message.d.ts +28 -0
  56. package/dist/cli/message.js +147 -0
  57. package/dist/cli/operator-envelope.d.ts +180 -0
  58. package/dist/cli/operator-envelope.js +425 -0
  59. package/dist/cli/output.d.ts +15 -1
  60. package/dist/cli/output.js +153 -5
  61. package/dist/cli/prune.d.ts +7 -3
  62. package/dist/cli/prune.js +57 -12
  63. package/dist/cli/reduce.d.ts +29 -0
  64. package/dist/cli/reduce.js +211 -0
  65. package/dist/cli/root-launcher.d.ts +4 -0
  66. package/dist/cli/root-launcher.js +15 -0
  67. package/dist/cli/run.d.ts +27 -1
  68. package/dist/cli/run.js +108 -16
  69. package/dist/cli/spec.d.ts +31 -0
  70. package/dist/cli/spec.js +180 -0
  71. package/dist/cli/verify.d.ts +35 -0
  72. package/dist/cli/verify.js +297 -0
  73. package/dist/commands/apply/command.d.ts +2 -0
  74. package/dist/commands/apply/command.js +145 -6
  75. package/dist/commands/apply/errors.d.ts +43 -4
  76. package/dist/commands/apply/errors.js +100 -22
  77. package/dist/commands/apply/types.d.ts +2 -1
  78. package/dist/commands/auto/command.d.ts +145 -0
  79. package/dist/commands/auto/command.js +433 -0
  80. package/dist/commands/auto/errors.d.ts +19 -0
  81. package/dist/commands/auto/errors.js +19 -0
  82. package/dist/commands/auto/validation.d.ts +14 -0
  83. package/dist/commands/auto/validation.js +90 -0
  84. package/dist/commands/fetch.d.ts +2 -2
  85. package/dist/commands/fetch.js +4 -4
  86. package/dist/commands/init/agents.d.ts +2 -1
  87. package/dist/commands/init/agents.js +66 -63
  88. package/dist/commands/init/command.js +300 -16
  89. package/dist/commands/init/types.d.ts +18 -7
  90. package/dist/commands/interactive/lifecycle.d.ts +15 -0
  91. package/dist/commands/interactive/lifecycle.js +141 -0
  92. package/dist/commands/list/command.d.ts +10 -3
  93. package/dist/commands/list/command.js +597 -40
  94. package/dist/commands/message/command.d.ts +23 -0
  95. package/dist/commands/message/command.js +215 -0
  96. package/dist/commands/message/errors.d.ts +9 -0
  97. package/dist/commands/message/errors.js +20 -0
  98. package/dist/commands/message/lifecycle.d.ts +14 -0
  99. package/dist/commands/message/lifecycle.js +128 -0
  100. package/dist/commands/prune/command.d.ts +2 -1
  101. package/dist/commands/prune/command.js +61 -10
  102. package/dist/commands/prune/errors.d.ts +1 -1
  103. package/dist/commands/prune/errors.js +5 -5
  104. package/dist/commands/prune/types.d.ts +21 -0
  105. package/dist/commands/reduce/command.d.ts +26 -0
  106. package/dist/commands/reduce/command.js +145 -0
  107. package/dist/commands/reduce/errors.d.ts +17 -0
  108. package/dist/commands/reduce/errors.js +32 -0
  109. package/dist/commands/reduce/targets.d.ts +11 -0
  110. package/dist/commands/reduce/targets.js +271 -0
  111. package/dist/commands/root-launcher/command.d.ts +31 -0
  112. package/dist/commands/root-launcher/command.js +233 -0
  113. package/dist/commands/run/command.d.ts +6 -1
  114. package/dist/commands/run/command.js +70 -40
  115. package/dist/commands/run/lifecycle.d.ts +7 -5
  116. package/dist/commands/run/lifecycle.js +45 -23
  117. package/dist/commands/run/record-init.d.ts +4 -1
  118. package/dist/commands/run/record-init.js +5 -2
  119. package/dist/commands/run/shim/run-agent-shim.d.ts +2 -1
  120. package/dist/commands/run/shim/run-agent-shim.js +4 -219
  121. package/dist/commands/run/validation.d.ts +2 -3
  122. package/dist/commands/run/validation.js +54 -25
  123. package/dist/commands/shared/max-parallel.d.ts +5 -0
  124. package/dist/commands/shared/max-parallel.js +15 -0
  125. package/dist/commands/shared/preview.d.ts +10 -0
  126. package/dist/commands/shared/preview.js +60 -0
  127. package/dist/commands/shared/resolve-reduction-competitors.d.ts +15 -0
  128. package/dist/commands/shared/resolve-reduction-competitors.js +13 -0
  129. package/dist/commands/shared/resolve-stage-competitors.d.ts +19 -0
  130. package/dist/commands/shared/resolve-stage-competitors.js +171 -0
  131. package/dist/commands/shared/session-id.d.ts +1 -0
  132. package/dist/commands/shared/session-id.js +1 -0
  133. package/dist/commands/spec/command.d.ts +22 -0
  134. package/dist/commands/spec/command.js +330 -0
  135. package/dist/commands/spec/errors.d.ts +11 -0
  136. package/dist/commands/spec/errors.js +23 -0
  137. package/dist/commands/verify/agents.d.ts +8 -0
  138. package/dist/commands/verify/agents.js +29 -0
  139. package/dist/commands/verify/command.d.ts +23 -0
  140. package/dist/commands/verify/command.js +168 -0
  141. package/dist/commands/verify/lifecycle.d.ts +14 -0
  142. package/dist/commands/verify/lifecycle.js +229 -0
  143. package/dist/commands/verify/max-parallel.d.ts +7 -0
  144. package/dist/commands/verify/max-parallel.js +15 -0
  145. package/dist/commands/verify/targets.d.ts +18 -0
  146. package/dist/commands/verify/targets.js +420 -0
  147. package/dist/competition/command-adapter.d.ts +35 -0
  148. package/dist/competition/command-adapter.js +20 -0
  149. package/dist/competition/core.d.ts +41 -0
  150. package/dist/competition/core.js +181 -0
  151. package/dist/competition/shared/extra-context.d.ts +14 -0
  152. package/dist/competition/shared/extra-context.js +100 -0
  153. package/dist/competition/shared/preflight.d.ts +11 -0
  154. package/dist/competition/shared/preflight.js +39 -0
  155. package/dist/competition/shared/prompt-helpers.d.ts +16 -0
  156. package/dist/competition/shared/prompt-helpers.js +27 -0
  157. package/dist/competition/shared/prune.d.ts +1 -0
  158. package/dist/competition/shared/prune.js +4 -0
  159. package/dist/competition/shared/sandbox-policy.d.ts +9 -0
  160. package/dist/competition/shared/sandbox-policy.js +7 -0
  161. package/dist/competition/shared/teardown.d.ts +36 -0
  162. package/dist/competition/shared/teardown.js +101 -0
  163. package/dist/configs/agents/defaults.d.ts +31 -2
  164. package/dist/configs/agents/defaults.js +346 -30
  165. package/dist/configs/agents/errors.js +14 -11
  166. package/dist/configs/agents/loader.d.ts +11 -1
  167. package/dist/configs/agents/loader.js +71 -4
  168. package/dist/configs/agents/types.js +2 -2
  169. package/dist/configs/environment/detect.js +9 -4
  170. package/dist/configs/environment/errors.js +4 -4
  171. package/dist/configs/environment/loader.d.ts +1 -1
  172. package/dist/configs/environment/loader.js +3 -3
  173. package/dist/configs/orchestration/bootstrap.d.ts +16 -0
  174. package/dist/configs/orchestration/bootstrap.js +122 -0
  175. package/dist/configs/orchestration/errors.d.ts +15 -0
  176. package/dist/configs/orchestration/errors.js +28 -0
  177. package/dist/configs/orchestration/loader.d.ts +9 -0
  178. package/dist/configs/orchestration/loader.js +148 -0
  179. package/dist/configs/orchestration/types.d.ts +102 -0
  180. package/dist/configs/orchestration/types.js +65 -0
  181. package/dist/configs/sandbox/defaults.js +14 -4
  182. package/dist/configs/sandbox/errors.d.ts +1 -1
  183. package/dist/configs/sandbox/errors.js +1 -1
  184. package/dist/configs/sandbox/loader.js +6 -4
  185. package/dist/configs/sandbox/schemas.js +4 -2
  186. package/dist/configs/settings/loader.d.ts +7 -0
  187. package/dist/configs/settings/loader.js +81 -0
  188. package/dist/configs/settings/types.d.ts +47 -0
  189. package/dist/configs/settings/types.js +23 -0
  190. package/dist/configs/verification/errors.d.ts +11 -0
  191. package/dist/configs/verification/errors.js +21 -0
  192. package/dist/configs/verification/loader.d.ts +8 -0
  193. package/dist/configs/verification/loader.js +43 -0
  194. package/dist/configs/verification/methods.d.ts +35 -0
  195. package/dist/configs/verification/methods.js +41 -0
  196. package/dist/configs/verification/programmatic-defaults.d.ts +10 -0
  197. package/dist/configs/verification/programmatic-defaults.js +42 -0
  198. package/dist/configs/verification/programmatic-detect.d.ts +10 -0
  199. package/dist/configs/{evals/detect.js → verification/programmatic-detect.js} +22 -33
  200. package/dist/configs/verification/types.d.ts +49 -0
  201. package/dist/configs/verification/types.js +45 -0
  202. package/dist/contracts/list.d.ts +207 -0
  203. package/dist/contracts/list.js +154 -0
  204. package/dist/domain/interactive/model/types.d.ts +104 -0
  205. package/dist/domain/interactive/model/types.js +83 -0
  206. package/dist/domain/interactive/persistence/adapter.d.ts +39 -0
  207. package/dist/domain/interactive/persistence/adapter.js +144 -0
  208. package/dist/domain/interactive/prompt.d.ts +3 -0
  209. package/dist/domain/interactive/prompt.js +7 -0
  210. package/dist/domain/message/competition/adapter.d.ts +36 -0
  211. package/dist/domain/message/competition/adapter.js +197 -0
  212. package/dist/domain/message/competition/prompt.d.ts +8 -0
  213. package/dist/domain/message/competition/prompt.js +29 -0
  214. package/dist/domain/message/model/mutators.d.ts +17 -0
  215. package/dist/domain/message/model/mutators.js +107 -0
  216. package/dist/domain/message/model/types.d.ts +100 -0
  217. package/dist/domain/message/model/types.js +87 -0
  218. package/dist/domain/message/persistence/adapter.d.ts +43 -0
  219. package/dist/domain/message/persistence/adapter.js +124 -0
  220. package/dist/domain/reduce/competition/adapter.d.ts +42 -0
  221. package/dist/domain/reduce/competition/adapter.js +826 -0
  222. package/dist/domain/reduce/competition/output-validation.d.ts +4 -0
  223. package/dist/domain/reduce/competition/output-validation.js +18 -0
  224. package/dist/domain/reduce/competition/prompt.d.ts +10 -0
  225. package/dist/domain/reduce/competition/prompt.js +96 -0
  226. package/dist/domain/reduce/competition/reduction.d.ts +9 -0
  227. package/dist/domain/reduce/competition/reduction.js +32 -0
  228. package/dist/domain/reduce/model/types.d.ts +122 -0
  229. package/dist/domain/reduce/model/types.js +84 -0
  230. package/dist/domain/reduce/persistence/adapter.d.ts +43 -0
  231. package/dist/domain/reduce/persistence/adapter.js +126 -0
  232. package/dist/domain/run/competition/adapter.d.ts +30 -0
  233. package/dist/domain/run/competition/adapter.js +39 -0
  234. package/dist/domain/run/competition/agent-execution.d.ts +20 -0
  235. package/dist/domain/run/competition/agent-execution.js +45 -0
  236. package/dist/domain/run/competition/agent-preparation.d.ts +12 -0
  237. package/dist/domain/run/competition/agent-preparation.js +24 -0
  238. package/dist/domain/run/competition/agents/artifacts.d.ts +17 -0
  239. package/dist/domain/run/competition/agents/artifacts.js +173 -0
  240. package/dist/{commands/run → domain/run/competition}/agents/lifecycle.d.ts +3 -3
  241. package/dist/{commands/run → domain/run/competition}/agents/lifecycle.js +84 -64
  242. package/dist/domain/run/competition/agents/post-processing.d.ts +12 -0
  243. package/dist/domain/run/competition/agents/post-processing.js +4 -0
  244. package/dist/domain/run/competition/agents/preparation.js +64 -0
  245. package/dist/{commands/run → domain/run/competition}/agents/run-context.d.ts +9 -16
  246. package/dist/{commands/run → domain/run/competition}/agents/run-context.js +22 -70
  247. package/dist/{commands/run → domain/run/competition}/agents/types.d.ts +10 -13
  248. package/dist/domain/run/competition/agents/workspace.d.ts +21 -0
  249. package/dist/domain/run/competition/agents/workspace.js +47 -0
  250. package/dist/{commands/run → domain/run/competition}/errors.d.ts +8 -1
  251. package/dist/{commands/run → domain/run/competition}/errors.js +39 -9
  252. package/dist/{commands/run → domain/run/competition}/phases.d.ts +1 -2
  253. package/dist/domain/run/competition/phases.js +1 -0
  254. package/dist/domain/run/competition/prompt.d.ts +7 -0
  255. package/dist/domain/run/competition/prompt.js +27 -0
  256. package/dist/{commands/run → domain/run/competition}/reports.d.ts +5 -3
  257. package/dist/{commands/run → domain/run/competition}/reports.js +7 -19
  258. package/dist/domain/run/competition/termination-state.d.ts +4 -0
  259. package/dist/domain/run/competition/termination-state.js +12 -0
  260. package/dist/{records → domain/run/model}/enhanced.d.ts +6 -7
  261. package/dist/{records → domain/run/model}/enhanced.js +11 -11
  262. package/dist/{records → domain/run/model}/errors.d.ts +1 -1
  263. package/dist/{records → domain/run/model}/errors.js +5 -5
  264. package/dist/{records → domain/run/model}/mutators.d.ts +4 -3
  265. package/dist/{records → domain/run/model}/mutators.js +58 -36
  266. package/dist/domain/run/model/types.d.ts +376 -0
  267. package/dist/domain/run/model/types.js +192 -0
  268. package/dist/{records/persistence.d.ts → domain/run/persistence/adapter.d.ts} +9 -3
  269. package/dist/domain/run/persistence/adapter.js +340 -0
  270. package/dist/domain/run/persistence/error-mapping.d.ts +2 -0
  271. package/dist/domain/run/persistence/error-mapping.js +17 -0
  272. package/dist/domain/shared/lifecycle.d.ts +54 -0
  273. package/dist/domain/shared/lifecycle.js +165 -0
  274. package/dist/domain/shared/token-usage.d.ts +21 -0
  275. package/dist/domain/shared/token-usage.js +38 -0
  276. package/dist/domain/spec/competition/adapter.d.ts +31 -0
  277. package/dist/domain/spec/competition/adapter.js +196 -0
  278. package/dist/domain/spec/competition/prompt.d.ts +11 -0
  279. package/dist/domain/spec/competition/prompt.js +44 -0
  280. package/dist/domain/spec/model/output.d.ts +13 -0
  281. package/dist/domain/spec/model/output.js +36 -0
  282. package/dist/domain/spec/model/types.d.ts +98 -0
  283. package/dist/domain/spec/model/types.js +84 -0
  284. package/dist/domain/spec/persistence/adapter.d.ts +51 -0
  285. package/dist/domain/spec/persistence/adapter.js +140 -0
  286. package/dist/domain/verify/blinding/aliases.d.ts +7 -0
  287. package/dist/domain/verify/blinding/aliases.js +23 -0
  288. package/dist/domain/verify/competition/adapter.d.ts +54 -0
  289. package/dist/domain/verify/competition/adapter.js +444 -0
  290. package/dist/domain/verify/competition/artifacts.d.ts +6 -0
  291. package/dist/domain/verify/competition/artifacts.js +7 -0
  292. package/dist/domain/verify/competition/blinding.d.ts +24 -0
  293. package/dist/domain/verify/competition/blinding.js +109 -0
  294. package/dist/domain/verify/competition/finalize.d.ts +11 -0
  295. package/dist/domain/verify/competition/finalize.js +65 -0
  296. package/dist/domain/verify/competition/programmatic.d.ts +15 -0
  297. package/dist/domain/verify/competition/programmatic.js +352 -0
  298. package/dist/domain/verify/competition/prompt.d.ts +19 -0
  299. package/dist/domain/verify/competition/prompt.js +63 -0
  300. package/dist/domain/verify/competition/rubric.d.ts +23 -0
  301. package/dist/domain/verify/competition/rubric.js +77 -0
  302. package/dist/domain/verify/competition/shared-layout.d.ts +121 -0
  303. package/dist/domain/verify/competition/shared-layout.js +365 -0
  304. package/dist/domain/verify/competition/target.d.ts +47 -0
  305. package/dist/domain/verify/competition/target.js +1 -0
  306. package/dist/domain/verify/model/mutators.d.ts +16 -0
  307. package/dist/domain/verify/model/mutators.js +126 -0
  308. package/dist/domain/verify/model/types.d.ts +408 -0
  309. package/dist/domain/verify/model/types.js +289 -0
  310. package/dist/domain/verify/persistence/adapter.d.ts +43 -0
  311. package/dist/domain/verify/persistence/adapter.js +126 -0
  312. package/dist/domain/verify/programmatic/runner.d.ts +22 -0
  313. package/dist/domain/verify/programmatic/runner.js +209 -0
  314. package/dist/domain/verify/rubric-result.d.ts +28 -0
  315. package/dist/domain/verify/rubric-result.js +121 -0
  316. package/dist/extra-context/contract.d.ts +17 -0
  317. package/dist/extra-context/contract.js +60 -0
  318. package/dist/interactive/index.d.ts +2 -0
  319. package/dist/interactive/index.js +1 -0
  320. package/dist/interactive/providers/launch.d.ts +23 -0
  321. package/dist/interactive/providers/launch.js +203 -0
  322. package/dist/interactive/providers/mcp.d.ts +13 -0
  323. package/dist/interactive/providers/mcp.js +547 -0
  324. package/dist/interactive/providers/shared.d.ts +2 -0
  325. package/dist/interactive/providers/shared.js +1 -0
  326. package/dist/interactive/providers.d.ts +3 -0
  327. package/dist/interactive/providers.js +3 -0
  328. package/dist/interactive/records.d.ts +2 -0
  329. package/dist/interactive/records.js +1 -0
  330. package/dist/interactive/substrate.d.ts +21 -0
  331. package/dist/interactive/substrate.js +522 -0
  332. package/dist/interactive/types.d.ts +101 -0
  333. package/dist/interactive/types.js +1 -0
  334. package/dist/mcp/server.d.ts +88 -0
  335. package/dist/mcp/server.js +790 -0
  336. package/dist/persistence/error-mapping.d.ts +19 -0
  337. package/dist/persistence/error-mapping.js +44 -0
  338. package/dist/persistence/errors.d.ts +26 -0
  339. package/dist/persistence/errors.js +49 -0
  340. package/dist/persistence/extra-context.d.ts +9 -0
  341. package/dist/persistence/extra-context.js +60 -0
  342. package/dist/{records → persistence}/history-lock.js +2 -2
  343. package/dist/persistence/record-path-schema.d.ts +3 -0
  344. package/dist/persistence/record-path-schema.js +16 -0
  345. package/dist/persistence/session-store.d.ts +92 -0
  346. package/dist/persistence/session-store.js +412 -0
  347. package/dist/policy/auto.d.ts +13 -0
  348. package/dist/policy/auto.js +22 -0
  349. package/dist/policy/index.d.ts +5 -0
  350. package/dist/policy/index.js +5 -0
  351. package/dist/policy/resolution.d.ts +6 -0
  352. package/dist/policy/resolution.js +23 -0
  353. package/dist/policy/result.d.ts +53 -0
  354. package/dist/policy/result.js +15 -0
  355. package/dist/policy/selector.d.ts +11 -0
  356. package/dist/policy/selector.js +57 -0
  357. package/dist/policy/verification.d.ts +77 -0
  358. package/dist/policy/verification.js +365 -0
  359. package/dist/policy/verifier-selection.d.ts +13 -0
  360. package/dist/policy/verifier-selection.js +78 -0
  361. package/dist/preflight/branch.d.ts +9 -0
  362. package/dist/preflight/branch.js +48 -0
  363. package/dist/preflight/errors.d.ts +3 -0
  364. package/dist/preflight/errors.js +10 -3
  365. package/dist/preflight/index.d.ts +13 -0
  366. package/dist/preflight/index.js +43 -8
  367. package/dist/render/interactions/confirmation.js +4 -2
  368. package/dist/render/transcripts/apply.js +9 -10
  369. package/dist/render/transcripts/auto.d.ts +27 -0
  370. package/dist/render/transcripts/auto.js +21 -0
  371. package/dist/render/transcripts/init.d.ts +4 -15
  372. package/dist/render/transcripts/init.js +71 -72
  373. package/dist/render/transcripts/list.d.ts +10 -1
  374. package/dist/render/transcripts/list.js +121 -15
  375. package/dist/render/transcripts/message.d.ts +72 -0
  376. package/dist/render/transcripts/message.js +362 -0
  377. package/dist/render/transcripts/prune.d.ts +7 -2
  378. package/dist/render/transcripts/prune.js +64 -17
  379. package/dist/render/transcripts/reduce.d.ts +74 -0
  380. package/dist/render/transcripts/reduce.js +395 -0
  381. package/dist/render/transcripts/root-launcher.d.ts +19 -0
  382. package/dist/render/transcripts/root-launcher.js +40 -0
  383. package/dist/render/transcripts/run.d.ts +35 -6
  384. package/dist/render/transcripts/run.js +241 -165
  385. package/dist/render/transcripts/shared.d.ts +2 -0
  386. package/dist/render/transcripts/shared.js +11 -4
  387. package/dist/render/transcripts/spec.d.ts +74 -0
  388. package/dist/render/transcripts/spec.js +394 -0
  389. package/dist/render/transcripts/stage-progress.d.ts +22 -0
  390. package/dist/render/transcripts/stage-progress.js +6 -0
  391. package/dist/render/transcripts/update-check.d.ts +2 -0
  392. package/dist/render/transcripts/update-check.js +22 -0
  393. package/dist/render/transcripts/verify.d.ts +74 -0
  394. package/dist/render/transcripts/verify.js +409 -0
  395. package/dist/render/utils/agents.d.ts +10 -9
  396. package/dist/render/utils/agents.js +30 -82
  397. package/dist/render/utils/badges.d.ts +3 -20
  398. package/dist/render/utils/badges.js +3 -36
  399. package/dist/render/utils/duration.d.ts +12 -0
  400. package/dist/render/utils/duration.js +37 -0
  401. package/dist/render/utils/interactive-frame.d.ts +6 -0
  402. package/dist/render/utils/interactive-frame.js +38 -0
  403. package/dist/render/utils/records.js +4 -4
  404. package/dist/render/utils/runs.d.ts +3 -9
  405. package/dist/render/utils/runs.js +16 -48
  406. package/dist/render/utils/stage-output.d.ts +20 -0
  407. package/dist/render/utils/stage-output.js +44 -0
  408. package/dist/render/utils/timezone.d.ts +2 -0
  409. package/dist/render/utils/timezone.js +42 -0
  410. package/dist/render/utils/transcript-shell.d.ts +66 -0
  411. package/dist/render/utils/transcript-shell.js +155 -0
  412. package/dist/render/utils/transcript.d.ts +7 -1
  413. package/dist/render/utils/transcript.js +12 -2
  414. package/dist/render/utils/wrap.d.ts +1 -0
  415. package/dist/render/utils/wrap.js +20 -0
  416. package/dist/status/colors.d.ts +2 -3
  417. package/dist/status/colors.js +3 -3
  418. package/dist/status/index.d.ts +108 -8
  419. package/dist/status/index.js +164 -5
  420. package/dist/update-check/checker.d.ts +24 -0
  421. package/dist/update-check/checker.js +130 -0
  422. package/dist/update-check/prompt.d.ts +25 -0
  423. package/dist/update-check/prompt.js +62 -0
  424. package/dist/update-check/semver.d.ts +17 -0
  425. package/dist/update-check/semver.js +36 -0
  426. package/dist/update-check/state-path.d.ts +8 -0
  427. package/dist/update-check/state-path.js +18 -0
  428. package/dist/utils/binaries.js +14 -8
  429. package/dist/utils/errors.d.ts +3 -1
  430. package/dist/utils/errors.js +3 -1
  431. package/dist/utils/git.d.ts +10 -0
  432. package/dist/utils/git.js +15 -3
  433. package/dist/utils/output.d.ts +5 -1
  434. package/dist/utils/output.js +4 -2
  435. package/dist/utils/process.d.ts +2 -1
  436. package/dist/utils/process.js +7 -3
  437. package/dist/utils/session-id.d.ts +1 -0
  438. package/dist/utils/session-id.js +22 -0
  439. package/dist/utils/slug.d.ts +2 -0
  440. package/dist/utils/slug.js +15 -0
  441. package/dist/utils/voratiq-cli-target.d.ts +9 -0
  442. package/dist/utils/voratiq-cli-target.js +58 -0
  443. package/dist/workspace/agents.d.ts +13 -16
  444. package/dist/workspace/agents.js +22 -147
  445. package/dist/workspace/chat/artifacts.d.ts +9 -0
  446. package/dist/workspace/chat/artifacts.js +82 -12
  447. package/dist/workspace/chat/native-usage.d.ts +13 -0
  448. package/dist/workspace/chat/native-usage.js +60 -0
  449. package/dist/workspace/chat/sources.d.ts +9 -5
  450. package/dist/workspace/chat/sources.js +89 -23
  451. package/dist/workspace/chat/token-usage-result.d.ts +23 -0
  452. package/dist/workspace/chat/token-usage-result.js +7 -0
  453. package/dist/workspace/chat/usage-extractor.d.ts +30 -0
  454. package/dist/workspace/chat/usage-extractor.js +461 -0
  455. package/dist/workspace/chat/usage-mappings.d.ts +20 -0
  456. package/dist/workspace/chat/usage-mappings.js +136 -0
  457. package/dist/workspace/credential-guard.js +1 -1
  458. package/dist/workspace/dependencies.js +4 -4
  459. package/dist/workspace/errors.d.ts +5 -0
  460. package/dist/workspace/errors.js +13 -3
  461. package/dist/workspace/layout.d.ts +17 -6
  462. package/dist/workspace/layout.js +51 -32
  463. package/dist/workspace/promotion.d.ts +32 -0
  464. package/dist/workspace/promotion.js +34 -0
  465. package/dist/workspace/prune.d.ts +1 -1
  466. package/dist/workspace/run.d.ts +1 -3
  467. package/dist/workspace/run.js +6 -15
  468. package/dist/workspace/setup.d.ts +8 -0
  469. package/dist/workspace/setup.js +359 -56
  470. package/dist/workspace/shim.js +1 -1
  471. package/dist/workspace/structure.d.ts +91 -26
  472. package/dist/workspace/structure.js +227 -43
  473. package/dist/workspace/templates.d.ts +9 -3
  474. package/dist/workspace/templates.js +26 -15
  475. package/dist/workspace/verification-defaults.d.ts +12 -0
  476. package/dist/workspace/verification-defaults.js +1017 -0
  477. package/package.json +30 -24
  478. package/dist/cli/review.d.ts +0 -12
  479. package/dist/cli/review.js +0 -33
  480. package/dist/commands/errors.d.ts +0 -4
  481. package/dist/commands/errors.js +0 -7
  482. package/dist/commands/init/evals.d.ts +0 -4
  483. package/dist/commands/init/evals.js +0 -219
  484. package/dist/commands/review/command.d.ts +0 -10
  485. package/dist/commands/review/command.js +0 -26
  486. package/dist/commands/run/agent-execution.d.ts +0 -19
  487. package/dist/commands/run/agent-execution.js +0 -63
  488. package/dist/commands/run/agents/auth-stage.d.ts +0 -23
  489. package/dist/commands/run/agents/auth-stage.js +0 -108
  490. package/dist/commands/run/agents/chat-preserver.d.ts +0 -9
  491. package/dist/commands/run/agents/chat-preserver.js +0 -35
  492. package/dist/commands/run/agents/eval-runner.d.ts +0 -19
  493. package/dist/commands/run/agents/eval-runner.js +0 -27
  494. package/dist/commands/run/agents/failures.js +0 -32
  495. package/dist/commands/run/agents/preparation.js +0 -123
  496. package/dist/commands/run/agents.d.ts +0 -14
  497. package/dist/commands/run/agents.js +0 -47
  498. package/dist/commands/run/prompts.d.ts +0 -4
  499. package/dist/commands/run/prompts.js +0 -16
  500. package/dist/commands/run/sandbox-registry.d.ts +0 -4
  501. package/dist/commands/run/sandbox-registry.js +0 -54
  502. package/dist/configs/evals/defaults.d.ts +0 -8
  503. package/dist/configs/evals/defaults.js +0 -28
  504. package/dist/configs/evals/detect.d.ts +0 -10
  505. package/dist/configs/evals/errors.d.ts +0 -16
  506. package/dist/configs/evals/errors.js +0 -29
  507. package/dist/configs/evals/loader.d.ts +0 -9
  508. package/dist/configs/evals/loader.js +0 -46
  509. package/dist/configs/evals/types.d.ts +0 -42
  510. package/dist/configs/evals/types.js +0 -74
  511. package/dist/evals/runner.d.ts +0 -16
  512. package/dist/evals/runner.js +0 -132
  513. package/dist/records/persistence.js +0 -469
  514. package/dist/records/types.d.ts +0 -255
  515. package/dist/records/types.js +0 -160
  516. package/dist/render/transcripts/review.d.ts +0 -2
  517. package/dist/render/transcripts/review.js +0 -36
  518. /package/dist/{commands/run → agents/runtime}/shim/agent-manifest.d.ts +0 -0
  519. /package/dist/{commands/run → agents/runtime}/shim/agent-manifest.js +0 -0
  520. /package/dist/{commands/run → agents/runtime/shim}/argv.d.ts +0 -0
  521. /package/dist/{commands/run → agents/runtime/shim}/argv.js +0 -0
  522. /package/dist/{commands/run/agents → agents/runtime}/types.js +0 -0
  523. /package/dist/{commands/run → domain/run/competition}/agents/preparation.d.ts +0 -0
  524. /package/dist/{commands/run/phases.js → domain/run/competition/agents/types.js} +0 -0
  525. /package/dist/{commands/run → domain/run/model}/id.d.ts +0 -0
  526. /package/dist/{commands/run → domain/run/model}/id.js +0 -0
  527. /package/dist/{records → persistence}/history-lock.d.ts +0 -0
@@ -0,0 +1,1017 @@
1
+ import { normalizeProgrammaticCommand } from "../configs/verification/methods.js";
2
+ import { listDetectedProgrammaticDefaults } from "../configs/verification/programmatic-defaults.js";
3
+ import { detectProgrammaticSuggestions } from "../configs/verification/programmatic-detect.js";
4
+ const DEFAULT_SPEC_RUBRIC = [{ template: "spec-verification" }]; // Rubric entries emitted under the "spec" stage of the default config.
5
+ const DEFAULT_RUN_RUBRIC = [{ template: "run-verification" }]; // Rubric entries emitted under the "run" stage (iterated by appendRunStage).
6
+ const DEFAULT_REDUCE_RUBRIC = [{ template: "reduce-verification" }]; // Rubric entries emitted under the "reduce" stage of the default config.
7
+ const DEFAULT_MESSAGE_RUBRIC = [{ template: "message-verification" }]; // Rubric entries emitted under the "message" stage of the default config.
8
/**
 * Render the default verification config as YAML text.
 *
 * Programmatic suggestions are detected from `params.root` and
 * `params.environment`; each detected default whose command normalizes to a
 * truthy value becomes a `slug: command` entry of the `run` stage. The stages
 * are written in the fixed order spec, run, reduce, message, separated by one
 * blank line, and the text ends with a single trailing newline.
 *
 * @param {{ root: unknown, environment: unknown }} params Values forwarded
 *   unchanged to `detectProgrammaticSuggestions`.
 * @returns {Promise<string>} The YAML document.
 */
export async function buildDefaultVerificationConfigYaml(params) {
  const detected = await detectProgrammaticSuggestions(params.root, params.environment);

  // Keep only the defaults whose command survives normalization.
  const programmaticEntries = [];
  listDetectedProgrammaticDefaults(detected).forEach((candidate) => {
    const normalized = normalizeProgrammaticCommand(candidate.command);
    if (normalized) {
      programmaticEntries.push({ slug: candidate.slug, command: normalized });
    }
  });

  const yamlLines = [];

  appendRubricStage(yamlLines, "spec", DEFAULT_SPEC_RUBRIC);
  yamlLines.push("");

  appendRunStage(yamlLines, programmaticEntries);
  yamlLines.push("");

  appendRubricStage(yamlLines, "reduce", DEFAULT_REDUCE_RUBRIC);
  yamlLines.push("");

  appendRubricStage(yamlLines, "message", DEFAULT_MESSAGE_RUBRIC);

  return yamlLines.join("\n") + "\n";
}
24
/**
 * Append the `run:` stage of the default verification config to `lines`.
 *
 * When programmatic defaults exist they are written as a `programmatic:`
 * mapping (one `slug: "command"` entry each, followed by a blank line); the
 * stage always ends with the default run rubric list.
 *
 * @param {string[]} lines Output buffer; mutated in place.
 * @param {{ slug: string, command: string }[]} runProgrammaticDefaults
 *   Entries to list under `programmatic:`; may be empty.
 * @returns {void}
 */
function appendRunStage(lines, runProgrammaticDefaults) {
  lines.push("run:");
  if (runProgrammaticDefaults.length > 0) {
    lines.push(" programmatic:");
    for (const { slug, command } of runProgrammaticDefaults) {
      // Entries must be indented deeper than the `programmatic:` key itself;
      // at the same indent YAML reads them as siblings of `programmatic`
      // (which then parses as null) instead of as its children.
      // JSON.stringify yields a double-quoted scalar with escapes applied.
      lines.push(`   ${slug}: ${JSON.stringify(command)}`);
    }
    lines.push("");
  }
  lines.push(" rubric:");
  for (const { template } of DEFAULT_RUN_RUBRIC) {
    lines.push(` - template: ${template}`);
  }
}
38
/**
 * Append a rubric-only stage to `lines`: a `<stage>:` header, a `rubric:`
 * key, and one `- template: <name>` list item per rubric entry.
 *
 * @param {string[]} lines Output buffer; mutated in place.
 * @param {string} stage Stage name used as the top-level key.
 * @param {{ template: string }[]} rubric Rubric entries to list.
 * @returns {void}
 */
function appendRubricStage(lines, stage, rubric) {
  lines.push(`${stage}:`, " rubric:");
  for (const { template } of rubric) {
    lines.push(` - template: ${template}`);
  }
}
45
+ export const SHIPPED_VERIFICATION_TEMPLATES = [
46
+ {
47
+ name: "spec-verification",
48
+ prompt: `You are a blinded verifier agent reviewing multiple spec drafts for the same task and choosing the single best draft to execute.
49
+
50
+ Method boundary:
51
+
52
+ - produce per-draft assessments and a final ranking in one structured output
53
+
54
+ Inputs:
55
+
56
+ - the original task description
57
+ - the full blinded draft set
58
+ - any shared context needed to understand the intended outcome
59
+
60
+ Expected working style:
61
+
62
+ 1. Read the original task description first.
63
+ 2. Derive the key contract items the draft must preserve and use stable ids such as \`C1\`, \`C2\`, \`C3\`.
64
+ 3. Inspect the draft set directly.
65
+ 4. Assess each draft against the verification rubric.
66
+ 5. Record per-draft contract coverage, draft readiness, recommendation posture, and bounded follow-up work.
67
+ 6. Derive a strict best-to-worst ranking from those assessments.
68
+ 7. Make the ranking strict, complete, and tie-free across the full draft set.
69
+ 8. Set \`preferred\` equal to \`ranking[0]\`.
70
+
71
+ Judgment discipline:
72
+
73
+ - focus on whether the draft preserves the requested task and acceptance bar, not whether it sounds polished
74
+ - make claims only when you can point to concrete draft text or source-task evidence
75
+ - call out hidden assumptions, ambiguous boundaries, and missing execution contracts explicitly
76
+ - treat decomposition as a quality dimension, not a mandatory outcome; a draft can be strong if it stays atomic for the right task
77
+ - include lightweight \`evidence_refs\` for each draft assessment
78
+ - keep \`comparison\` focused on cross-draft tradeoffs such as task fit, decomposition quality, and execute-now readiness
79
+ - make \`comparison\` explain why \`ranking[0]\` beat \`ranking[1]\`, not just why lower-ranked drafts lost
80
+ - include \`next_actions\` only for the selected draft path
81
+
82
+ Expected output shape:
83
+
84
+ - \`assessments[]\` with one entry per draft
85
+ - top-level \`preferred\` naming the selected draft
86
+ - each assessment should include:
87
+ - \`draft\`
88
+ - \`completion_status\`
89
+ - \`recommendation_level\`
90
+ - \`quality\`
91
+ - \`evaluation\`
92
+ - \`contract_coverage\`
93
+ - \`implementation_notes\`
94
+ - \`follow_up\`
95
+ - \`evidence_refs\`
96
+ - top-level \`comparison\` should capture cross-draft tradeoffs
97
+ - top-level \`ranking\` must be strict, complete, and tie-free
98
+ - top-level \`rationale\` should explain why \`preferred\` / \`ranking[0]\` is the best execution contract
99
+ - top-level \`next_actions\` should stay short and operational
100
+ `,
101
+ rubric: `# Spec Review
102
+
103
+ Review the draft set by assessing each draft on:
104
+
105
+ - task fidelity
106
+ - boundary control
107
+ - acceptance contract
108
+ - decomposition
109
+ - execution readiness
110
+ - uncertainty handling
111
+
112
+ Then derive a final ranking from those assessments.
113
+
114
+ ## Task Fidelity
115
+
116
+ Ask:
117
+
118
+ - Does the draft preserve the actual requested outcome?
119
+ - Does it stay aligned to the originating task rather than drifting into adjacent cleanup or architecture work?
120
+ - Are important terms and goals concrete enough to execute against?
121
+
122
+ Task fidelity should dominate stylistic polish.
123
+
124
+ Every draft assessment should include explicit \`contract_coverage\` entries so the ranking is traceable back to the originating task rather than inferred from vague quality labels.
125
+
126
+ ## Boundary Control
127
+
128
+ Ask:
129
+
130
+ - Is in-scope versus out-of-scope legible?
131
+ - Does the draft constrain likely overreach paths?
132
+ - Are constraints and non-goals clear enough to keep downstream execution bounded?
133
+
134
+ ## Acceptance Contract
135
+
136
+ Ask:
137
+
138
+ - Does the draft define what done looks like?
139
+ - Are success conditions checkable rather than aspirational?
140
+ - Does it identify the artifacts, behaviors, or tests that should prove completion?
141
+
142
+ ## Decomposition
143
+
144
+ Ask:
145
+
146
+ - Does the draft break the work down only when decomposition helps execution?
147
+ - If decomposition is present, are the parts coherent, ordered, and complete enough to act on?
148
+ - If decomposition is absent, is the task still executable as one bounded unit?
149
+
150
+ Strong decomposition can mean either a good phased breakdown or a disciplined choice to keep the task atomic.
151
+
152
+ ## Execution Readiness
153
+
154
+ Ask:
155
+
156
+ - Could a run agent plausibly execute this draft without major guesswork?
157
+ - Does the draft expose concrete contracts for CLI flags, persistence, artifacts, or user-facing behavior when they matter?
158
+ - Is the implementation path specific enough to reduce downstream ambiguity?
159
+
160
+ ## Uncertainty Handling
161
+
162
+ Ask:
163
+
164
+ - Does the draft surface key assumptions and dependencies?
165
+ - Does it expose meaningful uncertainty instead of hiding it?
166
+ - Are unresolved questions bounded and explicit?
167
+
168
+ ## Draft posture
169
+
170
+ Each draft assessment should also name:
171
+
172
+ - \`completion_status\`: \`ready\`, \`ready_with_gap\`, \`ready_with_gaps\`, \`incomplete\`, or \`not_verifiable\`
173
+ - \`recommendation_level\`: \`execute_now\`, \`strong_foundation\`, or \`not_recommended\`
174
+ - \`quality\`: \`high\`, \`medium\`, or \`low\`
175
+
176
+ These fields preserve the practical decision posture a spec selector needs:
177
+
178
+ - \`execute_now\` means the draft is fit to drive execution without reopening major contract questions
179
+ - \`strong_foundation\` means the draft is directionally strong but still needs bounded tightening
180
+ - \`not_recommended\` means the draft should not win the stage
181
+
182
+ Keep descriptive task typing out of this rubric. If you need normalized labels like \`intent\`, \`scope\`, \`stack\`, or \`difficulty\`, use the separate \`spec-type\` rubric.
183
+
184
+ ## Ranking rule
185
+
186
+ The final ranking should follow from the draft assessments above.
187
+
188
+ It should answer:
189
+
190
+ - which draft should win?
191
+ - which ordering best reflects execution trustworthiness?
192
+
193
+ It should not ignore the structured per-draft assessments.
194
+ It must rank the full eligible draft set with no ties.
195
+ Set \`preferred\` equal to \`ranking[0]\`.
196
+
197
+ The verification artifact should also include:
198
+
199
+ - \`comparison\`: cross-draft tradeoffs, explicitly including why \`ranking[0]\` beat \`ranking[1]\`
200
+ - \`rationale\`: why \`preferred\` / \`ranking[0]\` is the best choice
201
+ - \`next_actions\`: short, operational follow-up for the selected draft path
202
+ `,
203
+ schema: `type: object
204
+ required:
205
+ - assessments
206
+ - preferred
207
+ - comparison
208
+ - ranking
209
+ - rationale
210
+ - next_actions
211
+ properties:
212
+ assessments:
213
+ type: array
214
+ items:
215
+ type: object
216
+ required:
217
+ - draft
218
+ - completion_status
219
+ - recommendation_level
220
+ - quality
221
+ - evaluation
222
+ - contract_coverage
223
+ - implementation_notes
224
+ - follow_up
225
+ - evidence_refs
226
+ properties:
227
+ draft:
228
+ type: string
229
+ completion_status:
230
+ type: string
231
+ enum: ["ready", "ready_with_gap", "ready_with_gaps", "incomplete", "not_verifiable"]
232
+ recommendation_level:
233
+ type: string
234
+ enum: ["execute_now", "strong_foundation", "not_recommended"]
235
+ quality:
236
+ type: string
237
+ enum: ["high", "medium", "low"]
238
+ evaluation:
239
+ type: object
240
+ required:
241
+ - task_fidelity
242
+ - boundary_control
243
+ - acceptance_contract
244
+ - decomposition
245
+ - execution_readiness
246
+ - uncertainty_handling
247
+ properties:
248
+ task_fidelity:
249
+ type: string
250
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
251
+ boundary_control:
252
+ type: string
253
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
254
+ acceptance_contract:
255
+ type: string
256
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
257
+ decomposition:
258
+ type: string
259
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
260
+ execution_readiness:
261
+ type: string
262
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
263
+ uncertainty_handling:
264
+ type: string
265
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
266
+ contract_coverage:
267
+ type: array
268
+ items:
269
+ type: object
270
+ required:
271
+ - contract_item
272
+ - status
273
+ - note
274
+ - evidence_refs
275
+ properties:
276
+ contract_item:
277
+ type: string
278
+ status:
279
+ type: string
280
+ enum: ["met", "partial", "not_met", "not_verifiable"]
281
+ note:
282
+ type: string
283
+ evidence_refs:
284
+ type: array
285
+ items:
286
+ type: string
287
+ implementation_notes:
288
+ type: string
289
+ follow_up:
290
+ type: array
291
+ items:
292
+ type: string
293
+ evidence_refs:
294
+ type: array
295
+ items:
296
+ type: string
297
+ comparison:
298
+ type: string
299
+ preferred:
300
+ type: string
301
+ ranking:
302
+ type: array
303
+ items:
304
+ type: string
305
+ rationale:
306
+ type: string
307
+ next_actions:
308
+ type: array
309
+ items:
310
+ type: string
311
+ `,
312
+ },
313
+ {
314
+ name: "run-verification",
315
+ prompt: `You are a blinded verifier agent reviewing multiple run candidates for the same selected spec and choosing the single best candidate to apply.
316
+
317
+ Inputs:
318
+
319
+ - the selected spec
320
+ - the full blinded candidate set
321
+ - candidate diffs and supporting artifacts
322
+ - any shared run artifacts needed to understand the task context
323
+
324
+ Expected working style:
325
+
326
+ 1. Read the spec first.
327
+ 2. Derive the key requirements the run had to satisfy and use stable ids such as \`R1\`, \`R2\`, \`R3\`.
328
+ 3. Inspect the candidate set directly.
329
+ 4. Assess each candidate against the verification rubric.
330
+ 5. Record per-candidate requirement coverage, completion posture, recommendation posture, and bounded follow-up work.
331
+ 6. Derive a strict best-to-worst ranking from those assessments.
332
+ 7. Make the ranking strict, complete, and tie-free across the full eligible candidate set.
333
+ 8. Set \`preferred\` equal to \`ranking[0]\`.
334
+
335
+ Judgment discipline:
336
+
337
+ - make claims only when you can point to evidence from candidate diffs or staged files
338
+ - if you cannot verify something, say so explicitly
339
+ - distinguish cleanup issues from correctness or apply-risk issues
340
+ - focus on whether the candidate actually solved the asked task, not whether it merely looks plausible
341
+ - focus on bounded, decision-relevant follow-up work
342
+ - include lightweight \`evidence_refs\` for each candidate assessment
343
+ - keep \`comparison\` focused on cross-candidate tradeoffs such as scope adherence, approach quality, and apply-now cleanliness
344
+ - make \`comparison\` explain why \`ranking[0]\` beat \`ranking[1]\`, not just why lower-ranked candidates lost
345
+ - include \`next_actions\` only for the selected path
346
+
347
+ Expected output shape:
348
+
349
+ - \`assessments[]\` with one entry per candidate
350
+ - top-level \`preferred\` naming the selected candidate
351
+ - each assessment should include:
352
+ - \`candidate\`
353
+ - \`completion_status\`
354
+ - \`recommendation_level\`
355
+ - \`quality\`
356
+ - \`evaluation\`
357
+ - \`requirements_coverage\`
358
+ - \`implementation_notes\`
359
+ - \`follow_up\`
360
+ - \`evidence_refs\`
361
+ - top-level \`comparison\` should capture cross-candidate tradeoffs
362
+ - top-level \`ranking\` must be strict, complete, and tie-free
363
+ - top-level \`rationale\` should explain why \`preferred\` / \`ranking[0]\` is the best apply choice
364
+ - top-level \`next_actions\` should stay short and operational
365
+ `,
366
+ rubric: `# Run Review
367
+
368
+ Review the candidate set by assessing each candidate on:
369
+
370
+ - spec adherence
371
+ - approach
372
+ - codebase fit
373
+ - apply risk
374
+ - evidence
375
+
376
+ Then derive a final ranking from those assessments.
377
+
378
+ ## Spec Adherence
379
+
380
+ Ask:
381
+
382
+ - Does the candidate satisfy the selected spec?
383
+ - Are key requirements clearly met, partially met, not met, or not verifiable?
384
+ - Are there obvious mismatches between the changed artifacts and the intended outcome?
385
+
386
+ Spec adherence should dominate elegance or cleanup concerns.
387
+
388
+ Every candidate assessment should include explicit \`requirements_coverage\` entries so the ranking is traceable back to the asked task rather than inferred from generic quality labels.
389
+
390
+ ## Approach
391
+
392
+ Ask:
393
+
394
+ - Does the candidate take the right approach to the task, not just produce a superficially acceptable output?
395
+ - Does it avoid scope drift, indirect fixes, or restructuring the task did not ask for?
396
+ - Does it create a strong enough foundation without introducing unnecessary complexity?
397
+
398
+ This is where verification should capture the gap between "passed checks" and "is actually the change we would keep."
399
+
400
+ ## Codebase Fit
401
+
402
+ Ask:
403
+
404
+ - Does the implementation fit existing patterns, interfaces, and boundaries?
405
+ - Does it look like a coherent extension of the codebase rather than an alien insertion?
406
+ - Are migrations, rollbacks, or integration seams well-bounded?
407
+
408
+ ## Apply Risk
409
+
410
+ Ask:
411
+
412
+ - What is the likely blast radius of applying this candidate?
413
+ - Are there hidden regressions, ambiguous behaviors, or fragile assumptions?
414
+ - Are any missing steps or follow-ups bounded and low-risk, or do they open up unbounded uncertainty?
415
+
416
+ ## Evidence
417
+
418
+ Ask:
419
+
420
+ - Are important claims supported by concrete artifacts?
421
+ - Does the candidate leave meaningful uncertainty unresolved?
422
+
423
+ Evidence here means direct artifact evidence for the candidate itself:
424
+
425
+ - diffs
426
+ - changed files
427
+ - summaries when present
428
+ - cited files and line ranges
429
+
430
+ ## Candidate posture
431
+
432
+ Each candidate assessment should also name:
433
+
434
+ - \`completion_status\`: \`complete\`, \`complete_with_gap\`, \`complete_with_gaps\`, \`incomplete\`, or \`not_verifiable\`
435
+ - \`recommendation_level\`: \`apply_now\`, \`strong_foundation\`, or \`not_recommended\`
436
+ - \`quality\`: \`high\`, \`medium\`, or \`low\`
437
+
438
+ These fields preserve the practical decision posture the current verification artifact captures:
439
+
440
+ - \`apply_now\` means the candidate is fit to apply without reopening major questions
441
+ - \`strong_foundation\` means the candidate is strong but still needs bounded follow-up before it is the cleanest apply choice
442
+ - \`not_recommended\` means the candidate should not win the run
443
+
444
+ ## Ranking rule
445
+
446
+ The final ranking should follow from the candidate assessments above.
447
+
448
+ It should answer:
449
+
450
+ - which candidate should win?
451
+ - which ordering best reflects apply trustworthiness?
452
+
453
+ It should not ignore the structured per-candidate assessments.
454
+ It must rank the full eligible candidate set with no ties.
455
+ Set \`preferred\` equal to \`ranking[0]\`.
456
+
457
+ The verification artifact should also include:
458
+
459
+ - \`comparison\`: cross-candidate tradeoffs, explicitly including why \`ranking[0]\` beat \`ranking[1]\`
460
+ - \`rationale\`: why \`preferred\` / \`ranking[0]\` is the best choice
461
+ - \`next_actions\`: short, operational follow-up for the selected path
462
+ `,
463
+ schema: `type: object
464
+ required:
465
+ - assessments
466
+ - preferred
467
+ - comparison
468
+ - ranking
469
+ - rationale
470
+ - next_actions
471
+ properties:
472
+ assessments:
473
+ type: array
474
+ items:
475
+ type: object
476
+ required:
477
+ - candidate
478
+ - completion_status
479
+ - recommendation_level
480
+ - quality
481
+ - evaluation
482
+ - requirements_coverage
483
+ - implementation_notes
484
+ - follow_up
485
+ - evidence_refs
486
+ properties:
487
+ candidate:
488
+ type: string
489
+ completion_status:
490
+ type: string
491
+ enum: ["complete", "complete_with_gap", "complete_with_gaps", "incomplete", "not_verifiable"]
492
+ recommendation_level:
493
+ type: string
494
+ enum: ["apply_now", "strong_foundation", "not_recommended"]
495
+ quality:
496
+ type: string
497
+ enum: ["high", "medium", "low"]
498
+ evaluation:
499
+ type: object
500
+ required:
501
+ - spec_adherence
502
+ - approach
503
+ - codebase_fit
504
+ - apply_risk
505
+ - evidence
506
+ properties:
507
+ spec_adherence:
508
+ type: string
509
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
510
+ approach:
511
+ type: string
512
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
513
+ codebase_fit:
514
+ type: string
515
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
516
+ apply_risk:
517
+ type: string
518
+ enum: ["low", "medium", "high", "unknown"]
519
+ evidence:
520
+ type: string
521
+ enum: ["strong", "acceptable", "weak", "missing"]
522
+ requirements_coverage:
523
+ type: array
524
+ items:
525
+ type: object
526
+ required:
527
+ - requirement
528
+ - status
529
+ - note
530
+ - evidence_refs
531
+ properties:
532
+ requirement:
533
+ type: string
534
+ status:
535
+ type: string
536
+ enum: ["met", "partial", "not_met", "not_verifiable"]
537
+ note:
538
+ type: string
539
+ evidence_refs:
540
+ type: array
541
+ items:
542
+ type: string
543
+ implementation_notes:
544
+ type: string
545
+ follow_up:
546
+ type: array
547
+ items:
548
+ type: string
549
+ evidence_refs:
550
+ type: array
551
+ items:
552
+ type: string
553
+ comparison:
554
+ type: string
555
+ preferred:
556
+ type: string
557
+ ranking:
558
+ type: array
559
+ items:
560
+ type: string
561
+ rationale:
562
+ type: string
563
+ next_actions:
564
+ type: array
565
+ items:
566
+ type: string
567
+ `,
568
+ },
569
+ {
570
+ name: "reduce-verification",
571
+ prompt: `You are performing reduction verification over a blinded set of reduction candidates for one completed target session.
572
+
573
+ Your goal is to decide which reduction is the best carry-forward artifact for later use.
574
+
575
+ Read order:
576
+
577
+ 1. Read the blinded reduction artifacts for all candidates.
578
+ 2. Compare them on fidelity, usefulness, compression quality, and next-step utility.
579
+ 3. Produce one structured result that includes per-candidate assessments, a strict full ranking, and an explicit preferred reduction.
580
+
581
+ What matters:
582
+
583
+ - preserve important facts from the source session
584
+ - remove noise without dropping durable signal
585
+ - surface unresolved uncertainty honestly
586
+ - preserve the decisions, caveats, and next-step guidance a later operator would actually need
587
+ - produce guidance that is actually useful for later \`spec\`, \`run\`, \`reduce\`, or \`verify\` work
588
+
589
+ What does not matter:
590
+
591
+ - prose flourish
592
+ - maximal detail for its own sake
593
+ - ranking a reduction highly just because it is long
594
+ - re-litigating the full session when the reduction should carry the durable outcome forward
595
+
596
+ Do not defer to any one reduction because of agent provenance. Candidates are blinded and should be judged on artifact quality alone.
597
+
598
+ Ranking requirements:
599
+
600
+ - rank the full eligible candidate set
601
+ - do not use ties
602
+ - set \`preferred\` equal to \`ranking[0]\`
603
+ - make \`comparison\` explain why \`ranking[0]\` beat \`ranking[1]\`
604
+
605
+ Expected output shape:
606
+
607
+ - \`assessments[]\` with one entry per candidate reduction
608
+ - each assessment should include:
609
+ - \`candidate\`
610
+ - \`recommendation_level\`
611
+ - \`quality\`
612
+ - \`evaluation\`
613
+ - \`strengths\`
614
+ - \`gaps\`
615
+ - \`evidence_refs\`
616
+ - top-level \`preferred\` naming the selected reduction
617
+ - top-level \`comparison\` explaining why \`ranking[0]\` beat \`ranking[1]\`
618
+ - top-level \`ranking\` must be strict, complete, and tie-free
619
+ - top-level \`rationale\` should explain why \`preferred\` is the best carry-forward artifact
620
+ - top-level \`next_actions\` should stay short and operational
621
+ `,
622
+ rubric: `# Reduce Review Rubric
623
+
624
+ This rubric answers one question: which reduction is the most useful durable carry-forward artifact?
625
+
626
+ Evaluate each candidate on:
627
+
628
+ - fidelity
629
+ - does it preserve the important facts and conclusions from the source session?
630
+ - compression
631
+ - does it remove noise without discarding durable signal?
632
+ - uncertainty
633
+ - does it preserve unresolved caveats instead of laundering them away?
634
+ - next_step_utility
635
+ - would this artifact actually help a later operator or human continue the work without reopening the whole source session?
636
+ - evidence
637
+ - are the claims grounded in visible source artifacts?
638
+
639
+ Recommendation posture:
640
+
641
+ - carry_forward_now
642
+ - strong enough to use as the preferred reduction artifact immediately
643
+ - usable_with_gap
644
+ - useful, but has clear omissions or weaknesses
645
+ - not_recommended
646
+ - too lossy, misleading, or weak to be the preferred carry-forward artifact
647
+
648
+ Comparison guidance:
649
+
650
+ - prefer durable synthesis over exhaustive recap
651
+ - prefer honest uncertainty over false confidence
652
+ - prefer actionable carry-forward guidance over generic summary language
653
+ - prefer reductions that preserve the session's decisions and caveats, not just its topic area
654
+ - do not reward verbosity by default
655
+
656
+ ## Candidate posture
657
+
658
+ Each candidate assessment should also name:
659
+
660
+ - \`recommendation_level\`: \`carry_forward_now\`, \`usable_with_gap\`, or \`not_recommended\`
661
+ - \`quality\`: \`high\`, \`medium\`, or \`low\`
662
+
663
+ These fields preserve the practical decision posture a reduction selector needs:
664
+
665
+ - \`carry_forward_now\` means the reduction is strong enough to use immediately as the preferred carry-forward artifact
666
+ - \`usable_with_gap\` means the reduction is directionally useful but has bounded omissions or weaknesses
667
+ - \`not_recommended\` means the reduction should not win the stage
668
+
669
+ Use \`strengths\` and \`gaps\` for per-candidate observations only. Put cross-candidate tradeoffs in \`comparison\` and final winner justification in \`rationale\`.
670
+
671
+ Ranking rule:
672
+
673
+ - rank the full eligible candidate set with no ties
674
+ - set \`preferred\` equal to \`ranking[0]\`
675
+ - make \`comparison\` explain why \`ranking[0]\` beat \`ranking[1]\`
676
+
677
+ The verification artifact should also include:
678
+
679
+ - \`comparison\`: cross-candidate tradeoffs, explicitly including why \`ranking[0]\` beat \`ranking[1]\`
680
+ - \`rationale\`: why \`preferred\` / \`ranking[0]\` is the best carry-forward choice
681
+ - \`next_actions\`: short, operational follow-up for the selected path only
682
+ `,
683
+ schema: `type: object
684
+ required:
685
+ - assessments
686
+ - preferred
687
+ - comparison
688
+ - ranking
689
+ - rationale
690
+ - next_actions
691
+ properties:
692
+ assessments:
693
+ type: array
694
+ items:
695
+ type: object
696
+ required:
697
+ - candidate
698
+ - recommendation_level
699
+ - quality
700
+ - evaluation
701
+ - strengths
702
+ - gaps
703
+ - evidence_refs
704
+ properties:
705
+ candidate:
706
+ type: string
707
+ recommendation_level:
708
+ type: string
709
+ enum: ["carry_forward_now", "usable_with_gap", "not_recommended"]
710
+ quality:
711
+ type: string
712
+ enum: ["high", "medium", "low"]
713
+ evaluation:
714
+ type: object
715
+ required:
716
+ - fidelity
717
+ - compression
718
+ - uncertainty
719
+ - next_step_utility
720
+ - evidence
721
+ properties:
722
+ fidelity:
723
+ type: string
724
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
725
+ compression:
726
+ type: string
727
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
728
+ uncertainty:
729
+ type: string
730
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
731
+ next_step_utility:
732
+ type: string
733
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
734
+ evidence:
735
+ type: string
736
+ enum: ["strong", "acceptable", "weak", "missing"]
737
+ strengths:
738
+ type: array
739
+ items:
740
+ type: string
741
+ gaps:
742
+ type: array
743
+ items:
744
+ type: string
745
+ evidence_refs:
746
+ type: array
747
+ items:
748
+ type: string
749
+ comparison:
750
+ type: string
751
+ preferred:
752
+ type: string
753
+ ranking:
754
+ type: array
755
+ items:
756
+ type: string
757
+ rationale:
758
+ type: string
759
+ next_actions:
760
+ type: array
761
+ items:
762
+ type: string
763
+ `,
764
+ },
765
+ {
766
+ name: "message-verification",
767
+ prompt: `You are a blinded verifier agent reviewing multiple message responses to the same prompt and choosing the single best response artifact to carry forward.
768
+
769
+ Inputs:
770
+
771
+ - the original message prompt
772
+ - the full blinded response set
773
+ - any shared context needed to understand the prompt
774
+
775
+ Expected working style:
776
+
777
+ 1. Read the original message prompt first.
778
+ 2. Derive the key response requirements the prompt establishes and use stable ids such as \`R1\`, \`R2\`, \`R3\`.
779
+ 3. Inspect the blinded response set directly.
780
+ 4. Assess each response against the verification rubric.
781
+ 5. Record per-response requirement coverage, completion posture, recommendation posture, and bounded follow-up work.
782
+ 6. Derive a strict best-to-worst ranking from those assessments.
783
+ 7. Make the ranking strict, complete, and tie-free across the full eligible response set.
784
+ 8. Set \`preferred\` equal to \`ranking[0]\`.
785
+
786
+ Judgment discipline:
787
+
788
+ - make claims only when you can point to evidence from the prompt or the response artifacts
789
+ - if you cannot verify something, say so explicitly
790
+ - focus on whether the response actually answered the asked prompt, not whether it merely sounds polished
791
+ - distinguish bounded omissions from fundamental prompt misses
792
+ - focus on decision-relevant follow-up work
793
+ - include lightweight \`evidence_refs\` for each response assessment
794
+ - keep \`comparison\` focused on cross-response tradeoffs such as prompt adherence, response quality, and carry-forward usefulness
795
+ - make \`comparison\` explain why \`ranking[0]\` beat \`ranking[1]\`, not just why lower-ranked responses lost
796
+ - include \`next_actions\` only for the selected response path
797
+
798
+ Expected output shape:
799
+
800
+ - \`assessments[]\` with one entry per response candidate
801
+ - top-level \`preferred\` naming the selected candidate
802
+ - each assessment should include:
803
+ - \`candidate\`
804
+ - \`completion_status\`
805
+ - \`recommendation_level\`
806
+ - \`quality\`
807
+ - \`evaluation\`
808
+ - \`requirements_coverage\`
809
+ - \`implementation_notes\`
810
+ - \`follow_up\`
811
+ - \`evidence_refs\`
812
+ - top-level \`comparison\` should capture cross-candidate tradeoffs
813
+ - top-level \`ranking\` must be strict, complete, and tie-free
814
+ - top-level \`rationale\` should explain why \`preferred\` / \`ranking[0]\` is the best carry-forward response
815
+ - top-level \`next_actions\` should stay short and operational
816
+ `,
817
+ rubric: `# Message Review
818
+
819
+ Review the response set by assessing each candidate on:
820
+
821
+ - prompt adherence
822
+ - task fit
823
+ - response quality
824
+ - decision usefulness
825
+ - evidence
826
+
827
+ Then derive a final ranking from those assessments.
828
+
829
+ ## Prompt Adherence
830
+
831
+ Ask:
832
+
833
+ - Does the response answer the actual prompt?
834
+ - Are key prompt requirements clearly met, partially met, not met, or not verifiable?
835
+ - Are there obvious mismatches between what the prompt asked for and what the response delivered?
836
+
837
+ Prompt adherence should dominate polish or stylistic preference.
838
+
839
+ Every candidate assessment should include explicit \`requirements_coverage\` entries so the ranking is traceable back to the original prompt rather than inferred from generic quality labels.
840
+
841
+ ## Task Fit
842
+
843
+ Ask:
844
+
845
+ - Does the response take the right posture for the prompt, not just produce superficially plausible language?
846
+ - Does it stay within the asked scope instead of drifting into adjacent advice, cleanup, or speculation?
847
+ - Does it answer at the right level of abstraction for the task?
848
+
849
+ ## Response Quality
850
+
851
+ Ask:
852
+
853
+ - Is the response coherent, direct, and internally consistent?
854
+ - Does it surface uncertainty honestly instead of bluffing?
855
+ - Does it preserve the important distinctions or caveats the prompt context requires?
856
+
857
+ ## Decision Usefulness
858
+
859
+ Ask:
860
+
861
+ - Would this response be the best durable artifact to keep from the message session?
862
+ - Does it leave later operators or humans with a clear answer, recommendation, or next step?
863
+ - Are any missing follow-ups bounded and low-risk, or do they reopen major prompt questions?
864
+
865
+ ## Evidence
866
+
867
+ Ask:
868
+
869
+ - Are important claims supported by concrete prompt or response evidence?
870
+ - Does the response leave meaningful uncertainty unresolved?
871
+
872
+ Evidence here means direct artifact evidence for the prompt/response pair itself:
873
+
874
+ - the staged prompt artifact
875
+ - the blinded response artifacts
876
+ - cited prompt or response excerpts
877
+
878
+ ## Candidate posture
879
+
880
+ Each candidate assessment should also name:
881
+
882
+ - \`completion_status\`: \`complete\`, \`complete_with_gap\`, \`complete_with_gaps\`, \`incomplete\`, or \`not_verifiable\`
883
+ - \`recommendation_level\`: \`carry_forward_now\`, \`strong_foundation\`, or \`not_recommended\`
884
+ - \`quality\`: \`high\`, \`medium\`, or \`low\`
885
+
886
+ These fields preserve the practical decision posture a message selector needs:
887
+
888
+ - \`carry_forward_now\` means the response is fit to keep as the preferred durable message artifact immediately
889
+ - \`strong_foundation\` means the response is directionally strong but still needs bounded tightening or follow-up
890
+ - \`not_recommended\` means the response should not win the message verification
891
+
892
+ ## Ranking rule
893
+
894
+ The final ranking should follow from the candidate assessments above.
895
+
896
+ It should answer:
897
+
898
+ - which response should win?
899
+ - which ordering best reflects carry-forward usefulness?
900
+
901
+ It should not ignore the structured per-candidate assessments.
902
+ It must rank the full eligible response set with no ties.
903
+ Set \`preferred\` equal to \`ranking[0]\`.
904
+
905
+ The verification artifact should also include:
906
+
907
+ - \`comparison\`: cross-candidate tradeoffs, explicitly including why \`ranking[0]\` beat \`ranking[1]\`
908
+ - \`rationale\`: why \`preferred\` / \`ranking[0]\` is the best choice
909
+ - \`next_actions\`: short, operational follow-up for the selected path
910
+ `,
911
+ schema: `type: object
912
+ required:
913
+ - assessments
914
+ - preferred
915
+ - comparison
916
+ - ranking
917
+ - rationale
918
+ - next_actions
919
+ properties:
920
+ assessments:
921
+ type: array
922
+ items:
923
+ type: object
924
+ required:
925
+ - candidate
926
+ - completion_status
927
+ - recommendation_level
928
+ - quality
929
+ - evaluation
930
+ - requirements_coverage
931
+ - implementation_notes
932
+ - follow_up
933
+ - evidence_refs
934
+ properties:
935
+ candidate:
936
+ type: string
937
+ completion_status:
938
+ type: string
939
+ enum: ["complete", "complete_with_gap", "complete_with_gaps", "incomplete", "not_verifiable"]
940
+ recommendation_level:
941
+ type: string
942
+ enum: ["carry_forward_now", "strong_foundation", "not_recommended"]
943
+ quality:
944
+ type: string
945
+ enum: ["high", "medium", "low"]
946
+ evaluation:
947
+ type: object
948
+ required:
949
+ - prompt_adherence
950
+ - task_fit
951
+ - response_quality
952
+ - decision_usefulness
953
+ - evidence
954
+ properties:
955
+ prompt_adherence:
956
+ type: string
957
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
958
+ task_fit:
959
+ type: string
960
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
961
+ response_quality:
962
+ type: string
963
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
964
+ decision_usefulness:
965
+ type: string
966
+ enum: ["strong", "acceptable", "weak", "not_verifiable"]
967
+ evidence:
968
+ type: string
969
+ enum: ["strong", "acceptable", "weak", "missing"]
970
+ requirements_coverage:
971
+ type: array
972
+ items:
973
+ type: object
974
+ required:
975
+ - requirement
976
+ - status
977
+ - note
978
+ - evidence_refs
979
+ properties:
980
+ requirement:
981
+ type: string
982
+ status:
983
+ type: string
984
+ enum: ["met", "partial", "not_met", "not_verifiable"]
985
+ note:
986
+ type: string
987
+ evidence_refs:
988
+ type: array
989
+ items:
990
+ type: string
991
+ implementation_notes:
992
+ type: string
993
+ follow_up:
994
+ type: array
995
+ items:
996
+ type: string
997
+ evidence_refs:
998
+ type: array
999
+ items:
1000
+ type: string
1001
+ comparison:
1002
+ type: string
1003
+ preferred:
1004
+ type: string
1005
+ ranking:
1006
+ type: array
1007
+ items:
1008
+ type: string
1009
+ rationale:
1010
+ type: string
1011
+ next_actions:
1012
+ type: array
1013
+ items:
1014
+ type: string
1015
+ `,
1016
+ },
1017
+ ];