@pugi/cli 0.1.0-beta.10 → 0.1.0-beta.101

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (464) hide show
  1. package/CHANGELOG.md +132 -0
  2. package/LICENSE +1 -1
  3. package/README.md +55 -11
  4. package/assets/pugi-prozr2-mascot.ansi +9 -0
  5. package/bin/run.js +33 -1
  6. package/dist/commands/deploy.js +40 -40
  7. package/dist/commands/flatten.js +191 -0
  8. package/dist/commands/jobs-watch.js +201 -0
  9. package/dist/commands/jobs.js +42 -27
  10. package/dist/commands/retro.js +210 -0
  11. package/dist/commands/smoke.js +133 -0
  12. package/dist/core/agent-progress/cleanup.js +134 -0
  13. package/dist/core/agent-progress/schema.js +144 -0
  14. package/dist/core/agent-progress/writer.js +101 -0
  15. package/dist/core/agents/adaptive-router.js +330 -0
  16. package/dist/core/agents/query-decomposer.js +297 -0
  17. package/dist/core/agents/registry.js +3 -3
  18. package/dist/core/approvals/shortcut-resolver.js +98 -0
  19. package/dist/core/artifact-chain/dispatcher.js +148 -0
  20. package/dist/core/artifact-chain/exporter.js +164 -0
  21. package/dist/core/artifact-chain/state.js +243 -0
  22. package/dist/core/artifact-chain/steps.js +169 -0
  23. package/dist/core/ask-user/question.js +92 -0
  24. package/dist/core/audit/audit-trail.js +275 -0
  25. package/dist/core/auth/ensure-authenticated.js +129 -0
  26. package/dist/core/auth/env-provider.js +238 -0
  27. package/dist/core/auto-open-browser.js +4 -4
  28. package/dist/core/auto-update/channels.js +122 -0
  29. package/dist/core/auto-update/checker.js +241 -0
  30. package/dist/core/auto-update/state.js +235 -0
  31. package/dist/core/bare-mode/index.js +107 -0
  32. package/dist/core/bash/redirect.js +281 -0
  33. package/dist/core/bash-classifier.js +436 -40
  34. package/dist/core/checkpoint/resumer.js +149 -0
  35. package/dist/core/checkpoint/rewinder.js +291 -0
  36. package/dist/core/checkpoints/shadow-git.js +670 -0
  37. package/dist/core/citations/parser.js +109 -0
  38. package/dist/core/classifier/yolo-classifier.js +88 -0
  39. package/dist/core/codegraph/db.js +506 -0
  40. package/dist/core/codegraph/decision-store.js +248 -0
  41. package/dist/core/codegraph/detect-repo.js +459 -0
  42. package/dist/core/codegraph/install.js +134 -0
  43. package/dist/core/codegraph/offer-hook.js +220 -0
  44. package/dist/core/codegraph/parser.js +598 -0
  45. package/dist/core/codegraph/queries/go.scm +57 -0
  46. package/dist/core/codegraph/queries/javascript.scm +56 -0
  47. package/dist/core/codegraph/queries/python.scm +55 -0
  48. package/dist/core/codegraph/queries/rust.scm +63 -0
  49. package/dist/core/codegraph/queries/typescript.scm +91 -0
  50. package/dist/core/codegraph/reindex.js +218 -0
  51. package/dist/core/codegraph/resolve-edges.js +107 -0
  52. package/dist/core/codegraph/types.js +34 -0
  53. package/dist/core/codegraph/watcher.js +440 -0
  54. package/dist/core/compact/auto-trigger.js +96 -0
  55. package/dist/core/compact/buffer-rewriter.js +115 -0
  56. package/dist/core/compact/summarizer.js +208 -0
  57. package/dist/core/compact/token-counter.js +108 -0
  58. package/dist/core/consensus/anvil-fanout.js +25 -25
  59. package/dist/core/consensus/diff-capture.js +121 -12
  60. package/dist/core/consensus/rubric.js +21 -21
  61. package/dist/core/context/builder.js +6 -6
  62. package/dist/core/context/compaction-events.js +8 -8
  63. package/dist/core/context/compaction.js +31 -31
  64. package/dist/core/context/index.js +15 -8
  65. package/dist/core/context/invariants.js +51 -51
  66. package/dist/core/context/markdown-loader.js +28 -10
  67. package/dist/core/context/markdown-traverse.js +255 -0
  68. package/dist/core/context/pugiignore.js +41 -41
  69. package/dist/core/context/repo-skeleton.js +37 -37
  70. package/dist/core/context/tool-eviction.js +55 -0
  71. package/dist/core/context/watcher.js +32 -32
  72. package/dist/core/context/working-set.js +23 -23
  73. package/dist/core/coordinator/agent-tools.js +77 -0
  74. package/dist/core/coordinator/agent-toolset.js +65 -0
  75. package/dist/core/coordinator/fsm.js +73 -0
  76. package/dist/core/coordinator/mode-fsm.js +70 -0
  77. package/dist/core/cost/rate-card.js +129 -0
  78. package/dist/core/cost/tracker.js +221 -0
  79. package/dist/core/credentials.js +13 -13
  80. package/dist/core/cron/scheduler.js +138 -0
  81. package/dist/core/denial-tracking/index.js +8 -0
  82. package/dist/core/denial-tracking/state.js +264 -0
  83. package/dist/core/diagnostics/probe-runner.js +93 -0
  84. package/dist/core/diagnostics/probes/api.js +46 -0
  85. package/dist/core/diagnostics/probes/auth.js +93 -0
  86. package/dist/core/diagnostics/probes/bare-mode.js +42 -0
  87. package/dist/core/diagnostics/probes/cli-version.js +127 -0
  88. package/dist/core/diagnostics/probes/config.js +72 -0
  89. package/dist/core/diagnostics/probes/denial-tracking.js +57 -0
  90. package/dist/core/diagnostics/probes/disk.js +81 -0
  91. package/dist/core/diagnostics/probes/engine-live.js +46 -0
  92. package/dist/core/diagnostics/probes/git.js +65 -0
  93. package/dist/core/diagnostics/probes/hooks.js +118 -0
  94. package/dist/core/diagnostics/probes/mcp.js +75 -0
  95. package/dist/core/diagnostics/probes/node.js +59 -0
  96. package/dist/core/diagnostics/probes/pnpm.js +36 -0
  97. package/dist/core/diagnostics/probes/pugi-md.js +89 -0
  98. package/dist/core/diagnostics/probes/sandbox.js +67 -0
  99. package/dist/core/diagnostics/probes/session.js +74 -0
  100. package/dist/core/diagnostics/probes/status-snapshot.js +488 -0
  101. package/dist/core/diagnostics/probes/workspace.js +63 -0
  102. package/dist/core/diagnostics/types.js +70 -0
  103. package/dist/core/dispatch/cache-cleanup.js +197 -0
  104. package/dist/core/dispatch/cache-handoff.js +295 -0
  105. package/dist/core/edits/apply-patch-layer-e.js +189 -0
  106. package/dist/core/edits/dispatch.js +333 -7
  107. package/dist/core/edits/format-detector.js +260 -0
  108. package/dist/core/edits/format-matrix.js +26 -0
  109. package/dist/core/edits/fuzzy-ladder.js +650 -0
  110. package/dist/core/edits/index.js +5 -1
  111. package/dist/core/edits/journal.js +199 -0
  112. package/dist/core/edits/layer-a-apply.js +15 -15
  113. package/dist/core/edits/layer-a-fuzzy-apply.js +198 -0
  114. package/dist/core/edits/layer-b-apply.js +9 -9
  115. package/dist/core/edits/layer-c-apply.js +6 -6
  116. package/dist/core/edits/layer-d-ast.js +557 -14
  117. package/dist/core/edits/marker-parser.js +12 -12
  118. package/dist/core/edits/security-gate.js +27 -27
  119. package/dist/core/edits/verify-hook.js +273 -0
  120. package/dist/core/edits/worktree.js +29 -29
  121. package/dist/core/engine/anvil-client.js +214 -26
  122. package/dist/core/engine/auto-compact.js +247 -0
  123. package/dist/core/engine/budgets.js +220 -0
  124. package/dist/core/engine/compact-llm-summarizer.js +124 -0
  125. package/dist/core/engine/context-prefix.js +155 -0
  126. package/dist/core/engine/index.js +1 -1
  127. package/dist/core/engine/intensity.js +163 -0
  128. package/dist/core/engine/intent.js +260 -0
  129. package/dist/core/engine/native-pugi.js +1559 -227
  130. package/dist/core/engine/prompts.js +219 -19
  131. package/dist/core/engine/strip-internal-fields.js +124 -0
  132. package/dist/core/engine/tool-bridge.js +1887 -59
  133. package/dist/core/engine/verification-patterns.js +195 -0
  134. package/dist/core/eval/v1/ledger.js +83 -0
  135. package/dist/core/eval/v1/runner.js +280 -0
  136. package/dist/core/eval/v1/scoring.js +68 -0
  137. package/dist/core/eval/v1/task-loader.js +191 -0
  138. package/dist/core/eval/v1/types.js +14 -0
  139. package/dist/core/eval/v1/verifier.js +176 -0
  140. package/dist/core/eval/v1/yaml-parser.js +250 -0
  141. package/dist/core/evaluation/golden-dataset.js +293 -0
  142. package/dist/core/feedback/queue.js +177 -0
  143. package/dist/core/feedback/submitter.js +145 -0
  144. package/dist/core/file-cache.js +113 -1
  145. package/dist/core/flatten/flatten-repo.js +439 -0
  146. package/dist/core/format/osc8-link.js +28 -0
  147. package/dist/core/hook-chains.js +392 -0
  148. package/dist/core/hooks/citation-verify-hook.js +138 -0
  149. package/dist/core/hooks/citation-verify.js +112 -0
  150. package/dist/core/hooks/events.js +46 -0
  151. package/dist/core/hooks/index.js +15 -0
  152. package/dist/core/hooks/registry.js +216 -0
  153. package/dist/core/hooks/runner.js +236 -0
  154. package/dist/core/hooks/v2/event-emitter.js +115 -0
  155. package/dist/core/hooks/v2/executor.js +282 -0
  156. package/dist/core/hooks/v2/index.js +25 -0
  157. package/dist/core/hooks/v2/lifecycle.js +104 -0
  158. package/dist/core/hooks/v2/loader.js +216 -0
  159. package/dist/core/hooks/v2/matcher.js +125 -0
  160. package/dist/core/hooks/v2/trust.js +143 -0
  161. package/dist/core/hooks/v2/types.js +86 -0
  162. package/dist/core/hooks/worktree-events.js +158 -0
  163. package/dist/core/image/renderer.js +71 -0
  164. package/dist/core/init/detector.js +582 -0
  165. package/dist/core/init/template-renderer.js +242 -0
  166. package/dist/core/jobs/registry.js +18 -18
  167. package/dist/core/ledger/results-tsv.js +142 -0
  168. package/dist/core/log-discipline/stdout-redirect.js +51 -0
  169. package/dist/core/lsp/cache.js +105 -0
  170. package/dist/core/lsp/client.js +551 -41
  171. package/dist/core/lsp/language-detect.js +66 -0
  172. package/dist/core/lsp/post-edit-diagnostics.js +171 -0
  173. package/dist/core/lsp/server-detect.js +173 -0
  174. package/dist/core/lsp/symbol-cache.js +162 -0
  175. package/dist/core/lsp/symbol-tools.js +664 -0
  176. package/dist/core/mcp/client.js +97 -28
  177. package/dist/core/mcp/http-server.js +553 -0
  178. package/dist/core/mcp/orchestrator-config.js +192 -0
  179. package/dist/core/mcp/orchestrator-tools.js +806 -0
  180. package/dist/core/mcp/permission.js +190 -0
  181. package/dist/core/mcp/registry.js +39 -17
  182. package/dist/core/mcp/server-tools.js +219 -0
  183. package/dist/core/mcp/server.js +397 -0
  184. package/dist/core/mcp/trust.js +10 -10
  185. package/dist/core/memory/dual-write.js +416 -0
  186. package/dist/core/memory/passive-extract.js +130 -0
  187. package/dist/core/memory/phase1-kinds.js +20 -0
  188. package/dist/core/memory/secret-scanner.js +304 -0
  189. package/dist/core/memory-sync/queue.js +170 -0
  190. package/dist/core/metrics/extract.js +113 -0
  191. package/dist/core/modes/roo-modes.js +68 -0
  192. package/dist/core/notes/notes-paths.js +113 -0
  193. package/dist/core/notes/notes-recorder.js +140 -0
  194. package/dist/core/notes/notes-writer.js +53 -0
  195. package/dist/core/notes/renderers.js +0 -0
  196. package/dist/core/notes/slug.js +105 -0
  197. package/dist/core/onboarding/ensure-initialized.js +133 -0
  198. package/dist/core/onboarding/marker.js +111 -0
  199. package/dist/core/onboarding/telemetry-state.js +108 -0
  200. package/dist/core/output-style/presets.js +176 -0
  201. package/dist/core/output-style/state.js +185 -0
  202. package/dist/core/path-security.js +287 -5
  203. package/dist/core/permission.js +82 -22
  204. package/dist/core/permissions/auto-classifier.js +124 -0
  205. package/dist/core/permissions/bash-parser.js +371 -0
  206. package/dist/core/permissions/circuit-breaker.js +83 -0
  207. package/dist/core/permissions/constrained-edit.js +91 -0
  208. package/dist/core/permissions/gate.js +278 -0
  209. package/dist/core/permissions/index.js +20 -0
  210. package/dist/core/permissions/mode.js +174 -0
  211. package/dist/core/permissions/network-egress.js +137 -0
  212. package/dist/core/permissions/state.js +241 -0
  213. package/dist/core/permissions/tool-class.js +107 -0
  214. package/dist/core/plan-mode/ui-state.js +51 -0
  215. package/dist/core/plans/plan-artifact.js +721 -0
  216. package/dist/core/policy-limits/etag-store.js +122 -0
  217. package/dist/core/prd-check/parser.js +215 -0
  218. package/dist/core/prd-check/reporter.js +127 -0
  219. package/dist/core/prd-check/session-review.js +557 -0
  220. package/dist/core/prd-check/verifiers.js +223 -0
  221. package/dist/core/prompt-cache/client-cache.js +99 -0
  222. package/dist/core/prompts/assembly.js +29 -0
  223. package/dist/core/prompts/registry.js +364 -0
  224. package/dist/core/pugi-gitignore.js +52 -0
  225. package/dist/core/pugi-md/cc-compat-rules.js +735 -0
  226. package/dist/core/pugi-md/context-injector.js +76 -0
  227. package/dist/core/pugi-md/walk-up.js +207 -0
  228. package/dist/core/python/uv-installer.js +270 -0
  229. package/dist/core/python/uv-resolver.js +83 -0
  230. package/dist/core/rate-limit/narrator.js +146 -0
  231. package/dist/core/recipes/cli-types.js +20 -0
  232. package/dist/core/recipes/loader.js +103 -0
  233. package/dist/core/recipes/runner.js +345 -0
  234. package/dist/core/recipes/schema.js +587 -0
  235. package/dist/core/release-notes/parser.js +241 -0
  236. package/dist/core/release-notes/state.js +116 -0
  237. package/dist/core/repl/ask.js +37 -37
  238. package/dist/core/repl/cancellation.js +26 -26
  239. package/dist/core/repl/cap-warning.js +4 -4
  240. package/dist/core/repl/clipboard-read.js +11 -11
  241. package/dist/core/repl/dispatch-fsm.js +12 -12
  242. package/dist/core/repl/engine-bridge.js +303 -0
  243. package/dist/core/repl/history-search.js +15 -15
  244. package/dist/core/repl/history.js +28 -18
  245. package/dist/core/repl/kill-ring.js +5 -5
  246. package/dist/core/repl/model-pricing.js +135 -0
  247. package/dist/core/repl/privacy-banner.js +22 -22
  248. package/dist/core/repl/session.js +2690 -229
  249. package/dist/core/repl/slash-commands.js +540 -41
  250. package/dist/core/repl/store/index.js +1 -1
  251. package/dist/core/repl/store/jsonl-log.js +22 -22
  252. package/dist/core/repl/store/lockfile.js +10 -10
  253. package/dist/core/repl/store/session-store.js +136 -107
  254. package/dist/core/repl/store/types.js +15 -15
  255. package/dist/core/repl/store/uuid-v7.js +12 -12
  256. package/dist/core/repl/tool-route.js +382 -0
  257. package/dist/core/repl/workspace-context.js +43 -21
  258. package/dist/core/repo-map/build.js +125 -0
  259. package/dist/core/repo-map/cache.js +185 -0
  260. package/dist/core/repo-map/extractor.js +254 -0
  261. package/dist/core/repo-map/formatter.js +145 -0
  262. package/dist/core/repo-map/page-rank.js +105 -0
  263. package/dist/core/repo-map/scanner.js +211 -0
  264. package/dist/core/retro/git-collector.js +251 -0
  265. package/dist/core/retro/health-card.js +25 -0
  266. package/dist/core/retro/metrics.js +342 -0
  267. package/dist/core/retro/narrative.js +249 -0
  268. package/dist/core/retro/plane-collector.js +274 -0
  269. package/dist/core/retro/pr-issue-link.js +65 -0
  270. package/dist/core/retro/types.js +16 -0
  271. package/dist/core/retry-budget/budget.js +284 -0
  272. package/dist/core/retry-budget/index.js +5 -0
  273. package/dist/core/retry-budget/retry-cap.js +74 -0
  274. package/dist/core/routing/lead-worker.js +43 -0
  275. package/dist/core/routing/pre-flight-estimator.js +108 -0
  276. package/dist/core/runs/run-tree.js +103 -0
  277. package/dist/core/sandboxing/adapter.js +43 -0
  278. package/dist/core/sandboxing/bubblewrap.js +209 -0
  279. package/dist/core/sandboxing/index.js +78 -0
  280. package/dist/core/sandboxing/none.js +19 -0
  281. package/dist/core/sandboxing/policy.js +97 -0
  282. package/dist/core/sandboxing/seatbelt.js +231 -0
  283. package/dist/core/security/injection-scanner.js +367 -0
  284. package/dist/core/security/output-filter.js +418 -0
  285. package/dist/core/session/env-file.js +105 -0
  286. package/dist/core/session/section-budgets.js +140 -0
  287. package/dist/core/session.js +119 -0
  288. package/dist/core/settings.js +402 -5
  289. package/dist/core/share/formatter.js +271 -0
  290. package/dist/core/share/redactor.js +221 -0
  291. package/dist/core/share/uploader.js +267 -0
  292. package/dist/core/skills/defaults.js +30 -30
  293. package/dist/core/skills/loader.js +22 -22
  294. package/dist/core/skills/sources.js +27 -27
  295. package/dist/core/smoke/headless-driver.js +174 -0
  296. package/dist/core/smoke/orchestrator.js +194 -0
  297. package/dist/core/smoke/runner.js +238 -0
  298. package/dist/core/smoke/scenario-parser.js +316 -0
  299. package/dist/core/statusline.js +99 -0
  300. package/dist/core/subagents/dispatcher-real.js +600 -0
  301. package/dist/core/subagents/dispatcher.js +146 -52
  302. package/dist/core/subagents/index.js +19 -6
  303. package/dist/core/subagents/isolation-matrix.js +213 -0
  304. package/dist/core/subagents/spawn.js +19 -4
  305. package/dist/core/telemetry/emitter.js +229 -0
  306. package/dist/core/telemetry/queue.js +251 -0
  307. package/dist/core/theme/context.js +91 -0
  308. package/dist/core/theme/presets.js +228 -0
  309. package/dist/core/theme/state.js +181 -0
  310. package/dist/core/todos/invariant.js +10 -0
  311. package/dist/core/todos/state.js +177 -0
  312. package/dist/core/tool-schema/compressor.js +89 -0
  313. package/dist/core/transport/version-interceptor.js +166 -0
  314. package/dist/core/trust.js +2 -2
  315. package/dist/core/tui/thinking-block.js +64 -0
  316. package/dist/core/vim/keymap.js +288 -0
  317. package/dist/core/vim/state.js +92 -0
  318. package/dist/core/watch-markers/marker-watcher.js +133 -0
  319. package/dist/core/worktree/include-parser.js +249 -0
  320. package/dist/core/worktree-manager/cleanup.js +123 -0
  321. package/dist/core/worktree-manager/manager.js +303 -0
  322. package/dist/index.js +36 -0
  323. package/dist/runtime/bootstrap.js +190 -0
  324. package/dist/runtime/cli.js +4403 -561
  325. package/dist/runtime/commands/agents.js +31 -31
  326. package/dist/runtime/commands/budget.js +5 -5
  327. package/dist/runtime/commands/cancel.js +231 -0
  328. package/dist/runtime/commands/chain.js +489 -0
  329. package/dist/runtime/commands/codegraph-status.js +227 -0
  330. package/dist/runtime/commands/compact.js +297 -0
  331. package/dist/runtime/commands/config.js +74 -40
  332. package/dist/runtime/commands/cost.js +199 -0
  333. package/dist/runtime/commands/delegate.js +27 -4
  334. package/dist/runtime/commands/dispatch.js +126 -0
  335. package/dist/runtime/commands/doctor.js +579 -0
  336. package/dist/runtime/commands/eval-v1.js +266 -0
  337. package/dist/runtime/commands/feedback.js +184 -0
  338. package/dist/runtime/commands/hooks.js +187 -0
  339. package/dist/runtime/commands/index-cmd.js +459 -0
  340. package/dist/runtime/commands/init.js +254 -0
  341. package/dist/runtime/commands/lsp.js +200 -38
  342. package/dist/runtime/commands/mcp.js +935 -0
  343. package/dist/runtime/commands/memory.js +582 -0
  344. package/dist/runtime/commands/model.js +237 -0
  345. package/dist/runtime/commands/onboarding.js +275 -0
  346. package/dist/runtime/commands/patch.js +12 -12
  347. package/dist/runtime/commands/permissions.js +112 -0
  348. package/dist/runtime/commands/plan.js +143 -0
  349. package/dist/runtime/commands/prd-check.js +285 -0
  350. package/dist/runtime/commands/privacy.js +17 -17
  351. package/dist/runtime/commands/recipe.js +325 -0
  352. package/dist/runtime/commands/redo-blob-store.js +92 -0
  353. package/dist/runtime/commands/redo.js +361 -0
  354. package/dist/runtime/commands/release-notes.js +229 -0
  355. package/dist/runtime/commands/repo-map.js +95 -0
  356. package/dist/runtime/commands/report.js +299 -0
  357. package/dist/runtime/commands/resume.js +118 -0
  358. package/dist/runtime/commands/review-consensus.js +68 -53
  359. package/dist/runtime/commands/rewind.js +333 -0
  360. package/dist/runtime/commands/roster.js +14 -14
  361. package/dist/runtime/commands/servers-cli.js +182 -0
  362. package/dist/runtime/commands/servers.js +236 -0
  363. package/dist/runtime/commands/sessions.js +163 -0
  364. package/dist/runtime/commands/share.js +316 -0
  365. package/dist/runtime/commands/skills.js +31 -31
  366. package/dist/runtime/commands/status.js +186 -0
  367. package/dist/runtime/commands/stickers.js +82 -0
  368. package/dist/runtime/commands/style.js +194 -0
  369. package/dist/runtime/commands/theme.js +196 -0
  370. package/dist/runtime/commands/undo.js +54 -22
  371. package/dist/runtime/commands/update.js +289 -0
  372. package/dist/runtime/commands/vim.js +140 -0
  373. package/dist/runtime/commands/worktree.js +8 -8
  374. package/dist/runtime/commands/worktrees.js +155 -0
  375. package/dist/runtime/deprecation-warning.js +69 -0
  376. package/dist/runtime/engine-exit-code.js +50 -0
  377. package/dist/runtime/headless-repl.js +195 -0
  378. package/dist/runtime/headless.js +548 -0
  379. package/dist/runtime/load-hooks-or-exit.js +71 -0
  380. package/dist/runtime/plan-decompose.js +22 -22
  381. package/dist/runtime/sigint-guard.js +272 -0
  382. package/dist/runtime/stream-renderer.js +195 -0
  383. package/dist/runtime/update-check.js +28 -28
  384. package/dist/runtime/version.js +65 -0
  385. package/dist/runtime/worktree-bootstrap.js +579 -0
  386. package/dist/skills/bundled/batch.js +617 -0
  387. package/dist/skills/bundled/index.js +45 -0
  388. package/dist/skills/bundled/loop.js +358 -0
  389. package/dist/skills/bundled/remember.js +383 -0
  390. package/dist/skills/bundled/simplify.js +289 -0
  391. package/dist/skills/bundled/skillify.js +373 -0
  392. package/dist/skills/bundled/stuck.js +558 -0
  393. package/dist/skills/bundled/verify.js +439 -0
  394. package/dist/testing/vcr.js +486 -0
  395. package/dist/tools/agent-tool.js +229 -0
  396. package/dist/tools/apply-patch.js +89 -28
  397. package/dist/tools/ask-user-question.js +337 -0
  398. package/dist/tools/ask-user.js +115 -0
  399. package/dist/tools/bash.js +811 -49
  400. package/dist/tools/brief.js +224 -0
  401. package/dist/tools/cron.js +433 -0
  402. package/dist/tools/enter-worktree.js +250 -0
  403. package/dist/tools/exit-worktree.js +147 -0
  404. package/dist/tools/file-tools.js +161 -44
  405. package/dist/tools/http-request.js +336 -0
  406. package/dist/tools/lsp-tools.js +377 -1
  407. package/dist/tools/mcp-tool.js +260 -0
  408. package/dist/tools/multi-edit.js +361 -0
  409. package/dist/tools/powershell.js +268 -0
  410. package/dist/tools/registry.js +120 -5
  411. package/dist/tools/server-tools.js +892 -0
  412. package/dist/tools/skill-tool.js +96 -0
  413. package/dist/tools/sleep.js +99 -0
  414. package/dist/tools/synthetic-output.js +133 -0
  415. package/dist/tools/tasks.js +208 -0
  416. package/dist/tools/todo-write.js +184 -0
  417. package/dist/tools/verify-plan-execution.js +295 -0
  418. package/dist/tools/web-fetch-injection-scanner.js +207 -0
  419. package/dist/tools/web-fetch.js +195 -10
  420. package/dist/tools/web-search.js +458 -0
  421. package/dist/tui/agent-progress-card.js +111 -0
  422. package/dist/tui/agent-tree.js +22 -1
  423. package/dist/tui/ask-modal.js +14 -14
  424. package/dist/tui/ask-user-question-chips.js +315 -0
  425. package/dist/tui/ask-user-question-prompt.js +203 -0
  426. package/dist/tui/compact-banner.js +81 -0
  427. package/dist/tui/conversation-pane.js +85 -11
  428. package/dist/tui/cost-table.js +111 -0
  429. package/dist/tui/device-flow.js +2 -2
  430. package/dist/tui/doctor-table.js +46 -0
  431. package/dist/tui/feedback-prompt.js +156 -0
  432. package/dist/tui/input-box.js +247 -32
  433. package/dist/tui/login-picker.js +3 -3
  434. package/dist/tui/markdown-render.js +6 -6
  435. package/dist/tui/multi-file-diff-approval.js +375 -0
  436. package/dist/tui/onboarding-wizard.js +240 -0
  437. package/dist/tui/permissions-picker.js +86 -0
  438. package/dist/tui/render.js +36 -1
  439. package/dist/tui/repl-render.js +239 -25
  440. package/dist/tui/repl-splash-art.js +16 -16
  441. package/dist/tui/repl-splash-mascot.js +48 -24
  442. package/dist/tui/repl-splash.js +22 -22
  443. package/dist/tui/repl.js +125 -45
  444. package/dist/tui/slash-palette.js +6 -6
  445. package/dist/tui/splash.js +2 -2
  446. package/dist/tui/status-bar.js +109 -31
  447. package/dist/tui/status-table.js +7 -0
  448. package/dist/tui/stickers-art.js +136 -0
  449. package/dist/tui/style-table.js +28 -0
  450. package/dist/tui/theme-table.js +29 -0
  451. package/dist/tui/thinking-spinner.js +123 -0
  452. package/dist/tui/tool-stream-pane.js +53 -4
  453. package/dist/tui/update-banner.js +27 -2
  454. package/dist/tui/vim-input.js +267 -0
  455. package/dist/tui/welcome-banner.js +107 -0
  456. package/dist/tui/welcome-data.js +293 -0
  457. package/dist/tui/workspace-context.js +2 -2
  458. package/package.json +29 -6
  459. package/test/scenarios/codegen-create-file.scenario.txt +13 -0
  460. package/test/scenarios/compact-force.scenario.txt +12 -0
  461. package/test/scenarios/identity.scenario.txt +11 -0
  462. package/test/scenarios/persona-handoff.scenario.txt +12 -0
  463. package/test/scenarios/walkback.scenario.txt +12 -0
  464. package/dist/core/engine/compaction-hook.js +0 -154
@@ -0,0 +1,195 @@
1
+ /**
2
+ * PUGI-VERIFY-GATE — verification command detection.
3
+ *
4
+ * Background: Codex dogfood 2026-06-04 surfaced a P0 trust failure
5
+ * where the Pugi engine returned `status: done` + `exitCode: 0`
6
+ * even after `npm test` exited non-zero on a regression the agent
7
+ * itself had introduced. Root cause: no layer of the dispatch
8
+ * pipeline knew which bash invocations were verification commands,
9
+ * so the engine outcome had no way to gate the final status on
10
+ * test/lint/build pass.
11
+ *
12
+ * This module is the deterministic, configurable allowlist of regex
13
+ * patterns the engine uses to recognise verification commands at
14
+ * dispatch time. The detection is intentionally simple (anchored on
15
+ * the head of the command after sudo / env-prefix stripping) so the
16
+ * allowlist stays auditable. False negatives are recoverable (the
17
+ * agent can re-run with a recognised wrapper); false positives would
18
+ * silently down-grade unrelated commands and are forbidden.
19
+ *
20
+ * The pattern table is exported as `VERIFICATION_PATTERNS`; callers
21
+ * use `detectVerificationCommand(cmd)` for the boolean + tool-tag
22
+ * decision. Both surfaces are pure — no I/O, no session state, no
23
+ * environment reads.
24
+ */
25
+ /**
26
+ * Canonical verification allowlist. Each entry carries a `tool` tag, a
27
+ * `category`, and a `pattern` tested against a component head AFTER:
28
+ * - leading whitespace is trimmed
29
+ * - leading `sudo` / `time` wrappers and inline `KEY=value` assignments are stripped
30
+ *
31
+ * Run each component through `extractCommandHead` before testing a pattern.
32
+ *
33
+ * When extending: keep the regex anchored (`^`) so a path that merely
34
+ * contains the tool name (`./scripts/npm.sh`) does not false-positive.
35
+ */
36
+ export const VERIFICATION_PATTERNS = [ // order matters: detectVerificationCommand returns the first matching entry
37
+ // ----- JavaScript / TypeScript ecosystem -----
38
+ // npm: `npm test` and `npm run test`; lint / typecheck / build only via `npm run <script>`
39
+ { tool: 'npm-test', pattern: /^npm\s+(?:run\s+)?test\b/, category: 'test' },
40
+ { tool: 'npm-lint', pattern: /^npm\s+run\s+lint\b/, category: 'lint' },
41
+ { tool: 'npm-typecheck', pattern: /^npm\s+run\s+typecheck\b/, category: 'typecheck' },
42
+ { tool: 'npm-build', pattern: /^npm\s+run\s+build\b/, category: 'build' },
43
+ // pnpm, optionally preceded by any number of `-C <dir>`, `--filter <sel>` / `--filter=<sel>`, or `-r` flags; `run` is optional
44
+ { tool: 'pnpm-test', pattern: /^pnpm(?:\s+(?:-C\s+\S+|--filter(?:\s+|=)\S+|-r))*\s+(?:run\s+)?test\b/, category: 'test' },
45
+ { tool: 'pnpm-lint', pattern: /^pnpm(?:\s+(?:-C\s+\S+|--filter(?:\s+|=)\S+|-r))*\s+(?:run\s+)?lint\b/, category: 'lint' },
46
+ { tool: 'pnpm-typecheck', pattern: /^pnpm(?:\s+(?:-C\s+\S+|--filter(?:\s+|=)\S+|-r))*\s+(?:run\s+)?typecheck\b/, category: 'typecheck' },
47
+ { tool: 'pnpm-build', pattern: /^pnpm(?:\s+(?:-C\s+\S+|--filter(?:\s+|=)\S+|-r))*\s+(?:run\s+)?build\b/, category: 'build' },
48
+ // yarn: the `run` keyword is optional for every script
49
+ { tool: 'yarn-test', pattern: /^yarn\s+(?:run\s+)?test\b/, category: 'test' },
50
+ { tool: 'yarn-lint', pattern: /^yarn\s+(?:run\s+)?lint\b/, category: 'lint' },
51
+ { tool: 'yarn-typecheck', pattern: /^yarn\s+(?:run\s+)?typecheck\b/, category: 'typecheck' },
52
+ { tool: 'yarn-build', pattern: /^yarn\s+(?:run\s+)?build\b/, category: 'build' },
53
+ // Direct test-runner invocations, with or without a leading `npx`.
54
+ { tool: 'jest', pattern: /^(?:npx\s+)?jest\b/, category: 'test' },
55
+ { tool: 'vitest', pattern: /^(?:npx\s+)?vitest\b/, category: 'test' },
56
+ { tool: 'mocha', pattern: /^(?:npx\s+)?mocha\b/, category: 'test' },
57
+ { tool: 'tsc-typecheck', pattern: /^(?:npx\s+)?tsc\b(?=.*--noEmit|\s*$)/, category: 'typecheck' }, // only a bare `tsc` or an invocation containing --noEmit
58
+ { tool: 'eslint', pattern: /^(?:npx\s+)?eslint\b/, category: 'lint' },
59
+ { tool: 'node-test', pattern: /^node\s+--test\b/, category: 'test' }, // `--test` must be the first argument
60
+ // ----- Python -----
61
+ { tool: 'pytest', pattern: /^(?:python\s+-m\s+)?pytest\b/, category: 'test' }, // the interpreter must be spelled exactly `python`
62
+ { tool: 'python-unittest', pattern: /^python\s+-m\s+unittest\b/, category: 'test' },
63
+ { tool: 'ruff', pattern: /^ruff\s+check\b/, category: 'lint' },
64
+ { tool: 'mypy', pattern: /^mypy\b/, category: 'typecheck' },
65
+ // ----- Rust -----
66
+ { tool: 'cargo-test', pattern: /^cargo\s+test\b/, category: 'test' },
67
+ { tool: 'cargo-check', pattern: /^cargo\s+check\b/, category: 'typecheck' },
68
+ { tool: 'cargo-clippy', pattern: /^cargo\s+clippy\b/, category: 'lint' },
69
+ { tool: 'cargo-build', pattern: /^cargo\s+build\b/, category: 'build' },
70
+ // ----- Go -----
71
+ { tool: 'go-test', pattern: /^go\s+test\b/, category: 'test' },
72
+ { tool: 'go-vet', pattern: /^go\s+vet\b/, category: 'lint' },
73
+ { tool: 'go-build', pattern: /^go\s+build\b/, category: 'build' },
74
+ // ----- Elixir -----
75
+ { tool: 'mix-test', pattern: /^mix\s+test\b/, category: 'test' },
76
+ // ----- Ruby (optionally via `bundle exec`) -----
77
+ { tool: 'rspec', pattern: /^(?:bundle\s+exec\s+)?rspec\b/, category: 'test' },
78
+ { tool: 'rubocop', pattern: /^(?:bundle\s+exec\s+)?rubocop\b/, category: 'lint' },
79
+ // ----- Java / Kotlin / Gradle / Maven (`gradle`, `gradlew`, or `./gradlew`) -----
80
+ { tool: 'gradle-test', pattern: /^(?:\.\/)?gradlew?\s+test\b/, category: 'test' },
81
+ { tool: 'gradle-build', pattern: /^(?:\.\/)?gradlew?\s+build\b/, category: 'build' },
82
+ { tool: 'maven-test', pattern: /^mvn\s+test\b/, category: 'test' },
83
+ { tool: 'maven-verify', pattern: /^mvn\s+verify\b/, category: 'test' },
84
+ // ----- C/C++ / Make (`make test` or `make check`) -----
85
+ { tool: 'make-test', pattern: /^make\s+(?:test|check)\b/, category: 'test' },
86
+ { tool: 'ctest', pattern: /^ctest\b/, category: 'test' },
87
+ ];
88
// Splits a command line on `&&`, `||`, `;` and `|`, swallowing the
// whitespace around each separator. `\|\|` is listed before `\|` so a
// logical OR is consumed as one separator rather than two pipes.
const SHELL_SEPARATORS = /\s*(?:&&|\|\||;|\|)\s*/;
// One inline shell assignment token (`NAME=value`). Shell variable names may
// be lower-case and the value may be empty (`FOO= cmd`), so both are accepted.
const ENV_ASSIGN = /^[A-Za-z_][A-Za-z0-9_]*=\S*$/;
// Wrapper commands that simply run the rest of the line. The trailing `\s+`
// keeps look-alikes such as `timeout` or a bare `env` from being treated as
// wrappers.
const WRAPPER_PREFIX = /^(?:sudo|time|env)\s+/;
/**
 * Strip leading `sudo` / `time` / `env` wrappers and inline `KEY=value`
 * assignments so the verb is the first token of the returned string.
 * Pure — no side effects.
 *
 * Prefixes are peeled one at a time and in any order, so
 * `sudo env CI=1 FOO=bar pnpm test` resolves to `pnpm test`. Inline
 * assignments typed by the operator (e.g. `CI=1 pnpm test`) ARE stripped:
 * the allowlist regexes anchor on the tool name, so the tool must end up at
 * the very start of the head.
 *
 * Wrapper options (`sudo -u root …`, `env -i …`) are not understood; such a
 * command is returned with the option still at its head and will simply not
 * match the allowlist (a recoverable false negative).
 *
 * @param {string} component - One shell-separated component of a command line.
 * @returns {string} The component with leading whitespace, wrappers and
 *   assignments removed; `''` when nothing else remains.
 */
export function extractCommandHead(component) {
    let head = component.trim();
    while (head !== '') {
        const wrapper = WRAPPER_PREFIX.exec(head);
        if (wrapper !== null) {
            // Drop the wrapper word together with the whitespace that follows it.
            head = head.slice(wrapper[0].length);
            continue;
        }
        // Peel one assignment token at a time so `FOO=bar BAZ=qux pnpm test`
        // resolves to `pnpm test`.
        const firstToken = head.split(/\s+/, 1)[0] ?? '';
        if (ENV_ASSIGN.test(firstToken)) {
            head = head.slice(firstToken.length).trimStart();
            continue;
        }
        break;
    }
    return head;
}
123
/**
 * Decide whether a shell command line contains a verification step.
 *
 * The line is cut on `SHELL_SEPARATORS`, each piece is reduced to its head
 * with `extractCommandHead`, and the head is tested against
 * `VERIFICATION_PATTERNS` in table order. The first piece that matches wins,
 * so `cd packages/foo && pnpm test` is reported on its trailing piece.
 *
 * No shell parsing is attempted beyond the separator split: `if` / `for`
 * bodies and functions are not inspected, and a script wrapper such as
 * `./scripts/test.sh` is reported as "not a verification".
 *
 * @param {unknown} cmd - The raw command line.
 * @returns {{ isVerification: boolean, tool: string | null, matchedComponent: string }}
 *   On a match, `tool` is the entry's tag and `matchedComponent` is the
 *   trimmed piece that matched; otherwise `tool` is `null` and
 *   `matchedComponent` is `''`.
 */
export function detectVerificationCommand(cmd) {
    // Built per call so callers never share a mutable result object.
    const noMatch = { isVerification: false, tool: null, matchedComponent: '' };
    if (typeof cmd !== 'string' || cmd.trim() === '') {
        return noMatch;
    }
    for (const segment of cmd.split(SHELL_SEPARATORS)) {
        const head = extractCommandHead(segment);
        if (head.length === 0) {
            continue;
        }
        const hit = VERIFICATION_PATTERNS.find(({ pattern }) => pattern.test(head));
        if (hit !== undefined) {
            return {
                isVerification: true,
                tool: hit.tool,
                matchedComponent: segment.trim(),
            };
        }
    }
    return noMatch;
}
157
+ /**
158
+ * Phrases the agent uses to dispute ownership of a verification
159
+ * failure. When ANY of these phrases appears in the final assistant
160
+ * text AND the agent mutated files in the same module as a failing
161
+ * test, the outcome's `regressionOwnershipDispute` flag is set so a
162
+ * downstream reviewer can decide whether to escalate.
163
+ *
164
+ * The list is case-insensitive at match time. Punctuation around the
165
+ * phrase is allowed because `.includes()` looks for the substring,
166
+ * not word boundaries (an agent that writes "this is a pre-existing
167
+ * test bug" still trips the flag).
168
+ */
169
+ export const REGRESSION_DISPUTE_PHRASES = [ // all lower-case; NOTE(review): the matcher is not in this module — confirm it lower-cases the text first
170
+ 'pre-existing', // three spellings of the same claim: hyphenated,
171
+ 'preexisting', // closed,
172
+ 'pre existing', // and spaced
173
+ 'not from my changes',
174
+ 'not related to my changes',
175
+ 'unrelated test failure',
176
+ 'unrelated to my changes',
177
+ 'unrelated failure',
178
+ 'not my change', // also a substring of "not my changes"
179
+ ];
180
/**
 * Tail trimmer for stderr captured in verification ledger entries.
 *
 * Returns the longest suffix of `stderr` whose UTF-8 encoding fits within
 * `maxBytes` (2 KB by default, matching the PUGI-VERIFY-GATE contract).
 * The cut is made on the encoded bytes and then moved forward to the next
 * character boundary, so the result never exceeds the budget and never
 * starts in the middle of a multi-byte character.
 *
 * @param {unknown} stderr - Captured stderr text; non-strings yield `''`.
 * @param {number} [maxBytes=2048] - Maximum UTF-8 size of the result.
 * @returns {string} `stderr` unchanged when it already fits, otherwise its
 *   tail; `''` for empty input or a budget that is zero, negative or NaN.
 */
export function tailStderr(stderr, maxBytes = 2048) {
    if (typeof stderr !== 'string' || stderr.length === 0) {
        return '';
    }
    // A zero, negative or NaN budget leaves no room for output. Without this
    // guard `slice(-0)` style arithmetic would hand back the whole string.
    if (!(maxBytes > 0)) {
        return '';
    }
    const encoded = Buffer.from(stderr, 'utf8');
    if (encoded.length <= maxBytes) {
        return stderr;
    }
    let start = encoded.length - Math.floor(maxBytes);
    // UTF-8 continuation bytes look like 0b10xxxxxx; skip them so decoding
    // begins on the first complete character inside the budget.
    while (start < encoded.length && (encoded[start] & 0xc0) === 0x80) {
        start += 1;
    }
    return encoded.toString('utf8', start);
}
195
+ //# sourceMappingURL=verification-patterns.js.map
@@ -0,0 +1,83 @@
1
+ /**
2
+ * Append-only TSV ledger for pugi-eval-v1 results.
3
+ *
4
+ * Pattern source: backlog #110 (Karpathy autoresearch). The ledger
5
+ * is git-tracked, never rewritten in place, never edited by hand. The
6
+ * column set is frozen as of schema v1; new columns must land in
7
+ * eval-v2 with a separate ledger file.
8
+ *
9
+ * Columns (tab-separated, in this exact order):
10
+ *
11
+ * timestamp UTC ISO 8601
12
+ * git_sha short sha of repo HEAD at run start
13
+ * task_id frozen task id (`NN-slug`)
14
+ * model engine model identifier or `(default)`
15
+ * status pass | fail | budget_exhausted | timeout | engine_error
16
+ * pugi_score per-task score, 2 decimal places
17
+ * tokens tokens reported by engine
18
+ * turns engine turn count
19
+ * tool_calls tool calls executed
20
+ * wall_ms wall-clock duration (ms)
21
+ * exit_code CLI subprocess exit code
22
+ * verifications `<passed>/<total>`
23
+ *
24
+ * Runs of TAB, CR, or LF characters in string fields are collapsed to
25
+ * a single space before write so each row stays one TSV record.
26
+ */
27
+ import { appendFileSync, existsSync, mkdirSync, writeFileSync } from 'node:fs';
28
+ import { dirname } from 'node:path';
29
+ export const LEDGER_COLUMNS = [ // order here is the on-disk column order
30
+ 'timestamp', // UTC ISO 8601
31
+ 'git_sha', // short sha of repo HEAD at run start
32
+ 'task_id', // frozen task id (`NN-slug`)
33
+ 'model', // engine model identifier or `(default)`
34
+ 'status', // pass | fail | budget_exhausted | timeout | engine_error
35
+ 'pugi_score', // per-task score, 2 decimal places
36
+ 'tokens', // tokens reported by engine
37
+ 'turns', // engine turn count
38
+ 'tool_calls', // tool calls executed
39
+ 'wall_ms', // wall-clock duration (ms)
40
+ 'exit_code', // CLI subprocess exit code
41
+ 'verifications', // `<passed>/<total>`
42
+ ];
43
+ export const LEDGER_HEADER = LEDGER_COLUMNS.join('\t'); // header row text, without a trailing newline
44
/**
 * Make a string safe to embed in one TSV cell: every run of TAB, CR,
 * or LF characters is replaced by a single space.
 *
 * @param {string} s - Cell text.
 * @returns {string} The text with no TSV-breaking characters.
 */
function safe(s) {
  const separatorRun = /[\t\r\n]+/g;
  return s.replace(separatorRun, ' ');
}
47
/**
 * Render one eval result as a single TSV record (no trailing newline).
 *
 * Cells are emitted in `LEDGER_COLUMNS` order. String cells pass
 * through `safe()` so an embedded TAB or newline cannot split the
 * record. A missing model is written as `(default)`, the placeholder
 * the ledger column contract documents, instead of throwing.
 *
 * @param {object} row
 * @param {string} row.timestamp - Run timestamp.
 * @param {string} row.gitSha - Repo HEAD sha at run start.
 * @param {string} [row.model] - Engine model identifier, if one was set.
 * @param {object} row.result - Task result: taskId, status, pugiScore,
 *   tokensUsed, turnsUsed, toolCallCount, wallClockMs, exitCode, and a
 *   `verifications` array of `{ passed }` entries.
 * @returns {string} Twelve tab-separated cells.
 */
export function formatLedgerLine(row) {
  const { timestamp, gitSha, model, result } = row;
  const total = result.verifications.length;
  const passed = result.verifications.reduce(
    (count, check) => (check.passed ? count + 1 : count),
    0,
  );
  const cells = [
    safe(timestamp),
    safe(gitSha),
    safe(result.taskId),
    // `model` is optional upstream; fall back to the documented marker.
    safe(model ?? '(default)'),
    safe(result.status),
    result.pugiScore.toFixed(2),
    String(result.tokensUsed),
    String(result.turnsUsed),
    String(result.toolCallCount),
    String(result.wallClockMs),
    String(result.exitCode),
    `${passed}/${total}`,
  ];
  return cells.join('\t');
}
67
/**
 * Append a single row. Creates the file with the header if it does
 * not yet exist; otherwise appends a single line. Never rewrites
 * existing content.
 *
 * The header is written with an exclusive-create flag (`wx`) rather
 * than an exists-check followed by a plain write. With the check, two
 * processes starting against a missing ledger could both decide to
 * create it, and the slower one would truncate a row the faster one
 * had already appended.
 *
 * @param {string} ledgerPath - Ledger file path; parent directories
 *   are created as needed.
 * @param {object} row - Row accepted by `formatLedgerLine`.
 * @throws Any filesystem error other than the ledger already existing.
 */
export function appendLedgerRow(ledgerPath, row) {
  mkdirSync(dirname(ledgerPath), { recursive: true });
  try {
    writeFileSync(ledgerPath, `${LEDGER_HEADER}\n`, { mode: 0o644, flag: 'wx' });
  }
  catch (err) {
    // EEXIST is the normal "ledger already has a header" path.
    if (err?.code !== 'EEXIST') {
      throw err;
    }
  }
  appendFileSync(ledgerPath, `${formatLedgerLine(row)}\n`);
}
79
/**
 * Append several rows to the ledger, one `appendLedgerRow` call per
 * row, in iteration order.
 *
 * @param {string} ledgerPath - Ledger file path.
 * @param {Iterable<object>} rows - Rows to append.
 */
export function appendLedgerRows(ledgerPath, rows) {
  for (const entry of rows) {
    appendLedgerRow(ledgerPath, entry);
  }
}
83
+ //# sourceMappingURL=ledger.js.map
@@ -0,0 +1,280 @@
1
+ /**
2
+ * Runner for pugi-eval-v1.
3
+ *
4
+ * Per task: spawn a fresh tmp workspace, copy fixture files, invoke
5
+ * the `pugi <command>` subprocess with the brief, capture stdout +
6
+ * exit code + wall-clock, then run the verification checks.
7
+ *
8
+ * The runner is deliberately subprocess-based - mirrors the smoke
9
+ * harness pattern in `core/smoke/headless-driver.ts`. Validating the
10
+ * AS-PUBLISHED CLI is the whole point of a benchmark; bypassing
11
+ * `bin/run.js` would let us miss whole categories of regression
12
+ * (loader cost, env propagation, exit-code handling).
13
+ *
14
+ * Tests inject a `runner` callback that returns a fake `RunCapture`
15
+ * so the meta-spec can exercise scoring + ledger without a real
16
+ * engine.
17
+ */
18
+ import { spawn } from 'node:child_process';
19
+ import { existsSync, mkdirSync, mkdtempSync, rmSync, writeFileSync, readdirSync, statSync } from 'node:fs';
20
+ import { tmpdir } from 'node:os';
21
+ import { dirname, join, resolve } from 'node:path';
22
+ import { computePugiScore } from './scoring.js';
23
+ import { runVerifications } from './verifier.js';
24
/**
 * Default executor: spawn
 * `pugi <command> --print --json --intensity <i> --max-turns <n> [--model <m>] "<brief>"`
 * inside the workspace. The `--print` flag forces non-interactive
 * mode; `--json` produces the structured envelope the runner parses
 * for `tokensUsed` etc.
 *
 * Lifecycle details:
 * - stdout/stderr are decoded by the stream (`setEncoding`), so a
 *   multi-byte character split across two chunks is not corrupted.
 * - The promise settles on `close` (stdio fully drained), not `exit`,
 *   because the JSON envelope is the LAST thing written and `exit` can
 *   fire while it is still buffered in the pipe. If a descendant keeps
 *   the pipe open, the runner stops waiting 1 s after `exit`.
 * - On timeout the child gets SIGTERM, then SIGKILL 5 s later. The
 *   escalation timer is cleared once the child is gone.
 * - An abort signal that is already aborted terminates the child
 *   immediately instead of being silently ignored.
 *
 * @param {object} input
 * @param {object} input.spec - Task spec: command, intensity, maxTurns,
 *   brief, timeoutMs.
 * @param {string} input.pugiBin - Executable to spawn.
 * @param {string} input.workspaceRoot - Working directory for the child.
 * @param {object} input.env - Environment for the child.
 * @param {string} [input.model] - Optional `--model` value.
 * @param {AbortSignal} [input.signal] - Optional cancellation signal.
 * @returns {Promise<object>} Capture with stdout, stderr, exitCode
 *   (-1 when the child has no numeric code or failed to spawn),
 *   wallClockMs, tokensUsed, turnsUsed, toolCallCount, timedOut,
 *   budgetExhausted, engineError.
 */
export const subprocessRunner = async (input) => {
  const { spec } = input;
  const args = [
    spec.command,
    '--print',
    '--json',
    '--intensity',
    spec.intensity,
    '--max-turns',
    String(spec.maxTurns),
  ];
  if (input.model) {
    args.push('--model', input.model);
  }
  args.push(spec.brief);
  const child = spawn(input.pugiBin, args, {
    cwd: input.workspaceRoot,
    env: input.env,
    stdio: ['ignore', 'pipe', 'pipe'],
  });
  const start = Date.now();
  let stdout = '';
  let stderr = '';
  child.stdout?.setEncoding('utf8');
  child.stderr?.setEncoding('utf8');
  child.stdout?.on('data', (chunk) => {
    stdout += chunk;
  });
  child.stderr?.on('data', (chunk) => {
    stderr += chunk;
  });
  // Best-effort signal delivery; the child may already be gone.
  const signalChild = (name) => {
    try {
      child.kill(name);
    }
    catch {
      /* noop */
    }
  };
  let timedOut = false;
  let forceKillTimer;
  const timer = setTimeout(() => {
    timedOut = true;
    signalChild('SIGTERM');
    forceKillTimer = setTimeout(() => signalChild('SIGKILL'), 5_000);
  }, spec.timeoutMs);
  const onAbort = () => signalChild('SIGTERM');
  if (input.signal?.aborted) {
    // The 'abort' event has already fired; a listener would never run.
    onAbort();
  }
  else {
    input.signal?.addEventListener('abort', onAbort);
  }
  let exitedAt;
  let drainTimer;
  const exitCode = await new Promise((resolveExit) => {
    child.on('exit', (code) => {
      exitedAt = Date.now();
      // Bounded wait for 'close' so an inherited pipe cannot hang us.
      drainTimer = setTimeout(() => resolveExit(code ?? -1), 1_000);
    });
    child.on('close', (code) => resolveExit(code ?? -1));
    child.on('error', () => resolveExit(-1));
  });
  clearTimeout(timer);
  clearTimeout(forceKillTimer);
  clearTimeout(drainTimer);
  input.signal?.removeEventListener('abort', onAbort);
  // Wall clock is measured to process exit, not to pipe drain.
  const wallClockMs = (exitedAt ?? Date.now()) - start;
  const parsed = parseEnvelope(stdout);
  const budgetExhausted = parsed?.status === 'budget_exceeded' ||
    parsed?.status === 'budget_exhausted';
  const engineError = parsed?.status === 'engine_unavailable' ||
    parsed?.status === 'failed';
  return {
    stdout,
    stderr,
    exitCode,
    wallClockMs,
    tokensUsed: parsed?.tokensUsed ?? 0,
    turnsUsed: parsed?.turnsUsed ?? 0,
    toolCallCount: parsed?.toolCallCount ?? 0,
    timedOut,
    budgetExhausted,
    engineError,
  };
};
110
/**
 * Parse the last JSON envelope from stdout. Pugi `--json` emits one
 * JSON object per invocation; warning lines printed around it must
 * not break parsing.
 *
 * Strategy, in order:
 * 1. Parse the whole trimmed payload (common case).
 * 2. Scan lines bottom-up. For each line starting with `{`, try that
 *    line alone (single-line envelope, possibly followed by noise),
 *    then that line through the end of output (pretty-printed,
 *    multi-line envelope preceded by noise).
 *
 * Only plain objects count as an envelope: a payload that parses to a
 * scalar, `null`, or an array yields `null`.
 *
 * @param {string} stdout - Raw stdout of the CLI subprocess.
 * @returns {object|null} The parsed envelope, or `null` if none found.
 */
function parseEnvelope(stdout) {
  const trimmed = stdout.trim();
  if (trimmed === '') {
    return null;
  }
  const parseObject = (text) => {
    try {
      const value = JSON.parse(text);
      const isPlainObject = value !== null && typeof value === 'object' && !Array.isArray(value);
      return isPlainObject ? value : null;
    }
    catch {
      return null;
    }
  };
  const whole = parseObject(trimmed);
  if (whole !== null) {
    return whole;
  }
  const lines = trimmed.split(/\r?\n/);
  for (let i = lines.length - 1; i >= 0; i -= 1) {
    const line = lines[i].trim();
    if (!line.startsWith('{')) {
      continue;
    }
    const envelope = parseObject(line) ?? parseObject(lines.slice(i).join('\n'));
    if (envelope !== null) {
      return envelope;
    }
  }
  return null;
}
140
/**
 * Recursively collect workspace-relative file paths into `out`.
 *
 * Entries named `.pugi`, `node_modules`, or `.git` are skipped at
 * every level. Relative paths always use `/` as the separator.
 * Directories that cannot be listed and entries that cannot be
 * stat'ed are silently ignored.
 *
 * @param {string} root - Directory to list.
 * @param {string} prefix - Relative path of `root` ('' at the top).
 * @param {string[]} out - Accumulator that receives the file paths.
 */
function walkFiles(root, prefix, out) {
  const ignored = new Set(['.pugi', 'node_modules', '.git']);
  let names;
  try {
    names = readdirSync(root);
  }
  catch {
    return;
  }
  names
    .filter((name) => !ignored.has(name))
    .forEach((name) => {
      const absolute = join(root, name);
      const relative = prefix === '' ? name : `${prefix}/${name}`;
      let stats;
      try {
        stats = statSync(absolute);
      }
      catch {
        return;
      }
      if (stats.isDirectory()) {
        walkFiles(absolute, relative, out);
        return;
      }
      if (stats.isFile()) {
        out.push(relative);
      }
    });
}
169
/**
 * Map a run capture to its ledger status.
 *
 * Precedence, first match wins: timeout, budget_exhausted,
 * engine_error, non-zero exit code (`fail`). Only a clean run is then
 * graded on its verification checks.
 *
 * @param {object} capture - Run capture (timedOut, budgetExhausted,
 *   engineError, exitCode).
 * @param {boolean} verificationsAllPassed - Whether every check passed.
 * @returns {string} The status label.
 */
function classifyStatus(capture, verificationsAllPassed) {
  const precedence = [
    [capture.timedOut, 'timeout'],
    [capture.budgetExhausted, 'budget_exhausted'],
    [capture.engineError, 'engine_error'],
    [capture.exitCode !== 0, 'fail'],
  ];
  const hit = precedence.find(([triggered]) => triggered);
  if (hit) {
    return hit[1];
  }
  return verificationsAllPassed ? 'pass' : 'fail';
}
180
/**
 * Create a fresh temporary workspace for one task and materialize the
 * spec's fixture files inside it.
 *
 * Fixture paths are confined to the workspace. A path containing a
 * `..` segment is rejected up front, and every resolved path must
 * also land strictly inside the workspace root. An absolute fixture
 * path such as `/etc/hosts` has no `..` segment, yet `resolve()`
 * would otherwise write it outside the sandbox. If any fixture fails,
 * the half-built workspace is removed before the error propagates.
 *
 * @param {object} spec - Task spec; uses `id` and optional `fixture`
 *   (a map of relative path to file body).
 * @returns {{ root: string, cleanup: () => void }} Workspace root and
 *   a best-effort recursive remover.
 * @throws {Error} When a fixture path contains `..` or does not
 *   resolve to a file inside the workspace, or on filesystem errors.
 */
export function prepareWorkspace(spec) {
  const root = mkdtempSync(join(tmpdir(), `pugi-eval-v1-${spec.id}-`));
  const cleanup = () => {
    try {
      rmSync(root, { recursive: true, force: true });
    }
    catch {
      /* swallow */
    }
  };
  try {
    if (spec.fixture) {
      const rootAbs = resolve(root);
      // Root plus a trailing separator, so a sibling directory that
      // merely shares the root's name as a prefix cannot match.
      const insidePrefix = join(rootAbs, '/');
      for (const [relPath, body] of Object.entries(spec.fixture)) {
        if (relPath.split(/[\\/]/).includes('..')) {
          throw new Error(`eval-v1 task ${spec.id}: fixture path ${relPath} contains ..`);
        }
        const abs = resolve(rootAbs, relPath);
        if (!abs.startsWith(insidePrefix)) {
          throw new Error(`eval-v1 task ${spec.id}: fixture path ${relPath} does not resolve to a file inside the workspace`);
        }
        mkdirSync(dirname(abs), { recursive: true });
        writeFileSync(abs, body, { mode: 0o644 });
      }
    }
  }
  catch (err) {
    // The caller never receives `cleanup`, so remove the dir here.
    cleanup();
    throw err;
  }
  return { root, cleanup };
}
202
/**
 * Turn a raw run capture into a scored task result.
 *
 * Steps: take the final text from the JSON envelope (falling back to
 * raw stdout), run the spec's verification checks against the
 * workspace, classify the status, list the workspace files in sorted
 * order, then attach the pugi_score.
 *
 * @param {object} spec - Task spec (`id`, `verification`, plus the
 *   fields `computePugiScore` reads).
 * @param {string} workspaceRoot - Workspace the task ran in.
 * @param {object} capture - Run capture from the runner.
 * @returns {Promise<object>} Result with taskId, status, usage
 *   counters, verifications, finalText, filesWritten, pugiScore.
 */
export async function runTaskWithCapture(spec, workspaceRoot, capture) {
  const envelope = parseEnvelope(capture.stdout);
  const finalText = envelope?.finalText ?? capture.stdout;
  const verifications = runVerifications(spec.verification, { workspaceRoot, finalText });
  const everyCheckPassed = !verifications.some((check) => !check.passed);
  const status = classifyStatus(capture, everyCheckPassed);
  const filesWritten = [];
  walkFiles(workspaceRoot, '', filesWritten);
  filesWritten.sort();
  const { tokensUsed, toolCallCount, turnsUsed, wallClockMs, exitCode } = capture;
  const unscored = {
    taskId: spec.id,
    status,
    tokensUsed,
    toolCallCount,
    turnsUsed,
    wallClockMs,
    exitCode,
    verifications,
    finalText,
    filesWritten,
  };
  const pugiScore = computePugiScore(unscored, spec);
  return { ...unscored, pugiScore };
}
229
/**
 * Run a single task end to end: prepare a workspace, hand it to the
 * injected runner, score the capture, and always remove the
 * workspace afterwards, including when the runner or scoring throws.
 *
 * `model` and `signal` are forwarded to the runner only when they are
 * defined, so the request never carries explicit `undefined` keys.
 *
 * @param {object} spec - Task spec.
 * @param {object} options - `runner`, `pugiBin`, `env`, and optional
 *   `model` / `signal`.
 * @returns {Promise<object>} The scored task result.
 */
export async function runOneTask(spec, options) {
  const workspace = prepareWorkspace(spec);
  try {
    const modelPart = options.model === undefined ? {} : { model: options.model };
    const signalPart = options.signal === undefined ? {} : { signal: options.signal };
    const request = {
      spec,
      workspaceRoot: workspace.root,
      pugiBin: options.pugiBin,
      ...modelPart,
      env: options.env,
      ...signalPart,
    };
    const capture = await options.runner(request);
    return await runTaskWithCapture(spec, workspace.root, capture);
  }
  finally {
    workspace.cleanup();
  }
}
246
/**
 * Run every selected task sequentially and collect the results.
 *
 * Defaults: `subprocessRunner` when no runner is injected, and
 * `process.env` when no env is given. `options.only`, when set,
 * restricts the run to the listed task ids; matching nothing is an
 * error rather than a silent empty run. `onTaskStart` / `onTaskFinish`
 * are invoked around each task when provided.
 *
 * `options.signal`, when provided, is forwarded to every task so the
 * cancellation support in `runOneTask` and the runner is reachable
 * from the harness entry point. Previously it was dropped here.
 *
 * @param {object} input
 * @param {object[]} input.specs - Task specs, run in order.
 * @param {object} input.options - `pugiBin`, plus optional `runner`,
 *   `env`, `only`, `model`, `signal`, `onTaskStart`, `onTaskFinish`.
 * @returns {Promise<object[]>} One result per executed task.
 * @throws {Error} When the `only` filter matches zero tasks, or when
 *   the OS temp directory no longer exists after the run.
 */
export async function runHarness(input) {
  const { specs, options } = input;
  const runner = options.runner ?? subprocessRunner;
  const env = options.env ?? process.env;
  const onlyFilter = options.only ? new Set(options.only) : null;
  const out = [];
  for (const spec of specs) {
    if (onlyFilter && !onlyFilter.has(spec.id)) {
      continue;
    }
    options.onTaskStart?.(spec);
    const runOpts = {
      pugiBin: options.pugiBin,
      env,
      runner,
    };
    if (options.model !== undefined) {
      runOpts.model = options.model;
    }
    if (options.signal !== undefined) {
      runOpts.signal = options.signal;
    }
    const result = await runOneTask(spec, runOpts);
    out.push(result);
    options.onTaskFinish?.(result);
  }
  if (onlyFilter && out.length === 0) {
    throw new Error(`eval-v1: --task filter matched zero tasks (asked for ${[...onlyFilter].join(', ')})`);
  }
  // Sanity check only: this confirms the temp directory itself still
  // exists after the per-task cleanups. It does not inspect
  // individual workspaces.
  if (!existsSync(tmpdir())) {
    // pathological - tmpdir disappeared. Surface so CI fails loud.
    throw new Error('eval-v1: tmpdir no longer exists post-run');
  }
  return out;
}
280
+ //# sourceMappingURL=runner.js.map
@@ -0,0 +1,68 @@
1
+ /**
2
+ * pugi_score scoring formula for eval-v1.
3
+ *
4
+ * Per-task score (0..150, higher = better):
5
+ *
6
+ * pugi_score = pass_rate * 100 // 0..100
7
+ * + verification_completeness * 50 // 0.. 50
8
+ * - (tokens_used / max_tokens) * 30 // 0..-30
9
+ * - (wall_clock_ms / timeout_ms) * 20 // 0..-20
10
+ *
11
+ * Where:
12
+ * - `pass_rate` is 1.0 if status is `pass`, 0 otherwise.
13
+ * - `verification_completeness` is `passed_checks / total_checks`.
14
+ * - Token and wall-clock penalties are clamped to [0, 1] so a run
15
+ * that exceeds the budget cap caps the penalty (avoid runaway
16
+ * negative scores that would skew the aggregate).
17
+ *
18
+ * Aggregate is the arithmetic mean across all per-task scores. Mean
19
+ * is defensible because every task contributes equally to the
20
+ * benchmark (we are not weighting by difficulty - eval-v2 may add
21
+ * weights once we have a baseline year of data).
22
+ */
23
+ const PASS_WEIGHT = 100; // multiplies pass_rate (1 when status is `pass`, else 0)
24
+ const VERIFICATION_WEIGHT = 50; // multiplies passed_checks / total_checks
25
+ const TOKEN_PENALTY = 30; // subtracted in full once the token ratio reaches 1
26
+ const WALL_PENALTY = 20; // subtracted in full once the wall-clock ratio reaches 1
27
/**
 * Clamp a ratio into [0, 1]. Any non-finite input (NaN, +/-Infinity,
 * e.g. from a division by zero) counts as the full penalty, 1.
 *
 * @param {number} n - Ratio to clamp.
 * @returns {number} `n` limited to the range [0, 1].
 */
function clamp01(n) {
  if (!Number.isFinite(n)) {
    return 1;
  }
  const lowerBounded = n < 0 ? 0 : n;
  return lowerBounded > 1 ? 1 : lowerBounded;
}
36
/**
 * Compute the per-task pugi_score. Called by the runner before
 * appending the result to the ledger.
 *
 * The score rewards a passing status and the share of verification
 * checks that passed, then subtracts the token and wall-clock
 * penalties. A task with no checks has a completeness of 0.
 *
 * @param {object} result - Unscored result: status, verifications,
 *   tokensUsed, wallClockMs.
 * @param {object} spec - Task spec: maxTokens, timeoutMs.
 * @returns {number} Score in [0, 150], rounded to 2 decimal places.
 */
export function computePugiScore(result, spec) {
  const { status, verifications, tokensUsed, wallClockMs } = result;
  let passedChecks = 0;
  for (const check of verifications) {
    if (check.passed) {
      passedChecks += 1;
    }
  }
  const completeness = passedChecks / Math.max(1, verifications.length);
  const tokenPenalty = clamp01(tokensUsed / spec.maxTokens);
  const wallPenalty = clamp01(wallClockMs / spec.timeoutMs);
  const earned = (status === 'pass' ? 1 : 0) * PASS_WEIGHT + completeness * VERIFICATION_WEIGHT;
  const score = earned - tokenPenalty * TOKEN_PENALTY - wallPenalty * WALL_PENALTY;
  // Clamp to [0, 150]. Each penalty is already limited to [0, 1], but
  // a failed run with low completeness can still go negative once the
  // penalties are subtracted; the final clamp keeps the documented
  // range intact.
  const bounded = Math.max(0, Math.min(150, score));
  return Math.round(bounded * 100) / 100;
}
59
/**
 * Aggregate score across an entire harness run: the arithmetic mean
 * of the per-task `pugiScore` values, rounded to 2 decimal places.
 * An empty run scores 0.
 *
 * @param {Array<{ pugiScore: number }>} results - Scored task results.
 * @returns {number} Mean score, or 0 when `results` is empty.
 */
export function aggregateScore(results) {
  const count = results.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  for (const { pugiScore } of results) {
    total += pugiScore;
  }
  const mean = total / count;
  return Math.round(mean * 100) / 100;
}
68
+ //# sourceMappingURL=scoring.js.map