@pugi/cli 0.1.0-beta.10 → 0.1.0-beta.101
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +132 -0
- package/LICENSE +1 -1
- package/README.md +55 -11
- package/assets/pugi-prozr2-mascot.ansi +9 -0
- package/bin/run.js +33 -1
- package/dist/commands/deploy.js +40 -40
- package/dist/commands/flatten.js +191 -0
- package/dist/commands/jobs-watch.js +201 -0
- package/dist/commands/jobs.js +42 -27
- package/dist/commands/retro.js +210 -0
- package/dist/commands/smoke.js +133 -0
- package/dist/core/agent-progress/cleanup.js +134 -0
- package/dist/core/agent-progress/schema.js +144 -0
- package/dist/core/agent-progress/writer.js +101 -0
- package/dist/core/agents/adaptive-router.js +330 -0
- package/dist/core/agents/query-decomposer.js +297 -0
- package/dist/core/agents/registry.js +3 -3
- package/dist/core/approvals/shortcut-resolver.js +98 -0
- package/dist/core/artifact-chain/dispatcher.js +148 -0
- package/dist/core/artifact-chain/exporter.js +164 -0
- package/dist/core/artifact-chain/state.js +243 -0
- package/dist/core/artifact-chain/steps.js +169 -0
- package/dist/core/ask-user/question.js +92 -0
- package/dist/core/audit/audit-trail.js +275 -0
- package/dist/core/auth/ensure-authenticated.js +129 -0
- package/dist/core/auth/env-provider.js +238 -0
- package/dist/core/auto-open-browser.js +4 -4
- package/dist/core/auto-update/channels.js +122 -0
- package/dist/core/auto-update/checker.js +241 -0
- package/dist/core/auto-update/state.js +235 -0
- package/dist/core/bare-mode/index.js +107 -0
- package/dist/core/bash/redirect.js +281 -0
- package/dist/core/bash-classifier.js +436 -40
- package/dist/core/checkpoint/resumer.js +149 -0
- package/dist/core/checkpoint/rewinder.js +291 -0
- package/dist/core/checkpoints/shadow-git.js +670 -0
- package/dist/core/citations/parser.js +109 -0
- package/dist/core/classifier/yolo-classifier.js +88 -0
- package/dist/core/codegraph/db.js +506 -0
- package/dist/core/codegraph/decision-store.js +248 -0
- package/dist/core/codegraph/detect-repo.js +459 -0
- package/dist/core/codegraph/install.js +134 -0
- package/dist/core/codegraph/offer-hook.js +220 -0
- package/dist/core/codegraph/parser.js +598 -0
- package/dist/core/codegraph/queries/go.scm +57 -0
- package/dist/core/codegraph/queries/javascript.scm +56 -0
- package/dist/core/codegraph/queries/python.scm +55 -0
- package/dist/core/codegraph/queries/rust.scm +63 -0
- package/dist/core/codegraph/queries/typescript.scm +91 -0
- package/dist/core/codegraph/reindex.js +218 -0
- package/dist/core/codegraph/resolve-edges.js +107 -0
- package/dist/core/codegraph/types.js +34 -0
- package/dist/core/codegraph/watcher.js +440 -0
- package/dist/core/compact/auto-trigger.js +96 -0
- package/dist/core/compact/buffer-rewriter.js +115 -0
- package/dist/core/compact/summarizer.js +208 -0
- package/dist/core/compact/token-counter.js +108 -0
- package/dist/core/consensus/anvil-fanout.js +25 -25
- package/dist/core/consensus/diff-capture.js +121 -12
- package/dist/core/consensus/rubric.js +21 -21
- package/dist/core/context/builder.js +6 -6
- package/dist/core/context/compaction-events.js +8 -8
- package/dist/core/context/compaction.js +31 -31
- package/dist/core/context/index.js +15 -8
- package/dist/core/context/invariants.js +51 -51
- package/dist/core/context/markdown-loader.js +28 -10
- package/dist/core/context/markdown-traverse.js +255 -0
- package/dist/core/context/pugiignore.js +41 -41
- package/dist/core/context/repo-skeleton.js +37 -37
- package/dist/core/context/tool-eviction.js +55 -0
- package/dist/core/context/watcher.js +32 -32
- package/dist/core/context/working-set.js +23 -23
- package/dist/core/coordinator/agent-tools.js +77 -0
- package/dist/core/coordinator/agent-toolset.js +65 -0
- package/dist/core/coordinator/fsm.js +73 -0
- package/dist/core/coordinator/mode-fsm.js +70 -0
- package/dist/core/cost/rate-card.js +129 -0
- package/dist/core/cost/tracker.js +221 -0
- package/dist/core/credentials.js +13 -13
- package/dist/core/cron/scheduler.js +138 -0
- package/dist/core/denial-tracking/index.js +8 -0
- package/dist/core/denial-tracking/state.js +264 -0
- package/dist/core/diagnostics/probe-runner.js +93 -0
- package/dist/core/diagnostics/probes/api.js +46 -0
- package/dist/core/diagnostics/probes/auth.js +93 -0
- package/dist/core/diagnostics/probes/bare-mode.js +42 -0
- package/dist/core/diagnostics/probes/cli-version.js +127 -0
- package/dist/core/diagnostics/probes/config.js +72 -0
- package/dist/core/diagnostics/probes/denial-tracking.js +57 -0
- package/dist/core/diagnostics/probes/disk.js +81 -0
- package/dist/core/diagnostics/probes/engine-live.js +46 -0
- package/dist/core/diagnostics/probes/git.js +65 -0
- package/dist/core/diagnostics/probes/hooks.js +118 -0
- package/dist/core/diagnostics/probes/mcp.js +75 -0
- package/dist/core/diagnostics/probes/node.js +59 -0
- package/dist/core/diagnostics/probes/pnpm.js +36 -0
- package/dist/core/diagnostics/probes/pugi-md.js +89 -0
- package/dist/core/diagnostics/probes/sandbox.js +67 -0
- package/dist/core/diagnostics/probes/session.js +74 -0
- package/dist/core/diagnostics/probes/status-snapshot.js +488 -0
- package/dist/core/diagnostics/probes/workspace.js +63 -0
- package/dist/core/diagnostics/types.js +70 -0
- package/dist/core/dispatch/cache-cleanup.js +197 -0
- package/dist/core/dispatch/cache-handoff.js +295 -0
- package/dist/core/edits/apply-patch-layer-e.js +189 -0
- package/dist/core/edits/dispatch.js +333 -7
- package/dist/core/edits/format-detector.js +260 -0
- package/dist/core/edits/format-matrix.js +26 -0
- package/dist/core/edits/fuzzy-ladder.js +650 -0
- package/dist/core/edits/index.js +5 -1
- package/dist/core/edits/journal.js +199 -0
- package/dist/core/edits/layer-a-apply.js +15 -15
- package/dist/core/edits/layer-a-fuzzy-apply.js +198 -0
- package/dist/core/edits/layer-b-apply.js +9 -9
- package/dist/core/edits/layer-c-apply.js +6 -6
- package/dist/core/edits/layer-d-ast.js +557 -14
- package/dist/core/edits/marker-parser.js +12 -12
- package/dist/core/edits/security-gate.js +27 -27
- package/dist/core/edits/verify-hook.js +273 -0
- package/dist/core/edits/worktree.js +29 -29
- package/dist/core/engine/anvil-client.js +214 -26
- package/dist/core/engine/auto-compact.js +247 -0
- package/dist/core/engine/budgets.js +220 -0
- package/dist/core/engine/compact-llm-summarizer.js +124 -0
- package/dist/core/engine/context-prefix.js +155 -0
- package/dist/core/engine/index.js +1 -1
- package/dist/core/engine/intensity.js +163 -0
- package/dist/core/engine/intent.js +260 -0
- package/dist/core/engine/native-pugi.js +1559 -227
- package/dist/core/engine/prompts.js +219 -19
- package/dist/core/engine/strip-internal-fields.js +124 -0
- package/dist/core/engine/tool-bridge.js +1887 -59
- package/dist/core/engine/verification-patterns.js +195 -0
- package/dist/core/eval/v1/ledger.js +83 -0
- package/dist/core/eval/v1/runner.js +280 -0
- package/dist/core/eval/v1/scoring.js +68 -0
- package/dist/core/eval/v1/task-loader.js +191 -0
- package/dist/core/eval/v1/types.js +14 -0
- package/dist/core/eval/v1/verifier.js +176 -0
- package/dist/core/eval/v1/yaml-parser.js +250 -0
- package/dist/core/evaluation/golden-dataset.js +293 -0
- package/dist/core/feedback/queue.js +177 -0
- package/dist/core/feedback/submitter.js +145 -0
- package/dist/core/file-cache.js +113 -1
- package/dist/core/flatten/flatten-repo.js +439 -0
- package/dist/core/format/osc8-link.js +28 -0
- package/dist/core/hook-chains.js +392 -0
- package/dist/core/hooks/citation-verify-hook.js +138 -0
- package/dist/core/hooks/citation-verify.js +112 -0
- package/dist/core/hooks/events.js +46 -0
- package/dist/core/hooks/index.js +15 -0
- package/dist/core/hooks/registry.js +216 -0
- package/dist/core/hooks/runner.js +236 -0
- package/dist/core/hooks/v2/event-emitter.js +115 -0
- package/dist/core/hooks/v2/executor.js +282 -0
- package/dist/core/hooks/v2/index.js +25 -0
- package/dist/core/hooks/v2/lifecycle.js +104 -0
- package/dist/core/hooks/v2/loader.js +216 -0
- package/dist/core/hooks/v2/matcher.js +125 -0
- package/dist/core/hooks/v2/trust.js +143 -0
- package/dist/core/hooks/v2/types.js +86 -0
- package/dist/core/hooks/worktree-events.js +158 -0
- package/dist/core/image/renderer.js +71 -0
- package/dist/core/init/detector.js +582 -0
- package/dist/core/init/template-renderer.js +242 -0
- package/dist/core/jobs/registry.js +18 -18
- package/dist/core/ledger/results-tsv.js +142 -0
- package/dist/core/log-discipline/stdout-redirect.js +51 -0
- package/dist/core/lsp/cache.js +105 -0
- package/dist/core/lsp/client.js +551 -41
- package/dist/core/lsp/language-detect.js +66 -0
- package/dist/core/lsp/post-edit-diagnostics.js +171 -0
- package/dist/core/lsp/server-detect.js +173 -0
- package/dist/core/lsp/symbol-cache.js +162 -0
- package/dist/core/lsp/symbol-tools.js +664 -0
- package/dist/core/mcp/client.js +97 -28
- package/dist/core/mcp/http-server.js +553 -0
- package/dist/core/mcp/orchestrator-config.js +192 -0
- package/dist/core/mcp/orchestrator-tools.js +806 -0
- package/dist/core/mcp/permission.js +190 -0
- package/dist/core/mcp/registry.js +39 -17
- package/dist/core/mcp/server-tools.js +219 -0
- package/dist/core/mcp/server.js +397 -0
- package/dist/core/mcp/trust.js +10 -10
- package/dist/core/memory/dual-write.js +416 -0
- package/dist/core/memory/passive-extract.js +130 -0
- package/dist/core/memory/phase1-kinds.js +20 -0
- package/dist/core/memory/secret-scanner.js +304 -0
- package/dist/core/memory-sync/queue.js +170 -0
- package/dist/core/metrics/extract.js +113 -0
- package/dist/core/modes/roo-modes.js +68 -0
- package/dist/core/notes/notes-paths.js +113 -0
- package/dist/core/notes/notes-recorder.js +140 -0
- package/dist/core/notes/notes-writer.js +53 -0
- package/dist/core/notes/renderers.js +0 -0
- package/dist/core/notes/slug.js +105 -0
- package/dist/core/onboarding/ensure-initialized.js +133 -0
- package/dist/core/onboarding/marker.js +111 -0
- package/dist/core/onboarding/telemetry-state.js +108 -0
- package/dist/core/output-style/presets.js +176 -0
- package/dist/core/output-style/state.js +185 -0
- package/dist/core/path-security.js +287 -5
- package/dist/core/permission.js +82 -22
- package/dist/core/permissions/auto-classifier.js +124 -0
- package/dist/core/permissions/bash-parser.js +371 -0
- package/dist/core/permissions/circuit-breaker.js +83 -0
- package/dist/core/permissions/constrained-edit.js +91 -0
- package/dist/core/permissions/gate.js +278 -0
- package/dist/core/permissions/index.js +20 -0
- package/dist/core/permissions/mode.js +174 -0
- package/dist/core/permissions/network-egress.js +137 -0
- package/dist/core/permissions/state.js +241 -0
- package/dist/core/permissions/tool-class.js +107 -0
- package/dist/core/plan-mode/ui-state.js +51 -0
- package/dist/core/plans/plan-artifact.js +721 -0
- package/dist/core/policy-limits/etag-store.js +122 -0
- package/dist/core/prd-check/parser.js +215 -0
- package/dist/core/prd-check/reporter.js +127 -0
- package/dist/core/prd-check/session-review.js +557 -0
- package/dist/core/prd-check/verifiers.js +223 -0
- package/dist/core/prompt-cache/client-cache.js +99 -0
- package/dist/core/prompts/assembly.js +29 -0
- package/dist/core/prompts/registry.js +364 -0
- package/dist/core/pugi-gitignore.js +52 -0
- package/dist/core/pugi-md/cc-compat-rules.js +735 -0
- package/dist/core/pugi-md/context-injector.js +76 -0
- package/dist/core/pugi-md/walk-up.js +207 -0
- package/dist/core/python/uv-installer.js +270 -0
- package/dist/core/python/uv-resolver.js +83 -0
- package/dist/core/rate-limit/narrator.js +146 -0
- package/dist/core/recipes/cli-types.js +20 -0
- package/dist/core/recipes/loader.js +103 -0
- package/dist/core/recipes/runner.js +345 -0
- package/dist/core/recipes/schema.js +587 -0
- package/dist/core/release-notes/parser.js +241 -0
- package/dist/core/release-notes/state.js +116 -0
- package/dist/core/repl/ask.js +37 -37
- package/dist/core/repl/cancellation.js +26 -26
- package/dist/core/repl/cap-warning.js +4 -4
- package/dist/core/repl/clipboard-read.js +11 -11
- package/dist/core/repl/dispatch-fsm.js +12 -12
- package/dist/core/repl/engine-bridge.js +303 -0
- package/dist/core/repl/history-search.js +15 -15
- package/dist/core/repl/history.js +28 -18
- package/dist/core/repl/kill-ring.js +5 -5
- package/dist/core/repl/model-pricing.js +135 -0
- package/dist/core/repl/privacy-banner.js +22 -22
- package/dist/core/repl/session.js +2690 -229
- package/dist/core/repl/slash-commands.js +540 -41
- package/dist/core/repl/store/index.js +1 -1
- package/dist/core/repl/store/jsonl-log.js +22 -22
- package/dist/core/repl/store/lockfile.js +10 -10
- package/dist/core/repl/store/session-store.js +136 -107
- package/dist/core/repl/store/types.js +15 -15
- package/dist/core/repl/store/uuid-v7.js +12 -12
- package/dist/core/repl/tool-route.js +382 -0
- package/dist/core/repl/workspace-context.js +43 -21
- package/dist/core/repo-map/build.js +125 -0
- package/dist/core/repo-map/cache.js +185 -0
- package/dist/core/repo-map/extractor.js +254 -0
- package/dist/core/repo-map/formatter.js +145 -0
- package/dist/core/repo-map/page-rank.js +105 -0
- package/dist/core/repo-map/scanner.js +211 -0
- package/dist/core/retro/git-collector.js +251 -0
- package/dist/core/retro/health-card.js +25 -0
- package/dist/core/retro/metrics.js +342 -0
- package/dist/core/retro/narrative.js +249 -0
- package/dist/core/retro/plane-collector.js +274 -0
- package/dist/core/retro/pr-issue-link.js +65 -0
- package/dist/core/retro/types.js +16 -0
- package/dist/core/retry-budget/budget.js +284 -0
- package/dist/core/retry-budget/index.js +5 -0
- package/dist/core/retry-budget/retry-cap.js +74 -0
- package/dist/core/routing/lead-worker.js +43 -0
- package/dist/core/routing/pre-flight-estimator.js +108 -0
- package/dist/core/runs/run-tree.js +103 -0
- package/dist/core/sandboxing/adapter.js +43 -0
- package/dist/core/sandboxing/bubblewrap.js +209 -0
- package/dist/core/sandboxing/index.js +78 -0
- package/dist/core/sandboxing/none.js +19 -0
- package/dist/core/sandboxing/policy.js +97 -0
- package/dist/core/sandboxing/seatbelt.js +231 -0
- package/dist/core/security/injection-scanner.js +367 -0
- package/dist/core/security/output-filter.js +418 -0
- package/dist/core/session/env-file.js +105 -0
- package/dist/core/session/section-budgets.js +140 -0
- package/dist/core/session.js +119 -0
- package/dist/core/settings.js +402 -5
- package/dist/core/share/formatter.js +271 -0
- package/dist/core/share/redactor.js +221 -0
- package/dist/core/share/uploader.js +267 -0
- package/dist/core/skills/defaults.js +30 -30
- package/dist/core/skills/loader.js +22 -22
- package/dist/core/skills/sources.js +27 -27
- package/dist/core/smoke/headless-driver.js +174 -0
- package/dist/core/smoke/orchestrator.js +194 -0
- package/dist/core/smoke/runner.js +238 -0
- package/dist/core/smoke/scenario-parser.js +316 -0
- package/dist/core/statusline.js +99 -0
- package/dist/core/subagents/dispatcher-real.js +600 -0
- package/dist/core/subagents/dispatcher.js +146 -52
- package/dist/core/subagents/index.js +19 -6
- package/dist/core/subagents/isolation-matrix.js +213 -0
- package/dist/core/subagents/spawn.js +19 -4
- package/dist/core/telemetry/emitter.js +229 -0
- package/dist/core/telemetry/queue.js +251 -0
- package/dist/core/theme/context.js +91 -0
- package/dist/core/theme/presets.js +228 -0
- package/dist/core/theme/state.js +181 -0
- package/dist/core/todos/invariant.js +10 -0
- package/dist/core/todos/state.js +177 -0
- package/dist/core/tool-schema/compressor.js +89 -0
- package/dist/core/transport/version-interceptor.js +166 -0
- package/dist/core/trust.js +2 -2
- package/dist/core/tui/thinking-block.js +64 -0
- package/dist/core/vim/keymap.js +288 -0
- package/dist/core/vim/state.js +92 -0
- package/dist/core/watch-markers/marker-watcher.js +133 -0
- package/dist/core/worktree/include-parser.js +249 -0
- package/dist/core/worktree-manager/cleanup.js +123 -0
- package/dist/core/worktree-manager/manager.js +303 -0
- package/dist/index.js +36 -0
- package/dist/runtime/bootstrap.js +190 -0
- package/dist/runtime/cli.js +4403 -561
- package/dist/runtime/commands/agents.js +31 -31
- package/dist/runtime/commands/budget.js +5 -5
- package/dist/runtime/commands/cancel.js +231 -0
- package/dist/runtime/commands/chain.js +489 -0
- package/dist/runtime/commands/codegraph-status.js +227 -0
- package/dist/runtime/commands/compact.js +297 -0
- package/dist/runtime/commands/config.js +74 -40
- package/dist/runtime/commands/cost.js +199 -0
- package/dist/runtime/commands/delegate.js +27 -4
- package/dist/runtime/commands/dispatch.js +126 -0
- package/dist/runtime/commands/doctor.js +579 -0
- package/dist/runtime/commands/eval-v1.js +266 -0
- package/dist/runtime/commands/feedback.js +184 -0
- package/dist/runtime/commands/hooks.js +187 -0
- package/dist/runtime/commands/index-cmd.js +459 -0
- package/dist/runtime/commands/init.js +254 -0
- package/dist/runtime/commands/lsp.js +200 -38
- package/dist/runtime/commands/mcp.js +935 -0
- package/dist/runtime/commands/memory.js +582 -0
- package/dist/runtime/commands/model.js +237 -0
- package/dist/runtime/commands/onboarding.js +275 -0
- package/dist/runtime/commands/patch.js +12 -12
- package/dist/runtime/commands/permissions.js +112 -0
- package/dist/runtime/commands/plan.js +143 -0
- package/dist/runtime/commands/prd-check.js +285 -0
- package/dist/runtime/commands/privacy.js +17 -17
- package/dist/runtime/commands/recipe.js +325 -0
- package/dist/runtime/commands/redo-blob-store.js +92 -0
- package/dist/runtime/commands/redo.js +361 -0
- package/dist/runtime/commands/release-notes.js +229 -0
- package/dist/runtime/commands/repo-map.js +95 -0
- package/dist/runtime/commands/report.js +299 -0
- package/dist/runtime/commands/resume.js +118 -0
- package/dist/runtime/commands/review-consensus.js +68 -53
- package/dist/runtime/commands/rewind.js +333 -0
- package/dist/runtime/commands/roster.js +14 -14
- package/dist/runtime/commands/servers-cli.js +182 -0
- package/dist/runtime/commands/servers.js +236 -0
- package/dist/runtime/commands/sessions.js +163 -0
- package/dist/runtime/commands/share.js +316 -0
- package/dist/runtime/commands/skills.js +31 -31
- package/dist/runtime/commands/status.js +186 -0
- package/dist/runtime/commands/stickers.js +82 -0
- package/dist/runtime/commands/style.js +194 -0
- package/dist/runtime/commands/theme.js +196 -0
- package/dist/runtime/commands/undo.js +54 -22
- package/dist/runtime/commands/update.js +289 -0
- package/dist/runtime/commands/vim.js +140 -0
- package/dist/runtime/commands/worktree.js +8 -8
- package/dist/runtime/commands/worktrees.js +155 -0
- package/dist/runtime/deprecation-warning.js +69 -0
- package/dist/runtime/engine-exit-code.js +50 -0
- package/dist/runtime/headless-repl.js +195 -0
- package/dist/runtime/headless.js +548 -0
- package/dist/runtime/load-hooks-or-exit.js +71 -0
- package/dist/runtime/plan-decompose.js +22 -22
- package/dist/runtime/sigint-guard.js +272 -0
- package/dist/runtime/stream-renderer.js +195 -0
- package/dist/runtime/update-check.js +28 -28
- package/dist/runtime/version.js +65 -0
- package/dist/runtime/worktree-bootstrap.js +579 -0
- package/dist/skills/bundled/batch.js +617 -0
- package/dist/skills/bundled/index.js +45 -0
- package/dist/skills/bundled/loop.js +358 -0
- package/dist/skills/bundled/remember.js +383 -0
- package/dist/skills/bundled/simplify.js +289 -0
- package/dist/skills/bundled/skillify.js +373 -0
- package/dist/skills/bundled/stuck.js +558 -0
- package/dist/skills/bundled/verify.js +439 -0
- package/dist/testing/vcr.js +486 -0
- package/dist/tools/agent-tool.js +229 -0
- package/dist/tools/apply-patch.js +89 -28
- package/dist/tools/ask-user-question.js +337 -0
- package/dist/tools/ask-user.js +115 -0
- package/dist/tools/bash.js +811 -49
- package/dist/tools/brief.js +224 -0
- package/dist/tools/cron.js +433 -0
- package/dist/tools/enter-worktree.js +250 -0
- package/dist/tools/exit-worktree.js +147 -0
- package/dist/tools/file-tools.js +161 -44
- package/dist/tools/http-request.js +336 -0
- package/dist/tools/lsp-tools.js +377 -1
- package/dist/tools/mcp-tool.js +260 -0
- package/dist/tools/multi-edit.js +361 -0
- package/dist/tools/powershell.js +268 -0
- package/dist/tools/registry.js +120 -5
- package/dist/tools/server-tools.js +892 -0
- package/dist/tools/skill-tool.js +96 -0
- package/dist/tools/sleep.js +99 -0
- package/dist/tools/synthetic-output.js +133 -0
- package/dist/tools/tasks.js +208 -0
- package/dist/tools/todo-write.js +184 -0
- package/dist/tools/verify-plan-execution.js +295 -0
- package/dist/tools/web-fetch-injection-scanner.js +207 -0
- package/dist/tools/web-fetch.js +195 -10
- package/dist/tools/web-search.js +458 -0
- package/dist/tui/agent-progress-card.js +111 -0
- package/dist/tui/agent-tree.js +22 -1
- package/dist/tui/ask-modal.js +14 -14
- package/dist/tui/ask-user-question-chips.js +315 -0
- package/dist/tui/ask-user-question-prompt.js +203 -0
- package/dist/tui/compact-banner.js +81 -0
- package/dist/tui/conversation-pane.js +85 -11
- package/dist/tui/cost-table.js +111 -0
- package/dist/tui/device-flow.js +2 -2
- package/dist/tui/doctor-table.js +46 -0
- package/dist/tui/feedback-prompt.js +156 -0
- package/dist/tui/input-box.js +247 -32
- package/dist/tui/login-picker.js +3 -3
- package/dist/tui/markdown-render.js +6 -6
- package/dist/tui/multi-file-diff-approval.js +375 -0
- package/dist/tui/onboarding-wizard.js +240 -0
- package/dist/tui/permissions-picker.js +86 -0
- package/dist/tui/render.js +36 -1
- package/dist/tui/repl-render.js +239 -25
- package/dist/tui/repl-splash-art.js +16 -16
- package/dist/tui/repl-splash-mascot.js +48 -24
- package/dist/tui/repl-splash.js +22 -22
- package/dist/tui/repl.js +125 -45
- package/dist/tui/slash-palette.js +6 -6
- package/dist/tui/splash.js +2 -2
- package/dist/tui/status-bar.js +109 -31
- package/dist/tui/status-table.js +7 -0
- package/dist/tui/stickers-art.js +136 -0
- package/dist/tui/style-table.js +28 -0
- package/dist/tui/theme-table.js +29 -0
- package/dist/tui/thinking-spinner.js +123 -0
- package/dist/tui/tool-stream-pane.js +53 -4
- package/dist/tui/update-banner.js +27 -2
- package/dist/tui/vim-input.js +267 -0
- package/dist/tui/welcome-banner.js +107 -0
- package/dist/tui/welcome-data.js +293 -0
- package/dist/tui/workspace-context.js +2 -2
- package/package.json +29 -6
- package/test/scenarios/codegen-create-file.scenario.txt +13 -0
- package/test/scenarios/compact-force.scenario.txt +12 -0
- package/test/scenarios/identity.scenario.txt +11 -0
- package/test/scenarios/persona-handoff.scenario.txt +12 -0
- package/test/scenarios/walkback.scenario.txt +12 -0
- package/dist/core/engine/compaction-hook.js +0 -154
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PUGI-VERIFY-GATE — verification command detection.
|
|
3
|
+
*
|
|
4
|
+
* Background: Codex dogfood 2026-06-04 surfaced a P0 trust failure
|
|
5
|
+
* where the Pugi engine returned `status: done` + `exitCode: 0`
|
|
6
|
+
* even after `npm test` exited non-zero on a regression the agent
|
|
7
|
+
* itself had introduced. Root cause: no layer of the dispatch
|
|
8
|
+
* pipeline knew which bash invocations were verification commands,
|
|
9
|
+
* so the engine outcome had no way to gate the final status on
|
|
10
|
+
* test/lint/build pass.
|
|
11
|
+
*
|
|
12
|
+
* This module is the deterministic, configurable allowlist of regex
|
|
13
|
+
* patterns the engine uses to recognise verification commands at
|
|
14
|
+
* dispatch time. The detection is intentionally simple (anchored on
|
|
15
|
+
* the head of the command after sudo / env-prefix stripping) so the
|
|
16
|
+
* allowlist stays auditable. False negatives are recoverable (the
|
|
17
|
+
* agent can re-run with a recognised wrapper); false positives would
|
|
18
|
+
* silently down-grade unrelated commands and are forbidden.
|
|
19
|
+
*
|
|
20
|
+
* The pattern table is exported as `VERIFICATION_PATTERNS`; callers
|
|
21
|
+
* use `detectVerificationCommand(cmd)` for the boolean + tool-tag
|
|
22
|
+
* decision. Both surfaces are pure — no I/O, no session state, no
|
|
23
|
+
* environment reads.
|
|
24
|
+
*/
|
|
25
|
+
/**
 * Canonical verification allowlist.
 *
 * Each entry is `{ tool, pattern, category }`:
 *   - `tool`     — stable tag reported back to the caller on a match
 *   - `pattern`  — anchored regex tested against a command head
 *   - `category` — one of `test`, `lint`, `typecheck`, `build`
 *
 * Patterns are meant to be tested against the head of a single
 * shell-separated component AFTER it has been normalised by
 * `extractCommandHead` (whitespace trimmed, and the wrapper prefixes /
 * inline `KEY=value` assignments that helper handles removed).
 *
 * Entries are scanned in order and the first match wins, so keep more
 * specific patterns ahead of more general ones.
 *
 * When extending: keep the regex anchored (`^`) so a path containing
 * the tool name (`./scripts/npm.sh`) does not false-positive, and do
 * not add the `g` / `y` flags — `.test()` on such a regex is stateful.
 */
export const VERIFICATION_PATTERNS = [
    // ----- JavaScript / TypeScript ecosystem -----
    // npm test / npm run test / npm run lint / npm run typecheck / npm run build
    { tool: 'npm-test', pattern: /^npm\s+(?:run\s+)?test\b/, category: 'test' },
    { tool: 'npm-lint', pattern: /^npm\s+run\s+lint\b/, category: 'lint' },
    { tool: 'npm-typecheck', pattern: /^npm\s+run\s+typecheck\b/, category: 'typecheck' },
    { tool: 'npm-build', pattern: /^npm\s+run\s+build\b/, category: 'build' },
    // pnpm (with and without -C / --filter / -r prefixes — match the full head)
    { tool: 'pnpm-test', pattern: /^pnpm(?:\s+(?:-C\s+\S+|--filter(?:\s+|=)\S+|-r))*\s+(?:run\s+)?test\b/, category: 'test' },
    { tool: 'pnpm-lint', pattern: /^pnpm(?:\s+(?:-C\s+\S+|--filter(?:\s+|=)\S+|-r))*\s+(?:run\s+)?lint\b/, category: 'lint' },
    { tool: 'pnpm-typecheck', pattern: /^pnpm(?:\s+(?:-C\s+\S+|--filter(?:\s+|=)\S+|-r))*\s+(?:run\s+)?typecheck\b/, category: 'typecheck' },
    { tool: 'pnpm-build', pattern: /^pnpm(?:\s+(?:-C\s+\S+|--filter(?:\s+|=)\S+|-r))*\s+(?:run\s+)?build\b/, category: 'build' },
    // yarn
    { tool: 'yarn-test', pattern: /^yarn\s+(?:run\s+)?test\b/, category: 'test' },
    { tool: 'yarn-lint', pattern: /^yarn\s+(?:run\s+)?lint\b/, category: 'lint' },
    { tool: 'yarn-typecheck', pattern: /^yarn\s+(?:run\s+)?typecheck\b/, category: 'typecheck' },
    { tool: 'yarn-build', pattern: /^yarn\s+(?:run\s+)?build\b/, category: 'build' },
    // Direct test-runner invocations (npx and bare).
    { tool: 'jest', pattern: /^(?:npx\s+)?jest\b/, category: 'test' },
    { tool: 'vitest', pattern: /^(?:npx\s+)?vitest\b/, category: 'test' },
    { tool: 'mocha', pattern: /^(?:npx\s+)?mocha\b/, category: 'test' },
    // tsc only counts when it is a pure check: `--noEmit` somewhere, or no arguments at all.
    { tool: 'tsc-typecheck', pattern: /^(?:npx\s+)?tsc\b(?=.*--noEmit|\s*$)/, category: 'typecheck' },
    { tool: 'eslint', pattern: /^(?:npx\s+)?eslint\b/, category: 'lint' },
    { tool: 'node-test', pattern: /^node\s+--test\b/, category: 'test' },
    // ----- Python -----
    // The interpreter may be spelled `python`, `python3`, or `python3.X`;
    // many systems ship no bare `python` binary at all.
    { tool: 'pytest', pattern: /^(?:python(?:3(?:\.\d+)?)?\s+-m\s+)?pytest\b/, category: 'test' },
    { tool: 'python-unittest', pattern: /^python(?:3(?:\.\d+)?)?\s+-m\s+unittest\b/, category: 'test' },
    { tool: 'ruff', pattern: /^ruff\s+check\b/, category: 'lint' },
    { tool: 'mypy', pattern: /^mypy\b/, category: 'typecheck' },
    // ----- Rust -----
    { tool: 'cargo-test', pattern: /^cargo\s+test\b/, category: 'test' },
    { tool: 'cargo-check', pattern: /^cargo\s+check\b/, category: 'typecheck' },
    { tool: 'cargo-clippy', pattern: /^cargo\s+clippy\b/, category: 'lint' },
    { tool: 'cargo-build', pattern: /^cargo\s+build\b/, category: 'build' },
    // ----- Go -----
    { tool: 'go-test', pattern: /^go\s+test\b/, category: 'test' },
    { tool: 'go-vet', pattern: /^go\s+vet\b/, category: 'lint' },
    { tool: 'go-build', pattern: /^go\s+build\b/, category: 'build' },
    // ----- Elixir -----
    { tool: 'mix-test', pattern: /^mix\s+test\b/, category: 'test' },
    // ----- Ruby -----
    { tool: 'rspec', pattern: /^(?:bundle\s+exec\s+)?rspec\b/, category: 'test' },
    { tool: 'rubocop', pattern: /^(?:bundle\s+exec\s+)?rubocop\b/, category: 'lint' },
    // ----- Java / Kotlin / Gradle / Maven -----
    { tool: 'gradle-test', pattern: /^(?:\.\/)?gradlew?\s+test\b/, category: 'test' },
    { tool: 'gradle-build', pattern: /^(?:\.\/)?gradlew?\s+build\b/, category: 'build' },
    { tool: 'maven-test', pattern: /^mvn\s+test\b/, category: 'test' },
    { tool: 'maven-verify', pattern: /^mvn\s+verify\b/, category: 'test' },
    // ----- C/C++ / Make -----
    { tool: 'make-test', pattern: /^make\s+(?:test|check)\b/, category: 'test' },
    { tool: 'ctest', pattern: /^ctest\b/, category: 'test' },
];
|
|
88
|
+
// Separator between the components of a compound shell command: `&&`, `||`,
// `;`, or `|`, together with any whitespace around it. `||` is listed before
// `|` so a logical OR is consumed as one separator rather than two pipes.
// The regex has no notion of quoting, newlines, or a lone `&`.
const SHELL_SEPARATORS = /\s*(?:&&|\|\||;|\|)\s*/;
|
|
89
|
+
// One inline environment assignment token, e.g. `CI=1` or `NODE_ENV=test`.
// Only upper-case names (letters, digits, underscore) with a non-empty,
// whitespace-free value are recognised.
const ENV_ASSIGN = /^[A-Z_][A-Z0-9_]*=\S+$/;
// Wrapper words that run another command and may precede the real verb.
// The word must be followed by whitespace, so a bare `env` or `sudo` is
// left untouched.
const COMMAND_WRAPPER_PREFIX = /^(?:sudo|time|env)\s+/;
/**
 * Strip leading wrapper noise from one shell component so that the verb
 * is the first token of the returned string.
 *
 * Removed, repeatedly and in any order, from the front of the component:
 *   - the wrapper words `sudo`, `time`, and `env` (followed by whitespace)
 *   - inline environment assignments such as `CI=1` or `FOO=bar`
 *
 * Examples:
 *   `sudo CI=1 pnpm test`        -> `pnpm test`
 *   `env NODE_ENV=test npm test` -> `npm test`
 *   `FOO=bar BAZ=qux pnpm test`  -> `pnpm test`
 *
 * Wrapper flags are NOT interpreted: `sudo -E npm test` comes back as
 * `-E npm test`, which no allowlist pattern matches. That is a
 * deliberate false negative rather than a guess at flag arity.
 *
 * Pure — no side effects.
 *
 * @param {string} component - A single command, already split off from
 *   any `&&` / `||` / `;` / `|` neighbours.
 * @returns {string} The component with leading and trailing whitespace
 *   and leading wrapper prefixes removed.
 */
export function extractCommandHead(component) {
    let head = component.trim();
    for (;;) {
        // sudo / time / env wrappers: drop the word plus the whitespace after it.
        const wrapper = COMMAND_WRAPPER_PREFIX.exec(head);
        if (wrapper !== null) {
            head = head.slice(wrapper[0].length);
            continue;
        }
        // Inline env assignments before the verb. Peel one token at a time so
        // `FOO=bar BAZ=qux pnpm test` resolves to `pnpm test`.
        const firstToken = head.split(/\s+/, 1)[0] ?? '';
        if (firstToken !== '' && ENV_ASSIGN.test(firstToken)) {
            head = head.slice(firstToken.length).trimStart();
            continue;
        }
        break;
    }
    return head;
}
|
|
123
|
+
/**
 * Return a copy of `cmd`, same length, in which every character that is
 * quoted (inside '...' or "...") or backslash-escaped is replaced by `_`.
 * The quote characters themselves are kept. An unterminated quote masks
 * everything up to the end of the string.
 */
function maskQuotedSpans(cmd) {
    let masked = '';
    let quote = null; // the quote character currently open, or null
    for (let i = 0; i < cmd.length; i += 1) {
        const ch = cmd[i];
        const hasNext = i + 1 < cmd.length;
        if (quote === null) {
            if (ch === '\\' && hasNext) {
                // Escaped character outside quotes: neither it nor the backslash is syntax.
                masked += '__';
                i += 1;
                continue;
            }
            if (ch === "'" || ch === '"') {
                quote = ch;
            }
            masked += ch;
            continue;
        }
        if (quote === '"' && ch === '\\' && hasNext) {
            // Backslash escapes only apply inside double quotes, not single quotes.
            masked += '__';
            i += 1;
            continue;
        }
        if (ch === quote) {
            quote = null;
            masked += ch;
            continue;
        }
        masked += '_';
    }
    return masked;
}
/**
 * Split `cmd` on SHELL_SEPARATORS, ignoring separators that sit inside
 * quotes or behind a backslash. Separator positions are found on the
 * masked copy; the returned pieces are slices of the original text.
 */
function splitUnquotedComponents(cmd) {
    const masked = maskQuotedSpans(cmd);
    const separator = new RegExp(SHELL_SEPARATORS.source, 'g');
    const components = [];
    let start = 0;
    for (const match of masked.matchAll(separator)) {
        components.push(cmd.slice(start, match.index));
        start = match.index + match[0].length;
    }
    components.push(cmd.slice(start));
    return components;
}
/**
 * Detect whether a shell command runs a verification step.
 *
 * The command is split into its `&&` / `;` / `||` / `|`-separated
 * components, each component is normalised with `extractCommandHead`,
 * and the first component whose head matches an entry of
 * `VERIFICATION_PATTERNS` wins. A compound command like
 * `cd packages/foo && pnpm test` is therefore flagged on the trailing
 * component.
 *
 * Separators inside single or double quotes, or preceded by a
 * backslash, do not split the command. Without that,
 * `git commit -m "fix; npm test passes"` would yield a component
 * beginning with `npm test` and be reported as a verification run —
 * a false positive.
 *
 * The check is intentionally optimistic: it does not parse `if`,
 * `for`, command substitution, or function bodies. Operators wrapping
 * verification inside a script (e.g. `./scripts/test.sh`) opt out of
 * the gate; per the original module notes that is recorded downstream
 * in the unverifiedReason as `no_verification_command_run`.
 *
 * @param {unknown} cmd - The raw shell command line.
 * @returns {{ isVerification: boolean, tool: string | null, matchedComponent: string }}
 *   On a match, `tool` is the matching entry's tag and `matchedComponent`
 *   is the trimmed component text (wrapper prefixes included). Otherwise
 *   `isVerification` is false, `tool` is null and `matchedComponent` is ''.
 *   Non-string or blank input is treated as "no match".
 */
export function detectVerificationCommand(cmd) {
    if (typeof cmd !== 'string' || cmd.trim() === '') {
        return { isVerification: false, tool: null, matchedComponent: '' };
    }
    for (const raw of splitUnquotedComponents(cmd)) {
        const head = extractCommandHead(raw);
        if (head === '') {
            continue;
        }
        const entry = VERIFICATION_PATTERNS.find(({ pattern }) => pattern.test(head));
        if (entry !== undefined) {
            return {
                isVerification: true,
                tool: entry.tool,
                matchedComponent: raw.trim(),
            };
        }
    }
    return { isVerification: false, tool: null, matchedComponent: '' };
}
|
|
157
|
+
/**
 * Phrases an agent uses to dispute ownership of a verification failure.
 *
 * When any of these appears in the final assistant text and the agent
 * mutated files in the same module as a failing test, the outcome's
 * `regressionOwnershipDispute` flag is set so a downstream reviewer can
 * decide whether to escalate.
 *
 * Entries are stored lower-case; matching is meant to be a
 * case-insensitive substring search, so surrounding punctuation or
 * extra words ("this is a pre-existing test bug") still trip the flag.
 * NOTE(review): the matching code is not in this file section; confirm
 * against the consumer.
 */
export const REGRESSION_DISPUTE_PHRASES = [
    // Spelling variants of "pre-existing".
    'pre-existing',
    'preexisting',
    'pre existing',
    // Explicit disclaimers of ownership.
    'not from my changes',
    'not related to my changes',
    'unrelated test failure',
    'unrelated to my changes',
    'unrelated failure',
    'not my change',
];
|
|
180
|
+
/**
 * Tail trimmer for stderr captured in verification ledger entries.
 *
 * Returns the trailing part of `stderr` whose UTF-8 encoding is at most
 * `maxBytes` bytes (2 KB by default, matching the PUGI-VERIFY-GATE
 * contract). The cut is made on a code-point boundary, so the result
 * never starts with a broken multi-byte sequence and never exceeds the
 * byte budget, even for non-ASCII output.
 *
 * @param {unknown} stderr - Captured stderr text.
 * @param {number} [maxBytes=2048] - Maximum size of the result in bytes.
 * @returns {string} `''` for non-string/empty input or a non-positive
 *   budget; the input itself when it already fits; otherwise its tail.
 */
export function tailStderr(stderr, maxBytes = 2048) {
    if (typeof stderr !== 'string' || stderr.length === 0) {
        return '';
    }
    // A zero, negative, or NaN budget leaves no room for any text.
    // (`slice(-0)` would otherwise return the whole string.)
    if (!(maxBytes > 0)) {
        return '';
    }
    const encoded = Buffer.from(stderr, 'utf8');
    if (encoded.length <= maxBytes) {
        return stderr;
    }
    let start = encoded.length - Math.floor(maxBytes);
    // Bytes of the form 10xxxxxx are UTF-8 continuation bytes; skip them
    // so decoding starts at the beginning of a code point.
    while (start < encoded.length && (encoded[start] & 0xc0) === 0x80) {
        start += 1;
    }
    return encoded.toString('utf8', start);
}
|
|
195
|
+
//# sourceMappingURL=verification-patterns.js.map
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Append-only TSV ledger for pugi-eval-v1 results.
|
|
3
|
+
*
|
|
4
|
+
* Pattern source: backlog #110 (Karpathy autoresearch). The ledger
|
|
5
|
+
* is git-tracked, never rewritten in place, never edited by hand. The
|
|
6
|
+
* column set is frozen as of schema v1; new columns must land in
|
|
7
|
+
* eval-v2 with a separate ledger file.
|
|
8
|
+
*
|
|
9
|
+
* Columns (tab-separated, in this exact order):
|
|
10
|
+
*
|
|
11
|
+
* timestamp UTC ISO 8601
|
|
12
|
+
* git_sha short sha of repo HEAD at run start
|
|
13
|
+
* task_id frozen task id (`NN-slug`)
|
|
14
|
+
* model engine model identifier or `(default)`
|
|
15
|
+
* status pass | fail | budget_exhausted | timeout | engine_error
|
|
16
|
+
* pugi_score per-task score, 2 decimal places
|
|
17
|
+
* tokens tokens reported by engine
|
|
18
|
+
* turns engine turn count
|
|
19
|
+
* tool_calls tool calls executed
|
|
20
|
+
* wall_ms wall-clock duration (ms)
|
|
21
|
+
* exit_code CLI subprocess exit code
|
|
22
|
+
* verifications `<passed>/<total>`
|
|
23
|
+
*
|
|
24
|
+
* Runs of TAB, CR, or LF characters in string fields are replaced by one space
|
|
25
|
+
* before write so a single line is always one TSV record.
|
|
26
|
+
*/
|
|
27
|
+
import { appendFileSync, existsSync, mkdirSync, writeFileSync } from 'node:fs';
|
|
28
|
+
import { dirname } from 'node:path';
|
|
29
|
+
// Column names of the eval-v1 ledger, in on-disk order.
export const LEDGER_COLUMNS = [
    'timestamp', 'git_sha', 'task_id', 'model',
    'status', 'pugi_score', 'tokens', 'turns',
    'tool_calls', 'wall_ms', 'exit_code', 'verifications',
];
// Header record: the column names joined by TAB.
export const LEDGER_HEADER = LEDGER_COLUMNS.join('\t');
|
|
44
|
+
// Collapse every run of TAB / CR / LF into a single space so the value
// cannot break the one-record-per-line TSV layout.
function safe(s) {
    const pieces = s.split(/[\t\r\n]+/);
    return pieces.join(' ');
}
|
|
47
|
+
/**
 * Render one ledger row as a TAB-separated line (no trailing newline).
 * Cells follow the LEDGER_COLUMNS order; string cells go through
 * `safe`, the score is fixed to two decimals, and the last cell is
 * `<passed>/<total>` verification checks.
 */
export function formatLedgerLine(row) {
    const { result } = row;
    const checks = result.verifications;
    const passedCount = checks.reduce((count, check) => (check.passed ? count + 1 : count), 0);
    const textCells = [
        row.timestamp,
        row.gitSha,
        result.taskId,
        row.model,
        result.status,
    ].map((value) => safe(value));
    const metricCells = [
        result.pugiScore.toFixed(2),
        String(result.tokensUsed),
        String(result.turnsUsed),
        String(result.toolCallCount),
        String(result.wallClockMs),
        String(result.exitCode),
    ];
    return [...textCells, ...metricCells, `${passedCount}/${checks.length}`].join('\t');
}
|
|
67
|
+
/**
 * Append a single row to the ledger at `ledgerPath`.
 *
 * The parent directory is created if needed. The header is written only
 * when the file does not exist yet; existing content is never
 * rewritten.
 *
 * @param {string} ledgerPath - Path of the TSV ledger file.
 * @param {object} row - Row accepted by `formatLedgerLine`.
 * @throws Any filesystem error other than "file already exists" on the
 *   header write.
 */
export function appendLedgerRow(ledgerPath, row) {
    mkdirSync(dirname(ledgerPath), { recursive: true });
    try {
        // 'wx' creates the file exclusively. A separate exists-check
        // followed by a plain write can truncate a ledger that another
        // writer created (and appended to) in between.
        writeFileSync(ledgerPath, `${LEDGER_HEADER}\n`, { mode: 0o644, flag: 'wx' });
    }
    catch (err) {
        // The ledger already exists: keep it untouched and just append.
        if (err?.code !== 'EEXIST') {
            throw err;
        }
    }
    appendFileSync(ledgerPath, `${formatLedgerLine(row)}\n`);
}
|
|
79
|
+
// Append several rows in order, one `appendLedgerRow` call per row.
export function appendLedgerRows(ledgerPath, rows) {
    for (const entry of rows) {
        appendLedgerRow(ledgerPath, entry);
    }
}
|
|
83
|
+
//# sourceMappingURL=ledger.js.map
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Runner for pugi-eval-v1.
|
|
3
|
+
*
|
|
4
|
+
* Per task: spawn a fresh tmp workspace, copy fixture files, invoke
|
|
5
|
+
* the `pugi <command>` subprocess with the brief, capture stdout +
|
|
6
|
+
* exit code + wall-clock, then run the verification checks.
|
|
7
|
+
*
|
|
8
|
+
* The runner is deliberately subprocess-based - mirrors the smoke
|
|
9
|
+
* harness pattern in `core/smoke/headless-driver.ts`. Validating the
|
|
10
|
+
* AS-PUBLISHED CLI is the whole point of a benchmark; bypassing
|
|
11
|
+
* `bin/run.js` would let us miss whole categories of regression
|
|
12
|
+
* (loader cost, env propagation, exit-code handling).
|
|
13
|
+
*
|
|
14
|
+
* Tests inject a `runner` callback that returns a fake `RunCapture`
|
|
15
|
+
* so the meta-spec can exercise scoring + ledger without a real
|
|
16
|
+
* engine.
|
|
17
|
+
*/
|
|
18
|
+
import { spawn } from 'node:child_process';
import { existsSync, mkdirSync, mkdtempSync, rmSync, writeFileSync, readdirSync, statSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { dirname, isAbsolute, join, relative, resolve } from 'node:path';
import { computePugiScore } from './scoring.js';
import { runVerifications } from './verifier.js';
|
|
24
|
+
/**
 * Default executor: spawn
 * `pugi <command> --print --json --intensity <i> --max-turns <n> [--model <m>] "<brief>"`
 * inside the workspace. `--print` forces non-interactive mode; `--json`
 * produces the structured envelope that is parsed for `tokensUsed` etc.
 *
 * The child gets SIGTERM when `input.spec.timeoutMs` elapses or when
 * `input.signal` aborts; after a timeout, SIGKILL follows if the child
 * is still alive 5 s later.
 *
 * @param {object} input - `spec`, `workspaceRoot`, `pugiBin`, `env`, and
 *   optionally `model` and `signal` (an AbortSignal).
 * @returns {Promise<object>} Capture with `stdout`, `stderr`, `exitCode`
 *   (-1 when the child was killed by a signal or failed to spawn),
 *   `wallClockMs`, `tokensUsed`, `turnsUsed`, `toolCallCount`,
 *   `timedOut`, `budgetExhausted`, `engineError`.
 */
export const subprocessRunner = async (input) => {
    // Delay between SIGTERM and SIGKILL once the task timeout fires.
    const KILL_GRACE_MS = 5_000;
    // How long to wait for stdio to close after 'exit' before giving up.
    const STDIO_DRAIN_GRACE_MS = 1_000;
    const args = [
        input.spec.command,
        '--print',
        '--json',
        '--intensity',
        input.spec.intensity,
        '--max-turns',
        String(input.spec.maxTurns),
    ];
    if (input.model) {
        args.push('--model', input.model);
    }
    args.push(input.spec.brief);
    const child = spawn(input.pugiBin, args, {
        cwd: input.workspaceRoot,
        env: input.env,
        stdio: ['ignore', 'pipe', 'pipe'],
    });
    const start = Date.now();
    let stdout = '';
    let stderr = '';
    // Decode at stream level so a multi-byte character split across two
    // chunks is not turned into replacement characters.
    child.stdout?.setEncoding('utf8');
    child.stderr?.setEncoding('utf8');
    child.stdout?.on('data', (chunk) => {
        stdout += chunk;
    });
    child.stderr?.on('data', (chunk) => {
        stderr += chunk;
    });
    const sendSignal = (name) => {
        try {
            child.kill(name);
        }
        catch {
            /* noop */
        }
    };
    let timedOut = false;
    let killTimer = null;
    const timer = setTimeout(() => {
        timedOut = true;
        sendSignal('SIGTERM');
        killTimer = setTimeout(() => sendSignal('SIGKILL'), KILL_GRACE_MS);
    }, input.spec.timeoutMs);
    const onAbort = () => sendSignal('SIGTERM');
    if (input.signal?.aborted) {
        // An already-aborted signal never dispatches 'abort' again.
        onAbort();
    }
    else {
        input.signal?.addEventListener('abort', onAbort);
    }
    const exitCode = await new Promise((resolveExit) => {
        let settled = false;
        let drainTimer = null;
        const settle = (code) => {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(drainTimer);
            resolveExit(code ?? -1);
        };
        // 'close' fires after the stdio streams have ended, so all output
        // has been delivered; 'exit' alone can precede the last chunks.
        child.on('close', (code) => settle(code));
        child.on('error', () => settle(-1));
        child.on('exit', (code) => {
            if (settled) {
                return;
            }
            // Fallback: a grandchild holding the pipes open must not
            // stall the harness forever.
            drainTimer = setTimeout(() => settle(code), STDIO_DRAIN_GRACE_MS);
        });
    });
    clearTimeout(timer);
    // Do not leave the SIGKILL escalation pending once the child is gone.
    clearTimeout(killTimer);
    input.signal?.removeEventListener('abort', onAbort);
    const wallClockMs = Date.now() - start;
    const parsed = parseEnvelope(stdout);
    const budgetExhausted = parsed?.status === 'budget_exceeded' ||
        parsed?.status === 'budget_exhausted';
    const engineError = parsed?.status === 'engine_unavailable' ||
        parsed?.status === 'failed';
    return {
        stdout,
        stderr,
        exitCode,
        wallClockMs,
        tokensUsed: parsed?.tokensUsed ?? 0,
        turnsUsed: parsed?.turnsUsed ?? 0,
        toolCallCount: parsed?.toolCallCount ?? 0,
        timedOut,
        budgetExhausted,
        engineError,
    };
};
|
|
110
|
+
/**
 * Parse the last JSON envelope from stdout.
 *
 * Strategy, in order:
 *   1. Parse the whole trimmed payload (common case: stdout is exactly
 *      one JSON document).
 *   2. Scan lines from the end. For each line starting with `{`, try
 *      that line alone (single-line envelope, possibly followed by
 *      trailing noise), then everything from that line to the end
 *      (multi-line / pretty-printed envelope preceded by warnings).
 *
 * @param {string} stdout - Raw stdout of the CLI subprocess.
 * @returns {*} The parsed value, or `null` when nothing parses.
 */
function parseEnvelope(stdout) {
    const trimmed = stdout.trim();
    if (trimmed === '') {
        return null;
    }
    // Returns the parsed value, or `undefined` when `text` is not JSON.
    const tryParse = (text) => {
        try {
            return JSON.parse(text);
        }
        catch {
            return undefined;
        }
    };
    const whole = tryParse(trimmed);
    if (whole !== undefined) {
        return whole;
    }
    const lines = trimmed.split(/\r?\n/);
    for (let i = lines.length - 1; i >= 0; i -= 1) {
        const line = lines[i].trim();
        if (!line.startsWith('{')) {
            continue;
        }
        const single = tryParse(line);
        if (single !== undefined) {
            return single;
        }
        // The envelope may span several lines; nested `{` lines fail
        // here and the scan keeps moving up to the top-level brace.
        const tail = tryParse(lines.slice(i).join('\n'));
        if (tail !== undefined) {
            return tail;
        }
    }
    return null;
}
|
|
140
|
+
// Recursively collect workspace-relative paths ('/'-joined) of regular
// files under `root` into `out`. `.pugi`, `node_modules` and `.git`
// entries are skipped at every level; unreadable directories and
// entries that cannot be stat'ed are ignored.
function walkFiles(root, prefix, out) {
    const skipped = ['.pugi', 'node_modules', '.git'];
    let names;
    try {
        names = readdirSync(root);
    }
    catch {
        return;
    }
    names
        .filter((name) => !skipped.includes(name))
        .forEach((name) => {
        const fullPath = join(root, name);
        const relPath = prefix === '' ? name : `${prefix}/${name}`;
        let info;
        try {
            info = statSync(fullPath);
        }
        catch {
            return;
        }
        if (info.isDirectory()) {
            walkFiles(fullPath, relPath, out);
            return;
        }
        if (info.isFile()) {
            out.push(relPath);
        }
    });
}
|
|
169
|
+
// Map a run capture to a ledger status. Conditions are checked in
// precedence order: timeout, budget exhaustion, engine error, non-zero
// exit; only a clean run is judged by its verification outcome.
function classifyStatus(capture, verificationsAllPassed) {
    const precedence = [
        ['timeout', capture.timedOut],
        ['budget_exhausted', capture.budgetExhausted],
        ['engine_error', capture.engineError],
        ['fail', capture.exitCode !== 0],
    ];
    const hit = precedence.find(([, applies]) => applies);
    if (hit !== undefined) {
        return hit[0];
    }
    return verificationsAllPassed ? 'pass' : 'fail';
}
|
|
180
|
+
/**
 * Create a fresh temporary workspace for one eval task and materialise
 * the task's fixture files in it.
 *
 * @param {object} spec - Task spec; `spec.id` names the temp directory
 *   and `spec.fixture` (optional) maps relative paths to file bodies.
 * @returns {{root: string, cleanup: function(): void}} The workspace
 *   root and a best-effort recursive remover for it.
 * @throws {Error} When a fixture path contains `..` or resolves outside
 *   the workspace, or when a file cannot be written. The temp directory
 *   is removed before the error propagates.
 */
export function prepareWorkspace(spec) {
    const root = mkdtempSync(join(tmpdir(), `pugi-eval-v1-${spec.id}-`));
    const cleanup = () => {
        try {
            rmSync(root, { recursive: true, force: true });
        }
        catch {
            /* swallow */
        }
    };
    try {
        for (const [relPath, body] of Object.entries(spec.fixture ?? {})) {
            if (relPath.split(/[\\/]/).includes('..')) {
                throw new Error(`eval-v1 task ${spec.id}: fixture path ${relPath} contains ..`);
            }
            const abs = resolve(root, relPath);
            // `resolve` lets an absolute (or other-drive) fixture path
            // replace `root` entirely, so confirm the target really
            // lies inside the workspace before writing anything.
            const fromRoot = relative(root, abs);
            if (fromRoot === '' || isAbsolute(fromRoot) || fromRoot.split(/[\\/]/)[0] === '..') {
                throw new Error(`eval-v1 task ${spec.id}: fixture path ${relPath} escapes the workspace`);
            }
            mkdirSync(dirname(abs), { recursive: true });
            writeFileSync(abs, body, { mode: 0o644 });
        }
    }
    catch (err) {
        // The caller never receives `cleanup` on failure, so the temp
        // directory has to be removed here or it leaks.
        cleanup();
        throw err;
    }
    return { root, cleanup };
}
|
|
202
|
+
/**
 * Turn a run capture into a scored task result: run the spec's
 * verification checks against the workspace and final text, classify
 * the status, list the files left in the workspace (sorted), and attach
 * the pugi_score.
 */
export async function runTaskWithCapture(spec, workspaceRoot, capture) {
    const envelope = parseEnvelope(capture.stdout);
    // Fall back to raw stdout when the envelope carries no final text.
    const finalText = envelope?.finalText ?? capture.stdout;
    const verifications = runVerifications(spec.verification, { workspaceRoot, finalText });
    const everyCheckPassed = !verifications.some((check) => !check.passed);
    const status = classifyStatus(capture, everyCheckPassed);
    const filesWritten = [];
    walkFiles(workspaceRoot, '', filesWritten);
    filesWritten.sort();
    const { tokensUsed, toolCallCount, turnsUsed, wallClockMs, exitCode } = capture;
    const unscored = {
        taskId: spec.id,
        status,
        tokensUsed,
        toolCallCount,
        turnsUsed,
        wallClockMs,
        exitCode,
        verifications,
        finalText,
        filesWritten,
    };
    return { ...unscored, pugiScore: computePugiScore(unscored, spec) };
}
|
|
229
|
+
/**
 * Run a single task end to end: prepare a workspace, invoke
 * `options.runner`, score the capture, and always remove the workspace
 * afterwards.
 */
export async function runOneTask(spec, options) {
    const { root, cleanup } = prepareWorkspace(spec);
    try {
        const runnerInput = { spec, workspaceRoot: root, pugiBin: options.pugiBin };
        // Optional fields are only present when the caller supplied them.
        if (options.model !== undefined) {
            runnerInput.model = options.model;
        }
        runnerInput.env = options.env;
        if (options.signal !== undefined) {
            runnerInput.signal = options.signal;
        }
        const capture = await options.runner(runnerInput);
        return await runTaskWithCapture(spec, root, capture);
    }
    finally {
        cleanup();
    }
}
|
|
246
|
+
/**
 * Run every selected task spec sequentially and collect the results.
 *
 * @param {object} input - `specs` (task specs) and `options`:
 *   `pugiBin`, optional `runner` (defaults to `subprocessRunner`),
 *   `env` (defaults to `process.env`), `model`, `signal`, `only` (task
 *   ids to run), `onTaskStart`, `onTaskFinish`.
 * @returns {Promise<object[]>} One result per executed task, in spec
 *   order.
 * @throws {Error} When an `only` filter matches no task, or when the
 *   system tmpdir is missing after the run.
 */
export async function runHarness(input) {
    const { options, specs } = input;
    const runner = options.runner ?? subprocessRunner;
    const env = options.env ?? process.env;
    const onlyFilter = options.only ? new Set(options.only) : null;
    const out = [];
    for (const spec of specs) {
        if (onlyFilter && !onlyFilter.has(spec.id)) {
            continue;
        }
        options.onTaskStart?.(spec);
        const runOpts = {
            pugiBin: options.pugiBin,
            env,
            runner,
        };
        if (options.model !== undefined) {
            runOpts.model = options.model;
        }
        // Forward cancellation: `runOneTask` passes `signal` on to the
        // runner, but it can only do so if the harness hands it over.
        if (options.signal !== undefined) {
            runOpts.signal = options.signal;
        }
        const result = await runOneTask(spec, runOpts);
        out.push(result);
        options.onTaskFinish?.(result);
    }
    if (onlyFilter && out.length === 0) {
        throw new Error(`eval-v1: --task filter matched zero tasks (asked for ${[...onlyFilter].join(', ')})`);
    }
    // Sanity check: the system tmpdir must still exist after the run.
    if (!existsSync(tmpdir())) {
        // pathological - tmpdir disappeared. Surface so CI fails loud.
        throw new Error('eval-v1: tmpdir no longer exists post-run');
    }
    return out;
}
|
|
280
|
+
//# sourceMappingURL=runner.js.map
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pugi_score scoring formula for eval-v1.
|
|
3
|
+
*
|
|
4
|
+
* Per-task score (0..150, higher = better):
|
|
5
|
+
*
|
|
6
|
+
* pugi_score = pass_rate * 100 // 0..100
|
|
7
|
+
* + verification_completeness * 50 // 0.. 50
|
|
8
|
+
* - (tokens_used / max_tokens) * 30 // 0..-30
|
|
9
|
+
* - (wall_clock_ms / timeout_ms) * 20 // 0..-20
|
|
10
|
+
*
|
|
11
|
+
* Where:
|
|
12
|
+
* - `pass_rate` is 1.0 if status is `pass`, 0 otherwise.
|
|
13
|
+
* - `verification_completeness` is `passed_checks / total_checks`.
|
|
14
|
+
* - Token and wall-clock penalties are clamped to [0, 1] so a run
|
|
15
|
+
* that exceeds the budget cap caps the penalty (avoid runaway
|
|
16
|
+
* negative scores that would skew the aggregate).
|
|
17
|
+
*
|
|
18
|
+
* Aggregate is the arithmetic mean across all per-task scores. Mean
|
|
19
|
+
* is defensible because every task contributes equally to the
|
|
20
|
+
* benchmark (we are not weighting by difficulty - eval-v2 may add
|
|
21
|
+
* weights once we have a baseline year of data).
|
|
22
|
+
*/
|
|
23
|
+
// Points awarded when the task status is `pass`.
const PASS_WEIGHT = 100;
// Points scaled by the share of verification checks that passed.
const VERIFICATION_WEIGHT = 50;
// Maximum deduction for token usage relative to the token budget.
const TOKEN_PENALTY = 30;
// Maximum deduction for wall-clock time relative to the timeout.
const WALL_PENALTY = 20;
|
|
27
|
+
// Clamp `n` to [0, 1]. Non-finite input (NaN, +/-Infinity, non-numbers)
// maps to 1, i.e. the maximum penalty.
function clamp01(n) {
    if (!Number.isFinite(n)) {
        return 1;
    }
    return n < 0 ? 0 : n > 1 ? 1 : n;
}
|
|
36
|
+
/**
 * Compute the per-task pugi_score: pass bonus plus verification
 * completeness, minus token and wall-clock penalties, bounded to
 * [0, 150] and rounded to two decimals.
 */
export function computePugiScore(result, spec) {
    const checks = result.verifications;
    const passedChecks = checks.reduce((count, check) => (check.passed ? count + 1 : count), 0);
    // An empty check list divides by 1, so completeness is 0 instead of NaN.
    const completeness = passedChecks / Math.max(1, checks.length);
    const earned = (result.status === 'pass' ? 1 : 0) * PASS_WEIGHT +
        completeness * VERIFICATION_WEIGHT;
    const tokenCost = clamp01(result.tokensUsed / spec.maxTokens) * TOKEN_PENALTY;
    const wallCost = clamp01(result.wallClockMs / spec.timeoutMs) * WALL_PENALTY;
    const raw = earned - tokenCost - wallCost;
    // Clamp to [0, 150]: each penalty is already limited to its weight,
    // but a failed run with zero completeness would otherwise go
    // negative once the penalties are subtracted.
    const bounded = Math.max(0, Math.min(150, raw));
    return Math.round(bounded * 100) / 100;
}
|
|
59
|
+
/**
 * Aggregate score of a harness run: the arithmetic mean of the per-task
 * `pugiScore` values, rounded to two decimals (0 for an empty run).
 */
export function aggregateScore(results) {
    const count = results.length;
    if (count === 0) {
        return 0;
    }
    let total = 0;
    results.forEach((entry) => {
        total += entry.pugiScore;
    });
    return Math.round((total / count) * 100) / 100;
}
|
|
68
|
+
//# sourceMappingURL=scoring.js.map
|