@pugi/cli 0.1.0-beta.93 → 0.1.0-beta.95

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. package/dist/commands/retro.js +210 -0
  2. package/dist/core/diagnostics/probes/sandbox.js +65 -33
  3. package/dist/core/engine/native-pugi.js +184 -10
  4. package/dist/core/engine/tool-bridge.js +35 -0
  5. package/dist/core/engine/verification-patterns.js +9 -9
  6. package/dist/core/mcp/orchestrator-config.js +192 -0
  7. package/dist/core/mcp/orchestrator-tools.js +147 -3
  8. package/dist/core/pugi-gitignore.js +52 -0
  9. package/dist/core/repl/engine-bridge.js +199 -0
  10. package/dist/core/repl/session.js +395 -6
  11. package/dist/core/repl/tool-route.js +382 -0
  12. package/dist/core/retro/git-collector.js +251 -0
  13. package/dist/core/retro/health-card.js +25 -0
  14. package/dist/core/retro/metrics.js +342 -0
  15. package/dist/core/retro/narrative.js +249 -0
  16. package/dist/core/retro/plane-collector.js +274 -0
  17. package/dist/core/retro/pr-issue-link.js +65 -0
  18. package/dist/core/retro/types.js +16 -0
  19. package/dist/core/sandboxing/adapter.js +29 -0
  20. package/dist/core/sandboxing/index.js +49 -0
  21. package/dist/core/sandboxing/none.js +19 -0
  22. package/dist/core/sandboxing/seatbelt.js +183 -0
  23. package/dist/core/session.js +27 -0
  24. package/dist/core/settings.js +22 -0
  25. package/dist/runtime/cli.js +167 -33
  26. package/dist/runtime/commands/mcp.js +64 -8
  27. package/dist/runtime/deprecation-warning.js +69 -0
  28. package/dist/runtime/headless.js +8 -3
  29. package/dist/runtime/stream-renderer.js +195 -0
  30. package/dist/runtime/version.js +1 -1
  31. package/dist/tui/agent-tree.js +11 -0
  32. package/dist/tui/ask-user-question-chips.js +1 -1
  33. package/dist/tui/multi-file-diff-approval.js +3 -3
  34. package/dist/tui/repl-render.js +42 -0
  35. package/package.json +2 -2
@@ -0,0 +1,210 @@
1
+ import { jsxs as _jsxs, jsx as _jsx } from "react/jsx-runtime";
2
+ import { Box, Text, render } from 'ink';
3
+ import { collectGitContext, countCommitsAheadOfBase, } from '../core/retro/git-collector.js';
4
+ import { ensurePugiGitIgnore } from '../core/pugi-gitignore.js';
5
+ import { computeMetrics } from '../core/retro/metrics.js';
6
+ import { persistRetro } from '../core/retro/narrative.js';
7
+ import { collectPlaneSlice, postRetroToPlane, resolvePlaneConfig, } from '../core/retro/plane-collector.js';
8
+ import { enrichLinks } from '../core/retro/pr-issue-link.js';
9
+ import { computeHealthCard } from '../core/retro/health-card.js';
10
+ /** Parse `7d` | `14d` | `30d` | `24h` into a duration in days
11
+ * (fractional for sub-day windows). Returns undefined when the token is missing or malformed; the caller then falls back to 7 days.
12
+ */
13
+ function parseDurationToken(token) {
14
+ if (!token)
15
+ return undefined;
16
+ const match = /^(\d+)(h|d)$/.exec(token);
17
+ if (!match)
18
+ return undefined;
19
+ const value = Number.parseInt(match[1] ?? '0', 10);
20
+ const unit = match[2];
21
+ if (!Number.isFinite(value) || value <= 0)
22
+ return undefined;
23
+ const days = unit === 'h' ? value / 24 : value;
24
+ return { days, label: token };
25
+ }
26
+ function buildWindow(durationDays, label, now) {
27
+ const until = now;
28
+ const sinceMs = until.getTime() - durationDays * 24 * 60 * 60 * 1000;
29
+ const since = new Date(sinceMs);
30
+ // Midnight-align the lower bound to keep `--since` deterministic per day.
31
+ since.setHours(0, 0, 0, 0);
32
+ return { since, until, label, days: Math.max(1, Math.round(durationDays)) };
33
+ }
34
+ function buildPriorWindow(current) {
35
+ const until = new Date(current.since.getTime());
36
+ const sinceMs = until.getTime() - current.days * 24 * 60 * 60 * 1000;
37
+ const since = new Date(sinceMs);
38
+ since.setHours(0, 0, 0, 0);
39
+ return { since, until, label: `prior ${current.label}`, days: current.days };
40
+ }
41
+ function parseRetroArgs(rawArgs, now) {
42
+ const args = [...rawArgs];
43
+ const postPlane = args.includes('--post-plane');
44
+ const enrichPlane = args.includes('--plane') || postPlane;
45
+ const positional = args.filter((a) => !a.startsWith('-'));
46
+ let compare = false;
47
+ let durationToken;
48
+ if (positional[0] === 'compare') {
49
+ compare = true;
50
+ durationToken = positional[1];
51
+ }
52
+ else {
53
+ durationToken = positional[0];
54
+ }
55
+ const parsed = parseDurationToken(durationToken) ?? { days: 7, label: '7d' };
56
+ return {
57
+ window: buildWindow(parsed.days, parsed.label, now),
58
+ compare,
59
+ enrichPlane,
60
+ postPlane,
61
+ };
62
+ }
63
+ function SummaryCard(props) {
64
+ const { persisted, metrics, plane, planePostUrl } = props;
65
+ return (_jsxs(Box, { flexDirection: "column", borderStyle: "single", borderRight: false, borderTop: false, borderBottom: false, paddingLeft: 1, children: [_jsxs(Text, { bold: true, children: ["pugi retro \u00B7 ", metrics.window.label] }), _jsxs(Text, { dimColor: true, children: ["Branch ", metrics.branch.current, " over ", metrics.branch.base] }), _jsxs(Text, { children: [metrics.commits.total, " commits \u00B7 +", metrics.loc.insertions, " / -", metrics.loc.deletions, " LOC \u00B7 ", metrics.activeDays, " active days"] }), _jsxs(Text, { children: ["Focus ", metrics.focus.score, "% on ", metrics.focus.topDir ?? 'n/a', " \u00B7 Streak ", metrics.streak.personalDays, "d personal / ", metrics.streak.teamDays, "d team"] }), metrics.shipOfTheWeek ? (_jsxs(Text, { children: ["Ship of the week: ", metrics.shipOfTheWeek.subject.slice(0, 60)] })) : null, plane ? (_jsxs(Text, { children: ["Plane: closed ", plane.closedIssues.length, " \u00B7 created ", plane.createdIssues.length, " \u00B7 oversized modules ", plane.oversizedModules.length] })) : null, _jsxs(Text, { dimColor: true, children: ["Markdown: ", persisted.markdownPath] }), _jsxs(Text, { dimColor: true, children: ["JSON: ", persisted.jsonPath] }), planePostUrl ? _jsxs(Text, { children: ["Posted to Plane: ", planePostUrl] }) : null] }));
66
+ }
67
+ function renderSummary(props) {
68
+ const app = render(_jsx(SummaryCard, { ...props }));
69
+ app.unmount();
70
+ }
71
+ export async function runRetroCommand(ctx) {
72
+ const now = ctx.now ?? new Date();
73
+ const parsed = parseRetroArgs(ctx.args, now);
74
+ const gitCtx = await collectGitContext({ cwd: ctx.cwd, window: parsed.window });
75
+ if (!gitCtx.hasGit) {
76
+ const msg = 'pugi retro: not a git workspace - initialise git or cd into one.';
77
+ if (ctx.flags.json) {
78
+ ctx.io.write(`${JSON.stringify({ ok: false, error: 'no_git_workspace' })}\n`);
79
+ }
80
+ else {
81
+ ctx.io.writeError(msg);
82
+ }
83
+ return 2;
84
+ }
85
+ // Triple-review P1.2: before we write anything
86
+ // under `.pugi/retros/`, guarantee `.gitignore` covers `.pugi/`. Without
87
+ // this, the first customer run of `pugi retro` in a fresh repo would
88
+ // leave retros (and any future `.pugi/settings.json` secret store)
89
+ // tracked by git on the next `git add -A`. Idempotent.
90
+ //
91
+ // Round 2 P1 (2026-06-04): surface failure to stderr — silent catch
92
+ // defeats the gate's purpose. If `.gitignore` is read-only or perms
93
+ // refuse, the operator must know retros may be tracked by git.
94
+ const gitIgnoreCreated = [];
95
+ const gitIgnoreSkipped = [];
96
+ try {
97
+ ensurePugiGitIgnore(ctx.cwd, gitIgnoreCreated, gitIgnoreSkipped);
98
+ }
99
+ catch (err) {
100
+ const reason = err instanceof Error ? err.message : String(err);
101
+ ctx.io.writeError(`pugi retro: could not update .gitignore (${reason}). ` +
102
+ `Manually add ".pugi/" to .gitignore so retros are not tracked.`);
103
+ }
104
+ const toBaseHeadCount = await countCommitsAheadOfBase(ctx.cwd, gitCtx.baseBranch, parsed.window.since);
105
+ const metrics = computeMetrics({
106
+ window: parsed.window,
107
+ currentBranch: gitCtx.currentBranch,
108
+ baseBranch: gitCtx.baseBranch,
109
+ toBaseHeadCount,
110
+ currentUserName: gitCtx.userName,
111
+ currentUserEmail: gitCtx.userEmail,
112
+ commits: gitCtx.commits,
113
+ });
114
+ let compare;
115
+ if (parsed.compare) {
116
+ const priorWindow = buildPriorWindow(parsed.window);
117
+ const priorCtx = await collectGitContext({ cwd: ctx.cwd, window: priorWindow });
118
+ const priorAhead = await countCommitsAheadOfBase(ctx.cwd, gitCtx.baseBranch, priorWindow.since);
119
+ const priorMetrics = computeMetrics({
120
+ window: priorWindow,
121
+ currentBranch: gitCtx.currentBranch,
122
+ baseBranch: gitCtx.baseBranch,
123
+ toBaseHeadCount: priorAhead,
124
+ currentUserName: gitCtx.userName,
125
+ currentUserEmail: gitCtx.userEmail,
126
+ commits: priorCtx.commits,
127
+ });
128
+ compare = { current: metrics, prior: priorMetrics };
129
+ }
130
+ let plane;
131
+ let planeUnavailableReason;
132
+ if (parsed.enrichPlane) {
133
+ const cfgResult = resolvePlaneConfig(ctx.cwd);
134
+ if (!cfgResult.ok) {
135
+ planeUnavailableReason = cfgResult.reason;
136
+ }
137
+ else {
138
+ try {
139
+ const slice = await collectPlaneSlice({
140
+ config: cfgResult.config,
141
+ since: parsed.window.since,
142
+ });
143
+ const links = enrichLinks(gitCtx.commits, slice.closedIssues.concat(slice.createdIssues));
144
+ const health = computeHealthCard(slice.modules);
145
+ plane = {
146
+ ...slice,
147
+ prToIssueLinks: links,
148
+ oversizedModules: health.oversized,
149
+ };
150
+ }
151
+ catch (err) {
152
+ planeUnavailableReason = err instanceof Error ? err.message : String(err);
153
+ }
154
+ }
155
+ }
156
+ const persisted = persistRetro({
157
+ root: ctx.cwd,
158
+ metrics,
159
+ plane,
160
+ compare,
161
+ now,
162
+ });
163
+ let planePostUrl;
164
+ if (parsed.postPlane) {
165
+ if (!plane) {
166
+ ctx.io.writeError(`pugi retro --post-plane: Plane unavailable (${planeUnavailableReason ?? 'unknown'}).`);
167
+ }
168
+ else {
169
+ const cfgResult = resolvePlaneConfig(ctx.cwd);
170
+ if (cfgResult.ok) {
171
+ try {
172
+ const result = await postRetroToPlane({
173
+ config: cfgResult.config,
174
+ markdown: persisted.markdown,
175
+ sequence: persisted.sequence,
176
+ dateLabel: persisted.dateLabel,
177
+ });
178
+ planePostUrl = result.url;
179
+ if (result.alreadyExists) {
180
+ ctx.io.write(`pugi retro: already exists at ${result.url}\n`);
181
+ }
182
+ }
183
+ catch (err) {
184
+ const msg = err instanceof Error ? err.message : String(err);
185
+ ctx.io.writeError(`pugi retro --post-plane failed: ${msg}`);
186
+ }
187
+ }
188
+ }
189
+ }
190
+ if (ctx.flags.json) {
191
+ ctx.io.write(`${JSON.stringify({
192
+ ok: true,
193
+ markdownPath: persisted.markdownPath,
194
+ jsonPath: persisted.jsonPath,
195
+ sequence: persisted.sequence,
196
+ metrics,
197
+ plane: plane ?? null,
198
+ planePostUrl: planePostUrl ?? null,
199
+ planeUnavailableReason: planeUnavailableReason ?? null,
200
+ }, null, 2)}\n`);
201
+ }
202
+ else {
203
+ renderSummary({ persisted, metrics, plane, planePostUrl });
204
+ if (planeUnavailableReason && !plane) {
205
+ ctx.io.writeError(`pugi retro: Plane integration unavailable (${planeUnavailableReason}).`);
206
+ }
207
+ }
208
+ return 0;
209
+ }
210
+ //# sourceMappingURL=retro.js.map
@@ -1,40 +1,72 @@
1
1
  /**
2
2
  * SANDBOX probe — surfaces the current OS-level sandbox posture (
3
- * spec: sandbox-adapter.ts macOS Seatbelt / Linux Landlock / WSL2 detect).
3
+ * Trust Sprint item 6: macOS Seatbelt adapter wired; Linux Landlock
4
+ * and Docker variants still backlog).
4
5
  *
5
- * Pugi sandbox enforcement is tracked under task #5 (P0/L1+L16). Until
6
- * that lands, this probe reports the platform's available primitive and
7
- * a clear "not yet armed" warning so the operator sees the gap in
8
- * `pugi doctor` instead of assuming bash dispatches run jailed.
9
- *
10
- * When the sandbox does ship, the probe upgrade path:
11
- * - Replace the static "not_armed" detail with a real config probe
12
- * (read .pugi/settings.json::sandbox.mode, verify the OS primitive
13
- * resolves, return ok when both line up).
14
- * - Keep the same probe NAME so doctor output / spec assertions
15
- * don't churn.
6
+ * Sources `bash.sandbox` from `.pugi/settings.json`, defaults to
7
+ * `none`. When set to `macOS-seatbelt` the probe verifies the OS
8
+ * primitive is callable and reports `ok` (armed) or `error`
9
+ * (configured-but-unavailable). When set to `none` the probe reports
10
+ * `warn` with the operator-readable reason "policy 'none' selected".
16
11
  */
17
- export function probeSandbox(_ctx) {
18
- const platform = process.platform;
19
- let availablePrimitive;
20
- switch (platform) {
21
- case 'darwin':
22
- availablePrimitive = 'macOS Seatbelt (/usr/bin/sandbox-exec)';
23
- break;
24
- case 'linux':
25
- availablePrimitive = 'Linux Landlock / nsjail (kernel-dependent)';
26
- break;
27
- case 'win32':
28
- availablePrimitive = 'Windows AppContainer / Job Object';
29
- break;
30
- default:
31
- availablePrimitive = `unknown platform ${platform}`;
12
+ import { homedir } from 'node:os';
13
+ import { loadSettings } from '../../settings.js';
14
+ import { probeSandbox as probeSandboxAdapter } from '../../sandboxing/index.js';
15
+ export function probeSandbox(ctx) {
16
+ const settings = loadSettings(ctx.cwd);
17
+ const configured = (settings.bash?.sandbox ?? 'none');
18
+ const home = ctx.home || homedir();
19
+ const extraWritePaths = [`${home}/.pugi`];
20
+ try {
21
+ const state = probeSandboxAdapter({
22
+ mode: configured,
23
+ workspaceRoot: ctx.cwd,
24
+ extraWritePaths,
25
+ });
26
+ if (state.armed) {
27
+ // Discipline-gap honesty (Trust Sprint thesis): the adapter
28
+ // probes ok, but spawn-wrap is NOT yet wired into the bash
29
+ // runner (that file is owned by another agent on PUGI-VERIFY-
30
+ // GATE). Reporting status=ok would overstate the posture — an
31
+ // operator reading 'armed' would assume their bash calls were
32
+ // jailed when they still run with full process privileges. We
33
+ // surface 'warn' with a precise reason instead and flip to 'ok'
34
+ // when the runner indirection lands.
35
+ return {
36
+ name: 'SANDBOX',
37
+ status: 'warn',
38
+ detail: `configured (mode=${state.mode}) but spawn-wrap not yet wired — bash dispatches still run with full process privileges. ` +
39
+ `Adapter posture: ${state.details.join('; ')}`,
40
+ remediation: 'The seatbelt adapter is in-tree and exercised by tests; the bash runner indirection that consumes it lands in a follow-up. ' +
41
+ 'Bash classifier denylist + permission FSM remain in force in the meantime.',
42
+ };
43
+ }
44
+ // Not armed — distinguish "operator chose none" from "configured
45
+ // mode failed". The latter is an error; the former is a documented
46
+ // posture and stays a warning.
47
+ if (state.mode === 'none') {
48
+ return {
49
+ name: 'SANDBOX',
50
+ status: 'warn',
51
+ detail: `not armed: ${state.reason ?? 'mode none'}`,
52
+ remediation: 'Set `bash.sandbox = "macOS-seatbelt"` in .pugi/settings.json on macOS to enable workspace-scoped write isolation. ' +
53
+ 'Bash classifier denylist + permission FSM still apply.',
54
+ };
55
+ }
56
+ return {
57
+ name: 'SANDBOX',
58
+ status: 'error',
59
+ detail: `configured mode "${state.mode}" failed to arm: ${state.reason ?? 'unknown'}`,
60
+ remediation: 'Set `bash.sandbox` to a supported mode for this platform or remove the key to fall back to "none".',
61
+ };
62
+ }
63
+ catch (err) {
64
+ return {
65
+ name: 'SANDBOX',
66
+ status: 'error',
67
+ detail: `sandbox probe threw: ${err.message}`,
68
+ remediation: 'Remove the bash.sandbox key from .pugi/settings.json or set it to "none".',
69
+ };
32
70
  }
33
- return {
34
- name: 'SANDBOX',
35
- status: 'warn',
36
- detail: `OS primitive available: ${availablePrimitive}. Sandbox enforcement NOT yet armed (Pugi task #5 pending — bash tool currently runs с full process privileges).`,
37
- remediation: 'Bash tool dispatches run unsandboxed today. Track progress on the OS-level sandbox adapter via the operator-trust roadmap. Until then, rely on the bash classifier denylist + permission FSM.',
38
- };
39
71
  }
40
72
  //# sourceMappingURL=sandbox.js.map
@@ -5,6 +5,7 @@ import { AsyncEventQueue, EngineEventEmitter, modelSupportsThinking, runEngineLo
5
5
  import { FileReadCache } from '../file-cache.js';
6
6
  import { loadSettings } from '../settings.js';
7
7
  import { openSession, recordToolCall, recordToolResult } from '../session.js';
8
+ import { REGRESSION_DISPUTE_PHRASES } from './verification-patterns.js';
8
9
  import { prewarmRealDispatch } from '../subagents/dispatcher.js';
9
10
  import { resolveAutoCompactConfig, resolveBudget } from './budgets.js';
10
11
  import { maybeCompact } from './auto-compact.js';
@@ -936,15 +937,32 @@ export class NativePugiEngineAdapter {
936
937
  return;
937
938
  }
938
939
  // Translate the loop outcome into an EngineResult.
939
- // `aborted` maps to `blocked`
940
- // because the operator chose the outcome, same shape as
941
- // budget_exhausted / tool_refused.
942
- const status = finalOutcome.status === 'completed'
940
+ // `aborted` maps to `blocked` because the operator chose the
941
+ // outcome, same shape as budget_exhausted / tool_refused.
942
+ //
943
+ // PUGI-VERIFY-GATE: the verification gate runs AFTER this
944
+ // base mapping. When the agent ran verification commands and
945
+ // any exited non-zero, the loop's `completed` collapses to
946
+ // `failed` (the agent's claim of "done" is unverified). When
947
+ // the loop `completed` but no verification command ever ran,
948
+ // we surface `needs_verification` (CLI exit 2) so the operator
949
+ // sees the missing signal instead of false confidence. The
950
+ // gate is non-negotiable per the contract: `done` is reserved
951
+ // for `verified: true` outcomes.
952
+ const baseStatus = finalOutcome.status === 'completed'
943
953
  ? 'done'
944
954
  : finalOutcome.status === 'failed'
945
955
  ? 'failed'
946
956
  : 'blocked';
947
- const summaryPrefix = finalOutcome.status === 'completed'
957
+ const filesChangedList = Array.from(filesChanged).sort();
958
+ const verification = computeVerificationOutcome({
959
+ ledger: session.verificationLedger,
960
+ baseStatus,
961
+ finalText: finalOutcome.finalText,
962
+ filesChanged: filesChangedList,
963
+ });
964
+ const status = verification.status;
965
+ const summaryPrefix = status === 'done'
948
966
  ? ''
949
967
  : finalOutcome.status === 'budget_exhausted'
950
968
  ? '[budget_exhausted] '
@@ -952,8 +970,11 @@ export class NativePugiEngineAdapter {
952
970
  ? '[plan_mode_refused] '
953
971
  : finalOutcome.status === 'aborted'
954
972
  ? '[operator_aborted] '
955
- : '[failed] ';
956
- const filesChangedList = Array.from(filesChanged).sort();
973
+ : status === 'needs_verification'
974
+ ? '[needs_verification] '
975
+ : verification.unverifiedReason === 'verification_command_failed'
976
+ ? '[verification_failed] '
977
+ : '[failed] ';
957
978
  appendSessionMirror(sessionEventsPath, {
958
979
  type: 'outcome',
959
980
  status: finalOutcome.status,
@@ -1014,6 +1035,18 @@ export class NativePugiEngineAdapter {
1014
1035
  const synthesisedFromFiles = finalOutcome.finalText.trim() === '' && filesChangedList.length > 0
1015
1036
  ? `Updated ${filesChangedList.length} file(s): ${filesChangedList.slice(0, 5).join(', ')}${filesChangedList.length > 5 ? ` (+${filesChangedList.length - 5} more)` : ''}`
1016
1037
  : '';
1038
+ // PUGI-VERIFY-GATE: thread verification state into the risks
1039
+ // array so a consumer reading only the legacy fields still
1040
+ // gets a human-readable summary of what was not verified.
1041
+ const baseRisks = finalOutcome.status === 'completed' && status === 'done'
1042
+ ? []
1043
+ : [finalOutcome.reason ?? `outcome=${finalOutcome.status}`];
1044
+ if (verification.unverifiedReason && status !== 'done') {
1045
+ baseRisks.push(`unverified: ${verification.unverifiedReason}`);
1046
+ }
1047
+ if (verification.regressionOwnershipDispute) {
1048
+ baseRisks.push('regression_ownership_dispute: agent disclaimed ownership of failing verification');
1049
+ }
1017
1050
  yield {
1018
1051
  type: 'result',
1019
1052
  result: {
@@ -1022,9 +1055,7 @@ export class NativePugiEngineAdapter {
1022
1055
  filesChanged: filesChangedList,
1023
1056
  patchRefs: [],
1024
1057
  testsRun: [],
1025
- risks: finalOutcome.status === 'completed'
1026
- ? []
1027
- : [finalOutcome.reason ?? `outcome=${finalOutcome.status}`],
1058
+ risks: baseRisks,
1028
1059
  eventRefs: [
1029
1060
  `tool_calls=${finalOutcome.toolCallCount}`,
1030
1061
  `turns=${finalOutcome.turnsUsed}`,
@@ -1039,7 +1070,22 @@ export class NativePugiEngineAdapter {
1039
1070
  `session=${session.id}`,
1040
1071
  `ctx=${ctx.sessionId}`,
1041
1072
  `mirror=${sessionEventsPath}`,
1073
+ // PUGI-VERIFY-GATE: machine-readable verification echo so
1074
+ // downstream consumers (MCP wrapper, cabinet UI, audit
1075
+ // pipeline) can branch on the gate state without parsing
1076
+ // the new structured fields.
1077
+ `verified=${verification.verified}`,
1078
+ `verification_count=${verification.verificationCommands.length}`,
1042
1079
  ],
1080
+ verified: verification.verified,
1081
+ verificationCommands: verification.verificationCommands,
1082
+ verificationFailures: verification.verificationFailures,
1083
+ ...(verification.unverifiedReason !== undefined
1084
+ ? { unverifiedReason: verification.unverifiedReason }
1085
+ : {}),
1086
+ ...(verification.regressionOwnershipDispute
1087
+ ? { regressionOwnershipDispute: true }
1088
+ : {}),
1043
1089
  },
1044
1090
  };
1045
1091
  }
@@ -1439,4 +1485,132 @@ async function expandHierarchyWithImports(hierarchy, cwd) {
1439
1485
  }
1440
1486
  return out;
1441
1487
  }
1488
+ export function computeVerificationOutcome(input) {
1489
+ const { ledger, baseStatus, finalText, filesChanged } = input;
1490
+ const verificationCommands = ledger.map((entry) => entry.command);
1491
+ const failures = ledger
1492
+ .filter((entry) => entry.exitCode !== 0)
1493
+ .map((entry) => ({
1494
+ command: entry.command,
1495
+ exitCode: entry.exitCode,
1496
+ tailStderr: entry.tailStderr,
1497
+ }));
1498
+ // Verification PASS only when at least one verification call ran AND
1499
+ // every recorded call exited zero. A later passing retry does not
1500
+ // clear an earlier failure: any non-zero exit in the ledger keeps
1501
+ // `verified` false and (below) forces status `failed`.
1502
+ const lastCall = ledger.length > 0 ? ledger[ledger.length - 1] : undefined;
1503
+ const ranAny = ledger.length > 0;
1504
+ const lastPassed = lastCall !== undefined && lastCall.exitCode === 0;
1505
+ const anyFailed = failures.length > 0;
1506
+ const verified = ranAny && lastPassed && !anyFailed;
1507
+ // Status precedence:
1508
+ // verification_command_failed > base failure modes > needs_verification > done
1509
+ // Override `baseStatus` ONLY when verification failed (the
1510
+ // agent's loop may have ended `completed` while a test failed) OR
1511
+ // when `baseStatus === 'done'` and no verification ran (the
1512
+ // engine completed but produced no signal of correctness).
1513
+ let status;
1514
+ let unverifiedReason;
1515
+ if (anyFailed) {
1516
+ status = 'failed';
1517
+ unverifiedReason = 'verification_command_failed';
1518
+ }
1519
+ else if (!ranAny && baseStatus === 'done') {
1520
+ status = 'needs_verification';
1521
+ unverifiedReason = 'no_verification_command_run';
1522
+ }
1523
+ else if (baseStatus !== 'done') {
1524
+ status = baseStatus;
1525
+ if (!verified)
1526
+ unverifiedReason = 'verification_inconclusive';
1527
+ }
1528
+ else {
1529
+ status = 'done';
1530
+ }
1531
+ // Regression ownership dispute heuristic. Only meaningful when a
1532
+ // verification command failed; keep the predicate simple and
1533
+ // documented so a future reviewer can audit the false-positive
1534
+ // surface.
1535
+ let regressionOwnershipDispute = false;
1536
+ if (anyFailed && filesChanged.length > 0 && finalText !== '') {
1537
+ const lower = finalText.toLowerCase();
1538
+ const disputed = REGRESSION_DISPUTE_PHRASES.some((phrase) => lower.includes(phrase));
1539
+ if (disputed && agentTouchedFailingModule(filesChanged, failures)) {
1540
+ regressionOwnershipDispute = true;
1541
+ }
1542
+ }
1543
+ return {
1544
+ status,
1545
+ verified,
1546
+ verificationCommands,
1547
+ verificationFailures: failures,
1548
+ ...(unverifiedReason !== undefined ? { unverifiedReason } : {}),
1549
+ regressionOwnershipDispute,
1550
+ };
1551
+ }
1552
+ /**
1553
+ * Predicate: at least one mutated file resolves to the same module key
1554
+ * (directory path + bare basename) as a path mentioned in any failure's
1555
+ * stderr tail. The rule is intentionally loose ("same dir + same
1556
+ * basename without extension or .test./.spec. infix") so it
1557
+ * catches the typical `src/foo.ts` ↔ `src/foo.test.ts` pairing
1558
+ * without overfitting to one test runner's stack-trace format.
1559
+ *
1560
+ * Implementation: extract every `src/...`-shaped path mention from
1561
+ * each failure's stderr tail, then check whether ANY mutated file
1562
+ * shares a module key with ANY mentioned path. The module key
1563
+ * strips the trailing filename's extension AND any `.test.` /
1564
+ * `.spec.` infix so the pair resolves to the same key.
1565
+ */
1566
+ function agentTouchedFailingModule(filesChanged, failures) {
1567
+ const stderrJoined = failures.map((f) => f.tailStderr).join('\n');
1568
+ if (stderrJoined === '')
1569
+ return false;
1570
+ // Match common test-runner path shapes: `src/foo/bar.ts`,
1571
+ // `apps/x/test/y.spec.ts`, `packages/z/baz.test.ts`. Not
1572
+ // exhaustive — false negatives are acceptable here because the
1573
+ // predicate's job is to FLAG dispute, not enforce it.
1574
+ const pathMentions = new Set();
1575
+ const pathRegex = /(?:^|[\s(])((?:src|app|apps|test|tests|lib|packages)\/[\w./-]+\.[a-zA-Z]+)/g;
1576
+ for (const match of stderrJoined.matchAll(pathRegex)) {
1577
+ const captured = match[1];
1578
+ if (typeof captured === 'string' && captured.length > 0) {
1579
+ pathMentions.add(captured);
1580
+ }
1581
+ }
1582
+ if (pathMentions.size === 0)
1583
+ return false;
1584
+ // Module key strips the trailing filename's extension (and any
1585
+ // `.test.` / `.spec.` infix) so `src/existing.ts` and
1586
+ // `src/existing.test.ts` resolve to the same key. Keep the full
1587
+ // directory path plus the bare basename (no ext) — this catches
1588
+ // the typical `foo.ts` ↔ `foo.test.ts` pairing in the same dir
1589
+ // without overfitting to one test-runner convention.
1590
+ const moduleKey = (p) => {
1591
+ const segments = p.split('/').filter(Boolean);
1592
+ if (segments.length === 0)
1593
+ return '';
1594
+ const lastIndex = segments.length - 1;
1595
+ const bareLast = segments[lastIndex]
1596
+ .replace(/\.(spec|test)\./, '.')
1597
+ .replace(/\.[a-zA-Z][a-zA-Z0-9]*$/, '');
1598
+ const dir = segments.slice(0, lastIndex).join('/');
1599
+ return dir === '' ? bareLast : `${dir}/${bareLast}`;
1600
+ };
1601
+ const failingModuleKeys = new Set();
1602
+ for (const mention of pathMentions) {
1603
+ const key = moduleKey(mention);
1604
+ if (key !== '')
1605
+ failingModuleKeys.add(key);
1606
+ }
1607
+ if (failingModuleKeys.size === 0)
1608
+ return false;
1609
+ for (const file of filesChanged) {
1610
+ const key = moduleKey(file);
1611
+ if (failingModuleKeys.has(key))
1612
+ return true;
1613
+ }
1614
+ return false;
1615
+ }
1442
1616
  //# sourceMappingURL=native-pugi.js.map
@@ -21,6 +21,8 @@ import { webFetchTool } from '../../tools/web-fetch.js';
21
21
  import { webSearchTool } from '../../tools/web-search.js';
22
22
  import { agentTool } from '../../tools/agent-tool.js';
23
23
  import { multiEdit } from '../../tools/multi-edit.js';
24
+ import { recordVerificationCall } from '../session.js';
25
+ import { detectVerificationCommand, tailStderr } from './verification-patterns.js';
24
26
  import { buildMcpToolDefs, defaultNonInteractiveMcpPrompt, dispatchMcpTool, MCP_TOOL_PREFIX, } from '../../tools/mcp-tool.js';
25
27
  import { firePostToolUseFailureChain } from '../hook-chains.js';
26
28
  import { buildDenialContext, DENIAL_REMINDER_THRESHOLD, } from '../denial-tracking/state.js';
@@ -1507,6 +1509,29 @@ function dispatchTool(name, args, ctx) {
1507
1509
  session: ctx.session,
1508
1510
  source: 'agent',
1509
1511
  });
1512
+ // PUGI-VERIFY-GATE: tag verification commands and record them
1513
+ // on the session ledger so the engine outcome assembler can
1514
+ // gate the final `status` on test/lint/build pass. The check
1515
+ // is pure — `detectVerificationCommand` matches the regex
1516
+ // allowlist in `verification-patterns.ts`. Record BEFORE
1517
+ // building the model-facing envelope so the ledger is durable
1518
+ // even if the model stops the loop on this turn.
1519
+ const detection = detectVerificationCommand(command);
1520
+ const verificationFailed = detection.isVerification && result.exitCode !== 0;
1521
+ if (detection.isVerification && detection.tool !== null) {
1522
+ recordVerificationCall(ctx.session, {
1523
+ command,
1524
+ tool: detection.tool,
1525
+ exitCode: result.exitCode,
1526
+ tailStderr: tailStderr(
1527
+ // Prefer buffered stderr; fall back to redirect tail
1528
+ // when stdout/stderr lives on disk (`logPath` mode).
1529
+ result.stderr === '' && typeof result.tail === 'string'
1530
+ ? result.tail
1531
+ : result.stderr),
1532
+ timestamp: new Date().toISOString(),
1533
+ });
1534
+ }
1510
1535
  const parts = [
1511
1536
  `exit=${result.exitCode}`,
1512
1537
  result.stdout ? `stdout:\n${result.stdout}` : '',
@@ -1522,6 +1547,16 @@ function dispatchTool(name, args, ctx) {
1522
1547
  parts.push('truncated=true');
1523
1548
  if (result.timedOut)
1524
1549
  parts.push('timedOut=true');
1550
+ // PUGI-VERIFY-GATE: when a verification command exited non-zero,
1551
+ // tag the envelope so the model cannot honestly claim "tests
1552
+ // pass" — and so the engine outcome assembler can scan the
1553
+ // ledger and gate `done`. The stringified envelope keeps
1554
+ // `exit=N` for legacy parsers; the new `verification.tool=` /
1555
+ // `verification.ok=` lines surface the gate state explicitly.
1556
+ if (detection.isVerification) {
1557
+ parts.push(`verification.tool=${detection.tool}`);
1558
+ parts.push(`verification.ok=${verificationFailed ? 'false' : 'true'}`);
1559
+ }
1525
1560
  const body = parts.filter(Boolean).join('\n');
1526
1561
  return body || '(no output)';
1527
1562
  }
@@ -2,12 +2,12 @@
2
2
  * PUGI-VERIFY-GATE — verification command detection.
3
3
  *
4
4
  * Background: Codex dogfood 2026-06-04 surfaced a P0 trust failure
5
- * where the Pugi engine returned `status: done` + `exitCode: 0` even
6
- * after `npm test` exited non-zero on a regression the agent itself
7
- * had introduced. Root cause: no layer of the dispatch pipeline knew
8
- * which bash invocations were verification commands, so the engine
9
- * outcome had no way to gate the final status on test/lint/build
10
- * pass.
5
+ * where the Pugi engine returned `status: done` + `exitCode: 0`
6
+ * even after `npm test` exited non-zero on a regression the agent
7
+ * itself had introduced. Root cause: no layer of the dispatch
8
+ * pipeline knew which bash invocations were verification commands,
9
+ * so the engine outcome had no way to gate the final status on
10
+ * test/lint/build pass.
11
11
  *
12
12
  * This module is the deterministic, configurable allowlist of regex
13
13
  * patterns the engine uses to recognise verification commands at
@@ -110,7 +110,7 @@ export function extractCommandHead(component) {
110
110
  continue;
111
111
  }
112
112
  // env A=1 B=2 prefix (inline env assignments before the verb).
113
- // We peel one token at a time so `FOO=bar BAZ=qux pnpm test` resolves to `pnpm test`.
113
+ // Peel one token at a time so `FOO=bar BAZ=qux pnpm test` resolves to `pnpm test`.
114
114
  const firstToken = head.split(/\s+/, 1)[0] ?? '';
115
115
  if (firstToken !== '' && ENV_ASSIGN.test(firstToken)) {
116
116
  head = head.slice(firstToken.length).trimStart();
@@ -162,8 +162,8 @@ export function detectVerificationCommand(cmd) {
162
162
  * downstream reviewer can decide whether to escalate.
163
163
  *
164
164
  * The list is case-insensitive at match time. Punctuation around the
165
- * phrase is allowed because `.test()` looks for the substring, not
166
- * word boundaries (an agent that writes "this is a pre-existing
165
+ * phrase is allowed because `.includes()` looks for the substring,
166
+ * not word boundaries (an agent that writes "this is a pre-existing
167
167
  * test bug" still trips the flag).
168
168
  */
169
169
  export const REGRESSION_DISPUTE_PHRASES = [