@linimin/pi-letscook 0.1.32 → 0.1.35

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,363 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
5
+ cd "$ROOT"
6
+
7
+ node <<'NODE'
8
+ const fs = require('node:fs');
9
+ const path = require('node:path');
10
+ const {
11
+ parseReportFields,
12
+ transcribeCanonicalRoleReport,
13
+ } = require('./extensions/completion/role-reporting.js');
14
+
15
+ const read = (file) => fs.readFileSync(file, 'utf8');
16
+ const assertIncludes = (file, snippet) => {
17
+ const text = read(file);
18
+ if (!text.includes(snippet)) {
19
+ throw new Error(`${file} is missing required evaluator-calibration text: ${snippet}`);
20
+ }
21
+ };
22
+
23
+ assertIncludes('package.json', '"evaluator-calibration-test": "bash ./scripts/evaluator-calibration-test.sh"');
24
+ assertIncludes('scripts/release-check.sh', 'npm run evaluator-calibration-test');
25
+ assertIncludes('.agent/verify_completion_stop.sh', 'npm run evaluator-calibration-test >/dev/null');
26
+ assertIncludes('README.md', 'Evaluator calibration now also fails closed on semantically lenient but well-formed reports.');
27
+ assertIncludes('README.md', '`npm run evaluator-calibration-test` drives the packaged transcription path through reviewer yes-with-follow-up, auditor open-contracts-with-`Next mandatory slice: none`, and stop-judge yes-with-open-contracts fixtures while still accepting truthful passing reports.');
28
+ assertIncludes('README.md', 'It also rejects the reproducible `none; ...` bypass family for reviewer follow-up, auditor worktree blockers, and stop-judge open-contract reporting, while still accepting only the exact reviewer routing text `Smallest follow-up slice: none; proceed to completion-auditor.` with terminal punctuation or whitespace only.');
29
+ assertIncludes('README.md', 'includes deterministic active-slice contract coverage plus observability coverage, evaluator calibration, and the rubric-contract regression');
30
+ assertIncludes('CHANGELOG.md', 'added evaluator calibration fixtures for semantically lenient but well-formed reviewer/auditor/stop-judge reports');
31
+ assertIncludes('CHANGELOG.md', 'tightened the reproducible `none; ...` reviewer/auditor/stop-judge bypass checks while still accepting only the exact reviewer `none; proceed to completion-auditor` routing allowance with terminal punctuation or whitespace only');
32
+ assertIncludes('CHANGELOG.md', 'wired `npm run evaluator-calibration-test` into `npm run release-check` and `.agent/verify_completion_stop.sh`');
33
+ assertIncludes('CHANGELOG.md', 'fixed the smoke auto-resume prompt regression');
34
+ assertIncludes('extensions/completion/role-reporting.js', 'Reviewer output cannot mark \'Acceptable as-is: yes\' while naming a follow-up slice other than none.');
35
+ assertIncludes('extensions/completion/role-reporting.js', 'Auditor output cannot mark \'Tracked and unignored worktree is clean: yes\' while listing worktree blockers.');
36
+ assertIncludes('extensions/completion/role-reporting.js', 'Auditor output cannot leave \'Next mandatory slice\' as none while open contracts, blockers, or high-value gaps remain.');
37
+ assertIncludes('extensions/completion/role-reporting.js', 'Stop-judge output cannot mark \'Can the project stop now: yes\' while naming remaining open top-level contract IDs.');
38
+ assertIncludes('extensions/completion/role-reporting.js', 'Stop-judge output cannot mark \'Can the project stop now: yes\' when Blocker count is greater than 0.');
39
+
40
+ const tempRootBase = path.join(process.cwd(), '.agent', 'tmp');
41
+ fs.mkdirSync(tempRootBase, { recursive: true });
42
+ const tempRoot = fs.mkdtempSync(path.join(tempRootBase, 'evaluator-calibration-'));
43
+ const snapshotFiles = {
44
+ sliceHistoryPath: path.join(tempRoot, 'slice-history.jsonl'),
45
+ stopHistoryPath: path.join(tempRoot, 'stop-check-history.jsonl'),
46
+ };
47
+ fs.writeFileSync(snapshotFiles.sliceHistoryPath, '');
48
+ fs.writeFileSync(snapshotFiles.stopHistoryPath, '');
49
+
50
+ const readJsonl = (file) =>
51
+ fs
52
+ .readFileSync(file, 'utf8')
53
+ .split('\n')
54
+ .filter(Boolean)
55
+ .map((line) => JSON.parse(line));
56
+ const assert = (condition, message) => {
57
+ if (!condition) throw new Error(message);
58
+ };
59
+
60
+ const reviewerPass = `MISSION ANCHOR: test mission
61
+ Remaining contract IDs: TEST-CONTRACT
62
+ Rubric:
63
+ - Contract coverage: pass - Locked acceptance criteria match the committed slice.
64
+ - Correctness risk: pass - No blocking regression is evident.
65
+ - Verification evidence: pass - Deterministic proof was rerun successfully.
66
+ - Docs/state parity: pass - Docs and canonical state are aligned.
67
+ Findings: none.
68
+ Acceptable as-is: yes
69
+ Smallest follow-up slice: none; proceed to completion-auditor.`;
70
+
71
+ const reviewerLenient = `MISSION ANCHOR: test mission
72
+ Remaining contract IDs: TEST-CONTRACT
73
+ Rubric:
74
+ - Contract coverage: pass - Locked acceptance criteria match the committed slice.
75
+ - Correctness risk: pass - No blocking regression is evident.
76
+ - Verification evidence: pass - Deterministic proof was rerun successfully.
77
+ - Docs/state parity: pass - Docs and canonical state are aligned.
78
+ Findings: none.
79
+ Acceptable as-is: yes
80
+ Smallest follow-up slice: tighten docs before audit.`;
81
+
82
+ const reviewerNonePrefixedLenient = `MISSION ANCHOR: test mission
83
+ Remaining contract IDs: TEST-CONTRACT
84
+ Rubric:
85
+ - Contract coverage: pass - Locked acceptance criteria match the committed slice.
86
+ - Correctness risk: pass - No blocking regression is evident.
87
+ - Verification evidence: pass - Deterministic proof was rerun successfully.
88
+ - Docs/state parity: pass - Docs and canonical state are aligned.
89
+ Findings: none.
90
+ Acceptable as-is: yes
91
+ Smallest follow-up slice: none; tighten docs before audit.`;
92
+
93
+ const reviewerTrailingTextAfterRoutingLenient = `MISSION ANCHOR: test mission
94
+ Remaining contract IDs: TEST-CONTRACT
95
+ Rubric:
96
+ - Contract coverage: pass - Locked acceptance criteria match the committed slice.
97
+ - Correctness risk: pass - No blocking regression is evident.
98
+ - Verification evidence: pass - Deterministic proof was rerun successfully.
99
+ - Docs/state parity: pass - Docs and canonical state are aligned.
100
+ Findings: none.
101
+ Acceptable as-is: yes
102
+ Smallest follow-up slice: none; proceed to completion-auditor; tighten docs before audit.`;
103
+
104
+ const auditorPass = `MISSION ANCHOR: test mission
105
+ Remaining contract IDs: TEST-CONTRACT
106
+ Rubric:
107
+ - Contract coverage: pass - The accepted slice remains satisfied on HEAD.
108
+ - Correctness risk: concern - One planned contract still keeps the project open.
109
+ - Verification evidence: pass - Verification was rerun for the accepted slice.
110
+ - Docs/state parity: pass - Canonical state can be reconciled truthfully.
111
+ Why the project is still not done: One planned contract remains after this accepted slice.
112
+ Open top-level contract IDs: TEST-CONTRACT
113
+ Blocker count: 0
114
+ High-value gap count: 1
115
+ Tracked and unignored worktree is clean: yes
116
+ Worktree blockers: none
117
+ Next mandatory slice: next-slice
118
+ Stale or conflicting canonical state: no
119
+ Plan truthfully captures remaining slice backlog: yes - one planned slice remains.`;
120
+
121
+ const auditorLenient = `MISSION ANCHOR: test mission
122
+ Remaining contract IDs: TEST-CONTRACT
123
+ Rubric:
124
+ - Contract coverage: pass - The accepted slice remains satisfied on HEAD.
125
+ - Correctness risk: concern - One planned contract still keeps the project open.
126
+ - Verification evidence: pass - Verification was rerun for the accepted slice.
127
+ - Docs/state parity: pass - Canonical state can be reconciled truthfully.
128
+ Why the project is still not done: One planned contract remains after this accepted slice.
129
+ Open top-level contract IDs: TEST-CONTRACT
130
+ Blocker count: 0
131
+ High-value gap count: 1
132
+ Tracked and unignored worktree is clean: yes
133
+ Worktree blockers: modified README.md
134
+ Next mandatory slice: none.
135
+ Stale or conflicting canonical state: no
136
+ Plan truthfully captures remaining slice backlog: yes - one planned slice remains.`;
137
+
138
+ const auditorNonePrefixedLenient = `MISSION ANCHOR: test mission
139
+ Remaining contract IDs: TEST-CONTRACT
140
+ Rubric:
141
+ - Contract coverage: pass - The accepted slice remains satisfied on HEAD.
142
+ - Correctness risk: concern - One planned contract still keeps the project open.
143
+ - Verification evidence: pass - Verification was rerun for the accepted slice.
144
+ - Docs/state parity: pass - Canonical state can be reconciled truthfully.
145
+ Why the project is still not done: One planned contract remains after this accepted slice.
146
+ Open top-level contract IDs: TEST-CONTRACT
147
+ Blocker count: 0
148
+ High-value gap count: 1
149
+ Tracked and unignored worktree is clean: yes
150
+ Worktree blockers: none; modified README.md
151
+ Next mandatory slice: next-slice
152
+ Stale or conflicting canonical state: no
153
+ Plan truthfully captures remaining slice backlog: yes - one planned slice remains.`;
154
+
155
+ const stopJudgePass = `MISSION ANCHOR: test mission
156
+ Remaining contract IDs: none
157
+ Rubric:
158
+ - Contract coverage: pass - All implementation slices are accepted on HEAD.
159
+ - Correctness risk: pass - No remaining blocker or high-value gap is evident.
160
+ - Verification evidence: pass - Final verification passes for the current head.
161
+ - Docs/state parity: pass - Docs, config, and canonical state match shipped behavior.
162
+ Can the project stop now: yes
163
+ Exact remaining open top-level contract IDs: none
164
+ Blocker count: 0
165
+ High-value gap count: 0
166
+ Latest completed slice commit: abcdef1234567890abcdef1234567890abcdef12
167
+ Docs/config/runbooks match shipped behavior: yes
168
+ Tracked and unignored worktree is clean: yes
169
+ Brief justification: Current HEAD satisfies the stop criteria.`;
170
+
171
+ const stopJudgeLenient = `MISSION ANCHOR: test mission
172
+ Remaining contract IDs: none
173
+ Rubric:
174
+ - Contract coverage: pass - All implementation slices are accepted on HEAD.
175
+ - Correctness risk: pass - No additional risk was found.
176
+ - Verification evidence: pass - Final verification passes for the current head.
177
+ - Docs/state parity: pass - Docs, config, and canonical state match shipped behavior.
178
+ Can the project stop now: yes
179
+ Exact remaining open top-level contract IDs: TEST-CONTRACT
180
+ Blocker count: 1
181
+ High-value gap count: 0
182
+ Latest completed slice commit: abcdef1234567890abcdef1234567890abcdef12
183
+ Docs/config/runbooks match shipped behavior: yes
184
+ Tracked and unignored worktree is clean: yes
185
+ Brief justification: This should be rejected because remaining contracts and blockers still exist.`;
186
+
187
+ const stopJudgeNonePrefixedLenient = `MISSION ANCHOR: test mission
188
+ Remaining contract IDs: none
189
+ Rubric:
190
+ - Contract coverage: pass - All implementation slices are accepted on HEAD.
191
+ - Correctness risk: pass - No additional risk was found.
192
+ - Verification evidence: pass - Final verification passes for the current head.
193
+ - Docs/state parity: pass - Docs, config, and canonical state match shipped behavior.
194
+ Can the project stop now: yes
195
+ Exact remaining open top-level contract IDs: none; TEST-CONTRACT
196
+ Blocker count: 0
197
+ High-value gap count: 0
198
+ Latest completed slice commit: abcdef1234567890abcdef1234567890abcdef12
199
+ Docs/config/runbooks match shipped behavior: yes
200
+ Tracked and unignored worktree is clean: yes
201
+ Brief justification: This should be rejected because remaining contracts still exist behind a none-prefixed field.`;
202
+
203
+ (async () => {
204
+ const reviewed = await transcribeCanonicalRoleReport({
205
+ role: 'completion-reviewer',
206
+ output: reviewerPass,
207
+ reportFields: parseReportFields(reviewerPass),
208
+ snapshotFiles,
209
+ headSha: '1111111111111111111111111111111111111111',
210
+ sliceId: 'slice-review',
211
+ recordedAt: 1,
212
+ });
213
+ assert(reviewed.errors.length === 0, `reviewer passing fixture should transcribe cleanly: ${reviewed.errors.join(' | ')}`);
214
+ assert(reviewed.appended.includes('reviewed:slice-review'), 'reviewer passing fixture should append a reviewed record');
215
+ assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 1, 'reviewer passing fixture should create one slice-history record');
216
+
217
+ const reviewerRejected = await transcribeCanonicalRoleReport({
218
+ role: 'completion-reviewer',
219
+ output: reviewerLenient,
220
+ reportFields: parseReportFields(reviewerLenient),
221
+ snapshotFiles,
222
+ headSha: '2222222222222222222222222222222222222222',
223
+ sliceId: 'slice-review',
224
+ recordedAt: 2,
225
+ });
226
+ assert(
227
+ reviewerRejected.errors.some((error) => error.includes('follow-up slice other than none')),
228
+ `reviewer lenient fixture should be rejected for a yes verdict with a follow-up slice: ${reviewerRejected.errors.join(' | ')}`,
229
+ );
230
+ assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 1, 'rejected reviewer fixture must not append history');
231
+
232
+ const reviewerNonePrefixedRejected = await transcribeCanonicalRoleReport({
233
+ role: 'completion-reviewer',
234
+ output: reviewerNonePrefixedLenient,
235
+ reportFields: parseReportFields(reviewerNonePrefixedLenient),
236
+ snapshotFiles,
237
+ headSha: '7777777777777777777777777777777777777777',
238
+ sliceId: 'slice-review',
239
+ recordedAt: 7,
240
+ });
241
+ assert(
242
+ reviewerNonePrefixedRejected.errors.some((error) => error.includes('follow-up slice other than none')),
243
+ `reviewer none-prefixed lenient fixture should be rejected for a yes verdict with contradictory routing text: ${reviewerNonePrefixedRejected.errors.join(' | ')}`,
244
+ );
245
+ assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 1, 'rejected none-prefixed reviewer fixture must not append history');
246
+
247
+ const reviewerTrailingTextAfterRoutingRejected = await transcribeCanonicalRoleReport({
248
+ role: 'completion-reviewer',
249
+ output: reviewerTrailingTextAfterRoutingLenient,
250
+ reportFields: parseReportFields(reviewerTrailingTextAfterRoutingLenient),
251
+ snapshotFiles,
252
+ headSha: '1010101010101010101010101010101010101010',
253
+ sliceId: 'slice-review',
254
+ recordedAt: 10,
255
+ });
256
+ assert(
257
+ reviewerTrailingTextAfterRoutingRejected.errors.some((error) => error.includes('follow-up slice other than none')),
258
+ `reviewer routing-trailing-text fixture should be rejected for extra text after the exact completion-auditor allowance: ${reviewerTrailingTextAfterRoutingRejected.errors.join(' | ')}`,
259
+ );
260
+ assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 1, 'rejected reviewer routing-trailing-text fixture must not append history');
261
+
262
+ const audited = await transcribeCanonicalRoleReport({
263
+ role: 'completion-auditor',
264
+ output: auditorPass,
265
+ reportFields: parseReportFields(auditorPass),
266
+ snapshotFiles,
267
+ headSha: '3333333333333333333333333333333333333333',
268
+ sliceId: 'slice-audit',
269
+ recordedAt: 3,
270
+ });
271
+ assert(audited.errors.length === 0, `auditor passing fixture should transcribe cleanly: ${audited.errors.join(' | ')}`);
272
+ assert(audited.appended.includes('audited:slice-audit'), 'auditor passing fixture should append an audited record');
273
+ assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 2, 'auditor passing fixture should append a second slice-history record');
274
+
275
+ const auditorRejected = await transcribeCanonicalRoleReport({
276
+ role: 'completion-auditor',
277
+ output: auditorLenient,
278
+ reportFields: parseReportFields(auditorLenient),
279
+ snapshotFiles,
280
+ headSha: '4444444444444444444444444444444444444444',
281
+ sliceId: 'slice-audit',
282
+ recordedAt: 4,
283
+ });
284
+ assert(
285
+ auditorRejected.errors.some((error) => error.includes('listing worktree blockers')),
286
+ `auditor lenient fixture should reject clean-yes reports that still list worktree blockers: ${auditorRejected.errors.join(' | ')}`,
287
+ );
288
+ assert(
289
+ auditorRejected.errors.some((error) => error.includes("Next mandatory slice") && error.includes('none')),
290
+ `auditor lenient fixture should reject open-work reports with no next mandatory slice: ${auditorRejected.errors.join(' | ')}`,
291
+ );
292
+ assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 2, 'rejected auditor fixture must not append history');
293
+
294
+ const auditorNonePrefixedRejected = await transcribeCanonicalRoleReport({
295
+ role: 'completion-auditor',
296
+ output: auditorNonePrefixedLenient,
297
+ reportFields: parseReportFields(auditorNonePrefixedLenient),
298
+ snapshotFiles,
299
+ headSha: '8888888888888888888888888888888888888888',
300
+ sliceId: 'slice-audit',
301
+ recordedAt: 8,
302
+ });
303
+ assert(
304
+ auditorNonePrefixedRejected.errors.some((error) => error.includes('listing worktree blockers')),
305
+ `auditor none-prefixed lenient fixture should reject clean-yes reports that smuggle blockers behind none: ${auditorNonePrefixedRejected.errors.join(' | ')}`,
306
+ );
307
+ assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 2, 'rejected none-prefixed auditor fixture must not append history');
308
+
309
+ const judged = await transcribeCanonicalRoleReport({
310
+ role: 'completion-stop-judge',
311
+ output: stopJudgePass,
312
+ reportFields: parseReportFields(stopJudgePass),
313
+ snapshotFiles,
314
+ headSha: '5555555555555555555555555555555555555555',
315
+ recordedAt: 5,
316
+ });
317
+ assert(judged.errors.length === 0, `stop-judge passing fixture should transcribe cleanly: ${judged.errors.join(' | ')}`);
318
+ assert(judged.appended.includes('judgment:555555555555'), 'stop-judge passing fixture should append a judgment record');
319
+ assert(readJsonl(snapshotFiles.stopHistoryPath).length === 1, 'stop-judge passing fixture should create one judgment record');
320
+
321
+ const judgeRejected = await transcribeCanonicalRoleReport({
322
+ role: 'completion-stop-judge',
323
+ output: stopJudgeLenient,
324
+ reportFields: parseReportFields(stopJudgeLenient),
325
+ snapshotFiles,
326
+ headSha: '6666666666666666666666666666666666666666',
327
+ recordedAt: 6,
328
+ });
329
+ assert(
330
+ judgeRejected.errors.some((error) => error.includes('remaining open top-level contract IDs')),
331
+ `stop-judge lenient fixture should reject yes verdicts with open contracts: ${judgeRejected.errors.join(' | ')}`,
332
+ );
333
+ assert(
334
+ judgeRejected.errors.some((error) => error.includes('Blocker count is greater than 0')),
335
+ `stop-judge lenient fixture should reject yes verdicts with blockers: ${judgeRejected.errors.join(' | ')}`,
336
+ );
337
+ assert(readJsonl(snapshotFiles.stopHistoryPath).length === 1, 'rejected stop-judge fixture must not append judgment history');
338
+
339
+ const judgeNonePrefixedRejected = await transcribeCanonicalRoleReport({
340
+ role: 'completion-stop-judge',
341
+ output: stopJudgeNonePrefixedLenient,
342
+ reportFields: parseReportFields(stopJudgeNonePrefixedLenient),
343
+ snapshotFiles,
344
+ headSha: '9999999999999999999999999999999999999999',
345
+ recordedAt: 9,
346
+ });
347
+ assert(
348
+ judgeNonePrefixedRejected.errors.some((error) => error.includes('remaining open top-level contract IDs')),
349
+ `stop-judge none-prefixed lenient fixture should reject yes verdicts with none-prefixed open contracts: ${judgeNonePrefixedRejected.errors.join(' | ')}`,
350
+ );
351
+ assert(readJsonl(snapshotFiles.stopHistoryPath).length === 1, 'rejected none-prefixed stop-judge fixture must not append judgment history');
352
+
353
+ fs.rmSync(tempRoot, { recursive: true, force: true });
354
+ })().catch((error) => {
355
+ try {
356
+ fs.rmSync(tempRoot, { recursive: true, force: true });
357
+ } catch {}
358
+ console.error(error instanceof Error ? error.message : String(error));
359
+ process.exit(1);
360
+ });
361
+ NODE
362
+
363
+ echo "evaluator calibration test passed"
@@ -34,6 +34,37 @@ print(state['mission_anchor'])
34
34
  PY
35
35
  )"
36
36
 
37
+ CHOOSER_SNAPSHOT="$TMPDIR/existing-workflow-chooser.json"
38
+ PI_COMPLETION_EXISTING_WORKFLOW_ACTION=cancel \
39
+ PI_COMPLETION_TEST_EXISTING_WORKFLOW_CHOOSER_PATH="$CHOOSER_SNAPSHOT" \
40
+ PI_COMPLETION_SKIP_DRIVER_KICKOFF=1 \
41
+ pi -e "$PKG_ROOT" -p "/cook replacement mission that should stay in the main chat" \
42
+ >/tmp/pi-completion-refocus-cancel.out 2>/tmp/pi-completion-refocus-cancel.err
43
+
44
+ python3 - "$CHOOSER_SNAPSHOT" "/tmp/pi-completion-refocus-cancel.out" "/tmp/pi-completion-refocus-cancel.err" "$INITIAL_MISSION" <<'PY'
45
+ import json
46
+ import sys
47
+ from pathlib import Path
48
+
49
+ chooser = json.loads(Path(sys.argv[1]).read_text())
50
+ output = Path(sys.argv[2]).read_text() + Path(sys.argv[3]).read_text()
51
+ initial_mission = sys.argv[4]
52
+ state = json.loads(Path('.agent/state.json').read_text())
53
+ plan = json.loads(Path('.agent/plan.json').read_text())
54
+ active = json.loads(Path('.agent/active-slice.json').read_text())
55
+
56
+ assert state['mission_anchor'] == initial_mission, 'cancelled chooser should keep the current mission anchor'
57
+ assert plan['mission_anchor'] == initial_mission, 'cancelled chooser should keep plan.json unchanged'
58
+ assert active['mission_anchor'] == initial_mission, 'cancelled chooser should keep active-slice.json unchanged'
59
+ assert chooser['title'].startswith('Existing completion workflow found'), 'chooser snapshot should describe the existing-workflow prompt'
60
+ assert chooser['choices'][0].startswith('Continue current workflow'), 'chooser should keep the continue option'
61
+ assert chooser['choices'][1].startswith('Abandon current workflow and start this new one'), 'chooser should keep the refocus option'
62
+ assert 'Start/Cancel confirmation' in chooser['choices'][1], 'chooser should mention the approval-only replacement confirmation'
63
+ assert chooser['choices'][2].startswith('Cancel'), 'chooser should keep the cancel option'
64
+ assert 'Discuss changes in the main chat and rerun /cook.' in chooser['choices'][2], 'chooser cancel copy should redirect users back to the main chat and rerun /cook'
65
+ assert 'Discuss changes in the main chat and rerun /cook.' in output, 'chooser cancel output should redirect users back to the main chat and rerun /cook'
66
+ PY
67
+
37
68
  PI_COMPLETION_EXISTING_WORKFLOW_ACTION=refocus \
38
69
  PI_COMPLETION_SKIP_DRIVER_KICKOFF=1 \
39
70
  pi -e "$PKG_ROOT" -p "/cook refocused smoke-test mission with tests and docs" \
@@ -4,11 +4,15 @@ set -euo pipefail
4
4
  ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
5
5
  cd "$ROOT"
6
6
 
7
- echo "[release-check] running startup/refocus/context regressions, including critique-aware /cook confirmation coverage"
7
+ echo "[release-check] running control-plane validation, startup/refocus/context regressions, canonical evidence artifact, active-slice contract, observability, evaluator calibration, and rubric contract coverage"
8
+ bash .agent/verify_completion_control_plane.sh
8
9
  npm run smoke-test
9
10
  npm run refocus-test
10
11
  npm run context-proposal-test
12
+ bash ./scripts/canonical-evidence-artifact-test.sh
13
+ bash ./scripts/active-slice-contract-test.sh
11
14
  npm run observability-status-test
15
+ npm run evaluator-calibration-test
12
16
  npm run rubric-contract-test
13
17
  npm pack --dry-run >/dev/null
14
18
 
@@ -19,7 +19,7 @@ PI_COMPLETION_TEST_DRIVER_PROMPT_PATH="$KICKOFF_PROMPT" \
19
19
  pi -e "$PKG_ROOT" -p "/cook smoke-test mission" \
20
20
  >"$TMPDIR/pi-completion-smoke-bootstrap.out" 2>"$TMPDIR/pi-completion-smoke-bootstrap.err"
21
21
 
22
- for file in .agent/profile.json .agent/state.json .agent/plan.json .agent/active-slice.json; do
22
+ for file in .agent/profile.json .agent/state.json .agent/plan.json .agent/active-slice.json .agent/verification-evidence.json; do
23
23
  [[ -f "$file" ]] || { echo "missing canonical bootstrap file: $file" >&2; exit 1; }
24
24
  done
25
25
 
@@ -38,6 +38,7 @@ profile = json.loads(Path('.agent/profile.json').read_text())
38
38
  state = json.loads(Path('.agent/state.json').read_text())
39
39
  plan = json.loads(Path('.agent/plan.json').read_text())
40
40
  active = json.loads(Path('.agent/active-slice.json').read_text())
41
+ evidence = json.loads(Path('.agent/verification-evidence.json').read_text())
41
42
  kickoff = Path(sys.argv[1]).read_text()
42
43
 
43
44
  assert profile['task_type'] == expected_task_type, 'profile.json task_type mismatch after bootstrap'
@@ -50,6 +51,10 @@ assert active['task_type'] == expected_task_type, 'active-slice.json task_type m
50
51
  assert active['evaluation_profile'] == expected_eval_profile, 'active-slice.json evaluation_profile mismatch after bootstrap'
51
52
  assert active['implementation_surfaces'] == [], 'active-slice.json should scaffold empty implementation_surfaces'
52
53
  assert active['verification_commands'] == [], 'active-slice.json should scaffold empty verification_commands'
54
+ assert evidence['artifact_type'] == 'completion-verification-evidence', 'verification-evidence.json artifact_type mismatch after bootstrap'
55
+ assert evidence['subject_type'] == 'none', 'verification-evidence.json should scaffold idle subject_type'
56
+ assert evidence['verification_commands'] == [], 'verification-evidence.json should scaffold empty verification_commands'
57
+ assert evidence['outcome'] == 'not_recorded', 'verification-evidence.json should scaffold not_recorded outcome'
53
58
  assert 'Canonical routing profile:' in kickoff, 'kickoff prompt should expose canonical routing profile'
54
59
  assert f'- task_type: {expected_task_type}' in kickoff, 'kickoff prompt missing canonical task_type'
55
60
  assert f'- evaluation_profile: {expected_eval_profile}' in kickoff, 'kickoff prompt missing canonical evaluation_profile'
@@ -167,6 +172,56 @@ active.pop('why_now', None)
167
172
  path.write_text(json.dumps(active, indent=2) + '\n')
168
173
  PY
169
174
 
175
+ python3 - <<'PY'
176
+ import json
177
+ from pathlib import Path
178
+ active = json.loads(Path('.agent/active-slice.json').read_text())
179
+ plan_path = Path('.agent/plan.json')
180
+ plan = json.loads(plan_path.read_text())
181
+ plan['candidate_slices'] = [{
182
+ 'slice_id': active['slice_id'],
183
+ 'goal': active['goal'],
184
+ 'acceptance_criteria': active['acceptance_criteria'],
185
+ 'contract_ids': active['contract_ids'],
186
+ 'priority': 1,
187
+ 'status': 'selected',
188
+ 'why_now': 'smoke test exact handoff',
189
+ 'blocked_on': active['blocked_on'],
190
+ 'evidence': [],
191
+ 'locked_notes': active['locked_notes'],
192
+ 'must_fix_findings': active['must_fix_findings'],
193
+ 'implementation_surfaces': ['extensions/completion/index.ts', '.agent/verify_completion_control_plane.sh'],
194
+ 'verification_commands': ['bash .agent/verify_completion_control_plane.sh', 'npm run smoke-test'],
195
+ 'basis_commit': active['basis_commit'],
196
+ 'remaining_contract_ids_before': active['remaining_contract_ids_before'],
197
+ 'release_blocker_count_before': active['release_blocker_count_before'],
198
+ 'high_value_gap_count_before': active['high_value_gap_count_before'],
199
+ }]
200
+ plan_path.write_text(json.dumps(plan, indent=2) + '\n')
201
+ PY
202
+
203
+ python3 - <<'PY'
204
+ import json
205
+ from pathlib import Path
206
+
207
+ active = json.loads(Path('.agent/active-slice.json').read_text())
208
+ evidence = {
209
+ 'schema_version': 1,
210
+ 'artifact_type': 'completion-verification-evidence',
211
+ 'subject_type': 'selected_slice',
212
+ 'slice_id': active['slice_id'],
213
+ 'goal': active['goal'],
214
+ 'contract_ids': active['contract_ids'],
215
+ 'basis_commit': active['basis_commit'],
216
+ 'head_sha': active['basis_commit'],
217
+ 'verification_commands': ['bash .agent/verify_completion_control_plane.sh', 'npm run smoke-test'],
218
+ 'outcome': 'passed',
219
+ 'recorded_at': '2026-05-03T00:00:00Z',
220
+ 'summary': 'Smoke selected-slice evidence matches the temporary active-slice fixture.',
221
+ }
222
+ Path('.agent/verification-evidence.json').write_text(json.dumps(evidence, indent=2) + '\n')
223
+ PY
224
+
170
225
  if bash .agent/verify_completion_control_plane.sh >/dev/null 2>&1; then
171
226
  echo "expected control-plane verification to fail when selected active-slice omits priority/why_now" >&2
172
227
  exit 1
@@ -101,6 +101,7 @@ Ignored canonical execution-state files:
101
101
  - `.agent/active-slice.json`
102
102
  - `.agent/slice-history.jsonl`
103
103
  - `.agent/stop-check-history.jsonl`
104
+ - `.agent/verification-evidence.json`
104
105
  - `.agent/*.log`
105
106
 
106
107
  ## Canonical Inputs
@@ -115,6 +116,7 @@ Read these when making completion decisions:
115
116
  - `.agent/active-slice.json`
116
117
  - `.agent/slice-history.jsonl`
117
118
  - `.agent/stop-check-history.jsonl`
119
+ - `.agent/verification-evidence.json`
118
120
 
119
121
  Optional context only:
120
122
 
@@ -138,6 +140,7 @@ After context compaction, suspected memory loss, stalled-role recovery, or any a
138
140
  - `.agent/state.json`
139
141
  - `.agent/plan.json`
140
142
  - `.agent/active-slice.json`
143
+ - `.agent/verification-evidence.json`
141
144
 
142
145
  The workflow driver must invoke `completion-regrounder` before continuing whenever any of the following is true:
143
146
 
@@ -152,7 +155,7 @@ The exact implementer handoff now includes implementation-scope surfaces and exp
152
155
 
153
156
  The workflow driver must not continue implementation, review, audit, or stop evaluation from compacted conversation memory alone.
154
157
 
155
- After compaction or recovery, `completion-implementer` must also re-read canonical `.agent/state.json`, `.agent/plan.json`, and `.agent/active-slice.json` before resuming work. If `.agent/active-slice.json` still contains a truthful exact handoff snapshot, continue from canonical state rather than asking the user to resend the original caller payload.
158
+ After compaction or recovery, `completion-implementer` must also re-read canonical `.agent/state.json`, `.agent/plan.json`, `.agent/active-slice.json`, and `.agent/verification-evidence.json` before resuming work. If `.agent/active-slice.json` still contains a truthful exact handoff snapshot, continue from canonical state rather than asking the user to resend the original caller payload.
156
159
 
157
160
  ## Shared Report Header
158
161
 
@@ -17,8 +17,29 @@
17
17
  - `.agent/active-slice.json`
18
18
  - `.agent/slice-history.jsonl`
19
19
  - `.agent/stop-check-history.jsonl`
20
+ - `.agent/verification-evidence.json`
20
21
  - `.agent/*.log`
21
22
 
23
+ ## Canonical Inputs
24
+
25
+ Read these when making completion decisions:
26
+
27
+ - `.agent/mission.md`
28
+ - `.agent/README.md`
29
+ - `.agent/profile.json`
30
+ - `.agent/state.json`
31
+ - `.agent/plan.json`
32
+ - `.agent/active-slice.json`
33
+ - `.agent/slice-history.jsonl`
34
+ - `.agent/stop-check-history.jsonl`
35
+ - `.agent/verification-evidence.json`
36
+
37
+ Optional context only:
38
+
39
+ - `.agent/backlog.md`
40
+ - `.agent/handoff.md`
41
+ - `.agent/compact.md`
42
+
22
43
  ## Scratch Space
23
44
 
24
45
  - Use repo-local `.agent/tmp/` as the default temporary workspace for completion.
@@ -332,6 +353,7 @@ After context compaction, suspected memory loss, stalled-role recovery, or any a
332
353
  - `.agent/state.json`
333
354
  - `.agent/plan.json`
334
355
  - `.agent/active-slice.json`
356
+ - `.agent/verification-evidence.json`
335
357
 
336
358
  The workflow root must invoke `completion-regrounder` before continuing whenever any of the following is true:
337
359
 
@@ -344,6 +366,8 @@ The workflow root must invoke `completion-regrounder` before continuing whenever
344
366
 
345
367
  The workflow root must not continue implementation, review, audit, or stop evaluation from compacted conversation memory alone.
346
368
 
369
+ After compaction or recovery, `completion-implementer` must also re-read canonical `.agent/state.json`, `.agent/plan.json`, `.agent/active-slice.json`, and `.agent/verification-evidence.json` before resuming work. If `.agent/active-slice.json` still contains a truthful exact handoff snapshot, continue from canonical state rather than asking the user to resend the original caller payload.
370
+
347
371
  ## Default Priority Policy
348
372
 
349
373
  `completion-default` ranks candidate slices in this order: