@linimin/pi-letscook 0.1.32 → 0.1.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +26 -0
- package/README.md +33 -11
- package/agents/completion-implementer.md +11 -2
- package/extensions/completion/index.ts +426 -262
- package/extensions/completion/role-reporting.js +107 -20
- package/package.json +2 -1
- package/scripts/active-slice-contract-test.sh +242 -0
- package/scripts/canonical-evidence-artifact-test.sh +348 -0
- package/scripts/context-proposal-test.sh +50 -49
- package/scripts/evaluator-calibration-test.sh +363 -0
- package/scripts/refocus-test.sh +31 -0
- package/scripts/release-check.sh +5 -1
- package/scripts/smoke-test.sh +56 -1
- package/skills/completion-protocol/SKILL.md +4 -1
- package/skills/completion-protocol/references/completion.md +24 -0
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
set -euo pipefail
|
|
3
|
+
|
|
4
|
+
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
|
5
|
+
cd "$ROOT"
|
|
6
|
+
|
|
7
|
+
node <<'NODE'
|
|
8
|
+
const fs = require('node:fs');
|
|
9
|
+
const path = require('node:path');
|
|
10
|
+
const {
|
|
11
|
+
parseReportFields,
|
|
12
|
+
transcribeCanonicalRoleReport,
|
|
13
|
+
} = require('./extensions/completion/role-reporting.js');
|
|
14
|
+
|
|
15
|
+
const read = (file) => fs.readFileSync(file, 'utf8');
|
|
16
|
+
const assertIncludes = (file, snippet) => {
|
|
17
|
+
const text = read(file);
|
|
18
|
+
if (!text.includes(snippet)) {
|
|
19
|
+
throw new Error(`${file} is missing required evaluator-calibration text: ${snippet}`);
|
|
20
|
+
}
|
|
21
|
+
};
|
|
22
|
+
|
|
23
|
+
assertIncludes('package.json', '"evaluator-calibration-test": "bash ./scripts/evaluator-calibration-test.sh"');
|
|
24
|
+
assertIncludes('scripts/release-check.sh', 'npm run evaluator-calibration-test');
|
|
25
|
+
assertIncludes('.agent/verify_completion_stop.sh', 'npm run evaluator-calibration-test >/dev/null');
|
|
26
|
+
assertIncludes('README.md', 'Evaluator calibration now also fails closed on semantically lenient but well-formed reports.');
|
|
27
|
+
assertIncludes('README.md', '`npm run evaluator-calibration-test` drives the packaged transcription path through reviewer yes-with-follow-up, auditor open-contracts-with-`Next mandatory slice: none`, and stop-judge yes-with-open-contracts fixtures while still accepting truthful passing reports.');
|
|
28
|
+
assertIncludes('README.md', 'It also rejects the reproducible `none; ...` bypass family for reviewer follow-up, auditor worktree blockers, and stop-judge open-contract reporting, while still accepting only the exact reviewer routing text `Smallest follow-up slice: none; proceed to completion-auditor.` with terminal punctuation or whitespace only.');
|
|
29
|
+
assertIncludes('README.md', 'includes deterministic active-slice contract coverage plus observability coverage, evaluator calibration, and the rubric-contract regression');
|
|
30
|
+
assertIncludes('CHANGELOG.md', 'added evaluator calibration fixtures for semantically lenient but well-formed reviewer/auditor/stop-judge reports');
|
|
31
|
+
assertIncludes('CHANGELOG.md', 'tightened the reproducible `none; ...` reviewer/auditor/stop-judge bypass checks while still accepting only the exact reviewer `none; proceed to completion-auditor` routing allowance with terminal punctuation or whitespace only');
|
|
32
|
+
assertIncludes('CHANGELOG.md', 'wired `npm run evaluator-calibration-test` into `npm run release-check` and `.agent/verify_completion_stop.sh`');
|
|
33
|
+
assertIncludes('CHANGELOG.md', 'fixed the smoke auto-resume prompt regression');
|
|
34
|
+
assertIncludes('extensions/completion/role-reporting.js', 'Reviewer output cannot mark \'Acceptable as-is: yes\' while naming a follow-up slice other than none.');
|
|
35
|
+
assertIncludes('extensions/completion/role-reporting.js', 'Auditor output cannot mark \'Tracked and unignored worktree is clean: yes\' while listing worktree blockers.');
|
|
36
|
+
assertIncludes('extensions/completion/role-reporting.js', 'Auditor output cannot leave \'Next mandatory slice\' as none while open contracts, blockers, or high-value gaps remain.');
|
|
37
|
+
assertIncludes('extensions/completion/role-reporting.js', 'Stop-judge output cannot mark \'Can the project stop now: yes\' while naming remaining open top-level contract IDs.');
|
|
38
|
+
assertIncludes('extensions/completion/role-reporting.js', 'Stop-judge output cannot mark \'Can the project stop now: yes\' when Blocker count is greater than 0.');
|
|
39
|
+
|
|
40
|
+
const tempRootBase = path.join(process.cwd(), '.agent', 'tmp');
|
|
41
|
+
fs.mkdirSync(tempRootBase, { recursive: true });
|
|
42
|
+
const tempRoot = fs.mkdtempSync(path.join(tempRootBase, 'evaluator-calibration-'));
|
|
43
|
+
const snapshotFiles = {
|
|
44
|
+
sliceHistoryPath: path.join(tempRoot, 'slice-history.jsonl'),
|
|
45
|
+
stopHistoryPath: path.join(tempRoot, 'stop-check-history.jsonl'),
|
|
46
|
+
};
|
|
47
|
+
fs.writeFileSync(snapshotFiles.sliceHistoryPath, '');
|
|
48
|
+
fs.writeFileSync(snapshotFiles.stopHistoryPath, '');
|
|
49
|
+
|
|
50
|
+
const readJsonl = (file) =>
|
|
51
|
+
fs
|
|
52
|
+
.readFileSync(file, 'utf8')
|
|
53
|
+
.split('\n')
|
|
54
|
+
.filter(Boolean)
|
|
55
|
+
.map((line) => JSON.parse(line));
|
|
56
|
+
const assert = (condition, message) => {
|
|
57
|
+
if (!condition) throw new Error(message);
|
|
58
|
+
};
|
|
59
|
+
|
|
60
|
+
const reviewerPass = `MISSION ANCHOR: test mission
|
|
61
|
+
Remaining contract IDs: TEST-CONTRACT
|
|
62
|
+
Rubric:
|
|
63
|
+
- Contract coverage: pass - Locked acceptance criteria match the committed slice.
|
|
64
|
+
- Correctness risk: pass - No blocking regression is evident.
|
|
65
|
+
- Verification evidence: pass - Deterministic proof was rerun successfully.
|
|
66
|
+
- Docs/state parity: pass - Docs and canonical state are aligned.
|
|
67
|
+
Findings: none.
|
|
68
|
+
Acceptable as-is: yes
|
|
69
|
+
Smallest follow-up slice: none; proceed to completion-auditor.`;
|
|
70
|
+
|
|
71
|
+
const reviewerLenient = `MISSION ANCHOR: test mission
|
|
72
|
+
Remaining contract IDs: TEST-CONTRACT
|
|
73
|
+
Rubric:
|
|
74
|
+
- Contract coverage: pass - Locked acceptance criteria match the committed slice.
|
|
75
|
+
- Correctness risk: pass - No blocking regression is evident.
|
|
76
|
+
- Verification evidence: pass - Deterministic proof was rerun successfully.
|
|
77
|
+
- Docs/state parity: pass - Docs and canonical state are aligned.
|
|
78
|
+
Findings: none.
|
|
79
|
+
Acceptable as-is: yes
|
|
80
|
+
Smallest follow-up slice: tighten docs before audit.`;
|
|
81
|
+
|
|
82
|
+
const reviewerNonePrefixedLenient = `MISSION ANCHOR: test mission
|
|
83
|
+
Remaining contract IDs: TEST-CONTRACT
|
|
84
|
+
Rubric:
|
|
85
|
+
- Contract coverage: pass - Locked acceptance criteria match the committed slice.
|
|
86
|
+
- Correctness risk: pass - No blocking regression is evident.
|
|
87
|
+
- Verification evidence: pass - Deterministic proof was rerun successfully.
|
|
88
|
+
- Docs/state parity: pass - Docs and canonical state are aligned.
|
|
89
|
+
Findings: none.
|
|
90
|
+
Acceptable as-is: yes
|
|
91
|
+
Smallest follow-up slice: none; tighten docs before audit.`;
|
|
92
|
+
|
|
93
|
+
const reviewerTrailingTextAfterRoutingLenient = `MISSION ANCHOR: test mission
|
|
94
|
+
Remaining contract IDs: TEST-CONTRACT
|
|
95
|
+
Rubric:
|
|
96
|
+
- Contract coverage: pass - Locked acceptance criteria match the committed slice.
|
|
97
|
+
- Correctness risk: pass - No blocking regression is evident.
|
|
98
|
+
- Verification evidence: pass - Deterministic proof was rerun successfully.
|
|
99
|
+
- Docs/state parity: pass - Docs and canonical state are aligned.
|
|
100
|
+
Findings: none.
|
|
101
|
+
Acceptable as-is: yes
|
|
102
|
+
Smallest follow-up slice: none; proceed to completion-auditor; tighten docs before audit.`;
|
|
103
|
+
|
|
104
|
+
const auditorPass = `MISSION ANCHOR: test mission
|
|
105
|
+
Remaining contract IDs: TEST-CONTRACT
|
|
106
|
+
Rubric:
|
|
107
|
+
- Contract coverage: pass - The accepted slice remains satisfied on HEAD.
|
|
108
|
+
- Correctness risk: concern - One planned contract still keeps the project open.
|
|
109
|
+
- Verification evidence: pass - Verification was rerun for the accepted slice.
|
|
110
|
+
- Docs/state parity: pass - Canonical state can be reconciled truthfully.
|
|
111
|
+
Why the project is still not done: One planned contract remains after this accepted slice.
|
|
112
|
+
Open top-level contract IDs: TEST-CONTRACT
|
|
113
|
+
Blocker count: 0
|
|
114
|
+
High-value gap count: 1
|
|
115
|
+
Tracked and unignored worktree is clean: yes
|
|
116
|
+
Worktree blockers: none
|
|
117
|
+
Next mandatory slice: next-slice
|
|
118
|
+
Stale or conflicting canonical state: no
|
|
119
|
+
Plan truthfully captures remaining slice backlog: yes - one planned slice remains.`;
|
|
120
|
+
|
|
121
|
+
const auditorLenient = `MISSION ANCHOR: test mission
|
|
122
|
+
Remaining contract IDs: TEST-CONTRACT
|
|
123
|
+
Rubric:
|
|
124
|
+
- Contract coverage: pass - The accepted slice remains satisfied on HEAD.
|
|
125
|
+
- Correctness risk: concern - One planned contract still keeps the project open.
|
|
126
|
+
- Verification evidence: pass - Verification was rerun for the accepted slice.
|
|
127
|
+
- Docs/state parity: pass - Canonical state can be reconciled truthfully.
|
|
128
|
+
Why the project is still not done: One planned contract remains after this accepted slice.
|
|
129
|
+
Open top-level contract IDs: TEST-CONTRACT
|
|
130
|
+
Blocker count: 0
|
|
131
|
+
High-value gap count: 1
|
|
132
|
+
Tracked and unignored worktree is clean: yes
|
|
133
|
+
Worktree blockers: modified README.md
|
|
134
|
+
Next mandatory slice: none.
|
|
135
|
+
Stale or conflicting canonical state: no
|
|
136
|
+
Plan truthfully captures remaining slice backlog: yes - one planned slice remains.`;
|
|
137
|
+
|
|
138
|
+
const auditorNonePrefixedLenient = `MISSION ANCHOR: test mission
|
|
139
|
+
Remaining contract IDs: TEST-CONTRACT
|
|
140
|
+
Rubric:
|
|
141
|
+
- Contract coverage: pass - The accepted slice remains satisfied on HEAD.
|
|
142
|
+
- Correctness risk: concern - One planned contract still keeps the project open.
|
|
143
|
+
- Verification evidence: pass - Verification was rerun for the accepted slice.
|
|
144
|
+
- Docs/state parity: pass - Canonical state can be reconciled truthfully.
|
|
145
|
+
Why the project is still not done: One planned contract remains after this accepted slice.
|
|
146
|
+
Open top-level contract IDs: TEST-CONTRACT
|
|
147
|
+
Blocker count: 0
|
|
148
|
+
High-value gap count: 1
|
|
149
|
+
Tracked and unignored worktree is clean: yes
|
|
150
|
+
Worktree blockers: none; modified README.md
|
|
151
|
+
Next mandatory slice: next-slice
|
|
152
|
+
Stale or conflicting canonical state: no
|
|
153
|
+
Plan truthfully captures remaining slice backlog: yes - one planned slice remains.`;
|
|
154
|
+
|
|
155
|
+
const stopJudgePass = `MISSION ANCHOR: test mission
|
|
156
|
+
Remaining contract IDs: none
|
|
157
|
+
Rubric:
|
|
158
|
+
- Contract coverage: pass - All implementation slices are accepted on HEAD.
|
|
159
|
+
- Correctness risk: pass - No remaining blocker or high-value gap is evident.
|
|
160
|
+
- Verification evidence: pass - Final verification passes for the current head.
|
|
161
|
+
- Docs/state parity: pass - Docs, config, and canonical state match shipped behavior.
|
|
162
|
+
Can the project stop now: yes
|
|
163
|
+
Exact remaining open top-level contract IDs: none
|
|
164
|
+
Blocker count: 0
|
|
165
|
+
High-value gap count: 0
|
|
166
|
+
Latest completed slice commit: abcdef1234567890abcdef1234567890abcdef12
|
|
167
|
+
Docs/config/runbooks match shipped behavior: yes
|
|
168
|
+
Tracked and unignored worktree is clean: yes
|
|
169
|
+
Brief justification: Current HEAD satisfies the stop criteria.`;
|
|
170
|
+
|
|
171
|
+
const stopJudgeLenient = `MISSION ANCHOR: test mission
|
|
172
|
+
Remaining contract IDs: none
|
|
173
|
+
Rubric:
|
|
174
|
+
- Contract coverage: pass - All implementation slices are accepted on HEAD.
|
|
175
|
+
- Correctness risk: pass - No additional risk was found.
|
|
176
|
+
- Verification evidence: pass - Final verification passes for the current head.
|
|
177
|
+
- Docs/state parity: pass - Docs, config, and canonical state match shipped behavior.
|
|
178
|
+
Can the project stop now: yes
|
|
179
|
+
Exact remaining open top-level contract IDs: TEST-CONTRACT
|
|
180
|
+
Blocker count: 1
|
|
181
|
+
High-value gap count: 0
|
|
182
|
+
Latest completed slice commit: abcdef1234567890abcdef1234567890abcdef12
|
|
183
|
+
Docs/config/runbooks match shipped behavior: yes
|
|
184
|
+
Tracked and unignored worktree is clean: yes
|
|
185
|
+
Brief justification: This should be rejected because remaining contracts and blockers still exist.`;
|
|
186
|
+
|
|
187
|
+
const stopJudgeNonePrefixedLenient = `MISSION ANCHOR: test mission
|
|
188
|
+
Remaining contract IDs: none
|
|
189
|
+
Rubric:
|
|
190
|
+
- Contract coverage: pass - All implementation slices are accepted on HEAD.
|
|
191
|
+
- Correctness risk: pass - No additional risk was found.
|
|
192
|
+
- Verification evidence: pass - Final verification passes for the current head.
|
|
193
|
+
- Docs/state parity: pass - Docs, config, and canonical state match shipped behavior.
|
|
194
|
+
Can the project stop now: yes
|
|
195
|
+
Exact remaining open top-level contract IDs: none; TEST-CONTRACT
|
|
196
|
+
Blocker count: 0
|
|
197
|
+
High-value gap count: 0
|
|
198
|
+
Latest completed slice commit: abcdef1234567890abcdef1234567890abcdef12
|
|
199
|
+
Docs/config/runbooks match shipped behavior: yes
|
|
200
|
+
Tracked and unignored worktree is clean: yes
|
|
201
|
+
Brief justification: This should be rejected because remaining contracts still exist behind a none-prefixed field.`;
|
|
202
|
+
|
|
203
|
+
(async () => {
|
|
204
|
+
const reviewed = await transcribeCanonicalRoleReport({
|
|
205
|
+
role: 'completion-reviewer',
|
|
206
|
+
output: reviewerPass,
|
|
207
|
+
reportFields: parseReportFields(reviewerPass),
|
|
208
|
+
snapshotFiles,
|
|
209
|
+
headSha: '1111111111111111111111111111111111111111',
|
|
210
|
+
sliceId: 'slice-review',
|
|
211
|
+
recordedAt: 1,
|
|
212
|
+
});
|
|
213
|
+
assert(reviewed.errors.length === 0, `reviewer passing fixture should transcribe cleanly: ${reviewed.errors.join(' | ')}`);
|
|
214
|
+
assert(reviewed.appended.includes('reviewed:slice-review'), 'reviewer passing fixture should append a reviewed record');
|
|
215
|
+
assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 1, 'reviewer passing fixture should create one slice-history record');
|
|
216
|
+
|
|
217
|
+
const reviewerRejected = await transcribeCanonicalRoleReport({
|
|
218
|
+
role: 'completion-reviewer',
|
|
219
|
+
output: reviewerLenient,
|
|
220
|
+
reportFields: parseReportFields(reviewerLenient),
|
|
221
|
+
snapshotFiles,
|
|
222
|
+
headSha: '2222222222222222222222222222222222222222',
|
|
223
|
+
sliceId: 'slice-review',
|
|
224
|
+
recordedAt: 2,
|
|
225
|
+
});
|
|
226
|
+
assert(
|
|
227
|
+
reviewerRejected.errors.some((error) => error.includes('follow-up slice other than none')),
|
|
228
|
+
`reviewer lenient fixture should be rejected for a yes verdict with a follow-up slice: ${reviewerRejected.errors.join(' | ')}`,
|
|
229
|
+
);
|
|
230
|
+
assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 1, 'rejected reviewer fixture must not append history');
|
|
231
|
+
|
|
232
|
+
const reviewerNonePrefixedRejected = await transcribeCanonicalRoleReport({
|
|
233
|
+
role: 'completion-reviewer',
|
|
234
|
+
output: reviewerNonePrefixedLenient,
|
|
235
|
+
reportFields: parseReportFields(reviewerNonePrefixedLenient),
|
|
236
|
+
snapshotFiles,
|
|
237
|
+
headSha: '7777777777777777777777777777777777777777',
|
|
238
|
+
sliceId: 'slice-review',
|
|
239
|
+
recordedAt: 7,
|
|
240
|
+
});
|
|
241
|
+
assert(
|
|
242
|
+
reviewerNonePrefixedRejected.errors.some((error) => error.includes('follow-up slice other than none')),
|
|
243
|
+
`reviewer none-prefixed lenient fixture should be rejected for a yes verdict with contradictory routing text: ${reviewerNonePrefixedRejected.errors.join(' | ')}`,
|
|
244
|
+
);
|
|
245
|
+
assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 1, 'rejected none-prefixed reviewer fixture must not append history');
|
|
246
|
+
|
|
247
|
+
const reviewerTrailingTextAfterRoutingRejected = await transcribeCanonicalRoleReport({
|
|
248
|
+
role: 'completion-reviewer',
|
|
249
|
+
output: reviewerTrailingTextAfterRoutingLenient,
|
|
250
|
+
reportFields: parseReportFields(reviewerTrailingTextAfterRoutingLenient),
|
|
251
|
+
snapshotFiles,
|
|
252
|
+
headSha: '1010101010101010101010101010101010101010',
|
|
253
|
+
sliceId: 'slice-review',
|
|
254
|
+
recordedAt: 10,
|
|
255
|
+
});
|
|
256
|
+
assert(
|
|
257
|
+
reviewerTrailingTextAfterRoutingRejected.errors.some((error) => error.includes('follow-up slice other than none')),
|
|
258
|
+
`reviewer routing-trailing-text fixture should be rejected for extra text after the exact completion-auditor allowance: ${reviewerTrailingTextAfterRoutingRejected.errors.join(' | ')}`,
|
|
259
|
+
);
|
|
260
|
+
assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 1, 'rejected reviewer routing-trailing-text fixture must not append history');
|
|
261
|
+
|
|
262
|
+
const audited = await transcribeCanonicalRoleReport({
|
|
263
|
+
role: 'completion-auditor',
|
|
264
|
+
output: auditorPass,
|
|
265
|
+
reportFields: parseReportFields(auditorPass),
|
|
266
|
+
snapshotFiles,
|
|
267
|
+
headSha: '3333333333333333333333333333333333333333',
|
|
268
|
+
sliceId: 'slice-audit',
|
|
269
|
+
recordedAt: 3,
|
|
270
|
+
});
|
|
271
|
+
assert(audited.errors.length === 0, `auditor passing fixture should transcribe cleanly: ${audited.errors.join(' | ')}`);
|
|
272
|
+
assert(audited.appended.includes('audited:slice-audit'), 'auditor passing fixture should append an audited record');
|
|
273
|
+
assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 2, 'auditor passing fixture should append a second slice-history record');
|
|
274
|
+
|
|
275
|
+
const auditorRejected = await transcribeCanonicalRoleReport({
|
|
276
|
+
role: 'completion-auditor',
|
|
277
|
+
output: auditorLenient,
|
|
278
|
+
reportFields: parseReportFields(auditorLenient),
|
|
279
|
+
snapshotFiles,
|
|
280
|
+
headSha: '4444444444444444444444444444444444444444',
|
|
281
|
+
sliceId: 'slice-audit',
|
|
282
|
+
recordedAt: 4,
|
|
283
|
+
});
|
|
284
|
+
assert(
|
|
285
|
+
auditorRejected.errors.some((error) => error.includes('listing worktree blockers')),
|
|
286
|
+
`auditor lenient fixture should reject clean-yes reports that still list worktree blockers: ${auditorRejected.errors.join(' | ')}`,
|
|
287
|
+
);
|
|
288
|
+
assert(
|
|
289
|
+
auditorRejected.errors.some((error) => error.includes("Next mandatory slice") && error.includes('none')),
|
|
290
|
+
`auditor lenient fixture should reject open-work reports with no next mandatory slice: ${auditorRejected.errors.join(' | ')}`,
|
|
291
|
+
);
|
|
292
|
+
assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 2, 'rejected auditor fixture must not append history');
|
|
293
|
+
|
|
294
|
+
const auditorNonePrefixedRejected = await transcribeCanonicalRoleReport({
|
|
295
|
+
role: 'completion-auditor',
|
|
296
|
+
output: auditorNonePrefixedLenient,
|
|
297
|
+
reportFields: parseReportFields(auditorNonePrefixedLenient),
|
|
298
|
+
snapshotFiles,
|
|
299
|
+
headSha: '8888888888888888888888888888888888888888',
|
|
300
|
+
sliceId: 'slice-audit',
|
|
301
|
+
recordedAt: 8,
|
|
302
|
+
});
|
|
303
|
+
assert(
|
|
304
|
+
auditorNonePrefixedRejected.errors.some((error) => error.includes('listing worktree blockers')),
|
|
305
|
+
`auditor none-prefixed lenient fixture should reject clean-yes reports that smuggle blockers behind none: ${auditorNonePrefixedRejected.errors.join(' | ')}`,
|
|
306
|
+
);
|
|
307
|
+
assert(readJsonl(snapshotFiles.sliceHistoryPath).length === 2, 'rejected none-prefixed auditor fixture must not append history');
|
|
308
|
+
|
|
309
|
+
const judged = await transcribeCanonicalRoleReport({
|
|
310
|
+
role: 'completion-stop-judge',
|
|
311
|
+
output: stopJudgePass,
|
|
312
|
+
reportFields: parseReportFields(stopJudgePass),
|
|
313
|
+
snapshotFiles,
|
|
314
|
+
headSha: '5555555555555555555555555555555555555555',
|
|
315
|
+
recordedAt: 5,
|
|
316
|
+
});
|
|
317
|
+
assert(judged.errors.length === 0, `stop-judge passing fixture should transcribe cleanly: ${judged.errors.join(' | ')}`);
|
|
318
|
+
assert(judged.appended.includes('judgment:555555555555'), 'stop-judge passing fixture should append a judgment record');
|
|
319
|
+
assert(readJsonl(snapshotFiles.stopHistoryPath).length === 1, 'stop-judge passing fixture should create one judgment record');
|
|
320
|
+
|
|
321
|
+
const judgeRejected = await transcribeCanonicalRoleReport({
|
|
322
|
+
role: 'completion-stop-judge',
|
|
323
|
+
output: stopJudgeLenient,
|
|
324
|
+
reportFields: parseReportFields(stopJudgeLenient),
|
|
325
|
+
snapshotFiles,
|
|
326
|
+
headSha: '6666666666666666666666666666666666666666',
|
|
327
|
+
recordedAt: 6,
|
|
328
|
+
});
|
|
329
|
+
assert(
|
|
330
|
+
judgeRejected.errors.some((error) => error.includes('remaining open top-level contract IDs')),
|
|
331
|
+
`stop-judge lenient fixture should reject yes verdicts with open contracts: ${judgeRejected.errors.join(' | ')}`,
|
|
332
|
+
);
|
|
333
|
+
assert(
|
|
334
|
+
judgeRejected.errors.some((error) => error.includes('Blocker count is greater than 0')),
|
|
335
|
+
`stop-judge lenient fixture should reject yes verdicts with blockers: ${judgeRejected.errors.join(' | ')}`,
|
|
336
|
+
);
|
|
337
|
+
assert(readJsonl(snapshotFiles.stopHistoryPath).length === 1, 'rejected stop-judge fixture must not append judgment history');
|
|
338
|
+
|
|
339
|
+
const judgeNonePrefixedRejected = await transcribeCanonicalRoleReport({
|
|
340
|
+
role: 'completion-stop-judge',
|
|
341
|
+
output: stopJudgeNonePrefixedLenient,
|
|
342
|
+
reportFields: parseReportFields(stopJudgeNonePrefixedLenient),
|
|
343
|
+
snapshotFiles,
|
|
344
|
+
headSha: '9999999999999999999999999999999999999999',
|
|
345
|
+
recordedAt: 9,
|
|
346
|
+
});
|
|
347
|
+
assert(
|
|
348
|
+
judgeNonePrefixedRejected.errors.some((error) => error.includes('remaining open top-level contract IDs')),
|
|
349
|
+
`stop-judge none-prefixed lenient fixture should reject yes verdicts with none-prefixed open contracts: ${judgeNonePrefixedRejected.errors.join(' | ')}`,
|
|
350
|
+
);
|
|
351
|
+
assert(readJsonl(snapshotFiles.stopHistoryPath).length === 1, 'rejected none-prefixed stop-judge fixture must not append judgment history');
|
|
352
|
+
|
|
353
|
+
fs.rmSync(tempRoot, { recursive: true, force: true });
|
|
354
|
+
})().catch((error) => {
|
|
355
|
+
try {
|
|
356
|
+
fs.rmSync(tempRoot, { recursive: true, force: true });
|
|
357
|
+
} catch {}
|
|
358
|
+
console.error(error instanceof Error ? error.message : String(error));
|
|
359
|
+
process.exit(1);
|
|
360
|
+
});
|
|
361
|
+
NODE
|
|
362
|
+
|
|
363
|
+
echo "evaluator calibration test passed"
|
package/scripts/refocus-test.sh
CHANGED
|
@@ -34,6 +34,37 @@ print(state['mission_anchor'])
|
|
|
34
34
|
PY
|
|
35
35
|
)"
|
|
36
36
|
|
|
37
|
+
CHOOSER_SNAPSHOT="$TMPDIR/existing-workflow-chooser.json"
|
|
38
|
+
PI_COMPLETION_EXISTING_WORKFLOW_ACTION=cancel \
|
|
39
|
+
PI_COMPLETION_TEST_EXISTING_WORKFLOW_CHOOSER_PATH="$CHOOSER_SNAPSHOT" \
|
|
40
|
+
PI_COMPLETION_SKIP_DRIVER_KICKOFF=1 \
|
|
41
|
+
pi -e "$PKG_ROOT" -p "/cook replacement mission that should stay in the main chat" \
|
|
42
|
+
>/tmp/pi-completion-refocus-cancel.out 2>/tmp/pi-completion-refocus-cancel.err
|
|
43
|
+
|
|
44
|
+
python3 - "$CHOOSER_SNAPSHOT" "/tmp/pi-completion-refocus-cancel.out" "/tmp/pi-completion-refocus-cancel.err" "$INITIAL_MISSION" <<'PY'
|
|
45
|
+
import json
|
|
46
|
+
import sys
|
|
47
|
+
from pathlib import Path
|
|
48
|
+
|
|
49
|
+
chooser = json.loads(Path(sys.argv[1]).read_text())
|
|
50
|
+
output = Path(sys.argv[2]).read_text() + Path(sys.argv[3]).read_text()
|
|
51
|
+
initial_mission = sys.argv[4]
|
|
52
|
+
state = json.loads(Path('.agent/state.json').read_text())
|
|
53
|
+
plan = json.loads(Path('.agent/plan.json').read_text())
|
|
54
|
+
active = json.loads(Path('.agent/active-slice.json').read_text())
|
|
55
|
+
|
|
56
|
+
assert state['mission_anchor'] == initial_mission, 'cancelled chooser should keep the current mission anchor'
|
|
57
|
+
assert plan['mission_anchor'] == initial_mission, 'cancelled chooser should keep plan.json unchanged'
|
|
58
|
+
assert active['mission_anchor'] == initial_mission, 'cancelled chooser should keep active-slice.json unchanged'
|
|
59
|
+
assert chooser['title'].startswith('Existing completion workflow found'), 'chooser snapshot should describe the existing-workflow prompt'
|
|
60
|
+
assert chooser['choices'][0].startswith('Continue current workflow'), 'chooser should keep the continue option'
|
|
61
|
+
assert chooser['choices'][1].startswith('Abandon current workflow and start this new one'), 'chooser should keep the refocus option'
|
|
62
|
+
assert 'Start/Cancel confirmation' in chooser['choices'][1], 'chooser should mention the approval-only replacement confirmation'
|
|
63
|
+
assert chooser['choices'][2].startswith('Cancel'), 'chooser should keep the cancel option'
|
|
64
|
+
assert 'Discuss changes in the main chat and rerun /cook.' in chooser['choices'][2], 'chooser cancel copy should redirect users back to the main chat and rerun /cook'
|
|
65
|
+
assert 'Discuss changes in the main chat and rerun /cook.' in output, 'chooser cancel output should redirect users back to the main chat and rerun /cook'
|
|
66
|
+
PY
|
|
67
|
+
|
|
37
68
|
PI_COMPLETION_EXISTING_WORKFLOW_ACTION=refocus \
|
|
38
69
|
PI_COMPLETION_SKIP_DRIVER_KICKOFF=1 \
|
|
39
70
|
pi -e "$PKG_ROOT" -p "/cook refocused smoke-test mission with tests and docs" \
|
package/scripts/release-check.sh
CHANGED
|
@@ -4,11 +4,15 @@ set -euo pipefail
|
|
|
4
4
|
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
|
|
5
5
|
cd "$ROOT"
|
|
6
6
|
|
|
7
|
-
echo "[release-check] running startup/refocus/context regressions,
|
|
7
|
+
echo "[release-check] running control-plane validation, startup/refocus/context regressions, canonical evidence artifact, active-slice contract, observability, evaluator calibration, and rubric contract coverage"
|
|
8
|
+
bash .agent/verify_completion_control_plane.sh
|
|
8
9
|
npm run smoke-test
|
|
9
10
|
npm run refocus-test
|
|
10
11
|
npm run context-proposal-test
|
|
12
|
+
bash ./scripts/canonical-evidence-artifact-test.sh
|
|
13
|
+
bash ./scripts/active-slice-contract-test.sh
|
|
11
14
|
npm run observability-status-test
|
|
15
|
+
npm run evaluator-calibration-test
|
|
12
16
|
npm run rubric-contract-test
|
|
13
17
|
npm pack --dry-run >/dev/null
|
|
14
18
|
|
package/scripts/smoke-test.sh
CHANGED
|
@@ -19,7 +19,7 @@ PI_COMPLETION_TEST_DRIVER_PROMPT_PATH="$KICKOFF_PROMPT" \
|
|
|
19
19
|
pi -e "$PKG_ROOT" -p "/cook smoke-test mission" \
|
|
20
20
|
>"$TMPDIR/pi-completion-smoke-bootstrap.out" 2>"$TMPDIR/pi-completion-smoke-bootstrap.err"
|
|
21
21
|
|
|
22
|
-
for file in .agent/profile.json .agent/state.json .agent/plan.json .agent/active-slice.json; do
|
|
22
|
+
for file in .agent/profile.json .agent/state.json .agent/plan.json .agent/active-slice.json .agent/verification-evidence.json; do
|
|
23
23
|
[[ -f "$file" ]] || { echo "missing canonical bootstrap file: $file" >&2; exit 1; }
|
|
24
24
|
done
|
|
25
25
|
|
|
@@ -38,6 +38,7 @@ profile = json.loads(Path('.agent/profile.json').read_text())
|
|
|
38
38
|
state = json.loads(Path('.agent/state.json').read_text())
|
|
39
39
|
plan = json.loads(Path('.agent/plan.json').read_text())
|
|
40
40
|
active = json.loads(Path('.agent/active-slice.json').read_text())
|
|
41
|
+
evidence = json.loads(Path('.agent/verification-evidence.json').read_text())
|
|
41
42
|
kickoff = Path(sys.argv[1]).read_text()
|
|
42
43
|
|
|
43
44
|
assert profile['task_type'] == expected_task_type, 'profile.json task_type mismatch after bootstrap'
|
|
@@ -50,6 +51,10 @@ assert active['task_type'] == expected_task_type, 'active-slice.json task_type m
|
|
|
50
51
|
assert active['evaluation_profile'] == expected_eval_profile, 'active-slice.json evaluation_profile mismatch after bootstrap'
|
|
51
52
|
assert active['implementation_surfaces'] == [], 'active-slice.json should scaffold empty implementation_surfaces'
|
|
52
53
|
assert active['verification_commands'] == [], 'active-slice.json should scaffold empty verification_commands'
|
|
54
|
+
assert evidence['artifact_type'] == 'completion-verification-evidence', 'verification-evidence.json artifact_type mismatch after bootstrap'
|
|
55
|
+
assert evidence['subject_type'] == 'none', 'verification-evidence.json should scaffold idle subject_type'
|
|
56
|
+
assert evidence['verification_commands'] == [], 'verification-evidence.json should scaffold empty verification_commands'
|
|
57
|
+
assert evidence['outcome'] == 'not_recorded', 'verification-evidence.json should scaffold not_recorded outcome'
|
|
53
58
|
assert 'Canonical routing profile:' in kickoff, 'kickoff prompt should expose canonical routing profile'
|
|
54
59
|
assert f'- task_type: {expected_task_type}' in kickoff, 'kickoff prompt missing canonical task_type'
|
|
55
60
|
assert f'- evaluation_profile: {expected_eval_profile}' in kickoff, 'kickoff prompt missing canonical evaluation_profile'
|
|
@@ -167,6 +172,56 @@ active.pop('why_now', None)
|
|
|
167
172
|
path.write_text(json.dumps(active, indent=2) + '\n')
|
|
168
173
|
PY
|
|
169
174
|
|
|
175
|
+
python3 - <<'PY'
|
|
176
|
+
import json
|
|
177
|
+
from pathlib import Path
|
|
178
|
+
active = json.loads(Path('.agent/active-slice.json').read_text())
|
|
179
|
+
plan_path = Path('.agent/plan.json')
|
|
180
|
+
plan = json.loads(plan_path.read_text())
|
|
181
|
+
plan['candidate_slices'] = [{
|
|
182
|
+
'slice_id': active['slice_id'],
|
|
183
|
+
'goal': active['goal'],
|
|
184
|
+
'acceptance_criteria': active['acceptance_criteria'],
|
|
185
|
+
'contract_ids': active['contract_ids'],
|
|
186
|
+
'priority': 1,
|
|
187
|
+
'status': 'selected',
|
|
188
|
+
'why_now': 'smoke test exact handoff',
|
|
189
|
+
'blocked_on': active['blocked_on'],
|
|
190
|
+
'evidence': [],
|
|
191
|
+
'locked_notes': active['locked_notes'],
|
|
192
|
+
'must_fix_findings': active['must_fix_findings'],
|
|
193
|
+
'implementation_surfaces': ['extensions/completion/index.ts', '.agent/verify_completion_control_plane.sh'],
|
|
194
|
+
'verification_commands': ['bash .agent/verify_completion_control_plane.sh', 'npm run smoke-test'],
|
|
195
|
+
'basis_commit': active['basis_commit'],
|
|
196
|
+
'remaining_contract_ids_before': active['remaining_contract_ids_before'],
|
|
197
|
+
'release_blocker_count_before': active['release_blocker_count_before'],
|
|
198
|
+
'high_value_gap_count_before': active['high_value_gap_count_before'],
|
|
199
|
+
}]
|
|
200
|
+
plan_path.write_text(json.dumps(plan, indent=2) + '\n')
|
|
201
|
+
PY
|
|
202
|
+
|
|
203
|
+
python3 - <<'PY'
|
|
204
|
+
import json
|
|
205
|
+
from pathlib import Path
|
|
206
|
+
|
|
207
|
+
active = json.loads(Path('.agent/active-slice.json').read_text())
|
|
208
|
+
evidence = {
|
|
209
|
+
'schema_version': 1,
|
|
210
|
+
'artifact_type': 'completion-verification-evidence',
|
|
211
|
+
'subject_type': 'selected_slice',
|
|
212
|
+
'slice_id': active['slice_id'],
|
|
213
|
+
'goal': active['goal'],
|
|
214
|
+
'contract_ids': active['contract_ids'],
|
|
215
|
+
'basis_commit': active['basis_commit'],
|
|
216
|
+
'head_sha': active['basis_commit'],
|
|
217
|
+
'verification_commands': ['bash .agent/verify_completion_control_plane.sh', 'npm run smoke-test'],
|
|
218
|
+
'outcome': 'passed',
|
|
219
|
+
'recorded_at': '2026-05-03T00:00:00Z',
|
|
220
|
+
'summary': 'Smoke selected-slice evidence matches the temporary active-slice fixture.',
|
|
221
|
+
}
|
|
222
|
+
Path('.agent/verification-evidence.json').write_text(json.dumps(evidence, indent=2) + '\n')
|
|
223
|
+
PY
|
|
224
|
+
|
|
170
225
|
if bash .agent/verify_completion_control_plane.sh >/dev/null 2>&1; then
|
|
171
226
|
echo "expected control-plane verification to fail when selected active-slice omits priority/why_now" >&2
|
|
172
227
|
exit 1
|
|
@@ -101,6 +101,7 @@ Ignored canonical execution-state files:
|
|
|
101
101
|
- `.agent/active-slice.json`
|
|
102
102
|
- `.agent/slice-history.jsonl`
|
|
103
103
|
- `.agent/stop-check-history.jsonl`
|
|
104
|
+
- `.agent/verification-evidence.json`
|
|
104
105
|
- `.agent/*.log`
|
|
105
106
|
|
|
106
107
|
## Canonical Inputs
|
|
@@ -115,6 +116,7 @@ Read these when making completion decisions:
|
|
|
115
116
|
- `.agent/active-slice.json`
|
|
116
117
|
- `.agent/slice-history.jsonl`
|
|
117
118
|
- `.agent/stop-check-history.jsonl`
|
|
119
|
+
- `.agent/verification-evidence.json`
|
|
118
120
|
|
|
119
121
|
Optional context only:
|
|
120
122
|
|
|
@@ -138,6 +140,7 @@ After context compaction, suspected memory loss, stalled-role recovery, or any a
|
|
|
138
140
|
- `.agent/state.json`
|
|
139
141
|
- `.agent/plan.json`
|
|
140
142
|
- `.agent/active-slice.json`
|
|
143
|
+
- `.agent/verification-evidence.json`
|
|
141
144
|
|
|
142
145
|
The workflow driver must invoke `completion-regrounder` before continuing whenever any of the following is true:
|
|
143
146
|
|
|
@@ -152,7 +155,7 @@ The exact implementer handoff now includes implementation-scope surfaces and exp
|
|
|
152
155
|
|
|
153
156
|
The workflow driver must not continue implementation, review, audit, or stop evaluation from compacted conversation memory alone.
|
|
154
157
|
|
|
155
|
-
After compaction or recovery, `completion-implementer` must also re-read canonical `.agent/state.json`, `.agent/plan.json`,
|
|
158
|
+
After compaction or recovery, `completion-implementer` must also re-read canonical `.agent/state.json`, `.agent/plan.json`, `.agent/active-slice.json`, and `.agent/verification-evidence.json` before resuming work. If `.agent/active-slice.json` still contains a truthful exact handoff snapshot, continue from canonical state rather than asking the user to resend the original caller payload.
|
|
156
159
|
|
|
157
160
|
## Shared Report Header
|
|
158
161
|
|
|
@@ -17,8 +17,29 @@
|
|
|
17
17
|
- `.agent/active-slice.json`
|
|
18
18
|
- `.agent/slice-history.jsonl`
|
|
19
19
|
- `.agent/stop-check-history.jsonl`
|
|
20
|
+
- `.agent/verification-evidence.json`
|
|
20
21
|
- `.agent/*.log`
|
|
21
22
|
|
|
23
|
+
## Canonical Inputs
|
|
24
|
+
|
|
25
|
+
Read these when making completion decisions:
|
|
26
|
+
|
|
27
|
+
- `.agent/mission.md`
|
|
28
|
+
- `.agent/README.md`
|
|
29
|
+
- `.agent/profile.json`
|
|
30
|
+
- `.agent/state.json`
|
|
31
|
+
- `.agent/plan.json`
|
|
32
|
+
- `.agent/active-slice.json`
|
|
33
|
+
- `.agent/slice-history.jsonl`
|
|
34
|
+
- `.agent/stop-check-history.jsonl`
|
|
35
|
+
- `.agent/verification-evidence.json`
|
|
36
|
+
|
|
37
|
+
Optional context only:
|
|
38
|
+
|
|
39
|
+
- `.agent/backlog.md`
|
|
40
|
+
- `.agent/handoff.md`
|
|
41
|
+
- `.agent/compact.md`
|
|
42
|
+
|
|
22
43
|
## Scratch Space
|
|
23
44
|
|
|
24
45
|
- Use repo-local `.agent/tmp/` as the default temporary workspace for completion.
|
|
@@ -332,6 +353,7 @@ After context compaction, suspected memory loss, stalled-role recovery, or any a
|
|
|
332
353
|
- `.agent/state.json`
|
|
333
354
|
- `.agent/plan.json`
|
|
334
355
|
- `.agent/active-slice.json`
|
|
356
|
+
- `.agent/verification-evidence.json`
|
|
335
357
|
|
|
336
358
|
The workflow root must invoke `completion-regrounder` before continuing whenever any of the following is true:
|
|
337
359
|
|
|
@@ -344,6 +366,8 @@ The workflow root must invoke `completion-regrounder` before continuing whenever
|
|
|
344
366
|
|
|
345
367
|
The workflow root must not continue implementation, review, audit, or stop evaluation from compacted conversation memory alone.
|
|
346
368
|
|
|
369
|
+
After compaction or recovery, `completion-implementer` must also re-read canonical `.agent/state.json`, `.agent/plan.json`, `.agent/active-slice.json`, and `.agent/verification-evidence.json` before resuming work. If `.agent/active-slice.json` still contains a truthful exact handoff snapshot, continue from canonical state rather than asking the user to resend the original caller payload.
|
|
370
|
+
|
|
347
371
|
## Default Priority Policy
|
|
348
372
|
|
|
349
373
|
`completion-default` ranks candidate slices in this order:
|