@besales/ops-framework 0.1.31 → 0.1.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +7 -0
- package/README.md +12 -1
- package/bin/lib/review-budget-utils.mjs +106 -0
- package/bin/lib/review-budget-utils.test.mjs +55 -0
- package/bin/providers/external-cli-checker.mjs +3 -1
- package/bin/run-check.mjs +61 -4
- package/bin/run-verify.mjs +55 -2
- package/config/default-agents.json +4 -2
- package/package.json +1 -1
- package/prompts/checker.md +1 -0
- package/prompts/planner.md +1 -1
- package/prompts/supervisor.md +2 -1
- package/prompts/verifier.md +2 -1
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.1.32
|
|
4
|
+
|
|
5
|
+
- Required explicit `review-budget-approval.json` before `--force-review-budget` can bypass Check/Verify review budgets.
|
|
6
|
+
- Added external CLI provider timeouts from `reviewBudgets.*.providerTimeoutMs` so Check/Verify provider calls cannot silently exceed the stage SLA.
|
|
7
|
+
- Recorded denied force attempts in Check/Verify timelines as `review_budget_force_denied`.
|
|
8
|
+
- Updated review budget docs and prompts so force review is treated as a human-approved exception, not a normal retry path.
|
|
9
|
+
|
|
3
10
|
## 0.1.31
|
|
4
11
|
|
|
5
12
|
- Added bounded review budgets for Check and Verify: default 3 minute stage SLA and one external provider run per stage.
|
package/README.md
CHANGED
|
@@ -194,11 +194,22 @@ External `run-check` and `run-verify` are bounded by default:
|
|
|
194
194
|
|
|
195
195
|
- stage SLA: `180000ms`;
|
|
196
196
|
- max external provider runs per stage: `1`;
|
|
197
|
+
- external provider timeout: `180000ms`.
|
|
197
198
|
|
|
198
199
|
When the budget is exceeded, the framework writes `human_arbitration_required`
|
|
199
200
|
instead of starting another provider loop. Consolidate the remaining findings in
|
|
200
201
|
task artifacts, or rerun with `--force-review-budget` only after explicit human
|
|
201
|
-
approval.
|
|
202
|
+
approval recorded in `review-budget-approval.json`:
|
|
203
|
+
|
|
204
|
+
```json
|
|
205
|
+
{
|
|
206
|
+
"approved": true,
|
|
207
|
+
"stage": "check",
|
|
208
|
+
"reason": "Human approved one extra external review after consolidated remediation.",
|
|
209
|
+
"approvedBy": "human",
|
|
210
|
+
"expiresAt": "2026-06-05T12:00:00.000Z"
|
|
211
|
+
}
|
|
212
|
+
```
|
|
202
213
|
|
|
203
214
|
## Learning Loop
|
|
204
215
|
|
|
@@ -1,7 +1,11 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
|
|
1
4
|
export function resolveStageReviewBudget(config, stage) {
|
|
2
5
|
const defaults = {
|
|
3
6
|
stageSlaMs: 180000,
|
|
4
7
|
maxExternalRunsPerStage: 1,
|
|
8
|
+
providerTimeoutMs: 180000,
|
|
5
9
|
};
|
|
6
10
|
const reviewBudgets = config.reviewBudgets || {};
|
|
7
11
|
return {
|
|
@@ -68,6 +72,108 @@ export function evaluateReviewBudget({ budget, summary, force = false }) {
|
|
|
68
72
|
return { ok: true, reason: null };
|
|
69
73
|
}
|
|
70
74
|
|
|
75
|
+
const REVIEW_BUDGET_APPROVAL_FILE = 'review-budget-approval.json';

/**
 * Builds the structured "force review denied" result.
 *
 * @param {string} reason - Machine-readable denial code.
 * @param {string} message - Human-readable explanation.
 * @param {unknown} [approval] - Parsed approval artifact; the key is omitted
 *   from the result when the artifact could not be read or parsed.
 * @returns {{ ok: false, reason: string, message: string, path: string, approval?: unknown }}
 */
function denyReviewBudgetApproval(reason, message, approval) {
  const denial = {
    ok: false,
    reason,
    message,
    path: REVIEW_BUDGET_APPROVAL_FILE,
  };
  if (approval !== undefined) {
    denial.approval = approval;
  }
  return denial;
}

/**
 * Reads `review-budget-approval.json` from the task directory and validates
 * that it authorizes a forced review for the given stage.
 *
 * The artifact must be a JSON object with `approved === true`, a `stage` of
 * `check`, `verify` or `both` that covers the requested stage, a non-empty
 * string `reason`, and a non-empty string `approvedBy`. When `expiresAt` is
 * present (truthy) it must parse as a date strictly later than `now`.
 *
 * This function never throws for a malformed artifact: every failure is
 * reported as `{ ok: false, reason, message, path[, approval] }`.
 *
 * @param {object} options
 * @param {string} options.taskDir - Directory that holds the approval artifact.
 * @param {string} options.stage - Stage requesting the force (`check` or `verify`).
 * @param {Date} [options.now] - Reference time for the expiry comparison.
 * @returns {{ ok: boolean, path: string, reason?: string, message?: string, approval?: unknown }}
 */
export function readReviewBudgetApproval({ taskDir, stage, now = new Date() }) {
  const approvalPath = path.join(taskDir, REVIEW_BUDGET_APPROVAL_FILE);
  if (!fs.existsSync(approvalPath)) {
    return denyReviewBudgetApproval(
      'missing_review_budget_approval',
      'Using --force-review-budget requires review-budget-approval.json with approved=true.',
    );
  }

  let approval;
  try {
    approval = JSON.parse(fs.readFileSync(approvalPath, 'utf8'));
  } catch (error) {
    return denyReviewBudgetApproval(
      'invalid_review_budget_approval',
      `review-budget-approval.json is invalid JSON: ${error.message}`,
    );
  }

  // JSON.parse happily returns null, primitives and arrays; without this guard
  // a file containing `null` would throw a TypeError on the property reads below.
  if (approval === null || typeof approval !== 'object' || Array.isArray(approval)) {
    return denyReviewBudgetApproval(
      'invalid_review_budget_approval',
      'review-budget-approval.json must contain a JSON object.',
    );
  }

  if (approval.approved !== true) {
    return denyReviewBudgetApproval(
      'review_budget_approval_not_approved',
      'review-budget-approval.json must contain approved=true.',
      approval,
    );
  }

  if (!['check', 'verify', 'both'].includes(String(approval.stage || ''))) {
    return denyReviewBudgetApproval(
      'review_budget_approval_stage_invalid',
      'review-budget-approval.json stage must be check, verify or both.',
      approval,
    );
  }

  if (approval.stage !== stage && approval.stage !== 'both') {
    return denyReviewBudgetApproval(
      'review_budget_approval_stage_mismatch',
      `review-budget-approval.json is for stage=${approval.stage}, not ${stage}.`,
      approval,
    );
  }

  if (!approval.reason || typeof approval.reason !== 'string') {
    return denyReviewBudgetApproval(
      'review_budget_approval_reason_missing',
      'review-budget-approval.json must contain a human-readable reason.',
      approval,
    );
  }

  if (!approval.approvedBy || typeof approval.approvedBy !== 'string') {
    return denyReviewBudgetApproval(
      'review_budget_approval_approver_missing',
      'review-budget-approval.json must contain approvedBy.',
      approval,
    );
  }

  // expiresAt is optional; an absent (falsy) value means the approval does not expire.
  if (approval.expiresAt) {
    const expiresAt = new Date(approval.expiresAt);
    if (Number.isNaN(expiresAt.getTime())) {
      return denyReviewBudgetApproval(
        'review_budget_approval_expiry_invalid',
        'review-budget-approval.json expiresAt must be a valid ISO timestamp when present.',
        approval,
      );
    }
    // An approval expiring exactly at `now` is already treated as expired.
    if (expiresAt.getTime() <= now.getTime()) {
      return denyReviewBudgetApproval(
        'review_budget_approval_expired',
        `review-budget-approval.json expired at ${expiresAt.toISOString()}.`,
        approval,
      );
    }
  }

  return {
    ok: true,
    path: REVIEW_BUDGET_APPROVAL_FILE,
    approval,
  };
}
|
|
176
|
+
|
|
71
177
|
function firstValidDate(values) {
|
|
72
178
|
for (const value of values) {
|
|
73
179
|
const date = new Date(value);
|
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
import { describe, expect, it } from 'vitest';
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import os from 'node:os';
|
|
4
|
+
import path from 'node:path';
|
|
2
5
|
import {
|
|
3
6
|
evaluateReviewBudget,
|
|
7
|
+
readReviewBudgetApproval,
|
|
4
8
|
resolveStageReviewBudget,
|
|
5
9
|
summarizeReviewBudgetWindow,
|
|
6
10
|
} from './review-budget-utils.mjs';
|
|
@@ -80,11 +84,62 @@ describe('review budget utils', () => {
|
|
|
80
84
|
verify: {
|
|
81
85
|
stageSlaMs: 120000,
|
|
82
86
|
maxExternalRunsPerStage: 2,
|
|
87
|
+
providerTimeoutMs: 90000,
|
|
83
88
|
},
|
|
84
89
|
},
|
|
85
90
|
}, 'verify')).toEqual({
|
|
86
91
|
stageSlaMs: 120000,
|
|
87
92
|
maxExternalRunsPerStage: 2,
|
|
93
|
+
providerTimeoutMs: 90000,
|
|
94
|
+
});
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
it('requires a valid force approval artifact', () => {
|
|
98
|
+
const taskDir = fs.mkdtempSync(path.join(os.tmpdir(), 'review-budget-approval-'));
|
|
99
|
+
|
|
100
|
+
expect(readReviewBudgetApproval({ taskDir, stage: 'verify' })).toMatchObject({
|
|
101
|
+
ok: false,
|
|
102
|
+
reason: 'missing_review_budget_approval',
|
|
103
|
+
});
|
|
104
|
+
|
|
105
|
+
fs.writeFileSync(path.join(taskDir, 'review-budget-approval.json'), JSON.stringify({
|
|
106
|
+
approved: true,
|
|
107
|
+
stage: 'verify',
|
|
108
|
+
reason: 'Human approved one extra external Verify after consolidated remediation.',
|
|
109
|
+
approvedBy: 'human',
|
|
110
|
+
expiresAt: '2026-06-04T12:30:00.000Z',
|
|
111
|
+
}, null, 2));
|
|
112
|
+
|
|
113
|
+
expect(readReviewBudgetApproval({
|
|
114
|
+
taskDir,
|
|
115
|
+
stage: 'verify',
|
|
116
|
+
now: new Date('2026-06-04T12:00:00.000Z'),
|
|
117
|
+
})).toMatchObject({
|
|
118
|
+
ok: true,
|
|
119
|
+
approval: {
|
|
120
|
+
approved: true,
|
|
121
|
+
stage: 'verify',
|
|
122
|
+
},
|
|
123
|
+
});
|
|
124
|
+
});
|
|
125
|
+
|
|
126
|
+
it('rejects expired force approval artifacts', () => {
|
|
127
|
+
const taskDir = fs.mkdtempSync(path.join(os.tmpdir(), 'review-budget-expired-'));
|
|
128
|
+
fs.writeFileSync(path.join(taskDir, 'review-budget-approval.json'), JSON.stringify({
|
|
129
|
+
approved: true,
|
|
130
|
+
stage: 'both',
|
|
131
|
+
reason: 'Expired approval',
|
|
132
|
+
approvedBy: 'human',
|
|
133
|
+
expiresAt: '2026-06-04T12:00:00.000Z',
|
|
134
|
+
}, null, 2));
|
|
135
|
+
|
|
136
|
+
expect(readReviewBudgetApproval({
|
|
137
|
+
taskDir,
|
|
138
|
+
stage: 'check',
|
|
139
|
+
now: new Date('2026-06-04T12:00:01.000Z'),
|
|
140
|
+
})).toMatchObject({
|
|
141
|
+
ok: false,
|
|
142
|
+
reason: 'review_budget_approval_expired',
|
|
88
143
|
});
|
|
89
144
|
});
|
|
90
145
|
});
|
|
@@ -47,6 +47,7 @@ export async function runExternalCliChecker({
|
|
|
47
47
|
reasoningEffort,
|
|
48
48
|
prompt,
|
|
49
49
|
cwd,
|
|
50
|
+
timeoutMs,
|
|
50
51
|
}) {
|
|
51
52
|
if (!providerConfig) {
|
|
52
53
|
const error = new Error(`Unknown external CLI provider: ${providerName}`);
|
|
@@ -68,10 +69,11 @@ export async function runExternalCliChecker({
|
|
|
68
69
|
input: providerConfig.input === 'stdin' ? prompt : undefined,
|
|
69
70
|
encoding: 'utf8',
|
|
70
71
|
maxBuffer: 1024 * 1024 * 20,
|
|
72
|
+
timeout: timeoutMs || undefined,
|
|
71
73
|
});
|
|
72
74
|
|
|
73
75
|
if (result.error) {
|
|
74
|
-
result.error.failureReason = 'provider_unavailable';
|
|
76
|
+
result.error.failureReason = result.error.code === 'ETIMEDOUT' ? 'timeout' : 'provider_unavailable';
|
|
75
77
|
throw result.error;
|
|
76
78
|
}
|
|
77
79
|
if (result.status !== 0) {
|
package/bin/run-check.mjs
CHANGED
|
@@ -47,6 +47,7 @@ import {
|
|
|
47
47
|
} from './lib/task-manifest-utils.mjs';
|
|
48
48
|
import {
|
|
49
49
|
evaluateReviewBudget,
|
|
50
|
+
readReviewBudgetApproval,
|
|
50
51
|
resolveStageReviewBudget,
|
|
51
52
|
summarizeReviewBudgetWindow,
|
|
52
53
|
} from './lib/review-budget-utils.mjs';
|
|
@@ -239,6 +240,57 @@ async function runMain() {
|
|
|
239
240
|
return;
|
|
240
241
|
}
|
|
241
242
|
|
|
243
|
+
const forceApproval = forceReviewBudget
|
|
244
|
+
? readReviewBudgetApproval({ taskDir, stage: 'check' })
|
|
245
|
+
: { ok: true, approval: null };
|
|
246
|
+
if (!forceApproval.ok) {
|
|
247
|
+
const budget = resolveStageReviewBudget(readAgentsConfig(), 'check');
|
|
248
|
+
const summary = summarizeReviewBudgetWindow({
|
|
249
|
+
timeline: readTimeline(taskDir, 'check-timeline.json'),
|
|
250
|
+
stage: 'check',
|
|
251
|
+
now: new Date(),
|
|
252
|
+
});
|
|
253
|
+
writeReviewBudgetReturn({
|
|
254
|
+
taskDir,
|
|
255
|
+
taskId,
|
|
256
|
+
checkContext,
|
|
257
|
+
checkerConfig,
|
|
258
|
+
checkerPromptSha,
|
|
259
|
+
cacheKey,
|
|
260
|
+
reason: forceApproval.reason,
|
|
261
|
+
message: forceApproval.message,
|
|
262
|
+
budget,
|
|
263
|
+
summary,
|
|
264
|
+
startedAt: runStartedAt,
|
|
265
|
+
approval: forceApproval,
|
|
266
|
+
});
|
|
267
|
+
appendCheckTimeline(taskDir, {
|
|
268
|
+
event: 'review_budget_force_denied',
|
|
269
|
+
verdict: 'human_arbitration_required',
|
|
270
|
+
reason: forceApproval.reason,
|
|
271
|
+
message: forceApproval.message,
|
|
272
|
+
budget,
|
|
273
|
+
summary,
|
|
274
|
+
timing: buildTiming(runStartedAt),
|
|
275
|
+
});
|
|
276
|
+
recordLlmInputUsage({
|
|
277
|
+
taskDir,
|
|
278
|
+
stage: 'check',
|
|
279
|
+
packMeta: promptPayload.pack.meta,
|
|
280
|
+
attempts: [
|
|
281
|
+
...llmInputAttempts,
|
|
282
|
+
buildAttemptRecord(promptPayload.pack.meta, `review_budget_force_denied:${forceApproval.reason}`),
|
|
283
|
+
],
|
|
284
|
+
rerunCount,
|
|
285
|
+
timing: buildTiming(runStartedAt),
|
|
286
|
+
});
|
|
287
|
+
refreshTaskManifestAfterCheck(taskDir);
|
|
288
|
+
runValidator(taskArg);
|
|
289
|
+
console.log(`Checker force review denied ${taskId}: human_arbitration_required`);
|
|
290
|
+
console.log(`- reason: ${forceApproval.reason}`);
|
|
291
|
+
return;
|
|
292
|
+
}
|
|
293
|
+
|
|
242
294
|
const reviewBudget = evaluateCurrentReviewBudget({
|
|
243
295
|
taskDir,
|
|
244
296
|
stage: 'check',
|
|
@@ -258,6 +310,7 @@ async function runMain() {
|
|
|
258
310
|
budget: reviewBudget.budget,
|
|
259
311
|
summary: reviewBudget.summary,
|
|
260
312
|
startedAt: runStartedAt,
|
|
313
|
+
approval: forceApproval,
|
|
261
314
|
});
|
|
262
315
|
appendCheckTimeline(taskDir, {
|
|
263
316
|
event: 'review_budget_blocked',
|
|
@@ -304,6 +357,7 @@ async function runMain() {
|
|
|
304
357
|
checkerConfig,
|
|
305
358
|
messages: promptPayload.messages,
|
|
306
359
|
prompt: promptPayload.prompt,
|
|
360
|
+
timeoutMs: reviewBudget.budget.providerTimeoutMs,
|
|
307
361
|
});
|
|
308
362
|
appendCheckTimeline(taskDir, {
|
|
309
363
|
event: 'provider_completed',
|
|
@@ -434,6 +488,7 @@ function writeReviewBudgetReturn({
|
|
|
434
488
|
budget,
|
|
435
489
|
summary,
|
|
436
490
|
startedAt,
|
|
491
|
+
approval = null,
|
|
437
492
|
}) {
|
|
438
493
|
const result = {
|
|
439
494
|
taskId,
|
|
@@ -468,6 +523,7 @@ function writeReviewBudgetReturn({
|
|
|
468
523
|
reason,
|
|
469
524
|
budget,
|
|
470
525
|
summary,
|
|
526
|
+
approval,
|
|
471
527
|
forceFlag: '--force-review-budget',
|
|
472
528
|
},
|
|
473
529
|
readyForHumanGate: false,
|
|
@@ -489,13 +545,13 @@ function writeReviewBudgetReturn({
|
|
|
489
545
|
'## Budget',
|
|
490
546
|
'',
|
|
491
547
|
'```json',
|
|
492
|
-
JSON.stringify({ reason, budget, summary }, null, 2),
|
|
548
|
+
JSON.stringify({ reason, budget, summary, approval }, null, 2),
|
|
493
549
|
'```',
|
|
494
550
|
'',
|
|
495
551
|
'## Required decision',
|
|
496
552
|
'',
|
|
497
553
|
'- Consolidate all remaining Check findings into `plan.md`, `status.md`, and `check-resolution.md`, then run one fresh Check after the window resets; or',
|
|
498
|
-
'- Ask the human to approve an extra external review
|
|
554
|
+
'- Ask the human to approve an extra external review by writing `review-budget-approval.json`, then rerun with `--force-review-budget`.',
|
|
499
555
|
'',
|
|
500
556
|
'## Timing',
|
|
501
557
|
'',
|
|
@@ -510,7 +566,7 @@ function writeReviewBudgetReturn({
|
|
|
510
566
|
checkVerdict: '`human_arbitration_required`',
|
|
511
567
|
checkResult: '- `check.result.json`: current; review budget blocked external Checker invocation',
|
|
512
568
|
supervisorAction: 'Check review budget blocked another external provider loop.',
|
|
513
|
-
nextStep: 'Human Arbitration:
|
|
569
|
+
nextStep: 'Human Arbitration: write `review-budget-approval.json` before using `--force-review-budget`, or consolidate remaining findings before a fresh Check.',
|
|
514
570
|
humanApproval: 'yes',
|
|
515
571
|
});
|
|
516
572
|
appendOrchestrationLog(taskDir, `Check review budget blocked external checker; reason=${reason}; elapsedMs=${summary.elapsedMs}; providerStarted=${summary.providerStarted}; maxExternalRuns=${budget.maxExternalRunsPerStage}; stageSlaMs=${budget.stageSlaMs}`);
|
|
@@ -927,7 +983,7 @@ function buildCheckerPromptPayload({
|
|
|
927
983
|
};
|
|
928
984
|
}
|
|
929
985
|
|
|
930
|
-
async function runProvider({ checkerConfig, messages, prompt }) {
|
|
986
|
+
async function runProvider({ checkerConfig, messages, prompt, timeoutMs }) {
|
|
931
987
|
if (checkerConfig.provider === 'openai') {
|
|
932
988
|
return runOpenAiChecker({
|
|
933
989
|
apiKey: process.env.OPENAI_API_KEY,
|
|
@@ -944,6 +1000,7 @@ async function runProvider({ checkerConfig, messages, prompt }) {
|
|
|
944
1000
|
reasoningEffort: checkerConfig.reasoningEffort,
|
|
945
1001
|
prompt,
|
|
946
1002
|
cwd: repoRoot,
|
|
1003
|
+
timeoutMs,
|
|
947
1004
|
});
|
|
948
1005
|
}
|
|
949
1006
|
|
package/bin/run-verify.mjs
CHANGED
|
@@ -30,6 +30,7 @@ import {
|
|
|
30
30
|
import { recordLlmInputUsage } from './lib/task-manifest-utils.mjs';
|
|
31
31
|
import {
|
|
32
32
|
evaluateReviewBudget,
|
|
33
|
+
readReviewBudgetApproval,
|
|
33
34
|
resolveStageReviewBudget,
|
|
34
35
|
summarizeReviewBudgetWindow,
|
|
35
36
|
} from './lib/review-budget-utils.mjs';
|
|
@@ -210,6 +211,54 @@ async function runMain() {
|
|
|
210
211
|
return;
|
|
211
212
|
}
|
|
212
213
|
|
|
214
|
+
const forceApproval = forceReviewBudget
|
|
215
|
+
? readReviewBudgetApproval({ taskDir, stage: 'verify' })
|
|
216
|
+
: { ok: true, approval: null };
|
|
217
|
+
if (!forceApproval.ok) {
|
|
218
|
+
const budget = resolveStageReviewBudget(readAgentsConfig(), 'verify');
|
|
219
|
+
const summary = summarizeReviewBudgetWindow({
|
|
220
|
+
timeline: readTimeline(taskDir, 'verify-timeline.json'),
|
|
221
|
+
stage: 'verify',
|
|
222
|
+
now: new Date(),
|
|
223
|
+
});
|
|
224
|
+
writeVerifyReviewBudgetReturn({
|
|
225
|
+
taskDir,
|
|
226
|
+
taskId,
|
|
227
|
+
verifierConfig,
|
|
228
|
+
verifierRunId,
|
|
229
|
+
planSha,
|
|
230
|
+
executionSha,
|
|
231
|
+
reason: forceApproval.reason,
|
|
232
|
+
message: forceApproval.message,
|
|
233
|
+
budget,
|
|
234
|
+
summary,
|
|
235
|
+
approval: forceApproval,
|
|
236
|
+
});
|
|
237
|
+
appendVerifyTimeline(taskDir, {
|
|
238
|
+
event: 'review_budget_force_denied',
|
|
239
|
+
verdict: 'human_arbitration_required',
|
|
240
|
+
reason: forceApproval.reason,
|
|
241
|
+
message: forceApproval.message,
|
|
242
|
+
budget,
|
|
243
|
+
summary,
|
|
244
|
+
timing: buildTiming(runStartedAt),
|
|
245
|
+
});
|
|
246
|
+
recordLlmInputUsage({
|
|
247
|
+
taskDir,
|
|
248
|
+
stage: 'verify',
|
|
249
|
+
packMeta: promptPayload.pack.meta,
|
|
250
|
+
attempts: [
|
|
251
|
+
...llmInputAttempts,
|
|
252
|
+
buildAttemptRecord(promptPayload.pack.meta, `review_budget_force_denied:${forceApproval.reason}`),
|
|
253
|
+
],
|
|
254
|
+
rerunCount,
|
|
255
|
+
timing: buildTiming(runStartedAt),
|
|
256
|
+
});
|
|
257
|
+
console.log(`Verifier force review denied ${taskId}: human_arbitration_required`);
|
|
258
|
+
console.log(`- reason: ${forceApproval.reason}`);
|
|
259
|
+
return;
|
|
260
|
+
}
|
|
261
|
+
|
|
213
262
|
const reviewBudget = evaluateCurrentReviewBudget({
|
|
214
263
|
taskDir,
|
|
215
264
|
stage: 'verify',
|
|
@@ -228,6 +277,7 @@ async function runMain() {
|
|
|
228
277
|
message: reviewBudget.message,
|
|
229
278
|
budget: reviewBudget.budget,
|
|
230
279
|
summary: reviewBudget.summary,
|
|
280
|
+
approval: forceApproval,
|
|
231
281
|
});
|
|
232
282
|
appendVerifyTimeline(taskDir, {
|
|
233
283
|
event: 'review_budget_blocked',
|
|
@@ -275,6 +325,7 @@ async function runMain() {
|
|
|
275
325
|
reasoningEffort: verifierConfig.reasoningEffort,
|
|
276
326
|
prompt: promptPayload.prompt,
|
|
277
327
|
cwd: repoRoot,
|
|
328
|
+
timeoutMs: reviewBudget.budget.providerTimeoutMs,
|
|
278
329
|
});
|
|
279
330
|
appendVerifyTimeline(taskDir, {
|
|
280
331
|
event: 'provider_completed',
|
|
@@ -412,6 +463,7 @@ function writeVerifyReviewBudgetReturn({
|
|
|
412
463
|
message,
|
|
413
464
|
budget,
|
|
414
465
|
summary,
|
|
466
|
+
approval = null,
|
|
415
467
|
}) {
|
|
416
468
|
const verifyMarkdown = [
|
|
417
469
|
'# Verify',
|
|
@@ -429,13 +481,13 @@ function writeVerifyReviewBudgetReturn({
|
|
|
429
481
|
'## Budget',
|
|
430
482
|
'',
|
|
431
483
|
'```json',
|
|
432
|
-
JSON.stringify({ reason, budget, summary }, null, 2),
|
|
484
|
+
JSON.stringify({ reason, budget, summary, approval }, null, 2),
|
|
433
485
|
'```',
|
|
434
486
|
'',
|
|
435
487
|
'## Required decision',
|
|
436
488
|
'',
|
|
437
489
|
'- Consolidate remaining Verify findings in `execution.md` / evidence artifacts, then run one fresh Verify after the window resets; or',
|
|
438
|
-
'- Ask the human to approve an extra external review
|
|
490
|
+
'- Ask the human to approve an extra external review by writing `review-budget-approval.json`, then rerun with `--force-review-budget`.',
|
|
439
491
|
].join('\n');
|
|
440
492
|
const result = {
|
|
441
493
|
schemaVersion: 1,
|
|
@@ -474,6 +526,7 @@ function writeVerifyReviewBudgetReturn({
|
|
|
474
526
|
reason,
|
|
475
527
|
budget,
|
|
476
528
|
summary,
|
|
529
|
+
approval,
|
|
477
530
|
forceFlag: '--force-review-budget',
|
|
478
531
|
},
|
|
479
532
|
};
|
|
@@ -21,11 +21,13 @@
|
|
|
21
21
|
"reviewBudgets": {
|
|
22
22
|
"check": {
|
|
23
23
|
"stageSlaMs": 180000,
|
|
24
|
-
"maxExternalRunsPerStage": 1
|
|
24
|
+
"maxExternalRunsPerStage": 1,
|
|
25
|
+
"providerTimeoutMs": 180000
|
|
25
26
|
},
|
|
26
27
|
"verify": {
|
|
27
28
|
"stageSlaMs": 180000,
|
|
28
|
-
"maxExternalRunsPerStage": 1
|
|
29
|
+
"maxExternalRunsPerStage": 1,
|
|
30
|
+
"providerTimeoutMs": 180000
|
|
29
31
|
}
|
|
30
32
|
},
|
|
31
33
|
"checkerProviders": {
|
package/package.json
CHANGED
package/prompts/checker.md
CHANGED
|
@@ -75,6 +75,7 @@ Project-specific context приходит только через task artifacts
|
|
|
75
75
|
27. Если plan/task/checker-context показывает golden set/eval/regression fixtures/label cards/ground truth, Checker должен требовать `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases` и `## Harness Boundary`. Golden set без schema/coverage/negative cases/source evidence/non-goals/manual-vs-automated boundary является `return_to_plan`, даже если есть общий текст про expected outputs.
|
|
76
76
|
28. Если remaining issue является процессной ясностью, wording polish или удобством статуса, а план уже содержит executable scope, acceptance, risk gates and verification evidence path, не возвращай `return_to_plan`; запиши как non-blocking note или human question. Цель Check - предотвратить дорогие ошибки до Execute, а не создавать повторные внешние циклы ради косметики.
|
|
77
77
|
29. Если видишь несколько related blockers, объедини их в один consolidated finding с полным checklist. Не выдавай только первый найденный blocker, если следующий внешний Check очевидно найдет соседний.
|
|
78
|
+
30. Если review budget уже требует Human Arbitration, не предлагай `--force-review-budget` как обычный retry. Он допустим только при наличии human-approved `review-budget-approval.json`.
|
|
78
79
|
|
|
79
80
|
## Контракт выхода
|
|
80
81
|
|
package/prompts/planner.md
CHANGED
|
@@ -54,7 +54,7 @@
|
|
|
54
54
|
18. Plan должен назвать risk tier (`R0`-`R5`), execution target and execution budget. Для `R1/R2` можно разрешить fast loop inside approved scope, но обязательно назвать stop rules.
|
|
55
55
|
19. План проверки должен быть ladder-based: micro-verify during Execute, slice-verify before completion and external Verify requirement for closeout/high-risk claims.
|
|
56
56
|
20. После `return_to_plan` Planner обязан выполнить один consolidated remediation pass: закрыть все blocking findings, precheck checklist и obvious adjacent gaps в `plan.md`/`check-resolution.md` до следующего Check. Не запускай внешний Check после единичной мелкой правки, если другие known blockers остаются открыты.
|
|
57
|
-
21. Если Check остановлен review budget gate (`human_arbitration_required` с `reviewBudget.reason`), Planner не должен пытаться обойти это повторным запуском. Нужно либо запросить human approval
|
|
57
|
+
21. Если Check остановлен review budget gate (`human_arbitration_required` с `reviewBudget.reason`), Planner не должен пытаться обойти это повторным запуском. Нужно либо запросить human approval и записать `review-budget-approval.json` перед `--force-review-budget`, либо укрупнить remediation и вернуться к Check после явного решения.
|
|
58
58
|
20. План должен описывать meaningful slice. Не дроби локальную работу на отдельный Plan/Check/Verify для каждого микрофикса, если риски и target остаются внутри одного approved tier.
|
|
59
59
|
21. Если risk triggers или `checker-context-pack.md` показывают O2/O3 hot-path work, Planner обязан добавить `## Optimization Strategy`: tier, hot paths, expected data size, chosen efficient approach, anti-patterns avoided and bounded optimizer budget/stop rule. Цель gate — предотвратить очевидно неэффективное решение до Execute, а не запускать бесконечную оптимизацию.
|
|
60
60
|
22. Если задача создает golden set/eval/regression fixtures/label cards/ground truth, Planner обязан добавить `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases` и `## Harness Boundary`. Golden set должен быть test contract with expected outputs, non-goals, source refs, missing coverage policy and manual-vs-automated boundary.
|
package/prompts/supervisor.md
CHANGED
|
@@ -61,8 +61,9 @@ Supervisor является code-level orchestrator по контракту: rou
|
|
|
61
61
|
27. Если external verifier/checker/browser tooling начинает тратить непропорционально много времени или блокируется окружением, Supervisor обязан остановить loop и вынести human decision: принять internal verify/evidence, запустить external escalation вручную или изменить scope.
|
|
62
62
|
28. Если deterministic Check preflight создал `precheck-remediation.md`, Supervisor не должен запускать повторный Check после точечной правки одного пункта. Сначала Planner/Executor должен закрыть весь checklist или явно отметить not-applicable с evidence/human decision в `plan.md`/`status.md`, затем допускается один fresh Check.
|
|
63
63
|
29. Перед повторным Check после deterministic precheck Supervisor обязан сверить, что `precheck-remediation.md` был использован как consolidated checklist: все listed gates отражены в plan/research/status, а не закрывались по одному через серию precheck loops.
|
|
64
|
-
30. External Check и external Verify имеют stage SLA по умолчанию 3
|
|
64
|
+
30. External Check и external Verify имеют stage SLA по умолчанию 3 минуты, максимум один external provider run на фазу и hard provider timeout 3 минуты. Если `check.result.json` или `verify.result.json` вернул `human_arbitration_required` с `reviewBudget.reason`, Supervisor не запускает еще один внешний review без явного human approval, записанного в `review-budget-approval.json`, и `--force-review-budget`.
|
|
65
65
|
31. После `return_to_plan` / `return_to_execute` Supervisor должен требовать один consolidated remediation pass. Запрещено запускать серию внешних Check/Verify для мелких последовательных правок, если их можно закрыть в одном artifact update.
|
|
66
|
+
32. `--force-review-budget` запрещен как обычный retry flag. Он допустим только после human decision и должен быть виден в timeline вместе с approval artifact; без approval artifact команда должна остаться на Human Arbitration.
|
|
66
67
|
|
|
67
68
|
## Hard Gate: Material Scope Expansion -> Brief Reset
|
|
68
69
|
|
package/prompts/verifier.md
CHANGED
|
@@ -48,7 +48,8 @@
|
|
|
48
48
|
24. Environment/tooling failures внешнего verifier/browser smoke не должны превращаться в бесконечный `return_to_execute` loop. Если implementation evidence достаточно, но внешний инструмент заблокирован окружением, используй `pass_with_notes` или `human_arbitration_required` согласно риску.
|
|
49
49
|
25. Если `plan.md` содержит golden set/eval/regression fixture sections, verifier должен проверить `Golden Set / Regression Evidence`: label cards follow schema, coverage matrix is filled, negative/edge cases are selected or documented missing, expected outputs/non-goals are inspectable, source refs/snippets exist and manual-vs-automated harness boundary is explicit.
|
|
50
50
|
26. External Verify должен укладываться в bounded review model: один внешний provider run по умолчанию. Если остаются несколько blockers, верни один consolidated `return_to_execute` finding с полным checklist. Minor documentation/status polish не должен запускать новый внешний цикл, если acceptance/evidence покрыты.
|
|
51
|
-
27. Если review budget gate уже вернул `human_arbitration_required`, не предлагай повторный external Verify как обычный следующий шаг. Следующий шаг: consolidated execution fix, internal evidence decision или явный human approval
|
|
51
|
+
27. Если review budget gate уже вернул `human_arbitration_required`, не предлагай повторный external Verify как обычный следующий шаг. Следующий шаг: consolidated execution fix, internal evidence decision или явный human approval, записанный в `review-budget-approval.json`, перед `--force-review-budget`.
|
|
52
|
+
28. External provider timeout является hard gate. Если verifier не успевает уложиться в 3 минуты, верни `verifier_failed`/`timeout` или `human_arbitration_required`; не предлагай бесконечный retry с тем же input.
|
|
52
53
|
|
|
53
54
|
## Контракт выхода
|
|
54
55
|
|