@besales/ops-framework 0.1.30 → 0.1.32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +14 -0
- package/README.md +23 -0
- package/bin/lib/review-budget-utils.mjs +185 -0
- package/bin/lib/review-budget-utils.test.mjs +145 -0
- package/bin/providers/external-cli-checker.mjs +3 -1
- package/bin/run-check.mjs +252 -1
- package/bin/run-verify.mjs +236 -0
- package/config/default-agents.json +12 -0
- package/package.json +1 -1
- package/prompts/checker.md +3 -0
- package/prompts/planner.md +2 -0
- package/prompts/supervisor.md +3 -0
- package/prompts/verifier.md +3 -0
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,19 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 0.1.32
|
|
4
|
+
|
|
5
|
+
- Required explicit `review-budget-approval.json` before `--force-review-budget` can bypass Check/Verify review budgets.
|
|
6
|
+
- Added external CLI provider timeouts from `reviewBudgets.*.providerTimeoutMs` so Check/Verify provider calls cannot silently exceed the stage SLA.
|
|
7
|
+
- Recorded denied force attempts in Check/Verify timelines as `review_budget_force_denied`.
|
|
8
|
+
- Updated review budget docs and prompts so force review is treated as a human-approved exception, not a normal retry path.
|
|
9
|
+
|
|
10
|
+
## 0.1.31
|
|
11
|
+
|
|
12
|
+
- Added bounded review budgets for Check and Verify: default 3 minute stage SLA and one external provider run per stage.
|
|
13
|
+
- Added review budget gates that route repeated external Check/Verify loops to `human_arbitration_required` instead of launching another provider call; humans can explicitly override with `--force-review-budget`.
|
|
14
|
+
- Added `review_budget_summary` telemetry to Check/Verify timelines so elapsed time, provider runs, deterministic blocks and provider duration are visible without manual log math.
|
|
15
|
+
- Updated Planner/Checker/Verifier/Supervisor prompts to prefer consolidated remediation over one-item external review loops.
|
|
16
|
+
|
|
3
17
|
## 0.1.30
|
|
4
18
|
|
|
5
19
|
- Added a generic `golden-set-regression` risk trigger for golden sets, eval fixtures, label cards, ground-truth datasets and regression checklists.
|
package/README.md
CHANGED
|
@@ -188,6 +188,29 @@ Do not commit that `file:` dependency to production projects. It is only for pac
|
|
|
188
188
|
- `initiative-requirements`
|
|
189
189
|
- `test/self-test`
|
|
190
190
|
|
|
191
|
+
## Review Budgets
|
|
192
|
+
|
|
193
|
+
External `run-check` and `run-verify` are bounded by default:
|
|
194
|
+
|
|
195
|
+
- stage SLA: `180000ms`;
|
|
196
|
+
- max external provider runs per stage: `1`;
|
|
197
|
+
- external provider timeout: `180000ms`.
|
|
198
|
+
|
|
199
|
+
When the budget is exceeded, the framework writes `human_arbitration_required`
|
|
200
|
+
instead of starting another provider loop. Consolidate the remaining findings in
|
|
201
|
+
task artifacts, or rerun with `--force-review-budget` only after explicit human
|
|
202
|
+
approval recorded in `review-budget-approval.json`:
|
|
203
|
+
|
|
204
|
+
```json
|
|
205
|
+
{
|
|
206
|
+
"approved": true,
|
|
207
|
+
"stage": "check",
|
|
208
|
+
"reason": "Human approved one extra external review after consolidated remediation.",
|
|
209
|
+
"approvedBy": "human",
|
|
210
|
+
"expiresAt": "2026-06-05T12:00:00.000Z"
|
|
211
|
+
}
|
|
212
|
+
```
|
|
213
|
+
|
|
191
214
|
## Learning Loop
|
|
192
215
|
|
|
193
216
|
Learning is controlled and human-approved:
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
|
|
4
|
+
/**
 * Resolve the effective review budget for one stage.
 *
 * Built-in defaults are overlaid with `config.reviewBudgets[stage]` when that
 * entry exists; keys missing from the override keep their default value.
 *
 * @param {object|null|undefined} config - Agents configuration; a missing
 *   config (null/undefined) is treated as "no overrides".
 * @param {string} stage - Key looked up under `config.reviewBudgets`.
 * @returns {{stageSlaMs: number, maxExternalRunsPerStage: number, providerTimeoutMs: number}}
 *   The merged budget.
 */
export function resolveStageReviewBudget(config, stage) {
  const defaults = {
    stageSlaMs: 180000,
    maxExternalRunsPerStage: 1,
    providerTimeoutMs: 180000,
  };
  // Optional chaining keeps a missing config from throwing; defaults then apply.
  const reviewBudgets = config?.reviewBudgets || {};
  return {
    ...defaults,
    ...(reviewBudgets[stage] || {}),
  };
}
|
|
16
|
+
|
|
17
|
+
/**
 * Summarize the currently active review window of a stage timeline.
 *
 * The window is every timeline entry after the last "completed" event whose
 * verdict is terminal for the stage. Entries that are null or carry no string
 * `event` are tolerated: they count towards `eventCount` but never match an
 * event name.
 *
 * @param {object} options
 * @param {Array<object>} [options.timeline=[]] - Timeline entries, oldest first.
 * @param {string} options.stage - `'verify'` selects the verify verdict set;
 *   any other value uses the check verdict set.
 * @param {Date} [options.now=new Date()] - Reference time for `elapsedMs`.
 * @returns {{windowStartedAt: string, elapsedMs: number, providerStarted: number,
 *   providerCompleted: number, deterministicBlocks: number, providerMs: number,
 *   eventCount: number}} Window statistics.
 */
export function summarizeReviewBudgetWindow({
  timeline = [],
  stage,
  now = new Date(),
}) {
  const terminalVerdicts = stage === 'verify'
    ? new Set(['pass', 'pass_with_notes', 'human_arbitration_required', 'verifier_failed'])
    : new Set(['ready_for_human_gate', 'human_arbitration_required', 'checker_failed']);

  // Malformed entries (null, or without a string `event`) yield an empty name
  // instead of throwing on property access or `.includes`.
  const eventNameOf = (entry) => (typeof entry?.event === 'string' ? entry.event : '');

  let lastTerminalIndex = -1;
  for (let index = 0; index < timeline.length; index += 1) {
    const entry = timeline[index];
    const name = eventNameOf(entry);
    // NOTE(review): 'check_completed' is accepted for every stage, including
    // 'verify' — confirm that verify timelines are meant to honour it.
    const isCompletion = name === `${stage}_completed` || name === 'check_completed';
    if (isCompletion && terminalVerdicts.has(entry.verdict)) {
      lastTerminalIndex = index;
    }
  }

  const windowEvents = timeline.slice(lastTerminalIndex + 1);
  const windowNames = windowEvents.map(eventNameOf);
  // Fall back to `now` when no entry in the window has a usable timestamp.
  const firstEventAt = firstValidDate(windowEvents.map((entry) => entry?.at)) || now;
  const providerStarted = windowNames.filter((name) => name === 'provider_started').length;
  const providerCompleted = windowNames.filter((name) => name === 'provider_completed').length;
  const deterministicBlocks = windowNames.filter((name) => name.includes('deterministic')).length;
  const elapsedMs = Math.max(0, now.getTime() - firstEventAt.getTime());
  const providerMs = windowEvents
    .filter((entry) => eventNameOf(entry) === 'provider_completed')
    .reduce((sum, entry) => {
      const durationMs = Number(entry.providerTiming?.durationMs);
      // Ignore missing or non-numeric durations so the total never becomes NaN.
      return sum + (Number.isFinite(durationMs) ? durationMs : 0);
    }, 0);

  return {
    windowStartedAt: firstEventAt.toISOString(),
    elapsedMs,
    providerStarted,
    providerCompleted,
    deterministicBlocks,
    providerMs,
    eventCount: windowEvents.length,
  };
}
|
|
53
|
+
|
|
54
|
+
/**
 * Decide whether another external review run fits the stage budget.
 *
 * The run-count limit is checked before the wall-clock SLA. When `force` is
 * truthy the budget and summary are not consulted at all.
 *
 * @param {object} options
 * @param {object} options.budget - Provides `maxExternalRunsPerStage` and `stageSlaMs`.
 * @param {object} options.summary - Provides `providerStarted` and `elapsedMs`.
 * @param {boolean} [options.force=false] - Bypass both limits.
 * @returns {{ok: boolean, reason: (string|null), message?: string}} Verdict;
 *   `message` is present only on a denial.
 */
export function evaluateReviewBudget({ budget, summary, force = false }) {
  if (!force) {
    const { providerStarted, elapsedMs } = summary;
    const { maxExternalRunsPerStage, stageSlaMs } = budget;

    const runsExhausted = providerStarted >= maxExternalRunsPerStage;
    if (runsExhausted) {
      return {
        ok: false,
        reason: 'max_external_runs_per_stage_exceeded',
        message: `External review run budget exceeded: providerStarted=${providerStarted}, maxExternalRunsPerStage=${maxExternalRunsPerStage}.`,
      };
    }

    const slaExhausted = elapsedMs >= stageSlaMs;
    if (slaExhausted) {
      return {
        ok: false,
        reason: 'stage_sla_exceeded',
        message: `Review stage SLA exceeded: elapsedMs=${elapsedMs}, stageSlaMs=${stageSlaMs}.`,
      };
    }
  }

  return { ok: true, reason: null };
}
|
|
74
|
+
|
|
75
|
+
/**
 * Read and validate `review-budget-approval.json` from a task directory.
 *
 * Checks run in this order and the first failure wins: the file exists, it
 * parses as JSON, the parsed value is a plain JSON object, `approved === true`,
 * `stage` is one of check/verify/both, `stage` covers the requested stage,
 * `reason` and `approvedBy` are non-empty strings, and `expiresAt` (optional)
 * is a valid timestamp strictly later than `now`.
 *
 * @param {object} options
 * @param {string} options.taskDir - Directory that should contain the approval file.
 * @param {string} options.stage - Stage requesting the override.
 * @param {Date} [options.now=new Date()] - Reference time for the expiry check.
 * @returns {{ok: boolean, path: string, reason?: string, message?: string, approval?: object}}
 *   `ok: true` with the parsed approval, or `ok: false` with a machine-readable
 *   `reason` and a human-readable `message`. `approval` is omitted when the
 *   file is missing or does not hold a JSON object.
 */
export function readReviewBudgetApproval({ taskDir, stage, now = new Date() }) {
  const fileName = 'review-budget-approval.json';
  const approvalPath = path.join(taskDir, fileName);

  // Single place that shapes every denial; `approval` is attached only once a
  // JSON object has actually been parsed.
  const deny = (reason, message, parsedApproval) => ({
    ok: false,
    reason,
    message,
    path: fileName,
    ...(parsedApproval === undefined ? {} : { approval: parsedApproval }),
  });

  if (!fs.existsSync(approvalPath)) {
    return deny(
      'missing_review_budget_approval',
      'Using --force-review-budget requires review-budget-approval.json with approved=true.',
    );
  }

  let approval;
  try {
    approval = JSON.parse(fs.readFileSync(approvalPath, 'utf8'));
  } catch (error) {
    return deny(
      'invalid_review_budget_approval',
      `review-budget-approval.json is invalid JSON: ${error.message}`,
    );
  }

  // JSON.parse also accepts `null`, primitives and arrays; without this guard a
  // file containing `null` would throw a TypeError on the property reads below.
  if (approval === null || typeof approval !== 'object' || Array.isArray(approval)) {
    return deny(
      'invalid_review_budget_approval',
      'review-budget-approval.json must contain a JSON object.',
    );
  }

  if (approval.approved !== true) {
    return deny(
      'review_budget_approval_not_approved',
      'review-budget-approval.json must contain approved=true.',
      approval,
    );
  }

  if (!['check', 'verify', 'both'].includes(String(approval.stage || ''))) {
    return deny(
      'review_budget_approval_stage_invalid',
      'review-budget-approval.json stage must be check, verify or both.',
      approval,
    );
  }

  if (approval.stage !== stage && approval.stage !== 'both') {
    return deny(
      'review_budget_approval_stage_mismatch',
      `review-budget-approval.json is for stage=${approval.stage}, not ${stage}.`,
      approval,
    );
  }

  if (!approval.reason || typeof approval.reason !== 'string') {
    return deny(
      'review_budget_approval_reason_missing',
      'review-budget-approval.json must contain a human-readable reason.',
      approval,
    );
  }

  if (!approval.approvedBy || typeof approval.approvedBy !== 'string') {
    return deny(
      'review_budget_approval_approver_missing',
      'review-budget-approval.json must contain approvedBy.',
      approval,
    );
  }

  // Expiry is optional; when present it must parse and lie strictly in the future.
  if (approval.expiresAt) {
    const expiresAt = new Date(approval.expiresAt);
    if (Number.isNaN(expiresAt.getTime())) {
      return deny(
        'review_budget_approval_expiry_invalid',
        'review-budget-approval.json expiresAt must be a valid ISO timestamp when present.',
        approval,
      );
    }
    if (expiresAt.getTime() <= now.getTime()) {
      return deny(
        'review_budget_approval_expired',
        `review-budget-approval.json expired at ${expiresAt.toISOString()}.`,
        approval,
      );
    }
  }

  return {
    ok: true,
    path: fileName,
    approval,
  };
}
|
|
176
|
+
|
|
177
|
+
/**
 * Return the first entry of `values` that converts to a valid Date.
 *
 * @param {Iterable<*>} values - Candidate timestamps (e.g. ISO strings).
 * @returns {Date|null} The first valid date, or null when none qualifies.
 */
function firstValidDate(values) {
  for (const value of values) {
    // `new Date(null)` is the valid epoch date (1970-01-01), so a missing
    // timestamp must be skipped explicitly rather than treated as a real time.
    if (value === null || value === undefined) {
      continue;
    }
    const date = new Date(value);
    if (!Number.isNaN(date.getTime())) {
      return date;
    }
  }
  return null;
}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import { describe, expect, it } from 'vitest';
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import os from 'node:os';
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
import {
|
|
6
|
+
evaluateReviewBudget,
|
|
7
|
+
readReviewBudgetApproval,
|
|
8
|
+
resolveStageReviewBudget,
|
|
9
|
+
summarizeReviewBudgetWindow,
|
|
10
|
+
} from './review-budget-utils.mjs';
|
|
11
|
+
|
|
12
|
+
// Unit tests for the review budget helpers. The approval tests create a
// throwaway task directory and always remove it again, so repeated runs do not
// accumulate temp directories.
describe('review budget utils', () => {
  it('blocks a second external check in the same active stage window', () => {
    const now = new Date('2026-06-04T12:02:00.000Z');
    const summary = summarizeReviewBudgetWindow({
      stage: 'check',
      now,
      timeline: [
        { at: '2026-06-04T12:00:00.000Z', event: 'check_started' },
        { at: '2026-06-04T12:00:02.000Z', event: 'provider_started' },
        {
          at: '2026-06-04T12:01:30.000Z',
          event: 'provider_completed',
          providerTiming: { durationMs: 88000 },
          verdict: 'return_to_plan',
        },
      ],
    });

    expect(summary.providerStarted).toBe(1);
    expect(evaluateReviewBudget({
      budget: { stageSlaMs: 180000, maxExternalRunsPerStage: 1 },
      summary,
    })).toMatchObject({
      ok: false,
      reason: 'max_external_runs_per_stage_exceeded',
    });
  });

  it('blocks when wall-clock stage SLA is exceeded even without provider time', () => {
    const summary = summarizeReviewBudgetWindow({
      stage: 'check',
      now: new Date('2026-06-04T12:04:00.000Z'),
      timeline: [
        { at: '2026-06-04T12:00:00.000Z', event: 'check_started' },
        { at: '2026-06-04T12:00:01.000Z', event: 'deterministic_precheck_blocked' },
      ],
    });

    expect(summary.providerStarted).toBe(0);
    expect(evaluateReviewBudget({
      budget: { stageSlaMs: 180000, maxExternalRunsPerStage: 1 },
      summary,
    })).toMatchObject({
      ok: false,
      reason: 'stage_sla_exceeded',
    });
  });

  it('resets the active check window after ready_for_human_gate', () => {
    const summary = summarizeReviewBudgetWindow({
      stage: 'check',
      now: new Date('2026-06-04T12:10:30.000Z'),
      timeline: [
        { at: '2026-06-04T12:00:00.000Z', event: 'check_started' },
        { at: '2026-06-04T12:00:05.000Z', event: 'provider_started' },
        { at: '2026-06-04T12:01:00.000Z', event: 'provider_completed', verdict: 'ready_for_human_gate' },
        { at: '2026-06-04T12:01:00.100Z', event: 'check_completed', verdict: 'ready_for_human_gate' },
        { at: '2026-06-04T12:10:00.000Z', event: 'check_started' },
      ],
    });

    // Only the trailing check_started belongs to the new window.
    expect(summary.providerStarted).toBe(0);
    expect(summary.elapsedMs).toBe(30000);
    expect(evaluateReviewBudget({
      budget: { stageSlaMs: 180000, maxExternalRunsPerStage: 1 },
      summary,
    })).toMatchObject({ ok: true });
  });

  it('uses configured stage budgets over defaults', () => {
    expect(resolveStageReviewBudget({
      reviewBudgets: {
        verify: {
          stageSlaMs: 120000,
          maxExternalRunsPerStage: 2,
          providerTimeoutMs: 90000,
        },
      },
    }, 'verify')).toEqual({
      stageSlaMs: 120000,
      maxExternalRunsPerStage: 2,
      providerTimeoutMs: 90000,
    });
  });

  it('requires a valid force approval artifact', () => {
    const taskDir = fs.mkdtempSync(path.join(os.tmpdir(), 'review-budget-approval-'));

    try {
      expect(readReviewBudgetApproval({ taskDir, stage: 'verify' })).toMatchObject({
        ok: false,
        reason: 'missing_review_budget_approval',
      });

      fs.writeFileSync(path.join(taskDir, 'review-budget-approval.json'), JSON.stringify({
        approved: true,
        stage: 'verify',
        reason: 'Human approved one extra external Verify after consolidated remediation.',
        approvedBy: 'human',
        expiresAt: '2026-06-04T12:30:00.000Z',
      }, null, 2));

      expect(readReviewBudgetApproval({
        taskDir,
        stage: 'verify',
        now: new Date('2026-06-04T12:00:00.000Z'),
      })).toMatchObject({
        ok: true,
        approval: {
          approved: true,
          stage: 'verify',
        },
      });
    } finally {
      // Remove the temp task directory even when an expectation fails.
      fs.rmSync(taskDir, { recursive: true, force: true });
    }
  });

  it('rejects expired force approval artifacts', () => {
    const taskDir = fs.mkdtempSync(path.join(os.tmpdir(), 'review-budget-expired-'));

    try {
      fs.writeFileSync(path.join(taskDir, 'review-budget-approval.json'), JSON.stringify({
        approved: true,
        stage: 'both',
        reason: 'Expired approval',
        approvedBy: 'human',
        expiresAt: '2026-06-04T12:00:00.000Z',
      }, null, 2));

      expect(readReviewBudgetApproval({
        taskDir,
        stage: 'check',
        now: new Date('2026-06-04T12:00:01.000Z'),
      })).toMatchObject({
        ok: false,
        reason: 'review_budget_approval_expired',
      });
    } finally {
      // Remove the temp task directory even when an expectation fails.
      fs.rmSync(taskDir, { recursive: true, force: true });
    }
  });
});
|
|
@@ -47,6 +47,7 @@ export async function runExternalCliChecker({
|
|
|
47
47
|
reasoningEffort,
|
|
48
48
|
prompt,
|
|
49
49
|
cwd,
|
|
50
|
+
timeoutMs,
|
|
50
51
|
}) {
|
|
51
52
|
if (!providerConfig) {
|
|
52
53
|
const error = new Error(`Unknown external CLI provider: ${providerName}`);
|
|
@@ -68,10 +69,11 @@ export async function runExternalCliChecker({
|
|
|
68
69
|
input: providerConfig.input === 'stdin' ? prompt : undefined,
|
|
69
70
|
encoding: 'utf8',
|
|
70
71
|
maxBuffer: 1024 * 1024 * 20,
|
|
72
|
+
timeout: timeoutMs || undefined,
|
|
71
73
|
});
|
|
72
74
|
|
|
73
75
|
if (result.error) {
|
|
74
|
-
result.error.failureReason = 'provider_unavailable';
|
|
76
|
+
result.error.failureReason = result.error.code === 'ETIMEDOUT' ? 'timeout' : 'provider_unavailable';
|
|
75
77
|
throw result.error;
|
|
76
78
|
}
|
|
77
79
|
if (result.status !== 0) {
|
package/bin/run-check.mjs
CHANGED
|
@@ -45,6 +45,12 @@ import {
|
|
|
45
45
|
validateManifest,
|
|
46
46
|
writeTaskManifest,
|
|
47
47
|
} from './lib/task-manifest-utils.mjs';
|
|
48
|
+
import {
|
|
49
|
+
evaluateReviewBudget,
|
|
50
|
+
readReviewBudgetApproval,
|
|
51
|
+
resolveStageReviewBudget,
|
|
52
|
+
summarizeReviewBudgetWindow,
|
|
53
|
+
} from './lib/review-budget-utils.mjs';
|
|
48
54
|
|
|
49
55
|
function main() {
|
|
50
56
|
runMain().catch((error) => {
|
|
@@ -64,6 +70,8 @@ async function runMain() {
|
|
|
64
70
|
const taskId = path.basename(taskDir);
|
|
65
71
|
const dryRun = getFlag(args, 'dry-run', false) === true;
|
|
66
72
|
const noCache = getFlag(args, 'no-cache', false) === true;
|
|
73
|
+
const forceReviewBudget = getFlag(args, 'force-review-budget', false) === true
|
|
74
|
+
|| getFlag(args, 'force-external-review', false) === true;
|
|
67
75
|
const checkerConfig = resolveCheckerConfig(args);
|
|
68
76
|
const runStartedAt = new Date();
|
|
69
77
|
appendCheckTimeline(taskDir, {
|
|
@@ -72,6 +80,7 @@ async function runMain() {
|
|
|
72
80
|
model: checkerConfig.model,
|
|
73
81
|
noCache,
|
|
74
82
|
dryRun,
|
|
83
|
+
forceReviewBudget,
|
|
75
84
|
});
|
|
76
85
|
|
|
77
86
|
let checkContext = ensureFreshCheckContext(taskDir, taskId);
|
|
@@ -231,6 +240,107 @@ async function runMain() {
|
|
|
231
240
|
return;
|
|
232
241
|
}
|
|
233
242
|
|
|
243
|
+
const forceApproval = forceReviewBudget
|
|
244
|
+
? readReviewBudgetApproval({ taskDir, stage: 'check' })
|
|
245
|
+
: { ok: true, approval: null };
|
|
246
|
+
if (!forceApproval.ok) {
|
|
247
|
+
const budget = resolveStageReviewBudget(readAgentsConfig(), 'check');
|
|
248
|
+
const summary = summarizeReviewBudgetWindow({
|
|
249
|
+
timeline: readTimeline(taskDir, 'check-timeline.json'),
|
|
250
|
+
stage: 'check',
|
|
251
|
+
now: new Date(),
|
|
252
|
+
});
|
|
253
|
+
writeReviewBudgetReturn({
|
|
254
|
+
taskDir,
|
|
255
|
+
taskId,
|
|
256
|
+
checkContext,
|
|
257
|
+
checkerConfig,
|
|
258
|
+
checkerPromptSha,
|
|
259
|
+
cacheKey,
|
|
260
|
+
reason: forceApproval.reason,
|
|
261
|
+
message: forceApproval.message,
|
|
262
|
+
budget,
|
|
263
|
+
summary,
|
|
264
|
+
startedAt: runStartedAt,
|
|
265
|
+
approval: forceApproval,
|
|
266
|
+
});
|
|
267
|
+
appendCheckTimeline(taskDir, {
|
|
268
|
+
event: 'review_budget_force_denied',
|
|
269
|
+
verdict: 'human_arbitration_required',
|
|
270
|
+
reason: forceApproval.reason,
|
|
271
|
+
message: forceApproval.message,
|
|
272
|
+
budget,
|
|
273
|
+
summary,
|
|
274
|
+
timing: buildTiming(runStartedAt),
|
|
275
|
+
});
|
|
276
|
+
recordLlmInputUsage({
|
|
277
|
+
taskDir,
|
|
278
|
+
stage: 'check',
|
|
279
|
+
packMeta: promptPayload.pack.meta,
|
|
280
|
+
attempts: [
|
|
281
|
+
...llmInputAttempts,
|
|
282
|
+
buildAttemptRecord(promptPayload.pack.meta, `review_budget_force_denied:${forceApproval.reason}`),
|
|
283
|
+
],
|
|
284
|
+
rerunCount,
|
|
285
|
+
timing: buildTiming(runStartedAt),
|
|
286
|
+
});
|
|
287
|
+
refreshTaskManifestAfterCheck(taskDir);
|
|
288
|
+
runValidator(taskArg);
|
|
289
|
+
console.log(`Checker force review denied ${taskId}: human_arbitration_required`);
|
|
290
|
+
console.log(`- reason: ${forceApproval.reason}`);
|
|
291
|
+
return;
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
const reviewBudget = evaluateCurrentReviewBudget({
|
|
295
|
+
taskDir,
|
|
296
|
+
stage: 'check',
|
|
297
|
+
config: readAgentsConfig(),
|
|
298
|
+
force: forceReviewBudget,
|
|
299
|
+
});
|
|
300
|
+
if (!reviewBudget.ok) {
|
|
301
|
+
writeReviewBudgetReturn({
|
|
302
|
+
taskDir,
|
|
303
|
+
taskId,
|
|
304
|
+
checkContext,
|
|
305
|
+
checkerConfig,
|
|
306
|
+
checkerPromptSha,
|
|
307
|
+
cacheKey,
|
|
308
|
+
reason: reviewBudget.reason,
|
|
309
|
+
message: reviewBudget.message,
|
|
310
|
+
budget: reviewBudget.budget,
|
|
311
|
+
summary: reviewBudget.summary,
|
|
312
|
+
startedAt: runStartedAt,
|
|
313
|
+
approval: forceApproval,
|
|
314
|
+
});
|
|
315
|
+
appendCheckTimeline(taskDir, {
|
|
316
|
+
event: 'review_budget_blocked',
|
|
317
|
+
verdict: 'human_arbitration_required',
|
|
318
|
+
reason: reviewBudget.reason,
|
|
319
|
+
message: reviewBudget.message,
|
|
320
|
+
budget: reviewBudget.budget,
|
|
321
|
+
summary: reviewBudget.summary,
|
|
322
|
+
timing: buildTiming(runStartedAt),
|
|
323
|
+
});
|
|
324
|
+
recordLlmInputUsage({
|
|
325
|
+
taskDir,
|
|
326
|
+
stage: 'check',
|
|
327
|
+
packMeta: promptPayload.pack.meta,
|
|
328
|
+
attempts: [
|
|
329
|
+
...llmInputAttempts,
|
|
330
|
+
buildAttemptRecord(promptPayload.pack.meta, `review_budget_blocked:${reviewBudget.reason}`),
|
|
331
|
+
],
|
|
332
|
+
rerunCount,
|
|
333
|
+
timing: buildTiming(runStartedAt),
|
|
334
|
+
});
|
|
335
|
+
refreshTaskManifestAfterCheck(taskDir);
|
|
336
|
+
runValidator(taskArg);
|
|
337
|
+
console.log(`Checker review budget blocked ${taskId}: human_arbitration_required`);
|
|
338
|
+
console.log(`- reason: ${reviewBudget.reason}`);
|
|
339
|
+
console.log(`- elapsedMs: ${reviewBudget.summary.elapsedMs}`);
|
|
340
|
+
console.log(`- providerStarted: ${reviewBudget.summary.providerStarted}`);
|
|
341
|
+
return;
|
|
342
|
+
}
|
|
343
|
+
|
|
234
344
|
try {
|
|
235
345
|
const providerStartedAt = new Date();
|
|
236
346
|
appendCheckTimeline(taskDir, {
|
|
@@ -247,6 +357,7 @@ async function runMain() {
|
|
|
247
357
|
checkerConfig,
|
|
248
358
|
messages: promptPayload.messages,
|
|
249
359
|
prompt: promptPayload.prompt,
|
|
360
|
+
timeoutMs: reviewBudget.budget.providerTimeoutMs,
|
|
250
361
|
});
|
|
251
362
|
appendCheckTimeline(taskDir, {
|
|
252
363
|
event: 'provider_completed',
|
|
@@ -319,6 +430,7 @@ async function runMain() {
|
|
|
319
430
|
cacheKeySha,
|
|
320
431
|
timing: buildTiming(runStartedAt),
|
|
321
432
|
});
|
|
433
|
+
appendReviewBudgetSummary(taskDir);
|
|
322
434
|
if (!isContextInsufficientResult(providerOutput.checkResultJson)) {
|
|
323
435
|
storeInCache({ taskDir, cacheKeySha });
|
|
324
436
|
}
|
|
@@ -337,6 +449,129 @@ async function runMain() {
|
|
|
337
449
|
console.log(`- finalEstimatedInputTokens: ${promptPayload.pack.meta.estimatedTokens}`);
|
|
338
450
|
}
|
|
339
451
|
|
|
452
|
+
/**
 * Evaluate the review budget against the task's current `check-timeline.json`.
 *
 * @param {object} options
 * @param {string} options.taskDir - Task directory holding the timeline file.
 * @param {string} options.stage - Stage whose budget and window are evaluated.
 * @param {object} options.config - Agents configuration with optional `reviewBudgets`.
 * @param {boolean} options.force - Forwarded to `evaluateReviewBudget`.
 * @returns {object} The `evaluateReviewBudget` verdict extended with the
 *   resolved `budget` and the computed window `summary`.
 */
function evaluateCurrentReviewBudget({ taskDir, stage, config, force }) {
  const stageBudget = resolveStageReviewBudget(config, stage);
  const timeline = readTimeline(taskDir, 'check-timeline.json');
  const windowSummary = summarizeReviewBudgetWindow({ timeline, stage, now: new Date() });
  const verdict = evaluateReviewBudget({
    budget: stageBudget,
    summary: windowSummary,
    force,
  });

  // The verdict keys come first so `budget`/`summary` always reflect the values
  // computed here.
  return Object.assign({}, verdict, {
    budget: stageBudget,
    summary: windowSummary,
  });
}
|
|
465
|
+
|
|
466
|
+
/**
 * Load a timeline JSON file from a task directory on a best-effort basis.
 *
 * @param {string} taskDir - Directory containing the timeline file.
 * @param {string} fileName - Timeline file name inside `taskDir`.
 * @returns {Array} The parsed array, or an empty array when the file is
 *   missing, unreadable, not valid JSON, or not a JSON array.
 */
function readTimeline(taskDir, fileName) {
  const timelinePath = path.join(taskDir, fileName);

  if (fs.existsSync(timelinePath)) {
    try {
      const raw = fs.readFileSync(timelinePath, 'utf8');
      const parsed = JSON.parse(raw);
      if (Array.isArray(parsed)) {
        return parsed;
      }
    } catch {
      // Deliberately ignored: an unreadable timeline counts as empty.
    }
  }

  return [];
}
|
|
478
|
+
|
|
479
|
+
/**
 * Write the Check artifacts for a run that the review budget gate stopped
 * before any external Checker call.
 *
 * Produces `check.md` and `check.result.json` with a fixed
 * `human_arbitration_required` verdict and a single blocking finding, updates
 * the task status, and appends one orchestration log line.
 *
 * @param {object} options
 * @param {string} options.taskDir - Task directory the artifacts are written to.
 * @param {string} options.taskId - Task identifier stored in the result JSON.
 * @param {object} options.checkContext - Supplies `planSha`, `memorySha`, `riskProfile`.
 * @param {object} options.checkerConfig - Accepted but not read in this function.
 * @param {string} options.checkerPromptSha - Passed to `appendRunnerMetadata`.
 * @param {*} options.cacheKey - Passed to `appendRunnerMetadata`.
 * @param {string} options.reason - Machine-readable reason for the block.
 * @param {string} options.message - Human-readable explanation; used as the finding claim.
 * @param {object} options.budget - Resolved budget (`maxExternalRunsPerStage`, `stageSlaMs` are logged).
 * @param {object} options.summary - Window summary (`elapsedMs`, `providerStarted` are logged).
 * @param {Date} options.startedAt - Run start passed to `buildTiming` for the duration line.
 * @param {object|null} [options.approval=null] - Approval lookup result embedded in the artifacts.
 */
function writeReviewBudgetReturn({
  taskDir,
  taskId,
  checkContext,
  checkerConfig,
  checkerPromptSha,
  cacheKey,
  reason,
  message,
  budget,
  summary,
  startedAt,
  approval = null,
}) {
  // Synthetic check result: no provider ran, so provider and model are placeholders.
  const result = {
    taskId,
    stage: 'Check',
    checkerProvider: 'review-budget',
    checkerModel: 'none',
    planSha: checkContext.planSha,
    memorySha: checkContext.memorySha,
    riskProfile: checkContext.riskProfile,
    verdict: 'human_arbitration_required',
    failureReason: null,
    blockingFindings: 1,
    nonBlockingFindings: 0,
    humanQuestions: 1,
    findings: [
      {
        id: 'F-001',
        severity: 'blocking',
        claimCategory: 'human_decision_required',
        claim: message,
        evidenceRefs: [
          {
            type: 'file',
            ref: 'check-timeline.json',
          },
        ],
        affectedPlanSections: ['Plan/Check orchestration'],
        expectedCorrection: 'Stop the repeated external review loop. Consolidate remaining findings into plan/status/check-resolution, or ask the human to approve one extra external review with --force-review-budget.',
      },
    ],
    reviewBudget: {
      reason,
      budget,
      summary,
      approval,
      forceFlag: '--force-review-budget',
    },
    readyForHumanGate: false,
    createdAt: new Date().toISOString(),
  };
  // Human-readable counterpart of the result JSON. The second heading is
  // Russian for "final assessment".
  // NOTE(review): presumably other tooling matches that heading text — confirm
  // before changing it.
  const markdown = [
    '# Check',
    '',
    '## итоговая оценка',
    '',
    '`human_arbitration_required`',
    '',
    '## Review Budget Gate',
    '',
    message,
    '',
    'The external Checker was not invoked. The framework enforces a bounded Check stage so quality work happens through consolidated remediation instead of repeated provider loops.',
    '',
    '## Budget',
    '',
    '```json',
    JSON.stringify({ reason, budget, summary, approval }, null, 2),
    '```',
    '',
    '## Required decision',
    '',
    '- Consolidate all remaining Check findings into `plan.md`, `status.md`, and `check-resolution.md`, then run one fresh Check after the window resets; or',
    '- Ask the human to approve an extra external review by writing `review-budget-approval.json`, then rerun with `--force-review-budget`.',
    '',
    '## Timing',
    '',
    `- Duration: ${buildTiming(startedAt).durationMs}ms`,
  ].join('\n');

  // Artifacts are written first, then the status, then the orchestration log.
  writeTaskFile(taskDir, 'check.md', appendRunnerMetadata(markdown, checkerPromptSha, cacheKey));
  writeTaskFile(taskDir, 'check.result.json', JSON.stringify(result, null, 2));
  updateStatus(taskDir, {
    stage: 'Human Arbitration',
    routingDecision: `review_budget_blocked:${reason}`,
    checkVerdict: '`human_arbitration_required`',
    checkResult: '- `check.result.json`: current; review budget blocked external Checker invocation',
    supervisorAction: 'Check review budget blocked another external provider loop.',
    nextStep: 'Human Arbitration: write `review-budget-approval.json` before using `--force-review-budget`, or consolidate remaining findings before a fresh Check.',
    humanApproval: 'yes',
  });
  appendOrchestrationLog(taskDir, `Check review budget blocked external checker; reason=${reason}; elapsedMs=${summary.elapsedMs}; providerStarted=${summary.providerStarted}; maxExternalRuns=${budget.maxExternalRunsPerStage}; stageSlaMs=${budget.stageSlaMs}`);
}
|
|
574
|
+
|
|
340
575
|
function refreshTaskManifestAfterCheck(taskDir) {
|
|
341
576
|
const manifest = buildTaskManifest({ taskDir });
|
|
342
577
|
writeTaskManifest(taskDir, manifest);
|
|
@@ -366,6 +601,21 @@ function appendCheckTimeline(taskDir, event) {
|
|
|
366
601
|
writeTaskFile(taskDir, 'check-timeline.json', JSON.stringify(existing, null, 2));
|
|
367
602
|
}
|
|
368
603
|
|
|
604
|
+
/**
 * Append a `review_budget_summary` event to the task's check timeline.
 *
 * The event carries the resolved check-stage budget and a summary of the
 * current window of `check-timeline.json`.
 *
 * @param {string} taskDir - Task directory holding `check-timeline.json`.
 */
function appendReviewBudgetSummary(taskDir) {
  const budget = resolveStageReviewBudget(readAgentsConfig(), 'check');
  const timeline = readTimeline(taskDir, 'check-timeline.json');
  const summary = summarizeReviewBudgetWindow({ timeline, stage: 'check', now: new Date() });

  const summaryEvent = { event: 'review_budget_summary', budget, summary };
  appendCheckTimeline(taskDir, summaryEvent);
}
|
|
618
|
+
|
|
369
619
|
function buildAttemptRecord(packMeta, outcome) {
|
|
370
620
|
return {
|
|
371
621
|
mode: packMeta.mode,
|
|
@@ -733,7 +983,7 @@ function buildCheckerPromptPayload({
|
|
|
733
983
|
};
|
|
734
984
|
}
|
|
735
985
|
|
|
736
|
-
async function runProvider({ checkerConfig, messages, prompt }) {
|
|
986
|
+
async function runProvider({ checkerConfig, messages, prompt, timeoutMs }) {
|
|
737
987
|
if (checkerConfig.provider === 'openai') {
|
|
738
988
|
return runOpenAiChecker({
|
|
739
989
|
apiKey: process.env.OPENAI_API_KEY,
|
|
@@ -750,6 +1000,7 @@ async function runProvider({ checkerConfig, messages, prompt }) {
|
|
|
750
1000
|
reasoningEffort: checkerConfig.reasoningEffort,
|
|
751
1001
|
prompt,
|
|
752
1002
|
cwd: repoRoot,
|
|
1003
|
+
timeoutMs,
|
|
753
1004
|
});
|
|
754
1005
|
}
|
|
755
1006
|
|
package/bin/run-verify.mjs
CHANGED
|
@@ -28,6 +28,12 @@ import {
|
|
|
28
28
|
summarizePackForConsole,
|
|
29
29
|
} from './lib/llm-input-pack-utils.mjs';
|
|
30
30
|
import { recordLlmInputUsage } from './lib/task-manifest-utils.mjs';
|
|
31
|
+
import {
|
|
32
|
+
evaluateReviewBudget,
|
|
33
|
+
readReviewBudgetApproval,
|
|
34
|
+
resolveStageReviewBudget,
|
|
35
|
+
summarizeReviewBudgetWindow,
|
|
36
|
+
} from './lib/review-budget-utils.mjs';
|
|
31
37
|
|
|
32
38
|
function main() {
|
|
33
39
|
runMain().catch((error) => {
|
|
@@ -47,12 +53,15 @@ async function runMain() {
|
|
|
47
53
|
const taskId = path.basename(taskDir);
|
|
48
54
|
const verifierConfig = resolveVerifierConfig(args);
|
|
49
55
|
const force = getFlag(args, 'force', false) === true;
|
|
56
|
+
const forceReviewBudget = getFlag(args, 'force-review-budget', false) === true
|
|
57
|
+
|| getFlag(args, 'force-external-review', false) === true;
|
|
50
58
|
const runStartedAt = new Date();
|
|
51
59
|
appendVerifyTimeline(taskDir, {
|
|
52
60
|
event: 'verify_started',
|
|
53
61
|
mode: verifierConfig.mode,
|
|
54
62
|
provider: verifierConfig.provider,
|
|
55
63
|
model: verifierConfig.model,
|
|
64
|
+
forceReviewBudget,
|
|
56
65
|
});
|
|
57
66
|
const planSha = hashTaskMarkdown(taskDir, 'plan.md');
|
|
58
67
|
const executionSha = hashTaskMarkdown(taskDir, 'execution.md');
|
|
@@ -202,6 +211,101 @@ async function runMain() {
|
|
|
202
211
|
return;
|
|
203
212
|
}
|
|
204
213
|
|
|
214
|
+
const forceApproval = forceReviewBudget
|
|
215
|
+
? readReviewBudgetApproval({ taskDir, stage: 'verify' })
|
|
216
|
+
: { ok: true, approval: null };
|
|
217
|
+
if (!forceApproval.ok) {
|
|
218
|
+
const budget = resolveStageReviewBudget(readAgentsConfig(), 'verify');
|
|
219
|
+
const summary = summarizeReviewBudgetWindow({
|
|
220
|
+
timeline: readTimeline(taskDir, 'verify-timeline.json'),
|
|
221
|
+
stage: 'verify',
|
|
222
|
+
now: new Date(),
|
|
223
|
+
});
|
|
224
|
+
writeVerifyReviewBudgetReturn({
|
|
225
|
+
taskDir,
|
|
226
|
+
taskId,
|
|
227
|
+
verifierConfig,
|
|
228
|
+
verifierRunId,
|
|
229
|
+
planSha,
|
|
230
|
+
executionSha,
|
|
231
|
+
reason: forceApproval.reason,
|
|
232
|
+
message: forceApproval.message,
|
|
233
|
+
budget,
|
|
234
|
+
summary,
|
|
235
|
+
approval: forceApproval,
|
|
236
|
+
});
|
|
237
|
+
appendVerifyTimeline(taskDir, {
|
|
238
|
+
event: 'review_budget_force_denied',
|
|
239
|
+
verdict: 'human_arbitration_required',
|
|
240
|
+
reason: forceApproval.reason,
|
|
241
|
+
message: forceApproval.message,
|
|
242
|
+
budget,
|
|
243
|
+
summary,
|
|
244
|
+
timing: buildTiming(runStartedAt),
|
|
245
|
+
});
|
|
246
|
+
recordLlmInputUsage({
|
|
247
|
+
taskDir,
|
|
248
|
+
stage: 'verify',
|
|
249
|
+
packMeta: promptPayload.pack.meta,
|
|
250
|
+
attempts: [
|
|
251
|
+
...llmInputAttempts,
|
|
252
|
+
buildAttemptRecord(promptPayload.pack.meta, `review_budget_force_denied:${forceApproval.reason}`),
|
|
253
|
+
],
|
|
254
|
+
rerunCount,
|
|
255
|
+
timing: buildTiming(runStartedAt),
|
|
256
|
+
});
|
|
257
|
+
console.log(`Verifier force review denied ${taskId}: human_arbitration_required`);
|
|
258
|
+
console.log(`- reason: ${forceApproval.reason}`);
|
|
259
|
+
return;
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
const reviewBudget = evaluateCurrentReviewBudget({
|
|
263
|
+
taskDir,
|
|
264
|
+
stage: 'verify',
|
|
265
|
+
config: readAgentsConfig(),
|
|
266
|
+
force: forceReviewBudget,
|
|
267
|
+
});
|
|
268
|
+
if (!reviewBudget.ok) {
|
|
269
|
+
writeVerifyReviewBudgetReturn({
|
|
270
|
+
taskDir,
|
|
271
|
+
taskId,
|
|
272
|
+
verifierConfig,
|
|
273
|
+
verifierRunId,
|
|
274
|
+
planSha,
|
|
275
|
+
executionSha,
|
|
276
|
+
reason: reviewBudget.reason,
|
|
277
|
+
message: reviewBudget.message,
|
|
278
|
+
budget: reviewBudget.budget,
|
|
279
|
+
summary: reviewBudget.summary,
|
|
280
|
+
approval: forceApproval,
|
|
281
|
+
});
|
|
282
|
+
appendVerifyTimeline(taskDir, {
|
|
283
|
+
event: 'review_budget_blocked',
|
|
284
|
+
verdict: 'human_arbitration_required',
|
|
285
|
+
reason: reviewBudget.reason,
|
|
286
|
+
message: reviewBudget.message,
|
|
287
|
+
budget: reviewBudget.budget,
|
|
288
|
+
summary: reviewBudget.summary,
|
|
289
|
+
timing: buildTiming(runStartedAt),
|
|
290
|
+
});
|
|
291
|
+
recordLlmInputUsage({
|
|
292
|
+
taskDir,
|
|
293
|
+
stage: 'verify',
|
|
294
|
+
packMeta: promptPayload.pack.meta,
|
|
295
|
+
attempts: [
|
|
296
|
+
...llmInputAttempts,
|
|
297
|
+
buildAttemptRecord(promptPayload.pack.meta, `review_budget_blocked:${reviewBudget.reason}`),
|
|
298
|
+
],
|
|
299
|
+
rerunCount,
|
|
300
|
+
timing: buildTiming(runStartedAt),
|
|
301
|
+
});
|
|
302
|
+
console.log(`Verifier review budget blocked ${taskId}: human_arbitration_required`);
|
|
303
|
+
console.log(`- reason: ${reviewBudget.reason}`);
|
|
304
|
+
console.log(`- elapsedMs: ${reviewBudget.summary.elapsedMs}`);
|
|
305
|
+
console.log(`- providerStarted: ${reviewBudget.summary.providerStarted}`);
|
|
306
|
+
return;
|
|
307
|
+
}
|
|
308
|
+
|
|
205
309
|
try {
|
|
206
310
|
const providerStartedAt = new Date();
|
|
207
311
|
appendVerifyTimeline(taskDir, {
|
|
@@ -221,6 +325,7 @@ async function runMain() {
|
|
|
221
325
|
reasoningEffort: verifierConfig.reasoningEffort,
|
|
222
326
|
prompt: promptPayload.prompt,
|
|
223
327
|
cwd: repoRoot,
|
|
328
|
+
timeoutMs: reviewBudget.budget.providerTimeoutMs,
|
|
224
329
|
});
|
|
225
330
|
appendVerifyTimeline(taskDir, {
|
|
226
331
|
event: 'provider_completed',
|
|
@@ -310,6 +415,7 @@ async function runMain() {
|
|
|
310
415
|
finalEstimatedTokens: finalPack?.meta?.estimatedTokens || null,
|
|
311
416
|
timing: buildTiming(runStartedAt),
|
|
312
417
|
});
|
|
418
|
+
appendReviewBudgetSummary(taskDir);
|
|
313
419
|
appendOrchestrationLog(taskDir, `external CLI verifier completed via ${verifierConfig.provider}; verdict=${verifyResultJson.verdict}; runId=${verifierRunId}`);
|
|
314
420
|
console.log(`Verifier run completed for ${taskId}: ${verifyResultJson.verdict}`);
|
|
315
421
|
console.log(`- verifierRunId: ${verifierRunId}`);
|
|
@@ -319,6 +425,121 @@ async function runMain() {
|
|
|
319
425
|
}
|
|
320
426
|
}
|
|
321
427
|
|
|
428
|
+
/**
 * Resolves the review budget for a stage, summarizes the current timeline
 * window and evaluates the budget against that summary.
 *
 * @param {object} params
 * @param {string} params.taskDir - Task directory that holds the timeline file.
 * @param {string} params.stage - Stage key forwarded to the budget helpers.
 * @param {object} params.config - Agents config handed to `resolveStageReviewBudget`.
 * @param {boolean} params.force - Forwarded unchanged to `evaluateReviewBudget`.
 * @returns {object} The `evaluateReviewBudget` result with `budget` and
 *   `summary` attached (these two keys override same-named keys of the result).
 */
function evaluateCurrentReviewBudget({ taskDir, stage, config, force }) {
  const stageBudget = resolveStageReviewBudget(config, stage);
  // NOTE(review): the timeline file name is fixed to the Verify timeline even
  // though `stage` is a parameter; the only visible caller passes 'verify'.
  const timeline = readTimeline(taskDir, 'verify-timeline.json');
  const windowSummary = summarizeReviewBudgetWindow({ timeline, stage, now: new Date() });
  const evaluation = evaluateReviewBudget({
    budget: stageBudget,
    summary: windowSummary,
    force,
  });
  return Object.assign({}, evaluation, { budget: stageBudget, summary: windowSummary });
}
|
|
441
|
+
|
|
442
|
+
/**
 * Loads a timeline array from a JSON file inside a task directory.
 *
 * Best-effort read: a missing file, an unreadable file, invalid JSON, or a
 * JSON document whose top-level value is not an array all yield `[]`.
 *
 * @param {string} taskDir - Directory that contains the timeline file.
 * @param {string} fileName - Timeline file name relative to `taskDir`.
 * @returns {Array} Parsed timeline entries, or an empty array.
 */
function readTimeline(taskDir, fileName) {
  // Joined outside the try block so invalid path arguments still throw.
  const target = path.join(taskDir, fileName);
  let entries;
  try {
    // A missing file raises here and is handled like any other read failure.
    entries = JSON.parse(fs.readFileSync(target, 'utf8'));
  } catch {
    return [];
  }
  if (!Array.isArray(entries)) {
    return [];
  }
  return entries;
}
|
|
454
|
+
|
|
455
|
+
/**
 * Writes the Verify artifacts for a run that was stopped by the review budget
 * gate, with the verdict fixed to `human_arbitration_required`.
 *
 * Side effects, in order: writes `verify.md`, writes `verify.result.json`,
 * updates the task status through `updateStatusForVerifyResult`, and appends
 * one orchestration log line.
 *
 * @param {object} params
 * @param {string} params.taskDir - Task directory receiving the artifacts.
 * @param {string} params.taskId - Task identifier stored in the result JSON.
 * @param {object} params.verifierConfig - Accepted for call-site symmetry; not read here.
 * @param {string} params.verifierRunId - Run identifier stored in the result JSON.
 * @param {string} params.planSha - Plan hash stored in the result JSON.
 * @param {string} params.executionSha - Execution hash stored in the result JSON.
 * @param {string} params.reason - Gate reason, recorded in markdown, JSON and log.
 * @param {string} params.message - Human-readable gate message; also the finding claim.
 * @param {object} params.budget - Budget object; `maxExternalRunsPerStage` and `stageSlaMs` are logged.
 * @param {object} params.summary - Window summary; `elapsedMs` and `providerStarted` are logged.
 * @param {object|null} [params.approval=null] - Approval lookup result recorded alongside the budget.
 * @returns {void}
 */
function writeVerifyReviewBudgetReturn({
  taskDir,
  taskId,
  verifierConfig,
  verifierRunId,
  planSha,
  executionSha,
  reason,
  message,
  budget,
  summary,
  approval = null,
}) {
  const budgetSnapshot = JSON.stringify({ reason, budget, summary, approval }, null, 2);

  // Joined with '\n' (rather than a template literal) so a nullish `message`
  // renders as an empty line.
  const markdownLines = [];
  markdownLines.push('# Verify', '');
  markdownLines.push('## Verdict', '', '`human_arbitration_required`', '');
  markdownLines.push(
    '## Review Budget Gate',
    '',
    message,
    '',
    'The external Verifier was not invoked. The framework enforces a bounded Verify stage so execution fixes are consolidated instead of repeatedly rechecked by an external provider.',
    '',
  );
  markdownLines.push('## Budget', '', '```json', budgetSnapshot, '```', '');
  markdownLines.push(
    '## Required decision',
    '',
    '- Consolidate remaining Verify findings in `execution.md` / evidence artifacts, then run one fresh Verify after the window resets; or',
    '- Ask the human to approve an extra external review by writing `review-budget-approval.json`, then rerun with `--force-review-budget`.',
  );
  const verifyMarkdown = markdownLines.join('\n');

  // Single synthetic blocking finding that points at the Verify timeline.
  const gateFinding = {
    id: 'V-001',
    severity: 'blocking',
    claimCategory: 'runtime_risk',
    affectedArtifacts: ['verify-timeline.json'],
    claim: message,
    evidenceRefs: [{ type: 'artifact', ref: 'verify-timeline.json' }],
    expectedCorrection: 'Stop the repeated external Verify loop. Consolidate remaining execution fixes or ask the human to approve one extra external review with --force-review-budget.',
  };

  const result = {
    schemaVersion: 1,
    taskId,
    planSha,
    executionSha,
    verificationMode: 'external_cli',
    verifierProvider: 'review-budget',
    verifierModel: 'none',
    verifierRunId,
    verdict: 'human_arbitration_required',
    failureReason: null,
    readyForRetrospective: false,
    counts: { blockingFindings: 1, nonBlockingFindings: 0, questions: 1 },
    findings: [gateFinding],
    reviewBudget: {
      reason,
      budget,
      summary,
      approval,
      forceFlag: '--force-review-budget',
    },
  };

  writeTaskFile(taskDir, 'verify.md', verifyMarkdown);
  writeTaskFile(taskDir, 'verify.result.json', JSON.stringify(result, null, 2));
  updateStatusForVerifyResult(taskDir, result, { reused: false, verifierMode: 'external_cli' });

  const logLine = `Verify review budget blocked external verifier; reason=${reason}; elapsedMs=${summary.elapsedMs}; providerStarted=${summary.providerStarted}; maxExternalRuns=${budget.maxExternalRunsPerStage}; stageSlaMs=${budget.stageSlaMs}`;
  appendOrchestrationLog(taskDir, logLine);
}
|
|
542
|
+
|
|
322
543
|
function buildTiming(startedAt, completedAt = new Date()) {
|
|
323
544
|
return {
|
|
324
545
|
startedAt: startedAt.toISOString(),
|
|
@@ -426,6 +647,21 @@ function appendVerifyTimeline(taskDir, event) {
|
|
|
426
647
|
writeTaskFile(taskDir, 'verify-timeline.json', JSON.stringify(existing, null, 2));
|
|
427
648
|
}
|
|
428
649
|
|
|
650
|
+
/**
 * Appends a `review_budget_summary` event to the Verify timeline.
 *
 * The event carries the Verify-stage budget resolved from the agents config
 * and a summary of the current Verify timeline window, computed at call time.
 *
 * @param {string} taskDir - Task directory whose Verify timeline is read and extended.
 * @returns {void}
 */
function appendReviewBudgetSummary(taskDir) {
  const stage = 'verify';
  const budget = resolveStageReviewBudget(readAgentsConfig(), stage);
  const timeline = readTimeline(taskDir, 'verify-timeline.json');
  const summary = summarizeReviewBudgetWindow({ timeline, stage, now: new Date() });
  appendVerifyTimeline(taskDir, { event: 'review_budget_summary', budget, summary });
}
|
|
664
|
+
|
|
429
665
|
function buildAttemptRecord(packMeta, outcome) {
|
|
430
666
|
return {
|
|
431
667
|
mode: packMeta.mode,
|
|
@@ -18,6 +18,18 @@
|
|
|
18
18
|
"isolatedContext": true,
|
|
19
19
|
"readOnly": true
|
|
20
20
|
},
|
|
21
|
+
"reviewBudgets": {
|
|
22
|
+
"check": {
|
|
23
|
+
"stageSlaMs": 180000,
|
|
24
|
+
"maxExternalRunsPerStage": 1,
|
|
25
|
+
"providerTimeoutMs": 180000
|
|
26
|
+
},
|
|
27
|
+
"verify": {
|
|
28
|
+
"stageSlaMs": 180000,
|
|
29
|
+
"maxExternalRunsPerStage": 1,
|
|
30
|
+
"providerTimeoutMs": 180000
|
|
31
|
+
}
|
|
32
|
+
},
|
|
21
33
|
"checkerProviders": {
|
|
22
34
|
"codex-cli": {
|
|
23
35
|
"command": "${CODEX_CLI_COMMAND}",
|
package/package.json
CHANGED
package/prompts/checker.md
CHANGED
|
@@ -73,6 +73,9 @@ Project-specific context приходит только через task artifacts
|
|
|
73
73
|
25. Если в task artifacts есть `precheck-remediation.md`, Checker должен проверить, что весь checklist был закрыт одним consolidated plan update. Не создавай новый мелкий blocker по одному пункту из старого checklist, если оставшиеся пункты тоже очевидно не закрыты: верни consolidated finding, ссылающийся на `precheck-remediation.md`.
|
|
74
74
|
26. Minor process/evidence polish не должен блокировать Human Gate, если deterministic gates закрыты, acceptance criteria покрыты, scope/risk/security/data correctness не нарушены, а остаток можно безопасно записать как `non_blocking` или human question.
|
|
75
75
|
27. Если plan/task/checker-context показывает golden set/eval/regression fixtures/label cards/ground truth, Checker должен требовать `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases` и `## Harness Boundary`. Golden set без schema/coverage/negative cases/source evidence/non-goals/manual-vs-automated boundary является `return_to_plan`, даже если есть общий текст про expected outputs.
|
|
76
|
+
28. Если remaining issue является процессной ясностью, wording polish или удобством статуса, а план уже содержит executable scope, acceptance, risk gates and verification evidence path, не возвращай `return_to_plan`; запиши как non-blocking note или human question. Цель Check - предотвратить дорогие ошибки до Execute, а не создавать повторные внешние циклы ради косметики.
|
|
77
|
+
29. Если видишь несколько related blockers, объедини их в один consolidated finding с полным checklist. Не выдавай только первый найденный blocker, если следующий внешний Check очевидно найдет соседний.
|
|
78
|
+
30. Если review budget уже требует Human Arbitration, не предлагай `--force-review-budget` как обычный retry. Он допустим только при наличии human-approved `review-budget-approval.json`.
|
|
76
79
|
|
|
77
80
|
## Контракт выхода
|
|
78
81
|
|
package/prompts/planner.md
CHANGED
|
@@ -53,6 +53,8 @@
|
|
|
53
53
|
17. Если Planner знает факт только из conversation context, этот факт нужно перенести в artifact: `brief.md`, `research.md`, `status.md` или `human_decision` evidence. Невидимый контекст не является evidence.
|
|
54
54
|
18. Plan должен назвать risk tier (`R0`-`R5`), execution target and execution budget. Для `R1/R2` можно разрешить fast loop inside approved scope, но обязательно назвать stop rules.
|
|
55
55
|
19. План проверки должен быть ladder-based: micro-verify during Execute, slice-verify before completion and external Verify requirement for closeout/high-risk claims.
|
|
56
|
+
20. После `return_to_plan` Planner обязан выполнить один consolidated remediation pass: закрыть все blocking findings, precheck checklist и obvious adjacent gaps в `plan.md`/`check-resolution.md` до следующего Check. Не запускай внешний Check после единичной мелкой правки, если другие known blockers остаются открыты.
|
|
57
|
+
21. Если Check остановлен review budget gate (`human_arbitration_required` с `reviewBudget.reason`), Planner не должен пытаться обойти это повторным запуском. Нужно либо запросить human approval и записать `review-budget-approval.json` перед `--force-review-budget`, либо укрупнить remediation и вернуться к Check после явного решения.
|
|
56
58
|
20. План должен описывать meaningful slice. Не дроби локальную работу на отдельный Plan/Check/Verify для каждого микрофикса, если риски и target остаются внутри одного approved tier.
|
|
57
59
|
21. Если risk triggers или `checker-context-pack.md` показывают O2/O3 hot-path work, Planner обязан добавить `## Optimization Strategy`: tier, hot paths, expected data size, chosen efficient approach, anti-patterns avoided and bounded optimizer budget/stop rule. Цель gate — предотвратить очевидно неэффективное решение до Execute, а не запускать бесконечную оптимизацию.
|
|
58
60
|
22. Если задача создает golden set/eval/regression fixtures/label cards/ground truth, Planner обязан добавить `## Label Card Schema`, `## Coverage Matrix`, `## Negative / Edge Cases` и `## Harness Boundary`. Golden set должен быть test contract with expected outputs, non-goals, source refs, missing coverage policy and manual-vs-automated boundary.
|
package/prompts/supervisor.md
CHANGED
|
@@ -61,6 +61,9 @@ Supervisor является code-level orchestrator по контракту: rou
|
|
|
61
61
|
27. Если external verifier/checker/browser tooling начинает тратить непропорционально много времени или блокируется окружением, Supervisor обязан остановить loop и вынести human decision: принять internal verify/evidence, запустить external escalation вручную или изменить scope.
|
|
62
62
|
28. Если deterministic Check preflight создал `precheck-remediation.md`, Supervisor не должен запускать повторный Check после точечной правки одного пункта. Сначала Planner/Executor должен закрыть весь checklist или явно отметить not-applicable с evidence/human decision в `plan.md`/`status.md`, затем допускается один fresh Check.
|
|
63
63
|
29. Перед повторным Check после deterministic precheck Supervisor обязан сверить, что `precheck-remediation.md` был использован как consolidated checklist: все listed gates отражены в plan/research/status, а не закрывались по одному через серию precheck loops.
|
|
64
|
+
30. External Check и external Verify имеют stage SLA по умолчанию 3 минуты, максимум один external provider run на фазу и hard provider timeout 3 минуты. Если `check.result.json` или `verify.result.json` вернул `human_arbitration_required` с `reviewBudget.reason`, Supervisor не запускает еще один внешний review без явного human approval, записанного в `review-budget-approval.json`, и `--force-review-budget`.
|
|
65
|
+
31. После `return_to_plan` / `return_to_execute` Supervisor должен требовать один consolidated remediation pass. Запрещено запускать серию внешних Check/Verify для мелких последовательных правок, если их можно закрыть в одном artifact update.
|
|
66
|
+
32. `--force-review-budget` запрещен как обычный retry flag. Он допустим только после human decision и должен быть виден в timeline вместе с approval artifact; без approval artifact команда должна остаться на Human Arbitration.
|
|
64
67
|
|
|
65
68
|
## Hard Gate: Material Scope Expansion -> Brief Reset
|
|
66
69
|
|
package/prompts/verifier.md
CHANGED
|
@@ -47,6 +47,9 @@
|
|
|
47
47
|
23. Если task содержит `precheck-remediation.md`, verifier должен проверить только применимые пункты, которые дошли до Execute. Не возвращай `return_to_execute` из-за старого precheck checklist, если план закрыл его до Human Gate и фактическая реализация покрывает acceptance.
|
|
48
48
|
24. Environment/tooling failures внешнего verifier/browser smoke не должны превращаться в бесконечный `return_to_execute` loop. Если implementation evidence достаточно, но внешний инструмент заблокирован окружением, используй `pass_with_notes` или `human_arbitration_required` согласно риску.
|
|
49
49
|
25. Если `plan.md` содержит golden set/eval/regression fixture sections, verifier должен проверить `Golden Set / Regression Evidence`: label cards follow schema, coverage matrix is filled, negative/edge cases are selected or documented missing, expected outputs/non-goals are inspectable, source refs/snippets exist and manual-vs-automated harness boundary is explicit.
|
|
50
|
+
26. External Verify должен укладываться в bounded review model: один внешний provider run по умолчанию. Если остаются несколько blockers, верни один consolidated `return_to_execute` finding с полным checklist. Minor documentation/status polish не должен запускать новый внешний цикл, если acceptance/evidence покрыты.
|
|
51
|
+
27. Если review budget gate уже вернул `human_arbitration_required`, не предлагай повторный external Verify как обычный следующий шаг. Следующий шаг: consolidated execution fix, internal evidence decision или явный human approval, записанный в `review-budget-approval.json`, перед `--force-review-budget`.
|
|
52
|
+
28. External provider timeout является hard gate. Если verifier не успевает уложиться в 3 минуты, верни `verifier_failed`/`timeout` или `human_arbitration_required`; не предлагай бесконечный retry с тем же input.
|
|
50
53
|
|
|
51
54
|
## Контракт выхода
|
|
52
55
|
|