@zhixuan92/multi-model-agent-core 3.8.0 → 3.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/config/schema.d.ts +12 -0
- package/dist/config/schema.d.ts.map +1 -1
- package/dist/config/schema.js +16 -9
- package/dist/config/schema.js.map +1 -1
- package/dist/executors/_shared/findings-schema.d.ts +61 -21
- package/dist/executors/_shared/findings-schema.d.ts.map +1 -1
- package/dist/executors/_shared/findings-schema.js +27 -15
- package/dist/executors/_shared/findings-schema.js.map +1 -1
- package/dist/executors/audit.d.ts.map +1 -1
- package/dist/executors/audit.js +2 -1
- package/dist/executors/audit.js.map +1 -1
- package/dist/executors/debug.d.ts.map +1 -1
- package/dist/executors/debug.js +2 -1
- package/dist/executors/debug.js.map +1 -1
- package/dist/executors/execute-plan.d.ts.map +1 -1
- package/dist/executors/execute-plan.js +2 -1
- package/dist/executors/execute-plan.js.map +1 -1
- package/dist/executors/investigate.d.ts.map +1 -1
- package/dist/executors/investigate.js +4 -2
- package/dist/executors/investigate.js.map +1 -1
- package/dist/executors/review.d.ts.map +1 -1
- package/dist/executors/review.js +2 -1
- package/dist/executors/review.js.map +1 -1
- package/dist/executors/verify.d.ts.map +1 -1
- package/dist/executors/verify.js +2 -1
- package/dist/executors/verify.js.map +1 -1
- package/dist/heartbeat.d.ts +3 -0
- package/dist/heartbeat.d.ts.map +1 -1
- package/dist/heartbeat.js +5 -0
- package/dist/heartbeat.js.map +1 -1
- package/dist/intake/resolve.d.ts.map +1 -1
- package/dist/intake/resolve.js +21 -6
- package/dist/intake/resolve.js.map +1 -1
- package/dist/observability/events.d.ts +192 -23
- package/dist/observability/events.d.ts.map +1 -1
- package/dist/observability/events.js +50 -11
- package/dist/observability/events.js.map +1 -1
- package/dist/review/aggregate-result.d.ts +1 -1
- package/dist/review/aggregate-result.d.ts.map +1 -1
- package/dist/review/aggregate-result.js.map +1 -1
- package/dist/review/diff-review.d.ts +7 -2
- package/dist/review/diff-review.d.ts.map +1 -1
- package/dist/review/diff-review.js +11 -2
- package/dist/review/diff-review.js.map +1 -1
- package/dist/review/quality-only-prompts.d.ts +11 -5
- package/dist/review/quality-only-prompts.d.ts.map +1 -1
- package/dist/review/quality-only-prompts.js +98 -51
- package/dist/review/quality-only-prompts.js.map +1 -1
- package/dist/review/quality-reviewer.d.ts +51 -10
- package/dist/review/quality-reviewer.d.ts.map +1 -1
- package/dist/review/quality-reviewer.js +163 -21
- package/dist/review/quality-reviewer.js.map +1 -1
- package/dist/review/spec-reviewer.d.ts +1 -1
- package/dist/review/spec-reviewer.d.ts.map +1 -1
- package/dist/review/spec-reviewer.js +4 -3
- package/dist/review/spec-reviewer.js.map +1 -1
- package/dist/run-tasks/index.d.ts +1 -0
- package/dist/run-tasks/index.d.ts.map +1 -1
- package/dist/run-tasks/reviewed-lifecycle.d.ts +16 -3
- package/dist/run-tasks/reviewed-lifecycle.d.ts.map +1 -1
- package/dist/run-tasks/reviewed-lifecycle.js +278 -212
- package/dist/run-tasks/reviewed-lifecycle.js.map +1 -1
- package/dist/run-tasks/stage-idle-tracker.d.ts +14 -0
- package/dist/run-tasks/stage-idle-tracker.d.ts.map +1 -0
- package/dist/run-tasks/stage-idle-tracker.js +17 -0
- package/dist/run-tasks/stage-idle-tracker.js.map +1 -0
- package/dist/tool-schemas/audit.d.ts +2 -0
- package/dist/tool-schemas/audit.d.ts.map +1 -1
- package/dist/tool-schemas/debug.d.ts +2 -0
- package/dist/tool-schemas/debug.d.ts.map +1 -1
- package/dist/tool-schemas/delegate.d.ts +2 -0
- package/dist/tool-schemas/delegate.d.ts.map +1 -1
- package/dist/tool-schemas/execute-plan.d.ts +2 -0
- package/dist/tool-schemas/execute-plan.d.ts.map +1 -1
- package/dist/tool-schemas/investigate.d.ts +2 -0
- package/dist/tool-schemas/investigate.d.ts.map +1 -1
- package/dist/tool-schemas/retry.d.ts +2 -0
- package/dist/tool-schemas/retry.d.ts.map +1 -1
- package/dist/tool-schemas/review.d.ts +2 -0
- package/dist/tool-schemas/review.d.ts.map +1 -1
- package/dist/tool-schemas/shared-output.d.ts +2 -0
- package/dist/tool-schemas/shared-output.d.ts.map +1 -1
- package/dist/tool-schemas/shared-output.js +1 -1
- package/dist/tool-schemas/shared-output.js.map +1 -1
- package/dist/tool-schemas/verify.d.ts +2 -0
- package/dist/tool-schemas/verify.d.ts.map +1 -1
- package/dist/types.d.ts +8 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js.map +1 -1
- package/package.json +1 -1
|
@@ -7,6 +7,8 @@ import { pickEscalation, pickReviewer, maxRowsFor, } from '../escalation/policy.
|
|
|
7
7
|
import { runWithFallback, makeSyntheticRunResult, TRANSPORT_FAILURES, isReviewTransportFailure, } from '../escalation/fallback.js';
|
|
8
8
|
import { findModelCapabilities, extractCanonicalModelName } from '../routing/model-profiles.js';
|
|
9
9
|
import { HeartbeatTimer } from '../heartbeat.js';
|
|
10
|
+
import { newStageIdleTracker, snapshotIdle } from './stage-idle-tracker.js';
|
|
11
|
+
import { DEFAULT_TASK_TIMEOUT_MS, DEFAULT_STALL_TIMEOUT_MS } from '../config/schema.js';
|
|
10
12
|
import { runSpecReview } from '../review/spec-reviewer.js';
|
|
11
13
|
import { makeSkippedReviewResult } from '../review/skipped-result.js';
|
|
12
14
|
import { runQualityReview } from '../review/quality-reviewer.js';
|
|
@@ -28,14 +30,14 @@ const READ_ONLY_TOOL_NAMES = new Set([
|
|
|
28
30
|
]);
|
|
29
31
|
export function emptyStats() {
|
|
30
32
|
return {
|
|
31
|
-
implementing: { stage: 'implementing', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null },
|
|
32
|
-
spec_rework: { stage: 'spec_rework', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null },
|
|
33
|
-
quality_rework: { stage: 'quality_rework', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null },
|
|
34
|
-
committing: { stage: 'committing', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null },
|
|
35
|
-
verifying: { stage: 'verifying', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, outcome: null, skipReason: null },
|
|
36
|
-
spec_review: { stage: 'spec_review', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, verdict: null, roundsUsed: null },
|
|
37
|
-
quality_review: { stage: 'quality_review', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, verdict: null, roundsUsed: null },
|
|
38
|
-
diff_review: { stage: 'diff_review', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, verdict: null, roundsUsed: null },
|
|
33
|
+
implementing: { stage: 'implementing', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, maxIdleMs: null, totalIdleMs: null, activityEvents: null },
|
|
34
|
+
spec_rework: { stage: 'spec_rework', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, maxIdleMs: null, totalIdleMs: null, activityEvents: null },
|
|
35
|
+
quality_rework: { stage: 'quality_rework', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, maxIdleMs: null, totalIdleMs: null, activityEvents: null },
|
|
36
|
+
committing: { stage: 'committing', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, maxIdleMs: null, totalIdleMs: null, activityEvents: null },
|
|
37
|
+
verifying: { stage: 'verifying', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, maxIdleMs: null, totalIdleMs: null, activityEvents: null, outcome: null, skipReason: null },
|
|
38
|
+
spec_review: { stage: 'spec_review', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, maxIdleMs: null, totalIdleMs: null, activityEvents: null, verdict: null, roundsUsed: null },
|
|
39
|
+
quality_review: { stage: 'quality_review', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, maxIdleMs: null, totalIdleMs: null, activityEvents: null, verdict: null, roundsUsed: null },
|
|
40
|
+
diff_review: { stage: 'diff_review', entered: false, durationMs: null, costUSD: null, agentTier: null, modelFamily: null, model: null, maxIdleMs: null, totalIdleMs: null, activityEvents: null, verdict: null, roundsUsed: null },
|
|
39
41
|
};
|
|
40
42
|
}
|
|
41
43
|
const FAMILY_MAP = {
|
|
@@ -50,7 +52,7 @@ function modelFamily(model) {
|
|
|
50
52
|
const raw = dash > 0 ? canonical.slice(0, dash) : canonical;
|
|
51
53
|
return FAMILY_MAP[raw.toLowerCase()] ?? 'other';
|
|
52
54
|
}
|
|
53
|
-
export function endBaseStage(stats, name, t0, c0, agent, finalCostUSD) {
|
|
55
|
+
export function endBaseStage(stats, name, t0, c0, agent, finalCostUSD, idle) {
|
|
54
56
|
// Cast through unknown — TS can't narrow stats[name] on a union-typed index;
|
|
55
57
|
// the runtime invariant (set name's slot to its matching variant) is enforced
|
|
56
58
|
// by the helper signature and tested by tests/run-tasks/stage-stats.test.ts.
|
|
@@ -62,9 +64,12 @@ export function endBaseStage(stats, name, t0, c0, agent, finalCostUSD) {
|
|
|
62
64
|
agentTier: agent.tier,
|
|
63
65
|
modelFamily: modelFamily(agent.model),
|
|
64
66
|
model: agent.model,
|
|
67
|
+
maxIdleMs: idle?.maxIdleMs ?? null,
|
|
68
|
+
totalIdleMs: idle?.totalIdleMs ?? null,
|
|
69
|
+
activityEvents: idle?.activityEvents ?? null,
|
|
65
70
|
};
|
|
66
71
|
}
|
|
67
|
-
export function endReviewStage(stats, name, t0, c0, agent, finalCostUSD, verdict, roundsUsed) {
|
|
72
|
+
export function endReviewStage(stats, name, t0, c0, agent, finalCostUSD, idle, verdict, roundsUsed) {
|
|
68
73
|
stats[name] = {
|
|
69
74
|
stage: name,
|
|
70
75
|
entered: true,
|
|
@@ -73,11 +78,14 @@ export function endReviewStage(stats, name, t0, c0, agent, finalCostUSD, verdict
|
|
|
73
78
|
agentTier: agent.tier,
|
|
74
79
|
modelFamily: modelFamily(agent.model),
|
|
75
80
|
model: agent.model,
|
|
81
|
+
maxIdleMs: idle?.maxIdleMs ?? null,
|
|
82
|
+
totalIdleMs: idle?.totalIdleMs ?? null,
|
|
83
|
+
activityEvents: idle?.activityEvents ?? null,
|
|
76
84
|
verdict,
|
|
77
85
|
roundsUsed,
|
|
78
86
|
};
|
|
79
87
|
}
|
|
80
|
-
export function endVerifyStage(stats, t0, c0, agent, finalCostUSD, outcome, skipReason) {
|
|
88
|
+
export function endVerifyStage(stats, t0, c0, agent, finalCostUSD, idle, outcome, skipReason) {
|
|
81
89
|
stats.verifying = {
|
|
82
90
|
stage: 'verifying',
|
|
83
91
|
entered: true,
|
|
@@ -86,6 +94,9 @@ export function endVerifyStage(stats, t0, c0, agent, finalCostUSD, outcome, skip
|
|
|
86
94
|
agentTier: agent.tier,
|
|
87
95
|
modelFamily: modelFamily(agent.model),
|
|
88
96
|
model: agent.model,
|
|
97
|
+
maxIdleMs: idle?.maxIdleMs ?? null,
|
|
98
|
+
totalIdleMs: idle?.totalIdleMs ?? null,
|
|
99
|
+
activityEvents: idle?.activityEvents ?? null,
|
|
89
100
|
outcome,
|
|
90
101
|
skipReason,
|
|
91
102
|
};
|
|
@@ -151,8 +162,9 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
151
162
|
diagnostics?.logger !== undefined ||
|
|
152
163
|
bus !== undefined;
|
|
153
164
|
// Synthesize an onProgress sink when the caller didn't pass one — the
|
|
154
|
-
// heartbeat needs a place to emit heartbeat events
|
|
155
|
-
//
|
|
165
|
+
// heartbeat needs a place to emit heartbeat events. Discards events if
|
|
166
|
+
// there is no external consumer. wrappedOnProgress (defined below) is
|
|
167
|
+
// ALWAYS defined and feeds the stall watchdog regardless of consumers.
|
|
156
168
|
const synthOnProgress = onProgress ?? (() => { });
|
|
157
169
|
const heartbeat = needHeartbeat
|
|
158
170
|
? new HeartbeatTimer((event) => {
|
|
@@ -162,6 +174,7 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
162
174
|
// only via explicit emit calls at lifecycle points; the
|
|
163
175
|
// heartbeat tick no longer infers transitions (P5).
|
|
164
176
|
const sinceLastMs = Date.now() - prevEventAtMs;
|
|
177
|
+
const tickInfo = heartbeat?.getHeartbeatTickInfo();
|
|
165
178
|
emitTaskEvent('heartbeat', {
|
|
166
179
|
elapsed: event.elapsed,
|
|
167
180
|
stage: event.stage,
|
|
@@ -173,6 +186,7 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
173
186
|
text: textEmissionChars,
|
|
174
187
|
cost: event.costUSD,
|
|
175
188
|
idle_ms: sinceLastMs,
|
|
189
|
+
stage_idle_ms: tickInfo?.stageIdleMs ?? sinceLastMs,
|
|
176
190
|
});
|
|
177
191
|
}
|
|
178
192
|
synthOnProgress(taskIndex, event);
|
|
@@ -207,94 +221,108 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
207
221
|
const implModel = resolved.provider.config.model;
|
|
208
222
|
const progressCounters = { filesRead: 0, filesWritten: 0, toolCalls: 0 };
|
|
209
223
|
const verboseStream = verboseStreamRaw;
|
|
210
|
-
let prevEventAtMs =
|
|
224
|
+
let prevEventAtMs = Date.now();
|
|
211
225
|
// Wrap whenever we have ANY consumer for InternalRunnerEvent (heartbeat,
|
|
212
226
|
// verbose stream, or verbose logger). Previously this only wrapped when
|
|
213
227
|
// the caller passed onProgress, so --verbose + HTTP handlers (which don't
|
|
214
228
|
// pass onProgress) silently dropped every tool_call / turn_complete event.
|
|
215
229
|
let textEmissionChars = 0;
|
|
216
|
-
const markRunnerEvent = () => {
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
230
|
+
const markRunnerEvent = () => {
|
|
231
|
+
const now = Date.now();
|
|
232
|
+
const gap = now - stageIdle.stageLastEventMs;
|
|
233
|
+
if (gap > stageIdle.stageMaxIdleMs)
|
|
234
|
+
stageIdle.stageMaxIdleMs = gap;
|
|
235
|
+
if (gap > taskMaxIdleMs)
|
|
236
|
+
taskMaxIdleMs = gap;
|
|
237
|
+
if (gap > 1000)
|
|
238
|
+
stageIdle.stageTotalIdleMs += gap;
|
|
239
|
+
stageIdle.stageActivityCount += 1;
|
|
240
|
+
stageIdle.stageLastEventMs = now;
|
|
241
|
+
lastRunnerEventAtMs = now;
|
|
242
|
+
};
|
|
243
|
+
const wrappedOnProgress = (event) => {
|
|
244
|
+
// Watchdog: fire on every activity event regardless of telemetry consumers.
|
|
245
|
+
// Without this, a no-consumer caller leaves lastRunnerEventAtMs frozen at
|
|
246
|
+
// taskStartMs and the stall watchdog fires at stallTimeoutMs regardless of
|
|
247
|
+
// actual LLM activity.
|
|
248
|
+
if (event.kind === 'turn_start' || event.kind === 'text_emission' || event.kind === 'tool_call' || event.kind === 'turn_complete') {
|
|
249
|
+
markRunnerEvent();
|
|
250
|
+
}
|
|
251
|
+
if (!needHeartbeat)
|
|
252
|
+
return;
|
|
253
|
+
if (event.kind === 'worker_start') {
|
|
254
|
+
emitTaskEvent('worker_start', {
|
|
255
|
+
model: event.model,
|
|
256
|
+
providerType: event.providerType,
|
|
257
|
+
tier: event.tier,
|
|
258
|
+
});
|
|
259
|
+
}
|
|
260
|
+
if (event.kind === 'turn_start') {
|
|
261
|
+
heartbeat?.markEvent('llm');
|
|
262
|
+
prevEventAtMs = Date.now();
|
|
263
|
+
if (verbose) {
|
|
264
|
+
emitTaskEvent('turn_start', {
|
|
265
|
+
turn: event.turn,
|
|
266
|
+
provider: event.provider,
|
|
224
267
|
model: event.model,
|
|
225
|
-
providerType: event.providerType,
|
|
226
|
-
tier: event.tier,
|
|
227
268
|
});
|
|
228
269
|
}
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
270
|
+
}
|
|
271
|
+
if (event.kind === 'text_emission') {
|
|
272
|
+
prevEventAtMs = Date.now();
|
|
273
|
+
heartbeat?.markEvent('text');
|
|
274
|
+
textEmissionChars += event.chars;
|
|
275
|
+
if (verbose && event.chars > 0) {
|
|
276
|
+
const preview = event.preview.length > 60
|
|
277
|
+
? event.preview.slice(0, 57) + '...'
|
|
278
|
+
: event.preview;
|
|
279
|
+
emitTaskEvent('text_emission', {
|
|
280
|
+
chars: event.chars,
|
|
281
|
+
total: textEmissionChars,
|
|
282
|
+
preview,
|
|
283
|
+
});
|
|
240
284
|
}
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
emitTaskEvent('text_emission', {
|
|
249
|
-
chars: event.chars,
|
|
250
|
-
total: textEmissionChars,
|
|
251
|
-
preview,
|
|
252
|
-
});
|
|
253
|
-
}
|
|
285
|
+
}
|
|
286
|
+
if (event.kind === 'tool_call') {
|
|
287
|
+
heartbeat?.markEvent('tool');
|
|
288
|
+
progressCounters.toolCalls++;
|
|
289
|
+
const name = event.toolSummary.split('(')[0];
|
|
290
|
+
if (name === 'readFile' || name === 'grep' || name === 'glob' || name === 'listFiles') {
|
|
291
|
+
progressCounters.filesRead++;
|
|
254
292
|
}
|
|
255
|
-
if (
|
|
256
|
-
|
|
257
|
-
progressCounters.toolCalls++;
|
|
258
|
-
const name = event.toolSummary.split('(')[0];
|
|
259
|
-
if (name === 'readFile' || name === 'grep' || name === 'glob' || name === 'listFiles') {
|
|
260
|
-
progressCounters.filesRead++;
|
|
261
|
-
}
|
|
262
|
-
else if (name === 'writeFile' || name === 'editFile') {
|
|
263
|
-
progressCounters.filesWritten++;
|
|
264
|
-
}
|
|
265
|
-
heartbeat?.updateProgress(progressCounters.filesRead, progressCounters.filesWritten, progressCounters.toolCalls);
|
|
266
|
-
const now = verbose ? Date.now() : 0;
|
|
267
|
-
const sincePrevMs = verbose ? now - prevEventAtMs : 0;
|
|
268
|
-
if (verbose)
|
|
269
|
-
prevEventAtMs = now;
|
|
270
|
-
if (verbose) {
|
|
271
|
-
emitTaskEvent('tool_call', {
|
|
272
|
-
tool: event.toolSummary,
|
|
273
|
-
duration_ms: sincePrevMs,
|
|
274
|
-
});
|
|
275
|
-
}
|
|
293
|
+
else if (name === 'writeFile' || name === 'editFile') {
|
|
294
|
+
progressCounters.filesWritten++;
|
|
276
295
|
}
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
if (verbose) {
|
|
287
|
-
emitTaskEvent('turn_complete', {
|
|
288
|
-
input_tokens: event.cumulativeInputTokens,
|
|
289
|
-
output_tokens: event.cumulativeOutputTokens,
|
|
290
|
-
cost: costUSD,
|
|
291
|
-
duration_ms: turnDurMs,
|
|
292
|
-
provider: resolved.provider.config.model,
|
|
293
|
-
});
|
|
294
|
-
}
|
|
296
|
+
heartbeat?.updateProgress(progressCounters.filesRead, progressCounters.filesWritten, progressCounters.toolCalls);
|
|
297
|
+
const now = Date.now();
|
|
298
|
+
const sincePrevMs = now - prevEventAtMs;
|
|
299
|
+
prevEventAtMs = now;
|
|
300
|
+
if (verbose) {
|
|
301
|
+
emitTaskEvent('tool_call', {
|
|
302
|
+
tool: event.toolSummary,
|
|
303
|
+
duration_ms: sincePrevMs,
|
|
304
|
+
});
|
|
295
305
|
}
|
|
296
306
|
}
|
|
297
|
-
|
|
307
|
+
if (event.kind === 'turn_complete') {
|
|
308
|
+
heartbeat?.markEvent('llm');
|
|
309
|
+
const costUSD = computeCostUSD(event.cumulativeInputTokens, event.cumulativeOutputTokens, resolved.provider.config);
|
|
310
|
+
const savedCostUSD = computeSavedCostUSD(costUSD, event.cumulativeInputTokens, event.cumulativeOutputTokens, task.parentModel);
|
|
311
|
+
heartbeat?.updateCost(costUSD, savedCostUSD);
|
|
312
|
+
const nowTurn = Date.now();
|
|
313
|
+
const turnDurMs = nowTurn - prevEventAtMs;
|
|
314
|
+
prevEventAtMs = nowTurn;
|
|
315
|
+
if (verbose) {
|
|
316
|
+
emitTaskEvent('turn_complete', {
|
|
317
|
+
input_tokens: event.cumulativeInputTokens,
|
|
318
|
+
output_tokens: event.cumulativeOutputTokens,
|
|
319
|
+
cost: costUSD,
|
|
320
|
+
duration_ms: turnDurMs,
|
|
321
|
+
provider: resolved.provider.config.model,
|
|
322
|
+
});
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
};
|
|
298
326
|
const cwd = task.cwd ?? process.cwd();
|
|
299
327
|
const taskStartMs = Date.now();
|
|
300
328
|
// Hard task-level wall-clock cap. Once Date.now() crosses this, no new
|
|
@@ -302,16 +330,31 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
302
330
|
// any in-flight call gets a per-call timeoutMs clamped to remaining
|
|
303
331
|
// budget so it returns its salvage promptly. The user gets *something*
|
|
304
332
|
// back instead of an open-ended retry storm.
|
|
305
|
-
const taskTimeoutMs = task.timeoutMs ?? config.defaults.timeoutMs ??
|
|
333
|
+
const taskTimeoutMs = task.timeoutMs ?? config.defaults.timeoutMs ?? DEFAULT_TASK_TIMEOUT_MS;
|
|
306
334
|
const taskDeadlineMs = taskStartMs + taskTimeoutMs;
|
|
307
335
|
// Stall watchdog: when no LLM / tool / text event has fired for this
|
|
308
336
|
// many ms, the in-flight runner is force-aborted via `stallController`.
|
|
309
337
|
// Catches "model is silently thinking forever" and "transport hung" —
|
|
310
338
|
// both invisible to the wall-clock cap until the very end.
|
|
311
|
-
const stallTimeoutMs = config.defaults.stallTimeoutMs ??
|
|
339
|
+
const stallTimeoutMs = config.defaults.stallTimeoutMs ?? DEFAULT_STALL_TIMEOUT_MS;
|
|
312
340
|
const stallController = new AbortController();
|
|
313
341
|
let lastRunnerEventAtMs = taskStartMs;
|
|
342
|
+
let stageIdle = newStageIdleTracker(taskStartMs);
|
|
343
|
+
let taskMaxIdleMs = 0;
|
|
314
344
|
let stallFired = false;
|
|
345
|
+
// Track the current stage so the terminal transition can pass an accurate
|
|
346
|
+
// `from`. Initialized to 'implementing' (matching HeartbeatTimer.start's
|
|
347
|
+
// initial stage). Updated on every transitionStage call.
|
|
348
|
+
let currentStage = 'implementing';
|
|
349
|
+
function transitionStage(from, to, heartbeatPayload, jsonlPayload) {
|
|
350
|
+
if (heartbeatPayload !== null)
|
|
351
|
+
heartbeat?.transition(heartbeatPayload);
|
|
352
|
+
if (jsonlPayload !== null) {
|
|
353
|
+
emitTaskEvent('stage_change', { from, to, ...jsonlPayload });
|
|
354
|
+
}
|
|
355
|
+
stageIdle = newStageIdleTracker(Date.now());
|
|
356
|
+
currentStage = to;
|
|
357
|
+
}
|
|
315
358
|
const commits = [];
|
|
316
359
|
let commitError;
|
|
317
360
|
let specAttemptIndex = 0;
|
|
@@ -410,18 +453,17 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
410
453
|
const defaultVerification = { status: 'skipped', steps: [], totalDurationMs: 0, skipReason: 'no_command' };
|
|
411
454
|
let latestVerification = defaultVerification;
|
|
412
455
|
async function runVerificationStage() {
|
|
413
|
-
|
|
414
|
-
heartbeat?.setStage('verifying', 4);
|
|
456
|
+
transitionStage('implementing', 'verifying', { stage: 'verifying', stageIndex: 4 }, {});
|
|
415
457
|
const overallVerificationStart = Date.now();
|
|
416
458
|
const verifyCostStart = runningCostUSD();
|
|
417
459
|
const verification = await runVerifyStage({
|
|
418
460
|
cwd,
|
|
419
461
|
verifyCommand: task.verifyCommand,
|
|
420
|
-
taskTimeoutMs: task.timeoutMs ?? config.defaults.timeoutMs ??
|
|
462
|
+
taskTimeoutMs: task.timeoutMs ?? config.defaults.timeoutMs ?? DEFAULT_TASK_TIMEOUT_MS,
|
|
421
463
|
taskStartMs,
|
|
422
464
|
});
|
|
423
465
|
latestVerification = verification;
|
|
424
|
-
endVerifyStage(stats, overallVerificationStart, verifyCostStart, implementerAgentInfo, runningCostUSD(), verification.status === 'passed' ? 'passed'
|
|
466
|
+
endVerifyStage(stats, overallVerificationStart, verifyCostStart, implementerAgentInfo, runningCostUSD(), snapshotIdle(stageIdle), verification.status === 'passed' ? 'passed'
|
|
425
467
|
: verification.status === 'failed' ? 'failed'
|
|
426
468
|
: verification.status === 'skipped' ? 'skipped'
|
|
427
469
|
: 'not_applicable', verification.skipReason ?? null);
|
|
@@ -642,12 +684,12 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
642
684
|
const validCommit = implReport?.commit ?? await repairCommitMetadata(implReport?.commitDiagnostic ?? 'no commit block emitted');
|
|
643
685
|
if (!validCommit)
|
|
644
686
|
return;
|
|
645
|
-
|
|
687
|
+
transitionStage('verifying', 'committing', { stage: 'committing', stageIndex: 7 }, null);
|
|
646
688
|
const commitT0 = Date.now();
|
|
647
689
|
const commitC0 = runningCostUSD();
|
|
648
690
|
const c = await runCommitStage({ cwd, filesWritten: implResult.filesWritten, commit: validCommit });
|
|
649
691
|
commits.push(c);
|
|
650
|
-
endBaseStage(stats, 'committing', commitT0, commitC0, implementerAgentInfo, runningCostUSD());
|
|
692
|
+
endBaseStage(stats, 'committing', commitT0, commitC0, implementerAgentInfo, runningCostUSD(), snapshotIdle(stageIdle));
|
|
651
693
|
}
|
|
652
694
|
}
|
|
653
695
|
// Tracks the final RunResult across every exit path so the `finally` block
|
|
@@ -657,10 +699,16 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
657
699
|
// catch path. Without this, the recorder only fires on 2 of ~5 exit paths.
|
|
658
700
|
let __finalRunResult;
|
|
659
701
|
const __recordOnce = (r) => {
|
|
660
|
-
// Stamp stallTriggered on every exit path.
|
|
661
|
-
// by this scope; surfacing it on the
|
|
662
|
-
// telemetry) distinguish "no progress"
|
|
663
|
-
|
|
702
|
+
// Stamp stallTriggered and taskMaxIdleMs on every exit path.
|
|
703
|
+
// The watchdog flag is owned by this scope; surfacing it on the
|
|
704
|
+
// RunResult lets the caller (and telemetry) distinguish "no progress"
|
|
705
|
+
// aborts from cap exhaustion. taskMaxIdleMs is always populated so the
|
|
706
|
+
// task_completed JSONL event has it regardless of early return.
|
|
707
|
+
const stamped = {
|
|
708
|
+
...r,
|
|
709
|
+
...(stallFired ? { stallTriggered: true } : {}),
|
|
710
|
+
taskMaxIdleMs,
|
|
711
|
+
};
|
|
664
712
|
if (__finalRunResult === undefined)
|
|
665
713
|
__finalRunResult = stamped;
|
|
666
714
|
return stamped;
|
|
@@ -745,7 +793,7 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
745
793
|
latestAttemptedImpl = { tier: initialImpl.usedTier, result: implResult };
|
|
746
794
|
lastNonRejectedImpl = { tier: initialImpl.usedTier, result: implResult };
|
|
747
795
|
implementerHistory.push(initialImpl.usedTier);
|
|
748
|
-
endBaseStage(stats, 'implementing', implT0, implC0, implementerAgentInfo, runningCostUSD());
|
|
796
|
+
endBaseStage(stats, 'implementing', implT0, implC0, implementerAgentInfo, runningCostUSD(), snapshotIdle(stageIdle));
|
|
749
797
|
specAttemptIndex = 1;
|
|
750
798
|
const implReport = implResult.status === 'ok' ? parseStructuredReport(implResult.output) : undefined;
|
|
751
799
|
const workerStatus = extractWorkerStatus(implReport);
|
|
@@ -762,7 +810,7 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
762
810
|
const filePathsSkipped = !filePathsInteracted;
|
|
763
811
|
if (implResult.filesWritten.length === 0 && reviewPolicy !== 'quality_only') {
|
|
764
812
|
if (reviewPolicy === 'off') {
|
|
765
|
-
|
|
813
|
+
transitionStage('verifying', 'terminal', null, {});
|
|
766
814
|
const terminal = resolveOffTerminal({
|
|
767
815
|
...implResult,
|
|
768
816
|
workerStatus,
|
|
@@ -841,7 +889,7 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
841
889
|
};
|
|
842
890
|
}
|
|
843
891
|
if (reviewPolicy === 'off') {
|
|
844
|
-
|
|
892
|
+
transitionStage('verifying', 'terminal', null, {});
|
|
845
893
|
const terminal = resolveOffTerminal({
|
|
846
894
|
...implResult,
|
|
847
895
|
workerStatus,
|
|
@@ -874,10 +922,9 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
874
922
|
if (reviewPolicy === 'diff_only') {
|
|
875
923
|
const diffUnavailable = new Map();
|
|
876
924
|
const diffReviewerTier = pickReviewer({ loop: 'spec', attemptIndex: 0, baseTier: resolved.slot });
|
|
877
|
-
|
|
925
|
+
transitionStage('verifying', 'diff_review', { stage: 'diff_review', stageIndex: 2, reviewRound: 1, attemptCap: 1 }, {});
|
|
878
926
|
const diffReviewT0 = Date.now();
|
|
879
927
|
const diffReviewC0 = runningCostUSD();
|
|
880
|
-
heartbeat?.transition({ stage: 'diff_review', stageIndex: 2, reviewRound: 1, attemptCap: 1 });
|
|
881
928
|
const diffReviewT0_commit = Date.now();
|
|
882
929
|
const diffReviewC0_commit = runningCostUSD();
|
|
883
930
|
const diffCall = await runWithFallback({
|
|
@@ -887,7 +934,7 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
887
934
|
isTransportFailure: (r) => isReviewTransportFailure(r),
|
|
888
935
|
getStatus: (r) => r.status,
|
|
889
936
|
makeSyntheticFailure: () => makeSkippedReviewResult('all_tiers_unavailable'),
|
|
890
|
-
call: (provider) => runDiffReview({ cwd, diff: evidence.fullDiff, diffTruncated: evidence.diffTruncated, verification, worker: { call: (prompt) => provider.run(prompt) } }),
|
|
937
|
+
call: (provider) => runDiffReview({ cwd, diff: evidence.fullDiff, diffTruncated: evidence.diffTruncated, verification, worker: { call: (prompt, opts) => provider.run(prompt, { abortSignal: opts?.abortSignal, timeoutMs: opts?.timeoutMs }) }, taskDeadlineMs, abortSignal: stallController.signal }),
|
|
891
938
|
});
|
|
892
939
|
if (diffCall.fallbackFired) {
|
|
893
940
|
emitFallback({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'diff', attempt: 0, role: 'diffReviewer', assignedTier: diffReviewerTier, usedTier: diffCall.usedTier, reason: diffCall.fallbackReason, triggeringStatus: diffCall.fallbackTriggeringStatus, violatesSeparation: diffCall.usedTier === implementerHistory[implementerHistory.length - 1] });
|
|
@@ -897,8 +944,17 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
897
944
|
emitFallbackUnavailable({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'diff', attempt: 0, role: 'diffReviewer', assignedTier: diffReviewerTier, reason: diffCall.unavailableReason });
|
|
898
945
|
}
|
|
899
946
|
const verdict = diffCall.bothUnavailable || isReviewTransportFailure(diffCall.result) ? makeSkippedReviewResult('all_tiers_unavailable') : diffCall.result;
|
|
900
|
-
emitTaskEvent('review_decision', {
|
|
901
|
-
|
|
947
|
+
emitTaskEvent('review_decision', {
|
|
948
|
+
stage: 'diff_review',
|
|
949
|
+
verdict: 'kind' in verdict
|
|
950
|
+
? (verdict.kind === 'approve' ? 'approved'
|
|
951
|
+
: verdict.kind === 'concerns' ? 'concerns'
|
|
952
|
+
: verdict.kind === 'reject' ? 'changes_required'
|
|
953
|
+
: 'error') // verdict.kind === 'transport_failure'
|
|
954
|
+
: 'skipped',
|
|
955
|
+
round: 1,
|
|
956
|
+
});
|
|
957
|
+
endReviewStage(stats, 'diff_review', diffReviewT0_commit, diffReviewC0_commit, implementerAgentInfo, runningCostUSD(), snapshotIdle(stageIdle),
|
|
902
958
|
// Diff review uses 'approve' | 'concerns' | 'reject' | 'transport_failure' (DiffReviewVerdict),
|
|
903
959
|
// distinct from spec/quality verdicts. Map to the telemetry verdict enum here.
|
|
904
960
|
'kind' in verdict
|
|
@@ -929,7 +985,7 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
929
985
|
let specReviewT0 = 0;
|
|
930
986
|
let specReviewC0 = null;
|
|
931
987
|
if (reviewPolicy !== 'quality_only') {
|
|
932
|
-
|
|
988
|
+
transitionStage('verifying', 'spec_review', { stage: 'spec_review', stageIndex: 2, reviewRound: 1, attemptCap: maxSpecRows }, null);
|
|
933
989
|
const initialReviewerTier = pickReviewer({ loop: 'spec', attemptIndex: 0, baseTier: resolved.slot });
|
|
934
990
|
specReviewT0 = Date.now();
|
|
935
991
|
specReviewC0 = runningCostUSD();
|
|
@@ -940,7 +996,7 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
940
996
|
isTransportFailure: (r) => isReviewTransportFailure(r),
|
|
941
997
|
getStatus: (r) => r.status,
|
|
942
998
|
makeSyntheticFailure: () => makeSkippedReviewResult('all_tiers_unavailable'),
|
|
943
|
-
call: (provider) => runSpecReview(provider, packet, effectiveImplReport, fileContents, implResult.toolCalls, task.planContext, evidence.block),
|
|
999
|
+
call: (provider) => runSpecReview(provider, packet, effectiveImplReport, fileContents, implResult.toolCalls, task.planContext, evidence.block, taskDeadlineMs, stallController.signal, wrappedOnProgress),
|
|
944
1000
|
});
|
|
945
1001
|
if (initialSpecReview.bothUnavailable) {
|
|
946
1002
|
emitFallbackUnavailable({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'spec', attempt: 0, role: 'specReviewer', assignedTier: initialReviewerTier, reason: initialSpecReview.unavailableReason });
|
|
@@ -972,8 +1028,7 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
972
1028
|
const decision = pickEscalation({ loop: 'spec', attemptIndex: specAttemptIndex, baseTier: resolved.slot });
|
|
973
1029
|
if (decision.isEscalated)
|
|
974
1030
|
emitEscalationEvent('spec', specAttemptIndex, decision);
|
|
975
|
-
|
|
976
|
-
heartbeat?.transition({ stage: 'spec_rework', stageIndex: 3, reviewRound: specAttemptIndex, attemptCap: maxSpecRows });
|
|
1031
|
+
transitionStage('spec_review', 'spec_rework', { stage: 'spec_rework', stageIndex: 3, reviewRound: specAttemptIndex, attemptCap: maxSpecRows }, { attempt: specAttemptIndex, attemptCap: maxSpecRows, implTier: decision.impl, reviewerTier: decision.reviewer, escalated: decision.isEscalated });
|
|
977
1032
|
const feedback = specResult.findings.length > 0 ? `\n\n## Spec Review Feedback (round ${specAttemptIndex}):\n${specResult.findings.map(f => `- ${f}`).join('\n')}` : '';
|
|
978
1033
|
const reworkTask = withDoneCondition({ ...task, prompt: `${task.prompt}${feedback}` });
|
|
979
1034
|
const reworkCall = await runWithFallback({ assigned: decision.impl, providerFor, unavailableTiers: specUnavailable, isTransportFailure: (r) => TRANSPORT_FAILURES.has(r.status) && r.capExhausted === undefined, getStatus: (r) => r.status, makeSyntheticFailure: (assigned) => makeSyntheticRunResult(assigned, 'all_tiers_unavailable'), call: (provider) => delegateWithEscalation(reworkTask, [provider], { explicitlyPinned: true, onProgress: wrappedOnProgress, taskDeadlineMs, abortSignal: stallController.signal, assignedTier: decision.impl }) });
|
|
@@ -996,8 +1051,8 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
996
1051
|
const reworkReport = parseStructuredReport(finalImplResult.output);
|
|
997
1052
|
finalImplReport = reworkReport.summary ? reworkReport : buildFallbackImplReport(finalImplResult);
|
|
998
1053
|
fileContents = await readImplementerFileContents(finalImplResult.filesWritten, task.cwd);
|
|
999
|
-
|
|
1000
|
-
const reviewCall = await runWithFallback({ assigned: decision.reviewer, providerFor, unavailableTiers: specUnavailable, isTransportFailure: (r) => isReviewTransportFailure(r), getStatus: (r) => r.status, makeSyntheticFailure: () => makeSkippedReviewResult('all_tiers_unavailable'), call: (provider) => runSpecReview(provider, packet, finalImplReport, fileContents, finalImplResult.toolCalls, task.planContext, evidence.block) });
|
|
1054
|
+
transitionStage('spec_rework', 'spec_review', { stage: 'spec_review', stageIndex: 2, reviewRound: specAttemptIndex + 1, attemptCap: maxSpecRows }, null);
|
|
1055
|
+
const reviewCall = await runWithFallback({ assigned: decision.reviewer, providerFor, unavailableTiers: specUnavailable, isTransportFailure: (r) => isReviewTransportFailure(r), getStatus: (r) => r.status, makeSyntheticFailure: () => makeSkippedReviewResult('all_tiers_unavailable'), call: (provider) => runSpecReview(provider, packet, finalImplReport, fileContents, finalImplResult.toolCalls, task.planContext, evidence.block, taskDeadlineMs, stallController.signal, wrappedOnProgress) });
|
|
1001
1056
|
if (reviewCall.bothUnavailable) {
|
|
1002
1057
|
emitFallbackUnavailable({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'spec', attempt: specAttemptIndex, role: 'specReviewer', assignedTier: decision.reviewer, reason: reviewCall.unavailableReason });
|
|
1003
1058
|
fallbackOverrides.push({ role: 'specReviewer', loop: 'spec', attempt: specAttemptIndex, assigned: decision.reviewer, used: reviewCall.usedTier, reason: reviewCall.unavailableReason, triggeringStatus: reviewCall.fallbackTriggeringStatus, bothUnavailable: true });
|
|
@@ -1042,10 +1097,10 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
1042
1097
|
if (reviewPolicy === 'full' || reviewPolicy === 'quality_only') {
|
|
1043
1098
|
qualityUnavailable = new Map();
|
|
1044
1099
|
const qualityReviewerTier = pickReviewer({ loop: 'quality', attemptIndex: 0, baseTier: resolved.slot });
|
|
1045
|
-
|
|
1100
|
+
transitionStage(currentStage, 'quality_review', { stage: 'quality_review', stageIndex: 4, reviewRound: 1, attemptCap: maxQualityRows }, null);
|
|
1046
1101
|
qualityReviewT0 = Date.now();
|
|
1047
1102
|
qualityReviewC0 = runningCostUSD();
|
|
1048
|
-
const initialQuality = await runWithFallback({ assigned: qualityReviewerTier, providerFor, unavailableTiers: qualityUnavailable, isTransportFailure: (r) => isReviewTransportFailure(r), getStatus: (r) => r.status, makeSyntheticFailure: () => makeSkippedReviewResult('all_tiers_unavailable'), call: (provider) => runQualityReview(provider, packet, specReport ?? finalImplReport, fileContents, finalImplResult.toolCalls, finalImplResult.filesWritten, evidence.block, qualityReviewPromptBuilder, finalImplResult.output) });
|
|
1103
|
+
const initialQuality = await runWithFallback({ assigned: qualityReviewerTier, providerFor, unavailableTiers: qualityUnavailable, isTransportFailure: (r) => isReviewTransportFailure(r), getStatus: (r) => r.status, makeSyntheticFailure: () => makeSkippedReviewResult('all_tiers_unavailable'), call: (provider) => runQualityReview(provider, packet, specReport ?? finalImplReport, fileContents, finalImplResult.toolCalls, finalImplResult.filesWritten, evidence.block, qualityReviewPromptBuilder, finalImplResult.output, taskDeadlineMs, stallController.signal, wrappedOnProgress) });
|
|
1049
1104
|
if (initialQuality.bothUnavailable) {
|
|
1050
1105
|
emitFallbackUnavailable({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'quality', attempt: 0, role: 'qualityReviewer', assignedTier: qualityReviewerTier, reason: initialQuality.unavailableReason });
|
|
1051
1106
|
fallbackOverrides.push({ role: 'qualityReviewer', loop: 'quality', attempt: 0, assigned: qualityReviewerTier, used: initialQuality.usedTier, reason: initialQuality.unavailableReason, triggeringStatus: initialQuality.fallbackTriggeringStatus, bothUnavailable: true });
|
|
@@ -1059,102 +1114,91 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
1059
1114
|
}
|
|
1060
1115
|
}
|
|
1061
1116
|
qualityResult = initialQuality.result;
|
|
1117
|
+
qualityAttemptIndex = 1;
|
|
1118
|
+
if (reviewDidNotReject(qualityResult.status))
|
|
1119
|
+
lastNonRejectedImpl = { tier: implementerHistory[implementerHistory.length - 1], result: finalImplResult };
|
|
1062
1120
|
if (reviewPolicy === 'quality_only') {
|
|
1121
|
+
// Annotation model: emit one quality event per pass with severity-correction
|
|
1122
|
+
// and mean-confidence summary fields. Then we are done — no rework loop.
|
|
1123
|
+
const annotated = qualityResult.annotatedFindings ?? [];
|
|
1124
|
+
const severityCorrections = annotated.filter(f => f.reviewerSeverity !== undefined).length;
|
|
1125
|
+
const meanConfidence = annotated.length > 0
|
|
1126
|
+
? Math.round((annotated.reduce((s, f) => s + f.reviewerConfidence, 0) / annotated.length) * 100) / 100
|
|
1127
|
+
: null;
|
|
1063
1128
|
emitTaskEvent('read_only_review.quality', {
|
|
1064
1129
|
route: routeKey,
|
|
1065
|
-
verdict: qualityResult.status === '
|
|
1066
|
-
: qualityResult.status === '
|
|
1067
|
-
:
|
|
1068
|
-
: 'error',
|
|
1130
|
+
verdict: qualityResult.status === 'annotated' ? 'annotated'
|
|
1131
|
+
: qualityResult.status === 'skipped' ? 'skipped'
|
|
1132
|
+
: 'error',
|
|
1069
1133
|
iterationIndex: 1,
|
|
1070
|
-
findingsReviewed:
|
|
1071
|
-
findingsFlagged:
|
|
1134
|
+
findingsReviewed: annotated.length,
|
|
1135
|
+
findingsFlagged: severityCorrections,
|
|
1136
|
+
severityCorrections,
|
|
1137
|
+
meanConfidence,
|
|
1072
1138
|
durationMs: Date.now() - qualityReviewT0,
|
|
1073
1139
|
costUSD: runningCostUSD() !== null && qualityReviewC0 !== null ? runningCostUSD() - qualityReviewC0 : null,
|
|
1074
1140
|
});
|
|
1075
1141
|
}
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
emitEscalationEvent('quality', qualityAttemptIndex, decision);
|
|
1089
|
-
emitTaskEvent('stage_change', { from: 'quality_review', to: 'quality_rework', attempt: qualityAttemptIndex, attemptCap: maxQualityRows, implTier: decision.impl, reviewerTier: decision.reviewer, escalated: decision.isEscalated });
|
|
1090
|
-
if (reviewPolicy === 'quality_only') {
|
|
1091
|
-
emitTaskEvent('read_only_review.rework', {
|
|
1092
|
-
route: routeKey,
|
|
1093
|
-
iterationIndex: qualityAttemptIndex,
|
|
1094
|
-
triggeringIssues: qualityResult.findings?.length ?? 0,
|
|
1095
|
-
});
|
|
1096
|
-
}
|
|
1097
|
-
heartbeat?.transition({ stage: 'quality_rework', stageIndex: 5, reviewRound: qualityAttemptIndex, attemptCap: maxQualityRows });
|
|
1098
|
-
const feedback = qualityResult.findings.length > 0 ? `\n\n## Quality Review Feedback (round ${qualityAttemptIndex}):\n${qualityResult.findings.map(f => `- ${f}`).join('\n')}` : '';
|
|
1099
|
-
const reworkTask = withDoneCondition({ ...task, prompt: `${task.prompt}${feedback}` });
|
|
1100
|
-
const reworkCall = await runWithFallback({ assigned: decision.impl, providerFor, unavailableTiers: qualityUnavailable, isTransportFailure: (r) => TRANSPORT_FAILURES.has(r.status) && r.capExhausted === undefined, getStatus: (r) => r.status, makeSyntheticFailure: (assigned) => makeSyntheticRunResult(assigned, 'all_tiers_unavailable'), call: (provider) => delegateWithEscalation(reworkTask, [provider], { explicitlyPinned: true, onProgress: wrappedOnProgress, taskDeadlineMs, abortSignal: stallController.signal, assignedTier: decision.impl }) });
|
|
1101
|
-
if (reworkCall.fallbackFired || reworkCall.bothUnavailable)
|
|
1102
|
-
fallbackOverrides.push({ role: 'implementer', loop: 'quality', attempt: qualityAttemptIndex, assigned: decision.impl, used: reworkCall.usedTier, reason: (reworkCall.fallbackReason ?? reworkCall.unavailableReason), triggeringStatus: reworkCall.fallbackTriggeringStatus, bothUnavailable: reworkCall.bothUnavailable });
|
|
1103
|
-
if (reworkCall.fallbackFired)
|
|
1104
|
-
emitFallback({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'quality', attempt: qualityAttemptIndex, role: 'implementer', assignedTier: decision.impl, usedTier: reworkCall.usedTier, reason: reworkCall.fallbackReason, triggeringStatus: reworkCall.fallbackTriggeringStatus, violatesSeparation: false });
|
|
1105
|
-
if (reworkCall.bothUnavailable) {
|
|
1106
|
-
emitFallbackUnavailable({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'quality', attempt: qualityAttemptIndex, role: 'implementer', assignedTier: decision.impl, reason: reworkCall.unavailableReason });
|
|
1142
|
+
else {
|
|
1143
|
+
// Artifact-route gating model — keep the rework loop.
|
|
1144
|
+
let prevQualityFindings = [...(qualityResult.findings ?? [])];
|
|
1145
|
+
while (qualityResult.status === 'changes_required') {
|
|
1146
|
+
if (qualityAttemptIndex >= maxQualityRows)
|
|
1147
|
+
return abortReviewLoop(finalImplResult, 'round_cap', 'review round cap reached before quality rework', 'quality');
|
|
1148
|
+
const currentCostUSD = taskCostUSD();
|
|
1149
|
+
if (currentCostUSD !== null && maxCostUSD !== undefined && currentCostUSD >= 0.8 * maxCostUSD) {
|
|
1150
|
+
emitTaskEvent('cost_check', { stage: 'quality_rework', tripped: true, cost_used_usd: currentCostUSD, cost_cap_usd: maxCostUSD, cost_available: true });
|
|
1151
|
+
return abortReviewLoop(finalImplResult, 'cost_ceiling', 'cost ceiling reached before quality rework', 'quality');
|
|
1152
|
+
}
|
|
1153
|
+
const decision = pickEscalation({ loop: 'quality', attemptIndex: qualityAttemptIndex, baseTier: resolved.slot });
|
|
1107
1154
|
if (decision.isEscalated)
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
emitFallbackUnavailable({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'quality', attempt: qualityAttemptIndex, role: 'qualityReviewer', assignedTier: decision.reviewer, reason: reviewCall.unavailableReason });
|
|
1123
|
-
fallbackOverrides.push({ role: 'qualityReviewer', loop: 'quality', attempt: qualityAttemptIndex, assigned: decision.reviewer, used: reviewCall.usedTier, reason: reviewCall.unavailableReason, triggeringStatus: reviewCall.fallbackTriggeringStatus, bothUnavailable: true });
|
|
1124
|
-
qualityReviewerHistory.push('skipped');
|
|
1125
|
-
}
|
|
1126
|
-
else {
|
|
1127
|
-
qualityReviewerHistory.push(reviewCall.usedTier);
|
|
1128
|
-
if (reviewCall.fallbackFired) {
|
|
1129
|
-
emitFallback({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'quality', attempt: qualityAttemptIndex, role: 'qualityReviewer', assignedTier: decision.reviewer, usedTier: reviewCall.usedTier, reason: reviewCall.fallbackReason, triggeringStatus: reviewCall.fallbackTriggeringStatus, violatesSeparation: reviewCall.usedTier === implementerHistory[implementerHistory.length - 1] });
|
|
1130
|
-
fallbackOverrides.push({ role: 'qualityReviewer', loop: 'quality', attempt: qualityAttemptIndex, assigned: decision.reviewer, used: reviewCall.usedTier, reason: reviewCall.fallbackReason, triggeringStatus: reviewCall.fallbackTriggeringStatus, bothUnavailable: false });
|
|
1155
|
+
emitEscalationEvent('quality', qualityAttemptIndex, decision);
|
|
1156
|
+
transitionStage('quality_review', 'quality_rework', { stage: 'quality_rework', stageIndex: 5, reviewRound: qualityAttemptIndex, attemptCap: maxQualityRows }, { attempt: qualityAttemptIndex, attemptCap: maxQualityRows, implTier: decision.impl, reviewerTier: decision.reviewer, escalated: decision.isEscalated });
|
|
1157
|
+
const feedback = qualityResult.findings.length > 0 ? `\n\n## Quality Review Feedback (round ${qualityAttemptIndex}):\n${qualityResult.findings.map(f => `- ${f}`).join('\n')}` : '';
|
|
1158
|
+
const reworkTask = withDoneCondition({ ...task, prompt: `${task.prompt}${feedback}` });
|
|
1159
|
+
const reworkCall = await runWithFallback({ assigned: decision.impl, providerFor, unavailableTiers: qualityUnavailable, isTransportFailure: (r) => TRANSPORT_FAILURES.has(r.status) && r.capExhausted === undefined, getStatus: (r) => r.status, makeSyntheticFailure: (assigned) => makeSyntheticRunResult(assigned, 'all_tiers_unavailable'), call: (provider) => delegateWithEscalation(reworkTask, [provider], { explicitlyPinned: true, onProgress: wrappedOnProgress, taskDeadlineMs, abortSignal: stallController.signal, assignedTier: decision.impl }) });
|
|
1160
|
+
if (reworkCall.fallbackFired || reworkCall.bothUnavailable)
|
|
1161
|
+
fallbackOverrides.push({ role: 'implementer', loop: 'quality', attempt: qualityAttemptIndex, assigned: decision.impl, used: reworkCall.usedTier, reason: (reworkCall.fallbackReason ?? reworkCall.unavailableReason), triggeringStatus: reworkCall.fallbackTriggeringStatus, bothUnavailable: reworkCall.bothUnavailable });
|
|
1162
|
+
if (reworkCall.fallbackFired)
|
|
1163
|
+
emitFallback({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'quality', attempt: qualityAttemptIndex, role: 'implementer', assignedTier: decision.impl, usedTier: reworkCall.usedTier, reason: reworkCall.fallbackReason, triggeringStatus: reworkCall.fallbackTriggeringStatus, violatesSeparation: false });
|
|
1164
|
+
if (reworkCall.bothUnavailable) {
|
|
1165
|
+
emitFallbackUnavailable({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'quality', attempt: qualityAttemptIndex, role: 'implementer', assignedTier: decision.impl, reason: reworkCall.unavailableReason });
|
|
1166
|
+
if (decision.isEscalated)
|
|
1167
|
+
emitEscalationUnavailable({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'quality', attempt: qualityAttemptIndex, role: 'implementer', wantedTier: decision.impl, reason: reworkCall.unavailableReason });
|
|
1168
|
+
return __recordOnce(adaptForAllTiersUnavailable(reworkCall.result, 'quality', qualityAttemptIndex));
|
|
1131
1169
|
}
|
|
1170
|
+
finalImplResult = reworkCall.result;
|
|
1171
|
+
latestAttemptedImpl = { tier: reworkCall.usedTier, result: finalImplResult };
|
|
1172
|
+
implementerHistory.push(reworkCall.usedTier);
|
|
1173
|
+
const reworkReport = parseStructuredReport(finalImplResult.output);
|
|
1174
|
+
finalImplReport = reworkReport.summary ? reworkReport : buildFallbackImplReport(finalImplResult);
|
|
1175
|
+
fileContents = await readImplementerFileContents(finalImplResult.filesWritten, task.cwd);
|
|
1176
|
+
transitionStage('quality_rework', 'quality_review', { stage: 'quality_review', stageIndex: 4, reviewRound: qualityAttemptIndex + 1, attemptCap: maxQualityRows }, null);
|
|
1177
|
+
const reviewCall = await runWithFallback({ assigned: decision.reviewer, providerFor, unavailableTiers: qualityUnavailable, isTransportFailure: (r) => isReviewTransportFailure(r), getStatus: (r) => r.status, makeSyntheticFailure: () => makeSkippedReviewResult('all_tiers_unavailable'), call: (provider) => runQualityReview(provider, packet, finalImplReport, fileContents, finalImplResult.toolCalls, finalImplResult.filesWritten, evidence.block, qualityReviewPromptBuilder, finalImplResult.output, taskDeadlineMs, stallController.signal, wrappedOnProgress) });
|
|
1178
|
+
if (reviewCall.bothUnavailable) {
|
|
1179
|
+
emitFallbackUnavailable({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'quality', attempt: qualityAttemptIndex, role: 'qualityReviewer', assignedTier: decision.reviewer, reason: reviewCall.unavailableReason });
|
|
1180
|
+
fallbackOverrides.push({ role: 'qualityReviewer', loop: 'quality', attempt: qualityAttemptIndex, assigned: decision.reviewer, used: reviewCall.usedTier, reason: reviewCall.unavailableReason, triggeringStatus: reviewCall.fallbackTriggeringStatus, bothUnavailable: true });
|
|
1181
|
+
qualityReviewerHistory.push('skipped');
|
|
1182
|
+
}
|
|
1183
|
+
else {
|
|
1184
|
+
qualityReviewerHistory.push(reviewCall.usedTier);
|
|
1185
|
+
if (reviewCall.fallbackFired) {
|
|
1186
|
+
emitFallback({ batchId: heartbeatWiring?.batchId ?? '', taskIndex, loop: 'quality', attempt: qualityAttemptIndex, role: 'qualityReviewer', assignedTier: decision.reviewer, usedTier: reviewCall.usedTier, reason: reviewCall.fallbackReason, triggeringStatus: reviewCall.fallbackTriggeringStatus, violatesSeparation: reviewCall.usedTier === implementerHistory[implementerHistory.length - 1] });
|
|
1187
|
+
fallbackOverrides.push({ role: 'qualityReviewer', loop: 'quality', attempt: qualityAttemptIndex, assigned: decision.reviewer, used: reviewCall.usedTier, reason: reviewCall.fallbackReason, triggeringStatus: reviewCall.fallbackTriggeringStatus, bothUnavailable: false });
|
|
1188
|
+
}
|
|
1189
|
+
}
|
|
1190
|
+
qualityResult = reviewCall.result;
|
|
1191
|
+
if (reviewDidNotReject(qualityResult.status))
|
|
1192
|
+
lastNonRejectedImpl = { tier: implementerHistory[implementerHistory.length - 1], result: finalImplResult };
|
|
1193
|
+
qualityAttemptIndex++;
|
|
1194
|
+
if (qualityResult.status === 'approved' || qualityResult.status === 'skipped')
|
|
1195
|
+
break;
|
|
1196
|
+
const currentFindings = [...(qualityResult.findings ?? [])].sort().join('\0');
|
|
1197
|
+
const prevFindings = [...prevQualityFindings].sort().join('\0');
|
|
1198
|
+
if (currentFindings === prevFindings && currentFindings !== '')
|
|
1199
|
+
break;
|
|
1200
|
+
prevQualityFindings = [...(qualityResult.findings ?? [])];
|
|
1132
1201
|
}
|
|
1133
|
-
qualityResult = reviewCall.result;
|
|
1134
|
-
if (reviewPolicy === 'quality_only') {
|
|
1135
|
-
emitTaskEvent('read_only_review.quality', {
|
|
1136
|
-
route: routeKey,
|
|
1137
|
-
verdict: qualityResult.status === 'approved' ? 'approved'
|
|
1138
|
-
: qualityResult.status === 'changes_required' ? 'changes_required'
|
|
1139
|
-
: qualityResult.status === 'skipped' ? 'skipped'
|
|
1140
|
-
: 'error',
|
|
1141
|
-
iterationIndex: qualityAttemptIndex + 1,
|
|
1142
|
-
findingsReviewed: qualityResult.findings?.length ?? 0,
|
|
1143
|
-
findingsFlagged: qualityResult.status === 'changes_required' ? (qualityResult.findings?.length ?? 0) : 0,
|
|
1144
|
-
durationMs: Date.now() - reworkQualityT0,
|
|
1145
|
-
costUSD: runningCostUSD() !== null && reworkQualityC0 !== null ? runningCostUSD() - reworkQualityC0 : null,
|
|
1146
|
-
});
|
|
1147
|
-
}
|
|
1148
|
-
if (reviewDidNotReject(qualityResult.status))
|
|
1149
|
-
lastNonRejectedImpl = { tier: implementerHistory[implementerHistory.length - 1], result: finalImplResult };
|
|
1150
|
-
qualityAttemptIndex++;
|
|
1151
|
-
if (qualityResult.status === 'approved' || qualityResult.status === 'skipped')
|
|
1152
|
-
break;
|
|
1153
|
-
const currentFindings = [...(qualityResult.findings ?? [])].sort().join('\0');
|
|
1154
|
-
const prevFindings = [...prevQualityFindings].sort().join('\0');
|
|
1155
|
-
if (currentFindings === prevFindings && currentFindings !== '')
|
|
1156
|
-
break;
|
|
1157
|
-
prevQualityFindings = [...(qualityResult.findings ?? [])];
|
|
1158
1202
|
}
|
|
1159
1203
|
}
|
|
1160
1204
|
const finalReport = specReport ?? finalImplReport;
|
|
@@ -1180,17 +1224,18 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
1180
1224
|
? 'skipped'
|
|
1181
1225
|
: (['approved', 'changes_required', 'skipped', 'error', 'api_error', 'network_error', 'timeout'].includes(specStatus) ? specStatus : 'error');
|
|
1182
1226
|
if (reviewPolicy !== 'quality_only') {
|
|
1183
|
-
endReviewStage(stats, 'spec_review', specReviewT0, specReviewC0, implementerAgentInfo, runningCostUSD(), specStatus === 'approved' ? 'approved'
|
|
1227
|
+
endReviewStage(stats, 'spec_review', specReviewT0, specReviewC0, implementerAgentInfo, runningCostUSD(), snapshotIdle(stageIdle), specStatus === 'approved' ? 'approved'
|
|
1184
1228
|
: specStatus === 'changes_required' ? 'changes_required'
|
|
1185
1229
|
: specStatus === 'skipped' ? 'skipped'
|
|
1186
1230
|
: specStatus === 'not_applicable' ? 'not_applicable'
|
|
1187
1231
|
: 'error', specAttemptIndex - 1);
|
|
1188
1232
|
}
|
|
1189
1233
|
const qualityAggregateStatus = qualityResult.status;
|
|
1190
|
-
endReviewStage(stats, 'quality_review', qualityReviewT0, qualityReviewC0, implementerAgentInfo, runningCostUSD(), qualityResult.status === 'approved' ? 'approved'
|
|
1234
|
+
endReviewStage(stats, 'quality_review', qualityReviewT0, qualityReviewC0, implementerAgentInfo, runningCostUSD(), snapshotIdle(stageIdle), qualityResult.status === 'approved' ? 'approved'
|
|
1191
1235
|
: qualityResult.status === 'changes_required' ? 'changes_required'
|
|
1192
|
-
: qualityResult.status === '
|
|
1193
|
-
: '
|
|
1236
|
+
: qualityResult.status === 'annotated' ? 'annotated'
|
|
1237
|
+
: qualityResult.status === 'skipped' ? 'skipped'
|
|
1238
|
+
: 'error', qualityAttemptIndex - 1);
|
|
1194
1239
|
const aggregated = aggregateResult(finalReport, specReport, qualityResult.report, specAggregateStatus, qualityAggregateStatus);
|
|
1195
1240
|
// File artifact verification: check whether output targets exist on disk after all work.
|
|
1196
1241
|
// Only applies when status is ok; non-ok statuses skip verification entirely.
|
|
@@ -1205,8 +1250,8 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
1205
1250
|
: finalImplResult.status === 'ok' && fileArtifactsMissing
|
|
1206
1251
|
? 'incomplete'
|
|
1207
1252
|
: finalImplResult.status;
|
|
1208
|
-
const specEnvelopeStatus = (specStatus === 'api_error' || specStatus === 'network_error' || specStatus === 'timeout' ? 'error' : specStatus);
|
|
1209
|
-
const qualityEnvelopeStatus = qualityResult.status === 'api_error' || qualityResult.status === 'network_error' || qualityResult.status === 'timeout' ? 'error' : qualityResult.status;
|
|
1253
|
+
const specEnvelopeStatus = (specStatus === 'api_error' || specStatus === 'network_error' || specStatus === 'timeout' || specStatus === 'api_aborted' ? 'error' : specStatus);
|
|
1254
|
+
const qualityEnvelopeStatus = qualityResult.status === 'api_error' || qualityResult.status === 'network_error' || qualityResult.status === 'timeout' || qualityResult.status === 'api_aborted' ? 'error' : qualityResult.status;
|
|
1210
1255
|
const runResult = {
|
|
1211
1256
|
...finalImplResult,
|
|
1212
1257
|
status: finalStatus,
|
|
@@ -1238,10 +1283,9 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
1238
1283
|
emitTaskEvent('read_only_review.terminal', {
|
|
1239
1284
|
route: routeKey,
|
|
1240
1285
|
roundsUsed: qualityAttemptIndex,
|
|
1241
|
-
finalQualityVerdict: qualityResult.status === '
|
|
1242
|
-
: qualityResult.status === '
|
|
1243
|
-
:
|
|
1244
|
-
: 'error',
|
|
1286
|
+
finalQualityVerdict: qualityResult.status === 'annotated' ? 'annotated'
|
|
1287
|
+
: qualityResult.status === 'skipped' ? 'skipped'
|
|
1288
|
+
: 'error',
|
|
1245
1289
|
costUSD: taskCostUSD(),
|
|
1246
1290
|
durationMs: Date.now() - taskStartMs,
|
|
1247
1291
|
});
|
|
@@ -1276,8 +1320,30 @@ export async function executeReviewedLifecycle(task, resolved, config, taskIndex
|
|
|
1276
1320
|
});
|
|
1277
1321
|
}
|
|
1278
1322
|
catch { /* silent */ }
|
|
1323
|
+
// NEW in v3.9.0: local JSONL emit. Distinct from cloud — local is
|
|
1324
|
+
// for verbose/observability consumers; cloud is for telemetry sink.
|
|
1325
|
+
try {
|
|
1326
|
+
const r = __finalRunResult;
|
|
1327
|
+
emitTaskEvent('task_completed', {
|
|
1328
|
+
status: r.status,
|
|
1329
|
+
workerStatus: r.workerStatus ?? null,
|
|
1330
|
+
turns: r.turns,
|
|
1331
|
+
durationMs: r.durationMs ?? null,
|
|
1332
|
+
filesRead: r.filesRead?.length ?? 0,
|
|
1333
|
+
filesWritten: r.filesWritten?.length ?? 0,
|
|
1334
|
+
toolCalls: r.toolCalls?.length ?? 0,
|
|
1335
|
+
inputTokens: r.usage.inputTokens,
|
|
1336
|
+
outputTokens: r.usage.outputTokens,
|
|
1337
|
+
costUSD: r.usage.costUSD,
|
|
1338
|
+
taskMaxIdleMs: r.taskMaxIdleMs ?? null,
|
|
1339
|
+
stallTriggered: r.stallTriggered ?? false,
|
|
1340
|
+
// JSON-stringify so verbose-stream primitives check passes
|
|
1341
|
+
stages: JSON.stringify(r.stageStats ?? emptyStats()),
|
|
1342
|
+
});
|
|
1343
|
+
}
|
|
1344
|
+
catch { /* silent — never break the user task */ }
|
|
1279
1345
|
}
|
|
1280
|
-
|
|
1346
|
+
transitionStage(currentStage, 'terminal', { stage: 'terminal', stageIndex: 8 }, null);
|
|
1281
1347
|
heartbeat?.stop();
|
|
1282
1348
|
clearInterval(stallWatchdogInterval);
|
|
1283
1349
|
}
|