agentxchain 2.144.0 → 2.146.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard/app.js +3 -0
- package/dashboard/components/notifications.js +127 -0
- package/dashboard/index.html +1 -0
- package/package.json +1 -1
- package/scripts/release-bump.sh +82 -29
- package/scripts/release-downstream-truth.sh +16 -8
- package/src/commands/init.js +66 -31
- package/src/commands/restart.js +18 -3
- package/src/commands/resume.js +38 -0
- package/src/commands/status.js +37 -3
- package/src/commands/step.js +38 -0
- package/src/lib/config.js +4 -1
- package/src/lib/dashboard/actions.js +9 -3
- package/src/lib/dashboard/bridge-server.js +11 -0
- package/src/lib/dashboard/notifications-reader.js +91 -0
- package/src/lib/dashboard/state-reader.js +16 -4
- package/src/lib/governed-state.js +160 -0
- package/src/lib/intake.js +47 -0
- package/src/lib/intent-startup-migration.js +23 -1
- package/src/lib/recent-event-summary.js +2 -0
- package/src/lib/run-events.js +1 -0
- package/src/lib/run-history.js +23 -2
- package/src/lib/run-loop.js +3 -2
- package/src/lib/stale-turn-watchdog.js +380 -0
- package/src/lib/turn-checkpoint.js +4 -0
package/src/lib/run-history.js
CHANGED
|
@@ -42,10 +42,18 @@ export function recordRunHistory(root, state, config, status) {
|
|
|
42
42
|
const filePath = join(root, RUN_HISTORY_PATH);
|
|
43
43
|
mkdirSync(dirname(filePath), { recursive: true });
|
|
44
44
|
|
|
45
|
-
const
|
|
45
|
+
const allHistoryEntries = readJsonlSafe(root, HISTORY_PATH);
|
|
46
46
|
const ledgerEntries = readJsonlSafe(root, LEDGER_PATH);
|
|
47
47
|
|
|
48
|
-
//
|
|
48
|
+
// BUG-50: filter history entries to the current run only.
|
|
49
|
+
// history.jsonl accumulates across runs; using all entries causes fresh
|
|
50
|
+
// run records to inherit parent run phases_completed/total_turns.
|
|
51
|
+
const currentRunId = state?.run_id || null;
|
|
52
|
+
const historyEntries = currentRunId
|
|
53
|
+
? allHistoryEntries.filter(e => e.run_id === currentRunId)
|
|
54
|
+
: allHistoryEntries;
|
|
55
|
+
|
|
56
|
+
// Extract unique phases and roles from THIS run's turn history only
|
|
49
57
|
const phasesCompleted = [...new Set(historyEntries.map(e => e.phase).filter(Boolean))];
|
|
50
58
|
const rolesUsed = [...new Set(historyEntries.map(e => e.role).filter(Boolean))];
|
|
51
59
|
|
|
@@ -84,6 +92,7 @@ export function recordRunHistory(root, state, config, status) {
|
|
|
84
92
|
connector_used: connectorUsed,
|
|
85
93
|
model_used: modelUsed,
|
|
86
94
|
provenance: normalizeRunProvenance(state?.provenance),
|
|
95
|
+
parent_context: buildParentContextSummary(state),
|
|
87
96
|
retrospective: buildRunRetrospective({
|
|
88
97
|
state,
|
|
89
98
|
config,
|
|
@@ -317,6 +326,18 @@ function buildRecentAcceptedTurnSnapshot(entries) {
|
|
|
317
326
|
}));
|
|
318
327
|
}
|
|
319
328
|
|
|
329
|
+
/**
 * Summarize the parent-run linkage recorded on a governed state object.
 *
 * The parent run id is taken from `state.provenance.parent_run_id` first and
 * falls back to `state.inherited_context.parent_run_id`. The remaining fields
 * are read from `state.inherited_context` only.
 *
 * @param {object|null|undefined} state - governed state (may be missing)
 * @returns {{ parent_run_id: string, parent_status: *, parent_completed_at: *, inherited_at: * }|null}
 *   `null` when no parent run id is present; otherwise the summary, with each
 *   missing/falsy field normalized to `null`.
 */
function buildParentContextSummary(state) {
  const inherited = state?.inherited_context;
  const orNull = (value) => value || null;

  const parentRunId = orNull(state?.provenance?.parent_run_id || inherited?.parent_run_id);
  if (parentRunId === null) {
    return null;
  }

  return {
    parent_run_id: parentRunId,
    parent_status: orNull(inherited?.parent_status),
    parent_completed_at: orNull(inherited?.parent_completed_at),
    inherited_at: orNull(inherited?.inherited_at),
  };
}
|
|
340
|
+
|
|
320
341
|
function buildRunRetrospective({ state, config, status, historyEntries }) {
|
|
321
342
|
const acceptedTurns = historyEntries.filter((entry) => entry && typeof entry === 'object');
|
|
322
343
|
const lastAcceptedTurn = acceptedTurns[acceptedTurns.length - 1] || null;
|
package/src/lib/run-loop.js
CHANGED
|
@@ -38,7 +38,7 @@ import { runAdmissionControl } from './admission-control.js';
|
|
|
38
38
|
import { appendFileSync, mkdirSync, writeFileSync } from 'fs';
|
|
39
39
|
import { join, dirname } from 'path';
|
|
40
40
|
import { evaluateApprovalSlaReminders } from './notification-runner.js';
|
|
41
|
-
import {
|
|
41
|
+
import { validatePreemptionMarker } from './intake.js';
|
|
42
42
|
import { buildTimeoutBlockedReason, evaluateTimeouts } from './timeout-evaluator.js';
|
|
43
43
|
|
|
44
44
|
const DEFAULT_MAX_TURNS = 50;
|
|
@@ -139,7 +139,8 @@ export async function runLoop(root, config, callbacks, options = {}) {
|
|
|
139
139
|
// interruption).
|
|
140
140
|
const activeTurnCount = getActiveTurnCount(state);
|
|
141
141
|
if (activeTurnCount === 0) {
|
|
142
|
-
|
|
142
|
+
// BUG-48: validate marker against live intent state before preempting
|
|
143
|
+
const marker = validatePreemptionMarker(root);
|
|
143
144
|
if (marker && marker.priority === 'p0') {
|
|
144
145
|
emit({ type: 'priority_injected', intent_id: marker.intent_id, priority: marker.priority });
|
|
145
146
|
const result = makeResult(false, 'priority_preempted', state, turnsExecuted, turnHistory, gatesApproved, errors);
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Stale Turn Watchdog — BUG-47 + BUG-51
|
|
3
|
+
*
|
|
4
|
+
* Two-tier lazy idle-threshold detection:
|
|
5
|
+
*
|
|
6
|
+
* 1. **Fast startup watchdog (BUG-51):** if an active turn has been dispatched
|
|
7
|
+
* for >30 seconds with NO dispatch-progress file, NO staged result, and NO
|
|
8
|
+
* recent events, it is a "ghost turn" — the subprocess never attached.
|
|
9
|
+
* Transitions to `failed_start` immediately.
|
|
10
|
+
*
|
|
11
|
+
* Design note: the watchdog intentionally keys on turn-scoped
|
|
12
|
+
* dispatch-progress rather than `stdout.log` existence. Dispatch-progress is
|
|
13
|
+
* a framework-authored signal with a stable per-turn contract across runtime
|
|
14
|
+
* wiring; `stdout.log` is adapter-authored visibility output and is allowed
|
|
15
|
+
* to be best-effort. Using dispatch-progress therefore gives us the same
|
|
16
|
+
* operator-facing "no first byte / no worker heartbeat" detection without
|
|
17
|
+
* coupling the watchdog to adapter-specific log-attachment details.
|
|
18
|
+
*
|
|
19
|
+
* 2. **Stale turn watchdog (BUG-47):** if an active turn has status "running"
|
|
20
|
+
* for >N minutes with no event log activity AND no staged result file,
|
|
21
|
+
* report it as stalled.
|
|
22
|
+
*
|
|
23
|
+
* Fires on CLI invocations (status, resume, step --resume) rather than
|
|
24
|
+
* requiring a background daemon.
|
|
25
|
+
*
|
|
26
|
+
* Default thresholds:
|
|
27
|
+
* - Startup watchdog: 30 seconds (configurable via run_loop.startup_watchdog_ms)
|
|
28
|
+
* - local_cli stale turns: 10 minutes
|
|
29
|
+
* - api_proxy stale turns: 5 minutes
|
|
30
|
+
* - Configurable via run_loop.stale_turn_threshold_ms in agentxchain.json
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
34
|
+
import { join } from 'node:path';
|
|
35
|
+
import { safeWriteJson } from './safe-write.js';
|
|
36
|
+
import { emitRunEvent, readRunEvents } from './run-events.js';
|
|
37
|
+
import { getTurnStagingResultPath } from './turn-paths.js';
|
|
38
|
+
import { getDispatchProgressRelativePath } from './dispatch-progress.js';
|
|
39
|
+
|
|
40
|
+
const DEFAULT_LOCAL_CLI_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes
|
|
41
|
+
const DEFAULT_API_PROXY_THRESHOLD_MS = 5 * 60 * 1000; // 5 minutes
|
|
42
|
+
const DEFAULT_STARTUP_WATCHDOG_MS = 30 * 1000; // 30 seconds (BUG-51)
|
|
43
|
+
const LEGACY_STAGING_PATH = '.agentxchain/staging/turn-result.json';
|
|
44
|
+
|
|
45
|
+
/**
 * Find active turns that look stalled (BUG-47).
 *
 * A turn is reported only when every one of these holds:
 *   - its status is "running" or "retrying" and `started_at` parses to a date
 *   - it started at least `threshold` ms ago (threshold from resolveThreshold)
 *   - no staged result is present for the turn
 *   - its dispatch-progress file, if one exists and is readable, shows no
 *     `last_activity_at` inside the threshold window
 *   - no turn-scoped run event falls inside the threshold window
 *
 * Malformed entries in `state.active_turns` (e.g. `null`) are skipped rather
 * than crashing the CLI command that invoked the watchdog. This function does
 * not write state or emit events itself.
 *
 * @param {string} root - project root directory
 * @param {object} state - current governed state
 * @param {object} config - normalized config
 * @returns {Array<{ turn_id: string, role: string, runtime_id: string, running_ms: number, threshold_ms: number, recommendation: string }>}
 */
export function detectStaleTurns(root, state, config) {
  const activeTurns = state?.active_turns || {};
  const stale = [];
  const now = Date.now();

  for (const [turnId, turn] of Object.entries(activeTurns)) {
    // Guard against null / non-object entries before touching turn fields.
    if (!turn || typeof turn !== 'object') continue;
    if (turn.status !== 'running' && turn.status !== 'retrying') continue;
    if (!turn.started_at) continue;

    const startedAt = new Date(turn.started_at).getTime();
    if (Number.isNaN(startedAt)) continue;

    const runningMs = now - startedAt;
    const threshold = resolveThreshold(turn, config);

    if (runningMs < threshold) continue;

    // A staged result means the turn produced output; it is not stalled.
    if (hasTurnScopedStagedResult(root, turnId)) continue;

    const progressPath = join(root, getDispatchProgressRelativePath(turnId));
    if (existsSync(progressPath)) {
      try {
        const progress = JSON.parse(readFileSync(progressPath, 'utf8'));
        const lastActivity = progress.last_activity_at
          ? new Date(progress.last_activity_at).getTime()
          : 0;
        // Activity inside the threshold window proves the turn is still alive.
        if (lastActivity > 0 && (now - lastActivity) < threshold) continue;
      } catch {
        // Best-effort: an unreadable or malformed progress file is treated as
        // "no progress evidence" and the event-log check below decides.
      }
    }

    if (hasRecentTurnEventActivity(root, turnId, startedAt, threshold, now)) continue;

    const runningMinutes = Math.floor(runningMs / 60000);
    stale.push({
      turn_id: turnId,
      role: turn.assigned_role || 'unknown',
      runtime_id: turn.runtime_id || 'unknown',
      running_ms: runningMs,
      threshold_ms: threshold,
      recommendation: `Turn ${turnId} has been running for ${runningMinutes}m with no output. `
        + `Run \`agentxchain reissue-turn --turn ${turnId} --reason stale\` to recover.`,
    });
  }

  return stale;
}
|
|
102
|
+
|
|
103
|
+
/**
 * BUG-51: Detect ghost-dispatched turns — turns whose subprocess never attached.
 *
 * A ghost turn has status "running" or "retrying", started at least the
 * startup-watchdog threshold ago (see resolveStartupThreshold), and has:
 *   - no dispatch-progress file
 *   - no staged result
 *   - no turn-scoped run event inside the threshold window
 *
 * If a dispatch-progress file exists the turn is never a ghost; the slower
 * stale-turn check (detectStaleTurns, BUG-47) covers turns that started and
 * then went silent.
 *
 * Malformed entries in `state.active_turns` (e.g. `null`) are skipped rather
 * than crashing the caller. This function does not write state or emit events
 * itself.
 *
 * @param {string} root - project root directory
 * @param {object} state - current governed state
 * @param {object} config - normalized config
 * @returns {Array<{ turn_id: string, role: string, runtime_id: string, running_ms: number, threshold_ms: number, recommendation: string, failure_type: string }>}
 */
export function detectGhostTurns(root, state, config) {
  const activeTurns = state?.active_turns || {};

  // Pass 1: keep only well-formed, in-flight turns with a parseable start time.
  const candidates = [];
  for (const [turnId, turn] of Object.entries(activeTurns)) {
    if (!turn || typeof turn !== 'object') continue;
    if (turn.status !== 'running' && turn.status !== 'retrying') continue;
    if (!turn.started_at) continue;

    const startedAt = new Date(turn.started_at).getTime();
    if (Number.isNaN(startedAt)) continue;

    candidates.push({ turnId, turn, startedAt });
  }

  if (candidates.length === 0) return [];

  // Pass 2: apply the startup-watchdog evidence checks.
  const ghosts = [];
  const now = Date.now();
  const startupThreshold = resolveStartupThreshold(config);

  for (const { turnId, turn, startedAt } of candidates) {
    const runningMs = now - startedAt;
    if (runningMs < startupThreshold) continue;

    // A dispatch-progress file means the subprocess attached — not a ghost.
    if (existsSync(join(root, getDispatchProgressRelativePath(turnId)))) continue;

    // Unlikely without progress, but a staged result also proves the turn ran.
    if (hasTurnScopedStagedResult(root, turnId)) continue;

    // Any recent turn-scoped event counts as evidence of life.
    if (hasRecentTurnEventActivity(root, turnId, startedAt, startupThreshold, now)) continue;

    const runningSeconds = Math.floor(runningMs / 1000);
    ghosts.push({
      turn_id: turnId,
      role: turn.assigned_role || 'unknown',
      runtime_id: turn.runtime_id || 'unknown',
      running_ms: runningMs,
      threshold_ms: startupThreshold,
      failure_type: 'no_subprocess_output',
      recommendation: `Turn ${turnId} has been dispatched for ${runningSeconds}s with no subprocess output. `
        + `The subprocess likely never started. `
        + `Run \`agentxchain reissue-turn --turn ${turnId} --reason ghost\` to recover.`,
    });
  }

  return ghosts;
}
|
|
168
|
+
|
|
169
|
+
/**
 * Convenience wrapper: run reconcileStaleTurns and hand back only its
 * `stale_turns` list for caller display.
 *
 * All side effects come from reconcileStaleTurns (event emission and, when
 * something was detected, a rewrite of `.agentxchain/state.json`).
 *
 * @param {string} root - project root directory
 * @param {object} state - current governed state
 * @param {object} config - normalized config
 * @returns {Array<object>} stale-turn entries (ghost turns are not included)
 */
export function detectAndEmitStaleTurns(root, state, config) {
  const { stale_turns: staleTurns } = reconcileStaleTurns(root, state, config);
  return staleTurns;
}
|
|
176
|
+
|
|
177
|
+
// ── Reconciliation (exported) and internal helpers ──────────────────────────
|
|
178
|
+
|
|
179
|
+
/**
 * Run both watchdogs against `state`, mark every detected turn, and block the run.
 *
 * Order of effects when anything is detected:
 *   1. each ghost turn (BUG-51) becomes `failed_start`, its budget reservation
 *      is dropped, and a `turn_start_failed` event is emitted
 *   2. each stale turn (BUG-47) that is not also a ghost becomes `stalled`, its
 *      budget reservation is dropped, and a `turn_stalled` event is emitted
 *   3. the updated state (status `blocked`) is written to `.agentxchain/state.json`
 *   4. a single `run_blocked` event is emitted
 *
 * The input `state` object is not mutated; a new state object is returned.
 *
 * @param {string} root - project root directory
 * @param {object} state - current governed state
 * @param {object} config - normalized config
 * @returns {{ stale_turns: Array<object>, ghost_turns: Array<object>, state: *, changed: boolean }}
 *   `changed` is true only when a new state was written.
 */
export function reconcileStaleTurns(root, state, config) {
  // Result shape for every path that leaves the state untouched.
  const unchanged = (staleTurns, ghostTurns) => ({
    stale_turns: staleTurns,
    ghost_turns: ghostTurns,
    state,
    changed: false,
  });

  if (!state || typeof state !== 'object') {
    return unchanged([], []);
  }

  // Ghost detection (fast startup watchdog) runs first; stale detection then
  // excludes anything already classified as a ghost to avoid double-counting.
  const ghostTurns = detectGhostTurns(root, state, config);
  const ghostTurnIds = new Set(ghostTurns.map((ghost) => ghost.turn_id));
  const staleTurns = detectStaleTurns(root, state, config)
    .filter((candidate) => !ghostTurnIds.has(candidate.turn_id));

  if (ghostTurns.length === 0 && staleTurns.length === 0) {
    return unchanged([], []);
  }

  const detectedAt = new Date().toISOString();
  const nextActiveTurns = { ...(state.active_turns || {}) };
  const nextReservations = { ...(state.budget_reservations || {}) };
  const isStillLive = (turn) => Boolean(turn) && (turn.status === 'running' || turn.status === 'retrying');
  let mutated = false;

  // Ghost turns → failed_start.
  for (const ghost of ghostTurns) {
    const ghostId = ghost.turn_id;
    const current = nextActiveTurns[ghostId];
    if (!isStillLive(current)) continue;

    nextActiveTurns[ghostId] = {
      ...current,
      status: 'failed_start',
      failed_start_at: detectedAt,
      failed_start_reason: ghost.failure_type,
      failed_start_previous_status: current.status,
      failed_start_threshold_ms: ghost.threshold_ms,
      failed_start_running_ms: ghost.running_ms,
      recovery_command: `agentxchain reissue-turn --turn ${ghostId} --reason ghost`,
    };
    mutated = true;

    // Release the budget reserved for a turn that never started.
    delete nextReservations[ghostId];

    emitRunEvent(root, 'turn_start_failed', {
      run_id: state.run_id || null,
      phase: state.phase || null,
      status: 'blocked',
      turn: { turn_id: ghostId, role_id: ghost.role },
      payload: {
        running_ms: ghost.running_ms,
        threshold_ms: ghost.threshold_ms,
        runtime_id: ghost.runtime_id,
        failure_type: ghost.failure_type,
        recommendation: ghost.recommendation,
      },
    });
  }

  // Stale turns → stalled.
  for (const stalled of staleTurns) {
    const stalledId = stalled.turn_id;
    const current = nextActiveTurns[stalledId];
    if (!isStillLive(current)) continue;

    nextActiveTurns[stalledId] = {
      ...current,
      status: 'stalled',
      stalled_at: detectedAt,
      stalled_reason: 'no_output_within_threshold',
      stalled_previous_status: current.status,
      stalled_threshold_ms: stalled.threshold_ms,
      stalled_running_ms: stalled.running_ms,
      recovery_command: `agentxchain reissue-turn --turn ${stalledId} --reason stale`,
    };
    mutated = true;

    // Release the budget reserved for the stalled turn as well.
    delete nextReservations[stalledId];

    emitRunEvent(root, 'turn_stalled', {
      run_id: state.run_id || null,
      phase: state.phase || null,
      status: 'blocked',
      turn: { turn_id: stalledId, role_id: stalled.role },
      payload: {
        running_ms: stalled.running_ms,
        threshold_ms: stalled.threshold_ms,
        runtime_id: stalled.runtime_id,
        recommendation: stalled.recommendation,
      },
    });
  }

  if (!mutated) {
    return unchanged(staleTurns, ghostTurns);
  }

  // Ghosts take precedence when choosing the headline turn and category.
  const detected = [...ghostTurns, ...staleTurns];
  const primary = detected[0];
  const hasGhosts = ghostTurns.length > 0;
  const category = hasGhosts ? 'ghost_turn' : 'stale_turn';

  let blockedOn;
  if (detected.length === 1) {
    const kind = primary.failure_type ? 'failed_start' : 'stalled';
    blockedOn = `turn:${kind}:${primary.turn_id}`;
  } else {
    blockedOn = hasGhosts ? 'turns:failed_start' : 'turns:stalled';
  }

  const nextState = {
    ...state,
    status: 'blocked',
    active_turns: nextActiveTurns,
    budget_reservations: nextReservations,
    blocked_on: blockedOn,
    blocked_reason: {
      category,
      blocked_at: detectedAt,
      turn_id: primary.turn_id,
      recovery: {
        typed_reason: category,
        owner: 'human',
        recovery_action: primary.recommendation,
        turn_retained: true,
        detail: primary.recommendation,
      },
    },
  };

  safeWriteJson(join(root, '.agentxchain', 'state.json'), nextState);
  emitRunEvent(root, 'run_blocked', {
    run_id: nextState.run_id || null,
    phase: nextState.phase || null,
    status: 'blocked',
    turn: { turn_id: primary.turn_id, role_id: primary.role },
    payload: {
      category,
      ghost_turn_ids: ghostTurns.map((ghost) => ghost.turn_id),
      stalled_turn_ids: staleTurns.map((stalled) => stalled.turn_id),
    },
  });

  return { stale_turns: staleTurns, ghost_turns: ghostTurns, state: nextState, changed: true };
}
|
|
315
|
+
|
|
316
|
+
/**
 * Pick the stale-turn threshold (ms) for a turn.
 *
 * A positive numeric `config.run_loop.stale_turn_threshold_ms` wins outright.
 * Otherwise the default depends on the type of the turn's configured runtime:
 * `api_proxy` runtimes get DEFAULT_API_PROXY_THRESHOLD_MS, everything else
 * (including unknown runtimes) gets DEFAULT_LOCAL_CLI_THRESHOLD_MS.
 *
 * @param {object} turn - active turn; only `runtime_id` is read
 * @param {object} config - normalized config
 * @returns {number} threshold in milliseconds
 */
function resolveThreshold(turn, config) {
  const override = config?.run_loop?.stale_turn_threshold_ms;
  const hasOverride = typeof override === 'number' && override > 0;
  if (hasOverride) {
    return override;
  }

  const runtimeType = config?.runtimes?.[turn.runtime_id || '']?.type || '';
  return runtimeType === 'api_proxy'
    ? DEFAULT_API_PROXY_THRESHOLD_MS
    : DEFAULT_LOCAL_CLI_THRESHOLD_MS;
}
|
|
334
|
+
|
|
335
|
+
/**
 * Pick the startup-watchdog threshold (ms) used for ghost-turn detection.
 *
 * @param {object} config - normalized config
 * @returns {number} `config.run_loop.startup_watchdog_ms` when it is a positive
 *   number, otherwise DEFAULT_STARTUP_WATCHDOG_MS
 */
function resolveStartupThreshold(config) {
  const override = config?.run_loop?.startup_watchdog_ms;
  const isUsable = typeof override === 'number' && override > 0;
  return isUsable ? override : DEFAULT_STARTUP_WATCHDOG_MS;
}
|
|
342
|
+
|
|
343
|
+
/**
 * Report whether the run-event log shows recent activity for a turn.
 *
 * Reads events via `readRunEvents(root, { limit: 200 })` and scans them from
 * last to first. An event counts as activity when it belongs to `turnId`, is
 * not one of the watchdog's own event types (`turn_stalled`,
 * `turn_start_failed`), has a parseable timestamp that is not earlier than
 * `startedAt`, and is less than `threshold` ms older than `now`.
 *
 * Any error while reading or scanning is treated as "no activity".
 *
 * @param {string} root - project root directory
 * @param {string} turnId - turn to look for
 * @param {number} startedAt - turn start time, epoch ms
 * @param {number} threshold - activity window, ms
 * @param {number} now - reference time, epoch ms
 * @returns {boolean}
 */
function hasRecentTurnEventActivity(root, turnId, startedAt, threshold, now) {
  try {
    const events = readRunEvents(root, { limit: 200 });
    let index = events.length;

    while (index-- > 0) {
      const candidate = events[index];
      if (candidate?.turn?.turn_id !== turnId) continue;

      // The watchdog's own events must not count as proof of life.
      const type = candidate.event_type;
      if (type === 'turn_stalled' || type === 'turn_start_failed') continue;

      const occurredAt = Date.parse(candidate.timestamp || '');
      if (!Number.isFinite(occurredAt)) continue;
      if (occurredAt < startedAt) continue;

      if ((now - occurredAt) < threshold) {
        return true;
      }
    }

    return false;
  } catch {
    return false;
  }
}
|
|
362
|
+
|
|
363
|
+
/**
 * Report whether a staged result exists for the given turn.
 *
 * Checks the turn-scoped staging path first (existence only). Failing that, it
 * falls back to the legacy shared staging file and accepts it only when its
 * parsed `turn_id` equals `turnId`; an unreadable or malformed legacy file
 * counts as "no result".
 *
 * @param {string} root - project root directory
 * @param {string} turnId - turn to look for
 * @returns {boolean}
 */
function hasTurnScopedStagedResult(root, turnId) {
  if (existsSync(join(root, getTurnStagingResultPath(turnId)))) {
    return true;
  }

  const legacyFile = join(root, LEGACY_STAGING_PATH);
  if (existsSync(legacyFile)) {
    try {
      const legacyResult = JSON.parse(readFileSync(legacyFile, 'utf8'));
      return legacyResult?.turn_id === turnId;
    } catch {
      return false;
    }
  }

  return false;
}
|
|
@@ -267,6 +267,10 @@ export function checkpointAcceptedTurn(root, opts = {}) {
|
|
|
267
267
|
if (state) {
|
|
268
268
|
writeState(root, {
|
|
269
269
|
...state,
|
|
270
|
+
// BUG-49: advance accepted_integration_ref to the new checkpoint SHA
|
|
271
|
+
// so drift detection compares against the current checkpoint, not a
|
|
272
|
+
// stale ref from the parent run or the pre-checkpoint state.
|
|
273
|
+
accepted_integration_ref: `git:${checkpointSha}`,
|
|
270
274
|
last_completed_turn: {
|
|
271
275
|
turn_id: entry.turn_id,
|
|
272
276
|
role: entry.role || null,
|