agentxchain 2.144.0 → 2.146.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -42,10 +42,18 @@ export function recordRunHistory(root, state, config, status) {
42
42
  const filePath = join(root, RUN_HISTORY_PATH);
43
43
  mkdirSync(dirname(filePath), { recursive: true });
44
44
 
45
- const historyEntries = readJsonlSafe(root, HISTORY_PATH);
45
+ const allHistoryEntries = readJsonlSafe(root, HISTORY_PATH);
46
46
  const ledgerEntries = readJsonlSafe(root, LEDGER_PATH);
47
47
 
48
- // Extract unique phases and roles from turn history
48
+ // BUG-50: filter history entries to the current run only.
49
+ // history.jsonl accumulates across runs; using all entries causes fresh
50
+ // run records to inherit parent run phases_completed/total_turns.
51
+ const currentRunId = state?.run_id || null;
52
+ const historyEntries = currentRunId
53
+ ? allHistoryEntries.filter(e => e.run_id === currentRunId)
54
+ : allHistoryEntries;
55
+
56
+ // Extract unique phases and roles from THIS run's turn history only
49
57
  const phasesCompleted = [...new Set(historyEntries.map(e => e.phase).filter(Boolean))];
50
58
  const rolesUsed = [...new Set(historyEntries.map(e => e.role).filter(Boolean))];
51
59
 
@@ -84,6 +92,7 @@ export function recordRunHistory(root, state, config, status) {
84
92
  connector_used: connectorUsed,
85
93
  model_used: modelUsed,
86
94
  provenance: normalizeRunProvenance(state?.provenance),
95
+ parent_context: buildParentContextSummary(state),
87
96
  retrospective: buildRunRetrospective({
88
97
  state,
89
98
  config,
@@ -317,6 +326,18 @@ function buildRecentAcceptedTurnSnapshot(entries) {
317
326
  }));
318
327
  }
319
328
 
329
/**
 * Summarise the parent-run linkage recorded on a run's state.
 *
 * The parent run id is taken from `state.provenance.parent_run_id`, falling
 * back to `state.inherited_context.parent_run_id`. The remaining fields are
 * copied from `state.inherited_context`; any that are missing or falsy are
 * reported as `null`.
 *
 * @param {object|null|undefined} state - run state to inspect
 * @returns {{ parent_run_id: string, parent_status: *, parent_completed_at: *, inherited_at: * } | null}
 *   `null` when no parent run id is recorded.
 */
function buildParentContextSummary(state) {
  const inherited = state?.inherited_context;
  const parentRunId = state?.provenance?.parent_run_id || inherited?.parent_run_id;
  if (!parentRunId) {
    return null;
  }

  // Falsy inherited values collapse to null.
  const inheritedField = (key) => inherited?.[key] || null;

  return {
    parent_run_id: parentRunId,
    parent_status: inheritedField('parent_status'),
    parent_completed_at: inheritedField('parent_completed_at'),
    inherited_at: inheritedField('inherited_at'),
  };
}
340
+
320
341
  function buildRunRetrospective({ state, config, status, historyEntries }) {
321
342
  const acceptedTurns = historyEntries.filter((entry) => entry && typeof entry === 'object');
322
343
  const lastAcceptedTurn = acceptedTurns[acceptedTurns.length - 1] || null;
@@ -38,7 +38,7 @@ import { runAdmissionControl } from './admission-control.js';
38
38
  import { appendFileSync, mkdirSync, writeFileSync } from 'fs';
39
39
  import { join, dirname } from 'path';
40
40
  import { evaluateApprovalSlaReminders } from './notification-runner.js';
41
- import { readPreemptionMarker } from './intake.js';
41
+ import { validatePreemptionMarker } from './intake.js';
42
42
  import { buildTimeoutBlockedReason, evaluateTimeouts } from './timeout-evaluator.js';
43
43
 
44
44
  const DEFAULT_MAX_TURNS = 50;
@@ -139,7 +139,8 @@ export async function runLoop(root, config, callbacks, options = {}) {
139
139
  // interruption).
140
140
  const activeTurnCount = getActiveTurnCount(state);
141
141
  if (activeTurnCount === 0) {
142
- const marker = readPreemptionMarker(root);
142
+ // BUG-48: validate marker against live intent state before preempting
143
+ const marker = validatePreemptionMarker(root);
143
144
  if (marker && marker.priority === 'p0') {
144
145
  emit({ type: 'priority_injected', intent_id: marker.intent_id, priority: marker.priority });
145
146
  const result = makeResult(false, 'priority_preempted', state, turnsExecuted, turnHistory, gatesApproved, errors);
@@ -0,0 +1,380 @@
1
+ /**
2
+ * Stale Turn Watchdog — BUG-47 + BUG-51
3
+ *
4
+ * Two-tier lazy idle-threshold detection:
5
+ *
6
+ * 1. **Fast startup watchdog (BUG-51):** if an active turn has been dispatched
7
+ * for >30 seconds with NO dispatch-progress file, NO staged result, and NO
8
+ * recent events, it is a "ghost turn" — the subprocess never attached.
9
+ * Transitions to `failed_start` immediately.
10
+ *
11
+ * Design note: the watchdog intentionally keys on turn-scoped
12
+ * dispatch-progress rather than `stdout.log` existence. Dispatch-progress is
13
+ * a framework-authored signal with a stable per-turn contract across runtime
14
+ * wiring; `stdout.log` is adapter-authored visibility output and is allowed
15
+ * to be best-effort. Using dispatch-progress therefore gives us the same
16
+ * operator-facing "no first byte / no worker heartbeat" detection without
17
+ * coupling the watchdog to adapter-specific log-attachment details.
18
+ *
19
+ * 2. **Stale turn watchdog (BUG-47):** if an active turn has status "running"
20
+ * for >N minutes with no event log activity AND no staged result file,
21
+ * report it as stalled.
22
+ *
23
+ * Fires on CLI invocations (status, resume, step --resume) rather than
24
+ * requiring a background daemon.
25
+ *
26
+ * Default thresholds:
27
+ * - Startup watchdog: 30 seconds (configurable via run_loop.startup_watchdog_ms)
28
+ * - local_cli stale turns: 10 minutes
29
+ * - api_proxy stale turns: 5 minutes
30
+ * - Configurable via run_loop.stale_turn_threshold_ms in agentxchain.json
31
+ */
32
+
33
+ import { existsSync, readFileSync } from 'node:fs';
34
+ import { join } from 'node:path';
35
+ import { safeWriteJson } from './safe-write.js';
36
+ import { emitRunEvent, readRunEvents } from './run-events.js';
37
+ import { getTurnStagingResultPath } from './turn-paths.js';
38
+ import { getDispatchProgressRelativePath } from './dispatch-progress.js';
39
+
40
+ const DEFAULT_LOCAL_CLI_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes — stale-turn fallback for any runtime type other than api_proxy
41
+ const DEFAULT_API_PROXY_THRESHOLD_MS = 5 * 60 * 1000; // 5 minutes — stale-turn fallback when the runtime type is api_proxy
42
+ const DEFAULT_STARTUP_WATCHDOG_MS = 30 * 1000; // 30 seconds (BUG-51) — used when run_loop.startup_watchdog_ms is not a positive number
43
+ const LEGACY_STAGING_PATH = '.agentxchain/staging/turn-result.json'; // shared staging file; counts for a turn only when its turn_id field matches
44
+
45
/**
 * Stale turn watchdog (BUG-47): list active turns that have gone silent.
 *
 * A turn is reported only when ALL of the following hold:
 *   - its entry is an object with status "running" or "retrying" and a
 *     `started_at` value that parses as a date
 *   - it has been running at least as long as the resolved stale threshold
 *   - no staged result exists for it
 *   - its dispatch-progress file, if present and parseable, records no
 *     `last_activity_at` inside the threshold window
 *   - the run event log holds no recent activity for it
 *
 * This function only detects; it does not modify `state` or emit events.
 *
 * @param {string} root - project root directory
 * @param {object} state - current governed state (reads `state.active_turns`)
 * @param {object} config - normalized config
 * @returns {Array<{ turn_id: string, role: string, runtime_id: string, running_ms: number, threshold_ms: number, recommendation: string }>}
 */
export function detectStaleTurns(root, state, config) {
  const activeTurns = state?.active_turns || {};
  const stale = [];
  const now = Date.now();

  for (const [turnId, turn] of Object.entries(activeTurns)) {
    // state.json can be hand-edited or partially written; a null or
    // non-object entry must not crash the scan of the remaining turns.
    if (!turn || typeof turn !== 'object') continue;
    if (turn.status !== 'running' && turn.status !== 'retrying') continue;
    if (!turn.started_at) continue;

    const startedAt = new Date(turn.started_at).getTime();
    if (Number.isNaN(startedAt)) continue;

    const runningMs = now - startedAt;
    const threshold = resolveThreshold(turn, config);

    if (runningMs < threshold) continue;

    // A staged result means the turn produced output; it is not stale.
    if (hasTurnScopedStagedResult(root, turnId)) continue;

    const progressPath = join(root, getDispatchProgressRelativePath(turnId));
    if (existsSync(progressPath)) {
      try {
        const progress = JSON.parse(readFileSync(progressPath, 'utf8'));
        const lastActivity = progress.last_activity_at
          ? new Date(progress.last_activity_at).getTime()
          : 0;
        // Activity inside the threshold window means the worker is still alive.
        if (lastActivity > 0 && (now - lastActivity) < threshold) continue;
      } catch {
        // Best-effort: an unreadable or malformed progress file is treated
        // as "no progress information" and the remaining checks decide.
      }
    }

    if (hasRecentTurnEventActivity(root, turnId, startedAt, threshold, now)) continue;

    const runningMinutes = Math.floor(runningMs / 60000);
    stale.push({
      turn_id: turnId,
      role: turn.assigned_role || 'unknown',
      runtime_id: turn.runtime_id || 'unknown',
      running_ms: runningMs,
      threshold_ms: threshold,
      recommendation: `Turn ${turnId} has been running for ${runningMinutes}m with no output. `
        + `Run \`agentxchain reissue-turn --turn ${turnId} --reason stale\` to recover.`,
    });
  }

  return stale;
}
102
+
103
/**
 * Startup watchdog (BUG-51): list "ghost" turns whose subprocess apparently
 * never attached.
 *
 * A turn is reported only when ALL of the following hold:
 *   - its entry is an object with status "running" or "retrying" and a
 *     `started_at` value that parses as a date
 *   - it has been dispatched at least as long as the startup threshold
 *   - no dispatch-progress file exists for it
 *   - no staged result exists for it
 *   - the run event log holds no recent activity for it inside the
 *     startup-threshold window
 *
 * This is a stricter, faster check than detectStaleTurns (BUG-47). It only
 * detects; it does not modify `state` or emit events.
 *
 * @param {string} root - project root directory
 * @param {object} state - current governed state (reads `state.active_turns`)
 * @param {object} config - normalized config
 * @returns {Array<{ turn_id: string, role: string, runtime_id: string, running_ms: number, threshold_ms: number, recommendation: string, failure_type: string }>}
 */
export function detectGhostTurns(root, state, config) {
  const activeTurns = state?.active_turns || {};
  const ghosts = [];
  const now = Date.now();
  const startupThreshold = resolveStartupThreshold(config);

  for (const [turnId, turn] of Object.entries(activeTurns)) {
    // state.json can be hand-edited or partially written; a null or
    // non-object entry must not crash the scan of the remaining turns.
    if (!turn || typeof turn !== 'object') continue;
    if (turn.status !== 'running' && turn.status !== 'retrying') continue;
    if (!turn.started_at) continue;

    const startedAt = new Date(turn.started_at).getTime();
    if (Number.isNaN(startedAt)) continue;

    const runningMs = now - startedAt;
    if (runningMs < startupThreshold) continue;

    // A dispatch-progress file means the subprocess attached, so this is not
    // a ghost turn; the stale-turn watchdog covers it if it later goes silent.
    const progressPath = join(root, getDispatchProgressRelativePath(turnId));
    if (existsSync(progressPath)) continue;

    // A staged result without progress is unlikely, but it still proves output.
    if (hasTurnScopedStagedResult(root, turnId)) continue;

    // Any recent turn-scoped event also counts as a sign of life.
    if (hasRecentTurnEventActivity(root, turnId, startedAt, startupThreshold, now)) continue;

    const runningSeconds = Math.floor(runningMs / 1000);
    const failureType = 'no_subprocess_output';
    ghosts.push({
      turn_id: turnId,
      role: turn.assigned_role || 'unknown',
      runtime_id: turn.runtime_id || 'unknown',
      running_ms: runningMs,
      threshold_ms: startupThreshold,
      failure_type: failureType,
      recommendation: `Turn ${turnId} has been dispatched for ${runningSeconds}s with no subprocess output. `
        + `The subprocess likely never started. `
        + `Run \`agentxchain reissue-turn --turn ${turnId} --reason ghost\` to recover.`,
    });
  }

  return ghosts;
}
168
+
169
/**
 * Run the full watchdog reconciliation and hand back only the stale-turn list.
 *
 * All detection, state transitions and event emission happen inside
 * reconcileStaleTurns; ghost-turn results and the updated state it produces
 * are not returned from here.
 *
 * @param {string} root - project root directory
 * @param {object} state - current governed state
 * @param {object} config - normalized config
 * @returns {Array<object>} the `stale_turns` portion of the reconciliation result
 */
export function detectAndEmitStaleTurns(root, state, config) {
  const { stale_turns: staleTurns } = reconcileStaleTurns(root, state, config);
  return staleTurns;
}
176
+
177
+ // ── Internal ────────────────────────────────────────────────────────────────
178
+
179
/**
 * Detect ghost and stale turns, mark them in state, and block the run.
 *
 * Steps, in order:
 *   1. Run ghost detection (BUG-51), then stale detection (BUG-47), dropping
 *      any stale hit that ghost detection already reported.
 *   2. For each detected turn still in "running"/"retrying": rewrite its
 *      record to `failed_start` (ghost) or `stalled` (stale), drop its budget
 *      reservation, and emit `turn_start_failed` / `turn_stalled`.
 *   3. If anything was rewritten: write the blocked state to
 *      `.agentxchain/state.json` and emit `run_blocked`.
 *
 * The `state` argument is not mutated; a new state object is returned.
 *
 * @param {string} root - project root directory
 * @param {object} state - current governed state
 * @param {object} config - normalized config
 * @returns {{ stale_turns: Array<object>, ghost_turns: Array<object>, state: object, changed: boolean }}
 */
export function reconcileStaleTurns(root, state, config) {
  // Result shape used by every exit that leaves state as it was.
  const unchanged = (staleTurns, ghostTurns) => ({
    stale_turns: staleTurns,
    ghost_turns: ghostTurns,
    state,
    changed: false,
  });

  if (!state || typeof state !== 'object') {
    return unchanged([], []);
  }

  // Ghost detection runs first; its hits are excluded from the stale list so
  // that no turn is reported twice.
  const ghosts = detectGhostTurns(root, state, config);
  const ghostIds = new Set(ghosts.map((ghost) => ghost.turn_id));
  const stale = detectStaleTurns(root, state, config)
    .filter((candidate) => !ghostIds.has(candidate.turn_id));

  if (!ghosts.length && !stale.length) {
    return unchanged([], []);
  }

  const nowIso = new Date().toISOString();
  const activeTurns = { ...(state.active_turns || {}) };
  const budgetReservations = { ...(state.budget_reservations || {}) };
  let changed = false;

  // Shared transition routine. `spec.status` doubles as the field prefix
  // (failed_start_* / stalled_*) written onto the turn record.
  const applyTransition = (entry, spec) => {
    const turn = activeTurns[entry.turn_id];
    const stillLive = Boolean(turn) && (turn.status === 'running' || turn.status === 'retrying');
    if (!stillLive) return;

    activeTurns[entry.turn_id] = {
      ...turn,
      status: spec.status,
      [`${spec.status}_at`]: nowIso,
      [`${spec.status}_reason`]: spec.reasonFor(entry),
      [`${spec.status}_previous_status`]: turn.status,
      [`${spec.status}_threshold_ms`]: entry.threshold_ms,
      [`${spec.status}_running_ms`]: entry.running_ms,
      recovery_command: `agentxchain reissue-turn --turn ${entry.turn_id} --reason ${spec.reissueReason}`,
    };
    changed = true;

    // BUG-51 fix #6: a turn that will not complete must release its budget.
    delete budgetReservations[entry.turn_id];

    emitRunEvent(root, spec.eventType, {
      run_id: state?.run_id || null,
      phase: state?.phase || null,
      status: 'blocked',
      turn: { turn_id: entry.turn_id, role_id: entry.role },
      payload: spec.payloadFor(entry),
    });
  };

  const ghostSpec = {
    status: 'failed_start',
    reissueReason: 'ghost',
    eventType: 'turn_start_failed',
    reasonFor: (entry) => entry.failure_type,
    payloadFor: (entry) => ({
      running_ms: entry.running_ms,
      threshold_ms: entry.threshold_ms,
      runtime_id: entry.runtime_id,
      failure_type: entry.failure_type,
      recommendation: entry.recommendation,
    }),
  };
  const staleSpec = {
    status: 'stalled',
    reissueReason: 'stale',
    eventType: 'turn_stalled',
    reasonFor: () => 'no_output_within_threshold',
    payloadFor: (entry) => ({
      running_ms: entry.running_ms,
      threshold_ms: entry.threshold_ms,
      runtime_id: entry.runtime_id,
      recommendation: entry.recommendation,
    }),
  };

  // Ghost turns are processed (and their events emitted) before stale turns.
  ghosts.forEach((entry) => applyTransition(entry, ghostSpec));
  stale.forEach((entry) => applyTransition(entry, staleSpec));

  if (!changed) {
    return unchanged(stale, ghosts);
  }

  // The first detected turn (ghosts first) drives the blocked reason.
  const allDetected = [...ghosts, ...stale];
  const primary = allDetected[0];
  const hasGhosts = ghosts.length > 0;
  const category = hasGhosts ? 'ghost_turn' : 'stale_turn';

  let blockedOn;
  if (allDetected.length === 1) {
    const kind = primary.failure_type ? 'failed_start' : 'stalled';
    blockedOn = `turn:${kind}:${primary.turn_id}`;
  } else if (hasGhosts) {
    blockedOn = 'turns:failed_start';
  } else {
    blockedOn = 'turns:stalled';
  }

  const nextState = {
    ...state,
    status: 'blocked',
    active_turns: activeTurns,
    budget_reservations: budgetReservations,
    blocked_on: blockedOn,
    blocked_reason: {
      category,
      blocked_at: nowIso,
      turn_id: primary.turn_id,
      recovery: {
        typed_reason: category,
        owner: 'human',
        recovery_action: primary.recommendation,
        turn_retained: true,
        detail: primary.recommendation,
      },
    },
  };

  // Persist first, then announce the block.
  const statePath = join(root, '.agentxchain', 'state.json');
  safeWriteJson(statePath, nextState);
  emitRunEvent(root, 'run_blocked', {
    run_id: nextState.run_id || null,
    phase: nextState.phase || null,
    status: 'blocked',
    turn: { turn_id: primary.turn_id, role_id: primary.role },
    payload: {
      category,
      ghost_turn_ids: ghosts.map((entry) => entry.turn_id),
      stalled_turn_ids: stale.map((entry) => entry.turn_id),
    },
  });

  return { stale_turns: stale, ghost_turns: ghosts, state: nextState, changed: true };
}
315
+
316
/**
 * Pick the stale-turn threshold (milliseconds) for a turn.
 *
 * A positive numeric `config.run_loop.stale_turn_threshold_ms` always wins.
 * Otherwise the turn's runtime is looked up in `config.runtimes`: type
 * "api_proxy" gets the api-proxy default, everything else (including an
 * unknown runtime) gets the local-CLI default.
 *
 * @param {object} turn - active turn record (reads `turn.runtime_id`)
 * @param {object} config - normalized config
 * @returns {number} threshold in milliseconds
 */
function resolveThreshold(turn, config) {
  const override = config?.run_loop?.stale_turn_threshold_ms;
  const hasUsableOverride = typeof override === 'number' && override > 0;
  if (hasUsableOverride) {
    return override;
  }

  const runtimeKey = turn.runtime_id || '';
  const runtimeType = config?.runtimes?.[runtimeKey]?.type || '';

  return runtimeType === 'api_proxy'
    ? DEFAULT_API_PROXY_THRESHOLD_MS
    : DEFAULT_LOCAL_CLI_THRESHOLD_MS;
}
334
+
335
/**
 * Pick the startup-watchdog threshold (milliseconds).
 *
 * Uses `config.run_loop.startup_watchdog_ms` when it is a positive number,
 * otherwise the built-in startup default.
 *
 * @param {object} config - normalized config
 * @returns {number} threshold in milliseconds
 */
function resolveStartupThreshold(config) {
  const override = config?.run_loop?.startup_watchdog_ms;
  const hasUsableOverride = typeof override === 'number' && override > 0;
  return hasUsableOverride ? override : DEFAULT_STARTUP_WATCHDOG_MS;
}
342
+
343
/**
 * Report whether the run event log shows recent activity for a turn.
 *
 * Looks at the events returned by `readRunEvents(root, { limit: 200 })`,
 * newest first. An event counts as activity when it belongs to `turnId`, is
 * not one of the watchdog's own `turn_stalled` / `turn_start_failed` events,
 * has a parseable timestamp that is not before `startedAt`, and is younger
 * than `threshold` relative to `now`.
 *
 * Any error while reading or scanning events yields `false`.
 *
 * @param {string} root - project root directory
 * @param {string} turnId - turn to look for
 * @param {number} startedAt - turn start time, epoch milliseconds
 * @param {number} threshold - activity window, milliseconds
 * @param {number} now - current time, epoch milliseconds
 * @returns {boolean}
 */
function hasRecentTurnEventActivity(root, turnId, startedAt, threshold, now) {
  const countsAsActivity = (event) => {
    if (event?.turn?.turn_id !== turnId) return false;

    // The watchdog's own verdict events must not keep a turn "alive".
    const isWatchdogEvent = event.event_type === 'turn_stalled'
      || event.event_type === 'turn_start_failed';
    if (isWatchdogEvent) return false;

    const timestamp = Date.parse(event.timestamp || '');
    if (!Number.isFinite(timestamp)) return false;
    if (timestamp < startedAt) return false;

    return (now - timestamp) < threshold;
  };

  try {
    const events = readRunEvents(root, { limit: 200 });
    let cursor = events.length;
    while (cursor-- > 0) {
      if (countsAsActivity(events[cursor])) {
        return true;
      }
    }
    return false;
  } catch {
    return false;
  }
}
362
+
363
/**
 * Report whether a staged result exists for the given turn.
 *
 * Two locations are checked, in order:
 *   1. the turn-scoped staging path from `getTurnStagingResultPath(turnId)`;
 *      its mere existence is enough
 *   2. the shared staging file at `LEGACY_STAGING_PATH`; it only counts when
 *      it parses as JSON and its `turn_id` equals `turnId`
 *
 * An unreadable or malformed shared file counts as "no staged result".
 *
 * @param {string} root - project root directory
 * @param {string} turnId - turn to look for
 * @returns {boolean}
 */
function hasTurnScopedStagedResult(root, turnId) {
  const scopedFile = join(root, getTurnStagingResultPath(turnId));
  if (existsSync(scopedFile)) {
    return true;
  }

  const sharedFile = join(root, LEGACY_STAGING_PATH);
  if (existsSync(sharedFile)) {
    try {
      const staged = JSON.parse(readFileSync(sharedFile, 'utf8'));
      return staged?.turn_id === turnId;
    } catch {
      return false;
    }
  }

  return false;
}
@@ -267,6 +267,10 @@ export function checkpointAcceptedTurn(root, opts = {}) {
267
267
  if (state) {
268
268
  writeState(root, {
269
269
  ...state,
270
+ // BUG-49: advance accepted_integration_ref to the new checkpoint SHA
271
+ // so drift detection compares against the current checkpoint, not a
272
+ // stale ref from the parent run or the pre-checkpoint state.
273
+ accepted_integration_ref: `git:${checkpointSha}`,
270
274
  last_completed_turn: {
271
275
  turn_id: entry.turn_id,
272
276
  role: entry.role || null,