agentxchain 2.145.0 → 2.147.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dashboard/app.js +3 -0
  2. package/dashboard/components/notifications.js +127 -0
  3. package/dashboard/index.html +1 -0
  4. package/package.json +1 -1
  5. package/scripts/publish-npm.sh +16 -0
  6. package/scripts/release-downstream-truth.sh +16 -8
  7. package/scripts/sync-homebrew.sh +14 -1
  8. package/scripts/verify-post-publish.sh +55 -4
  9. package/src/commands/init.js +66 -31
  10. package/src/commands/reissue-turn.js +16 -0
  11. package/src/commands/reject-turn.js +14 -1
  12. package/src/commands/restart.js +33 -3
  13. package/src/commands/resume.js +78 -66
  14. package/src/commands/run.js +67 -10
  15. package/src/commands/schedule.js +34 -7
  16. package/src/commands/status.js +38 -5
  17. package/src/commands/step.js +117 -34
  18. package/src/lib/adapters/api-proxy-adapter.js +8 -0
  19. package/src/lib/adapters/local-cli-adapter.js +131 -13
  20. package/src/lib/adapters/manual-adapter.js +9 -10
  21. package/src/lib/adapters/mcp-adapter.js +3 -5
  22. package/src/lib/adapters/remote-agent-adapter.js +3 -5
  23. package/src/lib/config.js +4 -1
  24. package/src/lib/continuous-run.js +71 -6
  25. package/src/lib/dashboard/actions.js +9 -3
  26. package/src/lib/dashboard/bridge-server.js +11 -0
  27. package/src/lib/dashboard/notifications-reader.js +91 -0
  28. package/src/lib/dashboard/state-reader.js +16 -4
  29. package/src/lib/dispatch-bundle.js +1 -1
  30. package/src/lib/dispatch-progress.js +5 -3
  31. package/src/lib/governed-state.js +355 -13
  32. package/src/lib/intake.js +10 -1
  33. package/src/lib/normalized-config.js +51 -1
  34. package/src/lib/recent-event-summary.js +12 -0
  35. package/src/lib/run-events.js +4 -0
  36. package/src/lib/run-loop.js +67 -2
  37. package/src/lib/runner-interface.js +1 -0
  38. package/src/lib/schema.js +7 -0
  39. package/src/lib/schemas/agentxchain-config.schema.json +15 -1
  40. package/src/lib/staged-result-proof.js +43 -0
  41. package/src/lib/stale-turn-watchdog.js +308 -34
  42. package/src/lib/turn-result-shape.js +38 -0
  43. package/src/lib/turn-result-validator.js +4 -1
@@ -31,6 +31,7 @@ import {
31
31
  getActiveTurnCount,
32
32
  getActiveTurns,
33
33
  getMaxConcurrentTurns,
34
+ transitionActiveTurnLifecycle,
34
35
  RUNNER_INTERFACE_VERSION,
35
36
  } from './runner-interface.js';
36
37
 
@@ -40,6 +41,18 @@ import { join, dirname } from 'path';
40
41
  import { evaluateApprovalSlaReminders } from './notification-runner.js';
41
42
  import { validatePreemptionMarker } from './intake.js';
42
43
  import { buildTimeoutBlockedReason, evaluateTimeouts } from './timeout-evaluator.js';
44
+ import { hasMinimumTurnResultShape } from './turn-result-shape.js';
45
+
46
+ // Per DEC-RUN-LOOP-MIN-SHAPE-SYMMETRY-001 (Turn 33): runLoop is the SDK boundary
47
+ // any third-party runner can wire (see website-v2/docs/build-your-own-runner.mdx).
48
+ // In-repo adapters (api_proxy, mcp, local_cli, remote_agent) already validate
49
+ // staged-result shape before write per DEC-MINIMUM-TURN-RESULT-SHAPE-001, and
50
+ // run.js's dispatch callback re-validates before returning per
51
+ // DEC-RUN-STAGED-READ-SHAPE-GUARD-001. Third-party callbacks have no such
52
+ // obligation. runLoop must therefore validate dispatchResult.turnResult shape
53
+ // before persisting it as a governed staged-result artifact.
54
+ const MIN_SHAPE_REJECTION_REASON =
55
+ 'staged result missing minimum governed envelope (schema_version + identity + lifecycle fields)';
43
56
 
44
57
  const DEFAULT_MAX_TURNS = 50;
45
58
 
@@ -182,7 +195,7 @@ async function executeSequentialTurn(root, config, state, callbacks, emit, error
182
195
  let assignState;
183
196
  const activeTurn = getActiveTurn(state);
184
197
 
185
- if (activeTurn && (activeTurn.status === 'running' || activeTurn.status === 'retrying')) {
198
+ if (activeTurn && isDispatchableActiveTurn(activeTurn)) {
186
199
  turn = activeTurn;
187
200
  assignState = state;
188
201
  } else {
@@ -224,7 +237,7 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
224
237
  const activeTurns = getActiveTurns(state);
225
238
  const turnsToDispatch = [];
226
239
  for (const turn of Object.values(activeTurns)) {
227
- if (turn.status === 'running' || turn.status === 'retrying') {
240
+ if (isDispatchableActiveTurn(turn)) {
228
241
  turnsToDispatch.push({ turn, state });
229
242
  }
230
243
  }
@@ -317,6 +330,7 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
317
330
  errors.push(`writeDispatchBundle(${turn.assigned_role}): ${bundleResult.error}`);
318
331
  continue;
319
332
  }
333
+ transitionActiveTurnLifecycle(root, turn.turn_id, 'dispatched');
320
334
  const stagingPath = getTurnStagingResultPath(turn.turn_id);
321
335
  contexts.push({
322
336
  turn,
@@ -362,6 +376,23 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
362
376
  continue;
363
377
  }
364
378
 
379
+ if (dispatchResult.accept && !hasMinimumTurnResultShape(dispatchResult.turnResult)) {
380
+ // DEC-RUN-LOOP-MIN-SHAPE-SYMMETRY-001: third-party dispatch callback claimed
381
+ // accept=true but returned a payload missing the minimum envelope. Refuse to
382
+ // stage; convert to standard rejection so the run state advances cleanly.
383
+ const validationResult = { stage: 'dispatch', errors: [MIN_SHAPE_REJECTION_REASON] };
384
+ rejectTurn(root, config, validationResult, MIN_SHAPE_REJECTION_REASON, { turnId: turn.turn_id });
385
+ history.push({ role: roleId, turn_id: turn.turn_id, accepted: false });
386
+ emit({ type: 'turn_rejected', turn, role: roleId, reason: MIN_SHAPE_REJECTION_REASON });
387
+ const postRejectState = loadState(root, config);
388
+ if (postRejectState?.status === 'blocked') {
389
+ errors.push(`Turn rejected for ${roleId}, retries exhausted`);
390
+ emit({ type: 'blocked', state: postRejectState });
391
+ return { terminal: true, ok: false, stop_reason: 'reject_exhausted', history, acceptedCount };
392
+ }
393
+ continue;
394
+ }
395
+
365
396
  if (dispatchResult.accept) {
366
397
  const absStaging = join(root, ctx.stagingPath);
367
398
  mkdirSync(dirname(absStaging), { recursive: true });
@@ -409,6 +440,12 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
409
440
  }
410
441
  emit({ type: 'turn_accepted', turn, role: roleId, state: acceptResult.state });
411
442
  } else {
443
+ if (dispatchResult?.blocked === true) {
444
+ history.push({ role: roleId, turn_id: turn.turn_id, accepted: false, blocked: true });
445
+ const blockedState = loadState(root, config);
446
+ emit({ type: 'blocked', state: blockedState });
447
+ return { terminal: true, ok: false, stop_reason: 'blocked', history, acceptedCount };
448
+ }
412
449
  const validationResult = {
413
450
  stage: 'dispatch',
414
451
  errors: [dispatchResult.reason || 'Dispatch callback rejected the turn'],
@@ -449,6 +486,10 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
449
486
  return { terminal: false, history, acceptedCount };
450
487
  }
451
488
 
489
+ function isDispatchableActiveTurn(turn) {
490
+ return ['assigned', 'dispatched', 'starting', 'running', 'retrying'].includes(turn?.status);
491
+ }
492
+
452
493
  /**
453
494
  * Dispatch a single turn and process its result.
454
495
  */
@@ -463,6 +504,7 @@ async function dispatchAndProcess(root, config, turn, assignState, callbacks, em
463
504
  errors.push(`writeDispatchBundle(${roleId}): ${bundleResult.error}`);
464
505
  return { terminal: true, ok: false, stop_reason: 'blocked', history };
465
506
  }
507
+ transitionActiveTurnLifecycle(root, turn.turn_id, 'dispatched');
466
508
 
467
509
  const stagingPath = getTurnStagingResultPath(turn.turn_id);
468
510
  const context = {
@@ -488,6 +530,22 @@ async function dispatchAndProcess(root, config, turn, assignState, callbacks, em
488
530
  return { terminal: true, ok: false, stop_reason: 'blocked', history };
489
531
  }
490
532
 
533
+ if (dispatchResult.accept && !hasMinimumTurnResultShape(dispatchResult.turnResult)) {
534
+ // DEC-RUN-LOOP-MIN-SHAPE-SYMMETRY-001: same boundary as parallel branch.
535
+ // Refuse to stage; convert to a standard rejection.
536
+ const validationResult = { stage: 'dispatch', errors: [MIN_SHAPE_REJECTION_REASON] };
537
+ rejectTurn(root, config, validationResult, MIN_SHAPE_REJECTION_REASON);
538
+ history.push({ role: roleId, turn_id: turn.turn_id, accepted: false });
539
+ emit({ type: 'turn_rejected', turn, role: roleId, reason: MIN_SHAPE_REJECTION_REASON });
540
+ const postRejectState = loadState(root, config);
541
+ if (postRejectState?.status === 'blocked') {
542
+ errors.push(`Turn rejected for ${roleId}, retries exhausted`);
543
+ emit({ type: 'blocked', state: postRejectState });
544
+ return { terminal: true, ok: false, stop_reason: 'reject_exhausted', history };
545
+ }
546
+ return { terminal: false, accepted: false, history };
547
+ }
548
+
491
549
  if (dispatchResult.accept) {
492
550
  const absStaging = join(root, stagingPath);
493
551
  mkdirSync(dirname(absStaging), { recursive: true });
@@ -537,6 +595,13 @@ async function dispatchAndProcess(root, config, turn, assignState, callbacks, em
537
595
  return { terminal: false, accepted: true, history };
538
596
  }
539
597
 
598
+ if (dispatchResult?.blocked === true) {
599
+ history.push({ role: roleId, turn_id: turn.turn_id, accepted: false, blocked: true });
600
+ const blockedState = loadState(root, config);
601
+ emit({ type: 'blocked', state: blockedState });
602
+ return { terminal: true, ok: false, stop_reason: 'blocked', history };
603
+ }
604
+
540
605
  // Rejection
541
606
  const validationResult = {
542
607
  stage: 'dispatch',
@@ -41,6 +41,7 @@ export {
41
41
  releaseAcceptanceLock as releaseLock,
42
42
  refreshTurnBaselineSnapshot,
43
43
  reissueTurn,
44
+ transitionActiveTurnLifecycle,
44
45
  } from './governed-state.js';
45
46
 
46
47
  // ── Dispatch ────────────────────────────────────────────────────────────────
package/src/lib/schema.js CHANGED
@@ -35,6 +35,13 @@ export function validateGovernedStateSchema(data) {
35
35
  // but validators and read-only surfaces still tolerate reserved/manual states.
36
36
  const VALID_RUN_STATUSES = ['idle', 'active', 'paused', 'blocked', 'completed', 'failed'];
37
37
  const isV1_1 = data?.schema_version === '1.1';
38
+ // NOTE: `current_turn` is the persisted v1.0 schema field. Under v1.1 it is
39
+ // not a persisted field at all — `loadProjectState()` re-attaches it as a
40
+ // non-enumerable getter alias over `active_turns` after normalization
41
+ // (DEC-CURRENT-TURN-COMPAT-ALIAS-001). This validator runs against the
42
+ // persisted shape, so an `own` property named `current_turn` on a v1.1 doc
43
+ // means "stray persisted-shape leak from a legacy write" and is rejected
44
+ // below — it does NOT mean the runtime alias is going away.
38
45
  const hasLegacyCurrentTurn = Object.prototype.hasOwnProperty.call(data || {}, 'current_turn');
39
46
 
40
47
  function validateTurn(turn, label) {
@@ -85,7 +85,21 @@
85
85
  "type": "object"
86
86
  },
87
87
  "run_loop": {
88
- "type": "object"
88
+ "type": "object",
89
+ "description": "Runner control knobs for execution watchdogs and automation behavior.",
90
+ "properties": {
91
+ "startup_watchdog_ms": {
92
+ "type": "integer",
93
+ "minimum": 1,
94
+ "description": "Milliseconds to wait after dispatch for worker attach/first-output proof before retaining the turn as failed_start. Default 30000."
95
+ },
96
+ "stale_turn_threshold_ms": {
97
+ "type": "integer",
98
+ "minimum": 1,
99
+ "description": "Milliseconds to wait before a started turn that previously produced output is treated as stale. Default 600000 for local_cli turns and 300000 for api_proxy turns."
100
+ }
101
+ },
102
+ "additionalProperties": true
89
103
  },
90
104
  "mission_planner": {
91
105
  "type": "object"
@@ -0,0 +1,43 @@
1
+ /**
2
+ * Staged turn-result proof helpers.
3
+ *
4
+ * Per DEC-BUG51-STAGING-PLACEHOLDER-NOT-PROOF-001: a turn-scoped staged-result
5
+ * file is proof of execution only when it contains meaningful result content.
6
+ * Adapter-authored placeholders (`{}`, blank, whitespace-only) are cleanup
7
+ * artifacts — watchdog, adapter, and recovery code must treat them as absent.
8
+ *
9
+ * This module centralizes that check so every surface (local-cli adapter,
10
+ * manual adapter, stale-turn watchdog) uses the same rule.
11
+ */
12
+
13
+ import { existsSync, readFileSync } from 'node:fs';
14
+
15
+ /**
16
+ * Returns true when the staged-result file at `filePath` exists AND contains
17
+ * content that is not a placeholder (empty, whitespace-only, or `{}`).
18
+ *
19
+ * Trim-aware: `{}\n`, ` {}\n`, and `{}` are all rejected. Legitimate turn
20
+ * results carry the full governed schema and are far larger than the
21
+ * placeholder shapes this function filters.
22
+ *
23
+ * @param {string} filePath - absolute path to the staged-result file
24
+ * @returns {boolean}
25
+ */
26
+ export function hasMeaningfulStagedResult(filePath) {
27
+ if (!existsSync(filePath)) {
28
+ return false;
29
+ }
30
+
31
+ let raw;
32
+ try {
33
+ raw = readFileSync(filePath, 'utf8');
34
+ } catch {
35
+ return false;
36
+ }
37
+
38
+ const trimmed = raw.trim();
39
+ if (trimmed === '' || trimmed === '{}') {
40
+ return false;
41
+ }
42
+ return true;
43
+ }
@@ -1,16 +1,32 @@
1
1
  /**
2
- * Stale Turn Watchdog — BUG-47
2
+ * Stale Turn Watchdog — BUG-47 + BUG-51
3
3
  *
4
- * Lazy idle-threshold detection: if an active turn has status "running"
5
- * for >N seconds with no event log activity AND no staged result file,
6
- * report it as stalled.
4
+ * Two-tier lazy idle-threshold detection:
5
+ *
6
+ * 1. **Fast startup watchdog (BUG-51):** if an active turn has been
7
+ * `dispatched`/`starting`/`running`/`retrying` for >30 seconds (default) with NO startup proof
8
+ * (no first-byte output recorded on the turn or in dispatch-progress) and
9
+ * NO staged result, it is a "ghost turn" — the subprocess never reached a
10
+ * healthy running state. Transitions to `failed_start` immediately.
11
+ *
12
+ * Design note: the watchdog intentionally keys on first-output proof from
13
+ * the framework-owned dispatch-progress contract rather than `stdout.log`
14
+ * existence. `stdout.log` is adapter-authored visibility output and may be
15
+ * absent even when the adapter is wired correctly. First-output timestamps
16
+ * and output-line counters are the stable health contract across runtime
17
+ * wiring.
18
+ *
19
+ * 2. **Stale turn watchdog (BUG-47):** if an active turn has status "running"
20
+ * for >N minutes with no event log activity AND no staged result file,
21
+ * report it as stalled.
7
22
  *
8
23
  * Fires on CLI invocations (status, resume, step --resume) rather than
9
24
  * requiring a background daemon.
10
25
  *
11
26
  * Default thresholds:
12
- * - local_cli turns: 10 minutes
13
- * - api_proxy turns: 5 minutes
27
+ * - Startup watchdog: 30 seconds (configurable via run_loop.startup_watchdog_ms)
28
+ * - local_cli stale turns: 10 minutes
29
+ * - api_proxy stale turns: 5 minutes
14
30
  * - Configurable via run_loop.stale_turn_threshold_ms in agentxchain.json
15
31
  */
16
32
 
@@ -20,9 +36,11 @@ import { safeWriteJson } from './safe-write.js';
20
36
  import { emitRunEvent, readRunEvents } from './run-events.js';
21
37
  import { getTurnStagingResultPath } from './turn-paths.js';
22
38
  import { getDispatchProgressRelativePath } from './dispatch-progress.js';
39
+ import { hasMeaningfulStagedResult } from './staged-result-proof.js';
23
40
 
24
41
  const DEFAULT_LOCAL_CLI_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes
25
42
  const DEFAULT_API_PROXY_THRESHOLD_MS = 5 * 60 * 1000; // 5 minutes
43
+ const DEFAULT_STARTUP_WATCHDOG_MS = 30 * 1000; // 30 seconds (BUG-51)
26
44
  const LEGACY_STAGING_PATH = '.agentxchain/staging/turn-result.json';
27
45
 
28
46
  /**
@@ -83,6 +101,62 @@ export function detectStaleTurns(root, state, config) {
83
101
  return stale;
84
102
  }
85
103
 
104
+ /**
105
+ * BUG-51: Detect ghost-dispatched turns — subprocess never started.
106
+ *
107
+ * A ghost turn is one that has been in `dispatched`, `starting`, `running`, or
108
+ * `retrying` longer than the startup watchdog threshold (default 30s) AND has:
109
+ * - no startup proof (no `first_output_at` on the turn or dispatch-progress,
110
+ * and no recorded output line counts)
111
+ * - no meaningful turn-scoped staged result (placeholder files such as `{}` or blank count as absent)
112
+ *
113
+ * This is a stricter, faster check than detectStaleTurns (BUG-47).
114
+ * Ghost turns transition to "failed_start" rather than "stalled".
115
+ *
116
+ * @param {string} root - project root directory
117
+ * @param {object} state - current governed state
118
+ * @param {object} config - normalized config
119
+ * @returns {Array<{ turn_id: string, role: string, runtime_id: string, running_ms: number, threshold_ms: number, recommendation: string, failure_type: string }>}
120
+ */
121
+ export function detectGhostTurns(root, state, config) {
122
+ const activeTurns = state?.active_turns || {};
123
+ const ghosts = [];
124
+ const now = Date.now();
125
+ const startupThreshold = resolveStartupThreshold(config);
126
+
127
+ for (const [turnId, turn] of Object.entries(activeTurns)) {
128
+ if (!['dispatched', 'starting', 'running', 'retrying'].includes(turn.status)) continue;
129
+
130
+ const lifecycleStart = parseGhostLifecycleStart(turn);
131
+ if (!Number.isFinite(lifecycleStart)) continue;
132
+
133
+ const runningMs = now - lifecycleStart;
134
+ if (runningMs < startupThreshold) continue;
135
+
136
+ const progressPath = join(root, getDispatchProgressRelativePath(turnId));
137
+ const progress = readDispatchProgressSafe(progressPath);
138
+
139
+ if (hasTurnScopedStagedResult(root, turnId)) continue;
140
+ if (hasStartupProof(turn, progress)) continue;
141
+
142
+ const runningSeconds = Math.floor(runningMs / 1000);
143
+ const failureType = classifyStartupFailureType(turn, progress);
144
+ ghosts.push({
145
+ turn_id: turnId,
146
+ role: turn.assigned_role || 'unknown',
147
+ runtime_id: turn.runtime_id || 'unknown',
148
+ running_ms: runningMs,
149
+ threshold_ms: startupThreshold,
150
+ failure_type: failureType,
151
+ recommendation: `Turn ${turnId} has been dispatched for ${runningSeconds}s with no subprocess output. `
152
+ + `The subprocess likely never started. `
153
+ + `Run \`agentxchain reissue-turn --turn ${turnId} --reason ghost\` to recover.`,
154
+ });
155
+ }
156
+
157
+ return ghosts;
158
+ }
159
+
86
160
  /**
87
161
  * Detect stale turns and emit turn_stalled events for each.
88
162
  * Returns the stale turn list for caller display.
@@ -95,18 +169,36 @@ export function detectAndEmitStaleTurns(root, state, config) {
95
169
 
96
170
  export function reconcileStaleTurns(root, state, config) {
97
171
  if (!state || typeof state !== 'object') {
98
- return { stale_turns: [], state, changed: false };
172
+ return { stale_turns: [], ghost_turns: [], state, changed: false };
99
173
  }
100
174
 
101
- const stale = detectStaleTurns(root, state, config);
102
- if (stale.length === 0) {
103
- return { stale_turns: [], state, changed: false };
175
+ // BUG-51: Fast startup watchdog — detect ghost turns first (default 30s threshold, configurable via run_loop.startup_watchdog_ms)
176
+ const ghosts = detectGhostTurns(root, state, config);
177
+
178
+ // BUG-47: Stale turn watchdog — detect turns that started but went silent (default threshold: 10m local_cli, 5m api_proxy)
179
+ // Exclude turns already caught by ghost detection to avoid double-counting
180
+ const ghostIds = new Set(ghosts.map(g => g.turn_id));
181
+ const stale = detectStaleTurns(root, state, config).filter(s => !ghostIds.has(s.turn_id));
182
+
183
+ if (ghosts.length === 0 && stale.length === 0) {
184
+ return { stale_turns: [], ghost_turns: [], state, changed: false };
104
185
  }
105
186
 
106
187
  const nowIso = new Date().toISOString();
107
188
  const activeTurns = { ...(state.active_turns || {}) };
189
+ const budgetReservations = { ...(state.budget_reservations || {}) };
108
190
  let changed = false;
109
191
 
192
+ // Process ghost turns (BUG-51) — transition to failed_start
193
+ for (const entry of ghosts) {
194
+ const applied = applyStartupFailureToActiveTurn(activeTurns, budgetReservations, entry, nowIso);
195
+ if (applied) {
196
+ emitStartupFailureEvent(root, state, entry);
197
+ changed = true;
198
+ }
199
+ }
200
+
201
+ // Process stale turns (BUG-47) — transition to stalled
110
202
  for (const entry of stale) {
111
203
  const turn = activeTurns[entry.turn_id];
112
204
  if (!turn || (turn.status !== 'running' && turn.status !== 'retrying')) continue;
@@ -123,6 +215,9 @@ export function reconcileStaleTurns(root, state, config) {
123
215
  };
124
216
  changed = true;
125
217
 
218
+ // BUG-51 fix #6: Release budget reservation for stale turns too
219
+ delete budgetReservations[entry.turn_id];
220
+
126
221
  emitRunEvent(root, 'turn_stalled', {
127
222
  run_id: state?.run_id || null,
128
223
  phase: state?.phase || null,
@@ -138,28 +233,12 @@ export function reconcileStaleTurns(root, state, config) {
138
233
  }
139
234
 
140
235
  if (!changed) {
141
- return { stale_turns: stale, state, changed: false };
236
+ return { stale_turns: stale, ghost_turns: ghosts, state, changed: false };
142
237
  }
143
238
 
144
- const primary = stale[0];
145
- const nextState = {
146
- ...state,
147
- status: 'blocked',
148
- active_turns: activeTurns,
149
- blocked_on: stale.length === 1 ? `turn:stalled:${primary.turn_id}` : 'turns:stalled',
150
- blocked_reason: {
151
- category: 'stale_turn',
152
- blocked_at: nowIso,
153
- turn_id: primary.turn_id,
154
- recovery: {
155
- typed_reason: 'stale_turn',
156
- owner: 'human',
157
- recovery_action: primary.recommendation,
158
- turn_retained: true,
159
- detail: primary.recommendation,
160
- },
161
- },
162
- };
239
+ const nextState = buildBlockedStateFromEntries(state, activeTurns, budgetReservations, ghosts, stale, nowIso);
240
+ const primary = [...ghosts, ...stale][0];
241
+ const category = ghosts.length > 0 ? 'ghost_turn' : 'stale_turn';
163
242
 
164
243
  safeWriteJson(join(root, '.agentxchain', 'state.json'), nextState);
165
244
  emitRunEvent(root, 'run_blocked', {
@@ -168,11 +247,12 @@ export function reconcileStaleTurns(root, state, config) {
168
247
  status: 'blocked',
169
248
  turn: { turn_id: primary.turn_id, role_id: primary.role },
170
249
  payload: {
171
- category: 'stale_turn',
250
+ category,
251
+ ghost_turn_ids: ghosts.map((entry) => entry.turn_id),
172
252
  stalled_turn_ids: stale.map((entry) => entry.turn_id),
173
253
  },
174
254
  });
175
- return { stale_turns: stale, state: nextState, changed: true };
255
+ return { stale_turns: stale, ghost_turns: ghosts, state: nextState, changed: true };
176
256
  }
177
257
 
178
258
  function resolveThreshold(turn, config) {
@@ -194,13 +274,71 @@ function resolveThreshold(turn, config) {
194
274
  return DEFAULT_LOCAL_CLI_THRESHOLD_MS;
195
275
  }
196
276
 
277
+ function resolveStartupThreshold(config) {
278
+ const configThreshold = config?.run_loop?.startup_watchdog_ms;
279
+ if (typeof configThreshold === 'number' && configThreshold > 0) {
280
+ return configThreshold;
281
+ }
282
+ return DEFAULT_STARTUP_WATCHDOG_MS;
283
+ }
284
+
285
+ export function failTurnStartup(root, state, config, turnId, details = {}) {
286
+ if (!state || typeof state !== 'object') {
287
+ return { ok: false, error: 'No governed state found' };
288
+ }
289
+
290
+ const turn = state.active_turns?.[turnId];
291
+ if (!turn) {
292
+ return { ok: false, error: `Turn ${turnId} not found in active turns` };
293
+ }
294
+
295
+ const nowIso = new Date().toISOString();
296
+ const activeTurns = { ...(state.active_turns || {}) };
297
+ const budgetReservations = { ...(state.budget_reservations || {}) };
298
+ const entry = {
299
+ turn_id: turnId,
300
+ role: turn.assigned_role || 'unknown',
301
+ runtime_id: turn.runtime_id || 'unknown',
302
+ running_ms: details.running_ms ?? computeLifecycleAgeMs(turn),
303
+ threshold_ms: details.threshold_ms ?? resolveStartupThreshold(config),
304
+ failure_type: classifyStartupFailureType(turn, null, details.failure_type || 'no_subprocess_output'),
305
+ recommendation: details.recommendation
306
+ || `Turn ${turnId} failed to start cleanly. Run \`agentxchain reissue-turn --turn ${turnId} --reason ghost\` to recover.`,
307
+ };
308
+
309
+ if (!applyStartupFailureToActiveTurn(activeTurns, budgetReservations, entry, nowIso)) {
310
+ return { ok: false, error: `Turn ${turnId} is not eligible for startup failure transition` };
311
+ }
312
+
313
+ const nextState = buildBlockedStateFromEntries(state, activeTurns, budgetReservations, [entry], [], nowIso);
314
+ safeWriteJson(join(root, '.agentxchain', 'state.json'), nextState);
315
+ emitStartupFailureEvent(root, state, entry);
316
+ emitRunEvent(root, 'run_blocked', {
317
+ run_id: nextState.run_id || null,
318
+ phase: nextState.phase || null,
319
+ status: 'blocked',
320
+ turn: { turn_id: entry.turn_id, role_id: entry.role },
321
+ payload: {
322
+ category: 'ghost_turn',
323
+ ghost_turn_ids: [entry.turn_id],
324
+ stalled_turn_ids: [],
325
+ },
326
+ });
327
+ return { ok: true, state: nextState, turn: nextState.active_turns?.[turnId] || null };
328
+ }
329
+
197
330
  function hasRecentTurnEventActivity(root, turnId, startedAt, threshold, now) {
198
331
  try {
199
332
  const events = readRunEvents(root, { limit: 200 });
200
333
  for (let i = events.length - 1; i >= 0; i--) {
201
334
  const event = events[i];
202
335
  if (event?.turn?.turn_id !== turnId) continue;
203
- if (event.event_type === 'turn_stalled') continue;
336
+ if (
337
+ event.event_type === 'turn_stalled'
338
+ || event.event_type === 'turn_start_failed'
339
+ || event.event_type === 'runtime_spawn_failed'
340
+ || event.event_type === 'stdout_attach_failed'
341
+ ) continue;
204
342
  const timestamp = Date.parse(event.timestamp || '');
205
343
  if (!Number.isFinite(timestamp)) continue;
206
344
  if (timestamp < startedAt) continue;
@@ -214,9 +352,145 @@ function hasRecentTurnEventActivity(root, turnId, startedAt, threshold, now) {
214
352
  return false;
215
353
  }
216
354
 
355
+ function applyStartupFailureToActiveTurn(activeTurns, budgetReservations, entry, nowIso) {
356
+ const turn = activeTurns[entry.turn_id];
357
+ if (!turn || !['dispatched', 'starting', 'running', 'retrying'].includes(turn.status)) {
358
+ return false;
359
+ }
360
+
361
+ activeTurns[entry.turn_id] = {
362
+ ...turn,
363
+ status: 'failed_start',
364
+ failed_start_at: nowIso,
365
+ failed_start_reason: entry.failure_type,
366
+ failed_start_previous_status: turn.status,
367
+ failed_start_threshold_ms: entry.threshold_ms,
368
+ failed_start_running_ms: entry.running_ms,
369
+ recovery_command: `agentxchain reissue-turn --turn ${entry.turn_id} --reason ghost`,
370
+ };
371
+ delete budgetReservations[entry.turn_id];
372
+ return true;
373
+ }
374
+
375
+ function emitStartupFailureEvent(root, state, entry) {
376
+ const payload = {
377
+ running_ms: entry.running_ms,
378
+ threshold_ms: entry.threshold_ms,
379
+ runtime_id: entry.runtime_id,
380
+ failure_type: entry.failure_type,
381
+ recommendation: entry.recommendation,
382
+ };
383
+ const details = {
384
+ run_id: state?.run_id || null,
385
+ phase: state?.phase || null,
386
+ status: 'blocked',
387
+ turn: { turn_id: entry.turn_id, role_id: entry.role },
388
+ payload,
389
+ };
390
+ emitRunEvent(root, 'turn_start_failed', details);
391
+ const failureEventType = mapStartupFailureEventType(entry.failure_type);
392
+ if (failureEventType) {
393
+ emitRunEvent(root, failureEventType, details);
394
+ }
395
+ }
396
+
397
+ function buildBlockedStateFromEntries(state, activeTurns, budgetReservations, ghosts, stale, nowIso) {
398
+ const allDetected = [...ghosts, ...stale];
399
+ const primary = allDetected[0];
400
+ const category = ghosts.length > 0 ? 'ghost_turn' : 'stale_turn';
401
+ const blockedOn = allDetected.length === 1
402
+ ? `turn:${primary.failure_type ? 'failed_start' : 'stalled'}:${primary.turn_id}`
403
+ : ghosts.length > 0 ? 'turns:failed_start' : 'turns:stalled';
404
+
405
+ return {
406
+ ...state,
407
+ status: 'blocked',
408
+ active_turns: activeTurns,
409
+ budget_reservations: budgetReservations,
410
+ blocked_on: blockedOn,
411
+ blocked_reason: {
412
+ category,
413
+ blocked_at: nowIso,
414
+ turn_id: primary.turn_id,
415
+ recovery: {
416
+ typed_reason: category,
417
+ owner: 'human',
418
+ recovery_action: primary.recommendation,
419
+ turn_retained: true,
420
+ detail: primary.recommendation,
421
+ },
422
+ },
423
+ };
424
+ }
425
+
426
+ function parseGhostLifecycleStart(turn) {
427
+ if (turn.status === 'dispatched') {
428
+ return Date.parse(turn.dispatched_at || turn.assigned_at || '');
429
+ }
430
+ return Date.parse(turn.started_at || turn.dispatched_at || turn.assigned_at || '');
431
+ }
432
+
433
+ function computeLifecycleAgeMs(turn) {
434
+ const start = parseGhostLifecycleStart(turn);
435
+ if (!Number.isFinite(start)) return 0;
436
+ return Math.max(0, Date.now() - start);
437
+ }
438
+
439
+ function readDispatchProgressSafe(progressPath) {
440
+ if (!existsSync(progressPath)) {
441
+ return null;
442
+ }
443
+ try {
444
+ return JSON.parse(readFileSync(progressPath, 'utf8'));
445
+ } catch {
446
+ return null;
447
+ }
448
+ }
449
+
450
+ function classifyStartupFailureType(turn, progress, fallback = 'no_subprocess_output') {
451
+ if (fallback === 'runtime_spawn_failed' || fallback === 'stdout_attach_failed') {
452
+ return fallback;
453
+ }
454
+ if (turn?.status === 'dispatched') {
455
+ return 'runtime_spawn_failed';
456
+ }
457
+ const hasWorkerAttachProof = Boolean(
458
+ turn?.worker_attached_at
459
+ || turn?.worker_pid != null
460
+ || progress?.pid != null,
461
+ );
462
+ if (turn?.status === 'starting' || hasWorkerAttachProof) {
463
+ return 'stdout_attach_failed';
464
+ }
465
+ return fallback;
466
+ }
467
+
468
+ function mapStartupFailureEventType(failureType) {
469
+ if (failureType === 'runtime_spawn_failed') {
470
+ return 'runtime_spawn_failed';
471
+ }
472
+ if (failureType === 'stdout_attach_failed') {
473
+ return 'stdout_attach_failed';
474
+ }
475
+ return null;
476
+ }
477
+
478
+ function hasStartupProof(turn, progress) {
479
+ if (turn.first_output_at) {
480
+ return true;
481
+ }
482
+ if (!progress || typeof progress !== 'object') {
483
+ return false;
484
+ }
485
+ if (progress.first_output_at) {
486
+ return true;
487
+ }
488
+ return Number(progress.output_lines || 0) > 0 || Number(progress.stderr_lines || 0) > 0;
489
+ }
490
+
217
491
  function hasTurnScopedStagedResult(root, turnId) {
218
492
  const turnScopedPath = join(root, getTurnStagingResultPath(turnId));
219
- if (existsSync(turnScopedPath)) {
493
+ if (hasMeaningfulStagedResult(turnScopedPath)) {
220
494
  return true;
221
495
  }
222
496