agentxchain 2.145.0 → 2.147.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard/app.js +3 -0
- package/dashboard/components/notifications.js +127 -0
- package/dashboard/index.html +1 -0
- package/package.json +1 -1
- package/scripts/publish-npm.sh +16 -0
- package/scripts/release-downstream-truth.sh +16 -8
- package/scripts/sync-homebrew.sh +14 -1
- package/scripts/verify-post-publish.sh +55 -4
- package/src/commands/init.js +66 -31
- package/src/commands/reissue-turn.js +16 -0
- package/src/commands/reject-turn.js +14 -1
- package/src/commands/restart.js +33 -3
- package/src/commands/resume.js +78 -66
- package/src/commands/run.js +67 -10
- package/src/commands/schedule.js +34 -7
- package/src/commands/status.js +38 -5
- package/src/commands/step.js +117 -34
- package/src/lib/adapters/api-proxy-adapter.js +8 -0
- package/src/lib/adapters/local-cli-adapter.js +131 -13
- package/src/lib/adapters/manual-adapter.js +9 -10
- package/src/lib/adapters/mcp-adapter.js +3 -5
- package/src/lib/adapters/remote-agent-adapter.js +3 -5
- package/src/lib/config.js +4 -1
- package/src/lib/continuous-run.js +71 -6
- package/src/lib/dashboard/actions.js +9 -3
- package/src/lib/dashboard/bridge-server.js +11 -0
- package/src/lib/dashboard/notifications-reader.js +91 -0
- package/src/lib/dashboard/state-reader.js +16 -4
- package/src/lib/dispatch-bundle.js +1 -1
- package/src/lib/dispatch-progress.js +5 -3
- package/src/lib/governed-state.js +355 -13
- package/src/lib/intake.js +10 -1
- package/src/lib/normalized-config.js +51 -1
- package/src/lib/recent-event-summary.js +12 -0
- package/src/lib/run-events.js +4 -0
- package/src/lib/run-loop.js +67 -2
- package/src/lib/runner-interface.js +1 -0
- package/src/lib/schema.js +7 -0
- package/src/lib/schemas/agentxchain-config.schema.json +15 -1
- package/src/lib/staged-result-proof.js +43 -0
- package/src/lib/stale-turn-watchdog.js +308 -34
- package/src/lib/turn-result-shape.js +38 -0
- package/src/lib/turn-result-validator.js +4 -1
package/src/lib/run-loop.js
CHANGED
|
@@ -31,6 +31,7 @@ import {
|
|
|
31
31
|
getActiveTurnCount,
|
|
32
32
|
getActiveTurns,
|
|
33
33
|
getMaxConcurrentTurns,
|
|
34
|
+
transitionActiveTurnLifecycle,
|
|
34
35
|
RUNNER_INTERFACE_VERSION,
|
|
35
36
|
} from './runner-interface.js';
|
|
36
37
|
|
|
@@ -40,6 +41,18 @@ import { join, dirname } from 'path';
|
|
|
40
41
|
import { evaluateApprovalSlaReminders } from './notification-runner.js';
|
|
41
42
|
import { validatePreemptionMarker } from './intake.js';
|
|
42
43
|
import { buildTimeoutBlockedReason, evaluateTimeouts } from './timeout-evaluator.js';
|
|
44
|
+
import { hasMinimumTurnResultShape } from './turn-result-shape.js';
|
|
45
|
+
|
|
46
|
+
// Per DEC-RUN-LOOP-MIN-SHAPE-SYMMETRY-001 (Turn 33): runLoop is the SDK boundary
|
|
47
|
+
// any third-party runner can wire (see website-v2/docs/build-your-own-runner.mdx).
|
|
48
|
+
// In-repo adapters (api_proxy, mcp, local_cli, remote_agent) already validate
|
|
49
|
+
// staged-result shape before write per DEC-MINIMUM-TURN-RESULT-SHAPE-001, and
|
|
50
|
+
// run.js's dispatch callback re-validates before returning per
|
|
51
|
+
// DEC-RUN-STAGED-READ-SHAPE-GUARD-001. Third-party callbacks have no such
|
|
52
|
+
// obligation. runLoop must therefore validate dispatchResult.turnResult shape
|
|
53
|
+
// before persisting it as a governed staged-result artifact.
|
|
54
|
+
const MIN_SHAPE_REJECTION_REASON =
|
|
55
|
+
'staged result missing minimum governed envelope (schema_version + identity + lifecycle fields)';
|
|
43
56
|
|
|
44
57
|
const DEFAULT_MAX_TURNS = 50;
|
|
45
58
|
|
|
@@ -182,7 +195,7 @@ async function executeSequentialTurn(root, config, state, callbacks, emit, error
|
|
|
182
195
|
let assignState;
|
|
183
196
|
const activeTurn = getActiveTurn(state);
|
|
184
197
|
|
|
185
|
-
if (activeTurn && (activeTurn
|
|
198
|
+
if (activeTurn && isDispatchableActiveTurn(activeTurn)) {
|
|
186
199
|
turn = activeTurn;
|
|
187
200
|
assignState = state;
|
|
188
201
|
} else {
|
|
@@ -224,7 +237,7 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
|
|
|
224
237
|
const activeTurns = getActiveTurns(state);
|
|
225
238
|
const turnsToDispatch = [];
|
|
226
239
|
for (const turn of Object.values(activeTurns)) {
|
|
227
|
-
if (turn
|
|
240
|
+
if (isDispatchableActiveTurn(turn)) {
|
|
228
241
|
turnsToDispatch.push({ turn, state });
|
|
229
242
|
}
|
|
230
243
|
}
|
|
@@ -317,6 +330,7 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
|
|
|
317
330
|
errors.push(`writeDispatchBundle(${turn.assigned_role}): ${bundleResult.error}`);
|
|
318
331
|
continue;
|
|
319
332
|
}
|
|
333
|
+
transitionActiveTurnLifecycle(root, turn.turn_id, 'dispatched');
|
|
320
334
|
const stagingPath = getTurnStagingResultPath(turn.turn_id);
|
|
321
335
|
contexts.push({
|
|
322
336
|
turn,
|
|
@@ -362,6 +376,23 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
|
|
|
362
376
|
continue;
|
|
363
377
|
}
|
|
364
378
|
|
|
379
|
+
if (dispatchResult.accept && !hasMinimumTurnResultShape(dispatchResult.turnResult)) {
|
|
380
|
+
// DEC-RUN-LOOP-MIN-SHAPE-SYMMETRY-001: third-party dispatch callback claimed
|
|
381
|
+
// accept=true but returned a payload missing the minimum envelope. Refuse to
|
|
382
|
+
// stage; convert to standard rejection so the run state advances cleanly.
|
|
383
|
+
const validationResult = { stage: 'dispatch', errors: [MIN_SHAPE_REJECTION_REASON] };
|
|
384
|
+
rejectTurn(root, config, validationResult, MIN_SHAPE_REJECTION_REASON, { turnId: turn.turn_id });
|
|
385
|
+
history.push({ role: roleId, turn_id: turn.turn_id, accepted: false });
|
|
386
|
+
emit({ type: 'turn_rejected', turn, role: roleId, reason: MIN_SHAPE_REJECTION_REASON });
|
|
387
|
+
const postRejectState = loadState(root, config);
|
|
388
|
+
if (postRejectState?.status === 'blocked') {
|
|
389
|
+
errors.push(`Turn rejected for ${roleId}, retries exhausted`);
|
|
390
|
+
emit({ type: 'blocked', state: postRejectState });
|
|
391
|
+
return { terminal: true, ok: false, stop_reason: 'reject_exhausted', history, acceptedCount };
|
|
392
|
+
}
|
|
393
|
+
continue;
|
|
394
|
+
}
|
|
395
|
+
|
|
365
396
|
if (dispatchResult.accept) {
|
|
366
397
|
const absStaging = join(root, ctx.stagingPath);
|
|
367
398
|
mkdirSync(dirname(absStaging), { recursive: true });
|
|
@@ -409,6 +440,12 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
|
|
|
409
440
|
}
|
|
410
441
|
emit({ type: 'turn_accepted', turn, role: roleId, state: acceptResult.state });
|
|
411
442
|
} else {
|
|
443
|
+
if (dispatchResult?.blocked === true) {
|
|
444
|
+
history.push({ role: roleId, turn_id: turn.turn_id, accepted: false, blocked: true });
|
|
445
|
+
const blockedState = loadState(root, config);
|
|
446
|
+
emit({ type: 'blocked', state: blockedState });
|
|
447
|
+
return { terminal: true, ok: false, stop_reason: 'blocked', history, acceptedCount };
|
|
448
|
+
}
|
|
412
449
|
const validationResult = {
|
|
413
450
|
stage: 'dispatch',
|
|
414
451
|
errors: [dispatchResult.reason || 'Dispatch callback rejected the turn'],
|
|
@@ -449,6 +486,10 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
|
|
|
449
486
|
return { terminal: false, history, acceptedCount };
|
|
450
487
|
}
|
|
451
488
|
|
|
489
|
+
function isDispatchableActiveTurn(turn) {
|
|
490
|
+
return ['assigned', 'dispatched', 'starting', 'running', 'retrying'].includes(turn?.status);
|
|
491
|
+
}
|
|
492
|
+
|
|
452
493
|
/**
|
|
453
494
|
* Dispatch a single turn and process its result.
|
|
454
495
|
*/
|
|
@@ -463,6 +504,7 @@ async function dispatchAndProcess(root, config, turn, assignState, callbacks, em
|
|
|
463
504
|
errors.push(`writeDispatchBundle(${roleId}): ${bundleResult.error}`);
|
|
464
505
|
return { terminal: true, ok: false, stop_reason: 'blocked', history };
|
|
465
506
|
}
|
|
507
|
+
transitionActiveTurnLifecycle(root, turn.turn_id, 'dispatched');
|
|
466
508
|
|
|
467
509
|
const stagingPath = getTurnStagingResultPath(turn.turn_id);
|
|
468
510
|
const context = {
|
|
@@ -488,6 +530,22 @@ async function dispatchAndProcess(root, config, turn, assignState, callbacks, em
|
|
|
488
530
|
return { terminal: true, ok: false, stop_reason: 'blocked', history };
|
|
489
531
|
}
|
|
490
532
|
|
|
533
|
+
if (dispatchResult.accept && !hasMinimumTurnResultShape(dispatchResult.turnResult)) {
|
|
534
|
+
// DEC-RUN-LOOP-MIN-SHAPE-SYMMETRY-001: same boundary as parallel branch.
|
|
535
|
+
// Refuse to stage; convert to a standard rejection.
|
|
536
|
+
const validationResult = { stage: 'dispatch', errors: [MIN_SHAPE_REJECTION_REASON] };
|
|
537
|
+
rejectTurn(root, config, validationResult, MIN_SHAPE_REJECTION_REASON);
|
|
538
|
+
history.push({ role: roleId, turn_id: turn.turn_id, accepted: false });
|
|
539
|
+
emit({ type: 'turn_rejected', turn, role: roleId, reason: MIN_SHAPE_REJECTION_REASON });
|
|
540
|
+
const postRejectState = loadState(root, config);
|
|
541
|
+
if (postRejectState?.status === 'blocked') {
|
|
542
|
+
errors.push(`Turn rejected for ${roleId}, retries exhausted`);
|
|
543
|
+
emit({ type: 'blocked', state: postRejectState });
|
|
544
|
+
return { terminal: true, ok: false, stop_reason: 'reject_exhausted', history };
|
|
545
|
+
}
|
|
546
|
+
return { terminal: false, accepted: false, history };
|
|
547
|
+
}
|
|
548
|
+
|
|
491
549
|
if (dispatchResult.accept) {
|
|
492
550
|
const absStaging = join(root, stagingPath);
|
|
493
551
|
mkdirSync(dirname(absStaging), { recursive: true });
|
|
@@ -537,6 +595,13 @@ async function dispatchAndProcess(root, config, turn, assignState, callbacks, em
|
|
|
537
595
|
return { terminal: false, accepted: true, history };
|
|
538
596
|
}
|
|
539
597
|
|
|
598
|
+
if (dispatchResult?.blocked === true) {
|
|
599
|
+
history.push({ role: roleId, turn_id: turn.turn_id, accepted: false, blocked: true });
|
|
600
|
+
const blockedState = loadState(root, config);
|
|
601
|
+
emit({ type: 'blocked', state: blockedState });
|
|
602
|
+
return { terminal: true, ok: false, stop_reason: 'blocked', history };
|
|
603
|
+
}
|
|
604
|
+
|
|
540
605
|
// Rejection
|
|
541
606
|
const validationResult = {
|
|
542
607
|
stage: 'dispatch',
|
package/src/lib/schema.js
CHANGED
|
@@ -35,6 +35,13 @@ export function validateGovernedStateSchema(data) {
|
|
|
35
35
|
// but validators and read-only surfaces still tolerate reserved/manual states.
|
|
36
36
|
const VALID_RUN_STATUSES = ['idle', 'active', 'paused', 'blocked', 'completed', 'failed'];
|
|
37
37
|
const isV1_1 = data?.schema_version === '1.1';
|
|
38
|
+
// NOTE: `current_turn` is the persisted v1.0 schema field. Under v1.1 it is
|
|
39
|
+
// not a persisted field at all — `loadProjectState()` re-attaches it as a
|
|
40
|
+
// non-enumerable getter alias over `active_turns` after normalization
|
|
41
|
+
// (DEC-CURRENT-TURN-COMPAT-ALIAS-001). This validator runs against the
|
|
42
|
+
// persisted shape, so an `own` property named `current_turn` on a v1.1 doc
|
|
43
|
+
// means "stray persisted-shape leak from a legacy write" and is rejected
|
|
44
|
+
// below — it does NOT mean the runtime alias is going away.
|
|
38
45
|
const hasLegacyCurrentTurn = Object.prototype.hasOwnProperty.call(data || {}, 'current_turn');
|
|
39
46
|
|
|
40
47
|
function validateTurn(turn, label) {
|
|
@@ -85,7 +85,21 @@
|
|
|
85
85
|
"type": "object"
|
|
86
86
|
},
|
|
87
87
|
"run_loop": {
|
|
88
|
-
"type": "object"
|
|
88
|
+
"type": "object",
|
|
89
|
+
"description": "Runner control knobs for execution watchdogs and automation behavior.",
|
|
90
|
+
"properties": {
|
|
91
|
+
"startup_watchdog_ms": {
|
|
92
|
+
"type": "integer",
|
|
93
|
+
"minimum": 1,
|
|
94
|
+
"description": "Milliseconds to wait after dispatch for worker attach/first-output proof before retaining the turn as failed_start. Default 30000."
|
|
95
|
+
},
|
|
96
|
+
"stale_turn_threshold_ms": {
|
|
97
|
+
"type": "integer",
|
|
98
|
+
"minimum": 1,
|
|
99
|
+
"description": "Milliseconds to wait before a started turn that previously produced output is treated as stale. Default 600000 for local_cli turns and 300000 for api_proxy turns."
|
|
100
|
+
}
|
|
101
|
+
},
|
|
102
|
+
"additionalProperties": true
|
|
89
103
|
},
|
|
90
104
|
"mission_planner": {
|
|
91
105
|
"type": "object"
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Staged turn-result proof helpers.
|
|
3
|
+
*
|
|
4
|
+
* Per DEC-BUG51-STAGING-PLACEHOLDER-NOT-PROOF-001: a turn-scoped staged-result
|
|
5
|
+
* file is proof of execution only when it contains meaningful result content.
|
|
6
|
+
* Adapter-authored placeholders (`{}`, blank, whitespace-only) are cleanup
|
|
7
|
+
* artifacts — watchdog, adapter, and recovery code must treat them as absent.
|
|
8
|
+
*
|
|
9
|
+
* This module centralizes that check so every surface (local-cli adapter,
|
|
10
|
+
* manual adapter, stale-turn watchdog) uses the same rule.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Returns true when the staged-result file at `filePath` exists AND contains
|
|
17
|
+
* content that is not a placeholder (empty, whitespace-only, or `{}`).
|
|
18
|
+
*
|
|
19
|
+
* Trim-aware: `{}\n`, ` {}\n`, and `{}` are all rejected. Legitimate turn
|
|
20
|
+
* results carry the full governed schema and are far larger than the
|
|
21
|
+
* placeholder shapes this function filters.
|
|
22
|
+
*
|
|
23
|
+
* @param {string} filePath - absolute path to the staged-result file
|
|
24
|
+
* @returns {boolean}
|
|
25
|
+
*/
|
|
26
|
+
export function hasMeaningfulStagedResult(filePath) {
|
|
27
|
+
if (!existsSync(filePath)) {
|
|
28
|
+
return false;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
let raw;
|
|
32
|
+
try {
|
|
33
|
+
raw = readFileSync(filePath, 'utf8');
|
|
34
|
+
} catch {
|
|
35
|
+
return false;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
const trimmed = raw.trim();
|
|
39
|
+
if (trimmed === '' || trimmed === '{}') {
|
|
40
|
+
return false;
|
|
41
|
+
}
|
|
42
|
+
return true;
|
|
43
|
+
}
|
|
@@ -1,16 +1,32 @@
|
|
|
1
1
|
/**
|
|
2
|
-
* Stale Turn Watchdog — BUG-47
|
|
2
|
+
* Stale Turn Watchdog — BUG-47 + BUG-51
|
|
3
3
|
*
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
*
|
|
4
|
+
* Two-tier lazy idle-threshold detection:
|
|
5
|
+
*
|
|
6
|
+
* 1. **Fast startup watchdog (BUG-51):** if an active turn has been
|
|
7
|
+
`dispatched`/`starting`/`running`/`retrying` for >30 seconds with NO startup proof
|
|
8
|
+
* (no first-byte output recorded on the turn or in dispatch-progress) and
|
|
9
|
+
* NO staged result, it is a "ghost turn" — the subprocess never reached a
|
|
10
|
+
* healthy running state. Transitions to `failed_start` immediately.
|
|
11
|
+
*
|
|
12
|
+
* Design note: the watchdog intentionally keys on first-output proof from
|
|
13
|
+
* the framework-owned dispatch-progress contract rather than `stdout.log`
|
|
14
|
+
* existence. `stdout.log` is adapter-authored visibility output and may be
|
|
15
|
+
* absent even when the adapter is wired correctly. First-output timestamps
|
|
16
|
+
* and output-line counters are the stable health contract across runtime
|
|
17
|
+
* wiring.
|
|
18
|
+
*
|
|
19
|
+
* 2. **Stale turn watchdog (BUG-47):** if an active turn has status "running" or "retrying"
|
|
20
|
+
* for >N minutes with no event log activity AND no staged result file,
|
|
21
|
+
* report it as stalled.
|
|
7
22
|
*
|
|
8
23
|
* Fires on CLI invocations (status, resume, step --resume) rather than
|
|
9
24
|
* requiring a background daemon.
|
|
10
25
|
*
|
|
11
26
|
* Default thresholds:
|
|
12
|
-
* -
|
|
13
|
-
* -
|
|
27
|
+
* - Startup watchdog: 30 seconds (configurable via run_loop.startup_watchdog_ms)
|
|
28
|
+
* - local_cli stale turns: 10 minutes
|
|
29
|
+
* - api_proxy stale turns: 5 minutes
|
|
14
30
|
* - Configurable via run_loop.stale_turn_threshold_ms in agentxchain.json
|
|
15
31
|
*/
|
|
16
32
|
|
|
@@ -20,9 +36,11 @@ import { safeWriteJson } from './safe-write.js';
|
|
|
20
36
|
import { emitRunEvent, readRunEvents } from './run-events.js';
|
|
21
37
|
import { getTurnStagingResultPath } from './turn-paths.js';
|
|
22
38
|
import { getDispatchProgressRelativePath } from './dispatch-progress.js';
|
|
39
|
+
import { hasMeaningfulStagedResult } from './staged-result-proof.js';
|
|
23
40
|
|
|
24
41
|
const DEFAULT_LOCAL_CLI_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes
|
|
25
42
|
const DEFAULT_API_PROXY_THRESHOLD_MS = 5 * 60 * 1000; // 5 minutes
|
|
43
|
+
const DEFAULT_STARTUP_WATCHDOG_MS = 30 * 1000; // 30 seconds (BUG-51)
|
|
26
44
|
const LEGACY_STAGING_PATH = '.agentxchain/staging/turn-result.json';
|
|
27
45
|
|
|
28
46
|
/**
|
|
@@ -83,6 +101,62 @@ export function detectStaleTurns(root, state, config) {
|
|
|
83
101
|
return stale;
|
|
84
102
|
}
|
|
85
103
|
|
|
104
|
+
/**
|
|
105
|
+
* BUG-51: Detect ghost-dispatched turns — subprocess never started.
|
|
106
|
+
*
|
|
107
|
+
* A ghost turn is one that has been in `dispatched`, `starting`, `running`, or
|
|
108
|
+
* `retrying` longer than the startup watchdog threshold (default 30s) AND has:
|
|
109
|
+
* - no startup proof (no `first_output_at` on the turn or dispatch-progress,
|
|
110
|
+
* and no recorded output line counts)
|
|
111
|
+
*   - no meaningful staged result file (placeholder content such as `{}` or blank counts as absent)
|
|
112
|
+
*
|
|
113
|
+
* This is a stricter, faster check than detectStaleTurns (BUG-47).
|
|
114
|
+
* Ghost turns transition to "failed_start" rather than "stalled".
|
|
115
|
+
*
|
|
116
|
+
* @param {string} root - project root directory
|
|
117
|
+
* @param {object} state - current governed state
|
|
118
|
+
* @param {object} config - normalized config
|
|
119
|
+
* @returns {Array<{ turn_id: string, role: string, runtime_id: string, running_ms: number, threshold_ms: number, recommendation: string, failure_type: string }>}
|
|
120
|
+
*/
|
|
121
|
+
export function detectGhostTurns(root, state, config) {
|
|
122
|
+
const activeTurns = state?.active_turns || {};
|
|
123
|
+
const ghosts = [];
|
|
124
|
+
const now = Date.now();
|
|
125
|
+
const startupThreshold = resolveStartupThreshold(config);
|
|
126
|
+
|
|
127
|
+
for (const [turnId, turn] of Object.entries(activeTurns)) {
|
|
128
|
+
if (!['dispatched', 'starting', 'running', 'retrying'].includes(turn.status)) continue;
|
|
129
|
+
|
|
130
|
+
const lifecycleStart = parseGhostLifecycleStart(turn);
|
|
131
|
+
if (!Number.isFinite(lifecycleStart)) continue;
|
|
132
|
+
|
|
133
|
+
const runningMs = now - lifecycleStart;
|
|
134
|
+
if (runningMs < startupThreshold) continue;
|
|
135
|
+
|
|
136
|
+
const progressPath = join(root, getDispatchProgressRelativePath(turnId));
|
|
137
|
+
const progress = readDispatchProgressSafe(progressPath);
|
|
138
|
+
|
|
139
|
+
if (hasTurnScopedStagedResult(root, turnId)) continue;
|
|
140
|
+
if (hasStartupProof(turn, progress)) continue;
|
|
141
|
+
|
|
142
|
+
const runningSeconds = Math.floor(runningMs / 1000);
|
|
143
|
+
const failureType = classifyStartupFailureType(turn, progress);
|
|
144
|
+
ghosts.push({
|
|
145
|
+
turn_id: turnId,
|
|
146
|
+
role: turn.assigned_role || 'unknown',
|
|
147
|
+
runtime_id: turn.runtime_id || 'unknown',
|
|
148
|
+
running_ms: runningMs,
|
|
149
|
+
threshold_ms: startupThreshold,
|
|
150
|
+
failure_type: failureType,
|
|
151
|
+
recommendation: `Turn ${turnId} has been dispatched for ${runningSeconds}s with no subprocess output. `
|
|
152
|
+
+ `The subprocess likely never started. `
|
|
153
|
+
+ `Run \`agentxchain reissue-turn --turn ${turnId} --reason ghost\` to recover.`,
|
|
154
|
+
});
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
return ghosts;
|
|
158
|
+
}
|
|
159
|
+
|
|
86
160
|
/**
|
|
87
161
|
* Detect stale turns and emit turn_stalled events for each.
|
|
88
162
|
* Returns the stale turn list for caller display.
|
|
@@ -95,18 +169,36 @@ export function detectAndEmitStaleTurns(root, state, config) {
|
|
|
95
169
|
|
|
96
170
|
export function reconcileStaleTurns(root, state, config) {
|
|
97
171
|
if (!state || typeof state !== 'object') {
|
|
98
|
-
return { stale_turns: [], state, changed: false };
|
|
172
|
+
return { stale_turns: [], ghost_turns: [], state, changed: false };
|
|
99
173
|
}
|
|
100
174
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
175
|
+
// BUG-51: Fast startup watchdog — detect ghost turns first (default 30s threshold, configurable via run_loop.startup_watchdog_ms)
|
|
176
|
+
const ghosts = detectGhostTurns(root, state, config);
|
|
177
|
+
|
|
178
|
+
// BUG-47: Stale turn watchdog — detect turns that started but went silent (default threshold: 10m local_cli, 5m api_proxy)
|
|
179
|
+
// Exclude turns already caught by ghost detection to avoid double-counting
|
|
180
|
+
const ghostIds = new Set(ghosts.map(g => g.turn_id));
|
|
181
|
+
const stale = detectStaleTurns(root, state, config).filter(s => !ghostIds.has(s.turn_id));
|
|
182
|
+
|
|
183
|
+
if (ghosts.length === 0 && stale.length === 0) {
|
|
184
|
+
return { stale_turns: [], ghost_turns: [], state, changed: false };
|
|
104
185
|
}
|
|
105
186
|
|
|
106
187
|
const nowIso = new Date().toISOString();
|
|
107
188
|
const activeTurns = { ...(state.active_turns || {}) };
|
|
189
|
+
const budgetReservations = { ...(state.budget_reservations || {}) };
|
|
108
190
|
let changed = false;
|
|
109
191
|
|
|
192
|
+
// Process ghost turns (BUG-51) — transition to failed_start
|
|
193
|
+
for (const entry of ghosts) {
|
|
194
|
+
const applied = applyStartupFailureToActiveTurn(activeTurns, budgetReservations, entry, nowIso);
|
|
195
|
+
if (applied) {
|
|
196
|
+
emitStartupFailureEvent(root, state, entry);
|
|
197
|
+
changed = true;
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
// Process stale turns (BUG-47) — transition to stalled
|
|
110
202
|
for (const entry of stale) {
|
|
111
203
|
const turn = activeTurns[entry.turn_id];
|
|
112
204
|
if (!turn || (turn.status !== 'running' && turn.status !== 'retrying')) continue;
|
|
@@ -123,6 +215,9 @@ export function reconcileStaleTurns(root, state, config) {
|
|
|
123
215
|
};
|
|
124
216
|
changed = true;
|
|
125
217
|
|
|
218
|
+
// BUG-51 fix #6: Release budget reservation for stale turns too
|
|
219
|
+
delete budgetReservations[entry.turn_id];
|
|
220
|
+
|
|
126
221
|
emitRunEvent(root, 'turn_stalled', {
|
|
127
222
|
run_id: state?.run_id || null,
|
|
128
223
|
phase: state?.phase || null,
|
|
@@ -138,28 +233,12 @@ export function reconcileStaleTurns(root, state, config) {
|
|
|
138
233
|
}
|
|
139
234
|
|
|
140
235
|
if (!changed) {
|
|
141
|
-
return { stale_turns: stale, state, changed: false };
|
|
236
|
+
return { stale_turns: stale, ghost_turns: ghosts, state, changed: false };
|
|
142
237
|
}
|
|
143
238
|
|
|
144
|
-
const
|
|
145
|
-
const
|
|
146
|
-
|
|
147
|
-
status: 'blocked',
|
|
148
|
-
active_turns: activeTurns,
|
|
149
|
-
blocked_on: stale.length === 1 ? `turn:stalled:${primary.turn_id}` : 'turns:stalled',
|
|
150
|
-
blocked_reason: {
|
|
151
|
-
category: 'stale_turn',
|
|
152
|
-
blocked_at: nowIso,
|
|
153
|
-
turn_id: primary.turn_id,
|
|
154
|
-
recovery: {
|
|
155
|
-
typed_reason: 'stale_turn',
|
|
156
|
-
owner: 'human',
|
|
157
|
-
recovery_action: primary.recommendation,
|
|
158
|
-
turn_retained: true,
|
|
159
|
-
detail: primary.recommendation,
|
|
160
|
-
},
|
|
161
|
-
},
|
|
162
|
-
};
|
|
239
|
+
const nextState = buildBlockedStateFromEntries(state, activeTurns, budgetReservations, ghosts, stale, nowIso);
|
|
240
|
+
const primary = [...ghosts, ...stale][0];
|
|
241
|
+
const category = ghosts.length > 0 ? 'ghost_turn' : 'stale_turn';
|
|
163
242
|
|
|
164
243
|
safeWriteJson(join(root, '.agentxchain', 'state.json'), nextState);
|
|
165
244
|
emitRunEvent(root, 'run_blocked', {
|
|
@@ -168,11 +247,12 @@ export function reconcileStaleTurns(root, state, config) {
|
|
|
168
247
|
status: 'blocked',
|
|
169
248
|
turn: { turn_id: primary.turn_id, role_id: primary.role },
|
|
170
249
|
payload: {
|
|
171
|
-
category
|
|
250
|
+
category,
|
|
251
|
+
ghost_turn_ids: ghosts.map((entry) => entry.turn_id),
|
|
172
252
|
stalled_turn_ids: stale.map((entry) => entry.turn_id),
|
|
173
253
|
},
|
|
174
254
|
});
|
|
175
|
-
return { stale_turns: stale, state: nextState, changed: true };
|
|
255
|
+
return { stale_turns: stale, ghost_turns: ghosts, state: nextState, changed: true };
|
|
176
256
|
}
|
|
177
257
|
|
|
178
258
|
function resolveThreshold(turn, config) {
|
|
@@ -194,13 +274,71 @@ function resolveThreshold(turn, config) {
|
|
|
194
274
|
return DEFAULT_LOCAL_CLI_THRESHOLD_MS;
|
|
195
275
|
}
|
|
196
276
|
|
|
277
|
+
function resolveStartupThreshold(config) {
|
|
278
|
+
const configThreshold = config?.run_loop?.startup_watchdog_ms;
|
|
279
|
+
if (typeof configThreshold === 'number' && configThreshold > 0) {
|
|
280
|
+
return configThreshold;
|
|
281
|
+
}
|
|
282
|
+
return DEFAULT_STARTUP_WATCHDOG_MS;
|
|
283
|
+
}
|
|
284
|
+
|
|
285
|
+
export function failTurnStartup(root, state, config, turnId, details = {}) {
|
|
286
|
+
if (!state || typeof state !== 'object') {
|
|
287
|
+
return { ok: false, error: 'No governed state found' };
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
const turn = state.active_turns?.[turnId];
|
|
291
|
+
if (!turn) {
|
|
292
|
+
return { ok: false, error: `Turn ${turnId} not found in active turns` };
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
const nowIso = new Date().toISOString();
|
|
296
|
+
const activeTurns = { ...(state.active_turns || {}) };
|
|
297
|
+
const budgetReservations = { ...(state.budget_reservations || {}) };
|
|
298
|
+
const entry = {
|
|
299
|
+
turn_id: turnId,
|
|
300
|
+
role: turn.assigned_role || 'unknown',
|
|
301
|
+
runtime_id: turn.runtime_id || 'unknown',
|
|
302
|
+
running_ms: details.running_ms ?? computeLifecycleAgeMs(turn),
|
|
303
|
+
threshold_ms: details.threshold_ms ?? resolveStartupThreshold(config),
|
|
304
|
+
failure_type: classifyStartupFailureType(turn, null, details.failure_type || 'no_subprocess_output'),
|
|
305
|
+
recommendation: details.recommendation
|
|
306
|
+
|| `Turn ${turnId} failed to start cleanly. Run \`agentxchain reissue-turn --turn ${turnId} --reason ghost\` to recover.`,
|
|
307
|
+
};
|
|
308
|
+
|
|
309
|
+
if (!applyStartupFailureToActiveTurn(activeTurns, budgetReservations, entry, nowIso)) {
|
|
310
|
+
return { ok: false, error: `Turn ${turnId} is not eligible for startup failure transition` };
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
const nextState = buildBlockedStateFromEntries(state, activeTurns, budgetReservations, [entry], [], nowIso);
|
|
314
|
+
safeWriteJson(join(root, '.agentxchain', 'state.json'), nextState);
|
|
315
|
+
emitStartupFailureEvent(root, state, entry);
|
|
316
|
+
emitRunEvent(root, 'run_blocked', {
|
|
317
|
+
run_id: nextState.run_id || null,
|
|
318
|
+
phase: nextState.phase || null,
|
|
319
|
+
status: 'blocked',
|
|
320
|
+
turn: { turn_id: entry.turn_id, role_id: entry.role },
|
|
321
|
+
payload: {
|
|
322
|
+
category: 'ghost_turn',
|
|
323
|
+
ghost_turn_ids: [entry.turn_id],
|
|
324
|
+
stalled_turn_ids: [],
|
|
325
|
+
},
|
|
326
|
+
});
|
|
327
|
+
return { ok: true, state: nextState, turn: nextState.active_turns?.[turnId] || null };
|
|
328
|
+
}
|
|
329
|
+
|
|
197
330
|
function hasRecentTurnEventActivity(root, turnId, startedAt, threshold, now) {
|
|
198
331
|
try {
|
|
199
332
|
const events = readRunEvents(root, { limit: 200 });
|
|
200
333
|
for (let i = events.length - 1; i >= 0; i--) {
|
|
201
334
|
const event = events[i];
|
|
202
335
|
if (event?.turn?.turn_id !== turnId) continue;
|
|
203
|
-
if (
|
|
336
|
+
if (
|
|
337
|
+
event.event_type === 'turn_stalled'
|
|
338
|
+
|| event.event_type === 'turn_start_failed'
|
|
339
|
+
|| event.event_type === 'runtime_spawn_failed'
|
|
340
|
+
|| event.event_type === 'stdout_attach_failed'
|
|
341
|
+
) continue;
|
|
204
342
|
const timestamp = Date.parse(event.timestamp || '');
|
|
205
343
|
if (!Number.isFinite(timestamp)) continue;
|
|
206
344
|
if (timestamp < startedAt) continue;
|
|
@@ -214,9 +352,145 @@ function hasRecentTurnEventActivity(root, turnId, startedAt, threshold, now) {
|
|
|
214
352
|
return false;
|
|
215
353
|
}
|
|
216
354
|
|
|
355
|
+
function applyStartupFailureToActiveTurn(activeTurns, budgetReservations, entry, nowIso) {
|
|
356
|
+
const turn = activeTurns[entry.turn_id];
|
|
357
|
+
if (!turn || !['dispatched', 'starting', 'running', 'retrying'].includes(turn.status)) {
|
|
358
|
+
return false;
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
activeTurns[entry.turn_id] = {
|
|
362
|
+
...turn,
|
|
363
|
+
status: 'failed_start',
|
|
364
|
+
failed_start_at: nowIso,
|
|
365
|
+
failed_start_reason: entry.failure_type,
|
|
366
|
+
failed_start_previous_status: turn.status,
|
|
367
|
+
failed_start_threshold_ms: entry.threshold_ms,
|
|
368
|
+
failed_start_running_ms: entry.running_ms,
|
|
369
|
+
recovery_command: `agentxchain reissue-turn --turn ${entry.turn_id} --reason ghost`,
|
|
370
|
+
};
|
|
371
|
+
delete budgetReservations[entry.turn_id];
|
|
372
|
+
return true;
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
function emitStartupFailureEvent(root, state, entry) {
|
|
376
|
+
const payload = {
|
|
377
|
+
running_ms: entry.running_ms,
|
|
378
|
+
threshold_ms: entry.threshold_ms,
|
|
379
|
+
runtime_id: entry.runtime_id,
|
|
380
|
+
failure_type: entry.failure_type,
|
|
381
|
+
recommendation: entry.recommendation,
|
|
382
|
+
};
|
|
383
|
+
const details = {
|
|
384
|
+
run_id: state?.run_id || null,
|
|
385
|
+
phase: state?.phase || null,
|
|
386
|
+
status: 'blocked',
|
|
387
|
+
turn: { turn_id: entry.turn_id, role_id: entry.role },
|
|
388
|
+
payload,
|
|
389
|
+
};
|
|
390
|
+
emitRunEvent(root, 'turn_start_failed', details);
|
|
391
|
+
const failureEventType = mapStartupFailureEventType(entry.failure_type);
|
|
392
|
+
if (failureEventType) {
|
|
393
|
+
emitRunEvent(root, failureEventType, details);
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
function buildBlockedStateFromEntries(state, activeTurns, budgetReservations, ghosts, stale, nowIso) {
|
|
398
|
+
const allDetected = [...ghosts, ...stale];
|
|
399
|
+
const primary = allDetected[0];
|
|
400
|
+
const category = ghosts.length > 0 ? 'ghost_turn' : 'stale_turn';
|
|
401
|
+
const blockedOn = allDetected.length === 1
|
|
402
|
+
? `turn:${primary.failure_type ? 'failed_start' : 'stalled'}:${primary.turn_id}`
|
|
403
|
+
: ghosts.length > 0 ? 'turns:failed_start' : 'turns:stalled';
|
|
404
|
+
|
|
405
|
+
return {
|
|
406
|
+
...state,
|
|
407
|
+
status: 'blocked',
|
|
408
|
+
active_turns: activeTurns,
|
|
409
|
+
budget_reservations: budgetReservations,
|
|
410
|
+
blocked_on: blockedOn,
|
|
411
|
+
blocked_reason: {
|
|
412
|
+
category,
|
|
413
|
+
blocked_at: nowIso,
|
|
414
|
+
turn_id: primary.turn_id,
|
|
415
|
+
recovery: {
|
|
416
|
+
typed_reason: category,
|
|
417
|
+
owner: 'human',
|
|
418
|
+
recovery_action: primary.recommendation,
|
|
419
|
+
turn_retained: true,
|
|
420
|
+
detail: primary.recommendation,
|
|
421
|
+
},
|
|
422
|
+
},
|
|
423
|
+
};
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
function parseGhostLifecycleStart(turn) {
|
|
427
|
+
if (turn.status === 'dispatched') {
|
|
428
|
+
return Date.parse(turn.dispatched_at || turn.assigned_at || '');
|
|
429
|
+
}
|
|
430
|
+
return Date.parse(turn.started_at || turn.dispatched_at || turn.assigned_at || '');
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
function computeLifecycleAgeMs(turn) {
|
|
434
|
+
const start = parseGhostLifecycleStart(turn);
|
|
435
|
+
if (!Number.isFinite(start)) return 0;
|
|
436
|
+
return Math.max(0, Date.now() - start);
|
|
437
|
+
}
|
|
438
|
+
|
|
439
|
+
function readDispatchProgressSafe(progressPath) {
|
|
440
|
+
if (!existsSync(progressPath)) {
|
|
441
|
+
return null;
|
|
442
|
+
}
|
|
443
|
+
try {
|
|
444
|
+
return JSON.parse(readFileSync(progressPath, 'utf8'));
|
|
445
|
+
} catch {
|
|
446
|
+
return null;
|
|
447
|
+
}
|
|
448
|
+
}
|
|
449
|
+
|
|
450
|
+
function classifyStartupFailureType(turn, progress, fallback = 'no_subprocess_output') {
|
|
451
|
+
if (fallback === 'runtime_spawn_failed' || fallback === 'stdout_attach_failed') {
|
|
452
|
+
return fallback;
|
|
453
|
+
}
|
|
454
|
+
if (turn?.status === 'dispatched') {
|
|
455
|
+
return 'runtime_spawn_failed';
|
|
456
|
+
}
|
|
457
|
+
const hasWorkerAttachProof = Boolean(
|
|
458
|
+
turn?.worker_attached_at
|
|
459
|
+
|| turn?.worker_pid != null
|
|
460
|
+
|| progress?.pid != null,
|
|
461
|
+
);
|
|
462
|
+
if (turn?.status === 'starting' || hasWorkerAttachProof) {
|
|
463
|
+
return 'stdout_attach_failed';
|
|
464
|
+
}
|
|
465
|
+
return fallback;
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
function mapStartupFailureEventType(failureType) {
|
|
469
|
+
if (failureType === 'runtime_spawn_failed') {
|
|
470
|
+
return 'runtime_spawn_failed';
|
|
471
|
+
}
|
|
472
|
+
if (failureType === 'stdout_attach_failed') {
|
|
473
|
+
return 'stdout_attach_failed';
|
|
474
|
+
}
|
|
475
|
+
return null;
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
function hasStartupProof(turn, progress) {
|
|
479
|
+
if (turn.first_output_at) {
|
|
480
|
+
return true;
|
|
481
|
+
}
|
|
482
|
+
if (!progress || typeof progress !== 'object') {
|
|
483
|
+
return false;
|
|
484
|
+
}
|
|
485
|
+
if (progress.first_output_at) {
|
|
486
|
+
return true;
|
|
487
|
+
}
|
|
488
|
+
return Number(progress.output_lines || 0) > 0 || Number(progress.stderr_lines || 0) > 0;
|
|
489
|
+
}
|
|
490
|
+
|
|
217
491
|
function hasTurnScopedStagedResult(root, turnId) {
|
|
218
492
|
const turnScopedPath = join(root, getTurnStagingResultPath(turnId));
|
|
219
|
-
if (
|
|
493
|
+
if (hasMeaningfulStagedResult(turnScopedPath)) {
|
|
220
494
|
return true;
|
|
221
495
|
}
|
|
222
496
|
|