agentxchain 2.146.0 → 2.147.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/publish-npm.sh +16 -0
- package/scripts/sync-homebrew.sh +14 -1
- package/scripts/verify-post-publish.sh +55 -4
- package/src/commands/reissue-turn.js +16 -0
- package/src/commands/reject-turn.js +14 -1
- package/src/commands/restart.js +15 -0
- package/src/commands/resume.js +61 -66
- package/src/commands/run.js +67 -10
- package/src/commands/schedule.js +34 -7
- package/src/commands/status.js +20 -0
- package/src/commands/step.js +100 -34
- package/src/lib/adapters/api-proxy-adapter.js +8 -0
- package/src/lib/adapters/local-cli-adapter.js +131 -13
- package/src/lib/adapters/manual-adapter.js +9 -10
- package/src/lib/adapters/mcp-adapter.js +3 -5
- package/src/lib/adapters/remote-agent-adapter.js +3 -5
- package/src/lib/continuous-run.js +71 -6
- package/src/lib/dispatch-bundle.js +1 -1
- package/src/lib/dispatch-progress.js +5 -3
- package/src/lib/governed-state.js +224 -13
- package/src/lib/intake.js +10 -1
- package/src/lib/normalized-config.js +51 -1
- package/src/lib/recent-event-summary.js +11 -0
- package/src/lib/run-events.js +4 -0
- package/src/lib/run-loop.js +67 -2
- package/src/lib/runner-interface.js +1 -0
- package/src/lib/schema.js +7 -0
- package/src/lib/schemas/agentxchain-config.schema.json +15 -1
- package/src/lib/staged-result-proof.js +43 -0
- package/src/lib/stale-turn-watchdog.js +218 -90
- package/src/lib/turn-result-shape.js +38 -0
- package/src/lib/turn-result-validator.js +4 -1
package/src/lib/run-loop.js
CHANGED
|
@@ -31,6 +31,7 @@ import {
|
|
|
31
31
|
getActiveTurnCount,
|
|
32
32
|
getActiveTurns,
|
|
33
33
|
getMaxConcurrentTurns,
|
|
34
|
+
transitionActiveTurnLifecycle,
|
|
34
35
|
RUNNER_INTERFACE_VERSION,
|
|
35
36
|
} from './runner-interface.js';
|
|
36
37
|
|
|
@@ -40,6 +41,18 @@ import { join, dirname } from 'path';
|
|
|
40
41
|
import { evaluateApprovalSlaReminders } from './notification-runner.js';
|
|
41
42
|
import { validatePreemptionMarker } from './intake.js';
|
|
42
43
|
import { buildTimeoutBlockedReason, evaluateTimeouts } from './timeout-evaluator.js';
|
|
44
|
+
import { hasMinimumTurnResultShape } from './turn-result-shape.js';
|
|
45
|
+
|
|
46
|
+
// Per DEC-RUN-LOOP-MIN-SHAPE-SYMMETRY-001 (Turn 33): runLoop is the SDK boundary
|
|
47
|
+
// any third-party runner can wire (see website-v2/docs/build-your-own-runner.mdx).
|
|
48
|
+
// In-repo adapters (api_proxy, mcp, local_cli, remote_agent) already validate
|
|
49
|
+
// staged-result shape before write per DEC-MINIMUM-TURN-RESULT-SHAPE-001, and
|
|
50
|
+
// run.js's dispatch callback re-validates before returning per
|
|
51
|
+
// DEC-RUN-STAGED-READ-SHAPE-GUARD-001. Third-party callbacks have no such
|
|
52
|
+
// obligation. runLoop must therefore validate dispatchResult.turnResult shape
|
|
53
|
+
// before persisting it as a governed staged-result artifact.
|
|
54
|
+
const MIN_SHAPE_REJECTION_REASON =
|
|
55
|
+
'staged result missing minimum governed envelope (schema_version + identity + lifecycle fields)';
|
|
43
56
|
|
|
44
57
|
const DEFAULT_MAX_TURNS = 50;
|
|
45
58
|
|
|
@@ -182,7 +195,7 @@ async function executeSequentialTurn(root, config, state, callbacks, emit, error
|
|
|
182
195
|
let assignState;
|
|
183
196
|
const activeTurn = getActiveTurn(state);
|
|
184
197
|
|
|
185
|
-
if (activeTurn && (activeTurn
|
|
198
|
+
if (activeTurn && isDispatchableActiveTurn(activeTurn)) {
|
|
186
199
|
turn = activeTurn;
|
|
187
200
|
assignState = state;
|
|
188
201
|
} else {
|
|
@@ -224,7 +237,7 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
|
|
|
224
237
|
const activeTurns = getActiveTurns(state);
|
|
225
238
|
const turnsToDispatch = [];
|
|
226
239
|
for (const turn of Object.values(activeTurns)) {
|
|
227
|
-
if (turn
|
|
240
|
+
if (isDispatchableActiveTurn(turn)) {
|
|
228
241
|
turnsToDispatch.push({ turn, state });
|
|
229
242
|
}
|
|
230
243
|
}
|
|
@@ -317,6 +330,7 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
|
|
|
317
330
|
errors.push(`writeDispatchBundle(${turn.assigned_role}): ${bundleResult.error}`);
|
|
318
331
|
continue;
|
|
319
332
|
}
|
|
333
|
+
transitionActiveTurnLifecycle(root, turn.turn_id, 'dispatched');
|
|
320
334
|
const stagingPath = getTurnStagingResultPath(turn.turn_id);
|
|
321
335
|
contexts.push({
|
|
322
336
|
turn,
|
|
@@ -362,6 +376,23 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
|
|
|
362
376
|
continue;
|
|
363
377
|
}
|
|
364
378
|
|
|
379
|
+
if (dispatchResult.accept && !hasMinimumTurnResultShape(dispatchResult.turnResult)) {
|
|
380
|
+
// DEC-RUN-LOOP-MIN-SHAPE-SYMMETRY-001: third-party dispatch callback claimed
|
|
381
|
+
// accept=true but returned a payload missing the minimum envelope. Refuse to
|
|
382
|
+
// stage; convert to standard rejection so the run state advances cleanly.
|
|
383
|
+
const validationResult = { stage: 'dispatch', errors: [MIN_SHAPE_REJECTION_REASON] };
|
|
384
|
+
rejectTurn(root, config, validationResult, MIN_SHAPE_REJECTION_REASON, { turnId: turn.turn_id });
|
|
385
|
+
history.push({ role: roleId, turn_id: turn.turn_id, accepted: false });
|
|
386
|
+
emit({ type: 'turn_rejected', turn, role: roleId, reason: MIN_SHAPE_REJECTION_REASON });
|
|
387
|
+
const postRejectState = loadState(root, config);
|
|
388
|
+
if (postRejectState?.status === 'blocked') {
|
|
389
|
+
errors.push(`Turn rejected for ${roleId}, retries exhausted`);
|
|
390
|
+
emit({ type: 'blocked', state: postRejectState });
|
|
391
|
+
return { terminal: true, ok: false, stop_reason: 'reject_exhausted', history, acceptedCount };
|
|
392
|
+
}
|
|
393
|
+
continue;
|
|
394
|
+
}
|
|
395
|
+
|
|
365
396
|
if (dispatchResult.accept) {
|
|
366
397
|
const absStaging = join(root, ctx.stagingPath);
|
|
367
398
|
mkdirSync(dirname(absStaging), { recursive: true });
|
|
@@ -409,6 +440,12 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
|
|
|
409
440
|
}
|
|
410
441
|
emit({ type: 'turn_accepted', turn, role: roleId, state: acceptResult.state });
|
|
411
442
|
} else {
|
|
443
|
+
if (dispatchResult?.blocked === true) {
|
|
444
|
+
history.push({ role: roleId, turn_id: turn.turn_id, accepted: false, blocked: true });
|
|
445
|
+
const blockedState = loadState(root, config);
|
|
446
|
+
emit({ type: 'blocked', state: blockedState });
|
|
447
|
+
return { terminal: true, ok: false, stop_reason: 'blocked', history, acceptedCount };
|
|
448
|
+
}
|
|
412
449
|
const validationResult = {
|
|
413
450
|
stage: 'dispatch',
|
|
414
451
|
errors: [dispatchResult.reason || 'Dispatch callback rejected the turn'],
|
|
@@ -449,6 +486,10 @@ async function executeParallelTurns(root, config, state, maxConcurrent, callback
|
|
|
449
486
|
return { terminal: false, history, acceptedCount };
|
|
450
487
|
}
|
|
451
488
|
|
|
489
|
+
/**
 * Reports whether an active turn is in a status the run loop treats as
 * dispatchable (it is reused for dispatch instead of assigning a new turn).
 *
 * @param {{ status?: string } | null | undefined} turn - active turn record
 * @returns {boolean} true for `assigned`, `dispatched`, `starting`, `running`
 *   or `retrying`; false for any other status or a missing turn
 */
function isDispatchableActiveTurn(turn) {
  switch (turn?.status) {
    case 'assigned':
    case 'dispatched':
    case 'starting':
    case 'running':
    case 'retrying':
      return true;
    default:
      return false;
  }
}
|
|
492
|
+
|
|
452
493
|
/**
|
|
453
494
|
* Dispatch a single turn and process its result.
|
|
454
495
|
*/
|
|
@@ -463,6 +504,7 @@ async function dispatchAndProcess(root, config, turn, assignState, callbacks, em
|
|
|
463
504
|
errors.push(`writeDispatchBundle(${roleId}): ${bundleResult.error}`);
|
|
464
505
|
return { terminal: true, ok: false, stop_reason: 'blocked', history };
|
|
465
506
|
}
|
|
507
|
+
transitionActiveTurnLifecycle(root, turn.turn_id, 'dispatched');
|
|
466
508
|
|
|
467
509
|
const stagingPath = getTurnStagingResultPath(turn.turn_id);
|
|
468
510
|
const context = {
|
|
@@ -488,6 +530,22 @@ async function dispatchAndProcess(root, config, turn, assignState, callbacks, em
|
|
|
488
530
|
return { terminal: true, ok: false, stop_reason: 'blocked', history };
|
|
489
531
|
}
|
|
490
532
|
|
|
533
|
+
if (dispatchResult.accept && !hasMinimumTurnResultShape(dispatchResult.turnResult)) {
|
|
534
|
+
// DEC-RUN-LOOP-MIN-SHAPE-SYMMETRY-001: same boundary as parallel branch.
|
|
535
|
+
// Refuse to stage; convert to a standard rejection.
|
|
536
|
+
const validationResult = { stage: 'dispatch', errors: [MIN_SHAPE_REJECTION_REASON] };
|
|
537
|
+
rejectTurn(root, config, validationResult, MIN_SHAPE_REJECTION_REASON);
|
|
538
|
+
history.push({ role: roleId, turn_id: turn.turn_id, accepted: false });
|
|
539
|
+
emit({ type: 'turn_rejected', turn, role: roleId, reason: MIN_SHAPE_REJECTION_REASON });
|
|
540
|
+
const postRejectState = loadState(root, config);
|
|
541
|
+
if (postRejectState?.status === 'blocked') {
|
|
542
|
+
errors.push(`Turn rejected for ${roleId}, retries exhausted`);
|
|
543
|
+
emit({ type: 'blocked', state: postRejectState });
|
|
544
|
+
return { terminal: true, ok: false, stop_reason: 'reject_exhausted', history };
|
|
545
|
+
}
|
|
546
|
+
return { terminal: false, accepted: false, history };
|
|
547
|
+
}
|
|
548
|
+
|
|
491
549
|
if (dispatchResult.accept) {
|
|
492
550
|
const absStaging = join(root, stagingPath);
|
|
493
551
|
mkdirSync(dirname(absStaging), { recursive: true });
|
|
@@ -537,6 +595,13 @@ async function dispatchAndProcess(root, config, turn, assignState, callbacks, em
|
|
|
537
595
|
return { terminal: false, accepted: true, history };
|
|
538
596
|
}
|
|
539
597
|
|
|
598
|
+
if (dispatchResult?.blocked === true) {
|
|
599
|
+
history.push({ role: roleId, turn_id: turn.turn_id, accepted: false, blocked: true });
|
|
600
|
+
const blockedState = loadState(root, config);
|
|
601
|
+
emit({ type: 'blocked', state: blockedState });
|
|
602
|
+
return { terminal: true, ok: false, stop_reason: 'blocked', history };
|
|
603
|
+
}
|
|
604
|
+
|
|
540
605
|
// Rejection
|
|
541
606
|
const validationResult = {
|
|
542
607
|
stage: 'dispatch',
|
package/src/lib/schema.js
CHANGED
|
@@ -35,6 +35,13 @@ export function validateGovernedStateSchema(data) {
|
|
|
35
35
|
// but validators and read-only surfaces still tolerate reserved/manual states.
|
|
36
36
|
const VALID_RUN_STATUSES = ['idle', 'active', 'paused', 'blocked', 'completed', 'failed'];
|
|
37
37
|
const isV1_1 = data?.schema_version === '1.1';
|
|
38
|
+
// NOTE: `current_turn` is the persisted v1.0 schema field. Under v1.1 it is
|
|
39
|
+
// not a persisted field at all — `loadProjectState()` re-attaches it as a
|
|
40
|
+
// non-enumerable getter alias over `active_turns` after normalization
|
|
41
|
+
// (DEC-CURRENT-TURN-COMPAT-ALIAS-001). This validator runs against the
|
|
42
|
+
// persisted shape, so an `own` property named `current_turn` on a v1.1 doc
|
|
43
|
+
// means "stray persisted-shape leak from a legacy write" and is rejected
|
|
44
|
+
// below — it does NOT mean the runtime alias is going away.
|
|
38
45
|
const hasLegacyCurrentTurn = Object.prototype.hasOwnProperty.call(data || {}, 'current_turn');
|
|
39
46
|
|
|
40
47
|
function validateTurn(turn, label) {
|
|
@@ -85,7 +85,21 @@
|
|
|
85
85
|
"type": "object"
|
|
86
86
|
},
|
|
87
87
|
"run_loop": {
|
|
88
|
-
"type": "object"
|
|
88
|
+
"type": "object",
|
|
89
|
+
"description": "Runner control knobs for execution watchdogs and automation behavior.",
|
|
90
|
+
"properties": {
|
|
91
|
+
"startup_watchdog_ms": {
|
|
92
|
+
"type": "integer",
|
|
93
|
+
"minimum": 1,
|
|
94
|
+
"description": "Milliseconds to wait after dispatch for worker attach/first-output proof before retaining the turn as failed_start. Default 30000."
|
|
95
|
+
},
|
|
96
|
+
"stale_turn_threshold_ms": {
|
|
97
|
+
"type": "integer",
|
|
98
|
+
"minimum": 1,
|
|
99
|
+
"description": "Milliseconds to wait before a started turn that previously produced output is treated as stale. Default 600000 for local_cli turns and 300000 for api_proxy turns."
|
|
100
|
+
}
|
|
101
|
+
},
|
|
102
|
+
"additionalProperties": true
|
|
89
103
|
},
|
|
90
104
|
"mission_planner": {
|
|
91
105
|
"type": "object"
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Staged turn-result proof helpers.
|
|
3
|
+
*
|
|
4
|
+
* Per DEC-BUG51-STAGING-PLACEHOLDER-NOT-PROOF-001: a turn-scoped staged-result
|
|
5
|
+
* file is proof of execution only when it contains meaningful result content.
|
|
6
|
+
* Adapter-authored placeholders (`{}`, blank, whitespace-only) are cleanup
|
|
7
|
+
* artifacts — watchdog, adapter, and recovery code must treat them as absent.
|
|
8
|
+
*
|
|
9
|
+
* This module centralizes that check so every surface (local-cli adapter,
|
|
10
|
+
* manual adapter, stale-turn watchdog) uses the same rule.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
14
|
+
|
|
15
|
+
/**
 * Returns true when the staged-result file at `filePath` exists AND contains
 * content that is not a placeholder.
 *
 * A placeholder is an empty file, a whitespace-only file, or an empty JSON
 * object. The empty-object comparison ignores ALL whitespace, not only
 * leading/trailing whitespace, so `{}`, `{}\n`, ` {}\n`, `{ }` and `{\n}` are
 * all rejected. Any other content is treated as meaningful; this is not a
 * schema check.
 *
 * A path that cannot be read as a file (missing, a directory, permission
 * error) is reported as "no meaningful result" rather than throwing.
 *
 * @param {string} filePath - absolute path to the staged-result file
 * @returns {boolean}
 */
export function hasMeaningfulStagedResult(filePath) {
  if (!existsSync(filePath)) {
    return false;
  }

  let raw;
  try {
    raw = readFileSync(filePath, 'utf8');
  } catch {
    // Deliberate best-effort probe: an unreadable file is not proof of output.
    return false;
  }

  // Strip every whitespace run so a pretty-printed empty object (`{ }`,
  // `{\n}`) is recognised as the same placeholder as `{}`.
  const compact = raw.replace(/\s+/g, '');
  return compact !== '' && compact !== '{}';
}
|
|
@@ -3,18 +3,18 @@
|
|
|
3
3
|
*
|
|
4
4
|
* Two-tier lazy idle-threshold detection:
|
|
5
5
|
*
|
|
6
|
-
* 1. **Fast startup watchdog (BUG-51):** if an active turn has been
|
|
7
|
-
* for >30 seconds with NO
|
|
8
|
-
*
|
|
9
|
-
*
|
|
6
|
+
* 1. **Fast startup watchdog (BUG-51):** if an active turn has been
|
|
7
|
+
* `dispatched`/`starting`/`running` for >30 seconds with NO startup proof
|
|
8
|
+
* (no first-byte output recorded on the turn or in dispatch-progress) and
|
|
9
|
+
* NO staged result, it is a "ghost turn" — the subprocess never reached a
|
|
10
|
+
* healthy running state. Transitions to `failed_start` immediately.
|
|
10
11
|
*
|
|
11
|
-
* Design note: the watchdog intentionally keys on
|
|
12
|
-
* dispatch-progress rather than `stdout.log`
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
16
|
-
*
|
|
17
|
-
* coupling the watchdog to adapter-specific log-attachment details.
|
|
12
|
+
* Design note: the watchdog intentionally keys on first-output proof from
|
|
13
|
+
* the framework-owned dispatch-progress contract rather than `stdout.log`
|
|
14
|
+
* existence. `stdout.log` is adapter-authored visibility output and may be
|
|
15
|
+
* absent even when the adapter is wired correctly. First-output timestamps
|
|
16
|
+
* and output-line counters are the stable health contract across runtime
|
|
17
|
+
* wiring.
|
|
18
18
|
*
|
|
19
19
|
* 2. **Stale turn watchdog (BUG-47):** if an active turn has status "running"
|
|
20
20
|
* for >N minutes with no event log activity AND no staged result file,
|
|
@@ -36,6 +36,7 @@ import { safeWriteJson } from './safe-write.js';
|
|
|
36
36
|
import { emitRunEvent, readRunEvents } from './run-events.js';
|
|
37
37
|
import { getTurnStagingResultPath } from './turn-paths.js';
|
|
38
38
|
import { getDispatchProgressRelativePath } from './dispatch-progress.js';
|
|
39
|
+
import { hasMeaningfulStagedResult } from './staged-result-proof.js';
|
|
39
40
|
|
|
40
41
|
const DEFAULT_LOCAL_CLI_THRESHOLD_MS = 10 * 60 * 1000; // 10 minutes
|
|
41
42
|
const DEFAULT_API_PROXY_THRESHOLD_MS = 5 * 60 * 1000; // 5 minutes
|
|
@@ -103,12 +104,11 @@ export function detectStaleTurns(root, state, config) {
|
|
|
103
104
|
/**
|
|
104
105
|
* BUG-51: Detect ghost-dispatched turns — subprocess never started.
|
|
105
106
|
*
|
|
106
|
-
* A ghost turn is one that has been in
|
|
107
|
-
* longer than the startup watchdog threshold (default 30s) AND has:
|
|
108
|
-
* - no
|
|
109
|
-
*
|
|
107
|
+
* A ghost turn is one that has been in `dispatched`, `starting`, `running`, or
|
|
108
|
+
* `retrying` longer than the startup watchdog threshold (default 30s) AND has:
|
|
109
|
+
* - no startup proof (no `first_output_at` on the turn or dispatch-progress,
|
|
110
|
+
* and no recorded output line counts)
|
|
110
111
|
* - no staged result file
|
|
111
|
-
* - no recent turn-scoped events (beyond the initial turn_dispatched)
|
|
112
112
|
*
|
|
113
113
|
* This is a stricter, faster check than detectStaleTurns (BUG-47).
|
|
114
114
|
* Ghost turns transition to "failed_start" rather than "stalled".
|
|
@@ -125,31 +125,22 @@ export function detectGhostTurns(root, state, config) {
|
|
|
125
125
|
const startupThreshold = resolveStartupThreshold(config);
|
|
126
126
|
|
|
127
127
|
for (const [turnId, turn] of Object.entries(activeTurns)) {
|
|
128
|
-
if (
|
|
129
|
-
if (!turn.started_at) continue;
|
|
128
|
+
if (!['dispatched', 'starting', 'running', 'retrying'].includes(turn.status)) continue;
|
|
130
129
|
|
|
131
|
-
const
|
|
132
|
-
if (
|
|
130
|
+
const lifecycleStart = parseGhostLifecycleStart(turn);
|
|
131
|
+
if (!Number.isFinite(lifecycleStart)) continue;
|
|
133
132
|
|
|
134
|
-
const runningMs = now -
|
|
133
|
+
const runningMs = now - lifecycleStart;
|
|
135
134
|
if (runningMs < startupThreshold) continue;
|
|
136
135
|
|
|
137
|
-
// Ghost detection: NO dispatch-progress file means subprocess never attached
|
|
138
136
|
const progressPath = join(root, getDispatchProgressRelativePath(turnId));
|
|
139
|
-
const
|
|
137
|
+
const progress = readDispatchProgressSafe(progressPath);
|
|
140
138
|
|
|
141
|
-
// If dispatch-progress exists, subprocess started — this is NOT a ghost turn.
|
|
142
|
-
// The regular stale-turn watchdog (BUG-47) will handle it if it goes silent.
|
|
143
|
-
if (hasProgress) continue;
|
|
144
|
-
|
|
145
|
-
// Also check for staged result (unlikely without progress, but be safe)
|
|
146
139
|
if (hasTurnScopedStagedResult(root, turnId)) continue;
|
|
147
|
-
|
|
148
|
-
// Check for any turn-scoped events beyond the initial dispatch event
|
|
149
|
-
if (hasRecentTurnEventActivity(root, turnId, startedAt, startupThreshold, now)) continue;
|
|
140
|
+
if (hasStartupProof(turn, progress)) continue;
|
|
150
141
|
|
|
151
142
|
const runningSeconds = Math.floor(runningMs / 1000);
|
|
152
|
-
const failureType =
|
|
143
|
+
const failureType = classifyStartupFailureType(turn, progress);
|
|
153
144
|
ghosts.push({
|
|
154
145
|
turn_id: turnId,
|
|
155
146
|
role: turn.assigned_role || 'unknown',
|
|
@@ -200,37 +191,11 @@ export function reconcileStaleTurns(root, state, config) {
|
|
|
200
191
|
|
|
201
192
|
// Process ghost turns (BUG-51) — transition to failed_start
|
|
202
193
|
for (const entry of ghosts) {
|
|
203
|
-
const
|
|
204
|
-
if (
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
status: 'failed_start',
|
|
209
|
-
failed_start_at: nowIso,
|
|
210
|
-
failed_start_reason: entry.failure_type,
|
|
211
|
-
failed_start_previous_status: turn.status,
|
|
212
|
-
failed_start_threshold_ms: entry.threshold_ms,
|
|
213
|
-
failed_start_running_ms: entry.running_ms,
|
|
214
|
-
recovery_command: `agentxchain reissue-turn --turn ${entry.turn_id} --reason ghost`,
|
|
215
|
-
};
|
|
216
|
-
changed = true;
|
|
217
|
-
|
|
218
|
-
// BUG-51 fix #6: Release budget reservation for ghost turns
|
|
219
|
-
delete budgetReservations[entry.turn_id];
|
|
220
|
-
|
|
221
|
-
emitRunEvent(root, 'turn_start_failed', {
|
|
222
|
-
run_id: state?.run_id || null,
|
|
223
|
-
phase: state?.phase || null,
|
|
224
|
-
status: 'blocked',
|
|
225
|
-
turn: { turn_id: entry.turn_id, role_id: entry.role },
|
|
226
|
-
payload: {
|
|
227
|
-
running_ms: entry.running_ms,
|
|
228
|
-
threshold_ms: entry.threshold_ms,
|
|
229
|
-
runtime_id: entry.runtime_id,
|
|
230
|
-
failure_type: entry.failure_type,
|
|
231
|
-
recommendation: entry.recommendation,
|
|
232
|
-
},
|
|
233
|
-
});
|
|
194
|
+
const applied = applyStartupFailureToActiveTurn(activeTurns, budgetReservations, entry, nowIso);
|
|
195
|
+
if (applied) {
|
|
196
|
+
emitStartupFailureEvent(root, state, entry);
|
|
197
|
+
changed = true;
|
|
198
|
+
}
|
|
234
199
|
}
|
|
235
200
|
|
|
236
201
|
// Process stale turns (BUG-47) — transition to stalled
|
|
@@ -271,32 +236,9 @@ export function reconcileStaleTurns(root, state, config) {
|
|
|
271
236
|
return { stale_turns: stale, ghost_turns: ghosts, state, changed: false };
|
|
272
237
|
}
|
|
273
238
|
|
|
274
|
-
const
|
|
275
|
-
const primary =
|
|
239
|
+
const nextState = buildBlockedStateFromEntries(state, activeTurns, budgetReservations, ghosts, stale, nowIso);
|
|
240
|
+
const primary = [...ghosts, ...stale][0];
|
|
276
241
|
const category = ghosts.length > 0 ? 'ghost_turn' : 'stale_turn';
|
|
277
|
-
const blockedOn = allDetected.length === 1
|
|
278
|
-
? `turn:${primary.failure_type ? 'failed_start' : 'stalled'}:${primary.turn_id}`
|
|
279
|
-
: ghosts.length > 0 ? 'turns:failed_start' : 'turns:stalled';
|
|
280
|
-
|
|
281
|
-
const nextState = {
|
|
282
|
-
...state,
|
|
283
|
-
status: 'blocked',
|
|
284
|
-
active_turns: activeTurns,
|
|
285
|
-
budget_reservations: budgetReservations,
|
|
286
|
-
blocked_on: blockedOn,
|
|
287
|
-
blocked_reason: {
|
|
288
|
-
category,
|
|
289
|
-
blocked_at: nowIso,
|
|
290
|
-
turn_id: primary.turn_id,
|
|
291
|
-
recovery: {
|
|
292
|
-
typed_reason: category,
|
|
293
|
-
owner: 'human',
|
|
294
|
-
recovery_action: primary.recommendation,
|
|
295
|
-
turn_retained: true,
|
|
296
|
-
detail: primary.recommendation,
|
|
297
|
-
},
|
|
298
|
-
},
|
|
299
|
-
};
|
|
300
242
|
|
|
301
243
|
safeWriteJson(join(root, '.agentxchain', 'state.json'), nextState);
|
|
302
244
|
emitRunEvent(root, 'run_blocked', {
|
|
@@ -340,13 +282,63 @@ function resolveStartupThreshold(config) {
|
|
|
340
282
|
return DEFAULT_STARTUP_WATCHDOG_MS;
|
|
341
283
|
}
|
|
342
284
|
|
|
285
|
+
/**
 * Marks one active turn as `failed_start`, persists the resulting blocked run
 * state, and emits the matching run events.
 *
 * Steps, in order: validate `state` and locate the turn; build a failure entry
 * (caller-supplied `details` win over computed values); apply the transition
 * to copies of `active_turns` / `budget_reservations`; write the blocked state
 * to `.agentxchain/state.json`; emit the startup-failure event(s) and then a
 * `run_blocked` event.
 *
 * @param {string} root - project root containing `.agentxchain/`
 * @param {object} state - current governed state
 * @param {object} config - config used to resolve the startup threshold
 * @param {string} turnId - id of the active turn to fail
 * @param {{ running_ms?: number, threshold_ms?: number, failure_type?: string,
 *   recommendation?: string }} [details] - optional overrides for the entry
 * @returns {{ ok: false, error: string } | { ok: true, state: object, turn: object | null }}
 */
export function failTurnStartup(root, state, config, turnId, details = {}) {
  const stateIsObject = Boolean(state) && typeof state === 'object';
  if (!stateIsObject) {
    return { ok: false, error: 'No governed state found' };
  }

  const targetTurn = state.active_turns?.[turnId];
  if (!targetTurn) {
    return { ok: false, error: `Turn ${turnId} not found in active turns` };
  }

  const failedAtIso = new Date().toISOString();
  // Work on shallow copies so the caller's state object is not mutated.
  const turnsCopy = { ...(state.active_turns || {}) };
  const reservationsCopy = { ...(state.budget_reservations || {}) };

  const failure = {
    turn_id: turnId,
    role: targetTurn.assigned_role || 'unknown',
    runtime_id: targetTurn.runtime_id || 'unknown',
    running_ms: details.running_ms ?? computeLifecycleAgeMs(targetTurn),
    threshold_ms: details.threshold_ms ?? resolveStartupThreshold(config),
    failure_type: classifyStartupFailureType(
      targetTurn,
      null,
      details.failure_type || 'no_subprocess_output',
    ),
    recommendation: details.recommendation
      || `Turn ${turnId} failed to start cleanly. Run \`agentxchain reissue-turn --turn ${turnId} --reason ghost\` to recover.`,
  };

  const transitioned = applyStartupFailureToActiveTurn(
    turnsCopy,
    reservationsCopy,
    failure,
    failedAtIso,
  );
  if (!transitioned) {
    return { ok: false, error: `Turn ${turnId} is not eligible for startup failure transition` };
  }

  const blockedState = buildBlockedStateFromEntries(
    state,
    turnsCopy,
    reservationsCopy,
    [failure],
    [],
    failedAtIso,
  );
  const statePath = join(root, '.agentxchain', 'state.json');
  safeWriteJson(statePath, blockedState);

  // The state file is written first, then the events are emitted.
  emitStartupFailureEvent(root, state, failure);
  emitRunEvent(root, 'run_blocked', {
    run_id: blockedState.run_id || null,
    phase: blockedState.phase || null,
    status: 'blocked',
    turn: { turn_id: failure.turn_id, role_id: failure.role },
    payload: {
      category: 'ghost_turn',
      ghost_turn_ids: [failure.turn_id],
      stalled_turn_ids: [],
    },
  });

  return {
    ok: true,
    state: blockedState,
    turn: blockedState.active_turns?.[turnId] || null,
  };
}
|
|
329
|
+
|
|
343
330
|
function hasRecentTurnEventActivity(root, turnId, startedAt, threshold, now) {
|
|
344
331
|
try {
|
|
345
332
|
const events = readRunEvents(root, { limit: 200 });
|
|
346
333
|
for (let i = events.length - 1; i >= 0; i--) {
|
|
347
334
|
const event = events[i];
|
|
348
335
|
if (event?.turn?.turn_id !== turnId) continue;
|
|
349
|
-
if (
|
|
336
|
+
if (
|
|
337
|
+
event.event_type === 'turn_stalled'
|
|
338
|
+
|| event.event_type === 'turn_start_failed'
|
|
339
|
+
|| event.event_type === 'runtime_spawn_failed'
|
|
340
|
+
|| event.event_type === 'stdout_attach_failed'
|
|
341
|
+
) continue;
|
|
350
342
|
const timestamp = Date.parse(event.timestamp || '');
|
|
351
343
|
if (!Number.isFinite(timestamp)) continue;
|
|
352
344
|
if (timestamp < startedAt) continue;
|
|
@@ -360,9 +352,145 @@ function hasRecentTurnEventActivity(root, turnId, startedAt, threshold, now) {
|
|
|
360
352
|
return false;
|
|
361
353
|
}
|
|
362
354
|
|
|
355
|
+
/**
 * Transitions one active turn to `failed_start` in place and releases its
 * budget reservation.
 *
 * Mutates both `activeTurns` and `budgetReservations`. Only turns currently in
 * `dispatched`, `starting`, `running` or `retrying` are eligible; anything
 * else is left untouched.
 *
 * @param {Record<string, object>} activeTurns - turn map, mutated in place
 * @param {Record<string, unknown>} budgetReservations - reservation map, mutated in place
 * @param {{ turn_id: string, failure_type?: string, threshold_ms?: number,
 *   running_ms?: number }} entry - detected startup failure
 * @param {string} nowIso - timestamp recorded as `failed_start_at`
 * @returns {boolean} true when the transition was applied
 */
function applyStartupFailureToActiveTurn(activeTurns, budgetReservations, entry, nowIso) {
  const turnId = entry.turn_id;
  const current = activeTurns[turnId];
  if (!current) {
    return false;
  }

  const eligibleStatuses = ['dispatched', 'starting', 'running', 'retrying'];
  if (!eligibleStatuses.includes(current.status)) {
    return false;
  }

  const failureFields = {
    status: 'failed_start',
    failed_start_at: nowIso,
    failed_start_reason: entry.failure_type,
    failed_start_previous_status: current.status,
    failed_start_threshold_ms: entry.threshold_ms,
    failed_start_running_ms: entry.running_ms,
    recovery_command: `agentxchain reissue-turn --turn ${turnId} --reason ghost`,
  };
  activeTurns[turnId] = { ...current, ...failureFields };

  // The turn did not start, so its budget reservation is dropped.
  delete budgetReservations[turnId];
  return true;
}
|
|
374
|
+
|
|
375
|
+
/**
 * Emits the run events describing a turn startup failure.
 *
 * Always emits `turn_start_failed`. When the entry's failure type maps to a
 * specific event type, a second event of that type is emitted with the same
 * details object.
 *
 * @param {string} root - project root passed through to `emitRunEvent`
 * @param {object | null | undefined} state - state supplying `run_id` and `phase`
 * @param {{ turn_id: string, role: string, running_ms: number, threshold_ms: number,
 *   runtime_id: string, failure_type: string, recommendation: string }} entry
 * @returns {void}
 */
function emitStartupFailureEvent(root, state, entry) {
  const {
    running_ms,
    threshold_ms,
    runtime_id,
    failure_type,
    recommendation,
  } = entry;

  const eventDetails = {
    run_id: state?.run_id || null,
    phase: state?.phase || null,
    status: 'blocked',
    turn: { turn_id: entry.turn_id, role_id: entry.role },
    payload: { running_ms, threshold_ms, runtime_id, failure_type, recommendation },
  };

  emitRunEvent(root, 'turn_start_failed', eventDetails);

  const specificEventType = mapStartupFailureEventType(failure_type);
  if (!specificEventType) {
    return;
  }
  emitRunEvent(root, specificEventType, eventDetails);
}
|
|
396
|
+
|
|
397
|
+
/**
 * Builds the `blocked` run state for a set of detected ghost and stale turns.
 *
 * The first detected entry (ghosts before stale) is the "primary" one: its id
 * and recommendation populate `blocked_reason`. `blocked_on` names the single
 * turn when exactly one entry was detected, otherwise a plural marker. Callers
 * must pass at least one entry; the primary entry is dereferenced
 * unconditionally.
 *
 * @param {object} state - current state; its fields are carried over
 * @param {Record<string, object>} activeTurns - updated active-turn map
 * @param {Record<string, unknown>} budgetReservations - updated reservations
 * @param {Array<object>} ghosts - startup-failure entries
 * @param {Array<object>} stale - stalled-turn entries
 * @param {string} nowIso - timestamp recorded as `blocked_at`
 * @returns {object} new state object with `status: 'blocked'`
 */
function buildBlockedStateFromEntries(state, activeTurns, budgetReservations, ghosts, stale, nowIso) {
  const detected = [...ghosts, ...stale];
  const primary = detected[0];
  const hasGhosts = ghosts.length > 0;
  const category = hasGhosts ? 'ghost_turn' : 'stale_turn';

  let blockedOn;
  if (detected.length === 1) {
    const kind = primary.failure_type ? 'failed_start' : 'stalled';
    blockedOn = `turn:${kind}:${primary.turn_id}`;
  } else if (hasGhosts) {
    blockedOn = 'turns:failed_start';
  } else {
    blockedOn = 'turns:stalled';
  }

  const blockedReason = {
    category,
    blocked_at: nowIso,
    turn_id: primary.turn_id,
    recovery: {
      typed_reason: category,
      owner: 'human',
      recovery_action: primary.recommendation,
      turn_retained: true,
      detail: primary.recommendation,
    },
  };

  return {
    ...state,
    status: 'blocked',
    active_turns: activeTurns,
    budget_reservations: budgetReservations,
    blocked_on: blockedOn,
    blocked_reason: blockedReason,
  };
}
|
|
425
|
+
|
|
426
|
+
/**
 * Resolves the timestamp from which a turn's startup age is measured.
 *
 * A `dispatched` turn is measured from `dispatched_at`, falling back to
 * `assigned_at`. Any other status prefers `started_at` and then uses the same
 * fallbacks.
 *
 * @param {{ status?: string, started_at?: string, dispatched_at?: string,
 *   assigned_at?: string }} turn - active turn record
 * @returns {number} epoch milliseconds, or NaN when no usable timestamp exists
 */
function parseGhostLifecycleStart(turn) {
  const preStartStamp = turn.dispatched_at || turn.assigned_at || '';
  const chosenStamp = turn.status === 'dispatched'
    ? preStartStamp
    : (turn.started_at || preStartStamp);
  return Date.parse(chosenStamp);
}
|
|
432
|
+
|
|
433
|
+
/**
 * Computes how long a turn has been in its current startup lifecycle.
 *
 * @param {object} turn - active turn record passed to `parseGhostLifecycleStart`
 * @returns {number} elapsed milliseconds, clamped to be non-negative; 0 when
 *   the turn has no parseable lifecycle timestamp
 */
function computeLifecycleAgeMs(turn) {
  const startedMs = parseGhostLifecycleStart(turn);
  return Number.isFinite(startedMs)
    ? Math.max(0, Date.now() - startedMs)
    : 0;
}
|
|
438
|
+
|
|
439
|
+
/**
 * Reads and parses a dispatch-progress JSON file without throwing.
 *
 * @param {string} progressPath - path to the dispatch-progress file
 * @returns {unknown | null} the parsed JSON value, or null when the file is
 *   missing, unreadable, or not valid JSON
 */
function readDispatchProgressSafe(progressPath) {
  let parsed = null;
  if (existsSync(progressPath)) {
    try {
      const text = readFileSync(progressPath, 'utf8');
      parsed = JSON.parse(text);
    } catch {
      // Best-effort read: a broken progress file is treated as absent.
      parsed = null;
    }
  }
  return parsed;
}
|
|
449
|
+
|
|
450
|
+
/**
 * Picks the failure-type label for a turn that failed to start.
 *
 * Precedence:
 *   1. An explicit `fallback` of `runtime_spawn_failed` or
 *      `stdout_attach_failed` is returned as-is.
 *   2. A turn still in `dispatched` is `runtime_spawn_failed`.
 *   3. A turn in `starting`, or one with worker-attach evidence
 *      (`worker_attached_at`, `worker_pid`, or a progress `pid`), is
 *      `stdout_attach_failed`.
 *   4. Otherwise `fallback` is returned.
 *
 * @param {object | null | undefined} turn - active turn record
 * @param {object | null | undefined} progress - parsed dispatch-progress, if any
 * @param {string} [fallback='no_subprocess_output'] - label used when nothing more specific applies
 * @returns {string}
 */
function classifyStartupFailureType(turn, progress, fallback = 'no_subprocess_output') {
  const callerWasSpecific = fallback === 'runtime_spawn_failed'
    || fallback === 'stdout_attach_failed';
  if (callerWasSpecific) {
    return fallback;
  }

  const status = turn?.status;
  if (status === 'dispatched') {
    return 'runtime_spawn_failed';
  }
  if (status === 'starting') {
    return 'stdout_attach_failed';
  }

  // `!= null` on purpose: a pid of 0 still counts as attach evidence.
  const workerAttached = Boolean(turn?.worker_attached_at)
    || turn?.worker_pid != null
    || progress?.pid != null;
  return workerAttached ? 'stdout_attach_failed' : fallback;
}
|
|
467
|
+
|
|
468
|
+
/**
 * Maps a startup failure type to the dedicated run-event type emitted for it.
 *
 * @param {string} failureType - failure label from the startup-failure entry
 * @returns {'runtime_spawn_failed' | 'stdout_attach_failed' | null} the event
 *   type, or null when the failure type has no dedicated event
 */
function mapStartupFailureEventType(failureType) {
  switch (failureType) {
    case 'runtime_spawn_failed':
    case 'stdout_attach_failed':
      return failureType;
    default:
      return null;
  }
}
|
|
477
|
+
|
|
478
|
+
/**
 * Reports whether there is evidence that a turn produced output after dispatch.
 *
 * Evidence is any of: `first_output_at` on the turn, `first_output_at` on the
 * dispatch-progress record, or a positive `output_lines` / `stderr_lines`
 * counter on the dispatch-progress record.
 *
 * @param {{ first_output_at?: string }} turn - active turn record
 * @param {unknown} progress - parsed dispatch-progress; non-objects count as no evidence
 * @returns {boolean}
 */
function hasStartupProof(turn, progress) {
  if (turn.first_output_at) {
    return true;
  }

  const progressUsable = Boolean(progress) && typeof progress === 'object';
  if (!progressUsable) {
    return false;
  }
  if (progress.first_output_at) {
    return true;
  }

  const stdoutLines = Number(progress.output_lines || 0);
  const stderrLines = Number(progress.stderr_lines || 0);
  return stdoutLines > 0 || stderrLines > 0;
}
|
|
490
|
+
|
|
363
491
|
function hasTurnScopedStagedResult(root, turnId) {
|
|
364
492
|
const turnScopedPath = join(root, getTurnStagingResultPath(turnId));
|
|
365
|
-
if (
|
|
493
|
+
if (hasMeaningfulStagedResult(turnScopedPath)) {
|
|
366
494
|
return true;
|
|
367
495
|
}
|
|
368
496
|
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Lightweight staged turn-result shape guard.
|
|
3
|
+
*
|
|
4
|
+
* This is intentionally weaker than full acceptance validation. It exists for
|
|
5
|
+
* adapter pre-stage checks so obviously incomplete payloads (`{}`,
|
|
6
|
+
* `{"turn_id":"t1"}`, etc.) are rejected before they can be written into the
|
|
7
|
+
* governed staging path and mistaken for meaningful execution output.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/**
 * True only for strings that still contain characters after trimming.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isNonEmptyString(value) {
  if (typeof value !== 'string') {
    return false;
  }
  return value.trim() !== '';
}

/**
 * Checks that `value` carries the minimum governed turn-result envelope.
 *
 * Required, each as a non-blank string:
 *   - `schema_version`
 *   - one identity field: `run_id` or `turn_id`
 *   - one lifecycle field: `status`, `role`, or `runtime_id`
 *
 * This is a cheap pre-stage guard only; it does not validate the full schema.
 *
 * @param {unknown} value - candidate turn result
 * @returns {boolean} false for non-objects, arrays, and incomplete envelopes
 */
export function hasMinimumTurnResultShape(value) {
  const isObjectLike = Boolean(value) && typeof value === 'object' && !Array.isArray(value);
  if (!isObjectLike) {
    return false;
  }

  const record = /** @type {Record<string, unknown>} */ (value);
  const anyFieldPresent = (fieldNames) => fieldNames.some((name) => isNonEmptyString(record[name]));

  if (!isNonEmptyString(record.schema_version)) {
    return false;
  }
  if (!anyFieldPresent(['run_id', 'turn_id'])) {
    return false;
  }
  return anyFieldPresent(['status', 'role', 'runtime_id']);
}
|
|
@@ -75,7 +75,10 @@ export function validateStagedTurnResult(root, state, config, opts = {}) {
|
|
|
75
75
|
const normContext = {};
|
|
76
76
|
if (state) {
|
|
77
77
|
normContext.phase = state.phase;
|
|
78
|
-
//
|
|
78
|
+
// Prefer active_turns (the persisted schema field); fall back to the
|
|
79
|
+
// current_turn compatibility alias for callers that pass a state shape
|
|
80
|
+
// built outside loadProjectState() (e.g. raw fixtures). Both surfaces are
|
|
81
|
+
// live per DEC-CURRENT-TURN-COMPAT-ALIAS-001 — current_turn is not legacy.
|
|
79
82
|
const activeTurn = getActiveTurn(state) || state.current_turn;
|
|
80
83
|
if (activeTurn) {
|
|
81
84
|
const roleKey = activeTurn.assigned_role || activeTurn.role;
|