agentxchain 2.152.0 → 2.153.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -753,6 +753,10 @@ program
   .option('--triage-approval <mode>', 'Triage policy for vision-derived intents: auto or human (default: config or auto)')
   .option('--max-idle-cycles <n>', 'Stop after N consecutive idle cycles with no derivable work (default: 3)', parseInt)
   .option('--session-budget <usd>', 'Cumulative session-level budget cap in USD for continuous mode', parseFloat)
+  .option('--auto-retry-on-ghost', 'Enable bounded automatic retry for continuous-mode startup ghost turns')
+  .option('--no-auto-retry-on-ghost', 'Disable bounded automatic retry for continuous-mode startup ghost turns')
+  .option('--auto-retry-on-ghost-max-retries <n>', 'Maximum startup ghost retries per continuous run (default: config or 3)', parseInt)
+  .option('--auto-retry-on-ghost-cooldown-seconds <n>', 'Seconds to wait between startup ghost retries (default: config or 5)', parseInt)
   .option('--auto-checkpoint', 'Auto-commit accepted writable turns after acceptance')
   .option('--no-auto-checkpoint', 'Disable automatic checkpointing after accepted writable turns')
   .action(runCommand);
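
The paired enable/disable flags give the option three states. A minimal sketch (not part of the diff), assuming the CLI is Commander-based as the `program.option(...).action(...)` chain suggests: when both `--auto-retry-on-ghost` and `--no-auto-retry-on-ghost` are registered, the parsed value stays undefined until one flag is passed, which is what lets the resolver further down treat "flag absent" as "defer to config".

import { Command } from 'commander';

const program = new Command();
program
  .option('--auto-retry-on-ghost', 'enable ghost auto-retry')
  .option('--no-auto-retry-on-ghost', 'disable ghost auto-retry')
  .option('--auto-retry-on-ghost-max-retries <n>', 'max retries', parseInt);

// Neither boolean flag passed: autoRetryOnGhost stays undefined (tri-state).
program.parse(['--auto-retry-on-ghost-max-retries', '5'], { from: 'user' });
console.log(program.opts()); // { autoRetryOnGhostMaxRetries: 5 }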
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "agentxchain",
-  "version": "2.152.0",
+  "version": "2.153.0",
   "description": "CLI for AgentXchain — governed multi-agent software delivery",
   "type": "module",
   "bin": {
@@ -25,6 +25,14 @@ import {
 import { loadProjectState } from './config.js';
 import { safeWriteJson } from './safe-write.js';
 import { emitRunEvent } from './run-events.js';
+import { reissueTurn } from './governed-state.js';
+import {
+  applyGhostRetryAttempt,
+  applyGhostRetryExhaustion,
+  buildGhostRetryDiagnosticBundle,
+  buildGhostRetryExhaustionMirror,
+  classifyGhostRetryDecision,
+} from './ghost-retry.js';
 import {
   archiveStaleIntentsForRun,
   formatLegacyIntentMigrationNotice,
@@ -127,6 +135,178 @@ function getBlockedCategory(state) {
   return state?.blocked_reason?.category || null;
 }
 
+function writeGovernedState(root, state) {
+  safeWriteJson(join(root, '.agentxchain', 'state.json'), state);
+}
+
+function clearGhostBlockerAfterReissue(root, state) {
+  const nextState = {
+    ...state,
+    status: 'active',
+    blocked_on: null,
+    blocked_reason: null,
+    escalation: null,
+  };
+  writeGovernedState(root, nextState);
+  return nextState;
+}
+
+async function maybeAutoRetryGhostBlocker(context, session, contOpts, blockedState, log = console.log) {
+  const { root, config } = context;
+  const decision = classifyGhostRetryDecision({
+    state: blockedState,
+    session,
+    autoRetryOnGhost: contOpts.autoRetryOnGhost,
+    runId: session.current_run_id || blockedState?.run_id || null,
+  });
+
+  if (decision.decision === 'retry') {
+    const oldTurnId = decision.ghost.turn_id;
+    const oldTurn = blockedState?.active_turns?.[oldTurnId] || {};
+    const reissued = reissueTurn(root, config, {
+      turnId: oldTurnId,
+      reason: 'auto_retry_ghost',
+    });
+    if (!reissued.ok) {
+      log(`Ghost auto-retry skipped: ${reissued.error}`);
+      return null;
+    }
+
+    const runId = session.current_run_id || blockedState?.run_id || reissued.state?.run_id || null;
+    const attempt = decision.attempts + 1;
+    const nowIso = new Date().toISOString();
+    const nextState = clearGhostBlockerAfterReissue(root, reissued.state);
+    // Slice 2c: pass runtime/role/timing fields so the fingerprint log can
+    // drive same-signature early-stop detection on subsequent invocations.
+    const oldRuntimeId = oldTurn.runtime_id || reissued.newTurn.runtime_id || null;
+    const oldRoleId = oldTurn.assigned_role || reissued.newTurn.assigned_role || null;
+    const oldRunningMs = oldTurn.failed_start_running_ms ?? null;
+    const oldThresholdMs = oldTurn.failed_start_threshold_ms ?? null;
+    const nextSession = applyGhostRetryAttempt(session, {
+      runId,
+      oldTurnId,
+      newTurnId: reissued.newTurn.turn_id,
+      failureType: decision.ghost.failure_type,
+      maxRetries: decision.maxRetries,
+      nowIso,
+      runtimeId: oldRuntimeId,
+      roleId: oldRoleId,
+      runningMs: oldRunningMs,
+      thresholdMs: oldThresholdMs,
+    });
+    Object.assign(session, nextSession, {
+      status: 'running',
+      current_run_id: runId,
+    });
+    writeContinuousSession(root, session);
+
+    emitRunEvent(root, 'auto_retried_ghost', {
+      run_id: runId,
+      phase: nextState.phase || blockedState?.phase || null,
+      status: 'active',
+      turn: { turn_id: reissued.newTurn.turn_id, role_id: reissued.newTurn.assigned_role },
+      intent_id: oldTurn.intake_context?.intent_id || null,
+      payload: {
+        old_turn_id: oldTurnId,
+        new_turn_id: reissued.newTurn.turn_id,
+        failure_type: decision.ghost.failure_type,
+        attempt,
+        max_retries_per_run: decision.maxRetries,
+        runtime_id: oldTurn.runtime_id || reissued.newTurn.runtime_id || null,
+        running_ms: oldTurn.failed_start_running_ms ?? null,
+        threshold_ms: oldTurn.failed_start_threshold_ms ?? null,
+      },
+    });
+
+    log(`Ghost turn auto-retried (${attempt}/${decision.maxRetries}): ${oldTurnId} -> ${reissued.newTurn.turn_id}`);
+    if ((contOpts.autoRetryOnGhost?.cooldownSeconds ?? 0) > 0) {
+      await new Promise((resolve) => setTimeout(resolve, contOpts.autoRetryOnGhost.cooldownSeconds * 1000));
+    }
+    return {
+      ok: true,
+      status: 'running',
+      action: 'auto_retried_ghost',
+      run_id: runId,
+      old_turn_id: oldTurnId,
+      new_turn_id: reissued.newTurn.turn_id,
+      attempt,
+      max_retries_per_run: decision.maxRetries,
+    };
+  }
+
+  if (decision.decision === 'exhausted') {
+    const runId = session.current_run_id || blockedState?.run_id || null;
+    const oldTurnId = decision.ghost.turn_id;
+    const oldTurn = blockedState?.active_turns?.[oldTurnId] || {};
+    const manualDetail = blockedState?.blocked_reason?.recovery?.detail
+      || blockedState?.blocked_reason?.recovery?.recovery_action
+      || null;
+    // Slice 2c: build the per-attempt diagnostic bundle from the session's
+    // recorded attempts_log. This is the payload the operator needs to
+    // decide their next move (bump retries, change runtime, raise watchdog,
+    // or file a new bug). Also pass signatureRepeat into the mirror so the
+    // status surface distinguishes raw exhaustion from pattern-based early
+    // stop.
+    const diagnosticBundle = buildGhostRetryDiagnosticBundle(session);
+    const signatureRepeat = decision.signatureRepeat || null;
+    const detail = buildGhostRetryExhaustionMirror({
+      attempts: decision.attempts,
+      maxRetries: decision.maxRetries,
+      failureType: decision.ghost.failure_type,
+      manualRecoveryDetail: manualDetail,
+      signatureRepeat,
+    });
+    const nextState = {
+      ...blockedState,
+      blocked_reason: {
+        ...(blockedState.blocked_reason || {}),
+        recovery: {
+          ...(blockedState.blocked_reason?.recovery || {}),
+          detail,
+        },
+      },
+    };
+    writeGovernedState(root, nextState);
+    const nextSession = applyGhostRetryExhaustion(session, {
+      runId,
+      failureType: decision.ghost.failure_type,
+      turnId: oldTurnId,
+      maxRetries: decision.maxRetries,
+      nowIso: new Date().toISOString(),
+    });
+    Object.assign(session, nextSession, { status: 'paused' });
+    writeContinuousSession(root, session);
+
+    emitRunEvent(root, 'ghost_retry_exhausted', {
+      run_id: runId,
+      phase: blockedState?.phase || null,
+      status: 'blocked',
+      turn: { turn_id: oldTurnId, role_id: oldTurn.assigned_role || null },
+      intent_id: oldTurn.intake_context?.intent_id || null,
+      payload: {
+        turn_id: oldTurnId,
+        attempts: decision.attempts,
+        max_retries_per_run: decision.maxRetries,
+        failure_type: decision.ghost.failure_type,
+        runtime_id: oldTurn.runtime_id || null,
+        exhaustion_reason: signatureRepeat ? 'same_signature_repeat' : 'retry_budget_exhausted',
+        signature_repeat: signatureRepeat,
+        diagnostic_bundle: diagnosticBundle,
+        diagnostic_refs: {
+          recovery_action: blockedState?.blocked_reason?.recovery?.recovery_action || null,
+        },
+      },
+    });
+    const tag = signatureRepeat
+      ? `same_signature_repeat [${signatureRepeat.signature}] after ${signatureRepeat.consecutive} attempts`
+      : `${decision.attempts}/${decision.maxRetries}`;
+    log(`Ghost auto-retry exhausted (${tag}) for ${oldTurnId}.`);
+    return null;
+  }
+
+  return null;
+}
+
 // ---------------------------------------------------------------------------
 // Intake queue check
 // ---------------------------------------------------------------------------
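
For reference, a small sketch of the decision record `maybeAutoRetryGhostBlocker()` branches on, using the pure classifier shipped in the new ghost-retry.js below (the state and session literals are hypothetical fixtures, not real project state):

import { classifyGhostRetryDecision } from './ghost-retry.js';

// Hypothetical blocked governed state: one typed BUG-51 startup ghost.
const blockedState = {
  run_id: 'run-1',
  blocked_reason: { category: 'ghost_turn', turn_id: 'turn-7' },
  active_turns: {
    'turn-7': { status: 'failed_start', failed_start_reason: 'runtime_spawn_failed' },
  },
};
// Hypothetical session snapshot: one retry already spent this run.
const session = {
  current_run_id: 'run-1',
  ghost_retry: { run_id: 'run-1', attempts: 1, attempts_log: [] },
};

const decision = classifyGhostRetryDecision({
  state: blockedState,
  session,
  autoRetryOnGhost: { enabled: true, maxRetriesPerRun: 3, cooldownSeconds: 5 },
  runId: 'run-1',
});
console.log(decision.decision, decision.reason);
// => retry retry budget available (1/3)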
@@ -301,6 +481,11 @@ export function seedFromVision(root, visionPath, options = {}) {
 
 export function resolveContinuousOptions(opts, config) {
   const configCont = config?.run_loop?.continuous || {};
+  const configGhostRetry = configCont.auto_retry_on_ghost || {};
+  const explicitConfigGhostEnabled = Object.prototype.hasOwnProperty.call(configGhostRetry, 'enabled');
+  const fullAutoGhostDefault = Boolean((opts.continuous ?? configCont.enabled ?? false) && isFullAutoApprovalPolicy(config));
+  const resolvedGhostEnabled = opts.autoRetryOnGhost
+    ?? (explicitConfigGhostEnabled ? configGhostRetry.enabled : fullAutoGhostDefault);
 
   return {
     enabled: opts.continuous ?? configCont.enabled ?? false,
@@ -313,9 +498,25 @@ export function resolveContinuousOptions(opts, config) {
     cooldownSeconds: opts.cooldownSeconds ?? configCont.cooldown_seconds ?? 5,
     perSessionMaxUsd: opts.sessionBudget ?? configCont.per_session_max_usd ?? null,
     autoCheckpoint: opts.autoCheckpoint ?? configCont.auto_checkpoint ?? true,
+    autoRetryOnGhost: {
+      enabled: resolvedGhostEnabled ?? false,
+      maxRetriesPerRun: opts.autoRetryOnGhostMaxRetries
+        ?? configGhostRetry.max_retries_per_run
+        ?? 3,
+      cooldownSeconds: opts.autoRetryOnGhostCooldownSeconds
+        ?? configGhostRetry.cooldown_seconds
+        ?? 5,
+    },
   };
 }
 
+export function isFullAutoApprovalPolicy(config) {
+  const policy = config?.approval_policy;
+  if (!policy || typeof policy !== 'object') return false;
+  return policy.phase_transitions?.default === 'auto_approve'
+    && policy.run_completion?.action === 'auto_approve';
+}
+
 // ---------------------------------------------------------------------------
 // Single-step continuous advancement primitive
 // ---------------------------------------------------------------------------
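
A sketch of the resolution precedence (the module path is assumed; the diff does not name the file): an explicit CLI flag wins, then an explicit config `enabled`, then the full-auto posture default:

import { resolveContinuousOptions } from './continuous.js'; // path assumed

// Hypothetical project config: continuous enabled, full-auto approval posture,
// no explicit auto_retry_on_ghost.enabled.
const config = {
  run_loop: { continuous: { enabled: true, auto_retry_on_ghost: { max_retries_per_run: 5 } } },
  approval_policy: {
    phase_transitions: { default: 'auto_approve' },
    run_completion: { action: 'auto_approve' },
  },
};

// No CLI flag: full-auto posture promotes the default to enabled.
console.log(resolveContinuousOptions({}, config).autoRetryOnGhost);
// => { enabled: true, maxRetriesPerRun: 5, cooldownSeconds: 5 }

// An explicit CLI flag overrides both config and posture:
console.log(resolveContinuousOptions({ autoRetryOnGhost: false }, config).autoRetryOnGhost.enabled);
// => false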
@@ -370,6 +571,8 @@ export async function advanceContinuousRunOnce(context, session, contOpts, execu
   if (session.status === 'paused') {
     const governedState = loadProjectState(root, context.config);
     if (governedState?.status === 'blocked') {
+      const retried = await maybeAutoRetryGhostBlocker(context, session, contOpts, governedState, log);
+      if (retried) return retried;
       // Still blocked — stay paused, do not attempt new work
       writeContinuousSession(root, session);
       return {
@@ -406,7 +609,10 @@ export async function advanceContinuousRunOnce(context, session, contOpts, execu
   const resumeStopReason = execution.result?.stop_reason;
 
   if (isBlockedContinuousExecution(execution)) {
-    const blockedRecoveryAction = getBlockedRecoveryAction(execution?.result?.state || loadProjectState(root, context.config));
+    const blockedState = execution?.result?.state || loadProjectState(root, context.config);
+    const retried = await maybeAutoRetryGhostBlocker(context, session, contOpts, blockedState, log);
+    if (retried) return retried;
+    const blockedRecoveryAction = getBlockedRecoveryAction(blockedState);
     session.status = 'paused';
     log(blockedRecoveryAction
       ? `Resumed run blocked again — continuous loop re-paused. Recovery: ${blockedRecoveryAction}`
@@ -418,7 +624,7 @@ export async function advanceContinuousRunOnce(context, session, contOpts, execu
       action: 'run_blocked',
       run_id: session.current_run_id,
       recovery_action: blockedRecoveryAction,
-      blocked_category: getBlockedCategory(execution?.result?.state || loadProjectState(root, context.config)),
+      blocked_category: getBlockedCategory(blockedState),
     };
   }
 
@@ -435,6 +641,64 @@ export async function advanceContinuousRunOnce(context, session, contOpts, execu
     return { ok: true, status: 'running', action: 'resumed_after_unblock', run_id: session.current_run_id };
   }
 
+  const activeGovernedState = loadProjectState(root, context.config);
+  if (
+    session.current_run_id
+    && activeGovernedState?.status === 'active'
+    && activeGovernedState.run_id === session.current_run_id
+    && Object.keys(activeGovernedState.active_turns || {}).length > 0
+  ) {
+    log('Continuing active governed run.');
+    let execution;
+    try {
+      execution = await executeGovernedRun(context, {
+        autoApprove: true,
+        autoCheckpoint: contOpts.autoCheckpoint,
+        report: true,
+        log,
+      });
+    } catch (err) {
+      session.status = 'failed';
+      writeContinuousSession(root, session);
+      return { ok: false, status: 'failed', action: 'run_failed', stop_reason: err.message, run_id: session.current_run_id };
+    }
+
+    session.cumulative_spent_usd = (session.cumulative_spent_usd || 0) + getExecutionRunSpentUsd(execution);
+    const resumeStopReason = execution.result?.stop_reason;
+
+    if (isBlockedContinuousExecution(execution)) {
+      const blockedState = execution?.result?.state || loadProjectState(root, context.config);
+      const retried = await maybeAutoRetryGhostBlocker(context, session, contOpts, blockedState, log);
+      if (retried) return retried;
+      const blockedRecoveryAction = getBlockedRecoveryAction(blockedState);
+      session.status = 'paused';
+      log(blockedRecoveryAction
+        ? `Active run blocked — continuous loop paused. Recovery: ${blockedRecoveryAction}`
+        : 'Active run blocked — continuous loop paused.');
+      writeContinuousSession(root, session);
+      return {
+        ok: true,
+        status: 'blocked',
+        action: 'run_blocked',
+        run_id: session.current_run_id,
+        recovery_action: blockedRecoveryAction,
+        blocked_category: getBlockedCategory(blockedState),
+      };
+    }
+
+    if (execution.exitCode !== 0 || !execution.result) {
+      session.status = 'failed';
+      writeContinuousSession(root, session);
+      return { ok: false, status: 'failed', action: 'run_failed', stop_reason: resumeStopReason || `exit_code_${execution.exitCode}`, run_id: session.current_run_id };
+    }
+
+    session.runs_completed += 1;
+    session.current_run_id = execution.result?.state?.run_id || session.current_run_id;
+    log(`Active run completed (${session.runs_completed}/${contOpts.maxRuns}): ${resumeStopReason || 'completed'}`);
+    writeContinuousSession(root, session);
+    return { ok: true, status: 'running', action: 'continued_active_run', run_id: session.current_run_id };
+  }
+
   // Validate vision file
   if (!existsSync(absVisionPath)) {
     session.status = 'failed';
@@ -573,7 +837,10 @@ export async function advanceContinuousRunOnce(context, session, contOpts, execu
   }
 
   if (isBlockedContinuousExecution(execution)) {
-    const blockedRecoveryAction = getBlockedRecoveryAction(execution?.result?.state || loadProjectState(root, context.config));
+    const blockedState = execution?.result?.state || loadProjectState(root, context.config);
+    const retried = await maybeAutoRetryGhostBlocker(context, session, contOpts, blockedState, log);
+    if (retried) return retried;
+    const blockedRecoveryAction = getBlockedRecoveryAction(blockedState);
     const resolved = resolveIntent(root, targetIntentId);
     if (!resolved.ok) {
      log(`Continuous resolve error: ${resolved.error}`);
@@ -593,7 +860,7 @@ export async function advanceContinuousRunOnce(context, session, contOpts, execu
       run_id: session.current_run_id,
       intent_id: targetIntentId,
       recovery_action: blockedRecoveryAction,
-      blocked_category: getBlockedCategory(execution?.result?.state || loadProjectState(root, context.config)),
+      blocked_category: getBlockedCategory(blockedState),
     };
   }
 
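Taken together, the loop can now surface two extra step outcomes. A hypothetical caller sketch (the wrapper and import path are illustrative, and the call site omits any trailing parameters the real signature may take; the return shapes match the diff):

import { advanceContinuousRunOnce } from './continuous.js'; // path assumed

async function stepOnce(context, session, contOpts, executeGovernedRun, log = console.log) {
  const step = await advanceContinuousRunOnce(context, session, contOpts, executeGovernedRun);
  if (step.action === 'auto_retried_ghost') {
    // New in 2.153.0: ghost turn was reissued; the loop stays 'running'.
    log(`ghost retry ${step.attempt}/${step.max_retries_per_run}: ${step.old_turn_id} -> ${step.new_turn_id}`);
  } else if (step.action === 'continued_active_run') {
    // New in 2.153.0: an in-flight governed run was picked up and driven.
    log(`continued run ${step.run_id}`);
  } else if (step.action === 'run_blocked') {
    // Reached only when auto-retry is disabled, ineligible, or exhausted.
    log(`paused (${step.blocked_category}); recovery: ${step.recovery_action ?? 'see blocked_reason'}`);
  }
  return step;
}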
@@ -0,0 +1,447 @@
+/**
+ * ghost-retry.js — Pure decision helper for BUG-61 continuous-mode ghost-turn
+ * auto-recovery.
+ *
+ * This module is deliberately pure (no disk I/O, no subprocess spawn): it takes
+ * the blocked governed state plus the continuous session snapshot and returns a
+ * decision record the continuous loop can act on.
+ *
+ * Slice 2a ships the decision helper + state-shape primitives. Slice 2b wires
+ * it into `advanceContinuousRunOnce()` and covers `reissueTurn()` side-effects
+ * + cooldowns + command-chain beta scenarios.
+ *
+ * Contracts:
+ * - Retry is eligible ONLY when `blocked_reason.category === "ghost_turn"`
+ *   AND an active turn exists with `status === "failed_start"` AND a typed
+ *   BUG-51 startup failure (`runtime_spawn_failed` or `stdout_attach_failed`).
+ * - Retry budget is run-scoped: switching `run_id` resets the counter to 0.
+ * - Staged results on the ghost turn disqualify retry (defer to accept flow).
+ * - Exhaustion returns `decision: "exhausted"` — the caller is responsible
+ *   for mirroring the outcome into governed state's
+ *   `blocked_reason.recovery.detail` per DEC-BUG61-GHOST-RETRY-STATE-OWNERSHIP-001.
+ */
+
+export const GHOST_FAILURE_TYPES = Object.freeze([
+  'runtime_spawn_failed',
+  'stdout_attach_failed',
+]);
+
+/**
+ * Slice 2c: same-signature early stop threshold.
+ *
+ * When N consecutive recorded attempts share the same fingerprint
+ * `(runtime_id, role_id, failure_type)`, the retry budget is NOT exhausted in
+ * raw count terms but the pattern signals a systematic failure that further
+ * retries will not clear. At that point the loop stops early with
+ * `decision: "exhausted"` and `reason: "same_signature_repeat"`. The threshold
+ * is deliberately low (2) because the BUG-61 contract is "retry transient
+ * ghosts" — a second identical signature is already non-transient evidence.
+ *
+ * Not configurable via `auto_retry_on_ghost` in v1; the value is a framework
+ * invariant. If evidence emerges that 2 is too aggressive, promote to config
+ * through a new DEC rather than silently widening.
+ */
+export const SIGNATURE_REPEAT_THRESHOLD = 2;
+
+/**
+ * Read (or default) the ghost_retry state object from a continuous session.
+ * Returns a plain object; callers should spread/clone before mutating.
+ */
+export function readGhostRetryState(session) {
+  const gr = session?.ghost_retry;
+  if (!gr || typeof gr !== 'object') {
+    return {
+      run_id: null,
+      attempts: 0,
+      max_retries_per_run: null,
+      last_old_turn_id: null,
+      last_new_turn_id: null,
+      last_failure_type: null,
+      last_retried_at: null,
+      exhausted: false,
+      attempts_log: [],
+    };
+  }
+  return {
+    run_id: gr.run_id ?? null,
+    attempts: Number.isInteger(gr.attempts) && gr.attempts >= 0 ? gr.attempts : 0,
+    max_retries_per_run: Number.isInteger(gr.max_retries_per_run) ? gr.max_retries_per_run : null,
+    last_old_turn_id: gr.last_old_turn_id ?? null,
+    last_new_turn_id: gr.last_new_turn_id ?? null,
+    last_failure_type: gr.last_failure_type ?? null,
+    last_retried_at: gr.last_retried_at ?? null,
+    exhausted: Boolean(gr.exhausted),
+    attempts_log: Array.isArray(gr.attempts_log) ? gr.attempts_log : [],
+  };
+}
+
+/**
+ * Reset the ghost_retry counter when the active run_id differs from the last
+ * recorded run_id. Returns the reset state (does not mutate input).
+ */
+export function resetGhostRetryForRun(session, runId) {
+  const current = readGhostRetryState(session);
+  if (current.run_id === runId) return current;
+  return {
+    run_id: runId ?? null,
+    attempts: 0,
+    max_retries_per_run: current.max_retries_per_run,
+    last_old_turn_id: null,
+    last_new_turn_id: null,
+    last_failure_type: null,
+    last_retried_at: null,
+    exhausted: false,
+    attempts_log: [],
+  };
+}
+
+/**
+ * Build the fingerprint string for a recorded attempt. Same shape as the
+ * HUMAN-ROADMAP's "same runtime, same role, same prompt shape" guidance —
+ * we key on (runtime_id, role_id, failure_type). Prompt shape is implicitly
+ * stable across same-turn reissues because `reissueTurn()` re-renders the
+ * same dispatch bundle.
+ *
+ * `null`/missing fields are normalized to `?` so partial records compare
+ * consistently rather than silently matching.
+ */
+export function buildAttemptFingerprint(attempt) {
+  const runtime = attempt?.runtime_id ?? '?';
+  const role = attempt?.role_id ?? '?';
+  const failure = attempt?.failure_type ?? '?';
+  return `${runtime}|${role}|${failure}`;
+}
+
+/**
+ * Classify whether the tail of `attemptsLog` shows `threshold` consecutive
+ * identical fingerprints. Returns:
+ * - `{ triggered: false, signature: null, consecutive: 0 }` when not hit
+ * - `{ triggered: true, signature, consecutive }` when hit
+ *
+ * The caller decides what to do with the trigger (slice 2c routes it into
+ * `decision: "exhausted"` with `reason: "same_signature_repeat"`).
+ */
+export function classifySameSignatureExhaustion(attemptsLog, threshold = SIGNATURE_REPEAT_THRESHOLD) {
+  if (!Array.isArray(attemptsLog) || attemptsLog.length < threshold) {
+    return { triggered: false, signature: null, consecutive: 0 };
+  }
+  if (!Number.isInteger(threshold) || threshold < 2) {
+    return { triggered: false, signature: null, consecutive: 0 };
+  }
+  const tail = attemptsLog.slice(-threshold);
+  const signatures = tail.map(buildAttemptFingerprint);
+  const first = signatures[0];
+  if (!first || first === '?|?|?') {
+    return { triggered: false, signature: null, consecutive: 0 };
+  }
+  const allMatch = signatures.every((s) => s === first);
+  if (!allMatch) {
+    return { triggered: false, signature: null, consecutive: 0 };
+  }
+  return { triggered: true, signature: first, consecutive: threshold };
+}
+
+/**
+ * Locate the primary ghost turn from governed state.
+ *
+ * Inputs expected (matches shape written by `stale-turn-watchdog.js`):
+ * - `state.blocked_reason.category === "ghost_turn"`
+ * - `state.blocked_reason.turn_id`
+ * - `state.active_turns[turnId].status === "failed_start"`
+ * - `state.active_turns[turnId].failed_start_reason` is one of
+ *   GHOST_FAILURE_TYPES
+ *
+ * Returns the turn object + failure type, or null when no eligible turn is
+ * found. Does NOT consult disk.
+ */
+export function findPrimaryGhostTurn(state) {
+  if (!state || typeof state !== 'object') return null;
+  const blockedReason = state.blocked_reason;
+  if (!blockedReason || blockedReason.category !== 'ghost_turn') return null;
+
+  const activeTurns = state.active_turns || {};
+  const hintedTurnId = blockedReason.turn_id;
+  const candidateIds = hintedTurnId && activeTurns[hintedTurnId]
+    ? [hintedTurnId]
+    : Object.keys(activeTurns);
+
+  for (const turnId of candidateIds) {
+    const turn = activeTurns[turnId];
+    if (!turn) continue;
+    if (turn.status !== 'failed_start') continue;
+    const failureType = turn.failed_start_reason;
+    if (!GHOST_FAILURE_TYPES.includes(failureType)) continue;
+    if (hasMeaningfulStagedResult(turn)) continue;
+    return { turn_id: turnId, turn, failure_type: failureType };
+  }
+  return null;
+}
+
+/**
+ * Best-effort detector for a meaningful staged result. If the turn has already
+ * produced a structured result the caller should NOT auto-retry — the accept
+ * pipeline owns that path.
+ */
+function hasMeaningfulStagedResult(turn) {
+  if (!turn) return false;
+  const staged = turn.staged_result ?? turn.result ?? null;
+  if (!staged) return false;
+  if (typeof staged !== 'object') return Boolean(staged);
+  // Ignore purely-null / empty shells the watchdog may leave behind.
+  for (const value of Object.values(staged)) {
+    if (value !== null && value !== undefined && value !== '') return true;
+  }
+  return false;
+}
+
+/**
+ * Classify the retry decision given the current blocked state + session +
+ * resolved options.
+ *
+ * @param {object} params
+ * @param {object} params.state - governed state (has blocked_reason + active_turns)
+ * @param {object} params.session - continuous session (source of truth for retry counter)
+ * @param {object} params.autoRetryOnGhost - resolved continuous options block: { enabled, maxRetriesPerRun, cooldownSeconds }
+ * @param {string|null} [params.runId] - the run_id the continuous loop believes is active (defaults to state.run_id)
+ * @returns {{
+ *   decision: 'retry' | 'exhausted' | 'skip_non_ghost' | 'missing_active_ghost' | 'disabled' | 'missing_run_id',
+ *   reason: string,
+ *   attempts: number,
+ *   maxRetries: number,
+ *   retryState: object,
+ *   ghost?: { turn_id: string, failure_type: string },
+ *   signatureRepeat?: { signature: string, consecutive: number }
+ * }}
+ *
+ * Exhaustion lanes (added in slice 2c):
+ * - `reason: "retry budget exhausted (N/N)"` — raw counter cap hit
+ * - `reason: "same_signature_repeat (<signature>)"` — N consecutive
+ *   identical fingerprints recorded; continuing is unlikely to help. This
+ *   lane can fire BEFORE the raw counter cap — we stop as soon as the
+ *   pattern is visible.
+ */
+export function classifyGhostRetryDecision({ state, session, autoRetryOnGhost, runId } = {}) {
+  const opts = autoRetryOnGhost || {};
+  const enabled = Boolean(opts.enabled);
+  const maxRetries = Number.isInteger(opts.maxRetriesPerRun) && opts.maxRetriesPerRun > 0
+    ? opts.maxRetriesPerRun
+    : 3;
+
+  if (!enabled) {
+    return {
+      decision: 'disabled',
+      reason: 'auto_retry_on_ghost.enabled is false',
+      attempts: 0,
+      maxRetries,
+      retryState: readGhostRetryState(session),
+    };
+  }
+
+  const category = state?.blocked_reason?.category;
+  if (category !== 'ghost_turn') {
+    return {
+      decision: 'skip_non_ghost',
+      reason: `blocked_reason.category=${category ?? 'null'} is not ghost_turn`,
+      attempts: 0,
+      maxRetries,
+      retryState: readGhostRetryState(session),
+    };
+  }
+
+  const ghost = findPrimaryGhostTurn(state);
+  if (!ghost) {
+    return {
+      decision: 'missing_active_ghost',
+      reason: 'blocked_reason names a ghost but no active turn has a typed BUG-51 failed_start',
+      attempts: 0,
+      maxRetries,
+      retryState: readGhostRetryState(session),
+    };
+  }
+
+  const effectiveRunId = runId ?? state?.run_id ?? null;
+  if (!effectiveRunId) {
+    return {
+      decision: 'missing_run_id',
+      reason: 'cannot scope retry counter without a run_id',
+      attempts: 0,
+      maxRetries,
+      retryState: readGhostRetryState(session),
+      ghost: { turn_id: ghost.turn_id, failure_type: ghost.failure_type },
+    };
+  }
+
+  const resetState = resetGhostRetryForRun(session, effectiveRunId);
+  const attempts = resetState.attempts;
+
+  if (attempts >= maxRetries) {
+    return {
+      decision: 'exhausted',
+      reason: `retry budget exhausted (${attempts}/${maxRetries})`,
+      attempts,
+      maxRetries,
+      retryState: { ...resetState, max_retries_per_run: maxRetries, exhausted: true },
+      ghost: { turn_id: ghost.turn_id, failure_type: ghost.failure_type },
+    };
+  }
+
+  // Slice 2c: same-signature early stop. If the recorded attempts log shows
+  // SIGNATURE_REPEAT_THRESHOLD consecutive identical fingerprints, stop early
+  // with a distinct reason so the caller can surface "pattern detected, not
+  // transient" in the exhaustion bundle.
+  const sigCheck = classifySameSignatureExhaustion(resetState.attempts_log, SIGNATURE_REPEAT_THRESHOLD);
+  if (sigCheck.triggered) {
+    return {
+      decision: 'exhausted',
+      reason: `same_signature_repeat (${sigCheck.signature})`,
+      attempts,
+      maxRetries,
+      retryState: { ...resetState, max_retries_per_run: maxRetries, exhausted: true },
+      ghost: { turn_id: ghost.turn_id, failure_type: ghost.failure_type },
+      signatureRepeat: { signature: sigCheck.signature, consecutive: sigCheck.consecutive },
+    };
+  }
+
+  return {
+    decision: 'retry',
+    reason: `retry budget available (${attempts}/${maxRetries})`,
+    attempts,
+    maxRetries,
+    retryState: { ...resetState, max_retries_per_run: maxRetries },
+    ghost: { turn_id: ghost.turn_id, failure_type: ghost.failure_type },
+  };
+}
+
+/**
+ * Apply a successful auto-retry to a session snapshot. Returns a NEW session
+ * object with the ghost_retry counter incremented and last_* fields updated.
+ * Does not write to disk; the caller owns persistence.
+ */
+export function applyGhostRetryAttempt(session, {
+  runId,
+  oldTurnId,
+  newTurnId,
+  failureType,
+  maxRetries,
+  nowIso,
+  runtimeId = null,
+  roleId = null,
+  runningMs = null,
+  thresholdMs = null,
+}) {
+  const base = resetGhostRetryForRun(session, runId);
+  const at = nowIso || new Date().toISOString();
+  // Slice 2c: append a per-attempt fingerprint record. The log is the source
+  // of truth for same-signature early-stop detection and the exhaustion
+  // diagnostic bundle. We cap its size to 10 entries to prevent unbounded
+  // growth on misbehaving projects — the tail is what matters for pattern
+  // detection.
+  const nextEntry = {
+    attempt: base.attempts + 1,
+    old_turn_id: oldTurnId ?? null,
+    new_turn_id: newTurnId ?? null,
+    runtime_id: runtimeId ?? null,
+    role_id: roleId ?? null,
+    failure_type: failureType ?? null,
+    running_ms: runningMs ?? null,
+    threshold_ms: thresholdMs ?? null,
+    retried_at: at,
+  };
+  const attemptsLog = [...base.attempts_log, nextEntry].slice(-10);
+  const ghost_retry = {
+    run_id: runId ?? null,
+    attempts: base.attempts + 1,
+    max_retries_per_run: Number.isInteger(maxRetries) ? maxRetries : base.max_retries_per_run,
+    last_old_turn_id: oldTurnId ?? null,
+    last_new_turn_id: newTurnId ?? null,
+    last_failure_type: failureType ?? null,
+    last_retried_at: at,
+    exhausted: false,
+    attempts_log: attemptsLog,
+  };
+  return { ...(session || {}), ghost_retry };
+}
+
+/**
+ * Apply an exhaustion outcome to a session snapshot. Returns a NEW session
+ * with the counter preserved, `exhausted: true`, and last-failure metadata.
+ */
+export function applyGhostRetryExhaustion(session, { runId, failureType, turnId, maxRetries, nowIso }) {
+  const base = resetGhostRetryForRun(session, runId);
+  const ghost_retry = {
+    run_id: runId ?? null,
+    attempts: base.attempts,
+    max_retries_per_run: Number.isInteger(maxRetries) ? maxRetries : base.max_retries_per_run,
+    last_old_turn_id: turnId ?? base.last_old_turn_id,
+    last_new_turn_id: null,
+    last_failure_type: failureType ?? base.last_failure_type,
+    last_retried_at: nowIso || base.last_retried_at,
+    exhausted: true,
+    // Slice 2c: preserve the per-attempt fingerprint log into the exhausted
+    // state so the operator-facing session.json still has the diagnostic
+    // payload after the loop pauses. Without this, the log would be dropped
+    // exactly when it is most useful.
+    attempts_log: Array.isArray(base.attempts_log) ? base.attempts_log : [],
+  };
+  return { ...(session || {}), ghost_retry };
+}
+
+/**
+ * Build the human-readable mirror string the continuous loop should write
+ * into governed state's `blocked_reason.recovery.detail` at exhaustion time.
+ * Matches the shape `stale-turn-watchdog.js` already uses for that field.
+ *
+ * Slice 2c: accepts optional `signatureRepeat` and adds a brief inline note
+ * so operators see the distinction between raw-budget exhaustion and
+ * pattern-based early stop in the status surface.
+ */
+export function buildGhostRetryExhaustionMirror({
+  attempts,
+  maxRetries,
+  failureType,
+  manualRecoveryDetail,
+  signatureRepeat = null,
+}) {
+  const count = `${attempts}/${maxRetries}`;
+  const ft = failureType || 'ghost_turn';
+  const suffix = manualRecoveryDetail ? ` ${manualRecoveryDetail}` : '';
+  if (signatureRepeat && signatureRepeat.signature) {
+    const sig = signatureRepeat.signature;
+    const consec = signatureRepeat.consecutive || 2;
+    return `Auto-retry stopped early after ${consec} consecutive same-signature attempts [${sig}] (${ft}); last attempt ${count}.${suffix}`;
+  }
+  return `Auto-retry exhausted after ${count} attempts (${ft}).${suffix}`;
+}
+
+/**
+ * Slice 2c: build the per-attempt diagnostic bundle that rides on the
+ * `ghost_retry_exhausted` event payload AND gets surfaced in CLI status so
+ * the operator has enough evidence to decide between (a) bumping
+ * `max_retries_per_run`, (b) changing the runtime, (c) raising
+ * `startup_watchdog_ms`, or (d) filing a new BUG-54-class regression.
+ *
+ * Output shape:
+ *   {
+ *     attempts_log: [...per-attempt records, most recent last...],
+ *     fingerprint_summary: [{ signature, count }, ...] sorted by count desc,
+ *     final_signature: string | null
+ *   }
+ */
+export function buildGhostRetryDiagnosticBundle(sessionOrState) {
+  const state = sessionOrState && typeof sessionOrState === 'object' && sessionOrState.ghost_retry
+    ? readGhostRetryState(sessionOrState)
+    : (Array.isArray(sessionOrState?.attempts_log)
+      ? { attempts_log: sessionOrState.attempts_log }
+      : { attempts_log: [] });
+  const log = Array.isArray(state.attempts_log) ? state.attempts_log : [];
+  const counts = new Map();
+  for (const entry of log) {
+    const sig = buildAttemptFingerprint(entry);
+    counts.set(sig, (counts.get(sig) || 0) + 1);
+  }
+  const fingerprint_summary = Array.from(counts.entries())
+    .map(([signature, count]) => ({ signature, count }))
+    .sort((a, b) => b.count - a.count);
+  const final_signature = log.length > 0 ? buildAttemptFingerprint(log[log.length - 1]) : null;
+  return { attempts_log: log, fingerprint_summary, final_signature };
+}
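
A worked sketch of the slice-2c lanes using the helpers above (fixture values such as `codex_cli` are hypothetical): two consecutive attempts with the same `(runtime_id, role_id, failure_type)` fingerprint trip the early stop even though the raw budget of 3 is not spent:

import {
  buildAttemptFingerprint,
  classifySameSignatureExhaustion,
  buildGhostRetryDiagnosticBundle,
} from './ghost-retry.js';

const attemptsLog = [
  { attempt: 1, runtime_id: 'codex_cli', role_id: 'implementer', failure_type: 'runtime_spawn_failed' },
  { attempt: 2, runtime_id: 'codex_cli', role_id: 'implementer', failure_type: 'runtime_spawn_failed' },
];

console.log(buildAttemptFingerprint(attemptsLog[0]));
// => codex_cli|implementer|runtime_spawn_failed

console.log(classifySameSignatureExhaustion(attemptsLog));
// => { triggered: true, signature: 'codex_cli|implementer|runtime_spawn_failed', consecutive: 2 }

console.log(buildGhostRetryDiagnosticBundle({ attempts_log: attemptsLog }).fingerprint_summary);
// => [ { signature: 'codex_cli|implementer|runtime_spawn_failed', count: 2 } ]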
@@ -1514,7 +1514,12 @@ function buildConflictDetail(conflict) {
 }
 
 function hasBlockingActiveTurn(activeTurns) {
-  return Object.values(activeTurns || {}).some((turn) => turn?.status === 'failed' || turn?.status === 'conflicted');
+  return Object.values(activeTurns || {}).some((turn) => [
+    'failed',
+    'conflicted',
+    'failed_start',
+    'stalled',
+  ].includes(turn?.status));
 }
 
 function findHistoryTurnRequest(historyEntries, turnId, kind) {
@@ -640,9 +640,38 @@ export function validateRunLoopConfig(runLoop) {
   }
   validateRunLoopPositiveInteger('run_loop.startup_watchdog_ms', runLoop.startup_watchdog_ms, errors);
   validateRunLoopPositiveInteger('run_loop.stale_turn_threshold_ms', runLoop.stale_turn_threshold_ms, errors);
+  if (runLoop.continuous !== undefined && runLoop.continuous !== null) {
+    validateRunLoopContinuousConfig('run_loop.continuous', runLoop.continuous, errors);
+  }
   return errors;
 }
 
+function validateRunLoopContinuousConfig(path, continuous, errors) {
+  if (typeof continuous !== 'object' || Array.isArray(continuous)) {
+    errors.push(`${path} must be an object`);
+    return;
+  }
+  if (continuous.auto_retry_on_ghost !== undefined && continuous.auto_retry_on_ghost !== null) {
+    validateAutoRetryOnGhostConfig(`${path}.auto_retry_on_ghost`, continuous.auto_retry_on_ghost, errors);
+  }
+}
+
+function validateAutoRetryOnGhostConfig(path, value, errors) {
+  if (typeof value !== 'object' || Array.isArray(value)) {
+    errors.push(`${path} must be an object`);
+    return;
+  }
+  if ('enabled' in value && typeof value.enabled !== 'boolean') {
+    errors.push(`${path}.enabled must be a boolean`);
+  }
+  if ('max_retries_per_run' in value) {
+    validatePositiveInteger(`${path}.max_retries_per_run`, value.max_retries_per_run, 'retry count', errors);
+  }
+  if ('cooldown_seconds' in value) {
+    validatePositiveInteger(`${path}.cooldown_seconds`, value.cooldown_seconds, 'seconds', errors);
+  }
+}
+
 function validateRunLoopPositiveInteger(path, value, errors) {
   if (value === undefined || value === null) {
     return;
@@ -656,6 +685,15 @@ function validateRunLoopPositiveInteger(path, value, errors) {
   }
 }
 
+function validatePositiveInteger(path, value, unitLabel, errors) {
+  if (value === undefined || value === null) {
+    return;
+  }
+  if (typeof value !== 'number' || !Number.isInteger(value) || value < 1) {
+    errors.push(`${path} must be a positive integer (${unitLabel})`);
+  }
+}
+
 function validateRuntimePositiveInteger(path, value, errors) {
   if (value === undefined || value === null) {
     return;
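
A sketch exercising the new validator lanes (the exporting module is not named in the diff, so the import path is assumed; expected output assumes no earlier run_loop checks fire on these fixtures):

import { validateRunLoopConfig } from './validate-config.js'; // path assumed

console.log(validateRunLoopConfig({
  continuous: { auto_retry_on_ghost: { enabled: 'yes', max_retries_per_run: 0 } },
}));
// => [ 'run_loop.continuous.auto_retry_on_ghost.enabled must be a boolean',
//      'run_loop.continuous.auto_retry_on_ghost.max_retries_per_run must be a positive integer (retry count)' ]

console.log(validateRunLoopConfig({
  continuous: { auto_retry_on_ghost: { enabled: true, max_retries_per_run: 5, cooldown_seconds: 10 } },
}));
// => []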
@@ -44,6 +44,8 @@ export const VALID_RUN_EVENTS = [
   'human_escalation_resolved',
   'dispatch_progress',
   'session_continuation',
+  'auto_retried_ghost',
+  'ghost_retry_exhausted',
 ];
 
 /**
@@ -104,6 +104,34 @@
         "type": "integer",
         "minimum": 1,
         "description": "Milliseconds to wait before a started turn that previously produced output is treated as stale. Default 600000 for local_cli turns and 300000 for api_proxy turns."
+      },
+      "continuous": {
+        "type": "object",
+        "description": "Continuous-run control knobs.",
+        "properties": {
+          "auto_retry_on_ghost": {
+            "type": "object",
+            "description": "Bounded ghost-turn retry policy for continuous/full-auto sessions.",
+            "properties": {
+              "enabled": {
+                "type": "boolean",
+                "description": "Enable bounded automatic reissue for startup ghost turns. Defaults false unless full-auto approval policy posture promotes it."
+              },
+              "max_retries_per_run": {
+                "type": "integer",
+                "minimum": 1,
+                "description": "Maximum ghost retries per run before leaving manual recovery visible. Default 3."
+              },
+              "cooldown_seconds": {
+                "type": "integer",
+                "minimum": 1,
+                "description": "Seconds to wait between automatic ghost retries. Default 5."
+              }
+            },
+            "additionalProperties": true
+          }
+        },
+        "additionalProperties": true
       }
     },
     "additionalProperties": true