agentxchain 2.146.0 → 2.148.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/publish-npm.sh +16 -0
- package/scripts/sync-homebrew.sh +14 -1
- package/scripts/verify-post-publish.sh +55 -4
- package/src/commands/reissue-turn.js +16 -0
- package/src/commands/reject-turn.js +14 -1
- package/src/commands/restart.js +15 -0
- package/src/commands/resume.js +61 -66
- package/src/commands/run.js +67 -10
- package/src/commands/schedule.js +34 -7
- package/src/commands/status.js +20 -0
- package/src/commands/step.js +100 -34
- package/src/lib/adapters/api-proxy-adapter.js +8 -0
- package/src/lib/adapters/local-cli-adapter.js +271 -16
- package/src/lib/adapters/manual-adapter.js +9 -10
- package/src/lib/adapters/mcp-adapter.js +3 -5
- package/src/lib/adapters/remote-agent-adapter.js +3 -5
- package/src/lib/continuous-run.js +71 -6
- package/src/lib/dispatch-bundle.js +1 -1
- package/src/lib/dispatch-progress.js +5 -3
- package/src/lib/governed-state.js +258 -17
- package/src/lib/intake.js +10 -1
- package/src/lib/normalized-config.js +51 -1
- package/src/lib/recent-event-summary.js +11 -0
- package/src/lib/run-events.js +4 -0
- package/src/lib/run-loop.js +67 -2
- package/src/lib/runner-interface.js +1 -0
- package/src/lib/schema.js +7 -0
- package/src/lib/schemas/agentxchain-config.schema.json +15 -1
- package/src/lib/schemas/turn-result.schema.json +8 -2
- package/src/lib/staged-result-proof.js +43 -0
- package/src/lib/stale-turn-watchdog.js +218 -90
- package/src/lib/turn-checkpoint.js +65 -1
- package/src/lib/turn-result-shape.js +38 -0
- package/src/lib/turn-result-validator.js +15 -3
package/src/commands/status.js
CHANGED
|
@@ -382,6 +382,16 @@ function renderGovernedStatus(context, opts) {
|
|
|
382
382
|
console.log(` ${chalk.dim('Recover:')} ${chalk.cyan(`agentxchain reject-turn --turn ${turn.turn_id}`)} — reject and retry`);
|
|
383
383
|
console.log(` ${chalk.dim(' or:')} ${chalk.cyan(`agentxchain accept-turn --turn ${turn.turn_id}`)} — re-attempt acceptance`);
|
|
384
384
|
}
|
|
385
|
+
if (turn.status === 'failed_start') {
|
|
386
|
+
console.log(` ${chalk.dim('Reason:')} ${turn.failed_start_reason || 'no_subprocess_output'}`);
|
|
387
|
+
const recover = turn.recovery_command || `agentxchain reissue-turn --turn ${turn.turn_id} --reason ghost`;
|
|
388
|
+
console.log(` ${chalk.dim('Recover:')} ${chalk.cyan(recover)}`);
|
|
389
|
+
}
|
|
390
|
+
if (turn.status === 'stalled') {
|
|
391
|
+
console.log(` ${chalk.dim('Reason:')} ${turn.stalled_reason || 'no_output_within_threshold'}`);
|
|
392
|
+
const recover = turn.recovery_command || `agentxchain reissue-turn --turn ${turn.turn_id} --reason stale`;
|
|
393
|
+
console.log(` ${chalk.dim('Recover:')} ${chalk.cyan(recover)}`);
|
|
394
|
+
}
|
|
385
395
|
}
|
|
386
396
|
} else if (singleActiveTurn) {
|
|
387
397
|
console.log(` ${chalk.dim('Turn:')} ${singleActiveTurn.turn_id}`);
|
|
@@ -432,6 +442,16 @@ function renderGovernedStatus(context, opts) {
|
|
|
432
442
|
console.log(` ${chalk.dim('Resolve:')} ${chalk.cyan(reassignAction.command)}`);
|
|
433
443
|
console.log(` ${chalk.dim(' or:')} ${chalk.cyan(mergeAction.command)}`);
|
|
434
444
|
}
|
|
445
|
+
if (singleActiveTurn.status === 'failed_start') {
|
|
446
|
+
console.log(` ${chalk.dim('Reason:')} ${singleActiveTurn.failed_start_reason || 'no_subprocess_output'}`);
|
|
447
|
+
const recover = singleActiveTurn.recovery_command || `agentxchain reissue-turn --turn ${singleActiveTurn.turn_id} --reason ghost`;
|
|
448
|
+
console.log(` ${chalk.dim('Recover:')} ${chalk.cyan(recover)}`);
|
|
449
|
+
}
|
|
450
|
+
if (singleActiveTurn.status === 'stalled') {
|
|
451
|
+
console.log(` ${chalk.dim('Reason:')} ${singleActiveTurn.stalled_reason || 'no_output_within_threshold'}`);
|
|
452
|
+
const recover = singleActiveTurn.recovery_command || `agentxchain reissue-turn --turn ${singleActiveTurn.turn_id} --reason stale`;
|
|
453
|
+
console.log(` ${chalk.dim('Recover:')} ${chalk.cyan(recover)}`);
|
|
454
|
+
}
|
|
435
455
|
} else {
|
|
436
456
|
console.log(` ${chalk.dim('Turn:')} ${chalk.yellow('No active turn')}`);
|
|
437
457
|
}
|
package/src/commands/step.js
CHANGED
|
@@ -35,7 +35,9 @@ import {
|
|
|
35
35
|
getActiveTurnCount,
|
|
36
36
|
getActiveTurns,
|
|
37
37
|
reactivateGovernedRun,
|
|
38
|
+
reconcilePhaseAdvanceBeforeDispatch,
|
|
38
39
|
refreshTurnBaselineSnapshot,
|
|
40
|
+
transitionActiveTurnLifecycle,
|
|
39
41
|
STATE_PATH,
|
|
40
42
|
} from '../lib/governed-state.js';
|
|
41
43
|
import { getMaxConcurrentTurns } from '../lib/normalized-config.js';
|
|
@@ -70,7 +72,7 @@ import { resolveGovernedRole } from '../lib/role-resolution.js';
|
|
|
70
72
|
import { shouldSuggestManualQaFallback } from '../lib/manual-qa-fallback.js';
|
|
71
73
|
import { evaluateApprovalSlaReminders } from '../lib/notification-runner.js';
|
|
72
74
|
import { consumeNextApprovedIntent } from '../lib/intake.js';
|
|
73
|
-
import { reconcileStaleTurns } from '../lib/stale-turn-watchdog.js';
|
|
75
|
+
import { failTurnStartup, reconcileStaleTurns } from '../lib/stale-turn-watchdog.js';
|
|
74
76
|
|
|
75
77
|
export async function stepCommand(opts) {
|
|
76
78
|
const context = loadProjectContext();
|
|
@@ -260,39 +262,14 @@ export async function stepCommand(opts) {
|
|
|
260
262
|
printDispatchBundleWarnings(bundleResult);
|
|
261
263
|
}
|
|
262
264
|
|
|
263
|
-
//
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
console.log(chalk.red(`Failed to reactivate run: ${reactivated.error}`));
|
|
272
|
-
process.exit(1);
|
|
273
|
-
}
|
|
274
|
-
state = reactivated.state;
|
|
275
|
-
if (reactivated.migration_notice) {
|
|
276
|
-
console.log(chalk.yellow(reactivated.migration_notice));
|
|
277
|
-
}
|
|
278
|
-
if (reactivated.phantom_notice) {
|
|
279
|
-
console.log(chalk.yellow(reactivated.phantom_notice));
|
|
280
|
-
}
|
|
281
|
-
skipAssignment = true;
|
|
282
|
-
|
|
283
|
-
// BUG-1 fix: refresh baseline snapshot to capture files dirtied between assignment and dispatch
|
|
284
|
-
refreshTurnBaselineSnapshot(root, pausedTurn.turn_id);
|
|
285
|
-
state = JSON.parse(readFileSync(join(root, '.agentxchain/state.json'), 'utf8'));
|
|
286
|
-
|
|
287
|
-
const bundleResult = writeDispatchBundle(root, state, config);
|
|
288
|
-
if (!bundleResult.ok) {
|
|
289
|
-
console.log(chalk.red(`Failed to write dispatch bundle: ${bundleResult.error}`));
|
|
290
|
-
process.exit(1);
|
|
291
|
-
}
|
|
292
|
-
bundleWritten = true;
|
|
293
|
-
printDispatchBundleWarnings(bundleResult);
|
|
294
|
-
}
|
|
295
|
-
}
|
|
265
|
+
// Removed (Turn 25): the `paused + failed/retrying retained turn → re-dispatch`
|
|
266
|
+
// branch is unreachable under the current schema. See the matching deletion in
|
|
267
|
+
// `cli/src/commands/resume.js` for the full citation chain (schema.js:184 +
|
|
268
|
+
// governed-state.js:2191-2204 + the line-187 short-circuit above). The reachable
|
|
269
|
+
// retained-turn re-dispatch path for `step --resume` is the `state.status ===
|
|
270
|
+
// 'blocked' && activeCount > 0` branch at line 193 above. Per
|
|
271
|
+
// `DEC-UNREACHABLE-BRANCH-COVERAGE-001`, dead branches are removed once the
|
|
272
|
+
// schema citation + migration citation are documented.
|
|
296
273
|
|
|
297
274
|
// idle → initialize run
|
|
298
275
|
if (!skipAssignment && state.status === 'idle' && !state.run_id) {
|
|
@@ -344,6 +321,27 @@ export async function stepCommand(opts) {
|
|
|
344
321
|
}
|
|
345
322
|
}
|
|
346
323
|
|
|
324
|
+
if (!skipAssignment) {
|
|
325
|
+
const phaseReconciliation = reconcilePhaseAdvanceBeforeDispatch(root, config, state);
|
|
326
|
+
if (!phaseReconciliation.ok && !phaseReconciliation.state) {
|
|
327
|
+
console.log(chalk.red(`Failed to reconcile phase gate before dispatch: ${phaseReconciliation.error}`));
|
|
328
|
+
process.exit(1);
|
|
329
|
+
}
|
|
330
|
+
state = phaseReconciliation.state || state;
|
|
331
|
+
if (phaseReconciliation.advanced) {
|
|
332
|
+
console.log(chalk.green(`Advanced phase before dispatch: ${phaseReconciliation.from_phase} → ${phaseReconciliation.to_phase}`));
|
|
333
|
+
}
|
|
334
|
+
if (state.pending_phase_transition || state.pending_run_completion) {
|
|
335
|
+
evaluateApprovalSlaReminders(root, config, state);
|
|
336
|
+
printRecoverySummary(state, 'This run is awaiting approval.', config);
|
|
337
|
+
process.exit(1);
|
|
338
|
+
}
|
|
339
|
+
if (state.status === 'blocked') {
|
|
340
|
+
printRecoverySummary(state, 'This run is blocked.', config);
|
|
341
|
+
process.exit(1);
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
|
|
347
345
|
// Assign the turn
|
|
348
346
|
if (!skipAssignment) {
|
|
349
347
|
const roleId = resolveTargetRole(opts, state, config);
|
|
@@ -448,6 +446,10 @@ export async function stepCommand(opts) {
|
|
|
448
446
|
console.log(chalk.red(`Failed to finalize dispatch manifest: ${manifestResult.error}`));
|
|
449
447
|
process.exit(1);
|
|
450
448
|
}
|
|
449
|
+
const dispatched = transitionActiveTurnLifecycle(root, turn.turn_id, 'dispatched');
|
|
450
|
+
if (dispatched.ok) {
|
|
451
|
+
state = dispatched.state;
|
|
452
|
+
}
|
|
451
453
|
}
|
|
452
454
|
|
|
453
455
|
const controller = new AbortController();
|
|
@@ -456,6 +458,13 @@ export async function stepCommand(opts) {
|
|
|
456
458
|
});
|
|
457
459
|
|
|
458
460
|
if (runtimeType === 'api_proxy') {
|
|
461
|
+
const running = transitionActiveTurnLifecycle(root, turn.turn_id, 'running', {
|
|
462
|
+
stream: 'request',
|
|
463
|
+
at: new Date().toISOString(),
|
|
464
|
+
});
|
|
465
|
+
if (running.ok) {
|
|
466
|
+
state = running.state;
|
|
467
|
+
}
|
|
459
468
|
console.log(chalk.cyan(`Dispatching to API proxy: ${runtime?.provider || '(unknown)'} / ${runtime?.model || '(unknown)'}`));
|
|
460
469
|
console.log(chalk.dim(`Turn: ${turn.turn_id} Role: ${roleId} Phase: ${state.phase}`));
|
|
461
470
|
|
|
@@ -535,6 +544,13 @@ export async function stepCommand(opts) {
|
|
|
535
544
|
}
|
|
536
545
|
console.log('');
|
|
537
546
|
} else if (runtimeType === 'mcp') {
|
|
547
|
+
const running = transitionActiveTurnLifecycle(root, turn.turn_id, 'running', {
|
|
548
|
+
stream: 'request',
|
|
549
|
+
at: new Date().toISOString(),
|
|
550
|
+
});
|
|
551
|
+
if (running.ok) {
|
|
552
|
+
state = running.state;
|
|
553
|
+
}
|
|
538
554
|
const mcpTransport = resolveMcpTransport(runtime);
|
|
539
555
|
console.log(chalk.cyan(`Dispatching to MCP ${mcpTransport}: ${describeMcpRuntimeTarget(runtime)}`));
|
|
540
556
|
console.log(chalk.dim(`Turn: ${turn.turn_id} Role: ${roleId} Phase: ${state.phase} Tool: ${runtime?.tool_name || 'agentxchain_turn'}`));
|
|
@@ -589,6 +605,13 @@ export async function stepCommand(opts) {
|
|
|
589
605
|
console.log(chalk.green(`MCP tool completed${mcpResult.toolName ? ` (${mcpResult.toolName})` : ''}. Staged result detected.`));
|
|
590
606
|
console.log('');
|
|
591
607
|
} else if (runtimeType === 'remote_agent') {
|
|
608
|
+
const running = transitionActiveTurnLifecycle(root, turn.turn_id, 'running', {
|
|
609
|
+
stream: 'request',
|
|
610
|
+
at: new Date().toISOString(),
|
|
611
|
+
});
|
|
612
|
+
if (running.ok) {
|
|
613
|
+
state = running.state;
|
|
614
|
+
}
|
|
592
615
|
console.log(chalk.cyan(`Dispatching to remote agent: ${describeRemoteAgentTarget(runtime)}`));
|
|
593
616
|
console.log(chalk.dim(`Turn: ${turn.turn_id} Role: ${roleId} Phase: ${state.phase}`));
|
|
594
617
|
|
|
@@ -667,8 +690,25 @@ export async function stepCommand(opts) {
|
|
|
667
690
|
|
|
668
691
|
// BUG-6: stream subprocess output by default (--stream or --verbose), suppress with --quiet
|
|
669
692
|
const shouldStream = opts.stream || opts.verbose || false;
|
|
693
|
+
let runningMarked = false;
|
|
694
|
+
const ensureStartingState = (pid = null, at = new Date().toISOString()) => {
|
|
695
|
+
const starting = transitionActiveTurnLifecycle(root, turn.turn_id, 'starting', { pid, at });
|
|
696
|
+
if (starting.ok) {
|
|
697
|
+
state = starting.state;
|
|
698
|
+
}
|
|
699
|
+
};
|
|
700
|
+
const ensureRunningState = (stream = 'stdout', at = new Date().toISOString()) => {
|
|
701
|
+
if (runningMarked) return;
|
|
702
|
+
runningMarked = true;
|
|
703
|
+
const running = transitionActiveTurnLifecycle(root, turn.turn_id, 'running', { stream, at });
|
|
704
|
+
if (running.ok) {
|
|
705
|
+
state = running.state;
|
|
706
|
+
}
|
|
707
|
+
};
|
|
670
708
|
const cliResult = await dispatchLocalCli(root, state, config, {
|
|
671
709
|
signal: controller.signal,
|
|
710
|
+
onSpawnAttached: ({ pid, at }) => ensureStartingState(pid, at),
|
|
711
|
+
onFirstOutput: ({ at, stream }) => ensureRunningState(stream, at),
|
|
672
712
|
onStdout: shouldStream ? (text) => process.stdout.write(chalk.dim(text)) : undefined,
|
|
673
713
|
onStderr: shouldStream ? (text) => process.stderr.write(chalk.yellow(text)) : undefined,
|
|
674
714
|
verifyManifest: true,
|
|
@@ -714,6 +754,28 @@ export async function stepCommand(opts) {
|
|
|
714
754
|
process.exit(1);
|
|
715
755
|
}
|
|
716
756
|
|
|
757
|
+
if (cliResult.startupFailure) {
|
|
758
|
+
const freshState = loadProjectState(root, config) || state;
|
|
759
|
+
const failed = failTurnStartup(root, freshState, config, turn.turn_id, {
|
|
760
|
+
failure_type: cliResult.startupFailureType || 'no_subprocess_output',
|
|
761
|
+
threshold_ms: config?.run_loop?.startup_watchdog_ms ?? 30_000,
|
|
762
|
+
running_ms: freshState?.active_turns?.[turn.turn_id]?.started_at
|
|
763
|
+
? Math.max(0, Date.now() - new Date(freshState.active_turns[turn.turn_id].started_at).getTime())
|
|
764
|
+
: 0,
|
|
765
|
+
recommendation: `Turn ${turn.turn_id} failed to start within the startup watchdog window. Run \`agentxchain reissue-turn --turn ${turn.turn_id} --reason ghost\` to recover.`,
|
|
766
|
+
});
|
|
767
|
+
if (failed.ok) {
|
|
768
|
+
state = failed.state;
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
console.log('');
|
|
772
|
+
console.log(chalk.red(`Turn startup failed: ${cliResult.error}`));
|
|
773
|
+
console.log(chalk.dim('The turn was retained as failed_start. You can:'));
|
|
774
|
+
console.log(chalk.dim(` - Reissue immediately: agentxchain reissue-turn --turn ${turn.turn_id} --reason ghost`));
|
|
775
|
+
console.log(chalk.dim(' - Inspect status: agentxchain status'));
|
|
776
|
+
process.exit(1);
|
|
777
|
+
}
|
|
778
|
+
|
|
717
779
|
if (!cliResult.ok) {
|
|
718
780
|
const blocked = markRunBlocked(root, {
|
|
719
781
|
blockedOn: `dispatch:${cliResult.exitCode != null ? `exit-${cliResult.exitCode}` : 'subprocess_failed'}`,
|
|
@@ -744,6 +806,10 @@ export async function stepCommand(opts) {
|
|
|
744
806
|
process.exit(1);
|
|
745
807
|
}
|
|
746
808
|
|
|
809
|
+
if (!runningMarked) {
|
|
810
|
+
ensureRunningState('staged_result', cliResult.firstOutputAt || new Date().toISOString());
|
|
811
|
+
}
|
|
812
|
+
|
|
747
813
|
console.log(chalk.green('Subprocess completed. Staged result detected.'));
|
|
748
814
|
console.log('');
|
|
749
815
|
} else {
|
|
@@ -29,6 +29,7 @@
|
|
|
29
29
|
import { readFileSync, writeFileSync, existsSync, mkdirSync, rmSync } from 'fs';
|
|
30
30
|
import { join } from 'path';
|
|
31
31
|
import { evaluateTokenBudget, SYSTEM_PROMPT, SEPARATOR } from '../token-budget.js';
|
|
32
|
+
import { hasMinimumTurnResultShape } from '../turn-result-shape.js';
|
|
32
33
|
import {
|
|
33
34
|
getDispatchApiRequestPath,
|
|
34
35
|
getDispatchContextPath,
|
|
@@ -1072,6 +1073,13 @@ export async function dispatchApiProxy(root, state, config, options = {}) {
|
|
|
1072
1073
|
turnResult.cost = { ...aggregateUsage };
|
|
1073
1074
|
}
|
|
1074
1075
|
|
|
1076
|
+
if (!hasMinimumTurnResultShape(turnResult)) {
|
|
1077
|
+
return {
|
|
1078
|
+
ok: false,
|
|
1079
|
+
error: 'API response did not contain a valid turn result with the minimum governed turn-result fields',
|
|
1080
|
+
};
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1075
1083
|
// Stage the turn result
|
|
1076
1084
|
try {
|
|
1077
1085
|
writeFileSync(
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
*/
|
|
19
19
|
|
|
20
20
|
import { spawn } from 'child_process';
|
|
21
|
-
import { existsSync, readFileSync,
|
|
21
|
+
import { existsSync, readFileSync, mkdirSync, writeFileSync } from 'fs';
|
|
22
22
|
import { join } from 'path';
|
|
23
23
|
import {
|
|
24
24
|
getDispatchContextPath,
|
|
@@ -29,6 +29,16 @@ import {
|
|
|
29
29
|
getTurnStagingResultPath,
|
|
30
30
|
} from '../turn-paths.js';
|
|
31
31
|
import { verifyDispatchManifestForAdapter } from '../dispatch-manifest.js';
|
|
32
|
+
import { hasMeaningfulStagedResult } from '../staged-result-proof.js';
|
|
33
|
+
|
|
34
|
+
const DIAGNOSTIC_ENV_KEYS = [
|
|
35
|
+
'PATH',
|
|
36
|
+
'HOME',
|
|
37
|
+
'PWD',
|
|
38
|
+
'SHELL',
|
|
39
|
+
'TMPDIR',
|
|
40
|
+
'AGENTXCHAIN_TURN_ID',
|
|
41
|
+
];
|
|
32
42
|
|
|
33
43
|
/**
|
|
34
44
|
* Launch a local CLI subprocess for a governed turn.
|
|
@@ -37,7 +47,7 @@ import { verifyDispatchManifestForAdapter } from '../dispatch-manifest.js';
|
|
|
37
47
|
* passes them as the prompt to the configured CLI command.
|
|
38
48
|
*
|
|
39
49
|
* @param {string} root - project root directory
|
|
40
|
-
* @param {object} state - current governed state (must
|
|
50
|
+
* @param {object} state - current governed state (must expose an active turn via active_turns; current_turn is a non-enumerable compatibility alias re-attached on load, not a persisted schema field)
|
|
41
51
|
* @param {object} config - normalized config
|
|
42
52
|
* @param {object} [options]
|
|
43
53
|
* @param {AbortSignal} [options.signal] - abort signal for cancellation
|
|
@@ -48,7 +58,15 @@ import { verifyDispatchManifestForAdapter } from '../dispatch-manifest.js';
|
|
|
48
58
|
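* @returns {Promise<{ ok: boolean, exitCode?: number, timedOut?: boolean, aborted?: boolean, startupFailure?: boolean, startupFailureType?: string|null, startupWatchdogMs?: number, firstOutputAt?: string|null, error?: string, logs?: string[] }>}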
|
|
49
59
|
*/
|
|
50
60
|
export async function dispatchLocalCli(root, state, config, options = {}) {
|
|
51
|
-
const {
|
|
61
|
+
const {
|
|
62
|
+
signal,
|
|
63
|
+
onStdout,
|
|
64
|
+
onStderr,
|
|
65
|
+
onSpawnAttached,
|
|
66
|
+
onFirstOutput,
|
|
67
|
+
startupWatchdogMs = config?.run_loop?.startup_watchdog_ms ?? 30_000,
|
|
68
|
+
turnId,
|
|
69
|
+
} = options;
|
|
52
70
|
|
|
53
71
|
const turn = resolveTargetTurn(state, turnId);
|
|
54
72
|
if (!turn) {
|
|
@@ -103,6 +121,10 @@ export async function dispatchLocalCli(root, state, config, options = {}) {
|
|
|
103
121
|
|
|
104
122
|
// Capture logs for dispatch record
|
|
105
123
|
const logs = [];
|
|
124
|
+
const runtimeCwd = runtime.cwd ? join(root, runtime.cwd) : root;
|
|
125
|
+
const spawnEnv = { ...process.env, AGENTXCHAIN_TURN_ID: turn.turn_id };
|
|
126
|
+
const stdinBytes = transport === 'stdin' ? Buffer.byteLength(fullPrompt, 'utf8') : 0;
|
|
127
|
+
const diagnosticArgs = redactPromptArgs(args, fullPrompt, transport);
|
|
106
128
|
|
|
107
129
|
return new Promise((resolve) => {
|
|
108
130
|
if (signal?.aborted) {
|
|
@@ -112,37 +134,143 @@ export async function dispatchLocalCli(root, state, config, options = {}) {
|
|
|
112
134
|
|
|
113
135
|
let child;
|
|
114
136
|
try {
|
|
137
|
+
appendDiagnostic(logs, 'spawn_prepare', {
|
|
138
|
+
runtime_id: runtimeId,
|
|
139
|
+
turn_id: turn.turn_id,
|
|
140
|
+
command,
|
|
141
|
+
args: diagnosticArgs,
|
|
142
|
+
cwd: runtimeCwd,
|
|
143
|
+
prompt_transport: transport,
|
|
144
|
+
stdin_bytes: stdinBytes,
|
|
145
|
+
env: pickDiagnosticEnv(spawnEnv),
|
|
146
|
+
});
|
|
115
147
|
child = spawn(command, args, {
|
|
116
|
-
cwd:
|
|
148
|
+
cwd: runtimeCwd,
|
|
117
149
|
stdio: ['pipe', 'pipe', 'pipe'],
|
|
118
|
-
env:
|
|
150
|
+
env: spawnEnv,
|
|
119
151
|
});
|
|
120
152
|
} catch (err) {
|
|
121
|
-
|
|
153
|
+
appendDiagnostic(logs, 'spawn_error', normalizeDiagnosticError(err));
|
|
154
|
+
resolve({
|
|
155
|
+
ok: false,
|
|
156
|
+
startupFailure: true,
|
|
157
|
+
startupFailureType: 'runtime_spawn_failed',
|
|
158
|
+
error: `Failed to spawn "${command}": ${err.message}`,
|
|
159
|
+
logs,
|
|
160
|
+
});
|
|
122
161
|
return;
|
|
123
162
|
}
|
|
124
163
|
|
|
125
164
|
let settled = false;
|
|
165
|
+
let firstOutputAt = null;
|
|
166
|
+
let spawnConfirmedAt = null;
|
|
167
|
+
let spawnConfirmedAtMs = null;
|
|
168
|
+
let firstOutputLatencyMs = null;
|
|
169
|
+
let startupWatchdog = null;
|
|
170
|
+
let startupTimedOut = false;
|
|
171
|
+
let startupFailureType = null;
|
|
172
|
+
let stdoutBytes = 0;
|
|
173
|
+
let stderrBytes = 0;
|
|
174
|
+
|
|
126
175
|
const settle = (result) => {
|
|
127
176
|
if (settled) return;
|
|
128
177
|
settled = true;
|
|
129
178
|
resolve(result);
|
|
130
179
|
};
|
|
131
180
|
|
|
181
|
+
const clearStartupWatchdog = () => {
|
|
182
|
+
if (startupWatchdog) {
|
|
183
|
+
clearTimeout(startupWatchdog);
|
|
184
|
+
startupWatchdog = null;
|
|
185
|
+
}
|
|
186
|
+
};
|
|
187
|
+
|
|
188
|
+
const armStartupWatchdog = () => {
|
|
189
|
+
if (startupWatchdog || !(startupWatchdogMs > 0 && Number.isFinite(startupWatchdogMs))) {
|
|
190
|
+
return;
|
|
191
|
+
}
|
|
192
|
+
startupWatchdog = setTimeout(() => {
|
|
193
|
+
if (firstOutputAt || isStagedResultReady(join(root, getTurnStagingResultPath(turn.turn_id)))) {
|
|
194
|
+
return;
|
|
195
|
+
}
|
|
196
|
+
startupTimedOut = true;
|
|
197
|
+
startupFailureType = 'no_subprocess_output';
|
|
198
|
+
logs.push(`[adapter] Startup watchdog fired after ${Math.round(startupWatchdogMs / 1000)}s with no output.`);
|
|
199
|
+
appendDiagnostic(logs, 'startup_watchdog_fired', {
|
|
200
|
+
startup_watchdog_ms: startupWatchdogMs,
|
|
201
|
+
pid: child.pid ?? null,
|
|
202
|
+
spawn_confirmed_at: spawnConfirmedAt,
|
|
203
|
+
elapsed_since_spawn_ms: spawnConfirmedAtMs == null ? null : Math.max(0, Date.now() - spawnConfirmedAtMs),
|
|
204
|
+
});
|
|
205
|
+
try {
|
|
206
|
+
child.kill('SIGTERM');
|
|
207
|
+
} catch {}
|
|
208
|
+
}, startupWatchdogMs);
|
|
209
|
+
};
|
|
210
|
+
|
|
211
|
+
const recordFirstOutput = (stream) => {
|
|
212
|
+
if (firstOutputAt) return;
|
|
213
|
+
firstOutputAt = new Date().toISOString();
|
|
214
|
+
firstOutputLatencyMs = spawnConfirmedAtMs == null ? null : Math.max(0, Date.now() - spawnConfirmedAtMs);
|
|
215
|
+
clearStartupWatchdog();
|
|
216
|
+
appendDiagnostic(logs, 'first_output', {
|
|
217
|
+
at: firstOutputAt,
|
|
218
|
+
stream,
|
|
219
|
+
pid: child.pid ?? null,
|
|
220
|
+
startup_latency_ms: firstOutputLatencyMs,
|
|
221
|
+
});
|
|
222
|
+
if (onFirstOutput) {
|
|
223
|
+
try {
|
|
224
|
+
onFirstOutput({ pid: child.pid ?? null, at: firstOutputAt, stream });
|
|
225
|
+
} catch {}
|
|
226
|
+
}
|
|
227
|
+
};
|
|
228
|
+
|
|
229
|
+
child.once('spawn', () => {
|
|
230
|
+
spawnConfirmedAtMs = Date.now();
|
|
231
|
+
spawnConfirmedAt = new Date().toISOString();
|
|
232
|
+
appendDiagnostic(logs, 'spawn_attached', {
|
|
233
|
+
pid: child.pid ?? null,
|
|
234
|
+
at: spawnConfirmedAt,
|
|
235
|
+
startup_watchdog_ms: startupWatchdogMs,
|
|
236
|
+
});
|
|
237
|
+
if (onSpawnAttached) {
|
|
238
|
+
try {
|
|
239
|
+
onSpawnAttached({ pid: child.pid ?? null, at: spawnConfirmedAt });
|
|
240
|
+
} catch {}
|
|
241
|
+
}
|
|
242
|
+
armStartupWatchdog();
|
|
243
|
+
});
|
|
244
|
+
|
|
132
245
|
// Deliver prompt via stdin if transport is "stdin"; otherwise close immediately
|
|
133
246
|
if (child.stdin) {
|
|
247
|
+
child.stdin.on('error', (err) => {
|
|
248
|
+
appendDiagnostic(logs, 'stdin_error', {
|
|
249
|
+
at: new Date().toISOString(),
|
|
250
|
+
stdin_bytes: stdinBytes,
|
|
251
|
+
...normalizeDiagnosticError(err),
|
|
252
|
+
});
|
|
253
|
+
});
|
|
134
254
|
try {
|
|
135
255
|
if (transport === 'stdin') {
|
|
136
256
|
child.stdin.write(fullPrompt);
|
|
137
257
|
}
|
|
138
258
|
child.stdin.end();
|
|
139
|
-
} catch {
|
|
259
|
+
} catch (err) {
|
|
260
|
+
appendDiagnostic(logs, 'stdin_error', {
|
|
261
|
+
at: new Date().toISOString(),
|
|
262
|
+
stdin_bytes: stdinBytes,
|
|
263
|
+
...normalizeDiagnosticError(err),
|
|
264
|
+
});
|
|
265
|
+
}
|
|
140
266
|
}
|
|
141
267
|
|
|
142
268
|
// Collect stdout/stderr
|
|
143
269
|
if (child.stdout) {
|
|
144
270
|
child.stdout.on('data', (chunk) => {
|
|
145
271
|
const text = chunk.toString();
|
|
272
|
+
stdoutBytes += Buffer.byteLength(text);
|
|
273
|
+
recordFirstOutput('stdout');
|
|
146
274
|
logs.push(text);
|
|
147
275
|
if (onStdout) onStdout(text);
|
|
148
276
|
});
|
|
@@ -151,6 +279,8 @@ export async function dispatchLocalCli(root, state, config, options = {}) {
|
|
|
151
279
|
if (child.stderr) {
|
|
152
280
|
child.stderr.on('data', (chunk) => {
|
|
153
281
|
const text = chunk.toString();
|
|
282
|
+
stderrBytes += Buffer.byteLength(text);
|
|
283
|
+
recordFirstOutput('stderr');
|
|
154
284
|
logs.push('[stderr] ' + text);
|
|
155
285
|
if (onStderr) onStderr(text);
|
|
156
286
|
});
|
|
@@ -180,6 +310,7 @@ export async function dispatchLocalCli(root, state, config, options = {}) {
|
|
|
180
310
|
// Abort signal handling
|
|
181
311
|
const onAbort = () => {
|
|
182
312
|
logs.push('[adapter] Abort signal received. Sending SIGTERM.');
|
|
313
|
+
clearStartupWatchdog();
|
|
183
314
|
clearTimeout(timeoutHandle);
|
|
184
315
|
clearTimeout(sigkillHandle);
|
|
185
316
|
try {
|
|
@@ -197,6 +328,7 @@ export async function dispatchLocalCli(root, state, config, options = {}) {
|
|
|
197
328
|
|
|
198
329
|
// Process exit
|
|
199
330
|
child.on('close', (exitCode, killSignal) => {
|
|
331
|
+
clearStartupWatchdog();
|
|
200
332
|
clearTimeout(timeoutHandle);
|
|
201
333
|
clearTimeout(sigkillHandle);
|
|
202
334
|
if (signal) signal.removeEventListener('abort', onAbort);
|
|
@@ -210,17 +342,81 @@ export async function dispatchLocalCli(root, state, config, options = {}) {
|
|
|
210
342
|
|
|
211
343
|
// Check if staged result was written (regardless of exit code)
|
|
212
344
|
const hasResult = isStagedResultReady(join(root, getTurnStagingResultPath(turn.turn_id)));
|
|
345
|
+
if (hasResult && !firstOutputAt) {
|
|
346
|
+
recordFirstOutput('staged_result');
|
|
347
|
+
}
|
|
348
|
+
const exitDiagnostic = {
|
|
349
|
+
pid: child.pid ?? null,
|
|
350
|
+
exit_code: exitCode,
|
|
351
|
+
signal: killSignal,
|
|
352
|
+
spawn_confirmed_at: spawnConfirmedAt,
|
|
353
|
+
elapsed_since_spawn_ms: spawnConfirmedAtMs == null ? null : Math.max(0, Date.now() - spawnConfirmedAtMs),
|
|
354
|
+
first_output_at: firstOutputAt,
|
|
355
|
+
startup_latency_ms: firstOutputLatencyMs,
|
|
356
|
+
stdout_bytes: stdoutBytes,
|
|
357
|
+
stderr_bytes: stderrBytes,
|
|
358
|
+
staged_result_ready: hasResult,
|
|
359
|
+
};
|
|
360
|
+
if (startupTimedOut) {
|
|
361
|
+
exitDiagnostic.startup_failure_type = startupFailureType || 'no_subprocess_output';
|
|
362
|
+
} else if (!spawnConfirmedAt) {
|
|
363
|
+
exitDiagnostic.startup_failure_type = 'runtime_spawn_failed';
|
|
364
|
+
} else if (timedOut) {
|
|
365
|
+
exitDiagnostic.timed_out = true;
|
|
366
|
+
} else if (!firstOutputAt) {
|
|
367
|
+
exitDiagnostic.startup_failure_type = 'no_subprocess_output';
|
|
368
|
+
}
|
|
369
|
+
appendDiagnostic(logs, 'process_exit', exitDiagnostic);
|
|
213
370
|
|
|
214
371
|
if (hasResult) {
|
|
215
|
-
settle({ ok: true, exitCode, timedOut: false, aborted: false, logs });
|
|
372
|
+
settle({ ok: true, exitCode, timedOut: false, aborted: false, logs, firstOutputAt });
|
|
373
|
+
} else if (startupTimedOut) {
|
|
374
|
+
settle({
|
|
375
|
+
ok: false,
|
|
376
|
+
exitCode,
|
|
377
|
+
timedOut: false,
|
|
378
|
+
aborted: false,
|
|
379
|
+
startupFailure: true,
|
|
380
|
+
startupFailureType: startupFailureType || 'no_subprocess_output',
|
|
381
|
+
startupWatchdogMs,
|
|
382
|
+
firstOutputAt,
|
|
383
|
+
error: `Subprocess produced no output within ${Math.round(startupWatchdogMs / 1000)}s and did not stage a turn result.`,
|
|
384
|
+
logs,
|
|
385
|
+
});
|
|
386
|
+
} else if (!spawnConfirmedAt) {
|
|
387
|
+
settle({
|
|
388
|
+
ok: false,
|
|
389
|
+
exitCode,
|
|
390
|
+
timedOut: false,
|
|
391
|
+
aborted: false,
|
|
392
|
+
startupFailure: true,
|
|
393
|
+
startupFailureType: 'runtime_spawn_failed',
|
|
394
|
+
firstOutputAt,
|
|
395
|
+
error: `Subprocess exited (code ${exitCode}) before reporting a successful spawn or staging a turn result.`,
|
|
396
|
+
logs,
|
|
397
|
+
});
|
|
216
398
|
} else if (timedOut) {
|
|
217
399
|
settle({ ok: false, exitCode, timedOut: true, aborted: false, error: 'Turn timed out without producing a staged result.', logs });
|
|
400
|
+
} else if (!firstOutputAt) {
|
|
401
|
+
settle({
|
|
402
|
+
ok: false,
|
|
403
|
+
exitCode,
|
|
404
|
+
timedOut: false,
|
|
405
|
+
aborted: false,
|
|
406
|
+
startupFailure: true,
|
|
407
|
+
startupFailureType: 'no_subprocess_output',
|
|
408
|
+
startupWatchdogMs,
|
|
409
|
+
firstOutputAt,
|
|
410
|
+
error: `Subprocess exited (code ${exitCode}) before producing output or staging a turn result.`,
|
|
411
|
+
logs,
|
|
412
|
+
});
|
|
218
413
|
} else {
|
|
219
414
|
settle({
|
|
220
415
|
ok: false,
|
|
221
416
|
exitCode,
|
|
222
417
|
timedOut: false,
|
|
223
418
|
aborted: false,
|
|
419
|
+
firstOutputAt,
|
|
224
420
|
error: `Subprocess exited (code ${exitCode}) without writing a staged turn result to ${getTurnStagingResultPath(turn.turn_id)}.`,
|
|
225
421
|
logs,
|
|
226
422
|
});
|
|
@@ -228,10 +424,37 @@ export async function dispatchLocalCli(root, state, config, options = {}) {
|
|
|
228
424
|
});
|
|
229
425
|
|
|
230
426
|
child.on('error', (err) => {
|
|
427
|
+
clearStartupWatchdog();
|
|
231
428
|
clearTimeout(timeoutHandle);
|
|
232
429
|
clearTimeout(sigkillHandle);
|
|
233
430
|
if (signal) signal.removeEventListener('abort', onAbort);
|
|
234
|
-
|
|
431
|
+
// BUG-54 hypothesis #1 fix: explicitly release stdio streams on the
|
|
432
|
+
// error path so Node reclaims pipe handles immediately instead of
|
|
433
|
+
// waiting for GC. Without this, repeated `runtime_spawn_failed` turns
|
|
434
|
+
// leak ~4 handles per failure until the next GC sweep, which in a
|
|
435
|
+
// long-running `run --continuous` session can push the parent process
|
|
436
|
+
// toward its fd limit and cascade additional spawn failures.
|
|
437
|
+
try { child.stdin?.destroy(); } catch {}
|
|
438
|
+
try { child.stdout?.destroy(); } catch {}
|
|
439
|
+
try { child.stderr?.destroy(); } catch {}
|
|
440
|
+
appendDiagnostic(logs, 'spawn_error', {
|
|
441
|
+
pid: child.pid ?? null,
|
|
442
|
+
spawn_confirmed_at: spawnConfirmedAt,
|
|
443
|
+
elapsed_since_spawn_ms: spawnConfirmedAtMs == null ? null : Math.max(0, Date.now() - spawnConfirmedAtMs),
|
|
444
|
+
first_output_at: firstOutputAt,
|
|
445
|
+
startup_latency_ms: firstOutputLatencyMs,
|
|
446
|
+
stdout_bytes: stdoutBytes,
|
|
447
|
+
stderr_bytes: stderrBytes,
|
|
448
|
+
...normalizeDiagnosticError(err),
|
|
449
|
+
});
|
|
450
|
+
settle({
|
|
451
|
+
ok: false,
|
|
452
|
+
startupFailure: !firstOutputAt,
|
|
453
|
+
startupFailureType: !firstOutputAt ? 'runtime_spawn_failed' : null,
|
|
454
|
+
firstOutputAt,
|
|
455
|
+
error: `Subprocess error: ${err.message}`,
|
|
456
|
+
logs,
|
|
457
|
+
});
|
|
235
458
|
});
|
|
236
459
|
});
|
|
237
460
|
}
|
|
@@ -322,15 +545,13 @@ function resolvePromptTransport(runtime) {
|
|
|
322
545
|
|
|
323
546
|
/**
|
|
324
547
|
* Check if the staged result file exists and has meaningful content.
|
|
548
|
+
* Delegates to the shared `hasMeaningfulStagedResult` helper so watchdog,
|
|
549
|
+
* manual adapter, and local-cli adapter all agree on what counts as proof.
|
|
550
|
+
* Per DEC-BUG51-STAGING-PLACEHOLDER-NOT-PROOF-001, placeholders (`{}`, blank,
|
|
551
|
+
* whitespace-only, or `{}\n`) are cleanup artifacts, not evidence.
|
|
325
552
|
*/
|
|
326
553
|
function isStagedResultReady(filePath) {
|
|
327
|
-
|
|
328
|
-
if (!existsSync(filePath)) return false;
|
|
329
|
-
const stat = statSync(filePath);
|
|
330
|
-
return stat.size > 2; // Must be more than just "{}" or empty
|
|
331
|
-
} catch {
|
|
332
|
-
return false;
|
|
333
|
-
}
|
|
554
|
+
return hasMeaningfulStagedResult(filePath);
|
|
334
555
|
}
|
|
335
556
|
|
|
336
557
|
function resolveTargetTurn(state, turnId) {
|
|
@@ -340,4 +561,38 @@ function resolveTargetTurn(state, turnId) {
|
|
|
340
561
|
return state?.current_turn || Object.values(state?.active_turns || {})[0];
|
|
341
562
|
}
|
|
342
563
|
|
|
564
|
+
/**
 * Append one structured diagnostic record to the adapter log buffer.
 *
 * Each record is a single newline-terminated string made of the
 * `[adapter:diag]` prefix, the event label, and the payload serialized
 * with `JSON.stringify`.
 *
 * @param {string[]} logs - log buffer; mutated in place
 * @param {string} label - event name, e.g. `spawn_prepare` or `process_exit`
 * @param {*} payload - event details to serialize as JSON
 * @returns {void}
 */
function appendDiagnostic(logs, label, payload) {
  const record = `[adapter:diag] ${label} ${JSON.stringify(payload)}\n`;
  logs.push(record);
}
|
|
567
|
+
|
|
568
|
+
/**
 * Build the subset of an environment that is recorded in diagnostics.
 *
 * Only the names listed in `DIAGNOSTIC_ENV_KEYS` are considered, and a name
 * is kept only when its value is a non-empty string. Keys appear in the
 * result in the same order as in `DIAGNOSTIC_ENV_KEYS`.
 *
 * @param {object|null|undefined} env - environment map (e.g. the spawn env)
 * @returns {Object<string, string>} a new object holding the selected entries
 */
function pickDiagnosticEnv(env) {
  const selected = {};
  for (const name of DIAGNOSTIC_ENV_KEYS) {
    const value = env?.[name];
    if (typeof value !== 'string' || value.length === 0) {
      continue;
    }
    selected[name] = value;
  }
  return selected;
}
|
|
575
|
+
|
|
576
|
+
/**
 * Produce a log-safe copy of the spawn argument list with the prompt text
 * replaced by a `<prompt:N bytes>` placeholder (N = UTF-8 byte length).
 *
 * Redaction applies only when `transport` is `'argv'`; for any other
 * transport the arguments are returned unchanged (as a new array).
 * Non-string arguments are always passed through untouched.
 *
 * An argument that equals the prompt exactly becomes the placeholder. An
 * argument that merely contains the prompt (for example a `--flag=<prompt>`
 * form, should the command template build one) has each occurrence replaced,
 * so the prompt body is never written to diagnostics verbatim.
 *
 * @param {Array<*>} args - argv that will be passed to `spawn`
 * @param {string} fullPrompt - the complete prompt text
 * @param {string} transport - prompt transport name; only `'argv'` redacts
 * @returns {Array<*>} a new array; `args` itself is not modified
 */
function redactPromptArgs(args, fullPrompt, transport) {
  const promptPlaceholder = `<prompt:${Buffer.byteLength(fullPrompt, 'utf8')} bytes>`;
  const redactable = transport === 'argv';
  return args.map((arg) => {
    if (!redactable || typeof arg !== 'string') {
      return arg;
    }
    if (arg === fullPrompt) {
      return promptPlaceholder;
    }
    // An empty prompt is a substring of every string, so only search for
    // embedded occurrences when there is real text to hide.
    if (fullPrompt.length > 0 && arg.includes(fullPrompt)) {
      // Function replacer avoids any `$`-pattern interpretation.
      return arg.replaceAll(fullPrompt, () => promptPlaceholder);
    }
    return arg;
  });
}
|
|
588
|
+
|
|
589
|
+
/**
 * Reduce an error-like value to the plain fields recorded in diagnostics.
 *
 * `code`, `errno` and `syscall` are copied from the value when truthy and
 * are `null` otherwise. `message` is the value's `message` when truthy,
 * falling back to `String(err)`.
 *
 * @param {*} err - an Error, or any thrown value (may be null/undefined)
 * @returns {{ code: *, errno: *, syscall: *, message: string }}
 */
function normalizeDiagnosticError(err) {
  const source = err ?? {};
  const normalized = {};
  for (const field of ['code', 'errno', 'syscall']) {
    normalized[field] = source[field] || null;
  }
  normalized.message = source.message || String(err);
  return normalized;
}
|
|
597
|
+
|
|
343
598
|
export { resolvePromptTransport };
|