@agentmeshhq/agent 0.4.5 → 0.4.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/__tests__/auth-doctor-integration.test.d.ts +14 -0
- package/dist/__tests__/auth-doctor-integration.test.js +130 -0
- package/dist/__tests__/auth-doctor-integration.test.js.map +1 -0
- package/dist/__tests__/auth-guard.integration.test.d.ts +12 -0
- package/dist/__tests__/auth-guard.integration.test.js +132 -0
- package/dist/__tests__/auth-guard.integration.test.js.map +1 -0
- package/dist/__tests__/auth-guard.test.d.ts +17 -0
- package/dist/__tests__/auth-guard.test.js +483 -0
- package/dist/__tests__/auth-guard.test.js.map +1 -0
- package/dist/__tests__/done-state-guard.integration.test.d.ts +1 -0
- package/dist/__tests__/done-state-guard.integration.test.js +281 -0
- package/dist/__tests__/done-state-guard.integration.test.js.map +1 -0
- package/dist/__tests__/done-state-guard.test.d.ts +1 -0
- package/dist/__tests__/done-state-guard.test.js +327 -0
- package/dist/__tests__/done-state-guard.test.js.map +1 -0
- package/dist/__tests__/session-recovery.test.d.ts +1 -0
- package/dist/__tests__/session-recovery.test.js +16 -0
- package/dist/__tests__/session-recovery.test.js.map +1 -0
- package/dist/__tests__/tmux-runtime.test.d.ts +1 -0
- package/dist/__tests__/tmux-runtime.test.js +113 -0
- package/dist/__tests__/tmux-runtime.test.js.map +1 -0
- package/dist/cli/auth.d.ts +11 -0
- package/dist/cli/auth.js +92 -0
- package/dist/cli/auth.js.map +1 -0
- package/dist/cli/index.js +45 -1
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/local.d.ts +4 -2
- package/dist/cli/local.js +257 -108
- package/dist/cli/local.js.map +1 -1
- package/dist/cli/migrate.d.ts +1 -0
- package/dist/cli/migrate.js +14 -10
- package/dist/cli/migrate.js.map +1 -1
- package/dist/cli/start.d.ts +2 -0
- package/dist/cli/start.js +3 -0
- package/dist/cli/start.js.map +1 -1
- package/dist/cli/test.d.ts +1 -0
- package/dist/cli/test.js +15 -9
- package/dist/cli/test.js.map +1 -1
- package/dist/config/schema.d.ts +11 -0
- package/dist/config/schema.js.map +1 -1
- package/dist/core/auth-guard.d.ts +155 -0
- package/dist/core/auth-guard.js +498 -0
- package/dist/core/auth-guard.js.map +1 -0
- package/dist/core/auth-sync.d.ts +105 -0
- package/dist/core/auth-sync.js +263 -0
- package/dist/core/auth-sync.js.map +1 -0
- package/dist/core/daemon/context-template.js +65 -0
- package/dist/core/daemon/context-template.js.map +1 -1
- package/dist/core/daemon/done-state-guard.d.ts +63 -0
- package/dist/core/daemon/done-state-guard.js +102 -0
- package/dist/core/daemon/done-state-guard.js.map +1 -0
- package/dist/core/daemon/session-recovery.d.ts +1 -0
- package/dist/core/daemon/session-recovery.js +7 -0
- package/dist/core/daemon/session-recovery.js.map +1 -0
- package/dist/core/daemon/tmux-session.d.ts +1 -0
- package/dist/core/daemon/tmux-session.js +1 -1
- package/dist/core/daemon/tmux-session.js.map +1 -1
- package/dist/core/daemon.d.ts +18 -1
- package/dist/core/daemon.js +220 -35
- package/dist/core/daemon.js.map +1 -1
- package/dist/core/registry.d.ts +9 -1
- package/dist/core/registry.js +28 -1
- package/dist/core/registry.js.map +1 -1
- package/dist/core/tmux-runtime.d.ts +11 -2
- package/dist/core/tmux-runtime.js +45 -19
- package/dist/core/tmux-runtime.js.map +1 -1
- package/dist/core/tmux.d.ts +1 -1
- package/dist/core/tmux.js +7 -3
- package/dist/core/tmux.js.map +1 -1
- package/package.json +12 -11
- package/LICENSE +0 -21
package/dist/core/daemon.d.ts
CHANGED
|
@@ -28,6 +28,8 @@ export interface DaemonOptions {
|
|
|
28
28
|
role?: string;
|
|
29
29
|
/** Auto-accept pending handoffs in worker mode (default: enabled for --worker) */
|
|
30
30
|
autoAcceptHandoffs?: boolean;
|
|
31
|
+
/** Run agent in fully autonomous mode — injects runtime-specific non-interactive flags */
|
|
32
|
+
autonomous?: boolean;
|
|
31
33
|
}
|
|
32
34
|
export declare class AgentDaemon {
|
|
33
35
|
private agentName;
|
|
@@ -55,16 +57,20 @@ export declare class AgentDaemon {
|
|
|
55
57
|
private projectCode;
|
|
56
58
|
private projectRole;
|
|
57
59
|
private autoAcceptHandoffs;
|
|
60
|
+
private autonomous;
|
|
58
61
|
private healthCheckInterval;
|
|
59
62
|
private stopCleanupScheduler;
|
|
63
|
+
private authHealthWatcher;
|
|
60
64
|
private _preStartSessionId;
|
|
61
65
|
private _attemptedResumeSessionId;
|
|
62
66
|
private stuckSince;
|
|
63
|
-
private nudgeSentAt;
|
|
64
67
|
private lastPendingHandoffAlertAt;
|
|
65
68
|
private remoteAutomationPaused;
|
|
66
69
|
private lastAutonomyPolicyFetchAt;
|
|
67
70
|
private pendingClaimCreations;
|
|
71
|
+
private sessionRecoveryAttempts;
|
|
72
|
+
private lastSessionRecoveryAt;
|
|
73
|
+
private initialInboxCheckComplete;
|
|
68
74
|
constructor(options: DaemonOptions);
|
|
69
75
|
start(): Promise<void>;
|
|
70
76
|
/**
|
|
@@ -74,6 +80,7 @@ export declare class AgentDaemon {
|
|
|
74
80
|
private autoAcceptPendingHandoffs;
|
|
75
81
|
private autoAcceptHandoffFromEvent;
|
|
76
82
|
private isAutomationPaused;
|
|
83
|
+
private sweepInboxOnWebSocketConnect;
|
|
77
84
|
private refreshRemoteAutonomyPolicy;
|
|
78
85
|
private acceptHandoffWithRetry;
|
|
79
86
|
private checkPendingHandoffSla;
|
|
@@ -88,6 +95,7 @@ export declare class AgentDaemon {
|
|
|
88
95
|
* Handles session death - logs crash and attempts auto-restart
|
|
89
96
|
*/
|
|
90
97
|
private handleSessionDeath;
|
|
98
|
+
private tryRecoverSession;
|
|
91
99
|
/**
|
|
92
100
|
* Handles stuck agent - sends nudge first, then restarts if still stuck
|
|
93
101
|
*/
|
|
@@ -115,6 +123,15 @@ export declare class AgentDaemon {
|
|
|
115
123
|
* Resolves workdir from --project flag: looks up project by code, clones repo, self-assigns.
|
|
116
124
|
*/
|
|
117
125
|
private resolveProjectWorkdir;
|
|
126
|
+
/**
|
|
127
|
+
* Evaluates whether this restart should resume in-flight work or come up idle.
|
|
128
|
+
*
|
|
129
|
+
* Pulls claims, inbox, and recent handoff history from HQ, then delegates to
|
|
130
|
+
* the pure `evaluateRestartState` function for the actual decision.
|
|
131
|
+
*
|
|
132
|
+
* Failures are non-fatal — defaults to `idle` so we fail safe.
|
|
133
|
+
*/
|
|
134
|
+
private evaluateDoneStateGuard;
|
|
118
135
|
/**
|
|
119
136
|
* Fetches assignments from HQ and validates workdir setup
|
|
120
137
|
* Uses project.workdir from HQ as source of truth, falls back to helpful instructions
|
package/dist/core/daemon.js
CHANGED
|
@@ -4,21 +4,24 @@ import os from "node:os";
|
|
|
4
4
|
import path from "node:path";
|
|
5
5
|
import { getAgentState, loadState, updateAgentInState } from "../config/loader.js";
|
|
6
6
|
import { loadContext, loadOrCreateContext, saveContext } from "../context/index.js";
|
|
7
|
+
import { preflightAgentAuth, startAuthHealthWatcher, } from "./auth-guard.js";
|
|
7
8
|
import { startCleanupScheduler } from "./cleanup/scheduler.js";
|
|
8
9
|
import { renderMissingWorkdirMessage } from "./daemon/assignment-message.js";
|
|
9
10
|
import { bootstrapDaemon } from "./daemon/bootstrap.js";
|
|
10
11
|
import { removeClaudeMd, writeClaudeMd } from "./daemon/context-template.js";
|
|
11
12
|
import { formatCrashLog } from "./daemon/crash-log.js";
|
|
13
|
+
import { evaluateRestartState, filterActiveClaimsForAgent, filterCompletedHandoffsForAgent, formatRestartLifecycleLog, } from "./daemon/done-state-guard.js";
|
|
12
14
|
import { cleanupGitAuth, setupGitAuth } from "./daemon/git-auth.js";
|
|
13
|
-
import {
|
|
15
|
+
import { getStuckDetail } from "./daemon/health-policy.js";
|
|
14
16
|
import { writeSandboxOpencodeConfig } from "./daemon/sandbox-config.js";
|
|
17
|
+
import { isRecoverableSessionFailure } from "./daemon/session-recovery.js";
|
|
15
18
|
import { captureAgentChildPids, persistRunningState } from "./daemon/state.js";
|
|
16
19
|
import { startTmuxRuntimeSession } from "./daemon/tmux-session.js";
|
|
17
20
|
import { configureGitIdentity, setupWorkspace, updateWorkspaceFromRemote, validatePushAccess, } from "./daemon/workspace.js";
|
|
18
21
|
import { findPendingHandoffBreaches } from "./handoff-sla.js";
|
|
19
22
|
import { Heartbeat } from "./heartbeat.js";
|
|
20
|
-
import { handleWebSocketEvent, injectOnboardMessage, injectRestoredContext, injectStartupMessage, } from "./injector.js";
|
|
21
|
-
import { checkInbox, createClaim, createSelfAssignment, fetchAssignments, fetchOnboard, fetchProjectByCode, getAgentAutonomyState, getHandoff, listClaims, registerAgent, releaseClaim, updateHandoffStatusWithRetry, } from "./registry.js";
|
|
23
|
+
import { handleWebSocketEvent, injectInboxItems, injectOnboardMessage, injectRestoredContext, injectStartupMessage, } from "./injector.js";
|
|
24
|
+
import { checkInbox, createClaim, createSelfAssignment, fetchAssignments, fetchHandoffsForAgent, fetchOnboard, fetchProjectByCode, getAgentAutonomyState, getHandoff, listClaims, registerAgent, releaseClaim, updateHandoffStatusWithRetry, } from "./registry.js";
|
|
22
25
|
import { getRunnerDisplayName } from "./runner.js";
|
|
23
26
|
import { DockerSandbox } from "./sandbox.js";
|
|
24
27
|
import { getLatestSessionId, waitForNewSessionId } from "./session-id.js";
|
|
@@ -26,10 +29,11 @@ import { captureSessionContext, captureSessionOutput, destroySession, isSessionH
|
|
|
26
29
|
import { prepareOpenCodeRuntime } from "./tmux-runtime.js";
|
|
27
30
|
import { checkAgentProgress, cleanupOrphanContainers, isProcessRunning, sendNudge, } from "./watchdog.js";
|
|
28
31
|
import { AgentWebSocket } from "./websocket.js";
|
|
29
|
-
//
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
const
|
|
32
|
+
// SLA breach alert thresholds — configurable via env vars
|
|
33
|
+
// AGENTMESH_HANDOFF_SLA_MINUTES: minutes before a pending handoff is considered a breach (default 5)
|
|
34
|
+
// AGENTMESH_HANDOFF_SLA_COOLDOWN_MS: ms between repeated SLA alerts for the same breach (default 5 min)
|
|
35
|
+
const PENDING_HANDOFF_SLA_MINUTES = Number(process.env.AGENTMESH_HANDOFF_SLA_MINUTES ?? 5);
|
|
36
|
+
const PENDING_HANDOFF_ALERT_COOLDOWN_MS = Number(process.env.AGENTMESH_HANDOFF_SLA_COOLDOWN_MS ?? 5 * 60 * 1000);
|
|
33
37
|
const AUTO_CLAIM_SCOPE_PREFIX = "handoff:";
|
|
34
38
|
const AUTO_CLAIM_TTL_SECONDS = 1800;
|
|
35
39
|
// Path to the sandbox OpenCode config (permissive permissions)
|
|
@@ -65,18 +69,22 @@ export class AgentDaemon {
|
|
|
65
69
|
projectCode;
|
|
66
70
|
projectRole;
|
|
67
71
|
autoAcceptHandoffs;
|
|
72
|
+
autonomous;
|
|
68
73
|
healthCheckInterval = null;
|
|
69
74
|
stopCleanupScheduler = null;
|
|
75
|
+
authHealthWatcher = null;
|
|
70
76
|
// Session resume tracking
|
|
71
77
|
_preStartSessionId;
|
|
72
78
|
_attemptedResumeSessionId;
|
|
73
79
|
// Stuck detection tracking
|
|
74
80
|
stuckSince = null;
|
|
75
|
-
nudgeSentAt = null;
|
|
76
81
|
lastPendingHandoffAlertAt = null;
|
|
77
82
|
remoteAutomationPaused = false;
|
|
78
83
|
lastAutonomyPolicyFetchAt = null;
|
|
79
84
|
pendingClaimCreations = new Set();
|
|
85
|
+
sessionRecoveryAttempts = 0;
|
|
86
|
+
lastSessionRecoveryAt = null;
|
|
87
|
+
initialInboxCheckComplete = false;
|
|
80
88
|
constructor(options) {
|
|
81
89
|
const boot = bootstrapDaemon(options);
|
|
82
90
|
this.config = boot.config;
|
|
@@ -94,6 +102,7 @@ export class AgentDaemon {
|
|
|
94
102
|
this.projectCode = boot.projectCode;
|
|
95
103
|
this.projectRole = boot.projectRole;
|
|
96
104
|
this.autoAcceptHandoffs = boot.autoAcceptHandoffs;
|
|
105
|
+
this.autonomous = options.autonomous ?? false;
|
|
97
106
|
this.runnerConfig = boot.runnerConfig;
|
|
98
107
|
const runnerName = getRunnerDisplayName(this.runnerConfig.type);
|
|
99
108
|
console.log(`Runner: ${runnerName}`);
|
|
@@ -130,6 +139,13 @@ export class AgentDaemon {
|
|
|
130
139
|
// Register with hub first (needed for assignment check)
|
|
131
140
|
console.log("Registering with AgentMesh hub...");
|
|
132
141
|
console.log(`Existing state: ${existingState ? `agentId=${existingState.agentId}` : "none"}`);
|
|
142
|
+
// Derive agent_type from runtime flags when not explicitly set in config.
|
|
143
|
+
// - explicit agentConfig.agentType always wins
|
|
144
|
+
// - --worker → "worker" (requires team_id on hub)
|
|
145
|
+
// - --autonomous (no --worker) → "autonomous" (standalone, visible, no team needed)
|
|
146
|
+
// - neither → "system" (hidden background agent)
|
|
147
|
+
const effectiveAgentType = this.agentConfig.agentType ??
|
|
148
|
+
(this.isWorkerAgent ? "worker" : this.autonomous ? "autonomous" : "system");
|
|
133
149
|
const registration = await registerAgent({
|
|
134
150
|
url: this.config.hubUrl,
|
|
135
151
|
apiKey: this.config.apiKey,
|
|
@@ -138,6 +154,7 @@ export class AgentDaemon {
|
|
|
138
154
|
agentName: this.agentName,
|
|
139
155
|
model: this.agentConfig.model || this.config.defaults.model,
|
|
140
156
|
restoreContext: this.shouldRestoreContext,
|
|
157
|
+
agentType: effectiveAgentType,
|
|
141
158
|
});
|
|
142
159
|
this.agentId = registration.agentId;
|
|
143
160
|
this.token = registration.token;
|
|
@@ -214,6 +231,17 @@ export class AgentDaemon {
|
|
|
214
231
|
`Use --serve-port to specify a different port.`);
|
|
215
232
|
}
|
|
216
233
|
}
|
|
234
|
+
// Preflight: ensure per-agent auth symlink is valid before launching runner (Epic #470)
|
|
235
|
+
if (this.runnerConfig.type === "opencode") {
|
|
236
|
+
const { ok, result } = preflightAgentAuth(this.agentName);
|
|
237
|
+
if (!ok) {
|
|
238
|
+
console.warn(`[AUTH] Startup preflight failed for ${this.agentName}: ${result.message}`);
|
|
239
|
+
console.warn("[AUTH] Agent may fail provider calls. Run: agentmesh auth doctor --repair");
|
|
240
|
+
}
|
|
241
|
+
else if (result.status === "repaired") {
|
|
242
|
+
console.log(`[AUTH] Auth repaired at startup: ${result.message}`);
|
|
243
|
+
}
|
|
244
|
+
}
|
|
217
245
|
// Choose runtime mode: sandbox > serve > tmux
|
|
218
246
|
if (this.sandboxMode) {
|
|
219
247
|
await this.startSandboxMode();
|
|
@@ -229,6 +257,7 @@ export class AgentDaemon {
|
|
|
229
257
|
workdir: this.agentConfig.workdir,
|
|
230
258
|
runnerEnv: this.runnerConfig.env,
|
|
231
259
|
shouldRestoreContext: this.shouldRestoreContext,
|
|
260
|
+
autonomous: this.autonomous,
|
|
232
261
|
});
|
|
233
262
|
this._preStartSessionId = sessionStart.preStartSessionId;
|
|
234
263
|
this._attemptedResumeSessionId = sessionStart.attemptedResumeSessionId;
|
|
@@ -304,6 +333,7 @@ export class AgentDaemon {
|
|
|
304
333
|
},
|
|
305
334
|
onConnect: () => {
|
|
306
335
|
console.log("WebSocket reconnected with new token");
|
|
336
|
+
void this.sweepInboxOnWebSocketConnect();
|
|
307
337
|
},
|
|
308
338
|
onDisconnect: () => {
|
|
309
339
|
console.log("WebSocket disconnected");
|
|
@@ -333,6 +363,7 @@ export class AgentDaemon {
|
|
|
333
363
|
},
|
|
334
364
|
onConnect: () => {
|
|
335
365
|
console.log("WebSocket connected");
|
|
366
|
+
void this.sweepInboxOnWebSocketConnect();
|
|
336
367
|
},
|
|
337
368
|
onDisconnect: () => {
|
|
338
369
|
console.log("WebSocket disconnected");
|
|
@@ -345,17 +376,40 @@ export class AgentDaemon {
|
|
|
345
376
|
// Wait for TUI to initialize before injecting messages
|
|
346
377
|
await new Promise((resolve) => setTimeout(resolve, 3000));
|
|
347
378
|
await this.refreshRemoteAutonomyPolicy(true);
|
|
379
|
+
// -----------------------------------------------------------------------
|
|
380
|
+
// Done-state guard (Epic #497): determine restart state before injecting
|
|
381
|
+
// any work. If prior cycle is done, come up idle and skip auto-accept.
|
|
382
|
+
// -----------------------------------------------------------------------
|
|
383
|
+
const restartDecision = await this.evaluateDoneStateGuard();
|
|
384
|
+
console.log(formatRestartLifecycleLog(restartDecision));
|
|
385
|
+
updateAgentInState(this.agentName, {
|
|
386
|
+
lastRestartState: restartDecision.state,
|
|
387
|
+
lastRestartReason: restartDecision.reason,
|
|
388
|
+
lastRestartDecisionAt: new Date().toISOString(),
|
|
389
|
+
});
|
|
348
390
|
// Check inbox and auto-nudge with full handoff details
|
|
349
391
|
console.log("Checking inbox...");
|
|
350
392
|
try {
|
|
351
393
|
const inboxItems = await checkInbox(this.config.hubUrl, this.config.workspace, this.token);
|
|
352
|
-
|
|
353
|
-
|
|
394
|
+
// If the done-state guard says prior work is done, do NOT auto-accept inbox
|
|
395
|
+
// items from the stale cycle — come up idle and wait for a fresh handoff.
|
|
396
|
+
if (restartDecision.state === "idle" || restartDecision.state === "blocked") {
|
|
397
|
+
console.log(`[RESTART] Skipping auto-accept: agent is ${restartDecision.state}. ` +
|
|
398
|
+
"Any inbox items will be surfaced but not auto-claimed.");
|
|
399
|
+
injectStartupMessage(this.agentName, inboxItems.length, inboxItems);
|
|
400
|
+
}
|
|
401
|
+
else {
|
|
402
|
+
const remainingItems = await this.autoAcceptPendingHandoffs(inboxItems);
|
|
403
|
+
injectStartupMessage(this.agentName, remainingItems.length, remainingItems);
|
|
404
|
+
}
|
|
354
405
|
}
|
|
355
406
|
catch (error) {
|
|
356
407
|
console.error("Failed to check inbox:", error);
|
|
357
408
|
injectStartupMessage(this.agentName, 0);
|
|
358
409
|
}
|
|
410
|
+
finally {
|
|
411
|
+
this.initialInboxCheckComplete = true;
|
|
412
|
+
}
|
|
359
413
|
// Inject onboard project context
|
|
360
414
|
if (this.onboardData?.project) {
|
|
361
415
|
await new Promise((resolve) => setTimeout(resolve, 1000));
|
|
@@ -548,6 +602,21 @@ Nudge agent:
|
|
|
548
602
|
const state = getAgentState(this.agentName);
|
|
549
603
|
return state?.automationPaused === true || this.remoteAutomationPaused;
|
|
550
604
|
}
|
|
605
|
+
async sweepInboxOnWebSocketConnect() {
|
|
606
|
+
if (!this.token || !this.initialInboxCheckComplete) {
|
|
607
|
+
return;
|
|
608
|
+
}
|
|
609
|
+
try {
|
|
610
|
+
const inboxItems = await checkInbox(this.config.hubUrl, this.config.workspace, this.token);
|
|
611
|
+
const remainingItems = await this.autoAcceptPendingHandoffs(inboxItems);
|
|
612
|
+
if (remainingItems.length > 0) {
|
|
613
|
+
injectInboxItems(this.agentName, remainingItems);
|
|
614
|
+
}
|
|
615
|
+
}
|
|
616
|
+
catch (error) {
|
|
617
|
+
console.warn(`[WS] Failed inbox sweep on connect: ${error.message}`);
|
|
618
|
+
}
|
|
619
|
+
}
|
|
551
620
|
async refreshRemoteAutonomyPolicy(force = false) {
|
|
552
621
|
if (!this.token || !this.agentId) {
|
|
553
622
|
return;
|
|
@@ -714,6 +783,19 @@ Nudge agent:
|
|
|
714
783
|
// Skip health monitoring for serve mode (no tmux session)
|
|
715
784
|
if (this.serveMode)
|
|
716
785
|
return;
|
|
786
|
+
// Start periodic auth healthcheck for opencode runners (Epic #470)
|
|
787
|
+
if (this.runnerConfig.type === "opencode") {
|
|
788
|
+
this.authHealthWatcher = startAuthHealthWatcher(this.agentName, (event) => {
|
|
789
|
+
if (event.type === "auth-health-degraded") {
|
|
790
|
+
console.warn(`[AUTH] ${event.message}`);
|
|
791
|
+
console.warn("[AUTH] Run: agentmesh auth doctor --repair");
|
|
792
|
+
}
|
|
793
|
+
else if (event.type === "auth-health-repaired") {
|
|
794
|
+
console.log(`[AUTH] ${event.message}`);
|
|
795
|
+
}
|
|
796
|
+
// auth-health-ok is silent to avoid log noise
|
|
797
|
+
});
|
|
798
|
+
}
|
|
717
799
|
const logDir = path.join(os.homedir(), ".agentmesh", "logs");
|
|
718
800
|
if (!fs.existsSync(logDir)) {
|
|
719
801
|
fs.mkdirSync(logDir, { recursive: true });
|
|
@@ -730,6 +812,11 @@ Nudge agent:
|
|
|
730
812
|
await this.handleSessionDeath(health.reason || "unknown", logDir);
|
|
731
813
|
return;
|
|
732
814
|
}
|
|
815
|
+
// Healthy again - clear recovery counters
|
|
816
|
+
if (this.sessionRecoveryAttempts > 0) {
|
|
817
|
+
this.sessionRecoveryAttempts = 0;
|
|
818
|
+
this.lastSessionRecoveryAt = null;
|
|
819
|
+
}
|
|
733
820
|
// Session is alive - check progress watchdog
|
|
734
821
|
const progress = checkAgentProgress(this.agentName, containerName);
|
|
735
822
|
if (progress.status === "waiting_for_human") {
|
|
@@ -737,7 +824,6 @@ Nudge agent:
|
|
|
737
824
|
if (this.stuckSince) {
|
|
738
825
|
// Clear any prior stuck tracking since the agent signalled a legitimate wait
|
|
739
826
|
this.stuckSince = null;
|
|
740
|
-
this.nudgeSentAt = null;
|
|
741
827
|
updateAgentInState(this.agentName, { stuckSince: undefined, status: "waiting" });
|
|
742
828
|
}
|
|
743
829
|
console.log(`[HEALTH] Agent is waiting for human input: ${progress.details}`);
|
|
@@ -750,7 +836,6 @@ Nudge agent:
|
|
|
750
836
|
if (this.stuckSince) {
|
|
751
837
|
console.log(`[HEALTH] Agent resumed activity`);
|
|
752
838
|
this.stuckSince = null;
|
|
753
|
-
this.nudgeSentAt = null;
|
|
754
839
|
updateAgentInState(this.agentName, { stuckSince: undefined, status: "running" });
|
|
755
840
|
}
|
|
756
841
|
}
|
|
@@ -781,6 +866,15 @@ Nudge agent:
|
|
|
781
866
|
lastOutput,
|
|
782
867
|
});
|
|
783
868
|
fs.appendFileSync(logFile, crashLog);
|
|
869
|
+
// Recoverable local tmux failures should self-heal in worker mode.
|
|
870
|
+
const recovered = await this.tryRecoverSession(reason);
|
|
871
|
+
if (recovered) {
|
|
872
|
+
console.warn(`[RECOVERY] Session recovered after "${reason}"`);
|
|
873
|
+
updateAgentInState(this.agentName, {
|
|
874
|
+
status: "running",
|
|
875
|
+
});
|
|
876
|
+
return;
|
|
877
|
+
}
|
|
784
878
|
// Save context before marking as failed
|
|
785
879
|
if (this.agentId) {
|
|
786
880
|
this.saveAgentContext();
|
|
@@ -799,6 +893,71 @@ Nudge agent:
|
|
|
799
893
|
this.healthCheckInterval = null;
|
|
800
894
|
}
|
|
801
895
|
}
|
|
896
|
+
async tryRecoverSession(reason) {
|
|
897
|
+
if (!this.isWorkerAgent || this.serveMode || this.sandboxMode) {
|
|
898
|
+
return false;
|
|
899
|
+
}
|
|
900
|
+
if (!isRecoverableSessionFailure(reason)) {
|
|
901
|
+
return false;
|
|
902
|
+
}
|
|
903
|
+
const now = Date.now();
|
|
904
|
+
if (this.lastSessionRecoveryAt &&
|
|
905
|
+
now - this.lastSessionRecoveryAt.getTime() < 15_000 &&
|
|
906
|
+
this.sessionRecoveryAttempts >= 2) {
|
|
907
|
+
return false;
|
|
908
|
+
}
|
|
909
|
+
this.sessionRecoveryAttempts += 1;
|
|
910
|
+
this.lastSessionRecoveryAt = new Date(now);
|
|
911
|
+
try {
|
|
912
|
+
console.warn(`[RECOVERY] Attempt ${this.sessionRecoveryAttempts}: recreating session for ${this.agentName}`);
|
|
913
|
+
const sessionStart = startTmuxRuntimeSession({
|
|
914
|
+
agentName: this.agentName,
|
|
915
|
+
agentId: this.agentId,
|
|
916
|
+
command: this.agentConfig.command,
|
|
917
|
+
workdir: this.agentConfig.workdir,
|
|
918
|
+
runnerEnv: this.runnerConfig.env,
|
|
919
|
+
shouldRestoreContext: false,
|
|
920
|
+
autonomous: this.autonomous,
|
|
921
|
+
});
|
|
922
|
+
this._preStartSessionId = sessionStart.preStartSessionId;
|
|
923
|
+
this._attemptedResumeSessionId = sessionStart.attemptedResumeSessionId;
|
|
924
|
+
if (this.token && this.agentId) {
|
|
925
|
+
updateSessionEnvironment(this.agentName, {
|
|
926
|
+
AGENT_TOKEN: this.token,
|
|
927
|
+
AGENTMESH_AGENT_ID: this.agentId,
|
|
928
|
+
});
|
|
929
|
+
}
|
|
930
|
+
await new Promise((resolve) => setTimeout(resolve, 1500));
|
|
931
|
+
const health = isSessionHealthy(this.agentName);
|
|
932
|
+
if (!health.healthy) {
|
|
933
|
+
return false;
|
|
934
|
+
}
|
|
935
|
+
if (this.token) {
|
|
936
|
+
// Re-evaluate done-state guard on session recovery (Epic #497)
|
|
937
|
+
const recoveryDecision = await this.evaluateDoneStateGuard();
|
|
938
|
+
console.log(`[RECOVERY] ${formatRestartLifecycleLog(recoveryDecision)}`);
|
|
939
|
+
updateAgentInState(this.agentName, {
|
|
940
|
+
lastRestartState: recoveryDecision.state,
|
|
941
|
+
lastRestartReason: recoveryDecision.reason,
|
|
942
|
+
lastRestartDecisionAt: new Date().toISOString(),
|
|
943
|
+
});
|
|
944
|
+
const inboxItems = await checkInbox(this.config.hubUrl, this.config.workspace, this.token);
|
|
945
|
+
if (recoveryDecision.state === "idle" || recoveryDecision.state === "blocked") {
|
|
946
|
+
console.log(`[RECOVERY] Prior work done — coming up ${recoveryDecision.state}, not auto-resuming.`);
|
|
947
|
+
injectStartupMessage(this.agentName, inboxItems.length, inboxItems);
|
|
948
|
+
}
|
|
949
|
+
else {
|
|
950
|
+
const remainingItems = await this.autoAcceptPendingHandoffs(inboxItems);
|
|
951
|
+
injectStartupMessage(this.agentName, remainingItems.length, remainingItems);
|
|
952
|
+
}
|
|
953
|
+
}
|
|
954
|
+
return true;
|
|
955
|
+
}
|
|
956
|
+
catch (error) {
|
|
957
|
+
console.warn(`[RECOVERY] Session recovery failed: ${error.message}`);
|
|
958
|
+
return false;
|
|
959
|
+
}
|
|
960
|
+
}
|
|
802
961
|
/**
|
|
803
962
|
* Handles stuck agent - sends nudge first, then restarts if still stuck
|
|
804
963
|
*/
|
|
@@ -813,34 +972,15 @@ Nudge agent:
|
|
|
813
972
|
status: "stuck",
|
|
814
973
|
});
|
|
815
974
|
}
|
|
816
|
-
//
|
|
975
|
+
// Worker agents: log the stuck state but do not auto-nudge.
|
|
976
|
+
// Auto-nudging interrupts agents mid-task and causes more harm than good.
|
|
977
|
+
// Operators can nudge manually via CLI or the hub API if needed.
|
|
817
978
|
if (this.isWorkerAgent) {
|
|
818
|
-
|
|
819
|
-
if (!this.nudgeSentAt) {
|
|
820
|
-
console.log(`[HEALTH] Sending nudge to worker agent...`);
|
|
821
|
-
const nudgeMessage = getNudgeMessage(progress);
|
|
822
|
-
const sent = sendNudge(this.agentName, nudgeMessage);
|
|
823
|
-
if (sent) {
|
|
824
|
-
this.nudgeSentAt = now;
|
|
825
|
-
console.log(`[HEALTH] Nudge sent successfully`);
|
|
826
|
-
}
|
|
827
|
-
else {
|
|
828
|
-
console.log(`[HEALTH] Failed to send nudge`);
|
|
829
|
-
}
|
|
830
|
-
return;
|
|
831
|
-
}
|
|
832
|
-
// Check if enough time has passed since nudge
|
|
833
|
-
if (isWithinNudgeWaitWindow(this.nudgeSentAt, NUDGE_WAIT_MS, now)) {
|
|
834
|
-
// Still waiting for agent to respond to nudge
|
|
835
|
-
return;
|
|
836
|
-
}
|
|
837
|
-
// Nudge grace period expired — log warning but do NOT restart
|
|
838
|
-
console.log(`[HEALTH] Agent still stuck after nudge. Manual intervention required.`);
|
|
979
|
+
console.log(`[HEALTH] Worker agent stuck — manual intervention required if needed.`);
|
|
839
980
|
updateAgentInState(this.agentName, {
|
|
840
981
|
status: "waiting",
|
|
841
982
|
});
|
|
842
983
|
void this.releaseAllAutoClaims("worker waiting for human intervention");
|
|
843
|
-
sendNudge(this.agentName, "[AgentMesh] Worker still blocked after nudge. Please request human intervention or resume once approvals are available.");
|
|
844
984
|
}
|
|
845
985
|
}
|
|
846
986
|
async stop() {
|
|
@@ -856,6 +996,11 @@ Nudge agent:
|
|
|
856
996
|
this.stopCleanupScheduler();
|
|
857
997
|
this.stopCleanupScheduler = null;
|
|
858
998
|
}
|
|
999
|
+
// Stop auth health watcher
|
|
1000
|
+
if (this.authHealthWatcher) {
|
|
1001
|
+
this.authHealthWatcher.stop();
|
|
1002
|
+
this.authHealthWatcher = null;
|
|
1003
|
+
}
|
|
859
1004
|
// Save context before stopping
|
|
860
1005
|
if (this.agentId) {
|
|
861
1006
|
console.log("Saving agent context...");
|
|
@@ -1187,6 +1332,46 @@ Logs: docker logs ${containerName}
|
|
|
1187
1332
|
console.warn(`Could not auto-assign to project: ${error.message}`);
|
|
1188
1333
|
}
|
|
1189
1334
|
}
|
|
1335
|
+
// ---------------------------------------------------------------------------
|
|
1336
|
+
// Done-state guard (Epic #497)
|
|
1337
|
+
// ---------------------------------------------------------------------------
|
|
1338
|
+
/**
|
|
1339
|
+
* Evaluates whether this restart should resume in-flight work or come up idle.
|
|
1340
|
+
*
|
|
1341
|
+
* Pulls claims, inbox, and recent handoff history from HQ, then delegates to
|
|
1342
|
+
* the pure `evaluateRestartState` function for the actual decision.
|
|
1343
|
+
*
|
|
1344
|
+
* Failures are non-fatal — defaults to `idle` so we fail safe.
|
|
1345
|
+
*/
|
|
1346
|
+
async evaluateDoneStateGuard() {
|
|
1347
|
+
const safeIdle = (reason) => ({
|
|
1348
|
+
state: "idle",
|
|
1349
|
+
reason,
|
|
1350
|
+
});
|
|
1351
|
+
if (!this.token || !this.agentId) {
|
|
1352
|
+
return safeIdle("no token or agentId — cannot evaluate done-state");
|
|
1353
|
+
}
|
|
1354
|
+
try {
|
|
1355
|
+
const [claimsRaw, inboxRaw, handoffsRaw] = await Promise.all([
|
|
1356
|
+
listClaims(this.config.hubUrl, this.config.workspace, this.token).catch(() => []),
|
|
1357
|
+
checkInbox(this.config.hubUrl, this.config.workspace, this.token).catch(() => []),
|
|
1358
|
+
fetchHandoffsForAgent(this.config.hubUrl, this.config.workspace, this.token, this.agentId).catch(() => []),
|
|
1359
|
+
]);
|
|
1360
|
+
const activeClaims = filterActiveClaimsForAgent(claimsRaw, this.agentId);
|
|
1361
|
+
const completedHandoffs = filterCompletedHandoffsForAgent(handoffsRaw, this.agentId);
|
|
1362
|
+
return evaluateRestartState({
|
|
1363
|
+
activeClaims,
|
|
1364
|
+
inboxItems: inboxRaw,
|
|
1365
|
+
completedHandoffs,
|
|
1366
|
+
automationPaused: this.isAutomationPaused(),
|
|
1367
|
+
});
|
|
1368
|
+
}
|
|
1369
|
+
catch (error) {
|
|
1370
|
+
// Fail safe to idle — do not speculatively resume on error
|
|
1371
|
+
console.warn(`[RESTART] Done-state guard error (defaulting to idle): ${error.message}`);
|
|
1372
|
+
return safeIdle(`guard evaluation failed: ${error.message}`);
|
|
1373
|
+
}
|
|
1374
|
+
}
|
|
1190
1375
|
/**
|
|
1191
1376
|
* Fetches assignments from HQ and validates workdir setup
|
|
1192
1377
|
* Uses project.workdir from HQ as source of truth, falls back to helpful instructions
|