@agentmeshhq/agent 0.4.5 → 0.4.10

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/dist/__tests__/auth-doctor-integration.test.d.ts +14 -0
  2. package/dist/__tests__/auth-doctor-integration.test.js +130 -0
  3. package/dist/__tests__/auth-doctor-integration.test.js.map +1 -0
  4. package/dist/__tests__/auth-guard.integration.test.d.ts +12 -0
  5. package/dist/__tests__/auth-guard.integration.test.js +132 -0
  6. package/dist/__tests__/auth-guard.integration.test.js.map +1 -0
  7. package/dist/__tests__/auth-guard.test.d.ts +17 -0
  8. package/dist/__tests__/auth-guard.test.js +483 -0
  9. package/dist/__tests__/auth-guard.test.js.map +1 -0
  10. package/dist/__tests__/done-state-guard.integration.test.d.ts +1 -0
  11. package/dist/__tests__/done-state-guard.integration.test.js +281 -0
  12. package/dist/__tests__/done-state-guard.integration.test.js.map +1 -0
  13. package/dist/__tests__/done-state-guard.test.d.ts +1 -0
  14. package/dist/__tests__/done-state-guard.test.js +327 -0
  15. package/dist/__tests__/done-state-guard.test.js.map +1 -0
  16. package/dist/__tests__/session-recovery.test.d.ts +1 -0
  17. package/dist/__tests__/session-recovery.test.js +16 -0
  18. package/dist/__tests__/session-recovery.test.js.map +1 -0
  19. package/dist/__tests__/tmux-runtime.test.d.ts +1 -0
  20. package/dist/__tests__/tmux-runtime.test.js +113 -0
  21. package/dist/__tests__/tmux-runtime.test.js.map +1 -0
  22. package/dist/cli/auth.d.ts +11 -0
  23. package/dist/cli/auth.js +92 -0
  24. package/dist/cli/auth.js.map +1 -0
  25. package/dist/cli/index.js +45 -1
  26. package/dist/cli/index.js.map +1 -1
  27. package/dist/cli/local.d.ts +4 -2
  28. package/dist/cli/local.js +257 -108
  29. package/dist/cli/local.js.map +1 -1
  30. package/dist/cli/migrate.d.ts +1 -0
  31. package/dist/cli/migrate.js +14 -10
  32. package/dist/cli/migrate.js.map +1 -1
  33. package/dist/cli/start.d.ts +2 -0
  34. package/dist/cli/start.js +3 -0
  35. package/dist/cli/start.js.map +1 -1
  36. package/dist/cli/test.d.ts +1 -0
  37. package/dist/cli/test.js +15 -9
  38. package/dist/cli/test.js.map +1 -1
  39. package/dist/config/schema.d.ts +11 -0
  40. package/dist/config/schema.js.map +1 -1
  41. package/dist/core/auth-guard.d.ts +155 -0
  42. package/dist/core/auth-guard.js +498 -0
  43. package/dist/core/auth-guard.js.map +1 -0
  44. package/dist/core/auth-sync.d.ts +105 -0
  45. package/dist/core/auth-sync.js +263 -0
  46. package/dist/core/auth-sync.js.map +1 -0
  47. package/dist/core/daemon/context-template.js +65 -0
  48. package/dist/core/daemon/context-template.js.map +1 -1
  49. package/dist/core/daemon/done-state-guard.d.ts +63 -0
  50. package/dist/core/daemon/done-state-guard.js +102 -0
  51. package/dist/core/daemon/done-state-guard.js.map +1 -0
  52. package/dist/core/daemon/session-recovery.d.ts +1 -0
  53. package/dist/core/daemon/session-recovery.js +7 -0
  54. package/dist/core/daemon/session-recovery.js.map +1 -0
  55. package/dist/core/daemon/tmux-session.d.ts +1 -0
  56. package/dist/core/daemon/tmux-session.js +1 -1
  57. package/dist/core/daemon/tmux-session.js.map +1 -1
  58. package/dist/core/daemon.d.ts +18 -1
  59. package/dist/core/daemon.js +220 -35
  60. package/dist/core/daemon.js.map +1 -1
  61. package/dist/core/registry.d.ts +9 -1
  62. package/dist/core/registry.js +28 -1
  63. package/dist/core/registry.js.map +1 -1
  64. package/dist/core/tmux-runtime.d.ts +11 -2
  65. package/dist/core/tmux-runtime.js +45 -19
  66. package/dist/core/tmux-runtime.js.map +1 -1
  67. package/dist/core/tmux.d.ts +1 -1
  68. package/dist/core/tmux.js +7 -3
  69. package/dist/core/tmux.js.map +1 -1
  70. package/package.json +12 -11
  71. package/LICENSE +0 -21
@@ -28,6 +28,8 @@ export interface DaemonOptions {
28
28
  role?: string;
29
29
  /** Auto-accept pending handoffs in worker mode (default: enabled for --worker) */
30
30
  autoAcceptHandoffs?: boolean;
31
+ /** Run agent in fully autonomous mode — injects runtime-specific non-interactive flags */
32
+ autonomous?: boolean;
31
33
  }
32
34
  export declare class AgentDaemon {
33
35
  private agentName;
@@ -55,16 +57,20 @@ export declare class AgentDaemon {
55
57
  private projectCode;
56
58
  private projectRole;
57
59
  private autoAcceptHandoffs;
60
+ private autonomous;
58
61
  private healthCheckInterval;
59
62
  private stopCleanupScheduler;
63
+ private authHealthWatcher;
60
64
  private _preStartSessionId;
61
65
  private _attemptedResumeSessionId;
62
66
  private stuckSince;
63
- private nudgeSentAt;
64
67
  private lastPendingHandoffAlertAt;
65
68
  private remoteAutomationPaused;
66
69
  private lastAutonomyPolicyFetchAt;
67
70
  private pendingClaimCreations;
71
+ private sessionRecoveryAttempts;
72
+ private lastSessionRecoveryAt;
73
+ private initialInboxCheckComplete;
68
74
  constructor(options: DaemonOptions);
69
75
  start(): Promise<void>;
70
76
  /**
@@ -74,6 +80,7 @@ export declare class AgentDaemon {
74
80
  private autoAcceptPendingHandoffs;
75
81
  private autoAcceptHandoffFromEvent;
76
82
  private isAutomationPaused;
83
+ private sweepInboxOnWebSocketConnect;
77
84
  private refreshRemoteAutonomyPolicy;
78
85
  private acceptHandoffWithRetry;
79
86
  private checkPendingHandoffSla;
@@ -88,6 +95,7 @@ export declare class AgentDaemon {
88
95
  * Handles session death - logs crash and attempts auto-restart
89
96
  */
90
97
  private handleSessionDeath;
98
+ private tryRecoverSession;
91
99
  /**
92
100
  * Handles stuck agent - sends nudge first, then restarts if still stuck
93
101
  */
@@ -115,6 +123,15 @@ export declare class AgentDaemon {
115
123
  * Resolves workdir from --project flag: looks up project by code, clones repo, self-assigns.
116
124
  */
117
125
  private resolveProjectWorkdir;
126
+ /**
127
+ * Evaluates whether this restart should resume in-flight work or come up idle.
128
+ *
129
+ * Pulls claims, inbox, and recent handoff history from HQ, then delegates to
130
+ * the pure `evaluateRestartState` function for the actual decision.
131
+ *
132
+ * Failures are non-fatal — defaults to `idle` so we fail safe.
133
+ */
134
+ private evaluateDoneStateGuard;
118
135
  /**
119
136
  * Fetches assignments from HQ and validates workdir setup
120
137
  * Uses project.workdir from HQ as source of truth, falls back to helpful instructions
@@ -4,21 +4,24 @@ import os from "node:os";
4
4
  import path from "node:path";
5
5
  import { getAgentState, loadState, updateAgentInState } from "../config/loader.js";
6
6
  import { loadContext, loadOrCreateContext, saveContext } from "../context/index.js";
7
+ import { preflightAgentAuth, startAuthHealthWatcher, } from "./auth-guard.js";
7
8
  import { startCleanupScheduler } from "./cleanup/scheduler.js";
8
9
  import { renderMissingWorkdirMessage } from "./daemon/assignment-message.js";
9
10
  import { bootstrapDaemon } from "./daemon/bootstrap.js";
10
11
  import { removeClaudeMd, writeClaudeMd } from "./daemon/context-template.js";
11
12
  import { formatCrashLog } from "./daemon/crash-log.js";
13
+ import { evaluateRestartState, filterActiveClaimsForAgent, filterCompletedHandoffsForAgent, formatRestartLifecycleLog, } from "./daemon/done-state-guard.js";
12
14
  import { cleanupGitAuth, setupGitAuth } from "./daemon/git-auth.js";
13
- import { getNudgeMessage, getStuckDetail, isWithinNudgeWaitWindow, } from "./daemon/health-policy.js";
15
+ import { getStuckDetail } from "./daemon/health-policy.js";
14
16
  import { writeSandboxOpencodeConfig } from "./daemon/sandbox-config.js";
17
+ import { isRecoverableSessionFailure } from "./daemon/session-recovery.js";
15
18
  import { captureAgentChildPids, persistRunningState } from "./daemon/state.js";
16
19
  import { startTmuxRuntimeSession } from "./daemon/tmux-session.js";
17
20
  import { configureGitIdentity, setupWorkspace, updateWorkspaceFromRemote, validatePushAccess, } from "./daemon/workspace.js";
18
21
  import { findPendingHandoffBreaches } from "./handoff-sla.js";
19
22
  import { Heartbeat } from "./heartbeat.js";
20
- import { handleWebSocketEvent, injectOnboardMessage, injectRestoredContext, injectStartupMessage, } from "./injector.js";
21
- import { checkInbox, createClaim, createSelfAssignment, fetchAssignments, fetchOnboard, fetchProjectByCode, getAgentAutonomyState, getHandoff, listClaims, registerAgent, releaseClaim, updateHandoffStatusWithRetry, } from "./registry.js";
23
+ import { handleWebSocketEvent, injectInboxItems, injectOnboardMessage, injectRestoredContext, injectStartupMessage, } from "./injector.js";
24
+ import { checkInbox, createClaim, createSelfAssignment, fetchAssignments, fetchHandoffsForAgent, fetchOnboard, fetchProjectByCode, getAgentAutonomyState, getHandoff, listClaims, registerAgent, releaseClaim, updateHandoffStatusWithRetry, } from "./registry.js";
22
25
  import { getRunnerDisplayName } from "./runner.js";
23
26
  import { DockerSandbox } from "./sandbox.js";
24
27
  import { getLatestSessionId, waitForNewSessionId } from "./session-id.js";
@@ -26,10 +29,11 @@ import { captureSessionContext, captureSessionOutput, destroySession, isSessionH
26
29
  import { prepareOpenCodeRuntime } from "./tmux-runtime.js";
27
30
  import { checkAgentProgress, cleanupOrphanContainers, isProcessRunning, sendNudge, } from "./watchdog.js";
28
31
  import { AgentWebSocket } from "./websocket.js";
29
- // Time to wait after nudging before marking as stuck (2 minutes)
30
- const NUDGE_WAIT_MS = 2 * 60 * 1000;
31
- const PENDING_HANDOFF_SLA_MINUTES = 5;
32
- const PENDING_HANDOFF_ALERT_COOLDOWN_MS = 5 * 60 * 1000;
32
+ // SLA breach alert thresholds configurable via env vars
33
+ // AGENTMESH_HANDOFF_SLA_MINUTES: minutes before a pending handoff is considered a breach (default 5)
34
+ // AGENTMESH_HANDOFF_SLA_COOLDOWN_MS: ms between repeated SLA alerts for the same breach (default 5 min)
35
+ const PENDING_HANDOFF_SLA_MINUTES = Number(process.env.AGENTMESH_HANDOFF_SLA_MINUTES ?? 5);
36
+ const PENDING_HANDOFF_ALERT_COOLDOWN_MS = Number(process.env.AGENTMESH_HANDOFF_SLA_COOLDOWN_MS ?? 5 * 60 * 1000);
33
37
  const AUTO_CLAIM_SCOPE_PREFIX = "handoff:";
34
38
  const AUTO_CLAIM_TTL_SECONDS = 1800;
35
39
  // Path to the sandbox OpenCode config (permissive permissions)
@@ -65,18 +69,22 @@ export class AgentDaemon {
65
69
  projectCode;
66
70
  projectRole;
67
71
  autoAcceptHandoffs;
72
+ autonomous;
68
73
  healthCheckInterval = null;
69
74
  stopCleanupScheduler = null;
75
+ authHealthWatcher = null;
70
76
  // Session resume tracking
71
77
  _preStartSessionId;
72
78
  _attemptedResumeSessionId;
73
79
  // Stuck detection tracking
74
80
  stuckSince = null;
75
- nudgeSentAt = null;
76
81
  lastPendingHandoffAlertAt = null;
77
82
  remoteAutomationPaused = false;
78
83
  lastAutonomyPolicyFetchAt = null;
79
84
  pendingClaimCreations = new Set();
85
+ sessionRecoveryAttempts = 0;
86
+ lastSessionRecoveryAt = null;
87
+ initialInboxCheckComplete = false;
80
88
  constructor(options) {
81
89
  const boot = bootstrapDaemon(options);
82
90
  this.config = boot.config;
@@ -94,6 +102,7 @@ export class AgentDaemon {
94
102
  this.projectCode = boot.projectCode;
95
103
  this.projectRole = boot.projectRole;
96
104
  this.autoAcceptHandoffs = boot.autoAcceptHandoffs;
105
+ this.autonomous = options.autonomous ?? false;
97
106
  this.runnerConfig = boot.runnerConfig;
98
107
  const runnerName = getRunnerDisplayName(this.runnerConfig.type);
99
108
  console.log(`Runner: ${runnerName}`);
@@ -130,6 +139,13 @@ export class AgentDaemon {
130
139
  // Register with hub first (needed for assignment check)
131
140
  console.log("Registering with AgentMesh hub...");
132
141
  console.log(`Existing state: ${existingState ? `agentId=${existingState.agentId}` : "none"}`);
142
+ // Derive agent_type from runtime flags when not explicitly set in config.
143
+ // - explicit agentConfig.agentType always wins
144
+ // - --worker → "worker" (requires team_id on hub)
145
+ // - --autonomous (no --worker) → "autonomous" (standalone, visible, no team needed)
146
+ // - neither → "system" (hidden background agent)
147
+ const effectiveAgentType = this.agentConfig.agentType ??
148
+ (this.isWorkerAgent ? "worker" : this.autonomous ? "autonomous" : "system");
133
149
  const registration = await registerAgent({
134
150
  url: this.config.hubUrl,
135
151
  apiKey: this.config.apiKey,
@@ -138,6 +154,7 @@ export class AgentDaemon {
138
154
  agentName: this.agentName,
139
155
  model: this.agentConfig.model || this.config.defaults.model,
140
156
  restoreContext: this.shouldRestoreContext,
157
+ agentType: effectiveAgentType,
141
158
  });
142
159
  this.agentId = registration.agentId;
143
160
  this.token = registration.token;
@@ -214,6 +231,17 @@ export class AgentDaemon {
214
231
  `Use --serve-port to specify a different port.`);
215
232
  }
216
233
  }
234
+ // Preflight: ensure per-agent auth symlink is valid before launching runner (Epic #470)
235
+ if (this.runnerConfig.type === "opencode") {
236
+ const { ok, result } = preflightAgentAuth(this.agentName);
237
+ if (!ok) {
238
+ console.warn(`[AUTH] Startup preflight failed for ${this.agentName}: ${result.message}`);
239
+ console.warn("[AUTH] Agent may fail provider calls. Run: agentmesh auth doctor --repair");
240
+ }
241
+ else if (result.status === "repaired") {
242
+ console.log(`[AUTH] Auth repaired at startup: ${result.message}`);
243
+ }
244
+ }
217
245
  // Choose runtime mode: sandbox > serve > tmux
218
246
  if (this.sandboxMode) {
219
247
  await this.startSandboxMode();
@@ -229,6 +257,7 @@ export class AgentDaemon {
229
257
  workdir: this.agentConfig.workdir,
230
258
  runnerEnv: this.runnerConfig.env,
231
259
  shouldRestoreContext: this.shouldRestoreContext,
260
+ autonomous: this.autonomous,
232
261
  });
233
262
  this._preStartSessionId = sessionStart.preStartSessionId;
234
263
  this._attemptedResumeSessionId = sessionStart.attemptedResumeSessionId;
@@ -304,6 +333,7 @@ export class AgentDaemon {
304
333
  },
305
334
  onConnect: () => {
306
335
  console.log("WebSocket reconnected with new token");
336
+ void this.sweepInboxOnWebSocketConnect();
307
337
  },
308
338
  onDisconnect: () => {
309
339
  console.log("WebSocket disconnected");
@@ -333,6 +363,7 @@ export class AgentDaemon {
333
363
  },
334
364
  onConnect: () => {
335
365
  console.log("WebSocket connected");
366
+ void this.sweepInboxOnWebSocketConnect();
336
367
  },
337
368
  onDisconnect: () => {
338
369
  console.log("WebSocket disconnected");
@@ -345,17 +376,40 @@ export class AgentDaemon {
345
376
  // Wait for TUI to initialize before injecting messages
346
377
  await new Promise((resolve) => setTimeout(resolve, 3000));
347
378
  await this.refreshRemoteAutonomyPolicy(true);
379
+ // -----------------------------------------------------------------------
380
+ // Done-state guard (Epic #497): determine restart state before injecting
381
+ // any work. If prior cycle is done, come up idle and skip auto-accept.
382
+ // -----------------------------------------------------------------------
383
+ const restartDecision = await this.evaluateDoneStateGuard();
384
+ console.log(formatRestartLifecycleLog(restartDecision));
385
+ updateAgentInState(this.agentName, {
386
+ lastRestartState: restartDecision.state,
387
+ lastRestartReason: restartDecision.reason,
388
+ lastRestartDecisionAt: new Date().toISOString(),
389
+ });
348
390
  // Check inbox and auto-nudge with full handoff details
349
391
  console.log("Checking inbox...");
350
392
  try {
351
393
  const inboxItems = await checkInbox(this.config.hubUrl, this.config.workspace, this.token);
352
- const remainingItems = await this.autoAcceptPendingHandoffs(inboxItems);
353
- injectStartupMessage(this.agentName, remainingItems.length, remainingItems);
394
+ // If the done-state guard says prior work is done, do NOT auto-accept inbox
395
+ // items from the stale cycle — come up idle and wait for a fresh handoff.
396
+ if (restartDecision.state === "idle" || restartDecision.state === "blocked") {
397
+ console.log(`[RESTART] Skipping auto-accept: agent is ${restartDecision.state}. ` +
398
+ "Any inbox items will be surfaced but not auto-claimed.");
399
+ injectStartupMessage(this.agentName, inboxItems.length, inboxItems);
400
+ }
401
+ else {
402
+ const remainingItems = await this.autoAcceptPendingHandoffs(inboxItems);
403
+ injectStartupMessage(this.agentName, remainingItems.length, remainingItems);
404
+ }
354
405
  }
355
406
  catch (error) {
356
407
  console.error("Failed to check inbox:", error);
357
408
  injectStartupMessage(this.agentName, 0);
358
409
  }
410
+ finally {
411
+ this.initialInboxCheckComplete = true;
412
+ }
359
413
  // Inject onboard project context
360
414
  if (this.onboardData?.project) {
361
415
  await new Promise((resolve) => setTimeout(resolve, 1000));
@@ -548,6 +602,21 @@ Nudge agent:
548
602
  const state = getAgentState(this.agentName);
549
603
  return state?.automationPaused === true || this.remoteAutomationPaused;
550
604
  }
605
+ async sweepInboxOnWebSocketConnect() {
606
+ if (!this.token || !this.initialInboxCheckComplete) {
607
+ return;
608
+ }
609
+ try {
610
+ const inboxItems = await checkInbox(this.config.hubUrl, this.config.workspace, this.token);
611
+ const remainingItems = await this.autoAcceptPendingHandoffs(inboxItems);
612
+ if (remainingItems.length > 0) {
613
+ injectInboxItems(this.agentName, remainingItems);
614
+ }
615
+ }
616
+ catch (error) {
617
+ console.warn(`[WS] Failed inbox sweep on connect: ${error.message}`);
618
+ }
619
+ }
551
620
  async refreshRemoteAutonomyPolicy(force = false) {
552
621
  if (!this.token || !this.agentId) {
553
622
  return;
@@ -714,6 +783,19 @@ Nudge agent:
714
783
  // Skip health monitoring for serve mode (no tmux session)
715
784
  if (this.serveMode)
716
785
  return;
786
+ // Start periodic auth healthcheck for opencode runners (Epic #470)
787
+ if (this.runnerConfig.type === "opencode") {
788
+ this.authHealthWatcher = startAuthHealthWatcher(this.agentName, (event) => {
789
+ if (event.type === "auth-health-degraded") {
790
+ console.warn(`[AUTH] ${event.message}`);
791
+ console.warn("[AUTH] Run: agentmesh auth doctor --repair");
792
+ }
793
+ else if (event.type === "auth-health-repaired") {
794
+ console.log(`[AUTH] ${event.message}`);
795
+ }
796
+ // auth-health-ok is silent to avoid log noise
797
+ });
798
+ }
717
799
  const logDir = path.join(os.homedir(), ".agentmesh", "logs");
718
800
  if (!fs.existsSync(logDir)) {
719
801
  fs.mkdirSync(logDir, { recursive: true });
@@ -730,6 +812,11 @@ Nudge agent:
730
812
  await this.handleSessionDeath(health.reason || "unknown", logDir);
731
813
  return;
732
814
  }
815
+ // Healthy again - clear recovery counters
816
+ if (this.sessionRecoveryAttempts > 0) {
817
+ this.sessionRecoveryAttempts = 0;
818
+ this.lastSessionRecoveryAt = null;
819
+ }
733
820
  // Session is alive - check progress watchdog
734
821
  const progress = checkAgentProgress(this.agentName, containerName);
735
822
  if (progress.status === "waiting_for_human") {
@@ -737,7 +824,6 @@ Nudge agent:
737
824
  if (this.stuckSince) {
738
825
  // Clear any prior stuck tracking since the agent signalled a legitimate wait
739
826
  this.stuckSince = null;
740
- this.nudgeSentAt = null;
741
827
  updateAgentInState(this.agentName, { stuckSince: undefined, status: "waiting" });
742
828
  }
743
829
  console.log(`[HEALTH] Agent is waiting for human input: ${progress.details}`);
@@ -750,7 +836,6 @@ Nudge agent:
750
836
  if (this.stuckSince) {
751
837
  console.log(`[HEALTH] Agent resumed activity`);
752
838
  this.stuckSince = null;
753
- this.nudgeSentAt = null;
754
839
  updateAgentInState(this.agentName, { stuckSince: undefined, status: "running" });
755
840
  }
756
841
  }
@@ -781,6 +866,15 @@ Nudge agent:
781
866
  lastOutput,
782
867
  });
783
868
  fs.appendFileSync(logFile, crashLog);
869
+ // Recoverable local tmux failures should self-heal in worker mode.
870
+ const recovered = await this.tryRecoverSession(reason);
871
+ if (recovered) {
872
+ console.warn(`[RECOVERY] Session recovered after "${reason}"`);
873
+ updateAgentInState(this.agentName, {
874
+ status: "running",
875
+ });
876
+ return;
877
+ }
784
878
  // Save context before marking as failed
785
879
  if (this.agentId) {
786
880
  this.saveAgentContext();
@@ -799,6 +893,71 @@ Nudge agent:
799
893
  this.healthCheckInterval = null;
800
894
  }
801
895
  }
896
+ async tryRecoverSession(reason) {
897
+ if (!this.isWorkerAgent || this.serveMode || this.sandboxMode) {
898
+ return false;
899
+ }
900
+ if (!isRecoverableSessionFailure(reason)) {
901
+ return false;
902
+ }
903
+ const now = Date.now();
904
+ if (this.lastSessionRecoveryAt &&
905
+ now - this.lastSessionRecoveryAt.getTime() < 15_000 &&
906
+ this.sessionRecoveryAttempts >= 2) {
907
+ return false;
908
+ }
909
+ this.sessionRecoveryAttempts += 1;
910
+ this.lastSessionRecoveryAt = new Date(now);
911
+ try {
912
+ console.warn(`[RECOVERY] Attempt ${this.sessionRecoveryAttempts}: recreating session for ${this.agentName}`);
913
+ const sessionStart = startTmuxRuntimeSession({
914
+ agentName: this.agentName,
915
+ agentId: this.agentId,
916
+ command: this.agentConfig.command,
917
+ workdir: this.agentConfig.workdir,
918
+ runnerEnv: this.runnerConfig.env,
919
+ shouldRestoreContext: false,
920
+ autonomous: this.autonomous,
921
+ });
922
+ this._preStartSessionId = sessionStart.preStartSessionId;
923
+ this._attemptedResumeSessionId = sessionStart.attemptedResumeSessionId;
924
+ if (this.token && this.agentId) {
925
+ updateSessionEnvironment(this.agentName, {
926
+ AGENT_TOKEN: this.token,
927
+ AGENTMESH_AGENT_ID: this.agentId,
928
+ });
929
+ }
930
+ await new Promise((resolve) => setTimeout(resolve, 1500));
931
+ const health = isSessionHealthy(this.agentName);
932
+ if (!health.healthy) {
933
+ return false;
934
+ }
935
+ if (this.token) {
936
+ // Re-evaluate done-state guard on session recovery (Epic #497)
937
+ const recoveryDecision = await this.evaluateDoneStateGuard();
938
+ console.log(`[RECOVERY] ${formatRestartLifecycleLog(recoveryDecision)}`);
939
+ updateAgentInState(this.agentName, {
940
+ lastRestartState: recoveryDecision.state,
941
+ lastRestartReason: recoveryDecision.reason,
942
+ lastRestartDecisionAt: new Date().toISOString(),
943
+ });
944
+ const inboxItems = await checkInbox(this.config.hubUrl, this.config.workspace, this.token);
945
+ if (recoveryDecision.state === "idle" || recoveryDecision.state === "blocked") {
946
+ console.log(`[RECOVERY] Prior work done — coming up ${recoveryDecision.state}, not auto-resuming.`);
947
+ injectStartupMessage(this.agentName, inboxItems.length, inboxItems);
948
+ }
949
+ else {
950
+ const remainingItems = await this.autoAcceptPendingHandoffs(inboxItems);
951
+ injectStartupMessage(this.agentName, remainingItems.length, remainingItems);
952
+ }
953
+ }
954
+ return true;
955
+ }
956
+ catch (error) {
957
+ console.warn(`[RECOVERY] Session recovery failed: ${error.message}`);
958
+ return false;
959
+ }
960
+ }
802
961
  /**
803
962
  * Handles stuck agent - sends nudge first, then restarts if still stuck
804
963
  */
@@ -813,34 +972,15 @@ Nudge agent:
813
972
  status: "stuck",
814
973
  });
815
974
  }
816
- // Nudge worker agents don't escalate to restart
975
+ // Worker agents: log the stuck state but do not auto-nudge.
976
+ // Auto-nudging interrupts agents mid-task and causes more harm than good.
977
+ // Operators can nudge manually via CLI or the hub API if needed.
817
978
  if (this.isWorkerAgent) {
818
- // If we haven't sent a nudge yet, send one
819
- if (!this.nudgeSentAt) {
820
- console.log(`[HEALTH] Sending nudge to worker agent...`);
821
- const nudgeMessage = getNudgeMessage(progress);
822
- const sent = sendNudge(this.agentName, nudgeMessage);
823
- if (sent) {
824
- this.nudgeSentAt = now;
825
- console.log(`[HEALTH] Nudge sent successfully`);
826
- }
827
- else {
828
- console.log(`[HEALTH] Failed to send nudge`);
829
- }
830
- return;
831
- }
832
- // Check if enough time has passed since nudge
833
- if (isWithinNudgeWaitWindow(this.nudgeSentAt, NUDGE_WAIT_MS, now)) {
834
- // Still waiting for agent to respond to nudge
835
- return;
836
- }
837
- // Nudge grace period expired — log warning but do NOT restart
838
- console.log(`[HEALTH] Agent still stuck after nudge. Manual intervention required.`);
979
+ console.log(`[HEALTH] Worker agent stuck manual intervention required if needed.`);
839
980
  updateAgentInState(this.agentName, {
840
981
  status: "waiting",
841
982
  });
842
983
  void this.releaseAllAutoClaims("worker waiting for human intervention");
843
- sendNudge(this.agentName, "[AgentMesh] Worker still blocked after nudge. Please request human intervention or resume once approvals are available.");
844
984
  }
845
985
  }
846
986
  async stop() {
@@ -856,6 +996,11 @@ Nudge agent:
856
996
  this.stopCleanupScheduler();
857
997
  this.stopCleanupScheduler = null;
858
998
  }
999
+ // Stop auth health watcher
1000
+ if (this.authHealthWatcher) {
1001
+ this.authHealthWatcher.stop();
1002
+ this.authHealthWatcher = null;
1003
+ }
859
1004
  // Save context before stopping
860
1005
  if (this.agentId) {
861
1006
  console.log("Saving agent context...");
@@ -1187,6 +1332,46 @@ Logs: docker logs ${containerName}
1187
1332
  console.warn(`Could not auto-assign to project: ${error.message}`);
1188
1333
  }
1189
1334
  }
1335
+ // ---------------------------------------------------------------------------
1336
+ // Done-state guard (Epic #497)
1337
+ // ---------------------------------------------------------------------------
1338
+ /**
1339
+ * Evaluates whether this restart should resume in-flight work or come up idle.
1340
+ *
1341
+ * Pulls claims, inbox, and recent handoff history from HQ, then delegates to
1342
+ * the pure `evaluateRestartState` function for the actual decision.
1343
+ *
1344
+ * Failures are non-fatal — defaults to `idle` so we fail safe.
1345
+ */
1346
+ async evaluateDoneStateGuard() {
1347
+ const safeIdle = (reason) => ({
1348
+ state: "idle",
1349
+ reason,
1350
+ });
1351
+ if (!this.token || !this.agentId) {
1352
+ return safeIdle("no token or agentId — cannot evaluate done-state");
1353
+ }
1354
+ try {
1355
+ const [claimsRaw, inboxRaw, handoffsRaw] = await Promise.all([
1356
+ listClaims(this.config.hubUrl, this.config.workspace, this.token).catch(() => []),
1357
+ checkInbox(this.config.hubUrl, this.config.workspace, this.token).catch(() => []),
1358
+ fetchHandoffsForAgent(this.config.hubUrl, this.config.workspace, this.token, this.agentId).catch(() => []),
1359
+ ]);
1360
+ const activeClaims = filterActiveClaimsForAgent(claimsRaw, this.agentId);
1361
+ const completedHandoffs = filterCompletedHandoffsForAgent(handoffsRaw, this.agentId);
1362
+ return evaluateRestartState({
1363
+ activeClaims,
1364
+ inboxItems: inboxRaw,
1365
+ completedHandoffs,
1366
+ automationPaused: this.isAutomationPaused(),
1367
+ });
1368
+ }
1369
+ catch (error) {
1370
+ // Fail safe to idle — do not speculatively resume on error
1371
+ console.warn(`[RESTART] Done-state guard error (defaulting to idle): ${error.message}`);
1372
+ return safeIdle(`guard evaluation failed: ${error.message}`);
1373
+ }
1374
+ }
1190
1375
  /**
1191
1376
  * Fetches assignments from HQ and validates workdir setup
1192
1377
  * Uses project.workdir from HQ as source of truth, falls back to helpful instructions