@botcord/daemon 0.2.9 → 0.2.11

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
@@ -9,10 +9,17 @@
9
9
  import WebSocket from "ws";
10
10
  import { buildDaemonWebSocketUrl, CONTROL_FRAME_TYPES, jcsCanonicalize, resolveHubControlPublicKey, verifyEd25519, } from "@botcord/protocol-core";
11
11
  import { log as daemonLog } from "./log.js";
12
- import { writeAuthExpiredFlag, } from "./user-auth.js";
12
+ import { AuthRefreshRejectedError, writeAuthExpiredFlag, } from "./user-auth.js";
13
13
  /** Exponential backoff plan for transient disconnects. */
14
14
  const RECONNECT_BACKOFF_MS = [1000, 2000, 4000, 8000, 16000, 30000];
15
- const KEEPALIVE_INTERVAL_MS = 25_000;
15
+ /**
16
+ * Keepalive cadence. Has to stay below the smallest idle-timeout in any
17
+ * intermediary on the daemon → Hub WS path. Cloudflare and AWS ALB both
18
+ * default to ~60s of idle without app-level data, and some tunnels strip
19
+ * WS-level ping/pong control frames entirely — hence we send an app-level
20
+ * `pong` heartbeat alongside `ws.ping()` rather than relying on it alone.
21
+ */
22
+ const KEEPALIVE_INTERVAL_MS = 20_000;
16
23
  const REPLAY_DEDUPE_CAP = 256;
17
24
  /**
18
25
  * Build the canonical signing input for a control frame: RFC 8785 (JCS)
@@ -91,8 +98,18 @@ export class ControlChannel {
91
98
  });
92
99
  this.connectInflight = this.connect().catch((err) => {
93
100
  // Initial connect failure surfaces to the caller; subsequent
94
- // reconnects are handled opaquely inside onClose.
95
- this.scheduleReconnect(err);
101
+ // reconnects are handled opaquely inside onClose. A refresh-rejected
102
+ // error means the refresh token itself is dead — no point retrying;
103
+ // writeAuthExpiredFlag was already called in user-auth.refresh().
104
+ if (err instanceof AuthRefreshRejectedError) {
105
+ this.stopRequested = true;
106
+ daemonLog.warn("control-channel: refresh rejected; stopping (re-login required)", {
107
+ status: err.status,
108
+ });
109
+ }
110
+ else {
111
+ this.scheduleReconnect(err);
112
+ }
96
113
  throw err;
97
114
  });
98
115
  try {
@@ -188,12 +205,24 @@ export class ControlChannel {
188
205
  const ws = this.ws;
189
206
  if (!ws || ws.readyState !== WebSocket.OPEN)
190
207
  return;
208
+ // WS-level ping for normal cases.
191
209
  try {
192
210
  ws.ping();
193
211
  }
194
212
  catch {
195
213
  // ignore — next failed send will trigger close
196
214
  }
215
+ // App-level heartbeat: a `pong` daemon-initiated frame. Hub recognizes
216
+ // it via `_DAEMON_INITIATED_TYPES` and bumps `last_seen_at`. Critical
217
+ // when an intermediary (Cloudflare, AWS ALB, some k8s ingresses)
218
+ // drops WS-level control frames — those proxies idle-close the WS at
219
+ // ~60s without app-level activity, surfacing as an abnormal 1006 close to
220
+ // both peers.
221
+ this.send({
222
+ id: `hb_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
223
+ type: "pong",
224
+ ts: Date.now(),
225
+ });
197
226
  }, this.keepaliveMs);
198
227
  }
199
228
  stopKeepalive() {
@@ -223,6 +252,13 @@ export class ControlChannel {
223
252
  scheduleReconnect(err) {
224
253
  if (this.stopRequested)
225
254
  return;
255
+ if (err instanceof AuthRefreshRejectedError) {
256
+ this.stopRequested = true;
257
+ daemonLog.warn("control-channel: refresh rejected; halting reconnect (re-login required)", {
258
+ status: err.status,
259
+ });
260
+ return;
261
+ }
226
262
  const attempt = this.reconnectAttempts;
227
263
  this.reconnectAttempts = attempt + 1;
228
264
  const delay = this.backoff[Math.min(attempt, this.backoff.length - 1)];
@@ -254,6 +290,14 @@ export class ControlChannel {
254
290
  return;
255
291
  }
256
292
  if (!frame || typeof frame.id !== "string" || typeof frame.type !== "string") {
293
+ // Hub ack responses for daemon-initiated frames (runtime_snapshot push,
294
+ // heartbeat, etc.) carry `{id, ok}` and no `type`. They're expected,
295
+ // not malformed — drop silently. Anything else stays a warn.
296
+ if (frame &&
297
+ typeof frame.id === "string" &&
298
+ typeof frame.ok === "boolean") {
299
+ return;
300
+ }
257
301
  daemonLog.warn("control-channel: malformed frame", { frame });
258
302
  return;
259
303
  }
package/dist/doctor.js CHANGED
@@ -156,6 +156,9 @@ export function renderDoctor(input) {
156
156
  const r = rows[i];
157
157
  const e = input.runtimes[i];
158
158
  lines.push(`${pad(r.runtime, widths.runtime)} ${pad(r.name, widths.name)} ${pad(r.status, widths.status)} ${pad(r.version, widths.version)} ${r.path}`);
159
+ if (!e.result.available && e.installHint) {
160
+ lines.push(` → ${e.installHint}`);
161
+ }
159
162
  if (e.endpoints && e.endpoints.length > 0) {
160
163
  for (const ep of e.endpoints) {
161
164
  const mark = ep.reachable ? "✓" : "✗";
@@ -1,7 +1,11 @@
1
1
  import { AcpRuntimeAdapter, type AcpPermissionRequest, type AcpPermissionResponse, type AcpUpdateCtx, type AcpUpdateParams } from "./acp-stream.js";
2
2
  import { type ProbeDeps } from "./probe.js";
3
3
  import type { RuntimeProbeResult, RuntimeRunOptions } from "../types.js";
4
- /** Resolve the `hermes-acp` executable on PATH. */
4
+ /**
5
+ * Resolve the `hermes-acp` executable. Tries PATH first, then falls back to
6
+ * the upstream install.sh's private venv (`~/.hermes/...`, then `/opt/hermes/...`) before
7
+ * giving up. `BOTCORD_HERMES_AGENT_BIN` always wins via the adapter override.
8
+ */
5
9
  export declare function resolveHermesAcpCommand(deps?: ProbeDeps): string | null;
6
10
  /** Probe whether `hermes-acp` is installed and report its version. */
7
11
  export declare function probeHermesAgent(deps?: ProbeDeps): RuntimeProbeResult;
@@ -3,10 +3,34 @@ import path from "node:path";
3
3
  import { agentHermesHomeDir, agentHermesWorkspaceDir, ensureAgentHermesWorkspace, } from "../../agent-workspace.js";
4
4
  import { buildCliEnv } from "../cli-resolver.js";
5
5
  import { AcpRuntimeAdapter, } from "./acp-stream.js";
6
- import { readCommandVersion, resolveCommandOnPath } from "./probe.js";
7
- /** Resolve the `hermes-acp` executable on PATH. */
6
+ import { firstExistingPath, readCommandVersion, resolveCommandOnPath, resolveHomePath, } from "./probe.js";
7
+ /**
8
+ * Known absolute locations of the `hermes-acp` entry point when it is not on
9
+ * PATH. The upstream `scripts/install.sh` (curl|bash installer) installs a
10
+ * private virtualenv under `~/.hermes/hermes-agent/venv/` and only symlinks
11
+ * the user-facing `hermes` command into `~/.local/bin/` — the `hermes-acp`
12
+ * entry point stays inside the venv. Without a fallback, daemon's PATH-only
13
+ * probe misses every user who installed via the README-recommended script.
14
+ */
15
+ const HERMES_ACP_FALLBACK_RELATIVE_PATHS = [
16
+ path.join(".hermes", "hermes-agent", "venv", "bin", "hermes-acp"),
17
+ ];
18
+ const HERMES_ACP_FALLBACK_SYSTEM_PATHS = [
19
+ "/opt/hermes/hermes-agent/venv/bin/hermes-acp",
20
+ ];
21
+ /**
22
+ * Resolve the `hermes-acp` executable. Tries PATH first, then falls back to
23
+ * the upstream install.sh's private venv (`~/.hermes/...`, then `/opt/hermes/...`) before
24
+ * giving up. `BOTCORD_HERMES_AGENT_BIN` always wins via the adapter override.
25
+ */
8
26
  export function resolveHermesAcpCommand(deps = {}) {
9
- return resolveCommandOnPath("hermes-acp", deps);
27
+ const onPath = resolveCommandOnPath("hermes-acp", deps);
28
+ if (onPath)
29
+ return onPath;
30
+ return firstExistingPath([
31
+ ...HERMES_ACP_FALLBACK_RELATIVE_PATHS.map((p) => resolveHomePath(p, deps)),
32
+ ...HERMES_ACP_FALLBACK_SYSTEM_PATHS,
33
+ ], deps);
10
34
  }
11
35
  /** Probe whether `hermes-acp` is installed and report its version. */
12
36
  export function probeHermesAgent(deps = {}) {
@@ -23,6 +23,11 @@ export interface RuntimeModule {
23
23
  * config loader rejects routing turns to this adapter.
24
24
  */
25
25
  supportsRun?: boolean;
26
+ /**
27
+ * Short, single-line install hint shown by `doctor` when the runtime
28
+ * probes as unavailable. Helps users recover without reading source.
29
+ */
30
+ installHint?: string;
26
31
  }
27
32
  /** Built-in runtime module entry for Claude Code. */
28
33
  export declare const claudeCodeModule: RuntimeModule;
@@ -58,6 +63,7 @@ export interface RuntimeProbeEntry {
58
63
  binary: string;
59
64
  supportsRun: boolean;
60
65
  result: RuntimeProbeResult;
66
+ installHint?: string;
61
67
  }
62
68
  /** Probe every registered runtime and report installation status. */
63
69
  export declare function detectRuntimes(): RuntimeProbeEntry[];
@@ -28,6 +28,7 @@ export const hermesAgentModule = {
28
28
  envVar: "BOTCORD_HERMES_AGENT_BIN",
29
29
  probe: () => probeHermesAgent(),
30
30
  create: () => new HermesAgentAdapter(),
31
+ installHint: 'Install: pip install "hermes-agent[acp]" (or set BOTCORD_HERMES_AGENT_BIN to the absolute path of hermes-acp)',
31
32
  };
32
33
  /** Built-in runtime module entry for Gemini (probe-only stub). */
33
34
  export const geminiModule = {
@@ -110,6 +111,7 @@ export function detectRuntimes() {
110
111
  binary: m.binary,
111
112
  supportsRun: m.supportsRun !== false,
112
113
  result,
114
+ installHint: m.installHint,
113
115
  });
114
116
  }
115
117
  return out;
@@ -56,15 +56,19 @@ export declare function adoptDiscoveredOpenclawAgents(ctx: {
56
56
  export declare function addAgentToConfig(cfg: DaemonConfig, agentId: string): DaemonConfig | null;
57
57
  /** Inverse of {@link addAgentToConfig}. Returns `null` on no-op. */
58
58
  export declare function removeAgentFromConfig(cfg: DaemonConfig, agentId: string): DaemonConfig | null;
59
+ /** Drop the cache (e.g. before a `doctor`-style interactive re-probe). */
60
+ export declare function clearRuntimeProbeCache(): void;
59
61
  /**
60
62
  * Probe every registered adapter and shape the result as the wire-level
61
63
  * {@link ListRuntimesResult} — used by both the `list_runtimes` ack path and
62
64
  * the daemon-side first-connect `runtime_snapshot` push in `daemon.ts`.
63
65
  *
64
- * Kept pure: the only side effects are `detectRuntimes()` itself (which the
65
- * gateway already isolates from throwing) and reading the wall clock.
66
+ * Cached for {@link RUNTIME_PROBE_CACHE_TTL_MS}; pass `{ force: true }` to
67
+ * bypass the cache.
66
68
  */
67
- export declare function collectRuntimeSnapshot(): ListRuntimesResult;
69
+ export declare function collectRuntimeSnapshot(opts?: {
70
+ force?: boolean;
71
+ }): ListRuntimesResult;
68
72
  /** Maximum number of `endpoints[]` entries persisted per runtime (RFC §3.8.2). */
69
73
  export declare const RUNTIME_ENDPOINTS_CAP = 32;
70
74
  /** Injection seam for L2 + L3 endpoint probes — kept testable + side-effect-free. */
package/dist/provision.js CHANGED
@@ -768,15 +768,34 @@ export function removeAgentFromConfig(cfg, agentId) {
768
768
  // ---------------------------------------------------------------------------
769
769
  // runtime-discovery snapshot (plan §8.5)
770
770
  // ---------------------------------------------------------------------------
771
+ /**
772
+ * TTL for the L1 runtime-detection cache. `detectRuntimes()` shells out to
773
+ * each adapter binary (claude / codex / gemini / openclaw / hermes) to read
774
+ * `--version`, which routinely costs 1.5–2s in aggregate — long enough to
775
+ * push `list_runtimes` past the Hub's 10s ack budget when combined with the
776
+ * 3s openclaw gateway probe. Versions don't change between dashboard refresh
777
+ * clicks, so cache the L1 snapshot briefly and recompute on miss.
778
+ */
779
+ const RUNTIME_PROBE_CACHE_TTL_MS = 30_000;
780
+ let _runtimeProbeCache = null;
781
+ /** Drop the cache (e.g. before a `doctor`-style interactive re-probe). */
782
+ export function clearRuntimeProbeCache() {
783
+ _runtimeProbeCache = null;
784
+ }
771
785
  /**
772
786
  * Probe every registered adapter and shape the result as the wire-level
773
787
  * {@link ListRuntimesResult} — used by both the `list_runtimes` ack path and
774
788
  * the daemon-side first-connect `runtime_snapshot` push in `daemon.ts`.
775
789
  *
776
- * Kept pure: the only side effects are `detectRuntimes()` itself (which the
777
- * gateway already isolates from throwing) and reading the wall clock.
790
+ * Cached for {@link RUNTIME_PROBE_CACHE_TTL_MS}; pass `{ force: true }` to
791
+ * bypass the cache.
778
792
  */
779
- export function collectRuntimeSnapshot() {
793
+ export function collectRuntimeSnapshot(opts = {}) {
794
+ if (!opts.force &&
795
+ _runtimeProbeCache &&
796
+ Date.now() - _runtimeProbeCache.at < RUNTIME_PROBE_CACHE_TTL_MS) {
797
+ return _runtimeProbeCache.value;
798
+ }
780
799
  const entries = detectRuntimes();
781
800
  const runtimes = entries.map((entry) => {
782
801
  const record = {
@@ -796,7 +815,9 @@ export function collectRuntimeSnapshot() {
796
815
  // enough; filling a synthetic message would be misleading.
797
816
  return record;
798
817
  });
799
- return { runtimes, probedAt: Date.now() };
818
+ const value = { runtimes, probedAt: Date.now() };
819
+ _runtimeProbeCache = { at: Date.now(), value };
820
+ return value;
800
821
  }
801
822
  /** Maximum number of `endpoints[]` entries persisted per runtime (RFC §3.8.2). */
802
823
  export const RUNTIME_ENDPOINTS_CAP = 32;
@@ -1024,7 +1045,7 @@ export async function collectRuntimeSnapshotAsync(opts = {}) {
1024
1045
  if (gateways.length === 0)
1025
1046
  return base;
1026
1047
  // Default daemon-side budget is 3s — it must stay below the Hub's
1027
- // `list_runtimes` ack wait (5s, see backend/hub/routers/daemon_control.py)
1048
+ // `list_runtimes` ack wait (10s, see backend/hub/routers/daemon_control.py)
1028
1049
  // so a single slow gateway can't blow the whole snapshot to a 504.
1029
1050
  const timeoutMs = opts.timeoutMs ?? 3000;
1030
1051
  const capped = gateways.slice(0, RUNTIME_ENDPOINTS_CAP);
@@ -40,6 +40,15 @@ export declare function writeAuthExpiredFlag(file?: string): void;
40
40
  export declare function clearAuthExpiredFlag(file?: string): void;
41
41
  /** Returns true if the stored access token is within `windowMs` of expiry. */
42
42
  export declare function isTokenNearExpiry(record: UserAuthRecord, windowMs?: number): boolean;
43
+ /**
44
+ * Thrown when the Hub rejects a refresh token (401/403). Signals that the
45
+ * user must re-login — reconnect loops should stop instead of hammering
46
+ * the refresh endpoint forever with a known-bad token.
47
+ */
48
+ export declare class AuthRefreshRejectedError extends Error {
49
+ readonly status: number;
50
+ constructor(status: number, message: string);
51
+ }
43
52
  /**
44
53
  * Stateful helper that owns the in-memory copy of user-auth and knows how
45
54
  * to refresh it. Used by the control channel so reconnects always carry
package/dist/user-auth.js CHANGED
@@ -144,6 +144,19 @@ export function clearAuthExpiredFlag(file = AUTH_EXPIRED_FLAG_PATH) {
144
144
  export function isTokenNearExpiry(record, windowMs = 60_000) {
145
145
  return record.expiresAt - Date.now() <= windowMs;
146
146
  }
147
+ /**
148
+ * Thrown when the Hub rejects a refresh token (401/403). Signals that the
149
+ * user must re-login — reconnect loops should stop instead of hammering
150
+ * the refresh endpoint forever with a known-bad token.
151
+ */
152
+ export class AuthRefreshRejectedError extends Error {
153
+ status;
154
+ constructor(status, message) {
155
+ super(message);
156
+ this.name = "AuthRefreshRejectedError";
157
+ this.status = status;
158
+ }
159
+ }
147
160
  /**
148
161
  * Stateful helper that owns the in-memory copy of user-auth and knows how
149
162
  * to refresh it. Used by the control channel so reconnects always carry
@@ -197,13 +210,37 @@ export class UserAuthManager {
197
210
  expiresInMs: current.expiresAt - Date.now(),
198
211
  });
199
212
  this.refreshInflight = (async () => {
200
- const tok = await refreshDaemonToken(current.hubUrl, current.refreshToken);
213
+ // Refresh tokens rotate server-side. If another local process (e.g. a
214
+ // second daemon racing on the same user-auth.json) refreshed in the
215
+ // meantime, the on-disk refreshToken now differs from our in-memory
216
+ // copy — using the in-memory one would 401 because the server already
217
+ // invalidated it. Re-read disk first and adopt any newer record.
218
+ let basis = current;
219
+ try {
220
+ const onDisk = loadUserAuth(this.file);
221
+ if (onDisk && onDisk.refreshToken !== current.refreshToken) {
222
+ daemonLog.info("user-auth refresh: adopting newer on-disk token", {
223
+ userId: onDisk.userId,
224
+ expiresAt: onDisk.expiresAt,
225
+ });
226
+ this.record = onDisk;
227
+ if (!isTokenNearExpiry(onDisk))
228
+ return onDisk;
229
+ basis = onDisk;
230
+ }
231
+ }
232
+ catch (err) {
233
+ daemonLog.debug("user-auth refresh: disk reread failed (ignored)", {
234
+ error: err instanceof Error ? err.message : String(err),
235
+ });
236
+ }
237
+ const tok = await refreshDaemonToken(basis.hubUrl, basis.refreshToken);
201
238
  const next = {
202
- ...current,
239
+ ...basis,
203
240
  accessToken: tok.accessToken,
204
241
  refreshToken: tok.refreshToken,
205
242
  expiresAt: Date.now() + tok.expiresIn * 1000,
206
- hubUrl: tok.hubUrl || current.hubUrl,
243
+ hubUrl: tok.hubUrl || basis.hubUrl,
207
244
  };
208
245
  saveUserAuth(next, this.file);
209
246
  this.record = next;
@@ -213,10 +250,22 @@ export class UserAuthManager {
213
250
  });
214
251
  return next;
215
252
  })().catch((err) => {
253
+ const status = typeof err.status === "number"
254
+ ? (err.status)
255
+ : null;
256
+ const message = err instanceof Error ? err.message : String(err);
216
257
  daemonLog.warn("user-auth refresh: failed", {
217
258
  userId: current.userId,
218
- error: err instanceof Error ? err.message : String(err),
259
+ status,
260
+ error: message,
219
261
  });
262
+ if (status === 401 || status === 403) {
263
+ // Refresh token is permanently dead — write the expired flag so
264
+ // `status` surfaces it and re-throw a typed error so the control
265
+ // channel can stop reconnect loops instead of hammering the Hub.
266
+ writeAuthExpiredFlag();
267
+ throw new AuthRefreshRejectedError(status, message);
268
+ }
220
269
  throw err;
221
270
  }).finally(() => {
222
271
  this.refreshInflight = null;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@botcord/daemon",
3
- "version": "0.2.9",
3
+ "version": "0.2.11",
4
4
  "description": "BotCord local daemon — bridges Hub inbox push to local Claude Code / Codex / Gemini CLIs",
5
5
  "type": "module",
6
6
  "bin": {
@@ -1,4 +1,4 @@
1
- import { describe, expect, it, vi } from "vitest";
1
+ import { beforeEach, describe, expect, it, vi } from "vitest";
2
2
 
3
3
  // Hoisted mock for `../adapters/runtimes.js` so each suite can stub
4
4
  // `detectRuntimes()` independently — we want coverage of the "empty
@@ -24,7 +24,13 @@ vi.mock("../adapters/runtimes.js", async () => {
24
24
  };
25
25
  });
26
26
 
27
- const { collectRuntimeSnapshot, createProvisioner } = await import("../provision.js");
27
+ const { collectRuntimeSnapshot, clearRuntimeProbeCache, createProvisioner } = await import("../provision.js");
28
+
29
+ beforeEach(() => {
30
+ // The L1 probe is memoized for 30s in production; tests rotate the
31
+ // mocked runtime list between cases, so reset before each.
32
+ clearRuntimeProbeCache();
33
+ });
28
34
  const { pushRuntimeSnapshot } = await import("../daemon.js");
29
35
  const { CONTROL_FRAME_TYPES } = await import("@botcord/protocol-core");
30
36
  import type { GatewayChannelConfig, GatewayRuntimeSnapshot } from "../gateway/index.js";
@@ -18,13 +18,21 @@ import {
18
18
  } from "@botcord/protocol-core";
19
19
  import { log as daemonLog } from "./log.js";
20
20
  import {
21
+ AuthRefreshRejectedError,
21
22
  writeAuthExpiredFlag,
22
23
  type UserAuthManager,
23
24
  } from "./user-auth.js";
24
25
 
25
26
  /** Exponential backoff plan for transient disconnects. */
26
27
  const RECONNECT_BACKOFF_MS = [1000, 2000, 4000, 8000, 16000, 30000];
27
- const KEEPALIVE_INTERVAL_MS = 25_000;
28
+ /**
29
+ * Keepalive cadence. Has to stay below the smallest idle-timeout in any
30
+ * intermediary on the daemon → Hub WS path. Cloudflare and AWS ALB both
31
+ * default to ~60s of idle without app-level data, and some tunnels strip
32
+ * WS-level ping/pong control frames entirely — hence we send an app-level
33
+ * `pong` heartbeat alongside `ws.ping()` rather than relying on it alone.
34
+ */
35
+ const KEEPALIVE_INTERVAL_MS = 20_000;
28
36
  const REPLAY_DEDUPE_CAP = 256;
29
37
 
30
38
  /**
@@ -142,8 +150,17 @@ export class ControlChannel {
142
150
  });
143
151
  this.connectInflight = this.connect().catch((err) => {
144
152
  // Initial connect failure surfaces to the caller; subsequent
145
- // reconnects are handled opaquely inside onClose.
146
- this.scheduleReconnect(err);
153
+ // reconnects are handled opaquely inside onClose. A refresh-rejected
154
+ // error means the refresh token itself is dead — no point retrying;
155
+ // writeAuthExpiredFlag was already called in user-auth.refresh().
156
+ if (err instanceof AuthRefreshRejectedError) {
157
+ this.stopRequested = true;
158
+ daemonLog.warn("control-channel: refresh rejected; stopping (re-login required)", {
159
+ status: err.status,
160
+ });
161
+ } else {
162
+ this.scheduleReconnect(err);
163
+ }
147
164
  throw err;
148
165
  });
149
166
  try {
@@ -248,11 +265,23 @@ export class ControlChannel {
248
265
  this.keepaliveTimer = setInterval(() => {
249
266
  const ws = this.ws;
250
267
  if (!ws || ws.readyState !== WebSocket.OPEN) return;
268
+ // WS-level ping for normal cases.
251
269
  try {
252
270
  ws.ping();
253
271
  } catch {
254
272
  // ignore — next failed send will trigger close
255
273
  }
274
+ // App-level heartbeat: a `pong` daemon-initiated frame. Hub recognizes
275
+ // it via `_DAEMON_INITIATED_TYPES` and bumps `last_seen_at`. Critical
276
+ // when an intermediary (Cloudflare, AWS ALB, some k8s ingresses)
277
+ // drops WS-level control frames — those proxies idle-close the WS at
278
+ // ~60s without app-level activity, surfacing as an abnormal 1006 close to
279
+ // both peers.
280
+ this.send({
281
+ id: `hb_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
282
+ type: "pong",
283
+ ts: Date.now(),
284
+ });
256
285
  }, this.keepaliveMs);
257
286
  }
258
287
 
@@ -285,6 +314,13 @@ export class ControlChannel {
285
314
 
286
315
  private scheduleReconnect(err?: unknown): void {
287
316
  if (this.stopRequested) return;
317
+ if (err instanceof AuthRefreshRejectedError) {
318
+ this.stopRequested = true;
319
+ daemonLog.warn("control-channel: refresh rejected; halting reconnect (re-login required)", {
320
+ status: err.status,
321
+ });
322
+ return;
323
+ }
288
324
  const attempt = this.reconnectAttempts;
289
325
  this.reconnectAttempts = attempt + 1;
290
326
  const delay = this.backoff[Math.min(attempt, this.backoff.length - 1)];
@@ -314,6 +350,16 @@ export class ControlChannel {
314
350
  return;
315
351
  }
316
352
  if (!frame || typeof frame.id !== "string" || typeof frame.type !== "string") {
353
+ // Hub ack responses for daemon-initiated frames (runtime_snapshot push,
354
+ // heartbeat, etc.) carry `{id, ok}` and no `type`. They're expected,
355
+ // not malformed — drop silently. Anything else stays a warn.
356
+ if (
357
+ frame &&
358
+ typeof (frame as { id?: unknown }).id === "string" &&
359
+ typeof (frame as { ok?: unknown }).ok === "boolean"
360
+ ) {
361
+ return;
362
+ }
317
363
  daemonLog.warn("control-channel: malformed frame", { frame });
318
364
  return;
319
365
  }
package/src/doctor.ts CHANGED
@@ -257,6 +257,9 @@ export function renderDoctor(input: DoctorInput): string {
257
257
  lines.push(
258
258
  `${pad(r.runtime, widths.runtime)} ${pad(r.name, widths.name)} ${pad(r.status, widths.status)} ${pad(r.version, widths.version)} ${r.path}`,
259
259
  );
260
+ if (!e.result.available && e.installHint) {
261
+ lines.push(` → ${e.installHint}`);
262
+ }
260
263
  if (e.endpoints && e.endpoints.length > 0) {
261
264
  for (const ep of e.endpoints) {
262
265
  const mark = ep.reachable ? "✓" : "✗";
@@ -2,6 +2,7 @@ import { afterAll, beforeAll, describe, expect, it } from "vitest";
2
2
  import {
3
3
  chmodSync,
4
4
  existsSync,
5
+ mkdirSync,
5
6
  mkdtempSync,
6
7
  readFileSync,
7
8
  rmSync,
@@ -9,7 +10,10 @@ import {
9
10
  } from "node:fs";
10
11
  import os from "node:os";
11
12
  import path from "node:path";
12
- import { HermesAgentAdapter } from "../runtimes/hermes-agent.js";
13
+ import {
14
+ HermesAgentAdapter,
15
+ resolveHermesAcpCommand,
16
+ } from "../runtimes/hermes-agent.js";
13
17
  import { agentHermesWorkspaceDir } from "../../agent-workspace.js";
14
18
 
15
19
  // Spawn a tiny Node "ACP server" we control instead of the real hermes-acp.
@@ -288,6 +292,30 @@ describe("HermesAgentAdapter", () => {
288
292
  expect(res.error).toMatch(/aborted before spawn/);
289
293
  });
290
294
 
295
+ it("resolveHermesAcpCommand falls back to ~/.hermes venv when PATH lookup fails", () => {
296
+ // Upstream `scripts/install.sh` puts hermes-acp at
297
+ // ~/.hermes/hermes-agent/venv/bin/hermes-acp and only symlinks `hermes`
298
+ // into ~/.local/bin. Simulate that layout: `which hermes-acp` fails,
299
+ // but the venv path exists on disk.
300
+ const fakeHome = mkdtempSync(path.join(os.tmpdir(), "hermes-fallback-"));
301
+ const venvBin = path.join(fakeHome, ".hermes", "hermes-agent", "venv", "bin");
302
+ const target = path.join(venvBin, "hermes-acp");
303
+ mkdirSync(venvBin, { recursive: true });
304
+ writeFileSync(target, "#!/bin/sh\nexit 0\n", { mode: 0o755 });
305
+ chmodSync(target, 0o755);
306
+
307
+ const resolved = resolveHermesAcpCommand({
308
+ env: { PATH: "/nonexistent" },
309
+ homeDir: fakeHome,
310
+ execFileSyncFn: (() => {
311
+ throw new Error("which: not found");
312
+ }) as never,
313
+ });
314
+ expect(resolved).toBe(target);
315
+
316
+ rmSync(fakeHome, { recursive: true, force: true });
317
+ });
318
+
291
319
  it("surfaces non-zero exit with stderr snippet", async () => {
292
320
  const p = path.join(tmpRoot, "boom.js");
293
321
  writeFileSync(
@@ -13,12 +13,45 @@ import {
13
13
  type AcpUpdateCtx,
14
14
  type AcpUpdateParams,
15
15
  } from "./acp-stream.js";
16
- import { readCommandVersion, resolveCommandOnPath, type ProbeDeps } from "./probe.js";
16
+ import {
17
+ firstExistingPath,
18
+ readCommandVersion,
19
+ resolveCommandOnPath,
20
+ resolveHomePath,
21
+ type ProbeDeps,
22
+ } from "./probe.js";
17
23
  import type { RuntimeProbeResult, RuntimeRunOptions, StreamBlock } from "../types.js";
18
24
 
19
- /** Resolve the `hermes-acp` executable on PATH. */
25
+ /**
26
+ * Known absolute locations of the `hermes-acp` entry point when it is not on
27
+ * PATH. The upstream `scripts/install.sh` (curl|bash installer) installs a
28
+ * private virtualenv under `~/.hermes/hermes-agent/venv/` and only symlinks
29
+ * the user-facing `hermes` command into `~/.local/bin/` — the `hermes-acp`
30
+ * entry point stays inside the venv. Without a fallback, daemon's PATH-only
31
+ * probe misses every user who installed via the README-recommended script.
32
+ */
33
+ const HERMES_ACP_FALLBACK_RELATIVE_PATHS = [
34
+ path.join(".hermes", "hermes-agent", "venv", "bin", "hermes-acp"),
35
+ ];
36
+ const HERMES_ACP_FALLBACK_SYSTEM_PATHS = [
37
+ "/opt/hermes/hermes-agent/venv/bin/hermes-acp",
38
+ ];
39
+
40
+ /**
41
+ * Resolve the `hermes-acp` executable. Tries PATH first, then falls back to
42
+ * the upstream install.sh's private venv (`~/.hermes/...`, then `/opt/hermes/...`) before
43
+ * giving up. `BOTCORD_HERMES_AGENT_BIN` always wins via the adapter override.
44
+ */
20
45
  export function resolveHermesAcpCommand(deps: ProbeDeps = {}): string | null {
21
- return resolveCommandOnPath("hermes-acp", deps);
46
+ const onPath = resolveCommandOnPath("hermes-acp", deps);
47
+ if (onPath) return onPath;
48
+ return firstExistingPath(
49
+ [
50
+ ...HERMES_ACP_FALLBACK_RELATIVE_PATHS.map((p) => resolveHomePath(p, deps)),
51
+ ...HERMES_ACP_FALLBACK_SYSTEM_PATHS,
52
+ ],
53
+ deps,
54
+ );
22
55
  }
23
56
 
24
57
  /** Probe whether `hermes-acp` is installed and report its version. */
@@ -29,6 +29,11 @@ export interface RuntimeModule {
29
29
  * config loader rejects routing turns to this adapter.
30
30
  */
31
31
  supportsRun?: boolean;
32
+ /**
33
+ * Short, single-line install hint shown by `doctor` when the runtime
34
+ * probes as unavailable. Helps users recover without reading source.
35
+ */
36
+ installHint?: string;
32
37
  }
33
38
 
34
39
  /** Built-in runtime module entry for Claude Code. */
@@ -58,6 +63,8 @@ export const hermesAgentModule: RuntimeModule = {
58
63
  envVar: "BOTCORD_HERMES_AGENT_BIN",
59
64
  probe: () => probeHermesAgent(),
60
65
  create: () => new HermesAgentAdapter(),
66
+ installHint:
67
+ 'Install: pip install "hermes-agent[acp]" (or set BOTCORD_HERMES_AGENT_BIN to the absolute path of hermes-acp)',
61
68
  };
62
69
 
63
70
  /** Built-in runtime module entry for Gemini (probe-only stub). */
@@ -143,6 +150,7 @@ export interface RuntimeProbeEntry {
143
150
  binary: string;
144
151
  supportsRun: boolean;
145
152
  result: RuntimeProbeResult;
153
+ installHint?: string;
146
154
  }
147
155
 
148
156
  /** Probe every registered runtime and report installation status. */
@@ -161,6 +169,7 @@ export function detectRuntimes(): RuntimeProbeEntry[] {
161
169
  binary: m.binary,
162
170
  supportsRun: m.supportsRun !== false,
163
171
  result,
172
+ installHint: m.installHint,
164
173
  });
165
174
  }
166
175
  return out;
package/src/provision.ts CHANGED
@@ -903,15 +903,39 @@ export function removeAgentFromConfig(
903
903
  // runtime-discovery snapshot (plan §8.5)
904
904
  // ---------------------------------------------------------------------------
905
905
 
906
+ /**
907
+ * TTL for the L1 runtime-detection cache. `detectRuntimes()` shells out to
908
+ * each adapter binary (claude / codex / gemini / openclaw / hermes) to read
909
+ * `--version`, which routinely costs 1.5–2s in aggregate — long enough to
910
+ * push `list_runtimes` past the Hub's 10s ack budget when combined with the
911
+ * 3s openclaw gateway probe. Versions don't change between dashboard refresh
912
+ * clicks, so cache the L1 snapshot briefly and recompute on miss.
913
+ */
914
+ const RUNTIME_PROBE_CACHE_TTL_MS = 30_000;
915
+
916
+ let _runtimeProbeCache: { at: number; value: ListRuntimesResult } | null = null;
917
+
918
+ /** Drop the cache (e.g. before a `doctor`-style interactive re-probe). */
919
+ export function clearRuntimeProbeCache(): void {
920
+ _runtimeProbeCache = null;
921
+ }
922
+
906
923
  /**
907
924
  * Probe every registered adapter and shape the result as the wire-level
908
925
  * {@link ListRuntimesResult} — used by both the `list_runtimes` ack path and
909
926
  * the daemon-side first-connect `runtime_snapshot` push in `daemon.ts`.
910
927
  *
911
- * Kept pure: the only side effects are `detectRuntimes()` itself (which the
912
- * gateway already isolates from throwing) and reading the wall clock.
928
+ * Cached for {@link RUNTIME_PROBE_CACHE_TTL_MS}; pass `{ force: true }` to
929
+ * bypass the cache.
913
930
  */
914
- export function collectRuntimeSnapshot(): ListRuntimesResult {
931
+ export function collectRuntimeSnapshot(opts: { force?: boolean } = {}): ListRuntimesResult {
932
+ if (
933
+ !opts.force &&
934
+ _runtimeProbeCache &&
935
+ Date.now() - _runtimeProbeCache.at < RUNTIME_PROBE_CACHE_TTL_MS
936
+ ) {
937
+ return _runtimeProbeCache.value;
938
+ }
915
939
  const entries = detectRuntimes();
916
940
  const runtimes: RuntimeProbeResult[] = entries.map((entry) => {
917
941
  const record: RuntimeProbeResult = {
@@ -929,7 +953,9 @@ export function collectRuntimeSnapshot(): ListRuntimesResult {
929
953
  // enough; filling a synthetic message would be misleading.
930
954
  return record;
931
955
  });
932
- return { runtimes, probedAt: Date.now() };
956
+ const value: ListRuntimesResult = { runtimes, probedAt: Date.now() };
957
+ _runtimeProbeCache = { at: Date.now(), value };
958
+ return value;
933
959
  }
934
960
 
935
961
  /** Maximum number of `endpoints[]` entries persisted per runtime (RFC §3.8.2). */
@@ -1208,7 +1234,7 @@ export async function collectRuntimeSnapshotAsync(opts: {
1208
1234
  const gateways = opts.cfg?.openclawGateways ?? [];
1209
1235
  if (gateways.length === 0) return base;
1210
1236
  // Default daemon-side budget is 3s — it must stay below the Hub's
1211
- // `list_runtimes` ack wait (5s, see backend/hub/routers/daemon_control.py)
1237
+ // `list_runtimes` ack wait (10s, see backend/hub/routers/daemon_control.py)
1212
1238
  // so a single slow gateway can't blow the whole snapshot to a 504.
1213
1239
  const timeoutMs = opts.timeoutMs ?? 3000;
1214
1240
  const capped = gateways.slice(0, RUNTIME_ENDPOINTS_CAP);
package/src/user-auth.ts CHANGED
@@ -188,6 +188,20 @@ export function isTokenNearExpiry(record: UserAuthRecord, windowMs = 60_000): bo
188
188
  return record.expiresAt - Date.now() <= windowMs;
189
189
  }
190
190
 
191
+ /**
192
+ * Thrown when the Hub rejects a refresh token (401/403). Signals that the
193
+ * user must re-login — reconnect loops should stop instead of hammering
194
+ * the refresh endpoint forever with a known-bad token.
195
+ */
196
+ export class AuthRefreshRejectedError extends Error {
197
+ readonly status: number;
198
+ constructor(status: number, message: string) {
199
+ super(message);
200
+ this.name = "AuthRefreshRejectedError";
201
+ this.status = status;
202
+ }
203
+ }
204
+
191
205
  /**
192
206
  * Stateful helper that owns the in-memory copy of user-auth and knows how
193
207
  * to refresh it. Used by the control channel so reconnects always carry
@@ -245,13 +259,35 @@ export class UserAuthManager {
245
259
  expiresInMs: current.expiresAt - Date.now(),
246
260
  });
247
261
  this.refreshInflight = (async () => {
248
- const tok = await refreshDaemonToken(current.hubUrl, current.refreshToken);
262
+ // Refresh tokens rotate server-side. If another local process (e.g. a
263
+ // second daemon racing on the same user-auth.json) refreshed in the
264
+ // meantime, the on-disk refreshToken now differs from our in-memory
265
+ // copy — using the in-memory one would 401 because the server already
266
+ // invalidated it. Re-read disk first and adopt any newer record.
267
+ let basis = current;
268
+ try {
269
+ const onDisk = loadUserAuth(this.file);
270
+ if (onDisk && onDisk.refreshToken !== current.refreshToken) {
271
+ daemonLog.info("user-auth refresh: adopting newer on-disk token", {
272
+ userId: onDisk.userId,
273
+ expiresAt: onDisk.expiresAt,
274
+ });
275
+ this.record = onDisk;
276
+ if (!isTokenNearExpiry(onDisk)) return onDisk;
277
+ basis = onDisk;
278
+ }
279
+ } catch (err) {
280
+ daemonLog.debug("user-auth refresh: disk reread failed (ignored)", {
281
+ error: err instanceof Error ? err.message : String(err),
282
+ });
283
+ }
284
+ const tok = await refreshDaemonToken(basis.hubUrl, basis.refreshToken);
249
285
  const next: UserAuthRecord = {
250
- ...current,
286
+ ...basis,
251
287
  accessToken: tok.accessToken,
252
288
  refreshToken: tok.refreshToken,
253
289
  expiresAt: Date.now() + tok.expiresIn * 1000,
254
- hubUrl: tok.hubUrl || current.hubUrl,
290
+ hubUrl: tok.hubUrl || basis.hubUrl,
255
291
  };
256
292
  saveUserAuth(next, this.file);
257
293
  this.record = next;
@@ -261,10 +297,23 @@ export class UserAuthManager {
261
297
  });
262
298
  return next;
263
299
  })().catch((err) => {
300
+ const status =
301
+ typeof (err as { status?: unknown }).status === "number"
302
+ ? ((err as { status: number }).status)
303
+ : null;
304
+ const message = err instanceof Error ? err.message : String(err);
264
305
  daemonLog.warn("user-auth refresh: failed", {
265
306
  userId: current.userId,
266
- error: err instanceof Error ? err.message : String(err),
307
+ status,
308
+ error: message,
267
309
  });
310
+ if (status === 401 || status === 403) {
311
+ // Refresh token is permanently dead — write the expired flag so
312
+ // `status` surfaces it and re-throw a typed error so the control
313
+ // channel can stop reconnect loops instead of hammering the Hub.
314
+ writeAuthExpiredFlag();
315
+ throw new AuthRefreshRejectedError(status, message);
316
+ }
268
317
  throw err;
269
318
  }).finally(() => {
270
319
  this.refreshInflight = null;