@botcord/daemon 0.2.10 → 0.2.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -164,6 +164,98 @@ function writeIfMissing(filePath, content) {
164
164
  return;
165
165
  writeFileSync(filePath, content, { mode: 0o600 });
166
166
  }
167
/**
 * Allow-list of environment variable names that may be imported from the
 * user's global `~/.hermes/.env` into a per-agent Hermes `.env`.
 * Any variable whose name is not in this set is never copied.
 */
const HERMES_PROVIDER_ENV_KEYS = new Set([
    "ANTHROPIC_API_KEY", "ANTHROPIC_TOKEN",
    "AWS_ACCESS_KEY_ID", "AWS_BEARER_TOKEN_BEDROCK", "AWS_DEFAULT_REGION",
    "AWS_PROFILE", "AWS_REGION", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN",
    "CEREBRAS_API_KEY",
    "DEEPSEEK_API_KEY",
    "GEMINI_API_KEY", "GOOGLE_API_KEY", "GROQ_API_KEY",
    "HERMES_INFERENCE_MODEL", "HERMES_INFERENCE_PROVIDER",
    "MISTRAL_API_KEY",
    "OPENAI_API_KEY", "OPENAI_BASE_URL",
    "OPENROUTER_API_KEY", "OPENROUTER_BASE_URL",
    "TOGETHER_API_KEY",
    "XAI_API_KEY",
]);
192
/**
 * Collect the variable names assigned in dotenv-style text.
 *
 * A line counts as an assignment when it starts (after optional whitespace
 * and an optional `export ` prefix) with an identifier followed by `=`.
 * Comments, blank lines and anything else are ignored.
 *
 * @param content Raw file text; both `\n` and `\r\n` line endings are accepted.
 * @returns Set of assigned variable names, in first-seen order.
 */
function parseEnvKeys(content) {
    const assignment = /^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=/;
    const names = content.split(/\r?\n/).flatMap((row) => {
        const hit = assignment.exec(row);
        return hit ? [hit[1]] : [];
    });
    return new Set(names);
}
201
/**
 * Seed per-agent Hermes credentials from the user's normal ~/.hermes/.env.
 * Only provider/model variables (names listed in HERMES_PROVIDER_ENV_KEYS) are
 * copied; BotCord credentials, chat tokens, and unrelated integration secrets
 * are intentionally left behind.
 *
 * Keys that are already assigned in the target file are never overwritten, and
 * when the same key appears several times in the source only its first
 * assignment is imported. Imported lines are appended verbatim under a single
 * marker comment; the marker is written once, not once per merge.
 *
 * Does nothing when the source file is missing or unreadable, when the target
 * exists but cannot be read, or when there is nothing new to import.
 *
 * @param targetEnv Path of the per-agent `.env` file to append to.
 */
function mergeHermesProviderEnv(targetEnv) {
    const sourceEnv = path.join(homedir(), ".hermes", ".env");
    if (!existsSync(sourceEnv))
        return;
    let targetContent = "";
    if (existsSync(targetEnv)) {
        try {
            targetContent = readFileSync(targetEnv, "utf8");
        }
        catch {
            // The target exists but is unreadable. Treating it as empty would
            // make the write below replace the whole file with only the
            // imported lines, so leave it untouched instead.
            return;
        }
    }
    let sourceContent = "";
    try {
        sourceContent = readFileSync(sourceEnv, "utf8");
    }
    catch {
        return;
    }
    const targetKeys = parseEnvKeys(targetContent);
    const additions = [];
    for (const rawLine of sourceContent.split(/\r?\n/)) {
        const match = rawLine.match(/^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=/);
        if (!match)
            continue;
        const key = match[1];
        if (!HERMES_PROVIDER_ENV_KEYS.has(key) || targetKeys.has(key))
            continue;
        additions.push(rawLine);
        // Remember the key so a later duplicate in the source is skipped.
        targetKeys.add(key);
    }
    if (additions.length === 0)
        return;
    const headerLine = "# Imported provider credentials from ~/.hermes/.env for BotCord-managed Hermes.";
    // Detect the header this function actually writes. The shorter legacy
    // marker is still honoured so files that contain it keep behaving as before.
    const hasHeader = targetContent.includes(headerLine) ||
        targetContent.includes("Imported from ~/.hermes/.env");
    const prefix = targetContent.endsWith("\n") || targetContent.length === 0 ? "" : "\n";
    const header = hasHeader ? "" : `${headerLine}\n`;
    writeFileSync(targetEnv, `${targetContent}${prefix}${header}${additions.join("\n")}\n`, {
        mode: 0o600,
    });
}
246
/**
 * Copy the user's `~/.hermes/config.yaml` into the given Hermes home as a
 * starting configuration.
 *
 * The copy only happens when the global file exists and the destination does
 * not, so an existing per-agent config is never replaced. The copied file is
 * chmod-ed to 0600. Failures are swallowed on purpose: seeding is best-effort.
 *
 * @param hermesHome Directory that should receive `config.yaml`.
 */
function seedHermesConfig(hermesHome) {
    const globalConfig = path.join(homedir(), ".hermes", "config.yaml");
    const agentConfig = path.join(hermesHome, "config.yaml");
    const shouldSeed = existsSync(globalConfig) && !existsSync(agentConfig);
    if (shouldSeed) {
        try {
            copyFileSync(globalConfig, agentConfig);
            chmodSync(agentConfig, 0o600);
        }
        catch {
            /* best-effort */
        }
    }
}
167
259
  /**
168
260
  * Best-effort link user's `~/.codex/auth.json` into the per-agent CODEX_HOME.
169
261
  * Prefers a symlink (auto-follows `codex login` refreshes) and falls back to
@@ -233,6 +325,8 @@ export function ensureAgentHermesWorkspace(agentId) {
233
325
  mkdirTolerant(hermesWorkspace);
234
326
  writeIfMissing(path.join(hermesHome, ".env"), "# hermes-agent environment overrides for this BotCord agent.\n" +
235
327
  "# Add e.g. HERMES_INFERENCE_PROVIDER=openrouter, OPENROUTER_API_KEY=...\n");
328
+ seedHermesConfig(hermesHome);
329
+ mergeHermesProviderEnv(path.join(hermesHome, ".env"));
236
330
  return { hermesHome, hermesWorkspace };
237
331
  }
238
332
  /**
@@ -12,7 +12,14 @@ import { log as daemonLog } from "./log.js";
12
12
  import { AuthRefreshRejectedError, writeAuthExpiredFlag, } from "./user-auth.js";
13
13
  /** Exponential backoff plan for transient disconnects. */
14
14
  const RECONNECT_BACKOFF_MS = [1000, 2000, 4000, 8000, 16000, 30000];
15
- const KEEPALIVE_INTERVAL_MS = 25_000;
15
+ /**
16
+ * Keepalive cadence. Has to stay below the smallest idle-timeout in any
17
+ * intermediary on the daemon → Hub WS path. Cloudflare and AWS ALB both
18
+ * default to ~60s of idle without app-level data, and some tunnels strip
19
+ * WS-level ping/pong control frames entirely — hence we send an app-level
20
+ * `pong` heartbeat alongside `ws.ping()` rather than relying on it alone.
21
+ */
22
+ const KEEPALIVE_INTERVAL_MS = 20_000;
16
23
  const REPLAY_DEDUPE_CAP = 256;
17
24
  /**
18
25
  * Build the canonical signing input for a control frame: RFC 8785 (JCS)
@@ -198,12 +205,24 @@ export class ControlChannel {
198
205
  const ws = this.ws;
199
206
  if (!ws || ws.readyState !== WebSocket.OPEN)
200
207
  return;
208
+ // WS-level ping for normal cases.
201
209
  try {
202
210
  ws.ping();
203
211
  }
204
212
  catch {
205
213
  // ignore — next failed send will trigger close
206
214
  }
215
+ // App-level heartbeat: a `pong` daemon-initiated frame. Hub recognizes
216
+ // it via `_DAEMON_INITIATED_TYPES` and bumps `last_seen_at`. Critical
217
+ // when an intermediary (Cloudflare, AWS ALB, some k8s ingresses)
218
+ // drops WS-level control frames — those proxies idle-close the WS at
219
+ // ~60s without app-level activity, masquerading as a clean 1006 to
220
+ // both peers.
221
+ this.send({
222
+ id: `hb_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
223
+ type: "pong",
224
+ ts: Date.now(),
225
+ });
207
226
  }, this.keepaliveMs);
208
227
  }
209
228
  stopKeepalive() {
@@ -271,6 +290,14 @@ export class ControlChannel {
271
290
  return;
272
291
  }
273
292
  if (!frame || typeof frame.id !== "string" || typeof frame.type !== "string") {
293
+ // Hub ack responses for daemon-initiated frames (runtime_snapshot push,
294
+ // heartbeat, etc.) carry `{id, ok}` and no `type`. They're expected,
295
+ // not malformed — drop silently. Anything else stays a warn.
296
+ if (frame &&
297
+ typeof frame.id === "string" &&
298
+ typeof frame.ok === "boolean") {
299
+ return;
300
+ }
274
301
  daemonLog.warn("control-channel: malformed frame", { frame });
275
302
  return;
276
303
  }
@@ -56,15 +56,19 @@ export declare function adoptDiscoveredOpenclawAgents(ctx: {
56
56
  export declare function addAgentToConfig(cfg: DaemonConfig, agentId: string): DaemonConfig | null;
57
57
  /** Inverse of {@link addAgentToConfig}. Returns `null` on no-op. */
58
58
  export declare function removeAgentFromConfig(cfg: DaemonConfig, agentId: string): DaemonConfig | null;
59
+ /** Drop the cache (e.g. before a `doctor`-style interactive re-probe). */
60
+ export declare function clearRuntimeProbeCache(): void;
59
61
  /**
60
62
  * Probe every registered adapter and shape the result as the wire-level
61
63
  * {@link ListRuntimesResult} — used by both the `list_runtimes` ack path and
62
64
  * the daemon-side first-connect `runtime_snapshot` push in `daemon.ts`.
63
65
  *
64
- * Kept pure: the only side effects are `detectRuntimes()` itself (which the
65
- * gateway already isolates from throwing) and reading the wall clock.
66
+ * Cached for {@link RUNTIME_PROBE_CACHE_TTL_MS}; pass `{ force: true }` to
67
+ * bypass the cache.
66
68
  */
67
- export declare function collectRuntimeSnapshot(): ListRuntimesResult;
69
+ export declare function collectRuntimeSnapshot(opts?: {
70
+ force?: boolean;
71
+ }): ListRuntimesResult;
68
72
  /** Maximum number of `endpoints[]` entries persisted per runtime (RFC §3.8.2). */
69
73
  export declare const RUNTIME_ENDPOINTS_CAP = 32;
70
74
  /** Injection seam for L2 + L3 endpoint probes — kept testable + side-effect-free. */
package/dist/provision.js CHANGED
@@ -768,15 +768,34 @@ export function removeAgentFromConfig(cfg, agentId) {
768
768
  // ---------------------------------------------------------------------------
769
769
  // runtime-discovery snapshot (plan §8.5)
770
770
  // ---------------------------------------------------------------------------
771
/**
 * Lifetime, in milliseconds, of the cached L1 runtime-detection snapshot.
 *
 * `detectRuntimes()` shells out to each adapter binary (claude / codex /
 * gemini / openclaw / hermes) to read `--version`, which routinely costs
 * 1.5–2s in aggregate — long enough to push `list_runtimes` past the Hub's
 * 10s ack budget when combined with the 3s openclaw gateway probe. Versions
 * don't change between dashboard refresh clicks, so the snapshot is kept
 * briefly and recomputed on a miss.
 */
const RUNTIME_PROBE_CACHE_TTL_MS = 30 * 1000;
/** Last cached snapshot as `{ at, value }`, or `null` when nothing is cached. */
let _runtimeProbeCache = null;
/**
 * Forget the cached runtime snapshot so the next collection re-probes
 * (e.g. before a `doctor`-style interactive re-probe).
 */
export function clearRuntimeProbeCache() {
    _runtimeProbeCache = null;
}
771
785
  /**
772
786
  * Probe every registered adapter and shape the result as the wire-level
773
787
  * {@link ListRuntimesResult} — used by both the `list_runtimes` ack path and
774
788
  * the daemon-side first-connect `runtime_snapshot` push in `daemon.ts`.
775
789
  *
776
- * Kept pure: the only side effects are `detectRuntimes()` itself (which the
777
- * gateway already isolates from throwing) and reading the wall clock.
790
+ * Cached for {@link RUNTIME_PROBE_CACHE_TTL_MS}; pass `{ force: true }` to
791
+ * bypass the cache.
778
792
  */
779
- export function collectRuntimeSnapshot() {
793
+ export function collectRuntimeSnapshot(opts = {}) {
794
+ if (!opts.force &&
795
+ _runtimeProbeCache &&
796
+ Date.now() - _runtimeProbeCache.at < RUNTIME_PROBE_CACHE_TTL_MS) {
797
+ return _runtimeProbeCache.value;
798
+ }
780
799
  const entries = detectRuntimes();
781
800
  const runtimes = entries.map((entry) => {
782
801
  const record = {
@@ -796,7 +815,9 @@ export function collectRuntimeSnapshot() {
796
815
  // enough; filling a synthetic message would be misleading.
797
816
  return record;
798
817
  });
799
- return { runtimes, probedAt: Date.now() };
818
+ const value = { runtimes, probedAt: Date.now() };
819
+ _runtimeProbeCache = { at: Date.now(), value };
820
+ return value;
800
821
  }
801
822
  /** Maximum number of `endpoints[]` entries persisted per runtime (RFC §3.8.2). */
802
823
  export const RUNTIME_ENDPOINTS_CAP = 32;
@@ -1024,7 +1045,7 @@ export async function collectRuntimeSnapshotAsync(opts = {}) {
1024
1045
  if (gateways.length === 0)
1025
1046
  return base;
1026
1047
  // Default daemon-side budget is 3s — it must stay below the Hub's
1027
- // `list_runtimes` ack wait (5s, see backend/hub/routers/daemon_control.py)
1048
+ // `list_runtimes` ack wait (10s, see backend/hub/routers/daemon_control.py)
1028
1049
  // so a single slow gateway can't blow the whole snapshot to a 504.
1029
1050
  const timeoutMs = opts.timeoutMs ?? 3000;
1030
1051
  const capped = gateways.slice(0, RUNTIME_ENDPOINTS_CAP);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@botcord/daemon",
3
- "version": "0.2.10",
3
+ "version": "0.2.12",
4
4
  "description": "BotCord local daemon — bridges Hub inbox push to local Claude Code / Codex / Gemini CLIs",
5
5
  "type": "module",
6
6
  "bin": {
@@ -12,10 +12,12 @@ import os from "node:os";
12
12
  import path from "node:path";
13
13
 
14
14
  import {
15
+ agentHermesHomeDir,
15
16
  agentHomeDir,
16
17
  agentStateDir,
17
18
  agentWorkspaceDir,
18
19
  applyAgentIdentity,
20
+ ensureAgentHermesWorkspace,
19
21
  ensureAgentWorkspace,
20
22
  } from "../agent-workspace.js";
21
23
 
@@ -88,6 +90,56 @@ describe("ensureAgentWorkspace", () => {
88
90
  expect(readFileSync(memoryPath, "utf8")).toBe("my custom notes\n");
89
91
  });
90
92
 
93
+ it("seeds Hermes config and provider env without copying unrelated secrets", () => {
94
+ const globalHermes = path.join(tmpHome, ".hermes");
95
+ mkdirSync(globalHermes, { recursive: true });
96
+ writeFileSync(
97
+ path.join(globalHermes, ".env"),
98
+ [
99
+ "OPENAI_API_KEY=sk-test",
100
+ "HERMES_INFERENCE_PROVIDER=custom",
101
+ "BOTCORD_PRIVATE_KEY=must-not-copy",
102
+ "TELEGRAM_BOT_TOKEN=must-not-copy",
103
+ "AWS_REGION=us-east-1",
104
+ "",
105
+ ].join("\n"),
106
+ );
107
+ writeFileSync(
108
+ path.join(globalHermes, "config.yaml"),
109
+ "model:\n provider: custom\n default: anthropic/claude-opus-4.6\n",
110
+ );
111
+
112
+ const { hermesHome } = ensureAgentHermesWorkspace("ag_hermes_seed");
113
+ const env = readFileSync(path.join(hermesHome, ".env"), "utf8");
114
+ const config = readFileSync(path.join(hermesHome, "config.yaml"), "utf8");
115
+
116
+ expect(env).toContain("OPENAI_API_KEY=sk-test");
117
+ expect(env).toContain("HERMES_INFERENCE_PROVIDER=custom");
118
+ expect(env).toContain("AWS_REGION=us-east-1");
119
+ expect(env).not.toContain("BOTCORD_PRIVATE_KEY");
120
+ expect(env).not.toContain("TELEGRAM_BOT_TOKEN");
121
+ expect(config).toContain("provider: custom");
122
+ });
123
+
124
+ it("does not overwrite existing per-agent Hermes env values", () => {
125
+ const globalHermes = path.join(tmpHome, ".hermes");
126
+ mkdirSync(globalHermes, { recursive: true });
127
+ writeFileSync(
128
+ path.join(globalHermes, ".env"),
129
+ "OPENAI_API_KEY=global\nOPENROUTER_API_KEY=openrouter\n",
130
+ );
131
+ const agentHome = agentHermesHomeDir("ag_hermes_keep");
132
+ mkdirSync(agentHome, { recursive: true });
133
+ writeFileSync(path.join(agentHome, ".env"), "OPENAI_API_KEY=local\n");
134
+
135
+ ensureAgentHermesWorkspace("ag_hermes_keep");
136
+ const env = readFileSync(path.join(agentHome, ".env"), "utf8");
137
+
138
+ expect(env).toContain("OPENAI_API_KEY=local");
139
+ expect(env).not.toContain("OPENAI_API_KEY=global");
140
+ expect(env).toContain("OPENROUTER_API_KEY=openrouter");
141
+ });
142
+
91
143
  it("identity.md renders the bio placeholder when bio is missing", () => {
92
144
  ensureAgentWorkspace("ag_nobio", { displayName: "Nameless" });
93
145
  const identity = readFileSync(path.join(agentWorkspaceDir("ag_nobio"), "identity.md"), "utf8");
@@ -1,4 +1,4 @@
1
- import { describe, expect, it, vi } from "vitest";
1
+ import { beforeEach, describe, expect, it, vi } from "vitest";
2
2
 
3
3
  // Hoisted mock for `../adapters/runtimes.js` so each suite can stub
4
4
  // `detectRuntimes()` independently — we want coverage of the "empty
@@ -24,7 +24,13 @@ vi.mock("../adapters/runtimes.js", async () => {
24
24
  };
25
25
  });
26
26
 
27
- const { collectRuntimeSnapshot, createProvisioner } = await import("../provision.js");
27
+ const { collectRuntimeSnapshot, clearRuntimeProbeCache, createProvisioner } = await import("../provision.js");
28
+
29
+ beforeEach(() => {
30
+ // The L1 probe is memoized for 30s in production; tests rotate the
31
+ // mocked runtime list between cases, so reset before each.
32
+ clearRuntimeProbeCache();
33
+ });
28
34
  const { pushRuntimeSnapshot } = await import("../daemon.js");
29
35
  const { CONTROL_FRAME_TYPES } = await import("@botcord/protocol-core");
30
36
  import type { GatewayChannelConfig, GatewayRuntimeSnapshot } from "../gateway/index.js";
@@ -195,6 +195,98 @@ function writeIfMissing(filePath: string, content: string): void {
195
195
  writeFileSync(filePath, content, { mode: 0o600 });
196
196
  }
197
197
 
198
/**
 * Allow-list of environment variable names that may be imported from the
 * user's global `~/.hermes/.env` into a per-agent Hermes `.env`.
 * Any variable whose name is not in this set is never copied.
 */
const HERMES_PROVIDER_ENV_KEYS = new Set([
  "ANTHROPIC_API_KEY", "ANTHROPIC_TOKEN",
  "AWS_ACCESS_KEY_ID", "AWS_BEARER_TOKEN_BEDROCK", "AWS_DEFAULT_REGION",
  "AWS_PROFILE", "AWS_REGION", "AWS_SECRET_ACCESS_KEY", "AWS_SESSION_TOKEN",
  "CEREBRAS_API_KEY",
  "DEEPSEEK_API_KEY",
  "GEMINI_API_KEY", "GOOGLE_API_KEY", "GROQ_API_KEY",
  "HERMES_INFERENCE_MODEL", "HERMES_INFERENCE_PROVIDER",
  "MISTRAL_API_KEY",
  "OPENAI_API_KEY", "OPENAI_BASE_URL",
  "OPENROUTER_API_KEY", "OPENROUTER_BASE_URL",
  "TOGETHER_API_KEY",
  "XAI_API_KEY",
]);
223
+
224
/**
 * Collect the variable names assigned in dotenv-style text.
 *
 * A line counts as an assignment when it starts (after optional whitespace
 * and an optional `export ` prefix) with an identifier followed by `=`.
 * Comments, blank lines and anything else are ignored.
 *
 * @param content Raw file text; both `\n` and `\r\n` line endings are accepted.
 * @returns Set of assigned variable names, in first-seen order.
 */
function parseEnvKeys(content: string): Set<string> {
  const assignment = /^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=/;
  const names = content.split(/\r?\n/).flatMap((row) => {
    const hit = assignment.exec(row);
    return hit ? [hit[1]] : [];
  });
  return new Set<string>(names);
}
232
+
233
/**
 * Seed per-agent Hermes credentials from the user's normal ~/.hermes/.env.
 * Only provider/model variables (names listed in HERMES_PROVIDER_ENV_KEYS) are
 * copied; BotCord credentials, chat tokens, and unrelated integration secrets
 * are intentionally left behind.
 *
 * Keys that are already assigned in the target file are never overwritten, and
 * when the same key appears several times in the source only its first
 * assignment is imported. Imported lines are appended verbatim under a single
 * marker comment; the marker is written once, not once per merge.
 *
 * Does nothing when the source file is missing or unreadable, when the target
 * exists but cannot be read, or when there is nothing new to import.
 *
 * @param targetEnv Path of the per-agent `.env` file to append to.
 */
function mergeHermesProviderEnv(targetEnv: string): void {
  const sourceEnv = path.join(homedir(), ".hermes", ".env");
  if (!existsSync(sourceEnv)) return;

  let targetContent = "";
  if (existsSync(targetEnv)) {
    try {
      targetContent = readFileSync(targetEnv, "utf8");
    } catch {
      // The target exists but is unreadable. Treating it as empty would make
      // the write below replace the whole file with only the imported lines,
      // so leave it untouched instead.
      return;
    }
  }

  let sourceContent = "";
  try {
    sourceContent = readFileSync(sourceEnv, "utf8");
  } catch {
    return;
  }

  const targetKeys = parseEnvKeys(targetContent);
  const additions: string[] = [];
  for (const rawLine of sourceContent.split(/\r?\n/)) {
    const match = rawLine.match(/^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=/);
    if (!match) continue;
    const key = match[1];
    if (!HERMES_PROVIDER_ENV_KEYS.has(key) || targetKeys.has(key)) continue;
    additions.push(rawLine);
    // Remember the key so a later duplicate in the source is skipped.
    targetKeys.add(key);
  }
  if (additions.length === 0) return;

  const headerLine =
    "# Imported provider credentials from ~/.hermes/.env for BotCord-managed Hermes.";
  // Detect the header this function actually writes. The shorter legacy marker
  // is still honoured so files that contain it keep behaving as before.
  const hasHeader =
    targetContent.includes(headerLine) ||
    targetContent.includes("Imported from ~/.hermes/.env");
  const prefix = targetContent.endsWith("\n") || targetContent.length === 0 ? "" : "\n";
  const header = hasHeader ? "" : `${headerLine}\n`;
  writeFileSync(targetEnv, `${targetContent}${prefix}${header}${additions.join("\n")}\n`, {
    mode: 0o600,
  });
}
277
+
278
/**
 * Copy the user's `~/.hermes/config.yaml` into the given Hermes home as a
 * starting configuration.
 *
 * The copy only happens when the global file exists and the destination does
 * not, so an existing per-agent config is never replaced. The copied file is
 * chmod-ed to 0600. Failures are swallowed on purpose: seeding is best-effort.
 *
 * @param hermesHome Directory that should receive `config.yaml`.
 */
function seedHermesConfig(hermesHome: string): void {
  const globalConfig = path.join(homedir(), ".hermes", "config.yaml");
  const agentConfig = path.join(hermesHome, "config.yaml");
  const shouldSeed = existsSync(globalConfig) && !existsSync(agentConfig);
  if (shouldSeed) {
    try {
      copyFileSync(globalConfig, agentConfig);
      chmodSync(agentConfig, 0o600);
    } catch {
      /* best-effort */
    }
  }
}
289
+
198
290
  /**
199
291
  * Best-effort link user's `~/.codex/auth.json` into the per-agent CODEX_HOME.
200
292
  * Prefers a symlink (auto-follows `codex login` refreshes) and falls back to
@@ -267,6 +359,8 @@ export function ensureAgentHermesWorkspace(agentId: string): {
267
359
  "# hermes-agent environment overrides for this BotCord agent.\n" +
268
360
  "# Add e.g. HERMES_INFERENCE_PROVIDER=openrouter, OPENROUTER_API_KEY=...\n",
269
361
  );
362
+ seedHermesConfig(hermesHome);
363
+ mergeHermesProviderEnv(path.join(hermesHome, ".env"));
270
364
  return { hermesHome, hermesWorkspace };
271
365
  }
272
366
 
@@ -25,7 +25,14 @@ import {
25
25
 
26
26
  /** Exponential backoff plan for transient disconnects. */
27
27
  const RECONNECT_BACKOFF_MS = [1000, 2000, 4000, 8000, 16000, 30000];
28
- const KEEPALIVE_INTERVAL_MS = 25_000;
28
+ /**
29
+ * Keepalive cadence. Has to stay below the smallest idle-timeout in any
30
+ * intermediary on the daemon → Hub WS path. Cloudflare and AWS ALB both
31
+ * default to ~60s of idle without app-level data, and some tunnels strip
32
+ * WS-level ping/pong control frames entirely — hence we send an app-level
33
+ * `pong` heartbeat alongside `ws.ping()` rather than relying on it alone.
34
+ */
35
+ const KEEPALIVE_INTERVAL_MS = 20_000;
29
36
  const REPLAY_DEDUPE_CAP = 256;
30
37
 
31
38
  /**
@@ -258,11 +265,23 @@ export class ControlChannel {
258
265
  this.keepaliveTimer = setInterval(() => {
259
266
  const ws = this.ws;
260
267
  if (!ws || ws.readyState !== WebSocket.OPEN) return;
268
+ // WS-level ping for normal cases.
261
269
  try {
262
270
  ws.ping();
263
271
  } catch {
264
272
  // ignore — next failed send will trigger close
265
273
  }
274
+ // App-level heartbeat: a `pong` daemon-initiated frame. Hub recognizes
275
+ // it via `_DAEMON_INITIATED_TYPES` and bumps `last_seen_at`. Critical
276
+ // when an intermediary (Cloudflare, AWS ALB, some k8s ingresses)
277
+ // drops WS-level control frames — those proxies idle-close the WS at
278
+ // ~60s without app-level activity, masquerading as a clean 1006 to
279
+ // both peers.
280
+ this.send({
281
+ id: `hb_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
282
+ type: "pong",
283
+ ts: Date.now(),
284
+ });
266
285
  }, this.keepaliveMs);
267
286
  }
268
287
 
@@ -331,6 +350,16 @@ export class ControlChannel {
331
350
  return;
332
351
  }
333
352
  if (!frame || typeof frame.id !== "string" || typeof frame.type !== "string") {
353
+ // Hub ack responses for daemon-initiated frames (runtime_snapshot push,
354
+ // heartbeat, etc.) carry `{id, ok}` and no `type`. They're expected,
355
+ // not malformed — drop silently. Anything else stays a warn.
356
+ if (
357
+ frame &&
358
+ typeof (frame as { id?: unknown }).id === "string" &&
359
+ typeof (frame as { ok?: unknown }).ok === "boolean"
360
+ ) {
361
+ return;
362
+ }
334
363
  daemonLog.warn("control-channel: malformed frame", { frame });
335
364
  return;
336
365
  }
package/src/provision.ts CHANGED
@@ -903,15 +903,39 @@ export function removeAgentFromConfig(
903
903
  // runtime-discovery snapshot (plan §8.5)
904
904
  // ---------------------------------------------------------------------------
905
905
 
906
/**
 * Lifetime, in milliseconds, of the cached L1 runtime-detection snapshot.
 *
 * `detectRuntimes()` shells out to each adapter binary (claude / codex /
 * gemini / openclaw / hermes) to read `--version`, which routinely costs
 * 1.5–2s in aggregate — long enough to push `list_runtimes` past the Hub's
 * 10s ack budget when combined with the 3s openclaw gateway probe. Versions
 * don't change between dashboard refresh clicks, so the snapshot is kept
 * briefly and recomputed on a miss.
 */
const RUNTIME_PROBE_CACHE_TTL_MS = 30 * 1000;

/** Last cached snapshot and the time it was stored, or `null` when nothing is cached. */
let _runtimeProbeCache: { at: number; value: ListRuntimesResult } | null = null;

/**
 * Forget the cached runtime snapshot so the next collection re-probes
 * (e.g. before a `doctor`-style interactive re-probe).
 */
export function clearRuntimeProbeCache(): void {
  _runtimeProbeCache = null;
}
922
+
906
923
  /**
907
924
  * Probe every registered adapter and shape the result as the wire-level
908
925
  * {@link ListRuntimesResult} — used by both the `list_runtimes` ack path and
909
926
  * the daemon-side first-connect `runtime_snapshot` push in `daemon.ts`.
910
927
  *
911
- * Kept pure: the only side effects are `detectRuntimes()` itself (which the
912
- * gateway already isolates from throwing) and reading the wall clock.
928
+ * Cached for {@link RUNTIME_PROBE_CACHE_TTL_MS}; pass `{ force: true }` to
929
+ * bypass the cache.
913
930
  */
914
- export function collectRuntimeSnapshot(): ListRuntimesResult {
931
+ export function collectRuntimeSnapshot(opts: { force?: boolean } = {}): ListRuntimesResult {
932
+ if (
933
+ !opts.force &&
934
+ _runtimeProbeCache &&
935
+ Date.now() - _runtimeProbeCache.at < RUNTIME_PROBE_CACHE_TTL_MS
936
+ ) {
937
+ return _runtimeProbeCache.value;
938
+ }
915
939
  const entries = detectRuntimes();
916
940
  const runtimes: RuntimeProbeResult[] = entries.map((entry) => {
917
941
  const record: RuntimeProbeResult = {
@@ -929,7 +953,9 @@ export function collectRuntimeSnapshot(): ListRuntimesResult {
929
953
  // enough; filling a synthetic message would be misleading.
930
954
  return record;
931
955
  });
932
- return { runtimes, probedAt: Date.now() };
956
+ const value: ListRuntimesResult = { runtimes, probedAt: Date.now() };
957
+ _runtimeProbeCache = { at: Date.now(), value };
958
+ return value;
933
959
  }
934
960
 
935
961
  /** Maximum number of `endpoints[]` entries persisted per runtime (RFC §3.8.2). */
@@ -1208,7 +1234,7 @@ export async function collectRuntimeSnapshotAsync(opts: {
1208
1234
  const gateways = opts.cfg?.openclawGateways ?? [];
1209
1235
  if (gateways.length === 0) return base;
1210
1236
  // Default daemon-side budget is 3s — it must stay below the Hub's
1211
- // `list_runtimes` ack wait (5s, see backend/hub/routers/daemon_control.py)
1237
+ // `list_runtimes` ack wait (10s, see backend/hub/routers/daemon_control.py)
1212
1238
  // so a single slow gateway can't blow the whole snapshot to a 504.
1213
1239
  const timeoutMs = opts.timeoutMs ?? 3000;
1214
1240
  const capped = gateways.slice(0, RUNTIME_ENDPOINTS_CAP);