@botcord/daemon 0.2.10 → 0.2.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/control-channel.js +28 -1
- package/dist/provision.d.ts +7 -3
- package/dist/provision.js +26 -5
- package/package.json +1 -1
- package/src/__tests__/runtime-discovery.test.ts +8 -2
- package/src/control-channel.ts +30 -1
- package/src/provision.ts +31 -5
package/dist/control-channel.js
CHANGED
|
@@ -12,7 +12,14 @@ import { log as daemonLog } from "./log.js";
|
|
|
12
12
|
import { AuthRefreshRejectedError, writeAuthExpiredFlag, } from "./user-auth.js";
|
|
13
13
|
/** Exponential backoff plan for transient disconnects. */
|
|
14
14
|
const RECONNECT_BACKOFF_MS = [1000, 2000, 4000, 8000, 16000, 30000];
|
|
15
|
-
|
|
15
|
+
/**
|
|
16
|
+
* Keepalive cadence. Has to stay below the smallest idle-timeout in any
|
|
17
|
+
* intermediary on the daemon → Hub WS path. Cloudflare and AWS ALB both
|
|
18
|
+
* default to ~60s of idle without app-level data, and some tunnels strip
|
|
19
|
+
* WS-level ping/pong control frames entirely — hence we send an app-level
|
|
20
|
+
* `pong` heartbeat alongside `ws.ping()` rather than relying on it alone.
|
|
21
|
+
*/
|
|
22
|
+
const KEEPALIVE_INTERVAL_MS = 20_000;
|
|
16
23
|
const REPLAY_DEDUPE_CAP = 256;
|
|
17
24
|
/**
|
|
18
25
|
* Build the canonical signing input for a control frame: RFC 8785 (JCS)
|
|
@@ -198,12 +205,24 @@ export class ControlChannel {
|
|
|
198
205
|
const ws = this.ws;
|
|
199
206
|
if (!ws || ws.readyState !== WebSocket.OPEN)
|
|
200
207
|
return;
|
|
208
|
+
// WS-level ping for normal cases.
|
|
201
209
|
try {
|
|
202
210
|
ws.ping();
|
|
203
211
|
}
|
|
204
212
|
catch {
|
|
205
213
|
// ignore — next failed send will trigger close
|
|
206
214
|
}
|
|
215
|
+
// App-level heartbeat: a `pong` daemon-initiated frame. Hub recognizes
|
|
216
|
+
// it via `_DAEMON_INITIATED_TYPES` and bumps `last_seen_at`. Critical
|
|
217
|
+
// when an intermediary (Cloudflare, AWS ALB, some k8s ingresses)
|
|
218
|
+
// drops WS-level control frames — those proxies idle-close the WS at
|
|
219
|
+
// ~60s without app-level activity, masquerading as a clean 1006 to
|
|
220
|
+
// both peers.
|
|
221
|
+
this.send({
|
|
222
|
+
id: `hb_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
|
|
223
|
+
type: "pong",
|
|
224
|
+
ts: Date.now(),
|
|
225
|
+
});
|
|
207
226
|
}, this.keepaliveMs);
|
|
208
227
|
}
|
|
209
228
|
stopKeepalive() {
|
|
@@ -271,6 +290,14 @@ export class ControlChannel {
|
|
|
271
290
|
return;
|
|
272
291
|
}
|
|
273
292
|
if (!frame || typeof frame.id !== "string" || typeof frame.type !== "string") {
|
|
293
|
+
// Hub ack responses for daemon-initiated frames (runtime_snapshot push,
|
|
294
|
+
// heartbeat, etc.) carry `{id, ok}` and no `type`. They're expected,
|
|
295
|
+
// not malformed — drop silently. Anything else stays a warn.
|
|
296
|
+
if (frame &&
|
|
297
|
+
typeof frame.id === "string" &&
|
|
298
|
+
typeof frame.ok === "boolean") {
|
|
299
|
+
return;
|
|
300
|
+
}
|
|
274
301
|
daemonLog.warn("control-channel: malformed frame", { frame });
|
|
275
302
|
return;
|
|
276
303
|
}
|
package/dist/provision.d.ts
CHANGED
|
@@ -56,15 +56,19 @@ export declare function adoptDiscoveredOpenclawAgents(ctx: {
|
|
|
56
56
|
export declare function addAgentToConfig(cfg: DaemonConfig, agentId: string): DaemonConfig | null;
|
|
57
57
|
/** Inverse of {@link addAgentToConfig}. Returns `null` on no-op. */
|
|
58
58
|
export declare function removeAgentFromConfig(cfg: DaemonConfig, agentId: string): DaemonConfig | null;
|
|
59
|
+
/** Drop the cache (e.g. before a `doctor`-style interactive re-probe). */
|
|
60
|
+
export declare function clearRuntimeProbeCache(): void;
|
|
59
61
|
/**
|
|
60
62
|
* Probe every registered adapter and shape the result as the wire-level
|
|
61
63
|
* {@link ListRuntimesResult} — used by both the `list_runtimes` ack path and
|
|
62
64
|
* the daemon-side first-connect `runtime_snapshot` push in `daemon.ts`.
|
|
63
65
|
*
|
|
64
|
-
*
|
|
65
|
-
*
|
|
66
|
+
* Cached for {@link RUNTIME_PROBE_CACHE_TTL_MS}; pass `{ force: true }` to
|
|
67
|
+
* bypass the cache.
|
|
66
68
|
*/
|
|
67
|
-
export declare function collectRuntimeSnapshot(
|
|
69
|
+
export declare function collectRuntimeSnapshot(opts?: {
|
|
70
|
+
force?: boolean;
|
|
71
|
+
}): ListRuntimesResult;
|
|
68
72
|
/** Maximum number of `endpoints[]` entries persisted per runtime (RFC §3.8.2). */
|
|
69
73
|
export declare const RUNTIME_ENDPOINTS_CAP = 32;
|
|
70
74
|
/** Injection seam for L2 + L3 endpoint probes — kept testable + side-effect-free. */
|
package/dist/provision.js
CHANGED
|
@@ -768,15 +768,34 @@ export function removeAgentFromConfig(cfg, agentId) {
|
|
|
768
768
|
// ---------------------------------------------------------------------------
|
|
769
769
|
// runtime-discovery snapshot (plan §8.5)
|
|
770
770
|
// ---------------------------------------------------------------------------
|
|
771
|
+
/**
|
|
772
|
+
* TTL for the L1 runtime-detection cache. `detectRuntimes()` shells out to
|
|
773
|
+
* each adapter binary (claude / codex / gemini / openclaw / hermes) to read
|
|
774
|
+
* `--version`, which routinely costs 1.5–2s in aggregate — long enough to
|
|
775
|
+
* push `list_runtimes` past the Hub's 10s ack budget when combined with the
|
|
776
|
+
* 3s openclaw gateway probe. Versions don't change between dashboard refresh
|
|
777
|
+
* clicks, so cache the L1 snapshot briefly and recompute on miss.
|
|
778
|
+
*/
|
|
779
|
+
const RUNTIME_PROBE_CACHE_TTL_MS = 30_000;
|
|
780
|
+
let _runtimeProbeCache = null;
|
|
781
|
+
/** Drop the cache (e.g. before a `doctor`-style interactive re-probe). */
|
|
782
|
+
export function clearRuntimeProbeCache() {
|
|
783
|
+
_runtimeProbeCache = null;
|
|
784
|
+
}
|
|
771
785
|
/**
|
|
772
786
|
* Probe every registered adapter and shape the result as the wire-level
|
|
773
787
|
* {@link ListRuntimesResult} — used by both the `list_runtimes` ack path and
|
|
774
788
|
* the daemon-side first-connect `runtime_snapshot` push in `daemon.ts`.
|
|
775
789
|
*
|
|
776
|
-
*
|
|
777
|
-
*
|
|
790
|
+
* Cached for {@link RUNTIME_PROBE_CACHE_TTL_MS}; pass `{ force: true }` to
|
|
791
|
+
* bypass the cache.
|
|
778
792
|
*/
|
|
779
|
-
export function collectRuntimeSnapshot() {
|
|
793
|
+
export function collectRuntimeSnapshot(opts = {}) {
|
|
794
|
+
if (!opts.force &&
|
|
795
|
+
_runtimeProbeCache &&
|
|
796
|
+
Date.now() - _runtimeProbeCache.at < RUNTIME_PROBE_CACHE_TTL_MS) {
|
|
797
|
+
return _runtimeProbeCache.value;
|
|
798
|
+
}
|
|
780
799
|
const entries = detectRuntimes();
|
|
781
800
|
const runtimes = entries.map((entry) => {
|
|
782
801
|
const record = {
|
|
@@ -796,7 +815,9 @@ export function collectRuntimeSnapshot() {
|
|
|
796
815
|
// enough; filling a synthetic message would be misleading.
|
|
797
816
|
return record;
|
|
798
817
|
});
|
|
799
|
-
|
|
818
|
+
const value = { runtimes, probedAt: Date.now() };
|
|
819
|
+
_runtimeProbeCache = { at: Date.now(), value };
|
|
820
|
+
return value;
|
|
800
821
|
}
|
|
801
822
|
/** Maximum number of `endpoints[]` entries persisted per runtime (RFC §3.8.2). */
|
|
802
823
|
export const RUNTIME_ENDPOINTS_CAP = 32;
|
|
@@ -1024,7 +1045,7 @@ export async function collectRuntimeSnapshotAsync(opts = {}) {
|
|
|
1024
1045
|
if (gateways.length === 0)
|
|
1025
1046
|
return base;
|
|
1026
1047
|
// Default daemon-side budget is 3s — it must stay below the Hub's
|
|
1027
|
-
// `list_runtimes` ack wait (
|
|
1048
|
+
// `list_runtimes` ack wait (10s, see backend/hub/routers/daemon_control.py)
|
|
1028
1049
|
// so a single slow gateway can't blow the whole snapshot to a 504.
|
|
1029
1050
|
const timeoutMs = opts.timeoutMs ?? 3000;
|
|
1030
1051
|
const capped = gateways.slice(0, RUNTIME_ENDPOINTS_CAP);
|
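package/package.json
CHANGED
[hunk not captured in this extract; per the summary above it is +1 -1, presumably the version bump 0.2.10 → 0.2.11]
package/src/__tests__/runtime-discovery.test.ts
CHANGED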
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { describe, expect, it, vi } from "vitest";
|
|
1
|
+
import { beforeEach, describe, expect, it, vi } from "vitest";
|
|
2
2
|
|
|
3
3
|
// Hoisted mock for `../adapters/runtimes.js` so each suite can stub
|
|
4
4
|
// `detectRuntimes()` independently — we want coverage of the "empty
|
|
@@ -24,7 +24,13 @@ vi.mock("../adapters/runtimes.js", async () => {
|
|
|
24
24
|
};
|
|
25
25
|
});
|
|
26
26
|
|
|
27
|
-
const { collectRuntimeSnapshot, createProvisioner } = await import("../provision.js");
|
|
27
|
+
const { collectRuntimeSnapshot, clearRuntimeProbeCache, createProvisioner } = await import("../provision.js");
|
|
28
|
+
|
|
29
|
+
beforeEach(() => {
|
|
30
|
+
// The L1 probe is memoized for 30s in production; tests rotate the
|
|
31
|
+
// mocked runtime list between cases, so reset before each.
|
|
32
|
+
clearRuntimeProbeCache();
|
|
33
|
+
});
|
|
28
34
|
const { pushRuntimeSnapshot } = await import("../daemon.js");
|
|
29
35
|
const { CONTROL_FRAME_TYPES } = await import("@botcord/protocol-core");
|
|
30
36
|
import type { GatewayChannelConfig, GatewayRuntimeSnapshot } from "../gateway/index.js";
|
package/src/control-channel.ts
CHANGED
|
@@ -25,7 +25,14 @@ import {
|
|
|
25
25
|
|
|
26
26
|
/** Exponential backoff plan for transient disconnects. */
|
|
27
27
|
const RECONNECT_BACKOFF_MS = [1000, 2000, 4000, 8000, 16000, 30000];
|
|
28
|
-
|
|
28
|
+
/**
|
|
29
|
+
* Keepalive cadence. Has to stay below the smallest idle-timeout in any
|
|
30
|
+
* intermediary on the daemon → Hub WS path. Cloudflare and AWS ALB both
|
|
31
|
+
* default to ~60s of idle without app-level data, and some tunnels strip
|
|
32
|
+
* WS-level ping/pong control frames entirely — hence we send an app-level
|
|
33
|
+
* `pong` heartbeat alongside `ws.ping()` rather than relying on it alone.
|
|
34
|
+
*/
|
|
35
|
+
const KEEPALIVE_INTERVAL_MS = 20_000;
|
|
29
36
|
const REPLAY_DEDUPE_CAP = 256;
|
|
30
37
|
|
|
31
38
|
/**
|
|
@@ -258,11 +265,23 @@ export class ControlChannel {
|
|
|
258
265
|
this.keepaliveTimer = setInterval(() => {
|
|
259
266
|
const ws = this.ws;
|
|
260
267
|
if (!ws || ws.readyState !== WebSocket.OPEN) return;
|
|
268
|
+
// WS-level ping for normal cases.
|
|
261
269
|
try {
|
|
262
270
|
ws.ping();
|
|
263
271
|
} catch {
|
|
264
272
|
// ignore — next failed send will trigger close
|
|
265
273
|
}
|
|
274
|
+
// App-level heartbeat: a `pong` daemon-initiated frame. Hub recognizes
|
|
275
|
+
// it via `_DAEMON_INITIATED_TYPES` and bumps `last_seen_at`. Critical
|
|
276
|
+
// when an intermediary (Cloudflare, AWS ALB, some k8s ingresses)
|
|
277
|
+
// drops WS-level control frames — those proxies idle-close the WS at
|
|
278
|
+
// ~60s without app-level activity, masquerading as a clean 1006 to
|
|
279
|
+
// both peers.
|
|
280
|
+
this.send({
|
|
281
|
+
id: `hb_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`,
|
|
282
|
+
type: "pong",
|
|
283
|
+
ts: Date.now(),
|
|
284
|
+
});
|
|
266
285
|
}, this.keepaliveMs);
|
|
267
286
|
}
|
|
268
287
|
|
|
@@ -331,6 +350,16 @@ export class ControlChannel {
|
|
|
331
350
|
return;
|
|
332
351
|
}
|
|
333
352
|
if (!frame || typeof frame.id !== "string" || typeof frame.type !== "string") {
|
|
353
|
+
// Hub ack responses for daemon-initiated frames (runtime_snapshot push,
|
|
354
|
+
// heartbeat, etc.) carry `{id, ok}` and no `type`. They're expected,
|
|
355
|
+
// not malformed — drop silently. Anything else stays a warn.
|
|
356
|
+
if (
|
|
357
|
+
frame &&
|
|
358
|
+
typeof (frame as { id?: unknown }).id === "string" &&
|
|
359
|
+
typeof (frame as { ok?: unknown }).ok === "boolean"
|
|
360
|
+
) {
|
|
361
|
+
return;
|
|
362
|
+
}
|
|
334
363
|
daemonLog.warn("control-channel: malformed frame", { frame });
|
|
335
364
|
return;
|
|
336
365
|
}
|
package/src/provision.ts
CHANGED
|
@@ -903,15 +903,39 @@ export function removeAgentFromConfig(
|
|
|
903
903
|
// runtime-discovery snapshot (plan §8.5)
|
|
904
904
|
// ---------------------------------------------------------------------------
|
|
905
905
|
|
|
906
|
+
/**
|
|
907
|
+
* TTL for the L1 runtime-detection cache. `detectRuntimes()` shells out to
|
|
908
|
+
* each adapter binary (claude / codex / gemini / openclaw / hermes) to read
|
|
909
|
+
* `--version`, which routinely costs 1.5–2s in aggregate — long enough to
|
|
910
|
+
* push `list_runtimes` past the Hub's 10s ack budget when combined with the
|
|
911
|
+
* 3s openclaw gateway probe. Versions don't change between dashboard refresh
|
|
912
|
+
* clicks, so cache the L1 snapshot briefly and recompute on miss.
|
|
913
|
+
*/
|
|
914
|
+
const RUNTIME_PROBE_CACHE_TTL_MS = 30_000;
|
|
915
|
+
|
|
916
|
+
let _runtimeProbeCache: { at: number; value: ListRuntimesResult } | null = null;
|
|
917
|
+
|
|
918
|
+
/** Drop the cache (e.g. before a `doctor`-style interactive re-probe). */
|
|
919
|
+
export function clearRuntimeProbeCache(): void {
|
|
920
|
+
_runtimeProbeCache = null;
|
|
921
|
+
}
|
|
922
|
+
|
|
906
923
|
/**
|
|
907
924
|
* Probe every registered adapter and shape the result as the wire-level
|
|
908
925
|
* {@link ListRuntimesResult} — used by both the `list_runtimes` ack path and
|
|
909
926
|
* the daemon-side first-connect `runtime_snapshot` push in `daemon.ts`.
|
|
910
927
|
*
|
|
911
|
-
*
|
|
912
|
-
*
|
|
928
|
+
* Cached for {@link RUNTIME_PROBE_CACHE_TTL_MS}; pass `{ force: true }` to
|
|
929
|
+
* bypass the cache.
|
|
913
930
|
*/
|
|
914
|
-
export function collectRuntimeSnapshot(): ListRuntimesResult {
|
|
931
|
+
export function collectRuntimeSnapshot(opts: { force?: boolean } = {}): ListRuntimesResult {
|
|
932
|
+
if (
|
|
933
|
+
!opts.force &&
|
|
934
|
+
_runtimeProbeCache &&
|
|
935
|
+
Date.now() - _runtimeProbeCache.at < RUNTIME_PROBE_CACHE_TTL_MS
|
|
936
|
+
) {
|
|
937
|
+
return _runtimeProbeCache.value;
|
|
938
|
+
}
|
|
915
939
|
const entries = detectRuntimes();
|
|
916
940
|
const runtimes: RuntimeProbeResult[] = entries.map((entry) => {
|
|
917
941
|
const record: RuntimeProbeResult = {
|
|
@@ -929,7 +953,9 @@ export function collectRuntimeSnapshot(): ListRuntimesResult {
|
|
|
929
953
|
// enough; filling a synthetic message would be misleading.
|
|
930
954
|
return record;
|
|
931
955
|
});
|
|
932
|
-
|
|
956
|
+
const value: ListRuntimesResult = { runtimes, probedAt: Date.now() };
|
|
957
|
+
_runtimeProbeCache = { at: Date.now(), value };
|
|
958
|
+
return value;
|
|
933
959
|
}
|
|
934
960
|
|
|
935
961
|
/** Maximum number of `endpoints[]` entries persisted per runtime (RFC §3.8.2). */
|
|
@@ -1208,7 +1234,7 @@ export async function collectRuntimeSnapshotAsync(opts: {
|
|
|
1208
1234
|
const gateways = opts.cfg?.openclawGateways ?? [];
|
|
1209
1235
|
if (gateways.length === 0) return base;
|
|
1210
1236
|
// Default daemon-side budget is 3s — it must stay below the Hub's
|
|
1211
|
-
// `list_runtimes` ack wait (
|
|
1237
|
+
// `list_runtimes` ack wait (10s, see backend/hub/routers/daemon_control.py)
|
|
1212
1238
|
// so a single slow gateway can't blow the whole snapshot to a 504.
|
|
1213
1239
|
const timeoutMs = opts.timeoutMs ?? 3000;
|
|
1214
1240
|
const capped = gateways.slice(0, RUNTIME_ENDPOINTS_CAP);
|