@pleri/olam-cli 0.1.196 → 0.1.198
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -0
- package/dist/ask/knowledge-pack.generated.d.ts.map +1 -1
- package/dist/ask/knowledge-pack.generated.js +10 -8
- package/dist/ask/knowledge-pack.generated.js.map +1 -1
- package/dist/commands/auth-list-json.d.ts +34 -0
- package/dist/commands/auth-list-json.d.ts.map +1 -1
- package/dist/commands/auth-list-json.js +24 -0
- package/dist/commands/auth-list-json.js.map +1 -1
- package/dist/commands/auth-migrate.d.ts +212 -0
- package/dist/commands/auth-migrate.d.ts.map +1 -0
- package/dist/commands/auth-migrate.js +465 -0
- package/dist/commands/auth-migrate.js.map +1 -0
- package/dist/commands/auth.d.ts.map +1 -1
- package/dist/commands/auth.js +239 -184
- package/dist/commands/auth.js.map +1 -1
- package/dist/commands/bootstrap.d.ts +4 -0
- package/dist/commands/bootstrap.d.ts.map +1 -1
- package/dist/commands/bootstrap.js +6 -0
- package/dist/commands/bootstrap.js.map +1 -1
- package/dist/commands/dispatch.d.ts.map +1 -1
- package/dist/commands/dispatch.js +11 -1
- package/dist/commands/dispatch.js.map +1 -1
- package/dist/commands/doctor.d.ts +33 -0
- package/dist/commands/doctor.d.ts.map +1 -1
- package/dist/commands/doctor.js +299 -12
- package/dist/commands/doctor.js.map +1 -1
- package/dist/commands/kg-mirror.d.ts +18 -2
- package/dist/commands/kg-mirror.d.ts.map +1 -1
- package/dist/commands/kg-mirror.js +78 -3
- package/dist/commands/kg-mirror.js.map +1 -1
- package/dist/commands/mcp/complete.d.ts +36 -0
- package/dist/commands/mcp/complete.d.ts.map +1 -0
- package/dist/commands/mcp/complete.js +66 -0
- package/dist/commands/mcp/complete.js.map +1 -0
- package/dist/commands/mcp/index.d.ts +1 -1
- package/dist/commands/mcp/index.d.ts.map +1 -1
- package/dist/commands/mcp/index.js +3 -1
- package/dist/commands/mcp/index.js.map +1 -1
- package/dist/commands/memory/bridge.d.ts +1 -1
- package/dist/commands/memory/bridge.d.ts.map +1 -1
- package/dist/commands/memory/bridge.js +2 -6
- package/dist/commands/memory/bridge.js.map +1 -1
- package/dist/commands/memory/secret.d.ts.map +1 -1
- package/dist/commands/memory/secret.js +4 -3
- package/dist/commands/memory/secret.js.map +1 -1
- package/dist/commands/observe.d.ts +3 -3
- package/dist/commands/observe.d.ts.map +1 -1
- package/dist/commands/observe.js +11 -8
- package/dist/commands/observe.js.map +1 -1
- package/dist/commands/runbooks.d.ts.map +1 -1
- package/dist/commands/runbooks.js +77 -10
- package/dist/commands/runbooks.js.map +1 -1
- package/dist/commands/services-tls.d.ts.map +1 -1
- package/dist/commands/services-tls.js +41 -0
- package/dist/commands/services-tls.js.map +1 -1
- package/dist/commands/services.d.ts +35 -1
- package/dist/commands/services.d.ts.map +1 -1
- package/dist/commands/services.js +153 -32
- package/dist/commands/services.js.map +1 -1
- package/dist/commands/setup-phase-8-kg-hook.d.ts +48 -0
- package/dist/commands/setup-phase-8-kg-hook.d.ts.map +1 -0
- package/dist/commands/setup-phase-8-kg-hook.js +93 -0
- package/dist/commands/setup-phase-8-kg-hook.js.map +1 -0
- package/dist/commands/setup-phase-9-memory-bridge.d.ts +36 -0
- package/dist/commands/setup-phase-9-memory-bridge.d.ts.map +1 -0
- package/dist/commands/setup-phase-9-memory-bridge.js +59 -0
- package/dist/commands/setup-phase-9-memory-bridge.js.map +1 -0
- package/dist/commands/setup.d.ts +34 -1
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +328 -23
- package/dist/commands/setup.js.map +1 -1
- package/dist/commands/update.d.ts +24 -0
- package/dist/commands/update.d.ts.map +1 -1
- package/dist/commands/update.js +53 -0
- package/dist/commands/update.js.map +1 -1
- package/dist/commands/upgrade.d.ts +5 -0
- package/dist/commands/upgrade.d.ts.map +1 -1
- package/dist/commands/upgrade.js +31 -8
- package/dist/commands/upgrade.js.map +1 -1
- package/dist/image-digests.json +8 -8
- package/dist/index.js +4193 -2426
- package/dist/lib/auth-backend.d.ts +168 -0
- package/dist/lib/auth-backend.d.ts.map +1 -0
- package/dist/lib/auth-backend.js +172 -0
- package/dist/lib/auth-backend.js.map +1 -0
- package/dist/lib/auth-list-cache.d.ts +67 -0
- package/dist/lib/auth-list-cache.d.ts.map +1 -0
- package/dist/lib/auth-list-cache.js +84 -0
- package/dist/lib/auth-list-cache.js.map +1 -0
- package/dist/lib/auth-list.d.ts +107 -0
- package/dist/lib/auth-list.d.ts.map +1 -0
- package/dist/lib/auth-list.js +123 -0
- package/dist/lib/auth-list.js.map +1 -0
- package/dist/lib/auth-login.d.ts +92 -0
- package/dist/lib/auth-login.d.ts.map +1 -0
- package/dist/lib/auth-login.js +124 -0
- package/dist/lib/auth-login.js.map +1 -0
- package/dist/lib/auth-mutator-backend.d.ts +54 -0
- package/dist/lib/auth-mutator-backend.d.ts.map +1 -0
- package/dist/lib/auth-mutator-backend.js +62 -0
- package/dist/lib/auth-mutator-backend.js.map +1 -0
- package/dist/lib/auth-remote.d.ts +50 -0
- package/dist/lib/auth-remote.d.ts.map +1 -1
- package/dist/lib/auth-remote.js +84 -2
- package/dist/lib/auth-remote.js.map +1 -1
- package/dist/lib/bootstrap-kubernetes.d.ts +69 -10
- package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
- package/dist/lib/bootstrap-kubernetes.js +264 -46
- package/dist/lib/bootstrap-kubernetes.js.map +1 -1
- package/dist/lib/config.d.ts +7 -0
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/health-probes.d.ts +0 -22
- package/dist/lib/health-probes.d.ts.map +1 -1
- package/dist/lib/health-probes.js +23 -2
- package/dist/lib/health-probes.js.map +1 -1
- package/dist/lib/peripheral-registry.d.ts +11 -0
- package/dist/lib/peripheral-registry.d.ts.map +1 -1
- package/dist/lib/peripheral-registry.js +5 -0
- package/dist/lib/peripheral-registry.js.map +1 -1
- package/dist/lib/plans-client.d.ts.map +1 -1
- package/dist/lib/plans-client.js +6 -3
- package/dist/lib/plans-client.js.map +1 -1
- package/dist/mcp-server.js +14 -3
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/30-configmap.yaml +4 -0
- package/host-cp/k8s/manifests/50-deployment.yaml +13 -1
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/src/dispatch-persister.mjs +157 -0
- package/host-cp/src/pr-nanny.mjs +7 -0
- package/host-cp/src/server.mjs +175 -3
- package/host-cp/src/world-watchdog-pid-lookup.mjs +119 -0
- package/host-cp/src/world-watchdog-probes.mjs +271 -0
- package/host-cp/src/world-watchdog-recovery.mjs +192 -0
- package/host-cp/src/world-watchdog.mjs +313 -0
- package/package.json +1 -1
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* world-watchdog-recovery.mjs — recovery hook for wedged claude processes.
|
|
3
|
+
*
|
|
4
|
+
* Isolated from world-watchdog.mjs so kill + replay logic is independently
|
|
5
|
+
* mockable in tests without touching the watchdog's ticker.
|
|
6
|
+
*
|
|
7
|
+
* API:
|
|
8
|
+
* createRecovery({ autoRecoverMode, leakyBucket, broadcaster, persister,
|
|
9
|
+
* replay, processKill, log })
|
|
10
|
+
* → { onWedgedVerdict({ worldId, pid }): Promise<void> }
|
|
11
|
+
*
|
|
12
|
+
* Three modes (from compute.autoRecover in .olam/config.yaml):
|
|
13
|
+
* false — no-op; recovery never fires even on wedged verdict (DEFAULT)
|
|
14
|
+
* 'dry-run' — emits all breadcrumbs, never calls processKill or replay
|
|
15
|
+
* true — SIGKILL pid + read last-dispatch + replay; rate-limited
|
|
16
|
+
*
|
|
17
|
+
* Rate-limit: B2 leaky-bucket (3/hour/world). 4th wedge in window emits
|
|
18
|
+
* world.watchdog.recovery.budget_exhausted and skips all action.
|
|
19
|
+
*
|
|
20
|
+
* Replay stub: the `replay` dep is accepted as an injected function. In
|
|
21
|
+
* server.mjs it is wired to a console.warn stub + breadcrumb until the
|
|
22
|
+
* operator runs the B3 idempotence probe and signs off. See TODO below.
|
|
23
|
+
*
|
|
24
|
+
* @see docs/architecture/world-watchdog.md Recovery section
|
|
25
|
+
* @see packages/host-cp/src/lib/leaky-bucket.mjs
|
|
26
|
+
* @see packages/host-cp/src/dispatch-persister.mjs
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* @typedef {false|true|'dry-run'} AutoRecoverMode
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* @typedef {object} RecoveryDeps
|
|
35
|
+
* @property {false|true|'dry-run'} autoRecoverMode
|
|
36
|
+
* Passed from server.mjs which reads config.compute.autoRecover.
|
|
37
|
+
* Default false if config unavailable.
|
|
38
|
+
* @property {{ tryConsume(key: string): { allowed: boolean, retryAfterMs?: number, totalInWindow: number } }} leakyBucket
|
|
39
|
+
* B2 leaky-bucket instance. Keyed by worldId.
|
|
40
|
+
* @property {{ broadcast(type: string, payload: object): void }} [broadcaster]
|
|
41
|
+
* Host-stream broadcaster. Optional — when absent, breadcrumbs are skipped.
|
|
42
|
+
* @property {{ read({ worldId: string }): Promise<{ messageId: string, prompt: string, dispatchedAt: string, source: string } | null> }} persister
|
|
43
|
+
* B4 dispatch-persister read function.
|
|
44
|
+
* @property {(opts: { worldId: string, messageId: string, prompt: string }) => Promise<void>} replay
|
|
45
|
+
* Opaque dispatch helper. Injected dep — DO NOT implement dispatch here.
|
|
46
|
+
* In server.mjs this is wired to a stub until operator signs off on B3 probe.
|
|
47
|
+
* @property {(pid: number) => void} [processKill]
|
|
48
|
+
* process.kill indirection so tests can spy without actually killing.
|
|
49
|
+
* Defaults to process.kill.
|
|
50
|
+
* @property {(msg: string) => void} [log]
|
|
51
|
+
* Logger. Defaults to console.log with [world-watchdog-recovery] prefix.
|
|
52
|
+
*/
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* @typedef {object} RecoveryHandle
|
|
56
|
+
* @property {(opts: { worldId: string, pid: number|null }) => Promise<void>} onWedgedVerdict
|
|
57
|
+
*/
|
|
58
|
+
|
|
59
|
+
/**
 * Create a recovery handle for wedged processes.
 *
 * The returned handle exposes a single async method, `onWedgedVerdict`, which
 * (depending on `autoRecoverMode`) rate-limits, reads the last persisted
 * dispatch, SIGKILLs the given pid via `processKill`, and hands the last
 * dispatch to the injected `replay` function. Every step emits a
 * `world.watchdog.recovery.*` breadcrumb through `broadcaster` when present.
 *
 * Safety rules enforced here:
 *  - Recovery is fail-closed: only the exact values `true` and `'dry-run'`
 *    enable any action. Every other value (including strings such as
 *    `'false'` or `'true'`) behaves like `false`.
 *  - Only positive integer pids are ever passed to `processKill`. On POSIX,
 *    kill(2) treats pid 0 and negative pids as process-group targets, so
 *    forwarding them could signal far more than the wedged process.
 *
 * @param {RecoveryDeps} deps
 * @returns {RecoveryHandle}
 */
export function createRecovery({
  autoRecoverMode = false,
  leakyBucket,
  broadcaster = null,
  persister,
  replay,
  processKill = (pid) => process.kill(pid, 'SIGKILL'),
  log = (m) => console.log(`[world-watchdog-recovery] ${m}`),
} = {}) {
  const isLive = autoRecoverMode === true;
  const isDryRun = autoRecoverMode === 'dry-run';

  // Surface misconfiguration once, at construction time, instead of silently
  // treating an unknown value as "kill for real".
  if (!isLive && !isDryRun && autoRecoverMode !== false) {
    log(`unrecognised autoRecoverMode=${String(autoRecoverMode)}; recovery disabled`);
  }

  /**
   * Emit a breadcrumb via broadcaster (fail-soft).
   *
   * A missing broadcaster is a silent no-op; a throwing broadcaster is logged
   * and swallowed so breadcrumb failures never abort recovery.
   *
   * @param {string} type
   * @param {object} payload
   */
  function broadcast(type, payload) {
    if (!broadcaster || typeof broadcaster.broadcast !== 'function') return;
    try {
      broadcaster.broadcast(type, payload);
    } catch (err) {
      log(`broadcast ${type} failed: ${err?.message ?? err}`);
    }
  }

  /**
   * Handle a wedged verdict for a world.
   *
   * Resolves without throwing for every failure inside the kill/replay phase
   * (those are reported via `world.watchdog.recovery.failed`). A throwing
   * `leakyBucket.tryConsume` is not caught here and rejects the promise.
   *
   * @param {{ worldId: string, pid: number|null }} opts
   * @returns {Promise<void>}
   */
  async function onWedgedVerdict({ worldId, pid }) {
    // Detection-only unless the mode is exactly true or 'dry-run'.
    if (!isLive && !isDryRun) return;

    // No PID resolved yet — skip silently; there is nothing to kill.
    if (pid == null) return;

    // Never forward 0, negative, or non-integer pids to processKill.
    if (!Number.isInteger(pid) || pid <= 0) {
      log(`worldId=${worldId}: refusing recovery for invalid pid=${String(pid)}`);
      return;
    }

    // Rate-limit gate (consumed in dry-run too, so dry-run mirrors live budget).
    const bucket = leakyBucket.tryConsume(worldId);
    if (!bucket.allowed) {
      broadcast('world.watchdog.recovery.budget_exhausted', {
        worldId,
        retryAfterMs: bucket.retryAfterMs,
        totalInWindow: bucket.totalInWindow,
      });
      log(`worldId=${worldId}: budget exhausted (${bucket.totalInWindow} in window); skipping recovery`);
      return;
    }

    // Read last persisted dispatch for replay. A read failure is logged and
    // treated the same as "no last dispatch".
    let lastDispatch = null;
    try {
      lastDispatch = await persister.read({ worldId });
    } catch (err) {
      log(`worldId=${worldId}: persister.read failed: ${err?.message ?? err}`);
    }

    broadcast('world.watchdog.recovery.start', {
      worldId,
      pid,
      mode: autoRecoverMode,
      lastDispatchMessageId: lastDispatch?.messageId ?? null,
    });

    // dry-run — log planned action but do NOT kill or replay.
    if (isDryRun) {
      log(`worldId=${worldId}: dry-run — would SIGKILL pid=${pid}${lastDispatch ? ` + replay messageId=${lastDispatch.messageId}` : ' (no last-dispatch)'}`);
      broadcast('world.watchdog.recovery.complete', {
        worldId,
        pid,
        mode: 'dry-run',
        replayed: false,
      });
      return;
    }

    // mode=true — act.
    try {
      // 1. SIGKILL the wedged process.
      processKill(pid);
      log(`worldId=${worldId}: SIGKILL sent to pid=${pid}`);

      // 2. Replay or note absence of last-dispatch.
      if (!lastDispatch) {
        broadcast('world.watchdog.recovery.restart_without_replay', {
          worldId,
          pid,
        });
        log(`worldId=${worldId}: no last-dispatch; killed without replay`);
      } else {
        // TODO: wire real replay once operator has run the B3 idempotence probe
        // and confirmed dispatch is idempotent for the substrates in use.
        // Until then this stub logs and emits a breadcrumb so the stub path
        // is visible in production logs. See B3 probe + operator review gate B6.
        broadcast('world.watchdog.recovery.replay_stub', {
          worldId,
          prompt: lastDispatch.prompt,
        });
        log(`worldId=${worldId}: replay stub hit — real replay deferred pending B3 sign-off`);
        await replay({
          worldId,
          messageId: lastDispatch.messageId,
          prompt: lastDispatch.prompt,
        });
      }

      broadcast('world.watchdog.recovery.complete', {
        worldId,
        pid,
        mode: true,
        replayed: !!lastDispatch,
      });
    } catch (err) {
      // Covers processKill failures (e.g. the pid already exited) and replay
      // rejections alike.
      log(`worldId=${worldId}: recovery failed: ${err?.message ?? err}`);
      broadcast('world.watchdog.recovery.failed', {
        worldId,
        pid,
        error: err?.message ?? String(err),
      });
    }
  }

  return { onWedgedVerdict };
}
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* world-watchdog.mjs — periodic watchdog that probes each active world's
|
|
3
|
+
* `claude` PID for the three wedge signals (wchan + CLOSE_WAIT + CPU) and
|
|
4
|
+
* emits `world.watchdog.tick` events on the host-stream broadcaster.
|
|
5
|
+
*
|
|
6
|
+
* Design:
|
|
7
|
+
* - Mirrors `world-activity-tracker.mjs` shape exactly: `startWorldWatchdog(deps)`
|
|
8
|
+
* returns `{ stop, tickNow, getVerdict }`.
|
|
9
|
+
* - Per-world 2-tick confirm: a `'wedged'` classification is only emitted
|
|
10
|
+
* after TWO consecutive ticks with the wedge signature. A single-tick
|
|
11
|
+
* wedge emits `'suspect'`. A healthy tick resets the streak.
|
|
12
|
+
* - Per-world fail-soft: a probe error for one world never skips other worlds.
|
|
13
|
+
* - `OLAM_WORLD_WATCHDOG_DISABLED=1` → `startWorldWatchdog()` is a no-op (returns stub).
|
|
14
|
+
* - Cadence: `OLAM_WORLD_WATCHDOG_TICK_MS` env or `intervalMs` dep (default 30_000).
|
|
15
|
+
*
|
|
16
|
+
* v1 stub: `getClaudePidForWorld(worldId)` returns null for all worlds in
|
|
17
|
+
* Phase A. When null, the tick still fires but all probe signals are null,
|
|
18
|
+
* producing `verdict: 'unknown'`. Real PID lookup (docker inspect →
|
|
19
|
+
* /proc/<hostPid>/status NSpid field) is wired in a follow-up.
|
|
20
|
+
* This is documented here and in docs/architecture/world-watchdog.md.
|
|
21
|
+
*
|
|
22
|
+
* Wire-in: `server.mjs` constructs once after broadcaster is ready and calls
|
|
23
|
+
* `.stop()` from the SIGTERM/SIGINT handler. Gated on `!SERVE_ONLY`.
|
|
24
|
+
*
|
|
25
|
+
* @see docs/architecture/world-watchdog.md
|
|
26
|
+
* @see packages/host-cp/src/world-watchdog-probes.mjs
|
|
27
|
+
* @see packages/host-cp/src/world-activity-tracker.mjs (shape reference)
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
import {
|
|
31
|
+
readWchan,
|
|
32
|
+
readCloseWaitSockets,
|
|
33
|
+
readCpuPercent,
|
|
34
|
+
classify,
|
|
35
|
+
} from './world-watchdog-probes.mjs';
|
|
36
|
+
// Recovery hook (B5). Optional dep — when absent (recovery is null/undefined),
|
|
37
|
+
// the watchdog behaves exactly as Phase A: detection-only, no kill, no replay.
|
|
38
|
+
// Wire via startWorldWatchdog({ recovery: createRecovery({...}) }) in server.mjs.
|
|
39
|
+
|
|
40
|
+
const DEFAULT_TICK_MS = 30_000;
|
|
41
|
+
// CPU measurement window: shorter than the tick cadence so we don't overlap.
|
|
42
|
+
const CPU_WINDOW_MS = 500;
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* @typedef {object} WorldWatchdogDeps
|
|
46
|
+
* @property {object} [broadcaster] Object with `.broadcast(type, payload)`.
|
|
47
|
+
* Optional — when absent events are skipped but state tracking still works.
|
|
48
|
+
* @property {number} [intervalMs] Tick cadence in ms. Defaults to
|
|
49
|
+
* `OLAM_WORLD_WATCHDOG_TICK_MS` env or 30_000.
|
|
50
|
+
* @property {() => Promise<string[]>} [listActiveWorlds]
|
|
51
|
+
* Returns an array of active world IDs to probe each tick.
|
|
52
|
+
* Defaults to returning [].
|
|
53
|
+
* @property {(worldId: string) => Promise<number|null>} [getClaudePidForWorld]
|
|
54
|
+
* Returns the host-side PID of the claude process for a world, or null.
|
|
55
|
+
* v1 default: always returns null (all worlds → verdict 'unknown').
|
|
56
|
+
* @property {{ procRoot?: string }} [probes]
|
|
57
|
+
* Injectable probe options (procRoot for tests).
|
|
58
|
+
* @property {{ onWedgedVerdict(opts: { worldId: string, pid: number|null }): Promise<void> }} [recovery]
|
|
59
|
+
* Optional recovery handle (from world-watchdog-recovery.mjs). When present,
|
|
60
|
+
* called once on verdict-transition to 'wedged' (suspect → wedged), NOT on
|
|
61
|
+
* steady-state re-wedge. When absent, detection-only (Phase A behaviour).
|
|
62
|
+
* @property {(msg: string) => void} [log] Defaults to `console.log`.
|
|
63
|
+
* @property {(msg: string) => void} [debug] Defaults to no-op.
|
|
64
|
+
* @property {(cb: () => void, ms: number) => any} [setTimer]
|
|
65
|
+
* Injectable `setInterval` for tests.
|
|
66
|
+
* @property {(handle: any) => void} [clearTimer]
|
|
67
|
+
* Injectable `clearInterval` for tests.
|
|
68
|
+
* @property {() => Date} [now] Clock injection for tests.
|
|
69
|
+
*/
|
|
70
|
+
|
|
71
|
+
/**
|
|
72
|
+
* @typedef {object} WorldWatchdogHandle
|
|
73
|
+
* @property {() => void} stop
|
|
74
|
+
* @property {() => Promise<number>} tickNow Run one tick immediately (returns
|
|
75
|
+
* the count of worlds processed). Exposed for tests.
|
|
76
|
+
* @property {(worldId: string) => object|null} getVerdict
|
|
77
|
+
* Returns the latest in-memory verdict entry for a world, or null if no tick
|
|
78
|
+
* has fired yet. Used by the HTTP endpoint (A5).
|
|
79
|
+
*/
|
|
80
|
+
|
|
81
|
+
/**
|
|
82
|
+
* Per-world state tracked between ticks for the 2-tick confirm.
|
|
83
|
+
*
|
|
84
|
+
* @typedef {object} WorldWatchdogState
|
|
85
|
+
* @property {'healthy'|'suspect'|'wedged'|'unknown'} lastClassification
|
|
86
|
+
* The raw classification from the previous tick (before 2-tick confirm).
|
|
87
|
+
* @property {'healthy'|'suspect'|'wedged'|'unknown'} lastVerdict
|
|
88
|
+
* The emitted verdict (post-confirm).
|
|
89
|
+
* @property {string} lastTickAt ISO-8601 timestamp of last tick.
|
|
90
|
+
* @property {object|null} lastSignals The signals from the last tick.
|
|
91
|
+
* @property {number|null} lastPid The PID probed last tick.
|
|
92
|
+
*/
|
|
93
|
+
|
|
94
|
+
/**
 * Start the world watchdog. Returns a `{ stop, tickNow, getVerdict }` handle.
 *
 * Honoring `OLAM_WORLD_WATCHDOG_DISABLED=1`: if the env var is set, returns
 * a no-op stub immediately without starting the interval or making any probe
 * calls.
 *
 * Tick cadence is resolved in this order: a usable `deps.intervalMs`, then a
 * usable `OLAM_WORLD_WATCHDOG_TICK_MS`, then `DEFAULT_TICK_MS`. "Usable" means
 * a finite number in the range 1..2^31-1; Node's `setInterval` coerces
 * anything else (NaN, 0, negatives, oversized values) to a 1 ms delay, which
 * would turn the watchdog into a hot loop. Rejected values are logged.
 *
 * Each tick probes every world returned by `listActiveWorlds` sequentially.
 * A raw `'wedged'` classification is reported as `'suspect'` on its first
 * tick and as `'wedged'` only when the previous tick's raw classification
 * was also `'wedged'`. The optional recovery hook fires once, on the tick
 * where the emitted verdict becomes `'wedged'`.
 *
 * @param {WorldWatchdogDeps} [deps]
 * @returns {WorldWatchdogHandle}
 */
export function startWorldWatchdog(deps = {}) {
  // Honour kill switch — return a no-op stub.
  if (process.env.OLAM_WORLD_WATCHDOG_DISABLED === '1') {
    return {
      stop() {},
      tickNow: async () => 0,
      getVerdict: () => null,
    };
  }

  const log = deps.log ?? ((m) => console.log(`[world-watchdog] ${m}`));
  const debug = deps.debug ?? (() => {});
  const setTimer = deps.setTimer ?? ((cb, ms) => setInterval(cb, ms));
  const clearTimer = deps.clearTimer ?? ((h) => clearInterval(h));
  const now = deps.now ?? (() => new Date());

  // Largest delay setInterval honours; larger values are coerced to 1 ms.
  const MAX_TIMER_DELAY_MS = 2_147_483_647;
  const isUsableInterval = (ms) =>
    typeof ms === 'number' && Number.isFinite(ms) && ms >= 1 && ms <= MAX_TIMER_DELAY_MS;

  let intervalMs = deps.intervalMs;
  if (!isUsableInterval(intervalMs)) {
    if (intervalMs != null) {
      log(`ignoring invalid intervalMs=${String(intervalMs)}`);
    }
    const envRaw = process.env.OLAM_WORLD_WATCHDOG_TICK_MS;
    const envMs = envRaw === undefined ? Number.NaN : Number.parseInt(envRaw, 10);
    if (isUsableInterval(envMs)) {
      intervalMs = envMs;
    } else {
      if (envRaw !== undefined) {
        log(`ignoring invalid OLAM_WORLD_WATCHDOG_TICK_MS=${envRaw}`);
      }
      intervalMs = DEFAULT_TICK_MS;
    }
  }

  const broadcaster = deps.broadcaster ?? null;
  const listActiveWorlds = deps.listActiveWorlds ?? (async () => []);
  const getClaudePidForWorld = deps.getClaudePidForWorld ?? (async (_id) => null);
  const probeOpts = deps.probes ?? {};
  // Recovery hook — null when not configured (detection-only behaviour).
  const recovery = deps.recovery ?? null;

  // Per-world state map: worldId → WorldWatchdogState.
  /** @type {Map<string, WorldWatchdogState>} */
  const worldState = new Map();

  let stopped = false;
  let inFlight = false;
  let intervalHandle = null;

  /**
   * Probe a single world and update its state. Returns the verdict emitted.
   *
   * @param {string} worldId
   * @returns {Promise<'healthy'|'suspect'|'wedged'|'unknown'>}
   */
  async function probeWorld(worldId) {
    const pid = await getClaudePidForWorld(worldId);

    let wchan = null;
    let closeWaitSockets = [];
    let cpuPercent = null;

    if (pid !== null) {
      // The three probes are independent, so run them concurrently.
      [wchan, closeWaitSockets, cpuPercent] = await Promise.all([
        readWchan(pid, probeOpts),
        readCloseWaitSockets(pid, probeOpts),
        readCpuPercent(pid, CPU_WINDOW_MS, probeOpts),
      ]);
    }

    const closeWaitCount = closeWaitSockets.length;
    const signals = pid !== null
      ? { wchan, closeWaitCount, cpuPercent }
      : null;

    // Classify raw signals; without a pid there is nothing to classify.
    const rawClassification = pid !== null
      ? classify({ wchan, closeWaitCount, cpuPercent })
      : 'unknown';

    // 2-tick confirm: only emit 'wedged' if BOTH this tick AND the previous tick
    // classified as 'wedged'. Otherwise emit the raw classification.
    const prev = worldState.get(worldId);
    let verdict;
    if (rawClassification === 'wedged' && prev?.lastClassification === 'wedged') {
      verdict = 'wedged';
    } else if (rawClassification === 'wedged') {
      // First 'wedged' tick — emit 'suspect' (2-tick confirm pending).
      verdict = 'suspect';
    } else {
      verdict = rawClassification;
    }

    const tickAt = now().toISOString();

    // Update per-world state.
    worldState.set(worldId, {
      lastClassification: rawClassification,
      lastVerdict: verdict,
      lastTickAt: tickAt,
      lastSignals: signals,
      lastPid: pid,
    });

    // Recovery hook — fire ONCE on verdict-transition to 'wedged' (not on
    // steady-state re-wedge). Guard: prev?.lastVerdict !== 'wedged' ensures
    // only the suspect→wedged transition triggers, not wedged→wedged.
    if (
      verdict === 'wedged' &&
      recovery !== null &&
      prev?.lastVerdict !== 'wedged'
    ) {
      // Fire-and-forget. Promise.resolve + try/catch keeps this fail-soft even
      // if the hook throws synchronously or returns a non-promise, so the
      // tick event below is still emitted.
      try {
        void Promise.resolve(recovery.onWedgedVerdict({ worldId, pid })).catch((err) => {
          log(`recovery.onWedgedVerdict ${worldId} failed: ${err?.message ?? err}`);
        });
      } catch (err) {
        log(`recovery.onWedgedVerdict ${worldId} failed: ${err?.message ?? err}`);
      }
    }

    // Emit broadcaster event.
    if (broadcaster && typeof broadcaster.broadcast === 'function') {
      try {
        broadcaster.broadcast('world.watchdog.tick', {
          worldId,
          verdict,
          signals,
          pid,
          lastTickAt: tickAt,
        });
      } catch (err) {
        log(`broadcast ${worldId} failed: ${err?.message ?? err}`);
      }
    }

    return verdict;
  }

  /**
   * One tick: get active worlds, probe each, return count processed.
   *
   * Returns 0 without probing when the watchdog is stopped, when a previous
   * tick is still running, or when `listActiveWorlds` fails.
   *
   * @returns {Promise<number>}
   */
  async function tick() {
    if (stopped) return 0;
    if (inFlight) {
      debug('tick skipped: previous tick still in flight');
      return 0;
    }
    inFlight = true;

    let processed = 0;
    try {
      let worlds;
      try {
        worlds = await listActiveWorlds();
      } catch (err) {
        log(`listActiveWorlds failed: ${err?.message ?? err}`);
        return 0;
      }

      for (const worldId of worlds) {
        if (stopped) break;
        if (typeof worldId !== 'string') continue;

        try {
          await probeWorld(worldId);
          processed += 1;
        } catch (err) {
          // Per-world fail-soft: one bad world doesn't crash the loop.
          debug(`probe ${worldId} failed: ${err?.message ?? err}`);
        }
      }
    } finally {
      // Always release the re-entrancy guard, including on early return.
      inFlight = false;
    }

    return processed;
  }

  // Kick off an initial tick on next event-loop turn so callers can
  // attach test spies before any probe work happens.
  setImmediate(() => {
    if (stopped) return;
    void tick().catch((err) => {
      log(`initial tick crashed: ${err?.message ?? err}`);
    });
  });

  intervalHandle = setTimer(() => {
    void tick().catch((err) => {
      log(`tick crashed: ${err?.message ?? err}`);
    });
  }, intervalMs);
  // Don't pin the event loop on shutdown.
  if (intervalHandle && typeof intervalHandle.unref === 'function') {
    intervalHandle.unref();
  }

  log(`started: interval=${intervalMs}ms`);

  return {
    /** Stop ticking. Idempotent; an in-flight tick exits at its next world. */
    stop() {
      if (stopped) return;
      stopped = true;
      if (intervalHandle !== null) {
        try { clearTimer(intervalHandle); } catch { /* ignore */ }
        intervalHandle = null;
      }
    },

    tickNow: tick,

    /**
     * Return the latest in-memory verdict entry for a world.
     * Returns null if no tick has fired for this world yet.
     *
     * @param {string} worldId
     * @returns {WorldWatchdogState|null}
     */
    getVerdict(worldId) {
      return worldState.get(worldId) ?? null;
    },
  };
}
|