@pleri/olam-cli 0.1.196 → 0.1.199
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -0
- package/dist/ask/knowledge-pack.generated.d.ts.map +1 -1
- package/dist/ask/knowledge-pack.generated.js +10 -8
- package/dist/ask/knowledge-pack.generated.js.map +1 -1
- package/dist/commands/auth-list-json.d.ts +34 -0
- package/dist/commands/auth-list-json.d.ts.map +1 -1
- package/dist/commands/auth-list-json.js +24 -0
- package/dist/commands/auth-list-json.js.map +1 -1
- package/dist/commands/auth-migrate.d.ts +212 -0
- package/dist/commands/auth-migrate.d.ts.map +1 -0
- package/dist/commands/auth-migrate.js +465 -0
- package/dist/commands/auth-migrate.js.map +1 -0
- package/dist/commands/auth.d.ts.map +1 -1
- package/dist/commands/auth.js +239 -184
- package/dist/commands/auth.js.map +1 -1
- package/dist/commands/bootstrap.d.ts +4 -0
- package/dist/commands/bootstrap.d.ts.map +1 -1
- package/dist/commands/bootstrap.js +10 -0
- package/dist/commands/bootstrap.js.map +1 -1
- package/dist/commands/dispatch.d.ts.map +1 -1
- package/dist/commands/dispatch.js +11 -1
- package/dist/commands/dispatch.js.map +1 -1
- package/dist/commands/doctor.d.ts +33 -0
- package/dist/commands/doctor.d.ts.map +1 -1
- package/dist/commands/doctor.js +299 -12
- package/dist/commands/doctor.js.map +1 -1
- package/dist/commands/kg-mirror.d.ts +18 -2
- package/dist/commands/kg-mirror.d.ts.map +1 -1
- package/dist/commands/kg-mirror.js +78 -3
- package/dist/commands/kg-mirror.js.map +1 -1
- package/dist/commands/mcp/complete.d.ts +36 -0
- package/dist/commands/mcp/complete.d.ts.map +1 -0
- package/dist/commands/mcp/complete.js +66 -0
- package/dist/commands/mcp/complete.js.map +1 -0
- package/dist/commands/mcp/index.d.ts +1 -1
- package/dist/commands/mcp/index.d.ts.map +1 -1
- package/dist/commands/mcp/index.js +3 -1
- package/dist/commands/mcp/index.js.map +1 -1
- package/dist/commands/memory/bridge.d.ts +1 -1
- package/dist/commands/memory/bridge.d.ts.map +1 -1
- package/dist/commands/memory/bridge.js +2 -6
- package/dist/commands/memory/bridge.js.map +1 -1
- package/dist/commands/memory/secret.d.ts.map +1 -1
- package/dist/commands/memory/secret.js +4 -3
- package/dist/commands/memory/secret.js.map +1 -1
- package/dist/commands/observe.d.ts +3 -3
- package/dist/commands/observe.d.ts.map +1 -1
- package/dist/commands/observe.js +11 -8
- package/dist/commands/observe.js.map +1 -1
- package/dist/commands/runbooks.d.ts.map +1 -1
- package/dist/commands/runbooks.js +77 -10
- package/dist/commands/runbooks.js.map +1 -1
- package/dist/commands/services-tls.d.ts.map +1 -1
- package/dist/commands/services-tls.js +41 -0
- package/dist/commands/services-tls.js.map +1 -1
- package/dist/commands/services.d.ts +45 -3
- package/dist/commands/services.d.ts.map +1 -1
- package/dist/commands/services.js +198 -71
- package/dist/commands/services.js.map +1 -1
- package/dist/commands/setup-phase-8-kg-hook.d.ts +48 -0
- package/dist/commands/setup-phase-8-kg-hook.d.ts.map +1 -0
- package/dist/commands/setup-phase-8-kg-hook.js +93 -0
- package/dist/commands/setup-phase-8-kg-hook.js.map +1 -0
- package/dist/commands/setup-phase-9-memory-bridge.d.ts +36 -0
- package/dist/commands/setup-phase-9-memory-bridge.d.ts.map +1 -0
- package/dist/commands/setup-phase-9-memory-bridge.js +59 -0
- package/dist/commands/setup-phase-9-memory-bridge.js.map +1 -0
- package/dist/commands/setup.d.ts +34 -1
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +328 -23
- package/dist/commands/setup.js.map +1 -1
- package/dist/commands/update.d.ts +24 -0
- package/dist/commands/update.d.ts.map +1 -1
- package/dist/commands/update.js +53 -0
- package/dist/commands/update.js.map +1 -1
- package/dist/commands/upgrade.d.ts +5 -0
- package/dist/commands/upgrade.d.ts.map +1 -1
- package/dist/commands/upgrade.js +31 -8
- package/dist/commands/upgrade.js.map +1 -1
- package/dist/image-digests.json +8 -8
- package/dist/index.js +4302 -2466
- package/dist/lib/auth-backend.d.ts +168 -0
- package/dist/lib/auth-backend.d.ts.map +1 -0
- package/dist/lib/auth-backend.js +172 -0
- package/dist/lib/auth-backend.js.map +1 -0
- package/dist/lib/auth-list-cache.d.ts +67 -0
- package/dist/lib/auth-list-cache.d.ts.map +1 -0
- package/dist/lib/auth-list-cache.js +84 -0
- package/dist/lib/auth-list-cache.js.map +1 -0
- package/dist/lib/auth-list.d.ts +107 -0
- package/dist/lib/auth-list.d.ts.map +1 -0
- package/dist/lib/auth-list.js +123 -0
- package/dist/lib/auth-list.js.map +1 -0
- package/dist/lib/auth-login.d.ts +92 -0
- package/dist/lib/auth-login.d.ts.map +1 -0
- package/dist/lib/auth-login.js +124 -0
- package/dist/lib/auth-login.js.map +1 -0
- package/dist/lib/auth-mutator-backend.d.ts +54 -0
- package/dist/lib/auth-mutator-backend.d.ts.map +1 -0
- package/dist/lib/auth-mutator-backend.js +62 -0
- package/dist/lib/auth-mutator-backend.js.map +1 -0
- package/dist/lib/auth-remote.d.ts +50 -0
- package/dist/lib/auth-remote.d.ts.map +1 -1
- package/dist/lib/auth-remote.js +84 -2
- package/dist/lib/auth-remote.js.map +1 -1
- package/dist/lib/bootstrap-kubernetes.d.ts +93 -12
- package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
- package/dist/lib/bootstrap-kubernetes.js +364 -53
- package/dist/lib/bootstrap-kubernetes.js.map +1 -1
- package/dist/lib/config.d.ts +7 -0
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/health-probes.d.ts +0 -22
- package/dist/lib/health-probes.d.ts.map +1 -1
- package/dist/lib/health-probes.js +23 -2
- package/dist/lib/health-probes.js.map +1 -1
- package/dist/lib/peripheral-registry.d.ts +11 -0
- package/dist/lib/peripheral-registry.d.ts.map +1 -1
- package/dist/lib/peripheral-registry.js +5 -0
- package/dist/lib/peripheral-registry.js.map +1 -1
- package/dist/lib/plans-client.d.ts.map +1 -1
- package/dist/lib/plans-client.js +6 -3
- package/dist/lib/plans-client.js.map +1 -1
- package/dist/mcp-server.js +14 -3
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/30-configmap.yaml +4 -0
- package/host-cp/k8s/manifests/50-deployment.yaml +13 -1
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/src/dispatch-persister.mjs +157 -0
- package/host-cp/src/pr-nanny.mjs +7 -0
- package/host-cp/src/server.mjs +175 -3
- package/host-cp/src/world-watchdog-pid-lookup.mjs +119 -0
- package/host-cp/src/world-watchdog-probes.mjs +271 -0
- package/host-cp/src/world-watchdog-recovery.mjs +192 -0
- package/host-cp/src/world-watchdog.mjs +313 -0
- package/package.json +1 -1
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* world-watchdog-probes.mjs — pure probe functions for the world watchdog.
|
|
3
|
+
*
|
|
4
|
+
* Three readers extract raw signals from the Linux /proc filesystem:
|
|
5
|
+
* - readWchan(pid, opts) → string | null
|
|
6
|
+
* - readCloseWaitSockets(pid, opts) → Array<{remoteIp, remotePort}>
|
|
7
|
+
* - readCpuPercent(pid, windowMs, opts) → number | null
|
|
8
|
+
*
|
|
9
|
+
* One pure classifier turns those signals into a verdict:
|
|
10
|
+
* - classify({ wchan, closeWaitCount, cpuPercent }) → 'healthy'|'suspect'|'wedged'
|
|
11
|
+
*
|
|
12
|
+
* All readers are fail-soft: any I/O error or parse error returns
|
|
13
|
+
* null / [] rather than throwing. The classifier treats null inputs as
|
|
14
|
+
* the signal not firing (conservative — only promotes to 'wedged' when all
|
|
15
|
+
* three signals are conclusive).
|
|
16
|
+
*
|
|
17
|
+
* Test injection: pass `opts.procRoot` to redirect /proc reads to a fixture
|
|
18
|
+
* directory (e.g. src/__tests__/fixtures/proc-gold-elk-5574/).
|
|
19
|
+
*
|
|
20
|
+
* CLOSE_WAIT threshold note (deviation from D2): Decision D2 specifies
|
|
21
|
+
* filtering CLOSE_WAIT by peer hostname (*.anthropic.com | auth-worker.*).
|
|
22
|
+
* DNS resolution at every tick is unreliable under network stress (exactly
|
|
23
|
+
* when the watchdog must be most accurate). The gold-elk-5574 forensic data
|
|
24
|
+
* shows ≥3 CLOSE_WAIT to ANY peer is already diagnostic — a healthy claude
|
|
25
|
+
* process has 0-1 CLOSE_WAIT sockets under normal operation. The classifier
|
|
26
|
+
* therefore uses count ≥ 3 without hostname filtering. This deviation is
|
|
27
|
+
* documented in docs/architecture/world-watchdog.md Signal 2.
|
|
28
|
+
*
|
|
29
|
+
* @see docs/architecture/world-watchdog.md
|
|
30
|
+
* @see packages/host-cp/src/__tests__/world-watchdog-probes.test.mjs
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
import fs from 'node:fs/promises';
|
|
34
|
+
import path from 'node:path';
|
|
35
|
+
|
|
36
|
+
// LINUX_HZ — ticks per second assumed for the utime/stime counters read from
|
|
37
|
+
// /proc/<pid>/stat. NOTE(review): proc(5) gives those fields in units of
|
|
38
|
+
// sysconf(_SC_CLK_TCK) (USER_HZ), not the kernel's CONFIG_HZ (100/250/1000),
|
|
39
|
+
// so verify with `getconf CLK_TCK` on the target image. We divide ticks by
|
|
40
|
+
// this value to get seconds of CPU time, then compare to the wall-clock window.
|
|
41
|
+
const LINUX_HZ = 100;
|
|
42
|
+
|
|
43
|
+
// /proc/net/tcp `st` column value (two hex digits) that marks CLOSE_WAIT.
|
|
44
|
+
const CLOSE_WAIT_STATE = '08';
|
|
45
|
+
|
|
46
|
+
/**
 * Look up the kernel wait channel reported for a process.
 *
 * Reads `<procRoot>/<pid>/wchan` and returns its trimmed text.
 *
 * @param {number|string} pid  Process ID (stringified to build the path).
 * @param {{ procRoot?: string }} [opts]
 *   `procRoot` — directory to read instead of '/proc' (test fixtures).
 * @returns {Promise<string|null>}
 *   The trimmed wchan text, or null when the file cannot be read or is
 *   empty/whitespace-only.
 */
export async function readWchan(pid, opts = {}) {
  const root = opts.procRoot ?? '/proc';
  const target = path.join(root, String(pid), 'wchan');

  let raw;
  try {
    raw = await fs.readFile(target, 'utf8');
  } catch {
    // Fail-soft: a vanished pid or unreadable file is reported as "no signal".
    return null;
  }

  const channel = raw.trim();
  return channel === '' ? null : channel;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Read CLOSE_WAIT sockets for a process from /proc/<pid>/net/tcp (and tcp6).
 *
 * Parses the /proc/net/tcp format (whitespace-separated hex fields). State
 * field (column index 3, 0-based) == '08' means CLOSE_WAIT. Matching entries
 * are returned regardless of remote peer.
 *
 * Ownership filter: per proc(5), /proc/<pid>/net/* describes the whole
 * network namespace the process lives in, not just that process's sockets.
 * To avoid counting neighbours' sockets, rows are kept only when their inode
 * (column index 9) appears among the `socket:[inode]` links in
 * /proc/<pid>/fd. When that directory cannot be inspected (missing in a test
 * fixture, permission denied, no link resolvable) the filter is skipped and
 * every CLOSE_WAIT row is returned.
 *
 * @param {number|string} pid  Process ID.
 * @param {{ procRoot?: string }} [opts]
 *   `procRoot` defaults to '/proc'; override for tests.
 * @returns {Promise<Array<{remoteIp: string, remotePort: number}>>}
 *   Array of CLOSE_WAIT socket descriptors, empty on error or no matches.
 */
export async function readCloseWaitSockets(pid, opts = {}) {
  const procRoot = opts.procRoot ?? '/proc';
  const pidDir = path.join(procRoot, String(pid));
  // null → ownership unknown; fall back to namespace-wide rows.
  const ownedInodes = await readOwnedSocketInodes(path.join(pidDir, 'fd'));
  const results = [];

  for (const proto of ['tcp', 'tcp6']) {
    let content;
    try {
      content = await fs.readFile(path.join(pidDir, 'net', proto), 'utf8');
    } catch {
      // ENOENT: pid gone or proto not available — skip, not an error.
      continue;
    }

    // slice(1) drops the column-header line.
    for (const rawLine of content.split('\n').slice(1)) {
      const line = rawLine.trim();
      if (!line) continue;

      // /proc/net/tcp columns (0-based):
      //   0: sl   1: local_address   2: rem_address   3: st   ...   9: inode
      const fields = line.split(/\s+/);
      if (fields.length < 4) continue;
      if (fields[3] !== CLOSE_WAIT_STATE) continue;

      // Drop rows owned by other processes in the same network namespace.
      // Rows too short to carry an inode are kept (cannot prove otherwise).
      const inode = fields[9];
      if (ownedInodes !== null && inode !== undefined && !ownedInodes.has(inode)) {
        continue;
      }

      const remAddr = fields[2];
      const colonIdx = remAddr.lastIndexOf(':');
      if (colonIdx === -1) continue;

      const remIp = parseHexIp(remAddr.slice(0, colonIdx));
      const remPort = parseInt(remAddr.slice(colonIdx + 1), 16);

      if (remIp !== null && Number.isFinite(remPort)) {
        results.push({ remoteIp: remIp, remotePort: remPort });
      }
    }
  }

  return results;
}

/**
 * Collect the socket inodes a process holds open, from its fd directory.
 *
 * @param {string} fdDir  Path to `<procRoot>/<pid>/fd`.
 * @returns {Promise<Set<string>|null>}
 *   Set of inode strings taken from `socket:[inode]` link targets, or null
 *   when the directory cannot be listed or none of its entries can be
 *   resolved as a symlink (ownership is then unknown).
 */
async function readOwnedSocketInodes(fdDir) {
  let names;
  try {
    names = await fs.readdir(fdDir);
  } catch {
    return null;
  }

  const inodes = new Set();
  let resolved = 0;
  await Promise.all(names.map(async (name) => {
    try {
      const target = await fs.readlink(path.join(fdDir, name));
      resolved += 1;
      const match = /^socket:\[(\d+)\]$/.exec(target);
      if (match) inodes.add(match[1]);
    } catch {
      // fd closed between readdir and readlink, or entry is not a symlink.
    }
  }));

  // Entries exist but none could be read: treat ownership as unknown rather
  // than concluding that the process owns no sockets.
  if (names.length > 0 && resolved === 0) return null;
  return inodes;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Measure CPU utilisation for a process over a time window.
 *
 * Reads /proc/<pid>/stat twice (before + after sleeping `windowMs` ms) and
 * computes:
 *   cpuPercent = (utime+stime delta) / (LINUX_HZ * elapsedSeconds) * 100
 *
 * `elapsedSeconds` is the larger of the requested window and the time that
 * actually passed according to `now`. A timer that fires late would otherwise
 * attribute extra ticks to a shorter window and overstate utilisation. With
 * an injected instant `sleep` the measured time is ~0, so the requested
 * window is used.
 *
 * @param {number|string} pid  Process ID.
 * @param {number} windowMs    Measurement window in milliseconds (must be > 0).
 * @param {{ procRoot?: string, sleep?: (ms: number) => Promise<void>, now?: () => number }} [opts]
 *   `sleep`    — injectable delay function (default: real setTimeout).
 *   `now`      — injectable millisecond clock (default: Date.now).
 *   `procRoot` — injectable proc root for tests.
 * @returns {Promise<number|null>}
 *   CPU percent (0–100+), or null on read/parse error, a non-positive or
 *   non-finite window, or a negative tick delta.
 */
export async function readCpuPercent(pid, windowMs, opts = {}) {
  const procRoot = opts.procRoot ?? '/proc';
  const sleep = opts.sleep ?? ((ms) => new Promise((resolve) => setTimeout(resolve, ms)));
  const now = opts.now ?? Date.now;
  const statPath = path.join(procRoot, String(pid), 'stat');

  // Reject an unusable window before doing any I/O or sleeping.
  const requestedMs = Number(windowMs);
  if (!Number.isFinite(requestedMs) || requestedMs <= 0) return null;

  const before = await readStatTimes(statPath);
  if (before === null) return null;
  const startedAt = now();

  await sleep(windowMs);

  const after = await readStatTimes(statPath);
  if (after === null) return null;
  const elapsedMs = now() - startedAt;

  const deltaTicks = (after.utime + after.stime) - (before.utime + before.stime);
  if (deltaTicks < 0) return null;

  // Never shrink below the requested window: fake sleeps and stubbed clocks
  // then behave exactly as a nominal-window computation would.
  const effectiveMs = Number.isFinite(elapsedMs) && elapsedMs > requestedMs
    ? elapsedMs
    : requestedMs;

  // deltaTicks / LINUX_HZ = CPU-seconds consumed; effectiveMs / 1000 = wall seconds.
  const cpuSeconds = deltaTicks / LINUX_HZ;
  return (cpuSeconds / (effectiveMs / 1000)) * 100;
}
|
|
164
|
+
|
|
165
|
+
// ── Internal helpers ──────────────────────────────────────────────────────────
|
|
166
|
+
|
|
167
|
+
/**
 * Extract the utime and stime counters from a /proc/<pid>/stat file.
 *
 * @param {string} statPath  Path of the stat file to read.
 * @returns {Promise<{utime: number, stime: number}|null>}
 *   The two counters as integers, or null when the file cannot be read or
 *   does not contain parseable values at the expected positions.
 */
async function readStatTimes(statPath) {
  let raw;
  try {
    raw = await fs.readFile(statPath, 'utf8');
  } catch {
    return null;
  }

  // Layout: pid (comm) state ppid pgrp session ... utime stime ...
  // comm may itself contain spaces and parentheses, so anchor on the LAST ')'
  // and count fields from there.
  const commEnd = raw.lastIndexOf(')');
  if (commEnd < 0) return null;

  // Positions in the post-comm list (0-based):
  //   0 state, 1 ppid, 2 pgrp, 3 session, 4 tty_nr, 5 tpgid, 6 flags,
  //   7 minflt, 8 cminflt, 9 majflt, 10 cmajflt, 11 utime, 12 stime
  const UTIME_INDEX = 11;
  const STIME_INDEX = 12;

  const tail = raw.slice(commEnd + 1).trim().split(/\s+/);
  if (tail.length <= STIME_INDEX) return null;

  const [utime, stime] = [tail[UTIME_INDEX], tail[STIME_INDEX]].map((text) => parseInt(text, 10));
  return Number.isFinite(utime) && Number.isFinite(stime) ? { utime, stime } : null;
}
|
|
203
|
+
|
|
204
|
+
/**
 * Parse a hex-encoded IP address from /proc/net/tcp format.
 *
 * IPv4: 8 hex chars in little-endian byte order (e.g. "0100007F" → "127.0.0.1").
 * IPv6: 32 hex chars (4 words of 8, each word little-endian), rendered as
 *       eight unabbreviated 16-bit groups in the input's letter case
 *       (e.g. "...01000000" → "0000:...:0000:0001").
 *
 * The input is validated as pure hex up front. Both address families therefore
 * reject malformed text instead of emitting a garbage address (IPv6) or a
 * partially-parsed byte such as "0G" → 0 (IPv4).
 *
 * @param {string} hexIp
 * @returns {string|null} Dotted/colon-separated address, or null when the
 *   input is not a string, is not pure hex, or is not 8 or 32 chars long.
 */
function parseHexIp(hexIp) {
  if (typeof hexIp !== 'string' || !/^[0-9A-Fa-f]+$/.test(hexIp)) return null;

  // Split one 8-char word into its four bytes, reversed into network order.
  const reversedBytes = (word) => [
    word.slice(6, 8),
    word.slice(4, 6),
    word.slice(2, 4),
    word.slice(0, 2),
  ];

  if (hexIp.length === 8) {
    return reversedBytes(hexIp)
      .map((byteHex) => parseInt(byteHex, 16))
      .join('.');
  }

  if (hexIp.length === 32) {
    const groups = [];
    for (let offset = 0; offset < 32; offset += 8) {
      const [b0, b1, b2, b3] = reversedBytes(hexIp.slice(offset, offset + 8));
      // Pair bytes into 16-bit groups for IPv6 notation.
      groups.push(b0 + b1, b2 + b3);
    }
    return groups.join(':');
  }

  return null;
}
|
|
244
|
+
|
|
245
|
+
// ── Classifier ───────────────────────────────────────────────────────────────
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* @typedef {'healthy'|'suspect'|'wedged'} WatchdogVerdict
|
|
249
|
+
*/
|
|
250
|
+
|
|
251
|
+
/**
 * Turn the three probe signals into a single watchdog verdict.
 *
 * A signal "fires" when:
 *   - wchan is exactly 'futex_wait_queue'
 *   - closeWaitCount is a number ≥ 3
 *   - cpuPercent is a number < 1
 * All three firing → 'wedged'; one or two → 'suspect'; none → 'healthy'.
 * A null (or otherwise non-matching) input simply does not fire.
 *
 * @param {{ wchan: string|null, closeWaitCount: number|null, cpuPercent: number|null }} signals
 * @returns {WatchdogVerdict}
 */
export function classify({ wchan, closeWaitCount, cpuPercent }) {
  const fired = [
    wchan === 'futex_wait_queue',
    typeof closeWaitCount === 'number' && closeWaitCount >= 3,
    typeof cpuPercent === 'number' && cpuPercent < 1,
  ].filter(Boolean).length;

  switch (fired) {
    case 3:
      return 'wedged';
    case 0:
      return 'healthy';
    default:
      return 'suspect';
  }
}
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* world-watchdog-recovery.mjs — recovery hook for wedged claude processes.
|
|
3
|
+
*
|
|
4
|
+
* Isolated from world-watchdog.mjs so kill + replay logic is independently
|
|
5
|
+
* mockable in tests without touching the watchdog's ticker.
|
|
6
|
+
*
|
|
7
|
+
* API:
|
|
8
|
+
* createRecovery({ autoRecoverMode, leakyBucket, broadcaster, persister,
|
|
9
|
+
* replay, processKill, log })
|
|
10
|
+
* → { onWedgedVerdict({ worldId, pid }): Promise<void> }
|
|
11
|
+
*
|
|
12
|
+
* Three modes (from compute.autoRecover in .olam/config.yaml):
|
|
13
|
+
* false — no-op; recovery never fires even on wedged verdict (DEFAULT)
|
|
14
|
+
* 'dry-run' — emits all breadcrumbs, never calls processKill or replay
|
|
15
|
+
* true — SIGKILL pid + read last-dispatch + replay; rate-limited
|
|
16
|
+
*
|
|
17
|
+
* Rate-limit: B2 leaky-bucket (3/hour/world). 4th wedge in window emits
|
|
18
|
+
* world.watchdog.recovery.budget_exhausted and skips all action.
|
|
19
|
+
*
|
|
20
|
+
* Replay stub: the `replay` dep is accepted as an injected function. In
|
|
21
|
+
* server.mjs it is wired to a console.warn stub + breadcrumb until the
|
|
22
|
+
* operator runs the B3 idempotence probe and signs off. See TODO below.
|
|
23
|
+
*
|
|
24
|
+
* @see docs/architecture/world-watchdog.md Recovery section
|
|
25
|
+
* @see packages/host-cp/src/lib/leaky-bucket.mjs
|
|
26
|
+
* @see packages/host-cp/src/dispatch-persister.mjs
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* @typedef {false|true|'dry-run'} AutoRecoverMode
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* @typedef {object} RecoveryDeps
|
|
35
|
+
* @property {false|true|'dry-run'} autoRecoverMode
|
|
36
|
+
* Passed from server.mjs which reads config.compute.autoRecover.
|
|
37
|
+
* Default false if config unavailable.
|
|
38
|
+
* @property {{ tryConsume(key: string): { allowed: boolean, retryAfterMs?: number, totalInWindow: number } }} leakyBucket
|
|
39
|
+
* B2 leaky-bucket instance. Keyed by worldId.
|
|
40
|
+
* @property {{ broadcast(type: string, payload: object): void }} [broadcaster]
|
|
41
|
+
* Host-stream broadcaster. Optional — when absent, breadcrumbs are skipped.
|
|
42
|
+
* @property {{ read({ worldId: string }): Promise<{ messageId: string, prompt: string, dispatchedAt: string, source: string } | null> }} persister
|
|
43
|
+
* B4 dispatch-persister read function.
|
|
44
|
+
* @property {(opts: { worldId: string, messageId: string, prompt: string }) => Promise<void>} replay
|
|
45
|
+
* Opaque dispatch helper. Injected dep — DO NOT implement dispatch here.
|
|
46
|
+
* In server.mjs this is wired to a stub until operator signs off on B3 probe.
|
|
47
|
+
* @property {(pid: number) => void} [processKill]
|
|
48
|
+
* process.kill indirection so tests can spy without actually killing.
|
|
49
|
+
* Defaults to process.kill.
|
|
50
|
+
* @property {(msg: string) => void} [log]
|
|
51
|
+
* Logger. Defaults to console.log with [world-watchdog-recovery] prefix.
|
|
52
|
+
*/
|
|
53
|
+
|
|
54
|
+
/**
|
|
55
|
+
* @typedef {object} RecoveryHandle
|
|
56
|
+
* @property {(opts: { worldId: string, pid: number|null }) => Promise<void>} onWedgedVerdict
|
|
57
|
+
*/
|
|
58
|
+
|
|
59
|
+
/**
 * Create a recovery handle.
 *
 * Safety rules enforced here:
 *   - Only the exact values `true` and 'dry-run' enable any action. Every
 *     other value (false, undefined, null, the string 'false', a typo, …)
 *     is treated as `false`, so a mis-typed config can never reach the
 *     SIGKILL path. Unrecognised values are logged once at creation.
 *   - Only positive integer pids (or numeric strings of them) are acted on.
 *     Per kill(2), pid 0 and negative pids address whole process groups, so
 *     they are rejected before any budget is consumed.
 *
 * @param {RecoveryDeps} deps
 * @returns {RecoveryHandle}
 */
export function createRecovery({
  autoRecoverMode = false,
  leakyBucket,
  broadcaster = null,
  persister,
  replay,
  processKill = (pid) => process.kill(pid, 'SIGKILL'),
  log = (m) => console.log(`[world-watchdog-recovery] ${m}`),
} = {}) {
  // Normalise the mode to one of the three documented values; fail closed.
  const mode = autoRecoverMode === true || autoRecoverMode === 'dry-run'
    ? autoRecoverMode
    : false;
  if (mode === false && autoRecoverMode !== false) {
    const shown = typeof autoRecoverMode === 'string'
      ? JSON.stringify(autoRecoverMode)
      : typeof autoRecoverMode;
    log(`unrecognised autoRecoverMode (${shown}); treating as false — recovery disabled`);
  }

  /**
   * True when `pid` identifies exactly one process (positive integer).
   *
   * @param {unknown} pid
   * @returns {boolean}
   */
  function isKillablePid(pid) {
    const numeric = typeof pid === 'string' ? Number(pid) : pid;
    return typeof numeric === 'number' && Number.isInteger(numeric) && numeric > 0;
  }

  /**
   * Emit a breadcrumb via broadcaster (fail-soft).
   *
   * @param {string} type
   * @param {object} payload
   */
  function broadcast(type, payload) {
    if (!broadcaster || typeof broadcaster.broadcast !== 'function') return;
    try {
      broadcaster.broadcast(type, payload);
    } catch (err) {
      log(`broadcast ${type} failed: ${err?.message ?? err}`);
    }
  }

  /**
   * Handle a 2-tick-confirmed wedged verdict for a world.
   *
   * Called by world-watchdog.mjs on verdict-transition only (suspect → wedged),
   * NOT on steady-state re-wedge.
   *
   * @param {{ worldId: string, pid: number|null }} opts
   * @returns {Promise<void>}
   */
  async function onWedgedVerdict({ worldId, pid }) {
    // mode=false → detection-only; never act.
    if (mode === false) return;

    // PID null/undefined → watchdog hasn't resolved a real PID yet (Phase A
    // stub case); skip silently — there is nothing to kill.
    if (pid == null) return;

    // Refuse anything that is not a single-process pid BEFORE spending budget.
    if (!isKillablePid(pid)) {
      log(`worldId=${worldId}: refusing recovery for invalid pid=${String(pid)}`);
      return;
    }

    // Rate-limit gate.
    const bucket = leakyBucket.tryConsume(worldId);
    if (!bucket.allowed) {
      broadcast('world.watchdog.recovery.budget_exhausted', {
        worldId,
        retryAfterMs: bucket.retryAfterMs,
        totalInWindow: bucket.totalInWindow,
      });
      log(`worldId=${worldId}: budget exhausted (${bucket.totalInWindow} in window); skipping recovery`);
      return;
    }

    // Read last persisted dispatch for replay. A read failure is logged and
    // treated the same as "no last dispatch".
    let lastDispatch = null;
    try {
      lastDispatch = await persister.read({ worldId });
    } catch (err) {
      log(`worldId=${worldId}: persister.read failed: ${err?.message ?? err}`);
    }

    broadcast('world.watchdog.recovery.start', {
      worldId,
      pid,
      mode,
      lastDispatchMessageId: lastDispatch?.messageId ?? null,
    });

    // dry-run — log planned action but do NOT kill.
    if (mode === 'dry-run') {
      log(`worldId=${worldId}: dry-run — would SIGKILL pid=${pid}${lastDispatch ? ` + replay messageId=${lastDispatch.messageId}` : ' (no last-dispatch)'}`);
      broadcast('world.watchdog.recovery.complete', {
        worldId,
        pid,
        mode: 'dry-run',
        replayed: false,
      });
      return;
    }

    // mode=true — act.
    try {
      // 1. SIGKILL the wedged process.
      processKill(pid);
      log(`worldId=${worldId}: SIGKILL sent to pid=${pid}`);

      // 2. Replay or note absence of last-dispatch.
      if (!lastDispatch) {
        broadcast('world.watchdog.recovery.restart_without_replay', {
          worldId,
          pid,
        });
        log(`worldId=${worldId}: no last-dispatch; killed without replay`);
      } else {
        // TODO: wire real replay once operator has run the B3 idempotence probe
        // and confirmed dispatch is idempotent for the substrates in use.
        // Until then this stub logs and emits a breadcrumb so the stub path
        // is visible in production logs. See B3 probe + operator review gate B6.
        broadcast('world.watchdog.recovery.replay_stub', {
          worldId,
          prompt: lastDispatch.prompt,
        });
        log(`worldId=${worldId}: replay stub hit — real replay deferred pending B3 sign-off`);
        await replay({
          worldId,
          messageId: lastDispatch.messageId,
          prompt: lastDispatch.prompt,
        });
      }

      broadcast('world.watchdog.recovery.complete', {
        worldId,
        pid,
        mode: true,
        replayed: !!lastDispatch,
      });
    } catch (err) {
      // Kill or replay threw: report, never propagate into the watchdog ticker.
      log(`worldId=${worldId}: recovery failed: ${err?.message ?? err}`);
      broadcast('world.watchdog.recovery.failed', {
        worldId,
        pid,
        error: err?.message ?? String(err),
      });
    }
  }

  return { onWedgedVerdict };
}
|