@pleri/olam-cli 0.1.195 → 0.1.198

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. package/README.md +52 -0
  2. package/dist/ask/knowledge-pack.generated.d.ts.map +1 -1
  3. package/dist/ask/knowledge-pack.generated.js +12 -8
  4. package/dist/ask/knowledge-pack.generated.js.map +1 -1
  5. package/dist/commands/auth-list-json.d.ts +34 -0
  6. package/dist/commands/auth-list-json.d.ts.map +1 -1
  7. package/dist/commands/auth-list-json.js +24 -0
  8. package/dist/commands/auth-list-json.js.map +1 -1
  9. package/dist/commands/auth-migrate.d.ts +212 -0
  10. package/dist/commands/auth-migrate.d.ts.map +1 -0
  11. package/dist/commands/auth-migrate.js +465 -0
  12. package/dist/commands/auth-migrate.js.map +1 -0
  13. package/dist/commands/auth.d.ts.map +1 -1
  14. package/dist/commands/auth.js +239 -184
  15. package/dist/commands/auth.js.map +1 -1
  16. package/dist/commands/bootstrap.d.ts +4 -0
  17. package/dist/commands/bootstrap.d.ts.map +1 -1
  18. package/dist/commands/bootstrap.js +6 -0
  19. package/dist/commands/bootstrap.js.map +1 -1
  20. package/dist/commands/dispatch.d.ts.map +1 -1
  21. package/dist/commands/dispatch.js +11 -1
  22. package/dist/commands/dispatch.js.map +1 -1
  23. package/dist/commands/doctor.d.ts +33 -0
  24. package/dist/commands/doctor.d.ts.map +1 -1
  25. package/dist/commands/doctor.js +299 -12
  26. package/dist/commands/doctor.js.map +1 -1
  27. package/dist/commands/kg-mirror.d.ts +18 -2
  28. package/dist/commands/kg-mirror.d.ts.map +1 -1
  29. package/dist/commands/kg-mirror.js +78 -3
  30. package/dist/commands/kg-mirror.js.map +1 -1
  31. package/dist/commands/mcp/complete.d.ts +36 -0
  32. package/dist/commands/mcp/complete.d.ts.map +1 -0
  33. package/dist/commands/mcp/complete.js +66 -0
  34. package/dist/commands/mcp/complete.js.map +1 -0
  35. package/dist/commands/mcp/index.d.ts +1 -1
  36. package/dist/commands/mcp/index.d.ts.map +1 -1
  37. package/dist/commands/mcp/index.js +3 -1
  38. package/dist/commands/mcp/index.js.map +1 -1
  39. package/dist/commands/memory/bridge.d.ts +1 -1
  40. package/dist/commands/memory/bridge.d.ts.map +1 -1
  41. package/dist/commands/memory/bridge.js +2 -6
  42. package/dist/commands/memory/bridge.js.map +1 -1
  43. package/dist/commands/memory/secret.d.ts.map +1 -1
  44. package/dist/commands/memory/secret.js +4 -3
  45. package/dist/commands/memory/secret.js.map +1 -1
  46. package/dist/commands/observe.d.ts +3 -3
  47. package/dist/commands/observe.d.ts.map +1 -1
  48. package/dist/commands/observe.js +11 -8
  49. package/dist/commands/observe.js.map +1 -1
  50. package/dist/commands/runbooks.d.ts.map +1 -1
  51. package/dist/commands/runbooks.js +77 -10
  52. package/dist/commands/runbooks.js.map +1 -1
  53. package/dist/commands/services-tls.d.ts.map +1 -1
  54. package/dist/commands/services-tls.js +65 -10
  55. package/dist/commands/services-tls.js.map +1 -1
  56. package/dist/commands/services.d.ts +35 -1
  57. package/dist/commands/services.d.ts.map +1 -1
  58. package/dist/commands/services.js +153 -32
  59. package/dist/commands/services.js.map +1 -1
  60. package/dist/commands/setup-phase-8-kg-hook.d.ts +48 -0
  61. package/dist/commands/setup-phase-8-kg-hook.d.ts.map +1 -0
  62. package/dist/commands/setup-phase-8-kg-hook.js +93 -0
  63. package/dist/commands/setup-phase-8-kg-hook.js.map +1 -0
  64. package/dist/commands/setup-phase-9-memory-bridge.d.ts +36 -0
  65. package/dist/commands/setup-phase-9-memory-bridge.d.ts.map +1 -0
  66. package/dist/commands/setup-phase-9-memory-bridge.js +59 -0
  67. package/dist/commands/setup-phase-9-memory-bridge.js.map +1 -0
  68. package/dist/commands/setup.d.ts +34 -1
  69. package/dist/commands/setup.d.ts.map +1 -1
  70. package/dist/commands/setup.js +372 -32
  71. package/dist/commands/setup.js.map +1 -1
  72. package/dist/commands/skills-source.d.ts.map +1 -1
  73. package/dist/commands/skills-source.js +70 -1
  74. package/dist/commands/skills-source.js.map +1 -1
  75. package/dist/commands/update.d.ts +24 -0
  76. package/dist/commands/update.d.ts.map +1 -1
  77. package/dist/commands/update.js +53 -0
  78. package/dist/commands/update.js.map +1 -1
  79. package/dist/commands/upgrade.d.ts +5 -0
  80. package/dist/commands/upgrade.d.ts.map +1 -1
  81. package/dist/commands/upgrade.js +31 -8
  82. package/dist/commands/upgrade.js.map +1 -1
  83. package/dist/image-digests.json +8 -8
  84. package/dist/index.js +4487 -2451
  85. package/dist/lib/auth-backend.d.ts +168 -0
  86. package/dist/lib/auth-backend.d.ts.map +1 -0
  87. package/dist/lib/auth-backend.js +172 -0
  88. package/dist/lib/auth-backend.js.map +1 -0
  89. package/dist/lib/auth-list-cache.d.ts +67 -0
  90. package/dist/lib/auth-list-cache.d.ts.map +1 -0
  91. package/dist/lib/auth-list-cache.js +84 -0
  92. package/dist/lib/auth-list-cache.js.map +1 -0
  93. package/dist/lib/auth-list.d.ts +107 -0
  94. package/dist/lib/auth-list.d.ts.map +1 -0
  95. package/dist/lib/auth-list.js +123 -0
  96. package/dist/lib/auth-list.js.map +1 -0
  97. package/dist/lib/auth-login.d.ts +92 -0
  98. package/dist/lib/auth-login.d.ts.map +1 -0
  99. package/dist/lib/auth-login.js +124 -0
  100. package/dist/lib/auth-login.js.map +1 -0
  101. package/dist/lib/auth-mutator-backend.d.ts +54 -0
  102. package/dist/lib/auth-mutator-backend.d.ts.map +1 -0
  103. package/dist/lib/auth-mutator-backend.js +62 -0
  104. package/dist/lib/auth-mutator-backend.js.map +1 -0
  105. package/dist/lib/auth-remote.d.ts +50 -0
  106. package/dist/lib/auth-remote.d.ts.map +1 -1
  107. package/dist/lib/auth-remote.js +84 -2
  108. package/dist/lib/auth-remote.js.map +1 -1
  109. package/dist/lib/bootstrap-kubernetes.d.ts +69 -10
  110. package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
  111. package/dist/lib/bootstrap-kubernetes.js +264 -46
  112. package/dist/lib/bootstrap-kubernetes.js.map +1 -1
  113. package/dist/lib/config.d.ts +35 -4
  114. package/dist/lib/config.d.ts.map +1 -1
  115. package/dist/lib/config.js +82 -11
  116. package/dist/lib/config.js.map +1 -1
  117. package/dist/lib/health-probes.d.ts +0 -22
  118. package/dist/lib/health-probes.d.ts.map +1 -1
  119. package/dist/lib/health-probes.js +57 -0
  120. package/dist/lib/health-probes.js.map +1 -1
  121. package/dist/lib/peripheral-registry.d.ts +11 -0
  122. package/dist/lib/peripheral-registry.d.ts.map +1 -1
  123. package/dist/lib/peripheral-registry.js +5 -0
  124. package/dist/lib/peripheral-registry.js.map +1 -1
  125. package/dist/lib/plans-client.d.ts.map +1 -1
  126. package/dist/lib/plans-client.js +6 -3
  127. package/dist/lib/plans-client.js.map +1 -1
  128. package/dist/mcp-server.js +138 -6
  129. package/hermes-bundle/version.json +1 -1
  130. package/host-cp/k8s/manifests/30-configmap.yaml +4 -0
  131. package/host-cp/k8s/manifests/50-deployment.yaml +13 -1
  132. package/host-cp/k8s/manifests/65-tls-secret-template.yaml.tmpl +35 -0
  133. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  134. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  135. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  136. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  137. package/host-cp/src/dispatch-persister.mjs +157 -0
  138. package/host-cp/src/pr-nanny.mjs +7 -0
  139. package/host-cp/src/server.mjs +175 -3
  140. package/host-cp/src/world-watchdog-pid-lookup.mjs +119 -0
  141. package/host-cp/src/world-watchdog-probes.mjs +271 -0
  142. package/host-cp/src/world-watchdog-recovery.mjs +192 -0
  143. package/host-cp/src/world-watchdog.mjs +313 -0
  144. package/package.json +1 -1
@@ -0,0 +1,192 @@
1
+ /**
2
+ * world-watchdog-recovery.mjs — recovery hook for wedged claude processes.
3
+ *
4
+ * Isolated from world-watchdog.mjs so kill + replay logic is independently
5
+ * mockable in tests without touching the watchdog's ticker.
6
+ *
7
+ * API:
8
+ * createRecovery({ autoRecoverMode, leakyBucket, broadcaster, persister,
9
+ * replay, processKill, log })
10
+ * → { onWedgedVerdict({ worldId, pid }): Promise<void> }
11
+ *
12
+ * Three modes (from compute.autoRecover in .olam/config.yaml):
13
+ * false — no-op; recovery never fires even on wedged verdict (DEFAULT)
14
+ * 'dry-run' — emits all breadcrumbs, never calls processKill or replay
15
+ * true — SIGKILL pid + read last-dispatch + replay; rate-limited
16
+ *
17
+ * Rate-limit: B2 leaky-bucket (3/hour/world). 4th wedge in window emits
18
+ * world.watchdog.recovery.budget_exhausted and skips all action.
19
+ *
20
+ * Replay stub: the `replay` dep is accepted as an injected function. In
21
+ * server.mjs it is wired to a console.warn stub + breadcrumb until the
22
+ * operator runs the B3 idempotence probe and signs off. See TODO below.
23
+ *
24
+ * @see docs/architecture/world-watchdog.md Recovery section
25
+ * @see packages/host-cp/src/lib/leaky-bucket.mjs
26
+ * @see packages/host-cp/src/dispatch-persister.mjs
27
+ */
28
+
29
+ /**
30
+ * @typedef {false|true|'dry-run'} AutoRecoverMode
31
+ */
32
+
33
+ /**
34
+ * @typedef {object} RecoveryDeps
35
+ * @property {false|true|'dry-run'} autoRecoverMode
36
+ * Passed from server.mjs which reads config.compute.autoRecover.
37
+ * Default false if config unavailable.
38
+ * @property {{ tryConsume(key: string): { allowed: boolean, retryAfterMs?: number, totalInWindow: number } }} leakyBucket
39
+ * B2 leaky-bucket instance. Keyed by worldId.
40
+ * @property {{ broadcast(type: string, payload: object): void }} [broadcaster]
41
+ * Host-stream broadcaster. Optional — when absent, breadcrumbs are skipped.
42
+ * @property {{ read({ worldId: string }): Promise<{ messageId: string, prompt: string, dispatchedAt: string, source: string } | null> }} persister
43
+ * B4 dispatch-persister read function.
44
+ * @property {(opts: { worldId: string, messageId: string, prompt: string }) => Promise<void>} replay
45
+ * Opaque dispatch helper. Injected dep — DO NOT implement dispatch here.
46
+ * In server.mjs this is wired to a stub until operator signs off on B3 probe.
47
+ * @property {(pid: number) => void} [processKill]
48
+ * process.kill indirection so tests can spy without actually killing.
49
+ * Defaults to sending SIGKILL via process.kill(pid, 'SIGKILL').
50
+ * @property {(msg: string) => void} [log]
51
+ * Logger. Defaults to console.log with [world-watchdog-recovery] prefix.
52
+ */
53
+
54
+ /**
55
+ * @typedef {object} RecoveryHandle
56
+ * @property {(opts: { worldId: string, pid: number|null }) => Promise<void>} onWedgedVerdict
57
+ */
58
+
59
/**
 * Create a recovery handle for wedged `claude` processes.
 *
 * The handle is fail-closed: it only ever kills when `autoRecoverMode` is
 * strictly boolean `true`, and only dry-runs when it is strictly `'dry-run'`.
 * Every other value (including look-alike strings such as `'false'` or
 * `'true'` that a loosely-typed config loader might hand over) disables
 * recovery entirely, and is reported once through `log` at construction time.
 *
 * @param {RecoveryDeps} deps
 * @returns {RecoveryHandle}
 */
export function createRecovery({
  autoRecoverMode = false,
  leakyBucket,
  broadcaster = null,
  persister,
  replay,
  processKill = (pid) => process.kill(pid, 'SIGKILL'),
  log = (m) => console.log(`[world-watchdog-recovery] ${m}`),
} = {}) {
  const isDryRun = autoRecoverMode === 'dry-run';
  const isEnabled = autoRecoverMode === true || isDryRun;

  // An unrecognised mode must never fall through to the kill path.
  if (!isEnabled && autoRecoverMode !== false) {
    const shown = typeof autoRecoverMode === 'string'
      ? `'${autoRecoverMode}'`
      : String(autoRecoverMode);
    log(`unrecognised autoRecoverMode=${shown}; recovery disabled`);
  }

  /**
   * Emit a breadcrumb via broadcaster (fail-soft): a missing broadcaster is
   * skipped silently and a throwing broadcaster is logged, never rethrown.
   *
   * @param {string} type
   * @param {object} payload
   */
  function broadcast(type, payload) {
    if (!broadcaster || typeof broadcaster.broadcast !== 'function') return;
    try {
      broadcaster.broadcast(type, payload);
    } catch (err) {
      log(`broadcast ${type} failed: ${err?.message ?? err}`);
    }
  }

  /**
   * Handle a 2-tick-confirmed wedged verdict for a world.
   *
   * Called by world-watchdog.mjs on verdict-transition only (suspect → wedged),
   * NOT on steady-state re-wedge.
   *
   * Never rejects for kill/replay failures: those are logged and reported as a
   * `world.watchdog.recovery.failed` breadcrumb.
   *
   * @param {{ worldId: string, pid: number|null }} opts
   * @returns {Promise<void>}
   */
  async function onWedgedVerdict({ worldId, pid }) {
    // Disabled (false or unrecognised mode) → detection-only; never act.
    if (!isEnabled) return;

    // PID null/undefined → watchdog hasn't resolved a real PID yet (Phase A
    // stub case); skip silently — there is nothing to kill.
    if (pid === null || pid === undefined) return;

    // Only a positive integer identifies a single process. On POSIX, pid 0 and
    // negative pids address whole process groups (0 = our own group), so
    // refuse them outright — and do so before spending rate-limit budget.
    if (!Number.isInteger(pid) || pid <= 0) {
      log(`worldId=${worldId}: refusing recovery for invalid pid=${String(pid)}`);
      return;
    }

    // Rate-limit gate.
    const bucket = leakyBucket.tryConsume(worldId);
    if (!bucket.allowed) {
      broadcast('world.watchdog.recovery.budget_exhausted', {
        worldId,
        retryAfterMs: bucket.retryAfterMs,
        totalInWindow: bucket.totalInWindow,
      });
      log(`worldId=${worldId}: budget exhausted (${bucket.totalInWindow} in window); skipping recovery`);
      return;
    }

    // Read last persisted dispatch for replay. A read failure is not fatal:
    // recovery proceeds as if there were no last dispatch.
    let lastDispatch = null;
    try {
      lastDispatch = await persister.read({ worldId });
    } catch (err) {
      log(`worldId=${worldId}: persister.read failed: ${err?.message ?? err}`);
    }

    broadcast('world.watchdog.recovery.start', {
      worldId,
      pid,
      mode: autoRecoverMode,
      lastDispatchMessageId: lastDispatch?.messageId ?? null,
    });

    // dry-run — log planned action but do NOT kill.
    if (isDryRun) {
      log(`worldId=${worldId}: dry-run — would SIGKILL pid=${pid}${lastDispatch ? ` + replay messageId=${lastDispatch.messageId}` : ' (no last-dispatch)'}`);
      broadcast('world.watchdog.recovery.complete', {
        worldId,
        pid,
        mode: 'dry-run',
        replayed: false,
      });
      return;
    }

    // mode=true — act.
    try {
      // 1. SIGKILL the wedged process.
      processKill(pid);
      log(`worldId=${worldId}: SIGKILL sent to pid=${pid}`);

      // 2. Replay or note absence of last-dispatch.
      if (!lastDispatch) {
        broadcast('world.watchdog.recovery.restart_without_replay', {
          worldId,
          pid,
        });
        log(`worldId=${worldId}: no last-dispatch; killed without replay`);
      } else {
        // TODO: wire real replay once operator has run the B3 idempotence probe
        // and confirmed dispatch is idempotent for the substrates in use.
        // Until then this stub logs and emits a breadcrumb so the stub path
        // is visible in production logs. See B3 probe + operator review gate B6.
        broadcast('world.watchdog.recovery.replay_stub', {
          worldId,
          prompt: lastDispatch.prompt,
        });
        log(`worldId=${worldId}: replay stub hit — real replay deferred pending B3 sign-off`);
        await replay({
          worldId,
          messageId: lastDispatch.messageId,
          prompt: lastDispatch.prompt,
        });
      }

      broadcast('world.watchdog.recovery.complete', {
        worldId,
        pid,
        mode: true,
        replayed: !!lastDispatch,
      });
    } catch (err) {
      // Kill or replay threw — surface it as a breadcrumb instead of rejecting.
      log(`worldId=${worldId}: recovery failed: ${err?.message ?? err}`);
      broadcast('world.watchdog.recovery.failed', {
        worldId,
        pid,
        error: err?.message ?? String(err),
      });
    }
  }

  return { onWedgedVerdict };
}
@@ -0,0 +1,313 @@
1
+ /**
2
+ * world-watchdog.mjs — periodic watchdog that probes each active world's
3
+ * `claude` PID for the three wedge signals (wchan + CLOSE_WAIT + CPU) and
4
+ * emits `world.watchdog.tick` events on the host-stream broadcaster.
5
+ *
6
+ * Design:
7
+ * - Mirrors `world-activity-tracker.mjs` shape exactly: `startWorldWatchdog(deps)`
8
+ * returns `{ stop, tickNow }`.
9
+ * - Per-world 2-tick confirm: a `'wedged'` classification is only emitted
10
+ * after TWO consecutive ticks with the wedge signature. A single-tick
11
+ * wedge emits `'suspect'`. A healthy tick resets the streak.
12
+ * - Per-world fail-soft: a probe error for one world never skips other worlds.
13
+ * - `OLAM_WORLD_WATCHDOG_DISABLED=1` → `start()` is a no-op (returns stub).
14
+ * - Cadence: `OLAM_WORLD_WATCHDOG_TICK_MS` env or `intervalMs` dep (default 30_000).
15
+ *
16
+ * v1 stub: `getClaudePidForWorld(worldId)` returns null for all worlds in
17
+ * Phase A. When null, the tick still fires but all probe signals are null,
18
+ * producing `verdict: 'unknown'`. Real PID lookup (docker inspect →
19
+ * /proc/<hostPid>/status NSpid field) is wired in a follow-up.
20
+ * This is documented here and in docs/architecture/world-watchdog.md.
21
+ *
22
+ * Wire-in: `server.mjs` constructs once after broadcaster is ready and calls
23
+ * `.stop()` from the SIGTERM/SIGINT handler. Gated on `!SERVE_ONLY`.
24
+ *
25
+ * @see docs/architecture/world-watchdog.md
26
+ * @see packages/host-cp/src/world-watchdog-probes.mjs
27
+ * @see packages/host-cp/src/world-activity-tracker.mjs (shape reference)
28
+ */
29
+
30
+ import {
31
+ readWchan,
32
+ readCloseWaitSockets,
33
+ readCpuPercent,
34
+ classify,
35
+ } from './world-watchdog-probes.mjs';
36
+ // Recovery hook (B5). Optional dep — when absent (recovery is null/undefined),
37
+ // the watchdog behaves exactly as Phase A: detection-only, no kill, no replay.
38
+ // Wire via startWorldWatchdog({ recovery: createRecovery({...}) }) in server.mjs.
39
+
40
+ const DEFAULT_TICK_MS = 30_000;
41
+ // CPU measurement window: shorter than the tick cadence so we don't overlap.
42
+ const CPU_WINDOW_MS = 500;
43
+
44
+ /**
45
+ * @typedef {object} WorldWatchdogDeps
46
+ * @property {object} [broadcaster] Object with `.broadcast(type, payload)`.
47
+ * Optional — when absent events are skipped but state tracking still works.
48
+ * @property {number} [intervalMs] Tick cadence in ms. Defaults to
49
+ * `OLAM_WORLD_WATCHDOG_TICK_MS` env or 30_000.
50
+ * @property {() => Promise<string[]>} [listActiveWorlds]
51
+ * Returns an array of active world IDs to probe each tick.
52
+ * Defaults to returning [].
53
+ * @property {(worldId: string) => Promise<number|null>} [getClaudePidForWorld]
54
+ * Returns the host-side PID of the claude process for a world, or null.
55
+ * v1 default: always returns null (all worlds → verdict 'unknown').
56
+ * @property {{ procRoot?: string }} [probes]
57
+ * Injectable probe options (procRoot for tests).
58
+ * @property {{ onWedgedVerdict(opts: { worldId: string, pid: number|null }): Promise<void> }} [recovery]
59
+ * Optional recovery handle (from world-watchdog-recovery.mjs). When present,
60
+ * called once on verdict-transition to 'wedged' (suspect → wedged), NOT on
61
+ * steady-state re-wedge. When absent, detection-only (Phase A behaviour).
62
+ * @property {(msg: string) => void} [log] Defaults to `console.log`.
63
+ * @property {(msg: string) => void} [debug] Defaults to no-op.
64
+ * @property {(cb: () => void, ms: number) => any} [setTimer]
65
+ * Injectable `setInterval` for tests.
66
+ * @property {(handle: any) => void} [clearTimer]
67
+ * Injectable `clearInterval` for tests.
68
+ * @property {() => Date} [now] Clock injection for tests.
69
+ */
70
+
71
+ /**
72
+ * @typedef {object} WorldWatchdogHandle
73
+ * @property {() => void} stop
74
+ * @property {() => Promise<number>} tickNow Run one tick immediately (returns
75
+ * the count of worlds processed). Exposed for tests.
76
+ * @property {(worldId: string) => object|null} getVerdict
77
+ * Returns the latest in-memory verdict entry for a world, or null if no tick
78
+ * has fired yet. Used by the HTTP endpoint (A5).
79
+ */
80
+
81
+ /**
82
+ * Per-world state tracked between ticks for the 2-tick confirm.
83
+ *
84
+ * @typedef {object} WorldWatchdogState
85
+ * @property {'healthy'|'suspect'|'wedged'|'unknown'} lastClassification
86
+ * The raw classification from the previous tick (before 2-tick confirm).
87
+ * @property {'healthy'|'suspect'|'wedged'|'unknown'} lastVerdict
88
+ * The emitted verdict (post-confirm).
89
+ * @property {string} lastTickAt ISO-8601 timestamp of last tick.
90
+ * @property {object|null} lastSignals The signals from the last tick.
91
+ * @property {number|null} lastPid The PID probed last tick.
92
+ */
93
+
94
/**
 * Start the world watchdog. Returns a `{ stop, tickNow, getVerdict }` handle.
 *
 * Honoring `OLAM_WORLD_WATCHDOG_DISABLED=1`: if the env var is set, returns
 * a no-op stub immediately without starting the interval or making any probe
 * calls.
 *
 * Tick cadence is `deps.intervalMs`, else `OLAM_WORLD_WATCHDOG_TICK_MS`, else
 * `DEFAULT_TICK_MS`. A value that is not a finite positive number (for example
 * an empty or non-numeric env var, which parses to NaN) is rejected and
 * replaced by `DEFAULT_TICK_MS`; handing NaN to `setInterval` would otherwise
 * make the watchdog tick almost continuously.
 *
 * @param {WorldWatchdogDeps} [deps]
 * @returns {WorldWatchdogHandle}
 */
export function startWorldWatchdog(deps = {}) {
  // Honour kill switch — return a no-op stub.
  if (process.env.OLAM_WORLD_WATCHDOG_DISABLED === '1') {
    return {
      stop() {},
      tickNow: async () => 0,
      getVerdict: () => null,
    };
  }

  const log = deps.log ?? ((m) => console.log(`[world-watchdog] ${m}`));
  const debug = deps.debug ?? (() => {});
  const setTimer = deps.setTimer ?? ((cb, ms) => setInterval(cb, ms));
  const clearTimer = deps.clearTimer ?? ((h) => clearInterval(h));
  const now = deps.now ?? (() => new Date());

  // Resolve the cadence: explicit dep wins, then env, then the default.
  let intervalMs = deps.intervalMs;
  if (intervalMs === undefined || intervalMs === null) {
    const rawEnv = process.env.OLAM_WORLD_WATCHDOG_TICK_MS;
    intervalMs = rawEnv === undefined ? DEFAULT_TICK_MS : Number.parseInt(rawEnv, 10);
  }
  if (!Number.isFinite(intervalMs) || intervalMs <= 0) {
    log(`invalid tick interval (${String(intervalMs)}); falling back to ${DEFAULT_TICK_MS}ms`);
    intervalMs = DEFAULT_TICK_MS;
  }

  const broadcaster = deps.broadcaster ?? null;
  const listActiveWorlds = deps.listActiveWorlds ?? (async () => []);
  const getClaudePidForWorld = deps.getClaudePidForWorld ?? (async (_id) => null);
  const probeOpts = deps.probes ?? {};
  // Recovery hook — null when not configured (Phase A / default-off behaviour).
  const recovery = deps.recovery ?? null;

  // Per-world state map: worldId → WorldWatchdogState.
  /** @type {Map<string, WorldWatchdogState>} */
  const worldState = new Map();

  let stopped = false;
  // Re-entrancy guard: a tick that outlives the interval must not overlap the next.
  let inFlight = false;
  let intervalHandle = null;

  /**
   * Probe a single world and update its state. Returns the verdict emitted.
   *
   * @param {string} worldId
   * @returns {Promise<'healthy'|'suspect'|'wedged'|'unknown'>}
   */
  async function probeWorld(worldId) {
    // Treat an `undefined` lookup result the same as the documented `null`
    // ("no PID") so it is never handed to the probes.
    const pid = (await getClaudePidForWorld(worldId)) ?? null;

    let wchan = null;
    let closeWaitSockets = [];
    let cpuPercent = null;

    if (pid !== null) {
      // All probes are fail-soft — they return null/[] on I/O error.
      [wchan, closeWaitSockets, cpuPercent] = await Promise.all([
        readWchan(pid, probeOpts),
        readCloseWaitSockets(pid, probeOpts),
        readCpuPercent(pid, CPU_WINDOW_MS, probeOpts),
      ]);
    }

    const closeWaitCount = closeWaitSockets.length;
    const signals = pid !== null
      ? { wchan, closeWaitCount, cpuPercent }
      : null;

    // Classify raw signals. Without a PID there is nothing to classify.
    const rawClassification = pid !== null
      ? classify({ wchan, closeWaitCount, cpuPercent })
      : 'unknown';

    // 2-tick confirm: only emit 'wedged' if BOTH this tick AND the previous tick
    // classified as 'wedged'. Otherwise emit the raw classification.
    const prev = worldState.get(worldId);
    let verdict;
    if (rawClassification === 'wedged' && prev?.lastClassification === 'wedged') {
      verdict = 'wedged';
    } else if (rawClassification === 'wedged') {
      // First 'wedged' tick — emit 'suspect' (2-tick confirm pending).
      verdict = 'suspect';
    } else {
      verdict = rawClassification;
    }

    const tickAt = now().toISOString();

    // Update per-world state.
    worldState.set(worldId, {
      lastClassification: rawClassification,
      lastVerdict: verdict,
      lastTickAt: tickAt,
      lastSignals: signals,
      lastPid: pid,
    });

    // Recovery hook — fire ONCE on verdict-transition to 'wedged' (not on
    // steady-state re-wedge). Guard: prev?.lastVerdict !== 'wedged' ensures
    // only the suspect→wedged transition triggers, not wedged→wedged.
    if (
      verdict === 'wedged' &&
      recovery !== null &&
      prev?.lastVerdict !== 'wedged'
    ) {
      // Fire-and-forget; fail-soft so a recovery error never skips other worlds.
      void recovery.onWedgedVerdict({ worldId, pid }).catch((err) => {
        log(`recovery.onWedgedVerdict ${worldId} failed: ${err?.message ?? err}`);
      });
    }

    // Emit broadcaster event.
    if (broadcaster && typeof broadcaster.broadcast === 'function') {
      try {
        broadcaster.broadcast('world.watchdog.tick', {
          worldId,
          verdict,
          signals,
          pid,
          lastTickAt: tickAt,
        });
      } catch (err) {
        log(`broadcast ${worldId} failed: ${err?.message ?? err}`);
      }
    }

    return verdict;
  }

  /**
   * One tick: get active worlds, probe each, return count processed.
   * Returns 0 without probing when stopped or when a previous tick is still
   * running.
   *
   * @returns {Promise<number>}
   */
  async function tick() {
    if (stopped) return 0;
    if (inFlight) {
      debug('tick skipped: previous tick still in flight');
      return 0;
    }
    inFlight = true;

    let processed = 0;
    try {
      let worlds;
      try {
        worlds = await listActiveWorlds();
      } catch (err) {
        log(`listActiveWorlds failed: ${err?.message ?? err}`);
        return 0;
      }

      for (const worldId of worlds) {
        if (stopped) break;
        if (typeof worldId !== 'string') continue;

        try {
          await probeWorld(worldId);
          processed += 1;
        } catch (err) {
          // Per-world fail-soft: one bad world doesn't crash the loop.
          debug(`probe ${worldId} failed: ${err?.message ?? err}`);
        }
      }
    } finally {
      inFlight = false;
    }

    return processed;
  }

  // Kick off an initial tick on next event-loop turn so callers can
  // attach test spies before any probe work happens.
  setImmediate(() => {
    if (stopped) return;
    void tick().catch((err) => {
      log(`initial tick crashed: ${err?.message ?? err}`);
    });
  });

  intervalHandle = setTimer(() => {
    void tick().catch((err) => {
      log(`tick crashed: ${err?.message ?? err}`);
    });
  }, intervalMs);
  // Don't pin the event loop on shutdown.
  if (intervalHandle && typeof intervalHandle.unref === 'function') {
    intervalHandle.unref();
  }

  log(`started: interval=${intervalMs}ms`);

  return {
    /** Stop ticking. Idempotent; a tick already in flight stops at its next world. */
    stop() {
      if (stopped) return;
      stopped = true;
      if (intervalHandle !== null) {
        try { clearTimer(intervalHandle); } catch { /* ignore */ }
        intervalHandle = null;
      }
    },

    tickNow: tick,

    /**
     * Return the latest in-memory verdict entry for a world.
     * Returns null if no tick has fired for this world yet.
     *
     * @param {string} worldId
     * @returns {WorldWatchdogState|null}
     */
    getVerdict(worldId) {
      return worldState.get(worldId) ?? null;
    },
  };
}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pleri/olam-cli",
3
- "version": "0.1.195",
3
+ "version": "0.1.198",
4
4
  "type": "module",
5
5
  "bin": {
6
6
  "olam": "./bin/olam.cjs"