@pleri/olam-cli 0.1.196 → 0.1.199

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/README.md +52 -0
  2. package/dist/ask/knowledge-pack.generated.d.ts.map +1 -1
  3. package/dist/ask/knowledge-pack.generated.js +10 -8
  4. package/dist/ask/knowledge-pack.generated.js.map +1 -1
  5. package/dist/commands/auth-list-json.d.ts +34 -0
  6. package/dist/commands/auth-list-json.d.ts.map +1 -1
  7. package/dist/commands/auth-list-json.js +24 -0
  8. package/dist/commands/auth-list-json.js.map +1 -1
  9. package/dist/commands/auth-migrate.d.ts +212 -0
  10. package/dist/commands/auth-migrate.d.ts.map +1 -0
  11. package/dist/commands/auth-migrate.js +465 -0
  12. package/dist/commands/auth-migrate.js.map +1 -0
  13. package/dist/commands/auth.d.ts.map +1 -1
  14. package/dist/commands/auth.js +239 -184
  15. package/dist/commands/auth.js.map +1 -1
  16. package/dist/commands/bootstrap.d.ts +4 -0
  17. package/dist/commands/bootstrap.d.ts.map +1 -1
  18. package/dist/commands/bootstrap.js +10 -0
  19. package/dist/commands/bootstrap.js.map +1 -1
  20. package/dist/commands/dispatch.d.ts.map +1 -1
  21. package/dist/commands/dispatch.js +11 -1
  22. package/dist/commands/dispatch.js.map +1 -1
  23. package/dist/commands/doctor.d.ts +33 -0
  24. package/dist/commands/doctor.d.ts.map +1 -1
  25. package/dist/commands/doctor.js +299 -12
  26. package/dist/commands/doctor.js.map +1 -1
  27. package/dist/commands/kg-mirror.d.ts +18 -2
  28. package/dist/commands/kg-mirror.d.ts.map +1 -1
  29. package/dist/commands/kg-mirror.js +78 -3
  30. package/dist/commands/kg-mirror.js.map +1 -1
  31. package/dist/commands/mcp/complete.d.ts +36 -0
  32. package/dist/commands/mcp/complete.d.ts.map +1 -0
  33. package/dist/commands/mcp/complete.js +66 -0
  34. package/dist/commands/mcp/complete.js.map +1 -0
  35. package/dist/commands/mcp/index.d.ts +1 -1
  36. package/dist/commands/mcp/index.d.ts.map +1 -1
  37. package/dist/commands/mcp/index.js +3 -1
  38. package/dist/commands/mcp/index.js.map +1 -1
  39. package/dist/commands/memory/bridge.d.ts +1 -1
  40. package/dist/commands/memory/bridge.d.ts.map +1 -1
  41. package/dist/commands/memory/bridge.js +2 -6
  42. package/dist/commands/memory/bridge.js.map +1 -1
  43. package/dist/commands/memory/secret.d.ts.map +1 -1
  44. package/dist/commands/memory/secret.js +4 -3
  45. package/dist/commands/memory/secret.js.map +1 -1
  46. package/dist/commands/observe.d.ts +3 -3
  47. package/dist/commands/observe.d.ts.map +1 -1
  48. package/dist/commands/observe.js +11 -8
  49. package/dist/commands/observe.js.map +1 -1
  50. package/dist/commands/runbooks.d.ts.map +1 -1
  51. package/dist/commands/runbooks.js +77 -10
  52. package/dist/commands/runbooks.js.map +1 -1
  53. package/dist/commands/services-tls.d.ts.map +1 -1
  54. package/dist/commands/services-tls.js +41 -0
  55. package/dist/commands/services-tls.js.map +1 -1
  56. package/dist/commands/services.d.ts +45 -3
  57. package/dist/commands/services.d.ts.map +1 -1
  58. package/dist/commands/services.js +198 -71
  59. package/dist/commands/services.js.map +1 -1
  60. package/dist/commands/setup-phase-8-kg-hook.d.ts +48 -0
  61. package/dist/commands/setup-phase-8-kg-hook.d.ts.map +1 -0
  62. package/dist/commands/setup-phase-8-kg-hook.js +93 -0
  63. package/dist/commands/setup-phase-8-kg-hook.js.map +1 -0
  64. package/dist/commands/setup-phase-9-memory-bridge.d.ts +36 -0
  65. package/dist/commands/setup-phase-9-memory-bridge.d.ts.map +1 -0
  66. package/dist/commands/setup-phase-9-memory-bridge.js +59 -0
  67. package/dist/commands/setup-phase-9-memory-bridge.js.map +1 -0
  68. package/dist/commands/setup.d.ts +34 -1
  69. package/dist/commands/setup.d.ts.map +1 -1
  70. package/dist/commands/setup.js +328 -23
  71. package/dist/commands/setup.js.map +1 -1
  72. package/dist/commands/update.d.ts +24 -0
  73. package/dist/commands/update.d.ts.map +1 -1
  74. package/dist/commands/update.js +53 -0
  75. package/dist/commands/update.js.map +1 -1
  76. package/dist/commands/upgrade.d.ts +5 -0
  77. package/dist/commands/upgrade.d.ts.map +1 -1
  78. package/dist/commands/upgrade.js +31 -8
  79. package/dist/commands/upgrade.js.map +1 -1
  80. package/dist/image-digests.json +8 -8
  81. package/dist/index.js +4302 -2466
  82. package/dist/lib/auth-backend.d.ts +168 -0
  83. package/dist/lib/auth-backend.d.ts.map +1 -0
  84. package/dist/lib/auth-backend.js +172 -0
  85. package/dist/lib/auth-backend.js.map +1 -0
  86. package/dist/lib/auth-list-cache.d.ts +67 -0
  87. package/dist/lib/auth-list-cache.d.ts.map +1 -0
  88. package/dist/lib/auth-list-cache.js +84 -0
  89. package/dist/lib/auth-list-cache.js.map +1 -0
  90. package/dist/lib/auth-list.d.ts +107 -0
  91. package/dist/lib/auth-list.d.ts.map +1 -0
  92. package/dist/lib/auth-list.js +123 -0
  93. package/dist/lib/auth-list.js.map +1 -0
  94. package/dist/lib/auth-login.d.ts +92 -0
  95. package/dist/lib/auth-login.d.ts.map +1 -0
  96. package/dist/lib/auth-login.js +124 -0
  97. package/dist/lib/auth-login.js.map +1 -0
  98. package/dist/lib/auth-mutator-backend.d.ts +54 -0
  99. package/dist/lib/auth-mutator-backend.d.ts.map +1 -0
  100. package/dist/lib/auth-mutator-backend.js +62 -0
  101. package/dist/lib/auth-mutator-backend.js.map +1 -0
  102. package/dist/lib/auth-remote.d.ts +50 -0
  103. package/dist/lib/auth-remote.d.ts.map +1 -1
  104. package/dist/lib/auth-remote.js +84 -2
  105. package/dist/lib/auth-remote.js.map +1 -1
  106. package/dist/lib/bootstrap-kubernetes.d.ts +93 -12
  107. package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
  108. package/dist/lib/bootstrap-kubernetes.js +364 -53
  109. package/dist/lib/bootstrap-kubernetes.js.map +1 -1
  110. package/dist/lib/config.d.ts +7 -0
  111. package/dist/lib/config.d.ts.map +1 -1
  112. package/dist/lib/config.js.map +1 -1
  113. package/dist/lib/health-probes.d.ts +0 -22
  114. package/dist/lib/health-probes.d.ts.map +1 -1
  115. package/dist/lib/health-probes.js +23 -2
  116. package/dist/lib/health-probes.js.map +1 -1
  117. package/dist/lib/peripheral-registry.d.ts +11 -0
  118. package/dist/lib/peripheral-registry.d.ts.map +1 -1
  119. package/dist/lib/peripheral-registry.js +5 -0
  120. package/dist/lib/peripheral-registry.js.map +1 -1
  121. package/dist/lib/plans-client.d.ts.map +1 -1
  122. package/dist/lib/plans-client.js +6 -3
  123. package/dist/lib/plans-client.js.map +1 -1
  124. package/dist/mcp-server.js +14 -3
  125. package/hermes-bundle/version.json +1 -1
  126. package/host-cp/k8s/manifests/30-configmap.yaml +4 -0
  127. package/host-cp/k8s/manifests/50-deployment.yaml +13 -1
  128. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  129. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  130. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  131. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  132. package/host-cp/src/dispatch-persister.mjs +157 -0
  133. package/host-cp/src/pr-nanny.mjs +7 -0
  134. package/host-cp/src/server.mjs +175 -3
  135. package/host-cp/src/world-watchdog-pid-lookup.mjs +119 -0
  136. package/host-cp/src/world-watchdog-probes.mjs +271 -0
  137. package/host-cp/src/world-watchdog-recovery.mjs +192 -0
  138. package/host-cp/src/world-watchdog.mjs +313 -0
  139. package/package.json +1 -1
@@ -0,0 +1,157 @@
1
+ /**
2
+ * dispatch-persister.mjs — persist the last dispatch for each world.
3
+ *
4
+ * The world watchdog's recovery hook reads this to replay the last
5
+ * unanswered prompt when it auto-recovers a wedged claude process.
6
+ *
7
+ * Contract:
8
+ * persist({ worldId, messageId, prompt, source, statePath?, now? })
9
+ * Atomically writes ~/.olam/worlds/<worldId>/state/last-dispatch.json.
10
+ * Overwrites any previous file — only the LATEST dispatch matters for
11
+ * replay. Atomic write (tmp + fs.rename) prevents partial-write residue
12
+ * from corrupting recovery reads.
13
+ *
14
+ * read({ worldId, statePath? })
15
+ * Returns { messageId, prompt, dispatchedAt, source } or null.
16
+ * null on ENOENT (no dispatch persisted yet) — never throws.
17
+ * null on JSON parse error (logs + skips) — never throws on corrupt file.
18
+ *
19
+ * Multiple worlds are independent: world A and world B have separate files.
20
+ * Multiple concurrent persist() calls for the SAME world are safe — each
21
+ * write is a rename of a tmp file so the worst case is one write winning.
22
+ *
23
+ * @see docs/architecture/world-watchdog.md
24
+ */
25
+
26
+ import fs from 'node:fs/promises';
27
+ import path from 'node:path';
28
+ import os from 'node:os';
29
+
30
+ // Default base path under which per-world state directories live.
31
+ const DEFAULT_STATE_BASE = path.join(os.homedir(), '.olam', 'worlds');
32
+
/**
 * Build the location of a world's `last-dispatch.json` file.
 *
 * The result is `<stateBase>/<worldId>/state/last-dispatch.json`.
 *
 * @param {string} worldId
 * @param {string} [stateBase]  Directory that holds the per-world folders
 *                              (tests pass their own; defaults to DEFAULT_STATE_BASE).
 * @returns {string}
 */
export function lastDispatchPath(worldId, stateBase = DEFAULT_STATE_BASE) {
  const worldStateDir = path.join(stateBase, worldId, 'state');
  return path.join(worldStateDir, 'last-dispatch.json');
}
43
+
/**
 * Persist the last dispatch for a world.
 *
 * Writes `{ messageId, prompt, dispatchedAt, source }` as pretty-printed JSON
 * to `statePath` (or, when omitted, to the path derived from `worldId`),
 * replacing any previous file. `dispatchedAt` is the ISO-8601 form of `now()`.
 *
 * The write is atomic: the record goes to a sibling temp file which is then
 * renamed over the target. The temp name is unique per call. With a shared
 * `<file>.tmp` name, two concurrent calls for the same world would write into
 * the same temp file, and whichever renamed second would fail with ENOENT.
 *
 * @param {{
 *   worldId: string,
 *   messageId: string,
 *   prompt: string,
 *   source: string,
 *   statePath?: string,
 *   now?: () => number,
 * }} opts
 * @returns {Promise<void>} Rejects with a TypeError when the path has to be
 *   derived from `worldId` and `worldId` is not a single path segment;
 *   rejects with the underlying fs error when mkdir/write/rename fails.
 */
export async function persist({
  worldId,
  messageId,
  prompt,
  source,
  statePath,
  now = () => Date.now(),
}) {
  if (statePath == null) {
    // worldId becomes a directory name below the state base. Refuse anything
    // that is not a plain single segment so a caller-supplied id (e.g. from a
    // request body) cannot steer the write outside that directory.
    const unsafeWorldId =
      typeof worldId !== 'string' ||
      worldId === '' ||
      worldId === '.' ||
      worldId === '..' ||
      /[/\\\0]/.test(worldId);
    if (unsafeWorldId) {
      throw new TypeError(`persist: invalid worldId ${JSON.stringify(worldId)}`);
    }
  }

  const filePath = statePath ?? lastDispatchPath(worldId);
  const dir = path.dirname(filePath);

  // Per-call temp name: pid + timestamp + random suffix.
  const uniqueSuffix = `${process.pid}.${Date.now().toString(36)}${Math.random().toString(36).slice(2, 10)}`;
  const tmpPath = `${filePath}.${uniqueSuffix}.tmp`;

  const record = {
    messageId,
    prompt,
    dispatchedAt: new Date(now()).toISOString(),
    source,
  };

  // Ensure the directory exists.
  await fs.mkdir(dir, { recursive: true });

  try {
    // Atomic write: write to the temp file, then rename over the target.
    await fs.writeFile(tmpPath, JSON.stringify(record, null, 2) + '\n', 'utf8');
    await fs.rename(tmpPath, filePath);
  } catch (err) {
    // Best-effort cleanup so a failed write does not leave temp residue.
    await fs.unlink(tmpPath).catch(() => {});
    throw err;
  }
}
83
+
/**
 * Fire-and-forget front end to persist() for the dispatch call-sites.
 *
 * Keeps the void/.catch handling in one place so the pr-nanny and
 * /api/cloud-dispatch call-sites stay in step. The persist() promise is not
 * awaited; a rejection is reported with console.warn, tagged with
 * `logSource` (falling back to `source` when `logSource` is not given).
 *
 * @param {{
 *   worldId: string,
 *   messageId: string,
 *   prompt: string,
 *   source: string,
 *   logSource?: string,
 *   statePath?: string,
 *   now?: () => number,
 * }} opts
 * @returns {void}
 */
export function safePersistLastDispatch(opts) {
  // Everything except logSource is forwarded to persist() untouched.
  const { logSource, ...persistOpts } = opts;
  const tag = logSource === undefined ? opts.source : logSource;

  const reportFailure = (err) => {
    console.warn(
      `[${tag}] persistLastDispatch failed (non-fatal): ${err?.message ?? err}`,
    );
  };

  void persist(persistOpts).catch(reportFailure);
}
110
+
/**
 * Load the last persisted dispatch for a world.
 *
 * Fail-soft: every failure path resolves to null instead of rejecting.
 *   - file missing (ENOENT)      → null, silent
 *   - any other read error       → null, logged
 *   - content is not valid JSON  → null, logged
 *   - JSON lacks the four string fields → null, logged
 *
 * @param {{
 *   worldId: string,
 *   statePath?: string,
 * }} opts
 * @returns {Promise<{ messageId: string, prompt: string, dispatchedAt: string, source: string } | null>}
 */
export async function read({ worldId, statePath }) {
  const filePath = statePath ?? lastDispatchPath(worldId);

  let contents;
  try {
    contents = await fs.readFile(filePath, 'utf8');
  } catch (err) {
    // A missing file just means nothing was persisted yet; stay quiet.
    if (err?.code !== 'ENOENT') {
      console.error(`[dispatch-persister] readFile ${filePath}: ${err?.message ?? err}`);
    }
    return null;
  }

  let candidate;
  try {
    candidate = JSON.parse(contents);
  } catch (err) {
    console.error(`[dispatch-persister] ${filePath}: JSON parse error: ${err?.message ?? err}`);
    return null;
  }

  // Shape check: a non-null object whose four fields are all strings.
  const requiredFields = ['messageId', 'prompt', 'dispatchedAt', 'source'];
  const looksValid =
    candidate !== null &&
    typeof candidate === 'object' &&
    requiredFields.every((field) => typeof candidate[field] === 'string');
  if (!looksValid) {
    console.error(`[dispatch-persister] ${filePath}: unexpected shape, skipping`);
    return null;
  }

  // Copy only the known fields so extra keys in the file are dropped.
  const { messageId, prompt, dispatchedAt, source } = candidate;
  return { messageId, prompt, dispatchedAt, source };
}
@@ -24,6 +24,7 @@
24
24
  import { execFile } from 'node:child_process';
25
25
  import { promisify } from 'node:util';
26
26
  import { pickNextTier } from './dispatch/tier-escalator.mjs';
27
+ import { safePersistLastDispatch } from './dispatch-persister.mjs';
27
28
 
28
29
  const execFileAsync = promisify(execFile);
29
30
 
@@ -251,6 +252,12 @@ export function createPrNanny({
251
252
 
252
253
  // Dispatch fix
253
254
  try {
255
+ safePersistLastDispatch({
256
+ worldId,
257
+ messageId: `nanny-${worldId}-${Date.now()}`,
258
+ prompt,
259
+ source: 'pr-nanny',
260
+ });
254
261
  await dispatchToWorld(worldId, prompt, { tier: tierForThisDispatch });
255
262
  const now = new Date().toISOString();
256
263
  prStateStore.set(worldId, {
@@ -84,6 +84,11 @@ import {
84
84
  defaultListContainerNames,
85
85
  } from './boot-reconciler.mjs';
86
86
  import { startWorldActivityTracker } from './world-activity-tracker.mjs';
87
+ import { startWorldWatchdog } from './world-watchdog.mjs';
88
+ import { createRecovery } from './world-watchdog-recovery.mjs';
89
+ import { createLeakyBucket } from './lib/leaky-bucket.mjs';
90
+ import { read as readLastDispatch, safePersistLastDispatch } from './dispatch-persister.mjs';
91
+ import { findClaudePid } from './world-watchdog-pid-lookup.mjs';
87
92
  import { authSecretHint } from './auth-secret-hint.mjs';
88
93
  import * as tunnelManager from './world-tunnel-manager.mjs';
89
94
  import * as bridgeManager from './port-bridge-manager.mjs';
@@ -96,6 +101,7 @@ import {
96
101
  import { instrumentHandler, renderMetrics } from './metrics.mjs';
97
102
  import { handleDispatchFromEmail } from './lib/email-dispatch.mjs';
98
103
  import { handleDispatchFromLinear } from './lib/linear-dispatch.mjs';
104
+ // (safePersistLastDispatch imported above alongside readLastDispatch)
99
105
  import { emitTierSuggestion } from '../dispatch/auto-tier-scheduler.mjs';
100
106
  import { isServeOnly, isOrchestrationRoute, ORCHESTRATION_UNAVAILABLE } from './serve-only-config.mjs';
101
107
 
@@ -1874,6 +1880,41 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
1874
1880
  // the isOrchestrationRoute guard — it covers /api/world/, /api/worlds/<id>,
1875
1881
  // and /v1/worlds/ for all methods, so no per-route guard is needed here.)
1876
1882
 
1883
+ // GET /api/world/<id>/socket-health — world watchdog verdict (A5).
1884
+ // Returns the latest in-memory probe result from the world watchdog.
1885
+ // Read-only; never mutates world state.
1886
+ // 200: { worldId, verdict, signals, pid, lastTickAt } — known world + tick fired
1887
+ // 200 verdict='unknown': known world but no tick has fired yet
1888
+ // 404: unknown_world
1889
+ // serve-only: returns 503 orchestration_unavailable (isOrchestrationRoute covers
1890
+ // /api/world/* prefix, so this route is already blocked upstream before reaching here).
1891
+ const socketHealthMatch = /^\/api\/world\/([^/?#]+)\/socket-health\/?$/.exec(url.pathname);
1892
+ if (socketHealthMatch && req.method === 'GET') {
1893
+ const worldId = decodeURIComponent(socketHealthMatch[1]);
1894
+ if (!(worldId in WORLDS)) {
1895
+ return jsonReply(res, 404, { error: 'unknown_world' });
1896
+ }
1897
+ // worldWatchdog is null in serve-only mode (but the serve-only gate above
1898
+ // would have returned 503 before we get here; belt-and-suspenders).
1899
+ const entry = worldWatchdog ? worldWatchdog.getVerdict(worldId) : null;
1900
+ if (!entry) {
1901
+ return jsonReply(res, 200, {
1902
+ worldId,
1903
+ verdict: 'unknown',
1904
+ signals: null,
1905
+ pid: null,
1906
+ lastTickAt: null,
1907
+ });
1908
+ }
1909
+ return jsonReply(res, 200, {
1910
+ worldId,
1911
+ verdict: entry.lastVerdict,
1912
+ signals: entry.lastSignals,
1913
+ pid: entry.lastPid,
1914
+ lastTickAt: entry.lastTickAt,
1915
+ });
1916
+ }
1917
+
1877
1918
  // GET /api/world/<id>/progress — phase ladder progress for inbox row.
1878
1919
  const progressMatch = /^\/api\/world\/([^/?#]+)\/progress\/?$/.exec(url.pathname);
1879
1920
  if (progressMatch && req.method === 'GET') {
@@ -1892,8 +1933,16 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
1892
1933
  prStateStore,
1893
1934
  getGhToken: resolveGhToken,
1894
1935
  });
1895
- progressCache.set(worldId, { fetchedAt: Date.now(), data });
1896
- return jsonReply(res, 200, data);
1936
+ // C1 attach socketHealth if watchdog has fired for this world.
1937
+ const verdictEntry = worldWatchdog ? worldWatchdog.getVerdict(worldId) : null;
1938
+ const enriched = verdictEntry
1939
+ ? {
1940
+ ...data,
1941
+ socketHealth: buildSocketHealthPayload(worldId, verdictEntry),
1942
+ }
1943
+ : data;
1944
+ progressCache.set(worldId, { fetchedAt: Date.now(), data: enriched });
1945
+ return jsonReply(res, 200, enriched);
1897
1946
  }
1898
1947
 
1899
1948
  // /api/world/<id>/* → proxy to per-world CP with X-Olam-Secret injected.
@@ -2686,6 +2735,15 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
2686
2735
  // env-var enrichments are applied.
2687
2736
  const enriched = enrichedObj ? JSON.stringify(enrichedObj) : JSON.stringify(parsed);
2688
2737
 
2738
+ if (parsed.world_id && parsed.prompt) {
2739
+ safePersistLastDispatch({
2740
+ worldId: parsed.world_id,
2741
+ messageId: parsed.session_id ?? `cloud-dispatch-${Date.now()}`,
2742
+ prompt: parsed.prompt,
2743
+ source: 'cloud-dispatch',
2744
+ });
2745
+ }
2746
+
2689
2747
  // Phase H h2: attach CF Access service-token headers when configured
2690
2748
  // (machine-to-machine auth). Additive alongside Basic auth. CF Access
2691
2749
  // headers are validated at the EDGE of origins behind a CF Access app
@@ -3674,6 +3732,119 @@ const worldActivityTracker = SERVE_ONLY
3674
3732
  broadcaster: hostStream,
3675
3733
  });
3676
3734
 
3735
+ // World watchdog — periodic probe of each active world's claude PID for the
3736
+ // three wedge signals (wchan + CLOSE_WAIT + CPU). Emits `world.watchdog.tick`
3737
+ // events on the host-stream broadcaster.
3738
+ //
3739
+ // Phase B: recovery is wired when compute.autoRecover !== false.
3740
+ // Default is false (detection-only, byte-identical to Phase A behaviour).
3741
+ //
3742
+ // SERVE-ONLY: no docker / worlds.db on a managed cluster; null sentinel keeps
3743
+ // the shutdown handler's `worldWatchdog?.stop()` a no-op.
3744
+ //
3745
+ // getClaudePidForWorld resolves each world's claude PID with findClaudePid, which
3746
+ // inspects the `olam-<worldId>-devbox` container; it yields null when no PID is found.
3747
+
3748
+ // ── Recovery setup (Phase B) ─────────────────────────────────────────────────
3749
+ // Load autoRecover from env OLAM_AUTO_RECOVER (or false by default — D4).
3750
+ // Falls back to false if config unavailable or field absent.
3751
+
// compute.autoRecover is a per-workspace setting in .olam/config.yaml, not part
// of the global ~/.olam/config.json, and host-cp does not load workspace YAML
// at startup. The mode therefore comes from the OLAM_AUTO_RECOVER environment
// variable: 'true' → true, 'dry-run' → 'dry-run', anything else (including
// unset) → false (D4 — OFF by default).
const _watchdogAutoRecoverMode = (() => {
  switch (process.env.OLAM_AUTO_RECOVER) {
    case 'true':
      return true;
    case 'dry-run':
      return 'dry-run';
    default:
      return false;
  }
})();
3761
+
// Restart budget handed to the recovery hook: capacity 3 over a one-hour window.
const _watchdogLeakyBucket = createLeakyBucket({ capacity: 3, windowMs: 60 * 60 * 1000 });

// Recovery hook for the world watchdog; null in SERVE-ONLY mode.
const _watchdogRecovery = SERVE_ONLY
  ? null
  : createRecovery({
      autoRecoverMode: _watchdogAutoRecoverMode,
      leakyBucket: _watchdogLeakyBucket,
      broadcaster: hostStream,
      persister: { read: readLastDispatch },
      // TODO: wire real replay once the operator has run the B3 idempotence
      // probe and confirmed dispatch is idempotent for all substrates in use.
      // See docs/architecture/world-watchdog.md Recovery > Idempotence probe.
      // Until then this only logs, so the stub path stays visible.
      async replay({ worldId, prompt }) {
        const preview = prompt.slice(0, 80);
        console.warn(
          `[world-watchdog-recovery] replay stub: worldId=${worldId} prompt="${preview}..." — real replay deferred pending B3 sign-off`,
        );
        // breadcrumb already emitted by createRecovery before calling replay
      },
    });
3782
+
// World watchdog instance; null in SERVE-ONLY mode.
const worldWatchdog = SERVE_ONLY
  ? null
  : startWorldWatchdog({
      broadcaster: hostStream,
      recovery: _watchdogRecovery,
      // Lists the ids of every world in worlds.db whose status is neither
      // 'destroyed' nor 'failed'. Any failure along the way (driver cannot be
      // loaded, database file missing, query error) yields an empty list.
      listActiveWorlds: async () => {
        let db = null;
        try {
          const { createRequire } = await import('node:module');
          const loadCjs = createRequire(import.meta.url);
          const Database = loadCjs('better-sqlite3');
          db = new Database(WORLDS_DB_PATH, { fileMustExist: true });
          const rows = db
            .prepare("SELECT id FROM worlds WHERE status NOT IN ('destroyed', 'failed')")
            .all();
          return rows.map((row) => row.id).filter((id) => typeof id === 'string');
        } catch {
          return [];
        } finally {
          // Close only a handle that was actually opened; ignore close errors.
          if (db) {
            try { db.close(); } catch { /* ignore */ }
          }
        }
      },
      // The world's container is addressed as `olam-<worldId>-devbox`.
      getClaudePidForWorld: async (worldId) => {
        const containerId = `olam-${worldId}-devbox`;
        return findClaudePid({ containerId });
      },
    });
3819
+
/**
 * C1 — Turn a watchdog state entry into the `socketHealth` sub-object that is
 * attached to the /progress payload.
 *
 * The base payload carries verdict / signals / pid / lastTickAt copied from
 * the entry. A `recovery` sub-object is added whenever `_watchdogRecovery` is
 * truthy (it is null in SERVE-ONLY mode); its `mode` is the configured
 * auto-recover mode and `restartsInWindow` comes from the leaky bucket's
 * peek for this world.
 *
 * @param {string} worldId  Used to peek the per-world leaky-bucket count.
 * @param {import('./world-watchdog.mjs').WorldWatchdogState} entry
 * @returns {object}
 */
function buildSocketHealthPayload(worldId, entry) {
  const base = {
    verdict: entry.lastVerdict,
    signals: entry.lastSignals,
    pid: entry.lastPid,
    lastTickAt: entry.lastTickAt,
  };
  if (!_watchdogRecovery) return base;

  const restartsInWindow = _watchdogLeakyBucket
    ? _watchdogLeakyBucket.peek(worldId).totalInWindow
    : 0;
  return {
    ...base,
    recovery: {
      mode: _watchdogAutoRecoverMode,
      restartsInWindow,
      lastRestartAt: null, // tracking per-world last-restart-at is a future enhancement
    },
  };
}
3847
+
3677
3848
  // ── Phase 1a / B1 (PR3): engine-select + await-before-listen ─────
3678
3849
  //
3679
3850
  // Decision 15: the async KubernetesEngine factory MUST be fully awaited
@@ -3774,12 +3945,13 @@ for (const sig of ['SIGTERM', 'SIGINT']) {
3774
3945
  console.log(`received ${sig}, shutting down`);
3775
3946
  stopEvents();
3776
3947
  prPoller.stop();
3777
- // worldsDbReconciler + worldActivityTracker are null in SERVE-ONLY mode.
3948
+ // worldsDbReconciler + worldActivityTracker + worldWatchdog are null in SERVE-ONLY mode.
3778
3949
  worldsDbReconciler?.stop();
3779
3950
  stopWorldsSnapshotLoop();
3780
3951
  stopTunnelsSnapshotLoop();
3781
3952
  stopListeningSnapshotLoop();
3782
3953
  worldActivityTracker?.stop();
3954
+ worldWatchdog?.stop();
3783
3955
  if (serversSnapshotTimer) { clearTimeout(serversSnapshotTimer); serversSnapshotTimer = null; }
3784
3956
  hostStream.close();
3785
3957
  if (ndjsonSpanSink) ndjsonSpanSink.close().catch(() => {});
@@ -0,0 +1,119 @@
1
+ /**
2
+ * world-watchdog-pid-lookup.mjs — host-visible PID lookup for the world watchdog.
3
+ *
4
+ * Uses `docker top <containerId>` to enumerate processes inside a world's
5
+ * container and returns the host-visible PID of the claude process.
6
+ *
7
+ * `docker top` output format (Linux Docker / Colima):
8
+ * UID PID PPID C STIME TTY TIME CMD
9
+ * root 1234 1 0 10:00 ? 00:00:00 node /usr/local/bin/claude ...
10
+ *
11
+ * The PID column (index 1 in default ps output) is already the host-visible
12
+ * PID. On Mac/Colima the container runs inside a Linux VM so `docker top`
13
+ * returns PIDs within the VM's PID namespace — these are NOT the macOS host
14
+ * PIDs, but they ARE the PIDs visible from within the Linux layer (where
15
+ * /proc reads happen). This is the same namespace the watchdog probes use
16
+ * when reading /proc/<pid>/wchan etc., so the PIDs are correct for probe use.
17
+ *
18
+ * Inject `docker` for tests (avoids spawning real docker processes).
19
+ *
20
+ * @see docs/architecture/world-watchdog.md
21
+ */
22
+
23
+ import { execFile } from 'node:child_process';
24
+ import { promisify } from 'node:util';
25
+
26
+ const execFileAsync = promisify(execFile);
27
+
/**
 * Production docker executor: runs `docker top <containerId>` through the
 * real `docker` CLI with a 5 second timeout.
 *
 * @param {string} containerId
 * @returns {Promise<string>} stdout of the command
 */
async function defaultDockerTop(containerId) {
  const args = ['top', containerId];
  const result = await execFileAsync('docker', args, { timeout: 5_000 });
  return result.stdout;
}
40
+
/**
 * Parse the stdout of `docker top` and extract the PIDs of rows whose command
 * column looks like a claude process.
 *
 * The first non-blank line is treated as the header. Column positions are
 * taken from it: the `PID` column, and a trailing `CMD` / `COMMAND` column.
 * This keeps the parser working when the host's ps prints a layout other than
 * the `ps -ef` one, e.g. a busybox-style `PID USER TIME COMMAND`. When the
 * header does not name both columns, or the command column is not last, the
 * classic `ps -ef` positions are used:
 *   UID PID PPID C STIME TTY TIME CMD   (PID = index 1, CMD = index 7 onward)
 *
 * @param {string} stdout  Raw output from `docker top <id>`
 * @returns {number[]} PIDs of matching claude processes, sorted ascending.
 */
export function parseDockerTopOutput(stdout) {
  const lines = stdout.split('\n').filter((l) => l.trim().length > 0);
  if (lines.length < 2) return []; // header only or empty

  // Locate the PID and command columns from the header row.
  const headerCols = lines[0].trim().split(/\s+/).map((h) => h.toUpperCase());
  let pidIdx = headerCols.indexOf('PID');
  let cmdIdx = headerCols.findIndex((h) => h === 'CMD' || h === 'COMMAND');
  // The command may contain spaces, so it is only usable as the LAST column.
  const headerUsable = pidIdx >= 0 && cmdIdx >= 0 && cmdIdx === headerCols.length - 1;
  if (!headerUsable) {
    pidIdx = 1;
    cmdIdx = 7;
  }
  const minParts = Math.max(pidIdx, cmdIdx) + 1;

  // Match: `claude` as standalone binary, or a `node` process running claude.
  const directBinary = /(?:^|\/)claude(\s|$)/;
  const viaNode = /node[^\s]*\s+.*[/\\]claude(?:\s|$)/;

  const pids = [];
  for (const line of lines.slice(1)) {
    // Columns are whitespace-separated; everything from cmdIdx on is the command.
    const parts = line.trim().split(/\s+/);
    if (parts.length < minParts) continue;

    const pid = parseInt(parts[pidIdx], 10);
    if (!Number.isFinite(pid) || pid <= 0) continue;

    const cmd = parts.slice(cmdIdx).join(' ');
    if (directBinary.test(cmd) || viaNode.test(cmd)) {
      pids.push(pid);
    }
  }

  return pids.sort((a, b) => a - b);
}
80
+
/**
 * Look up the PID of the claude process running inside a container.
 *
 * Runs `dockerTop(containerId)`, parses its output with
 * parseDockerTopOutput, and returns the lowest matching PID (heuristic: the
 * supervisor claude process has a lower PID than the workers it spawns).
 *
 * Fail-soft:
 *   - falsy containerId                → null, dockerTop is not called
 *   - dockerTop rejects                → null, failure passed to `log`
 *   - no claude process in the output  → null (silent)
 *
 * @param {{
 *   containerId: string,
 *   dockerTop?: (containerId: string) => Promise<string>,
 *   log?: (msg: string) => void,
 * }} opts
 * @returns {Promise<number | null>}
 */
export async function findClaudePid({
  containerId,
  dockerTop = defaultDockerTop,
  log = (m) => console.log(`[world-watchdog-pid-lookup] ${m}`),
}) {
  if (!containerId) return null;

  let listing;
  try {
    listing = await dockerTop(containerId);
  } catch (err) {
    log(`docker top ${containerId} failed: ${err?.message ?? err}`);
    return null;
  }

  // The parser returns PIDs in ascending order, so the first is the lowest.
  const [lowestPid = null] = parseDockerTopOutput(listing);
  return lowestPid;
}