@pleri/olam-cli 0.1.195 → 0.1.198

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. package/README.md +52 -0
  2. package/dist/ask/knowledge-pack.generated.d.ts.map +1 -1
  3. package/dist/ask/knowledge-pack.generated.js +12 -8
  4. package/dist/ask/knowledge-pack.generated.js.map +1 -1
  5. package/dist/commands/auth-list-json.d.ts +34 -0
  6. package/dist/commands/auth-list-json.d.ts.map +1 -1
  7. package/dist/commands/auth-list-json.js +24 -0
  8. package/dist/commands/auth-list-json.js.map +1 -1
  9. package/dist/commands/auth-migrate.d.ts +212 -0
  10. package/dist/commands/auth-migrate.d.ts.map +1 -0
  11. package/dist/commands/auth-migrate.js +465 -0
  12. package/dist/commands/auth-migrate.js.map +1 -0
  13. package/dist/commands/auth.d.ts.map +1 -1
  14. package/dist/commands/auth.js +239 -184
  15. package/dist/commands/auth.js.map +1 -1
  16. package/dist/commands/bootstrap.d.ts +4 -0
  17. package/dist/commands/bootstrap.d.ts.map +1 -1
  18. package/dist/commands/bootstrap.js +6 -0
  19. package/dist/commands/bootstrap.js.map +1 -1
  20. package/dist/commands/dispatch.d.ts.map +1 -1
  21. package/dist/commands/dispatch.js +11 -1
  22. package/dist/commands/dispatch.js.map +1 -1
  23. package/dist/commands/doctor.d.ts +33 -0
  24. package/dist/commands/doctor.d.ts.map +1 -1
  25. package/dist/commands/doctor.js +299 -12
  26. package/dist/commands/doctor.js.map +1 -1
  27. package/dist/commands/kg-mirror.d.ts +18 -2
  28. package/dist/commands/kg-mirror.d.ts.map +1 -1
  29. package/dist/commands/kg-mirror.js +78 -3
  30. package/dist/commands/kg-mirror.js.map +1 -1
  31. package/dist/commands/mcp/complete.d.ts +36 -0
  32. package/dist/commands/mcp/complete.d.ts.map +1 -0
  33. package/dist/commands/mcp/complete.js +66 -0
  34. package/dist/commands/mcp/complete.js.map +1 -0
  35. package/dist/commands/mcp/index.d.ts +1 -1
  36. package/dist/commands/mcp/index.d.ts.map +1 -1
  37. package/dist/commands/mcp/index.js +3 -1
  38. package/dist/commands/mcp/index.js.map +1 -1
  39. package/dist/commands/memory/bridge.d.ts +1 -1
  40. package/dist/commands/memory/bridge.d.ts.map +1 -1
  41. package/dist/commands/memory/bridge.js +2 -6
  42. package/dist/commands/memory/bridge.js.map +1 -1
  43. package/dist/commands/memory/secret.d.ts.map +1 -1
  44. package/dist/commands/memory/secret.js +4 -3
  45. package/dist/commands/memory/secret.js.map +1 -1
  46. package/dist/commands/observe.d.ts +3 -3
  47. package/dist/commands/observe.d.ts.map +1 -1
  48. package/dist/commands/observe.js +11 -8
  49. package/dist/commands/observe.js.map +1 -1
  50. package/dist/commands/runbooks.d.ts.map +1 -1
  51. package/dist/commands/runbooks.js +77 -10
  52. package/dist/commands/runbooks.js.map +1 -1
  53. package/dist/commands/services-tls.d.ts.map +1 -1
  54. package/dist/commands/services-tls.js +65 -10
  55. package/dist/commands/services-tls.js.map +1 -1
  56. package/dist/commands/services.d.ts +35 -1
  57. package/dist/commands/services.d.ts.map +1 -1
  58. package/dist/commands/services.js +153 -32
  59. package/dist/commands/services.js.map +1 -1
  60. package/dist/commands/setup-phase-8-kg-hook.d.ts +48 -0
  61. package/dist/commands/setup-phase-8-kg-hook.d.ts.map +1 -0
  62. package/dist/commands/setup-phase-8-kg-hook.js +93 -0
  63. package/dist/commands/setup-phase-8-kg-hook.js.map +1 -0
  64. package/dist/commands/setup-phase-9-memory-bridge.d.ts +36 -0
  65. package/dist/commands/setup-phase-9-memory-bridge.d.ts.map +1 -0
  66. package/dist/commands/setup-phase-9-memory-bridge.js +59 -0
  67. package/dist/commands/setup-phase-9-memory-bridge.js.map +1 -0
  68. package/dist/commands/setup.d.ts +34 -1
  69. package/dist/commands/setup.d.ts.map +1 -1
  70. package/dist/commands/setup.js +372 -32
  71. package/dist/commands/setup.js.map +1 -1
  72. package/dist/commands/skills-source.d.ts.map +1 -1
  73. package/dist/commands/skills-source.js +70 -1
  74. package/dist/commands/skills-source.js.map +1 -1
  75. package/dist/commands/update.d.ts +24 -0
  76. package/dist/commands/update.d.ts.map +1 -1
  77. package/dist/commands/update.js +53 -0
  78. package/dist/commands/update.js.map +1 -1
  79. package/dist/commands/upgrade.d.ts +5 -0
  80. package/dist/commands/upgrade.d.ts.map +1 -1
  81. package/dist/commands/upgrade.js +31 -8
  82. package/dist/commands/upgrade.js.map +1 -1
  83. package/dist/image-digests.json +8 -8
  84. package/dist/index.js +4487 -2451
  85. package/dist/lib/auth-backend.d.ts +168 -0
  86. package/dist/lib/auth-backend.d.ts.map +1 -0
  87. package/dist/lib/auth-backend.js +172 -0
  88. package/dist/lib/auth-backend.js.map +1 -0
  89. package/dist/lib/auth-list-cache.d.ts +67 -0
  90. package/dist/lib/auth-list-cache.d.ts.map +1 -0
  91. package/dist/lib/auth-list-cache.js +84 -0
  92. package/dist/lib/auth-list-cache.js.map +1 -0
  93. package/dist/lib/auth-list.d.ts +107 -0
  94. package/dist/lib/auth-list.d.ts.map +1 -0
  95. package/dist/lib/auth-list.js +123 -0
  96. package/dist/lib/auth-list.js.map +1 -0
  97. package/dist/lib/auth-login.d.ts +92 -0
  98. package/dist/lib/auth-login.d.ts.map +1 -0
  99. package/dist/lib/auth-login.js +124 -0
  100. package/dist/lib/auth-login.js.map +1 -0
  101. package/dist/lib/auth-mutator-backend.d.ts +54 -0
  102. package/dist/lib/auth-mutator-backend.d.ts.map +1 -0
  103. package/dist/lib/auth-mutator-backend.js +62 -0
  104. package/dist/lib/auth-mutator-backend.js.map +1 -0
  105. package/dist/lib/auth-remote.d.ts +50 -0
  106. package/dist/lib/auth-remote.d.ts.map +1 -1
  107. package/dist/lib/auth-remote.js +84 -2
  108. package/dist/lib/auth-remote.js.map +1 -1
  109. package/dist/lib/bootstrap-kubernetes.d.ts +69 -10
  110. package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
  111. package/dist/lib/bootstrap-kubernetes.js +264 -46
  112. package/dist/lib/bootstrap-kubernetes.js.map +1 -1
  113. package/dist/lib/config.d.ts +35 -4
  114. package/dist/lib/config.d.ts.map +1 -1
  115. package/dist/lib/config.js +82 -11
  116. package/dist/lib/config.js.map +1 -1
  117. package/dist/lib/health-probes.d.ts +0 -22
  118. package/dist/lib/health-probes.d.ts.map +1 -1
  119. package/dist/lib/health-probes.js +57 -0
  120. package/dist/lib/health-probes.js.map +1 -1
  121. package/dist/lib/peripheral-registry.d.ts +11 -0
  122. package/dist/lib/peripheral-registry.d.ts.map +1 -1
  123. package/dist/lib/peripheral-registry.js +5 -0
  124. package/dist/lib/peripheral-registry.js.map +1 -1
  125. package/dist/lib/plans-client.d.ts.map +1 -1
  126. package/dist/lib/plans-client.js +6 -3
  127. package/dist/lib/plans-client.js.map +1 -1
  128. package/dist/mcp-server.js +138 -6
  129. package/hermes-bundle/version.json +1 -1
  130. package/host-cp/k8s/manifests/30-configmap.yaml +4 -0
  131. package/host-cp/k8s/manifests/50-deployment.yaml +13 -1
  132. package/host-cp/k8s/manifests/65-tls-secret-template.yaml.tmpl +35 -0
  133. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  134. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  135. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  136. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  137. package/host-cp/src/dispatch-persister.mjs +157 -0
  138. package/host-cp/src/pr-nanny.mjs +7 -0
  139. package/host-cp/src/server.mjs +175 -3
  140. package/host-cp/src/world-watchdog-pid-lookup.mjs +119 -0
  141. package/host-cp/src/world-watchdog-probes.mjs +271 -0
  142. package/host-cp/src/world-watchdog-recovery.mjs +192 -0
  143. package/host-cp/src/world-watchdog.mjs +313 -0
  144. package/package.json +1 -1
@@ -84,6 +84,11 @@ import {
84
84
  defaultListContainerNames,
85
85
  } from './boot-reconciler.mjs';
86
86
  import { startWorldActivityTracker } from './world-activity-tracker.mjs';
87
+ import { startWorldWatchdog } from './world-watchdog.mjs';
88
+ import { createRecovery } from './world-watchdog-recovery.mjs';
89
+ import { createLeakyBucket } from './lib/leaky-bucket.mjs';
90
+ import { read as readLastDispatch, safePersistLastDispatch } from './dispatch-persister.mjs';
91
+ import { findClaudePid } from './world-watchdog-pid-lookup.mjs';
87
92
  import { authSecretHint } from './auth-secret-hint.mjs';
88
93
  import * as tunnelManager from './world-tunnel-manager.mjs';
89
94
  import * as bridgeManager from './port-bridge-manager.mjs';
@@ -96,6 +101,7 @@ import {
96
101
  import { instrumentHandler, renderMetrics } from './metrics.mjs';
97
102
  import { handleDispatchFromEmail } from './lib/email-dispatch.mjs';
98
103
  import { handleDispatchFromLinear } from './lib/linear-dispatch.mjs';
104
+ // (safePersistLastDispatch imported above alongside readLastDispatch)
99
105
  import { emitTierSuggestion } from '../dispatch/auto-tier-scheduler.mjs';
100
106
  import { isServeOnly, isOrchestrationRoute, ORCHESTRATION_UNAVAILABLE } from './serve-only-config.mjs';
101
107
 
@@ -1874,6 +1880,41 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
1874
1880
  // the isOrchestrationRoute guard — it covers /api/world/, /api/worlds/<id>,
1875
1881
  // and /v1/worlds/ for all methods, so no per-route guard is needed here.)
1876
1882
 
1883
+ // GET /api/world/<id>/socket-health — world watchdog verdict (A5).
1884
+ // Returns the latest in-memory probe result from the world watchdog.
1885
+ // Read-only; never mutates world state.
1886
+ // 200: { worldId, verdict, signals, pid, lastTickAt } — known world + tick fired
1887
+ // 200 verdict='unknown': known world but no tick has fired yet
1888
+ // 404: unknown_world
1889
+ // serve-only: returns 503 orchestration_unavailable (isOrchestrationRoute covers
1890
+ // /api/world/* prefix, so this route is already blocked upstream before reaching here).
1891
+ const socketHealthMatch = /^\/api\/world\/([^/?#]+)\/socket-health\/?$/.exec(url.pathname);
1892
+ if (socketHealthMatch && req.method === 'GET') {
1893
+ const worldId = decodeURIComponent(socketHealthMatch[1]);
1894
+ if (!(worldId in WORLDS)) {
1895
+ return jsonReply(res, 404, { error: 'unknown_world' });
1896
+ }
1897
+ // worldWatchdog is null in serve-only mode (but the serve-only gate above
1898
+ // would have returned 503 before we get here; belt-and-suspenders).
1899
+ const entry = worldWatchdog ? worldWatchdog.getVerdict(worldId) : null;
1900
+ if (!entry) {
1901
+ return jsonReply(res, 200, {
1902
+ worldId,
1903
+ verdict: 'unknown',
1904
+ signals: null,
1905
+ pid: null,
1906
+ lastTickAt: null,
1907
+ });
1908
+ }
1909
+ return jsonReply(res, 200, {
1910
+ worldId,
1911
+ verdict: entry.lastVerdict,
1912
+ signals: entry.lastSignals,
1913
+ pid: entry.lastPid,
1914
+ lastTickAt: entry.lastTickAt,
1915
+ });
1916
+ }
1917
+
1877
1918
  // GET /api/world/<id>/progress — phase ladder progress for inbox row.
1878
1919
  const progressMatch = /^\/api\/world\/([^/?#]+)\/progress\/?$/.exec(url.pathname);
1879
1920
  if (progressMatch && req.method === 'GET') {
@@ -1892,8 +1933,16 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
1892
1933
  prStateStore,
1893
1934
  getGhToken: resolveGhToken,
1894
1935
  });
1895
- progressCache.set(worldId, { fetchedAt: Date.now(), data });
1896
- return jsonReply(res, 200, data);
1936
+ // C1 attach socketHealth if watchdog has fired for this world.
1937
+ const verdictEntry = worldWatchdog ? worldWatchdog.getVerdict(worldId) : null;
1938
+ const enriched = verdictEntry
1939
+ ? {
1940
+ ...data,
1941
+ socketHealth: buildSocketHealthPayload(worldId, verdictEntry),
1942
+ }
1943
+ : data;
1944
+ progressCache.set(worldId, { fetchedAt: Date.now(), data: enriched });
1945
+ return jsonReply(res, 200, enriched);
1897
1946
  }
1898
1947
 
1899
1948
  // /api/world/<id>/* → proxy to per-world CP with X-Olam-Secret injected.
@@ -2686,6 +2735,15 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
2686
2735
  // env-var enrichments are applied.
2687
2736
  const enriched = enrichedObj ? JSON.stringify(enrichedObj) : JSON.stringify(parsed);
2688
2737
 
2738
+ if (parsed.world_id && parsed.prompt) {
2739
+ safePersistLastDispatch({
2740
+ worldId: parsed.world_id,
2741
+ messageId: parsed.session_id ?? `cloud-dispatch-${Date.now()}`,
2742
+ prompt: parsed.prompt,
2743
+ source: 'cloud-dispatch',
2744
+ });
2745
+ }
2746
+
2689
2747
  // Phase H h2: attach CF Access service-token headers when configured
2690
2748
  // (machine-to-machine auth). Additive alongside Basic auth. CF Access
2691
2749
  // headers are validated at the EDGE of origins behind a CF Access app
@@ -3674,6 +3732,119 @@ const worldActivityTracker = SERVE_ONLY
3674
3732
  broadcaster: hostStream,
3675
3733
  });
3676
3734
 
3735
+ // World watchdog — periodic probe of each active world's claude PID for the
3736
+ // three wedge signals (wchan + CLOSE_WAIT + CPU). Emits `world.watchdog.tick`
3737
+ // events on the host-stream broadcaster.
3738
+ //
3739
+ // Phase B: recovery is wired when compute.autoRecover !== false.
3740
+ // Default is false (detection-only, byte-identical to Phase A behaviour).
3741
+ //
3742
+ // SERVE-ONLY: no docker / worlds.db on a managed cluster; null sentinel keeps
3743
+ // the shutdown handler's `worldWatchdog?.stop()` a no-op.
3744
+ //
3745
+ // getClaudePidForWorld resolves the PID via findClaudePid (`docker top` on the
3746
+ // world's `olam-<worldId>-devbox` container); it yields null when no PID is found.
3747
+
3748
+ // ── Recovery setup (Phase B) ─────────────────────────────────────────────────
3749
+ // Load autoRecover from env OLAM_AUTO_RECOVER (or false by default — D4).
3750
+ // Falls back to false if config unavailable or field absent.
3751
+
3752
+ // The compute.autoRecover field lives in .olam/config.yaml (per workspace),
3753
+ // not in ~/.olam/config.json (global). Host-cp does not load workspace YAML at
3754
+ // startup. Read from env OLAM_AUTO_RECOVER; default false (D4 — OFF by default).
3755
// Watchdog auto-recover mode, resolved once at module load from the
// OLAM_AUTO_RECOVER environment variable:
//   'true'    → true
//   'dry-run' → 'dry-run'
//   otherwise → false (including when the variable is unset)
const _watchdogAutoRecoverMode = (() => {
  switch (process.env.OLAM_AUTO_RECOVER) {
    case 'true':
      return true;
    case 'dry-run':
      return 'dry-run';
    default:
      return false;
  }
})();
3761
+
3762
// Leaky bucket shared by the recovery controller and the socketHealth payload
// (which peeks it per world id). Configured with capacity 3 and a
// 3 600 000 ms window.
const _watchdogLeakyBucket = createLeakyBucket({ capacity: 3, windowMs: 3_600_000 });

// Recovery controller handed to the world watchdog. Null in SERVE-ONLY mode.
const _watchdogRecovery = SERVE_ONLY
  ? null
  : createRecovery({
      autoRecoverMode: _watchdogAutoRecoverMode,
      leakyBucket: _watchdogLeakyBucket,
      broadcaster: hostStream,
      persister: { read: readLastDispatch },
      // TODO: wire real replay once operator has run the B3 idempotence probe
      // and confirmed dispatch is idempotent for all substrates in use.
      // See docs/architecture/world-watchdog.md Recovery > Idempotence probe.
      // For now: log only, so the stub path is visible.
      replay: async ({ worldId, prompt }) => {
        // The persisted prompt is copied from a parsed JSON request body and is
        // only checked for truthiness there, so it may not be a string. Coerce
        // before slicing so this log-only stub can never reject.
        const promptPreview = String(prompt ?? '').slice(0, 80);
        console.warn(
          `[world-watchdog-recovery] replay stub: worldId=${worldId} prompt="${promptPreview}..." — real replay deferred pending B3 sign-off`,
        );
        // breadcrumb already emitted by createRecovery before calling replay
      },
    });
3782
+
3783
// World watchdog instance. Null in SERVE-ONLY mode, which keeps the shutdown
// handler's `worldWatchdog?.stop()` a no-op.
const worldWatchdog = SERVE_ONLY
  ? null
  : startWorldWatchdog({
      broadcaster: hostStream,
      recovery: _watchdogRecovery,
      // Returns the string ids of every worlds.db row whose status is neither
      // 'destroyed' nor 'failed'. Any failure (driver not loadable, database
      // file missing, query error) yields an empty list instead of throwing.
      listActiveWorlds: async () => {
        let db;
        try {
          const { createRequire } = await import('node:module');
          const Database = createRequire(import.meta.url)('better-sqlite3');
          db = new Database(WORLDS_DB_PATH, { fileMustExist: true });
        } catch {
          return [];
        }

        try {
          const rows = db
            .prepare("SELECT id FROM worlds WHERE status NOT IN ('destroyed', 'failed')")
            .all();
          const worldIds = [];
          for (const row of rows) {
            if (typeof row.id === 'string') worldIds.push(row.id);
          }
          return worldIds;
        } catch {
          return [];
        } finally {
          // Closing is best-effort; a close failure must not mask the result.
          try {
            db.close();
          } catch {
            /* ignore */
          }
        }
      },
      // The container name is derived from the world id.
      getClaudePidForWorld: async (worldId) =>
        findClaudePid({ containerId: `olam-${worldId}-devbox` }),
    });
3819
+
3820
/**
 * C1 — Build the `socketHealth` sub-object from a watchdog state entry.
 *
 * The result always carries `verdict`, `signals`, `pid` and `lastTickAt`,
 * copied from the entry's `last*` fields. A `recovery` sub-object is added
 * whenever the module-level recovery controller (`_watchdogRecovery`) is truthy.
 *
 * NOTE(review): the guard tests the controller object, not the
 * OLAM_AUTO_RECOVER mode, so `recovery.mode` may be `false` here unless
 * createRecovery returns a falsy value for that mode — confirm the intent.
 *
 * @param {string} worldId Key used to peek the leaky bucket.
 * @param {import('./world-watchdog.mjs').WorldWatchdogState} entry
 * @returns {object}
 */
function buildSocketHealthPayload(worldId, entry) {
  const { lastVerdict, lastSignals, lastPid, lastTickAt } = entry;
  const payload = {
    verdict: lastVerdict,
    signals: lastSignals,
    pid: lastPid,
    lastTickAt,
  };

  if (!_watchdogRecovery) return payload;

  const restartsInWindow = _watchdogLeakyBucket
    ? _watchdogLeakyBucket.peek(worldId).totalInWindow
    : 0;

  payload.recovery = {
    mode: _watchdogAutoRecoverMode,
    restartsInWindow,
    lastRestartAt: null, // tracking per-world last-restart-at is a future enhancement
  };
  return payload;
}
3847
+
3677
3848
  // ── Phase 1a / B1 (PR3): engine-select + await-before-listen ─────
3678
3849
  //
3679
3850
  // Decision 15: the async KubernetesEngine factory MUST be fully awaited
@@ -3774,12 +3945,13 @@ for (const sig of ['SIGTERM', 'SIGINT']) {
3774
3945
  console.log(`received ${sig}, shutting down`);
3775
3946
  stopEvents();
3776
3947
  prPoller.stop();
3777
- // worldsDbReconciler + worldActivityTracker are null in SERVE-ONLY mode.
3948
+ // worldsDbReconciler + worldActivityTracker + worldWatchdog are null in SERVE-ONLY mode.
3778
3949
  worldsDbReconciler?.stop();
3779
3950
  stopWorldsSnapshotLoop();
3780
3951
  stopTunnelsSnapshotLoop();
3781
3952
  stopListeningSnapshotLoop();
3782
3953
  worldActivityTracker?.stop();
3954
+ worldWatchdog?.stop();
3783
3955
  if (serversSnapshotTimer) { clearTimeout(serversSnapshotTimer); serversSnapshotTimer = null; }
3784
3956
  hostStream.close();
3785
3957
  if (ndjsonSpanSink) ndjsonSpanSink.close().catch(() => {});
@@ -0,0 +1,119 @@
1
+ /**
2
+ * world-watchdog-pid-lookup.mjs — host-visible PID lookup for the world watchdog.
3
+ *
4
+ * Uses `docker top <containerId>` to enumerate processes inside a world's
5
+ * container and returns the host-visible PID of the claude process.
6
+ *
7
+ * `docker top` output format (Linux Docker / Colima):
8
+ * UID PID PPID C STIME TTY TIME CMD
9
+ * root 1234 1 0 10:00 ? 00:00:00 node /usr/local/bin/claude ...
10
+ *
11
+ * The PID column (index 1 in default ps output) is already the host-visible
12
+ * PID. On Mac/Colima the container runs inside a Linux VM so `docker top`
13
+ * returns PIDs within the VM's PID namespace — these are NOT the macOS host
14
+ * PIDs, but they ARE the PIDs visible from within the Linux layer (where
15
+ * /proc reads happen). This is the same namespace the watchdog probes use
16
+ * when reading /proc/<pid>/wchan etc., so the PIDs are correct for probe use.
17
+ *
18
+ * Inject `docker` for tests (avoids spawning real docker processes).
19
+ *
20
+ * @see docs/architecture/world-watchdog.md
21
+ */
22
+
23
+ import { execFile } from 'node:child_process';
24
+ import { promisify } from 'node:util';
25
+
26
+ const execFileAsync = promisify(execFile);
27
+
28
/**
 * Default executor for `docker top`: invokes the real `docker` CLI through
 * the promisified execFile with a 5 000 ms timeout.
 *
 * @param {string} containerId Container name or id passed to `docker top`.
 * @returns {Promise<string>} stdout of `docker top <containerId>`.
 */
async function defaultDockerTop(containerId) {
  const result = await execFileAsync('docker', ['top', containerId], { timeout: 5_000 });
  return result.stdout;
}
40
+
41
/**
 * Parse the stdout of `docker top` and return the PIDs of rows whose command
 * looks like a claude process.
 *
 * Column positions are taken from the header row when it contains a `PID`
 * column and ends with `CMD` or `COMMAND`. Otherwise the default `ps -ef`
 * layout is assumed:
 *   UID PID PPID C STIME TTY TIME CMD   (PID = index 1, CMD = index 7 onward)
 *
 * Each row is split on runs of whitespace and the command is rebuilt by
 * joining the trailing tokens with single spaces, so original spacing inside
 * the command is not preserved.
 *
 * A row matches when the command contains `claude` as a whole path segment
 * (at the start or after a `/`, followed by whitespace or end of string), or
 * is a `node…` invocation whose argument ends in `/claude` or `\claude`.
 *
 * @param {string} stdout Raw output from `docker top <id>`.
 * @returns {number[]} Matching PIDs sorted ascending; empty for missing,
 *   non-string, empty or header-only input.
 */
export function parseDockerTopOutput(stdout) {
  // Fail-soft: an injected executor may hand back something other than text.
  if (typeof stdout !== 'string') return [];

  const rows = stdout.split('\n').filter((row) => row.trim().length > 0);
  if (rows.length < 2) return []; // header only or empty

  const [headerRow, ...dataRows] = rows;

  // Locate the columns from the header; fall back to the ps -ef defaults when
  // the header is not recognisable.
  const header = headerRow.trim().split(/\s+/).map((name) => name.toUpperCase());
  const lastIdx = header.length - 1;
  const headerPidIdx = header.indexOf('PID');
  const commandIsLast = header[lastIdx] === 'CMD' || header[lastIdx] === 'COMMAND';
  const headerUsable = commandIsLast && headerPidIdx !== -1 && headerPidIdx < lastIdx;
  const pidIdx = headerUsable ? headerPidIdx : 1;
  const cmdIdx = headerUsable ? lastIdx : 7;

  const standaloneClaude = /(?:^|\/)claude(\s|$)/;
  const nodeRunningClaude = /node[^\s]*\s+.*[/\\]claude(?:\s|$)/;

  const pids = [];
  for (const row of dataRows) {
    const parts = row.trim().split(/\s+/);
    if (parts.length <= cmdIdx) continue;

    const pid = Number.parseInt(parts[pidIdx], 10);
    if (!Number.isFinite(pid) || pid <= 0) continue;

    const cmd = parts.slice(cmdIdx).join(' ');
    if (standaloneClaude.test(cmd) || nodeRunningClaude.test(cmd)) {
      pids.push(pid);
    }
  }

  return pids.sort((a, b) => a - b);
}
80
+
81
/**
 * Find the PID of the claude process running inside a container, as reported
 * by `docker top`.
 *
 * When several rows match, the lowest PID is returned. This is a heuristic
 * that assumes the parent/supervisor process has a lower PID than its children.
 *
 * Fail-soft — never throws for these cases:
 *   - empty `containerId`                     → null (silent)
 *   - executor rejects (docker unreachable,
 *     container not found, timeout)           → null + log
 *   - executor resolves with a non-string     → null + log
 *   - no claude process in the container      → null (silent)
 *
 * @param {{
 *   containerId: string,
 *   dockerTop?: (containerId: string) => Promise<string>,
 *   log?: (msg: string) => void,
 * }} opts
 *   `dockerTop` is injectable for tests; it defaults to the real docker CLI.
 * @returns {Promise<number | null>}
 */
export async function findClaudePid({
  containerId,
  dockerTop = defaultDockerTop,
  log = (m) => console.log(`[world-watchdog-pid-lookup] ${m}`),
}) {
  if (!containerId) return null;

  let stdout;
  try {
    stdout = await dockerTop(containerId);
  } catch (err) {
    log(`docker top ${containerId} failed: ${err?.message ?? err}`);
    return null;
  }

  // An injected executor may resolve with something other than text. Treat
  // that like a failed lookup rather than letting the parser throw.
  if (typeof stdout !== 'string') {
    log(`docker top ${containerId} returned non-string output`);
    return null;
  }

  const pids = parseDockerTopOutput(stdout);
  // The parser returns PIDs sorted ascending, so index 0 is the lowest.
  return pids.length > 0 ? pids[0] : null;
}
@@ -0,0 +1,271 @@
1
+ /**
2
+ * world-watchdog-probes.mjs — pure probe functions for the world watchdog.
3
+ *
4
+ * Three readers extract raw signals from the Linux /proc filesystem:
5
+ * - readWchan(pid, opts) → string | null
6
+ * - readCloseWaitSockets(pid, opts) → Array<{remoteIp, remotePort}>
7
+ * - readCpuPercent(pid, windowMs, opts) → number | null
8
+ *
9
+ * One pure classifier turns those signals into a verdict:
10
+ * - classify({ wchan, closeWaitCount, cpuPercent }) → 'healthy'|'suspect'|'wedged'
11
+ *
12
+ * All readers are fail-soft: any I/O error or parse error returns
13
+ * null / [] / 0 rather than throwing. The classifier treats null inputs as
14
+ * the signal not firing (conservative — only promotes to 'wedged' when all
15
+ * three signals are conclusive).
16
+ *
17
+ * Test injection: pass `opts.procRoot` to redirect /proc reads to a fixture
18
+ * directory (e.g. src/__tests__/fixtures/proc-gold-elk-5574/).
19
+ *
20
+ * CLOSE_WAIT threshold note (deviation from D2): Decision D2 specifies
21
+ * filtering CLOSE_WAIT by peer hostname (*.anthropic.com | auth-worker.*).
22
+ * DNS resolution at every tick is unreliable under network stress (exactly
23
+ * when the watchdog must be most accurate). The gold-elk-5574 forensic data
24
+ * shows ≥3 CLOSE_WAIT to ANY peer is already diagnostic — a healthy claude
25
+ * process has 0-1 CLOSE_WAIT sockets under normal operation. The classifier
26
+ * therefore uses count ≥ 3 without hostname filtering. This deviation is
27
+ * documented in docs/architecture/world-watchdog.md Signal 2.
28
+ *
29
+ * @see docs/architecture/world-watchdog.md
30
+ * @see packages/host-cp/src/__tests__/world-watchdog-probes.test.mjs
31
+ */
32
+
33
+ import fs from 'node:fs/promises';
34
+ import path from 'node:path';
35
+
36
+ // utime/stime in /proc/<pid>/stat are counted in clock ticks (USER_HZ, i.e.
37
+ // sysconf(_SC_CLK_TCK)), not in kernel scheduler jiffies. USER_HZ is 100 on
38
+ // virtually all Linux builds regardless of the kernel's CONFIG_HZ (100/250/1000).
39
+ // We divide the tick delta by this value to get seconds of CPU time, then
40
+ // compare that to the wall-clock window.
41
+ const LINUX_HZ = 100;
42
+
43
+ // /proc/net/tcp state byte for CLOSE_WAIT.
44
+ const CLOSE_WAIT_STATE = '08';
45
+
46
/**
 * Read `<procRoot>/<pid>/wchan` and return its trimmed content.
 *
 * @param {number|string} pid Process ID.
 * @param {{ procRoot?: string }} [opts]
 *   `procRoot` defaults to '/proc'; override it to point at a fixture tree.
 * @returns {Promise<string|null>}
 *   The wchan string (e.g. 'futex_wait_queue'), or null when the file cannot
 *   be read or is blank.
 */
export async function readWchan(pid, opts = {}) {
  const root = opts.procRoot ?? '/proc';
  const target = path.join(root, String(pid), 'wchan');

  let raw;
  try {
    raw = await fs.readFile(target, 'utf8');
  } catch {
    return null;
  }

  const channel = raw.trim();
  return channel === '' ? null : channel;
}
65
+
66
/**
 * Collect the CLOSE_WAIT entries listed in `<procRoot>/<pid>/net/tcp` and
 * `<procRoot>/<pid>/net/tcp6`.
 *
 * Each table is read in order (tcp, then tcp6). The first line is treated as
 * a header; every other non-blank line is split on whitespace and kept when
 * its state column (index 3) equals the CLOSE_WAIT state byte. The remote
 * address column (index 2, `HEXIP:HEXPORT`) is decoded into an IP string and
 * a numeric port. Rows that cannot be decoded are dropped, and a table that
 * cannot be read is skipped.
 *
 * NOTE(review): per proc(5), /proc/<pid>/net/tcp lists the sockets of the
 * process's network namespace rather than only those owned by <pid>. The
 * count may therefore include other processes in the same container —
 * confirm this is acceptable for the watchdog's threshold.
 *
 * @param {number|string} pid Process ID.
 * @param {{ procRoot?: string }} [opts] `procRoot` defaults to '/proc'.
 * @returns {Promise<Array<{remoteIp: string, remotePort: number}>>}
 *   CLOSE_WAIT socket descriptors; empty when nothing matches or nothing is readable.
 */
export async function readCloseWaitSockets(pid, opts = {}) {
  const procRoot = opts.procRoot ?? '/proc';
  const sockets = [];

  for (const table of ['tcp', 'tcp6']) {
    const tablePath = path.join(procRoot, String(pid), 'net', table);

    let text;
    try {
      text = await fs.readFile(tablePath, 'utf8');
    } catch {
      // pid gone or table not available — skip, not an error.
      continue;
    }

    // slice(1) drops the column-header line.
    for (const rawRow of text.split('\n').slice(1)) {
      const row = rawRow.trim();
      if (row === '') continue;

      // Columns (0-based): 0 sl, 1 local_address, 2 rem_address, 3 st.
      const fields = row.split(/\s+/);
      if (fields.length < 4 || fields[3] !== CLOSE_WAIT_STATE) continue;

      const remAddr = fields[2];
      const sep = remAddr.lastIndexOf(':');
      if (sep === -1) continue;

      const remoteIp = parseHexIp(remAddr.slice(0, sep));
      const remotePort = parseInt(remAddr.slice(sep + 1), 16);
      if (remoteIp === null || !Number.isFinite(remotePort)) continue;

      sockets.push({ remoteIp, remotePort });
    }
  }

  return sockets;
}
124
+
125
/**
 * Measure CPU utilisation for a process over a time window.
 *
 * Reads `<procRoot>/<pid>/stat` twice, `windowMs` apart, and computes:
 *   cpuPercent = (utime+stime delta) / HZ / elapsedSeconds * 100
 *
 * `elapsedSeconds` is the interval measured with `now()` when that is longer
 * than the requested window, i.e. when the sleep overran. Otherwise it is the
 * nominal window. The interval is never taken to be shorter than `windowMs`,
 * so an injected instant `sleep` or a frozen clock still produces the nominal
 * result.
 *
 * @param {number|string} pid Process ID.
 * @param {number} windowMs Measurement window in milliseconds; must be a
 *   finite value greater than 0.
 * @param {{ procRoot?: string, sleep?: (ms: number) => Promise<void>, now?: () => number }} [opts]
 *   `sleep`    — injectable delay function (default: real setTimeout).
 *   `now`      — injectable millisecond clock (default: Date.now).
 *   `procRoot` — injectable proc root for tests (default: '/proc').
 * @returns {Promise<number|null>}
 *   CPU percent (0–100+), or null when the window is unusable, a stat read or
 *   parse fails, or the tick counters went backwards.
 */
export async function readCpuPercent(pid, windowMs, opts = {}) {
  // Validate the window first, so an unusable value costs no sleep and no
  // /proc reads, and can never surface as NaN.
  const nominalMs = Number(windowMs);
  if (!Number.isFinite(nominalMs) || nominalMs <= 0) return null;

  const procRoot = opts.procRoot ?? '/proc';
  const sleep = opts.sleep ?? ((ms) => new Promise((resolve) => setTimeout(resolve, ms)));
  const now = opts.now ?? Date.now;
  const statPath = path.join(procRoot, String(pid), 'stat');

  const before = await readStatTimes(statPath);
  if (before === null) return null;
  const startedAt = now();

  await sleep(nominalMs);

  const after = await readStatTimes(statPath);
  if (after === null) return null;
  const measuredMs = now() - startedAt;

  const deltaTicks = (after.utime + after.stime) - (before.utime + before.stime);
  if (deltaTicks < 0) return null;

  // Timers can fire late when the event loop is busy. Dividing by the nominal
  // window would then overstate CPU use, so prefer the measured interval
  // whenever it is longer.
  const elapsedMs = Number.isFinite(measuredMs) && measuredMs > nominalMs ? measuredMs : nominalMs;

  // deltaTicks / HZ = CPU-seconds consumed; elapsedMs / 1000 = wall-clock seconds.
  return (deltaTicks / LINUX_HZ / (elapsedMs / 1000)) * 100;
}
164
+
165
+ // ── Internal helpers ──────────────────────────────────────────────────────────
166
+
167
/**
 * Read a `/proc/<pid>/stat` file and extract its utime and stime counters.
 *
 * The stat line looks like `pid (comm) state ppid pgrp ... utime stime ...`.
 * Because `comm` may itself contain spaces and parentheses, parsing starts
 * after the LAST ')' in the file. Counting from there (0-based):
 *   0 state, 1 ppid, 2 pgrp, 3 session, 4 tty_nr, 5 tpgid, 6 flags,
 *   7 minflt, 8 cminflt, 9 majflt, 10 cmajflt, 11 utime, 12 stime
 *
 * @param {string} statPath Path of the stat file to read.
 * @returns {Promise<{utime: number, stime: number}|null>}
 *   Parsed counters, or null when the file cannot be read, has no ')', has
 *   fewer than 13 trailing fields, or either counter is not numeric.
 */
async function readStatTimes(statPath) {
  let raw;
  try {
    raw = await fs.readFile(statPath, 'utf8');
  } catch {
    return null;
  }

  const commEnd = raw.lastIndexOf(')');
  if (commEnd === -1) return null;

  const tail = raw.slice(commEnd + 1).trim().split(/\s+/);
  if (tail.length < 13) return null;

  const [utime, stime] = [tail[11], tail[12]].map((field) => parseInt(field, 10));
  return Number.isFinite(utime) && Number.isFinite(stime) ? { utime, stime } : null;
}
203
+
204
/**
 * Decode a hex-encoded IP address in /proc/net/tcp format.
 *
 * IPv4: 8 hex chars; the bytes are reversed before decoding
 *       (e.g. "0100007F" → "127.0.0.1").
 * IPv6: 32 hex chars, handled as four 8-char words. The bytes of each word
 *       are reversed, then paired into 16-bit groups. The result is the
 *       uncompressed 8-group form and keeps the input's letter case
 *       (e.g. "…01000000" → "0000:…:0000:0001").
 *
 * Input that is not a string, contains a non-hex character, or has any other
 * length returns null. Both address families are validated the same way, so a
 * malformed address can never decode to a partly parsed or garbage value.
 *
 * @param {string} hexIp
 * @returns {string|null}
 */
function parseHexIp(hexIp) {
  if (typeof hexIp !== 'string' || !/^[0-9a-fA-F]+$/.test(hexIp)) return null;

  // Split one 8-char word into its four bytes, in reversed order.
  const reverseWord = (word) => [
    word.slice(6, 8),
    word.slice(4, 6),
    word.slice(2, 4),
    word.slice(0, 2),
  ];

  if (hexIp.length === 8) {
    return reverseWord(hexIp)
      .map((byte) => Number.parseInt(byte, 16))
      .join('.');
  }

  if (hexIp.length === 32) {
    const groups = [];
    for (let offset = 0; offset < 32; offset += 8) {
      const [b0, b1, b2, b3] = reverseWord(hexIp.slice(offset, offset + 8));
      groups.push(b0 + b1, b2 + b3);
    }
    return groups.join(':');
  }

  return null;
}
244
+
245
+ // ── Classifier ───────────────────────────────────────────────────────────────
246
+
247
+ /**
248
+ * @typedef {'healthy'|'suspect'|'wedged'} WatchdogVerdict
249
+ */
250
+
251
/**
 * Turn a set of probe signals into a watchdog verdict.
 *
 * A signal "fires" when:
 *   - wchan is exactly 'futex_wait_queue'
 *   - closeWaitCount is a number ≥ 3
 *   - cpuPercent is a number < 1
 * Null (or otherwise non-matching) inputs count as not firing.
 *
 * All three firing → 'wedged'; one or two → 'suspect'; none → 'healthy'.
 *
 * @param {{ wchan: string|null, closeWaitCount: number|null, cpuPercent: number|null }} signals
 * @returns {WatchdogVerdict}
 */
export function classify({ wchan, closeWaitCount, cpuPercent }) {
  const fired = [
    wchan === 'futex_wait_queue',
    typeof closeWaitCount === 'number' && closeWaitCount >= 3,
    typeof cpuPercent === 'number' && cpuPercent < 1,
  ].filter(Boolean).length;

  if (fired === 0) return 'healthy';
  return fired === 3 ? 'wedged' : 'suspect';
}