@pleri/olam-cli 0.1.196 → 0.1.198
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -0
- package/dist/ask/knowledge-pack.generated.d.ts.map +1 -1
- package/dist/ask/knowledge-pack.generated.js +10 -8
- package/dist/ask/knowledge-pack.generated.js.map +1 -1
- package/dist/commands/auth-list-json.d.ts +34 -0
- package/dist/commands/auth-list-json.d.ts.map +1 -1
- package/dist/commands/auth-list-json.js +24 -0
- package/dist/commands/auth-list-json.js.map +1 -1
- package/dist/commands/auth-migrate.d.ts +212 -0
- package/dist/commands/auth-migrate.d.ts.map +1 -0
- package/dist/commands/auth-migrate.js +465 -0
- package/dist/commands/auth-migrate.js.map +1 -0
- package/dist/commands/auth.d.ts.map +1 -1
- package/dist/commands/auth.js +239 -184
- package/dist/commands/auth.js.map +1 -1
- package/dist/commands/bootstrap.d.ts +4 -0
- package/dist/commands/bootstrap.d.ts.map +1 -1
- package/dist/commands/bootstrap.js +6 -0
- package/dist/commands/bootstrap.js.map +1 -1
- package/dist/commands/dispatch.d.ts.map +1 -1
- package/dist/commands/dispatch.js +11 -1
- package/dist/commands/dispatch.js.map +1 -1
- package/dist/commands/doctor.d.ts +33 -0
- package/dist/commands/doctor.d.ts.map +1 -1
- package/dist/commands/doctor.js +299 -12
- package/dist/commands/doctor.js.map +1 -1
- package/dist/commands/kg-mirror.d.ts +18 -2
- package/dist/commands/kg-mirror.d.ts.map +1 -1
- package/dist/commands/kg-mirror.js +78 -3
- package/dist/commands/kg-mirror.js.map +1 -1
- package/dist/commands/mcp/complete.d.ts +36 -0
- package/dist/commands/mcp/complete.d.ts.map +1 -0
- package/dist/commands/mcp/complete.js +66 -0
- package/dist/commands/mcp/complete.js.map +1 -0
- package/dist/commands/mcp/index.d.ts +1 -1
- package/dist/commands/mcp/index.d.ts.map +1 -1
- package/dist/commands/mcp/index.js +3 -1
- package/dist/commands/mcp/index.js.map +1 -1
- package/dist/commands/memory/bridge.d.ts +1 -1
- package/dist/commands/memory/bridge.d.ts.map +1 -1
- package/dist/commands/memory/bridge.js +2 -6
- package/dist/commands/memory/bridge.js.map +1 -1
- package/dist/commands/memory/secret.d.ts.map +1 -1
- package/dist/commands/memory/secret.js +4 -3
- package/dist/commands/memory/secret.js.map +1 -1
- package/dist/commands/observe.d.ts +3 -3
- package/dist/commands/observe.d.ts.map +1 -1
- package/dist/commands/observe.js +11 -8
- package/dist/commands/observe.js.map +1 -1
- package/dist/commands/runbooks.d.ts.map +1 -1
- package/dist/commands/runbooks.js +77 -10
- package/dist/commands/runbooks.js.map +1 -1
- package/dist/commands/services-tls.d.ts.map +1 -1
- package/dist/commands/services-tls.js +41 -0
- package/dist/commands/services-tls.js.map +1 -1
- package/dist/commands/services.d.ts +35 -1
- package/dist/commands/services.d.ts.map +1 -1
- package/dist/commands/services.js +153 -32
- package/dist/commands/services.js.map +1 -1
- package/dist/commands/setup-phase-8-kg-hook.d.ts +48 -0
- package/dist/commands/setup-phase-8-kg-hook.d.ts.map +1 -0
- package/dist/commands/setup-phase-8-kg-hook.js +93 -0
- package/dist/commands/setup-phase-8-kg-hook.js.map +1 -0
- package/dist/commands/setup-phase-9-memory-bridge.d.ts +36 -0
- package/dist/commands/setup-phase-9-memory-bridge.d.ts.map +1 -0
- package/dist/commands/setup-phase-9-memory-bridge.js +59 -0
- package/dist/commands/setup-phase-9-memory-bridge.js.map +1 -0
- package/dist/commands/setup.d.ts +34 -1
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +328 -23
- package/dist/commands/setup.js.map +1 -1
- package/dist/commands/update.d.ts +24 -0
- package/dist/commands/update.d.ts.map +1 -1
- package/dist/commands/update.js +53 -0
- package/dist/commands/update.js.map +1 -1
- package/dist/commands/upgrade.d.ts +5 -0
- package/dist/commands/upgrade.d.ts.map +1 -1
- package/dist/commands/upgrade.js +31 -8
- package/dist/commands/upgrade.js.map +1 -1
- package/dist/image-digests.json +8 -8
- package/dist/index.js +4193 -2426
- package/dist/lib/auth-backend.d.ts +168 -0
- package/dist/lib/auth-backend.d.ts.map +1 -0
- package/dist/lib/auth-backend.js +172 -0
- package/dist/lib/auth-backend.js.map +1 -0
- package/dist/lib/auth-list-cache.d.ts +67 -0
- package/dist/lib/auth-list-cache.d.ts.map +1 -0
- package/dist/lib/auth-list-cache.js +84 -0
- package/dist/lib/auth-list-cache.js.map +1 -0
- package/dist/lib/auth-list.d.ts +107 -0
- package/dist/lib/auth-list.d.ts.map +1 -0
- package/dist/lib/auth-list.js +123 -0
- package/dist/lib/auth-list.js.map +1 -0
- package/dist/lib/auth-login.d.ts +92 -0
- package/dist/lib/auth-login.d.ts.map +1 -0
- package/dist/lib/auth-login.js +124 -0
- package/dist/lib/auth-login.js.map +1 -0
- package/dist/lib/auth-mutator-backend.d.ts +54 -0
- package/dist/lib/auth-mutator-backend.d.ts.map +1 -0
- package/dist/lib/auth-mutator-backend.js +62 -0
- package/dist/lib/auth-mutator-backend.js.map +1 -0
- package/dist/lib/auth-remote.d.ts +50 -0
- package/dist/lib/auth-remote.d.ts.map +1 -1
- package/dist/lib/auth-remote.js +84 -2
- package/dist/lib/auth-remote.js.map +1 -1
- package/dist/lib/bootstrap-kubernetes.d.ts +69 -10
- package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
- package/dist/lib/bootstrap-kubernetes.js +264 -46
- package/dist/lib/bootstrap-kubernetes.js.map +1 -1
- package/dist/lib/config.d.ts +7 -0
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/health-probes.d.ts +0 -22
- package/dist/lib/health-probes.d.ts.map +1 -1
- package/dist/lib/health-probes.js +23 -2
- package/dist/lib/health-probes.js.map +1 -1
- package/dist/lib/peripheral-registry.d.ts +11 -0
- package/dist/lib/peripheral-registry.d.ts.map +1 -1
- package/dist/lib/peripheral-registry.js +5 -0
- package/dist/lib/peripheral-registry.js.map +1 -1
- package/dist/lib/plans-client.d.ts.map +1 -1
- package/dist/lib/plans-client.js +6 -3
- package/dist/lib/plans-client.js.map +1 -1
- package/dist/mcp-server.js +14 -3
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/30-configmap.yaml +4 -0
- package/host-cp/k8s/manifests/50-deployment.yaml +13 -1
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/src/dispatch-persister.mjs +157 -0
- package/host-cp/src/pr-nanny.mjs +7 -0
- package/host-cp/src/server.mjs +175 -3
- package/host-cp/src/world-watchdog-pid-lookup.mjs +119 -0
- package/host-cp/src/world-watchdog-probes.mjs +271 -0
- package/host-cp/src/world-watchdog-recovery.mjs +192 -0
- package/host-cp/src/world-watchdog.mjs +313 -0
- package/package.json +1 -1
package/host-cp/src/server.mjs
CHANGED
|
@@ -84,6 +84,11 @@ import {
|
|
|
84
84
|
defaultListContainerNames,
|
|
85
85
|
} from './boot-reconciler.mjs';
|
|
86
86
|
import { startWorldActivityTracker } from './world-activity-tracker.mjs';
|
|
87
|
+
import { startWorldWatchdog } from './world-watchdog.mjs';
|
|
88
|
+
import { createRecovery } from './world-watchdog-recovery.mjs';
|
|
89
|
+
import { createLeakyBucket } from './lib/leaky-bucket.mjs';
|
|
90
|
+
import { read as readLastDispatch, safePersistLastDispatch } from './dispatch-persister.mjs';
|
|
91
|
+
import { findClaudePid } from './world-watchdog-pid-lookup.mjs';
|
|
87
92
|
import { authSecretHint } from './auth-secret-hint.mjs';
|
|
88
93
|
import * as tunnelManager from './world-tunnel-manager.mjs';
|
|
89
94
|
import * as bridgeManager from './port-bridge-manager.mjs';
|
|
@@ -96,6 +101,7 @@ import {
|
|
|
96
101
|
import { instrumentHandler, renderMetrics } from './metrics.mjs';
|
|
97
102
|
import { handleDispatchFromEmail } from './lib/email-dispatch.mjs';
|
|
98
103
|
import { handleDispatchFromLinear } from './lib/linear-dispatch.mjs';
|
|
104
|
+
// (safePersistLastDispatch imported above alongside readLastDispatch)
|
|
99
105
|
import { emitTierSuggestion } from '../dispatch/auto-tier-scheduler.mjs';
|
|
100
106
|
import { isServeOnly, isOrchestrationRoute, ORCHESTRATION_UNAVAILABLE } from './serve-only-config.mjs';
|
|
101
107
|
|
|
@@ -1874,6 +1880,41 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
|
|
|
1874
1880
|
// the isOrchestrationRoute guard — it covers /api/world/, /api/worlds/<id>,
|
|
1875
1881
|
// and /v1/worlds/ for all methods, so no per-route guard is needed here.)
|
|
1876
1882
|
|
|
1883
|
+
// GET /api/world/<id>/socket-health — world watchdog verdict (A5).
|
|
1884
|
+
// Returns the latest in-memory probe result from the world watchdog.
|
|
1885
|
+
// Read-only; never mutates world state.
|
|
1886
|
+
// 200: { worldId, verdict, signals, pid, lastTickAt } — known world + tick fired
|
|
1887
|
+
// 200 verdict='unknown': known world but no tick has fired yet
|
|
1888
|
+
// 404: unknown_world
|
|
1889
|
+
// serve-only: returns 503 orchestration_unavailable (isOrchestrationRoute covers
|
|
1890
|
+
// /api/world/* prefix, so this route is already blocked upstream before reaching here).
|
|
1891
|
+
const socketHealthMatch = /^\/api\/world\/([^/?#]+)\/socket-health\/?$/.exec(url.pathname);
|
|
1892
|
+
if (socketHealthMatch && req.method === 'GET') {
|
|
1893
|
+
const worldId = decodeURIComponent(socketHealthMatch[1]);
|
|
1894
|
+
if (!(worldId in WORLDS)) {
|
|
1895
|
+
return jsonReply(res, 404, { error: 'unknown_world' });
|
|
1896
|
+
}
|
|
1897
|
+
// worldWatchdog is null in serve-only mode (but the serve-only gate above
|
|
1898
|
+
// would have returned 503 before we get here; belt-and-suspenders).
|
|
1899
|
+
const entry = worldWatchdog ? worldWatchdog.getVerdict(worldId) : null;
|
|
1900
|
+
if (!entry) {
|
|
1901
|
+
return jsonReply(res, 200, {
|
|
1902
|
+
worldId,
|
|
1903
|
+
verdict: 'unknown',
|
|
1904
|
+
signals: null,
|
|
1905
|
+
pid: null,
|
|
1906
|
+
lastTickAt: null,
|
|
1907
|
+
});
|
|
1908
|
+
}
|
|
1909
|
+
return jsonReply(res, 200, {
|
|
1910
|
+
worldId,
|
|
1911
|
+
verdict: entry.lastVerdict,
|
|
1912
|
+
signals: entry.lastSignals,
|
|
1913
|
+
pid: entry.lastPid,
|
|
1914
|
+
lastTickAt: entry.lastTickAt,
|
|
1915
|
+
});
|
|
1916
|
+
}
|
|
1917
|
+
|
|
1877
1918
|
// GET /api/world/<id>/progress — phase ladder progress for inbox row.
|
|
1878
1919
|
const progressMatch = /^\/api\/world\/([^/?#]+)\/progress\/?$/.exec(url.pathname);
|
|
1879
1920
|
if (progressMatch && req.method === 'GET') {
|
|
@@ -1892,8 +1933,16 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
|
|
|
1892
1933
|
prStateStore,
|
|
1893
1934
|
getGhToken: resolveGhToken,
|
|
1894
1935
|
});
|
|
1895
|
-
|
|
1896
|
-
|
|
1936
|
+
// C1 — attach socketHealth if watchdog has fired for this world.
|
|
1937
|
+
const verdictEntry = worldWatchdog ? worldWatchdog.getVerdict(worldId) : null;
|
|
1938
|
+
const enriched = verdictEntry
|
|
1939
|
+
? {
|
|
1940
|
+
...data,
|
|
1941
|
+
socketHealth: buildSocketHealthPayload(worldId, verdictEntry),
|
|
1942
|
+
}
|
|
1943
|
+
: data;
|
|
1944
|
+
progressCache.set(worldId, { fetchedAt: Date.now(), data: enriched });
|
|
1945
|
+
return jsonReply(res, 200, enriched);
|
|
1897
1946
|
}
|
|
1898
1947
|
|
|
1899
1948
|
// /api/world/<id>/* → proxy to per-world CP with X-Olam-Secret injected.
|
|
@@ -2686,6 +2735,15 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
|
|
|
2686
2735
|
// env-var enrichments are applied.
|
|
2687
2736
|
const enriched = enrichedObj ? JSON.stringify(enrichedObj) : JSON.stringify(parsed);
|
|
2688
2737
|
|
|
2738
|
+
if (parsed.world_id && parsed.prompt) {
|
|
2739
|
+
safePersistLastDispatch({
|
|
2740
|
+
worldId: parsed.world_id,
|
|
2741
|
+
messageId: parsed.session_id ?? `cloud-dispatch-${Date.now()}`,
|
|
2742
|
+
prompt: parsed.prompt,
|
|
2743
|
+
source: 'cloud-dispatch',
|
|
2744
|
+
});
|
|
2745
|
+
}
|
|
2746
|
+
|
|
2689
2747
|
// Phase H h2: attach CF Access service-token headers when configured
|
|
2690
2748
|
// (machine-to-machine auth). Additive alongside Basic auth. CF Access
|
|
2691
2749
|
// headers are validated at the EDGE of origins behind a CF Access app
|
|
@@ -3674,6 +3732,119 @@ const worldActivityTracker = SERVE_ONLY
|
|
|
3674
3732
|
broadcaster: hostStream,
|
|
3675
3733
|
});
|
|
3676
3734
|
|
|
3735
|
+
// World watchdog — periodic probe of each active world's claude PID for the
|
|
3736
|
+
// three wedge signals (wchan + CLOSE_WAIT + CPU). Emits `world.watchdog.tick`
|
|
3737
|
+
// events on the host-stream broadcaster.
|
|
3738
|
+
//
|
|
3739
|
+
// Phase B: recovery is wired when compute.autoRecover !== false.
|
|
3740
|
+
// Default is false (detection-only, byte-identical to Phase A behaviour).
|
|
3741
|
+
//
|
|
3742
|
+
// SERVE-ONLY: no docker / worlds.db on a managed cluster; null sentinel keeps
|
|
3743
|
+
// the shutdown handler's `worldWatchdog?.stop()` a no-op.
|
|
3744
|
+
//
|
|
3745
|
+
// getClaudePidForWorld resolves the claude PID with `docker top` against the
// world's `olam-<worldId>-devbox` container (findClaudePid). A world whose
// lookup yields null is expected to report verdict='unknown'.
|
|
3747
|
+
|
|
3748
|
+
// ── Recovery setup (Phase B) ─────────────────────────────────────────────────
|
|
3749
|
+
// The auto-recover mode is read from env OLAM_AUTO_RECOVER: 'true' enables
// recovery, 'dry-run' selects dry-run mode, and any other value (including
// unset) means false (D4 — OFF by default).
//
// The compute.autoRecover field lives in .olam/config.yaml (per workspace),
// not in ~/.olam/config.json (global). Host-cp does not load workspace YAML at
// startup, so the env var is the only input consulted here.
|
|
3755
|
+
// Resolve the recovery mode once at module load. Only the exact strings
// 'true' and 'dry-run' opt in; every other value keeps recovery off.
const _watchdogAutoRecoverMode = (() => {
  switch (process.env.OLAM_AUTO_RECOVER) {
    case 'true':
      return true;
    case 'dry-run':
      return 'dry-run';
    default:
      return false;
  }
})();
|
|
3761
|
+
|
|
3762
|
+
const _watchdogLeakyBucket = createLeakyBucket({ capacity: 3, windowMs: 3_600_000 });
|
|
3763
|
+
|
|
3764
|
+
// Recovery handle for the world watchdog; null in SERVE_ONLY mode.
const _watchdogRecovery = SERVE_ONLY
  ? null
  : createRecovery({
      autoRecoverMode: _watchdogAutoRecoverMode,
      leakyBucket: _watchdogLeakyBucket,
      broadcaster: hostStream,
      persister: { read: readLastDispatch },
      // TODO: wire real replay once operator has run the B3 idempotence probe
      // and confirmed dispatch is idempotent for all substrates in use.
      // See docs/architecture/world-watchdog.md Recovery > Idempotence probe.
      // For now: log + emit replay_stub breadcrumb so the stub path is visible.
      replay: async ({ worldId, prompt }) => {
        // The cloud-dispatch path persists any truthy `parsed.prompt`, so the
        // value is not guaranteed to be a string. Coerce before slicing so the
        // stub can never throw while recovery is in progress.
        const promptPreview = String(prompt ?? '').slice(0, 80);
        console.warn(
          `[world-watchdog-recovery] replay stub: worldId=${worldId} prompt="${promptPreview}..." — real replay deferred pending B3 sign-off`,
        );
        // breadcrumb already emitted by createRecovery before calling replay
      },
    });
|
|
3782
|
+
|
|
3783
|
+
// World watchdog instance; null in SERVE_ONLY mode so `worldWatchdog?.stop()`
// in the shutdown handler is a no-op.
const worldWatchdog = SERVE_ONLY
  ? null
  : startWorldWatchdog({
      broadcaster: hostStream,
      recovery: _watchdogRecovery,
      // Lists the ids of every world in worlds.db whose status is neither
      // 'destroyed' nor 'failed'. Any failure (driver missing, db file absent,
      // query error) yields an empty list rather than an exception.
      async listActiveWorlds() {
        let handle = null;
        try {
          const { createRequire } = await import('node:module');
          const loadDriver = createRequire(import.meta.url);
          const Database = loadDriver('better-sqlite3');
          handle = new Database(WORLDS_DB_PATH, { fileMustExist: true });
          const rows = handle
            .prepare("SELECT id FROM worlds WHERE status NOT IN ('destroyed', 'failed')")
            .all();
          return rows.map((row) => row.id).filter((id) => typeof id === 'string');
        } catch {
          return [];
        } finally {
          // Only close a handle that was actually opened; close errors are ignored.
          if (handle) {
            try { handle.close(); } catch { /* ignore */ }
          }
        }
      },
      // The claude PID is looked up in the world's devbox container.
      getClaudePidForWorld: async (worldId) =>
        findClaudePid({ containerId: `olam-${worldId}-devbox` }),
    });
|
|
3819
|
+
|
|
3820
|
+
/**
 * C1 — Turn a world-watchdog state entry into the `socketHealth` object carried
 * by the /progress payload (the shape shared with the SPA TypeScript types).
 *
 * A `recovery` sub-object is appended whenever `_watchdogRecovery` is set.
 * NOTE(review): the gate is the recovery handle, not the OLAM_AUTO_RECOVER
 * value, so `recovery.mode` can be `false` here — confirm this is intended.
 *
 * @param {string} worldId  Key used to peek the per-world leaky-bucket count.
 * @param {import('./world-watchdog.mjs').WorldWatchdogState} entry
 * @returns {object}
 */
function buildSocketHealthPayload(worldId, entry) {
  const socketHealth = {
    verdict: entry.lastVerdict,
    signals: entry.lastSignals,
    pid: entry.lastPid,
    lastTickAt: entry.lastTickAt,
  };

  if (!_watchdogRecovery) {
    return socketHealth;
  }

  const restartsInWindow = _watchdogLeakyBucket
    ? _watchdogLeakyBucket.peek(worldId).totalInWindow
    : 0;

  return {
    ...socketHealth,
    recovery: {
      mode: _watchdogAutoRecoverMode,
      restartsInWindow,
      lastRestartAt: null, // tracking per-world last-restart-at is a future enhancement
    },
  };
}
|
|
3847
|
+
|
|
3677
3848
|
// ── Phase 1a / B1 (PR3): engine-select + await-before-listen ─────
|
|
3678
3849
|
//
|
|
3679
3850
|
// Decision 15: the async KubernetesEngine factory MUST be fully awaited
|
|
@@ -3774,12 +3945,13 @@ for (const sig of ['SIGTERM', 'SIGINT']) {
|
|
|
3774
3945
|
console.log(`received ${sig}, shutting down`);
|
|
3775
3946
|
stopEvents();
|
|
3776
3947
|
prPoller.stop();
|
|
3777
|
-
// worldsDbReconciler + worldActivityTracker are null in SERVE-ONLY mode.
|
|
3948
|
+
// worldsDbReconciler + worldActivityTracker + worldWatchdog are null in SERVE-ONLY mode.
|
|
3778
3949
|
worldsDbReconciler?.stop();
|
|
3779
3950
|
stopWorldsSnapshotLoop();
|
|
3780
3951
|
stopTunnelsSnapshotLoop();
|
|
3781
3952
|
stopListeningSnapshotLoop();
|
|
3782
3953
|
worldActivityTracker?.stop();
|
|
3954
|
+
worldWatchdog?.stop();
|
|
3783
3955
|
if (serversSnapshotTimer) { clearTimeout(serversSnapshotTimer); serversSnapshotTimer = null; }
|
|
3784
3956
|
hostStream.close();
|
|
3785
3957
|
if (ndjsonSpanSink) ndjsonSpanSink.close().catch(() => {});
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* world-watchdog-pid-lookup.mjs — host-visible PID lookup for the world watchdog.
|
|
3
|
+
*
|
|
4
|
+
* Uses `docker top <containerId>` to enumerate processes inside a world's
|
|
5
|
+
* container and returns the host-visible PID of the claude process.
|
|
6
|
+
*
|
|
7
|
+
* `docker top` output format (Linux Docker / Colima):
|
|
8
|
+
* UID PID PPID C STIME TTY TIME CMD
|
|
9
|
+
* root 1234 1 0 10:00 ? 00:00:00 node /usr/local/bin/claude ...
|
|
10
|
+
*
|
|
11
|
+
* The PID column (index 1 in default ps output) is already the host-visible
|
|
12
|
+
* PID. On Mac/Colima the container runs inside a Linux VM so `docker top`
|
|
13
|
+
* returns PIDs within the VM's PID namespace — these are NOT the macOS host
|
|
14
|
+
* PIDs, but they ARE the PIDs visible from within the Linux layer (where
|
|
15
|
+
* /proc reads happen). This is the same namespace the watchdog probes use
|
|
16
|
+
* when reading /proc/<pid>/wchan etc., so the PIDs are correct for probe use.
|
|
17
|
+
*
|
|
18
|
+
* Inject `docker` for tests (avoids spawning real docker processes).
|
|
19
|
+
*
|
|
20
|
+
* @see docs/architecture/world-watchdog.md
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import { execFile } from 'node:child_process';
|
|
24
|
+
import { promisify } from 'node:util';
|
|
25
|
+
|
|
26
|
+
const execFileAsync = promisify(execFile);
|
|
27
|
+
|
|
28
|
+
/**
 * Production docker executor: runs `docker top <containerId>` through the real
 * `docker` CLI, bounded by a 5 second timeout.
 *
 * @param {string} containerId
 * @returns {Promise<string>} stdout from `docker top <containerId>`
 */
async function defaultDockerTop(containerId) {
  const dockerArgs = ['top', containerId];
  const execOptions = { timeout: 5_000 };
  const result = await execFileAsync('docker', dockerArgs, execOptions);
  return result.stdout;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Extract the PIDs of claude processes from `docker top` stdout.
 *
 * Expected layout (ps -ef style), first non-blank line being the header:
 *   UID PID PPID C STIME TTY TIME CMD
 * Column 1 is the PID; everything from column 7 onward is the command. The
 * command tokens are re-joined with single spaces before matching.
 *
 * @param {string} stdout  Raw output from `docker top <id>`
 * @returns {number[]} PIDs of matching claude processes, sorted ascending.
 */
export function parseDockerTopOutput(stdout) {
  // A command counts as claude when a path segment (or the whole first token)
  // is exactly `claude`, or when a node process runs a script named `claude`.
  const looksLikeClaude = (command) =>
    /(?:^|\/)claude(\s|$)/.test(command) ||
    /node[^\s]*\s+.*[/\\]claude(?:\s|$)/.test(command);

  // Drop blank lines, then discard the header row.
  const [, ...rows] = stdout.split('\n').filter((row) => row.trim().length > 0);

  const matched = rows.flatMap((row) => {
    const [, pidToken, , , , , , ...commandTokens] = row.trim().split(/\s+/);
    // Fewer than 8 columns means there is no command to inspect.
    if (commandTokens.length === 0) return [];

    const pid = parseInt(pidToken, 10);
    if (!Number.isFinite(pid) || pid <= 0) return [];

    return looksLikeClaude(commandTokens.join(' ')) ? [pid] : [];
  });

  return matched.sort((left, right) => left - right);
}
|
|
80
|
+
|
|
81
|
+
/**
 * Find the host-visible PID of the claude process running inside a container.
 *
 * Returns the lowest matching PID (parent process heuristic — the supervisor
 * claude process is assumed to have a lower PID than the workers it spawns).
 *
 * Fail-soft:
 *   - missing options / empty containerId       → null (silent)
 *   - docker unreachable / container not found  → null + log
 *   - executor resolves to something not text   → null + log
 *   - no claude process in the container        → null (silent)
 *   - multiple claude processes                 → return the lowest PID
 *
 * @param {{
 *   containerId: string,
 *   dockerTop?: (containerId: string) => Promise<string>,
 *   log?: (msg: string) => void,
 * }} opts
 * @returns {Promise<number | null>}
 */
export async function findClaudePid({
  containerId,
  dockerTop = defaultDockerTop,
  log = (m) => console.log(`[world-watchdog-pid-lookup] ${m}`),
} = {}) {
  if (!containerId) return null;

  let stdout;
  try {
    stdout = await dockerTop(containerId);
  } catch (err) {
    log(`docker top ${containerId} failed: ${err?.message ?? err}`);
    return null;
  }

  // An injected executor may resolve to undefined or a Buffer; the parser only
  // understands strings, so treat anything else as "no answer" instead of
  // letting a TypeError escape.
  if (typeof stdout !== 'string') {
    log(`docker top ${containerId} returned non-text output (${typeof stdout})`);
    return null;
  }

  const pids = parseDockerTopOutput(stdout);
  if (pids.length === 0) return null;

  // The parser returns PIDs in ascending order, so the first entry is the lowest.
  return pids[0];
}
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* world-watchdog-probes.mjs — pure probe functions for the world watchdog.
|
|
3
|
+
*
|
|
4
|
+
* Three readers extract raw signals from the Linux /proc filesystem:
|
|
5
|
+
* - readWchan(pid, opts) → string | null
|
|
6
|
+
* - readCloseWaitSockets(pid, opts) → Array<{remoteIp, remotePort}>
|
|
7
|
+
* - readCpuPercent(pid, windowMs, opts) → number | null
|
|
8
|
+
*
|
|
9
|
+
* One pure classifier turns those signals into a verdict:
|
|
10
|
+
* - classify({ wchan, closeWaitCount, cpuPercent }) → 'healthy'|'suspect'|'wedged'
|
|
11
|
+
*
|
|
12
|
+
* All readers are fail-soft: any I/O error or parse error returns
|
|
13
|
+
* null / [] / 0 rather than throwing. The classifier treats null inputs as
|
|
14
|
+
* the signal not firing (conservative — only promotes to 'wedged' when all
|
|
15
|
+
* three signals are conclusive).
|
|
16
|
+
*
|
|
17
|
+
* Test injection: pass `opts.procRoot` to redirect /proc reads to a fixture
|
|
18
|
+
* directory (e.g. src/__tests__/fixtures/proc-gold-elk-5574/).
|
|
19
|
+
*
|
|
20
|
+
* CLOSE_WAIT threshold note (deviation from D2): Decision D2 specifies
|
|
21
|
+
* filtering CLOSE_WAIT by peer hostname (*.anthropic.com | auth-worker.*).
|
|
22
|
+
* DNS resolution at every tick is unreliable under network stress (exactly
|
|
23
|
+
* when the watchdog must be most accurate). The gold-elk-5574 forensic data
|
|
24
|
+
* shows ≥3 CLOSE_WAIT to ANY peer is already diagnostic — a healthy claude
|
|
25
|
+
* process has 0-1 CLOSE_WAIT sockets under normal operation. The classifier
|
|
26
|
+
* therefore uses count ≥ 3 without hostname filtering. This deviation is
|
|
27
|
+
* documented in docs/architecture/world-watchdog.md Signal 2.
|
|
28
|
+
*
|
|
29
|
+
* @see docs/architecture/world-watchdog.md
|
|
30
|
+
* @see packages/host-cp/src/__tests__/world-watchdog-probes.test.mjs
|
|
31
|
+
*/
|
|
32
|
+
|
|
33
|
+
import fs from 'node:fs/promises';
|
|
34
|
+
import path from 'node:path';
|
|
35
|
+
|
|
36
|
+
// Clock ticks per second, used to convert the utime/stime counters read from
// /proc/<pid>/stat into seconds of CPU time. Those counters are reported in
// units of sysconf(_SC_CLK_TCK) (USER_HZ), which is 100 on mainstream Linux
// builds regardless of the kernel's internal timer frequency (CONFIG_HZ).
// NOTE(review): the value is hard-coded; on a platform with a different
// _SC_CLK_TCK the CPU percentage would be skewed proportionally.
|
|
41
|
+
const LINUX_HZ = 100;
|
|
42
|
+
|
|
43
|
+
// /proc/net/tcp state byte for CLOSE_WAIT.
|
|
44
|
+
const CLOSE_WAIT_STATE = '08';
|
|
45
|
+
|
|
46
|
+
/**
 * Fetch the wait channel recorded in `<procRoot>/<pid>/wchan`.
 *
 * @param {number|string} pid  Process ID.
 * @param {{ procRoot?: string }} [opts]
 *   `procRoot` defaults to '/proc'; override for tests.
 * @returns {Promise<string|null>}
 *   The trimmed wchan text (e.g. 'futex_wait_queue', 'epoll_wait'), or null
 *   when the file cannot be read or contains only whitespace.
 */
export async function readWchan(pid, opts = {}) {
  const root = opts.procRoot ?? '/proc';
  const target = path.join(root, String(pid), 'wchan');

  let raw;
  try {
    raw = await fs.readFile(target, 'utf8');
  } catch {
    // Fail-soft: an unreadable wchan file is reported as "no signal".
    return null;
  }

  const channel = raw.trim();
  return channel === '' ? null : channel;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Collect CLOSE_WAIT entries from `<procRoot>/<pid>/net/tcp` and `.../tcp6`.
 *
 * Each table is whitespace-separated with a header row; column 2 is the remote
 * address as `HEXIP:HEXPORT` and column 3 is the hex state ('08' = CLOSE_WAIT).
 * Entries are returned regardless of remote peer.
 *
 * NOTE(review): /proc/<pid>/net/* presumably reflects the whole network
 * namespace the pid lives in rather than only that pid's own sockets —
 * confirm this is acceptable for the classifier.
 *
 * @param {number|string} pid  Process ID.
 * @param {{ procRoot?: string }} [opts]
 * @returns {Promise<Array<{remoteIp: string, remotePort: number}>>}
 *   CLOSE_WAIT socket descriptors; empty when nothing matches or nothing is readable.
 */
export async function readCloseWaitSockets(pid, opts = {}) {
  const root = opts.procRoot ?? '/proc';
  const sockets = [];

  for (const table of ['tcp', 'tcp6']) {
    const tablePath = path.join(root, String(pid), 'net', table);

    let text;
    try {
      text = await fs.readFile(tablePath, 'utf8');
    } catch {
      // Unreadable table (pid gone, protocol unavailable): move on to the next one.
      continue;
    }

    // The first line is the column header.
    const rows = text.split('\n').slice(1);
    for (const rawRow of rows) {
      const row = rawRow.trim();
      if (row === '') continue;

      const columns = row.split(/\s+/);
      // Columns (0-based): 0 = sl, 1 = local_address, 2 = rem_address, 3 = st.
      if (columns.length < 4 || columns[3] !== CLOSE_WAIT_STATE) continue;

      const remote = columns[2];
      const separator = remote.lastIndexOf(':');
      if (separator === -1) continue;

      const remoteIp = parseHexIp(remote.slice(0, separator));
      const remotePort = parseInt(remote.slice(separator + 1), 16);
      if (remoteIp === null || !Number.isFinite(remotePort)) continue;

      sockets.push({ remoteIp, remotePort });
    }
  }

  return sockets;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Measure CPU utilisation for a process over a time window.
 *
 * Reads /proc/<pid>/stat twice (before + after sleeping `windowMs` ms) and computes:
 *   cpuPercent = (utime+stime delta) / (HZ * effectiveWindowMs / 1000) * 100
 *
 * `effectiveWindowMs` is the elapsed time measured with `now` when that exceeds
 * `windowMs` (for example when the sleep overruns), otherwise `windowMs`
 * itself. An injected instant `sleep` therefore still yields a figure based on
 * the nominal window.
 *
 * @param {number|string} pid  Process ID.
 * @param {number} windowMs    Measurement window in milliseconds.
 * @param {{ procRoot?: string, sleep?: (ms: number) => Promise<void>, now?: () => number }} [opts]
 *   `sleep` — injectable delay function (default: real setTimeout).
 *   `now`   — injectable clock in milliseconds (default: Date.now).
 *   `procRoot` — injectable proc root for tests.
 * @returns {Promise<number|null>}
 *   CPU percent (0–100+) or null on read/parse error, a negative tick delta,
 *   or a window that is not a positive number.
 */
export async function readCpuPercent(pid, windowMs, opts = {}) {
  const procRoot = opts.procRoot ?? '/proc';
  const sleep = opts.sleep ?? ((ms) => new Promise((r) => setTimeout(r, ms)));
  const now = opts.now ?? Date.now;
  const statPath = path.join(procRoot, String(pid), 'stat');

  const before = await readStatTimes(statPath);
  if (before === null) return null;
  const startedAt = now();

  await sleep(windowMs);

  const after = await readStatTimes(statPath);
  if (after === null) return null;
  const elapsedMs = now() - startedAt;

  const deltaTicks = (after.utime + after.stime) - (before.utime + before.stime);
  if (deltaTicks < 0) return null;

  // Rejects zero, negative and NaN windows alike (NaN fails every comparison).
  if (!(windowMs > 0)) return null;

  // Use the longer of nominal and measured time: dividing by the nominal window
  // after an overrun would overstate the CPU share.
  const effectiveMs = Number.isFinite(elapsedMs) && elapsedMs > windowMs ? elapsedMs : windowMs;

  // deltaTicks / HZ = CPU-seconds consumed; effectiveMs / 1000 = window in seconds.
  return (deltaTicks / LINUX_HZ / (effectiveMs / 1000)) * 100;
}
|
|
164
|
+
|
|
165
|
+
// ── Internal helpers ──────────────────────────────────────────────────────────
|
|
166
|
+
|
|
167
|
+
/**
 * Read a `/proc/<pid>/stat`-style file and extract its utime and stime
 * tick counters.
 *
 * Any failure — unreadable file, no ')' in the content, too few fields,
 * non-numeric counters — is reported as null rather than thrown.
 *
 * @param {string} statPath  Path of the stat file to read.
 * @returns {Promise<{utime: number, stime: number}|null>}
 */
async function readStatTimes(statPath) {
  // Positions counted from the first field after the closing ')':
  //   0 state, 1 ppid, 2 pgrp, 3 session, 4 tty_nr, 5 tpgid, 6 flags,
  //   7 minflt, 8 cminflt, 9 majflt, 10 cmajflt, 11 utime, 12 stime
  const UTIME_INDEX = 11;
  const STIME_INDEX = 12;

  let statText;
  try {
    statText = await fs.readFile(statPath, 'utf8');
  } catch {
    return null;
  }

  // Layout is `pid (comm) state ppid ...`. comm may itself contain spaces
  // and parentheses, so anchor on the LAST ')' rather than the first.
  const commEnd = statText.lastIndexOf(')');
  if (commEnd < 0) return null;

  const tail = statText.slice(commEnd + 1).trim().split(/\s+/);
  if (tail.length <= STIME_INDEX) return null;

  const [utime, stime] = [tail[UTIME_INDEX], tail[STIME_INDEX]].map((field) => parseInt(field, 10));

  return Number.isFinite(utime) && Number.isFinite(stime) ? { utime, stime } : null;
}
|
|
203
|
+
|
|
204
|
+
/**
 * Parse a hex-encoded IP address from /proc/net/tcp format.
 *
 * IPv4: 8 hex chars holding one 32-bit word with its bytes reversed
 *       (e.g. "0100007F" → "127.0.0.1").
 * IPv6: 32 hex chars = four such 8-char words; each word is byte-reversed
 *       and emitted as two 4-digit groups, giving eight ':'-separated groups
 *       (e.g. "00000000000000000000000001000000" →
 *       "0000:0000:0000:0000:0000:0000:0000:0001"). Groups are not
 *       zero-compressed and keep the letter case of the input.
 *
 * @param {string} hexIp
 * @returns {string|null}  Dotted/colon notation, or null when the input is
 *   not a string, contains non-hex characters, or is not 8 or 32 chars long.
 */
function parseHexIp(hexIp) {
  // Validate the whole string once. parseInt alone is too lenient ("7G"
  // parses as 7), and the IPv6 branch never parses the digits at all, so
  // without this check garbage would be echoed back as an address.
  if (typeof hexIp !== 'string' || !/^[0-9a-fA-F]+$/.test(hexIp)) return null;

  // Split one 8-char word into its four byte strings, last byte first.
  const reversedBytes = (word) => [
    word.slice(6, 8),
    word.slice(4, 6),
    word.slice(2, 4),
    word.slice(0, 2),
  ];

  if (hexIp.length === 8) {
    return reversedBytes(hexIp)
      .map((byte) => Number.parseInt(byte, 16))
      .join('.');
  }

  if (hexIp.length === 32) {
    const groups = [];
    for (let offset = 0; offset < 32; offset += 8) {
      const [b0, b1, b2, b3] = reversedBytes(hexIp.slice(offset, offset + 8));
      // Pair bytes into 16-bit groups for IPv6 notation.
      groups.push(b0 + b1, b2 + b3);
    }
    return groups.join(':');
  }

  return null;
}
|
|
244
|
+
|
|
245
|
+
// ── Classifier ───────────────────────────────────────────────────────────────
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* @typedef {'healthy'|'suspect'|'wedged'} WatchdogVerdict
|
|
249
|
+
*/
|
|
250
|
+
|
|
251
|
+
/**
 * Turn the three watchdog probe signals into a single verdict.
 *
 * A signal "fires" when:
 *   - wchan is exactly 'futex_wait_queue',
 *   - closeWaitCount is a number ≥ 3,
 *   - cpuPercent is a number < 1.
 *
 * All three firing → 'wedged'; one or two → 'suspect'; none → 'healthy'.
 * A null (or otherwise non-matching) input simply does not fire, so missing
 * probes can never push the verdict up to 'wedged'.
 *
 * @param {{ wchan: string|null, closeWaitCount: number|null, cpuPercent: number|null }} signals
 * @returns {WatchdogVerdict}
 */
export function classify({ wchan, closeWaitCount, cpuPercent }) {
  const firing = [
    wchan === 'futex_wait_queue',
    typeof closeWaitCount === 'number' && closeWaitCount >= 3,
    typeof cpuPercent === 'number' && cpuPercent < 1,
  ];
  const firedCount = firing.filter(Boolean).length;

  if (firedCount === firing.length) return 'wedged';
  return firedCount === 0 ? 'healthy' : 'suspect';
}
|