@pleri/olam-cli 0.1.196 → 0.1.199
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -0
- package/dist/ask/knowledge-pack.generated.d.ts.map +1 -1
- package/dist/ask/knowledge-pack.generated.js +10 -8
- package/dist/ask/knowledge-pack.generated.js.map +1 -1
- package/dist/commands/auth-list-json.d.ts +34 -0
- package/dist/commands/auth-list-json.d.ts.map +1 -1
- package/dist/commands/auth-list-json.js +24 -0
- package/dist/commands/auth-list-json.js.map +1 -1
- package/dist/commands/auth-migrate.d.ts +212 -0
- package/dist/commands/auth-migrate.d.ts.map +1 -0
- package/dist/commands/auth-migrate.js +465 -0
- package/dist/commands/auth-migrate.js.map +1 -0
- package/dist/commands/auth.d.ts.map +1 -1
- package/dist/commands/auth.js +239 -184
- package/dist/commands/auth.js.map +1 -1
- package/dist/commands/bootstrap.d.ts +4 -0
- package/dist/commands/bootstrap.d.ts.map +1 -1
- package/dist/commands/bootstrap.js +10 -0
- package/dist/commands/bootstrap.js.map +1 -1
- package/dist/commands/dispatch.d.ts.map +1 -1
- package/dist/commands/dispatch.js +11 -1
- package/dist/commands/dispatch.js.map +1 -1
- package/dist/commands/doctor.d.ts +33 -0
- package/dist/commands/doctor.d.ts.map +1 -1
- package/dist/commands/doctor.js +299 -12
- package/dist/commands/doctor.js.map +1 -1
- package/dist/commands/kg-mirror.d.ts +18 -2
- package/dist/commands/kg-mirror.d.ts.map +1 -1
- package/dist/commands/kg-mirror.js +78 -3
- package/dist/commands/kg-mirror.js.map +1 -1
- package/dist/commands/mcp/complete.d.ts +36 -0
- package/dist/commands/mcp/complete.d.ts.map +1 -0
- package/dist/commands/mcp/complete.js +66 -0
- package/dist/commands/mcp/complete.js.map +1 -0
- package/dist/commands/mcp/index.d.ts +1 -1
- package/dist/commands/mcp/index.d.ts.map +1 -1
- package/dist/commands/mcp/index.js +3 -1
- package/dist/commands/mcp/index.js.map +1 -1
- package/dist/commands/memory/bridge.d.ts +1 -1
- package/dist/commands/memory/bridge.d.ts.map +1 -1
- package/dist/commands/memory/bridge.js +2 -6
- package/dist/commands/memory/bridge.js.map +1 -1
- package/dist/commands/memory/secret.d.ts.map +1 -1
- package/dist/commands/memory/secret.js +4 -3
- package/dist/commands/memory/secret.js.map +1 -1
- package/dist/commands/observe.d.ts +3 -3
- package/dist/commands/observe.d.ts.map +1 -1
- package/dist/commands/observe.js +11 -8
- package/dist/commands/observe.js.map +1 -1
- package/dist/commands/runbooks.d.ts.map +1 -1
- package/dist/commands/runbooks.js +77 -10
- package/dist/commands/runbooks.js.map +1 -1
- package/dist/commands/services-tls.d.ts.map +1 -1
- package/dist/commands/services-tls.js +41 -0
- package/dist/commands/services-tls.js.map +1 -1
- package/dist/commands/services.d.ts +45 -3
- package/dist/commands/services.d.ts.map +1 -1
- package/dist/commands/services.js +198 -71
- package/dist/commands/services.js.map +1 -1
- package/dist/commands/setup-phase-8-kg-hook.d.ts +48 -0
- package/dist/commands/setup-phase-8-kg-hook.d.ts.map +1 -0
- package/dist/commands/setup-phase-8-kg-hook.js +93 -0
- package/dist/commands/setup-phase-8-kg-hook.js.map +1 -0
- package/dist/commands/setup-phase-9-memory-bridge.d.ts +36 -0
- package/dist/commands/setup-phase-9-memory-bridge.d.ts.map +1 -0
- package/dist/commands/setup-phase-9-memory-bridge.js +59 -0
- package/dist/commands/setup-phase-9-memory-bridge.js.map +1 -0
- package/dist/commands/setup.d.ts +34 -1
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +328 -23
- package/dist/commands/setup.js.map +1 -1
- package/dist/commands/update.d.ts +24 -0
- package/dist/commands/update.d.ts.map +1 -1
- package/dist/commands/update.js +53 -0
- package/dist/commands/update.js.map +1 -1
- package/dist/commands/upgrade.d.ts +5 -0
- package/dist/commands/upgrade.d.ts.map +1 -1
- package/dist/commands/upgrade.js +31 -8
- package/dist/commands/upgrade.js.map +1 -1
- package/dist/image-digests.json +8 -8
- package/dist/index.js +4302 -2466
- package/dist/lib/auth-backend.d.ts +168 -0
- package/dist/lib/auth-backend.d.ts.map +1 -0
- package/dist/lib/auth-backend.js +172 -0
- package/dist/lib/auth-backend.js.map +1 -0
- package/dist/lib/auth-list-cache.d.ts +67 -0
- package/dist/lib/auth-list-cache.d.ts.map +1 -0
- package/dist/lib/auth-list-cache.js +84 -0
- package/dist/lib/auth-list-cache.js.map +1 -0
- package/dist/lib/auth-list.d.ts +107 -0
- package/dist/lib/auth-list.d.ts.map +1 -0
- package/dist/lib/auth-list.js +123 -0
- package/dist/lib/auth-list.js.map +1 -0
- package/dist/lib/auth-login.d.ts +92 -0
- package/dist/lib/auth-login.d.ts.map +1 -0
- package/dist/lib/auth-login.js +124 -0
- package/dist/lib/auth-login.js.map +1 -0
- package/dist/lib/auth-mutator-backend.d.ts +54 -0
- package/dist/lib/auth-mutator-backend.d.ts.map +1 -0
- package/dist/lib/auth-mutator-backend.js +62 -0
- package/dist/lib/auth-mutator-backend.js.map +1 -0
- package/dist/lib/auth-remote.d.ts +50 -0
- package/dist/lib/auth-remote.d.ts.map +1 -1
- package/dist/lib/auth-remote.js +84 -2
- package/dist/lib/auth-remote.js.map +1 -1
- package/dist/lib/bootstrap-kubernetes.d.ts +93 -12
- package/dist/lib/bootstrap-kubernetes.d.ts.map +1 -1
- package/dist/lib/bootstrap-kubernetes.js +364 -53
- package/dist/lib/bootstrap-kubernetes.js.map +1 -1
- package/dist/lib/config.d.ts +7 -0
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/health-probes.d.ts +0 -22
- package/dist/lib/health-probes.d.ts.map +1 -1
- package/dist/lib/health-probes.js +23 -2
- package/dist/lib/health-probes.js.map +1 -1
- package/dist/lib/peripheral-registry.d.ts +11 -0
- package/dist/lib/peripheral-registry.d.ts.map +1 -1
- package/dist/lib/peripheral-registry.js +5 -0
- package/dist/lib/peripheral-registry.js.map +1 -1
- package/dist/lib/plans-client.d.ts.map +1 -1
- package/dist/lib/plans-client.js +6 -3
- package/dist/lib/plans-client.js.map +1 -1
- package/dist/mcp-server.js +14 -3
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/30-configmap.yaml +4 -0
- package/host-cp/k8s/manifests/50-deployment.yaml +13 -1
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/src/dispatch-persister.mjs +157 -0
- package/host-cp/src/pr-nanny.mjs +7 -0
- package/host-cp/src/server.mjs +175 -3
- package/host-cp/src/world-watchdog-pid-lookup.mjs +119 -0
- package/host-cp/src/world-watchdog-probes.mjs +271 -0
- package/host-cp/src/world-watchdog-recovery.mjs +192 -0
- package/host-cp/src/world-watchdog.mjs +313 -0
- package/package.json +1 -1
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* dispatch-persister.mjs — persist the last dispatch for each world.
|
|
3
|
+
*
|
|
4
|
+
* The world watchdog's recovery hook reads this to replay the last
|
|
5
|
+
* unanswered prompt when it auto-recovers a wedged claude process.
|
|
6
|
+
*
|
|
7
|
+
* Contract:
|
|
8
|
+
* persist({ worldId, messageId, prompt, source, statePath?, now? })
|
|
9
|
+
* Atomically writes ~/.olam/worlds/<worldId>/state/last-dispatch.json.
|
|
10
|
+
* Overwrites any previous file — only the LATEST dispatch matters for
|
|
11
|
+
* replay. Atomic write (tmp + fs.rename) prevents partial-write residue
|
|
12
|
+
* from corrupting recovery reads.
|
|
13
|
+
*
|
|
14
|
+
* read({ worldId, statePath? })
|
|
15
|
+
* Returns { messageId, prompt, dispatchedAt, source } or null.
|
|
16
|
+
* null on ENOENT (no dispatch persisted yet) — never throws.
|
|
17
|
+
* null on JSON parse error (logs + skips) — never throws on corrupt file.
|
|
18
|
+
*
|
|
19
|
+
* Multiple worlds are independent: world A and world B have separate files.
|
|
20
|
+
* Multiple concurrent persist() calls for the SAME world are safe — each
|
|
21
|
+
* write is a rename of a tmp file so the worst case is one write winning.
|
|
22
|
+
*
|
|
23
|
+
* @see docs/architecture/world-watchdog.md
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
import fs from 'node:fs/promises';
|
|
27
|
+
import path from 'node:path';
|
|
28
|
+
import os from 'node:os';
|
|
29
|
+
|
|
30
|
+
// Default base path under which per-world state directories live.
const DEFAULT_STATE_BASE = path.join(os.homedir(), '.olam', 'worlds');

/**
 * Derive the path to last-dispatch.json for a world.
 *
 * The worldId is used as a single directory name under `stateBase`. Because
 * some call-sites forward IDs taken from request payloads, the ID is validated
 * to be exactly one path segment: empty strings, `.`/`..`, and anything
 * containing a path separator or NUL byte are rejected so the resulting path
 * can never escape `stateBase`.
 *
 * @param {string} worldId
 * @param {string} [stateBase]  Override the base directory (for tests).
 * @returns {string}
 * @throws {TypeError} When worldId is not a non-empty, single-segment string.
 */
export function lastDispatchPath(worldId, stateBase = DEFAULT_STATE_BASE) {
  const isSingleSegment =
    typeof worldId === 'string' &&
    worldId !== '' &&
    worldId !== '.' &&
    worldId !== '..' &&
    !/[/\\\0]/.test(worldId);
  if (!isSingleSegment) {
    throw new TypeError(`invalid worldId for last-dispatch path: ${String(worldId)}`);
  }
  return path.join(stateBase, worldId, 'state', 'last-dispatch.json');
}
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* Persist the last dispatch for a world.
|
|
46
|
+
*
|
|
47
|
+
* @param {{
|
|
48
|
+
* worldId: string,
|
|
49
|
+
* messageId: string,
|
|
50
|
+
* prompt: string,
|
|
51
|
+
* source: string,
|
|
52
|
+
* statePath?: string,
|
|
53
|
+
* now?: () => number,
|
|
54
|
+
* }} opts
|
|
55
|
+
* @returns {Promise<void>}
|
|
56
|
+
*/
|
|
57
|
+
export async function persist({
|
|
58
|
+
worldId,
|
|
59
|
+
messageId,
|
|
60
|
+
prompt,
|
|
61
|
+
source,
|
|
62
|
+
statePath,
|
|
63
|
+
now = () => Date.now(),
|
|
64
|
+
}) {
|
|
65
|
+
const filePath = statePath ?? lastDispatchPath(worldId);
|
|
66
|
+
const dir = path.dirname(filePath);
|
|
67
|
+
const tmpPath = `${filePath}.tmp`;
|
|
68
|
+
|
|
69
|
+
const record = {
|
|
70
|
+
messageId,
|
|
71
|
+
prompt,
|
|
72
|
+
dispatchedAt: new Date(now()).toISOString(),
|
|
73
|
+
source,
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
// Ensure the directory exists.
|
|
77
|
+
await fs.mkdir(dir, { recursive: true });
|
|
78
|
+
|
|
79
|
+
// Atomic write: write to .tmp then rename over the target.
|
|
80
|
+
await fs.writeFile(tmpPath, JSON.stringify(record, null, 2) + '\n', 'utf8');
|
|
81
|
+
await fs.rename(tmpPath, filePath);
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
/**
 * Fire-and-forget wrapper around persist() for the dispatch call-sites.
 *
 * Keeps the "start the write, log on failure, don't await" boilerplate in one
 * place so the enrichment sites (pr-nanny + /api/cloud-dispatch) stay in sync.
 * A rejected persist() is reported with console.warn under the `logSource`
 * tag (defaulting to `source`) and is otherwise ignored.
 *
 * @param {{
 *   worldId: string,
 *   messageId: string,
 *   prompt: string,
 *   source: string,
 *   logSource?: string,
 *   statePath?: string,
 *   now?: () => number,
 * }} opts
 * @returns {void}
 */
export function safePersistLastDispatch(opts) {
  // logSource is only a logging concern; everything else is forwarded as-is.
  const { logSource, ...forwarded } = opts;
  const tag = logSource === undefined ? opts.source : logSource;

  const reportFailure = (err) => {
    console.warn(
      `[${tag}] persistLastDispatch failed (non-fatal): ${err?.message ?? err}`,
    );
  };

  void persist(forwarded).catch(reportFailure);
}
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* Read the last persisted dispatch for a world.
|
|
113
|
+
*
|
|
114
|
+
* @param {{
|
|
115
|
+
* worldId: string,
|
|
116
|
+
* statePath?: string,
|
|
117
|
+
* }} opts
|
|
118
|
+
* @returns {Promise<{ messageId: string, prompt: string, dispatchedAt: string, source: string } | null>}
|
|
119
|
+
*/
|
|
120
|
+
export async function read({ worldId, statePath }) {
|
|
121
|
+
const filePath = statePath ?? lastDispatchPath(worldId);
|
|
122
|
+
|
|
123
|
+
let raw;
|
|
124
|
+
try {
|
|
125
|
+
raw = await fs.readFile(filePath, 'utf8');
|
|
126
|
+
} catch (err) {
|
|
127
|
+
if (err?.code === 'ENOENT') return null;
|
|
128
|
+
// Other I/O errors (e.g. permissions) — log + return null (fail-soft).
|
|
129
|
+
console.error(`[dispatch-persister] readFile ${filePath}: ${err?.message ?? err}`);
|
|
130
|
+
return null;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
try {
|
|
134
|
+
const parsed = JSON.parse(raw);
|
|
135
|
+
// Basic shape validation — don't throw on corrupt file.
|
|
136
|
+
if (
|
|
137
|
+
typeof parsed !== 'object' ||
|
|
138
|
+
parsed === null ||
|
|
139
|
+
typeof parsed.messageId !== 'string' ||
|
|
140
|
+
typeof parsed.prompt !== 'string' ||
|
|
141
|
+
typeof parsed.dispatchedAt !== 'string' ||
|
|
142
|
+
typeof parsed.source !== 'string'
|
|
143
|
+
) {
|
|
144
|
+
console.error(`[dispatch-persister] ${filePath}: unexpected shape, skipping`);
|
|
145
|
+
return null;
|
|
146
|
+
}
|
|
147
|
+
return {
|
|
148
|
+
messageId: parsed.messageId,
|
|
149
|
+
prompt: parsed.prompt,
|
|
150
|
+
dispatchedAt: parsed.dispatchedAt,
|
|
151
|
+
source: parsed.source,
|
|
152
|
+
};
|
|
153
|
+
} catch (err) {
|
|
154
|
+
console.error(`[dispatch-persister] ${filePath}: JSON parse error: ${err?.message ?? err}`);
|
|
155
|
+
return null;
|
|
156
|
+
}
|
|
157
|
+
}
|
package/host-cp/src/pr-nanny.mjs
CHANGED
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
import { execFile } from 'node:child_process';
|
|
25
25
|
import { promisify } from 'node:util';
|
|
26
26
|
import { pickNextTier } from './dispatch/tier-escalator.mjs';
|
|
27
|
+
import { safePersistLastDispatch } from './dispatch-persister.mjs';
|
|
27
28
|
|
|
28
29
|
const execFileAsync = promisify(execFile);
|
|
29
30
|
|
|
@@ -251,6 +252,12 @@ export function createPrNanny({
|
|
|
251
252
|
|
|
252
253
|
// Dispatch fix
|
|
253
254
|
try {
|
|
255
|
+
safePersistLastDispatch({
|
|
256
|
+
worldId,
|
|
257
|
+
messageId: `nanny-${worldId}-${Date.now()}`,
|
|
258
|
+
prompt,
|
|
259
|
+
source: 'pr-nanny',
|
|
260
|
+
});
|
|
254
261
|
await dispatchToWorld(worldId, prompt, { tier: tierForThisDispatch });
|
|
255
262
|
const now = new Date().toISOString();
|
|
256
263
|
prStateStore.set(worldId, {
|
package/host-cp/src/server.mjs
CHANGED
|
@@ -84,6 +84,11 @@ import {
|
|
|
84
84
|
defaultListContainerNames,
|
|
85
85
|
} from './boot-reconciler.mjs';
|
|
86
86
|
import { startWorldActivityTracker } from './world-activity-tracker.mjs';
|
|
87
|
+
import { startWorldWatchdog } from './world-watchdog.mjs';
|
|
88
|
+
import { createRecovery } from './world-watchdog-recovery.mjs';
|
|
89
|
+
import { createLeakyBucket } from './lib/leaky-bucket.mjs';
|
|
90
|
+
import { read as readLastDispatch, safePersistLastDispatch } from './dispatch-persister.mjs';
|
|
91
|
+
import { findClaudePid } from './world-watchdog-pid-lookup.mjs';
|
|
87
92
|
import { authSecretHint } from './auth-secret-hint.mjs';
|
|
88
93
|
import * as tunnelManager from './world-tunnel-manager.mjs';
|
|
89
94
|
import * as bridgeManager from './port-bridge-manager.mjs';
|
|
@@ -96,6 +101,7 @@ import {
|
|
|
96
101
|
import { instrumentHandler, renderMetrics } from './metrics.mjs';
|
|
97
102
|
import { handleDispatchFromEmail } from './lib/email-dispatch.mjs';
|
|
98
103
|
import { handleDispatchFromLinear } from './lib/linear-dispatch.mjs';
|
|
104
|
+
// (safePersistLastDispatch imported above alongside readLastDispatch)
|
|
99
105
|
import { emitTierSuggestion } from '../dispatch/auto-tier-scheduler.mjs';
|
|
100
106
|
import { isServeOnly, isOrchestrationRoute, ORCHESTRATION_UNAVAILABLE } from './serve-only-config.mjs';
|
|
101
107
|
|
|
@@ -1874,6 +1880,41 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
|
|
|
1874
1880
|
// the isOrchestrationRoute guard — it covers /api/world/, /api/worlds/<id>,
|
|
1875
1881
|
// and /v1/worlds/ for all methods, so no per-route guard is needed here.)
|
|
1876
1882
|
|
|
1883
|
+
// GET /api/world/<id>/socket-health — world watchdog verdict (A5).
|
|
1884
|
+
// Returns the latest in-memory probe result from the world watchdog.
|
|
1885
|
+
// Read-only; never mutates world state.
|
|
1886
|
+
// 200: { worldId, verdict, signals, pid, lastTickAt } — known world + tick fired
|
|
1887
|
+
// 200 verdict='unknown': known world but no tick has fired yet
|
|
1888
|
+
// 404: unknown_world
|
|
1889
|
+
// serve-only: returns 503 orchestration_unavailable (isOrchestrationRoute covers
|
|
1890
|
+
// /api/world/* prefix, so this route is already blocked upstream before reaching here).
|
|
1891
|
+
const socketHealthMatch = /^\/api\/world\/([^/?#]+)\/socket-health\/?$/.exec(url.pathname);
|
|
1892
|
+
if (socketHealthMatch && req.method === 'GET') {
|
|
1893
|
+
const worldId = decodeURIComponent(socketHealthMatch[1]);
|
|
1894
|
+
if (!(worldId in WORLDS)) {
|
|
1895
|
+
return jsonReply(res, 404, { error: 'unknown_world' });
|
|
1896
|
+
}
|
|
1897
|
+
// worldWatchdog is null in serve-only mode (but the serve-only gate above
|
|
1898
|
+
// would have returned 503 before we get here; belt-and-suspenders).
|
|
1899
|
+
const entry = worldWatchdog ? worldWatchdog.getVerdict(worldId) : null;
|
|
1900
|
+
if (!entry) {
|
|
1901
|
+
return jsonReply(res, 200, {
|
|
1902
|
+
worldId,
|
|
1903
|
+
verdict: 'unknown',
|
|
1904
|
+
signals: null,
|
|
1905
|
+
pid: null,
|
|
1906
|
+
lastTickAt: null,
|
|
1907
|
+
});
|
|
1908
|
+
}
|
|
1909
|
+
return jsonReply(res, 200, {
|
|
1910
|
+
worldId,
|
|
1911
|
+
verdict: entry.lastVerdict,
|
|
1912
|
+
signals: entry.lastSignals,
|
|
1913
|
+
pid: entry.lastPid,
|
|
1914
|
+
lastTickAt: entry.lastTickAt,
|
|
1915
|
+
});
|
|
1916
|
+
}
|
|
1917
|
+
|
|
1877
1918
|
// GET /api/world/<id>/progress — phase ladder progress for inbox row.
|
|
1878
1919
|
const progressMatch = /^\/api\/world\/([^/?#]+)\/progress\/?$/.exec(url.pathname);
|
|
1879
1920
|
if (progressMatch && req.method === 'GET') {
|
|
@@ -1892,8 +1933,16 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
|
|
|
1892
1933
|
prStateStore,
|
|
1893
1934
|
getGhToken: resolveGhToken,
|
|
1894
1935
|
});
|
|
1895
|
-
|
|
1896
|
-
|
|
1936
|
+
// C1 — attach socketHealth if watchdog has fired for this world.
|
|
1937
|
+
const verdictEntry = worldWatchdog ? worldWatchdog.getVerdict(worldId) : null;
|
|
1938
|
+
const enriched = verdictEntry
|
|
1939
|
+
? {
|
|
1940
|
+
...data,
|
|
1941
|
+
socketHealth: buildSocketHealthPayload(worldId, verdictEntry),
|
|
1942
|
+
}
|
|
1943
|
+
: data;
|
|
1944
|
+
progressCache.set(worldId, { fetchedAt: Date.now(), data: enriched });
|
|
1945
|
+
return jsonReply(res, 200, enriched);
|
|
1897
1946
|
}
|
|
1898
1947
|
|
|
1899
1948
|
// /api/world/<id>/* → proxy to per-world CP with X-Olam-Secret injected.
|
|
@@ -2686,6 +2735,15 @@ const server = http.createServer(instrumentHandler('host-cp', async (req, res) =
|
|
|
2686
2735
|
// env-var enrichments are applied.
|
|
2687
2736
|
const enriched = enrichedObj ? JSON.stringify(enrichedObj) : JSON.stringify(parsed);
|
|
2688
2737
|
|
|
2738
|
+
if (parsed.world_id && parsed.prompt) {
|
|
2739
|
+
safePersistLastDispatch({
|
|
2740
|
+
worldId: parsed.world_id,
|
|
2741
|
+
messageId: parsed.session_id ?? `cloud-dispatch-${Date.now()}`,
|
|
2742
|
+
prompt: parsed.prompt,
|
|
2743
|
+
source: 'cloud-dispatch',
|
|
2744
|
+
});
|
|
2745
|
+
}
|
|
2746
|
+
|
|
2689
2747
|
// Phase H h2: attach CF Access service-token headers when configured
|
|
2690
2748
|
// (machine-to-machine auth). Additive alongside Basic auth. CF Access
|
|
2691
2749
|
// headers are validated at the EDGE of origins behind a CF Access app
|
|
@@ -3674,6 +3732,119 @@ const worldActivityTracker = SERVE_ONLY
|
|
|
3674
3732
|
broadcaster: hostStream,
|
|
3675
3733
|
});
|
|
3676
3734
|
|
|
3735
|
+
// World watchdog — periodic probe of each active world's claude PID for the
|
|
3736
|
+
// three wedge signals (wchan + CLOSE_WAIT + CPU). Emits `world.watchdog.tick`
|
|
3737
|
+
// events on the host-stream broadcaster.
|
|
3738
|
+
//
|
|
3739
|
+
// Phase B: recovery is wired when compute.autoRecover !== false.
|
|
3740
|
+
// Default is false (detection-only, byte-identical to Phase A behaviour).
|
|
3741
|
+
//
|
|
3742
|
+
// SERVE-ONLY: no docker / worlds.db on a managed cluster; null sentinel keeps
|
|
3743
|
+
// the shutdown handler's `worldWatchdog?.stop()` a no-op.
|
|
3744
|
+
//
|
|
3745
|
+
// getClaudePidForWorld resolves the PID via findClaudePid (`docker top` on the
|
|
3746
|
+
// world's `olam-<worldId>-devbox` container). When that lookup returns null the world emits verdict='unknown'.
|
|
3747
|
+
|
|
3748
|
+
// ── Recovery setup (Phase B) ─────────────────────────────────────────────────
|
|
3749
|
+
// Load autoRecover from env OLAM_AUTO_RECOVER (or false by default — D4).
|
|
3750
|
+
// Falls back to false if config unavailable or field absent.
|
|
3751
|
+
|
|
3752
|
+
// The compute.autoRecover field lives in .olam/config.yaml (per workspace),
|
|
3753
|
+
// not in ~/.olam/config.json (global). Host-cp does not load workspace YAML at
|
|
3754
|
+
// startup. Read from env OLAM_AUTO_RECOVER; default false (D4 — OFF by default).
|
|
3755
|
+
const _watchdogAutoRecoverMode = (() => {
|
|
3756
|
+
const envVal = process.env.OLAM_AUTO_RECOVER;
|
|
3757
|
+
if (envVal === 'true') return true;
|
|
3758
|
+
if (envVal === 'dry-run') return 'dry-run';
|
|
3759
|
+
return false;
|
|
3760
|
+
})();
|
|
3761
|
+
|
|
3762
|
+
const _watchdogLeakyBucket = createLeakyBucket({ capacity: 3, windowMs: 3_600_000 });
|
|
3763
|
+
|
|
3764
|
+
const _watchdogRecovery = SERVE_ONLY
|
|
3765
|
+
? null
|
|
3766
|
+
: createRecovery({
|
|
3767
|
+
autoRecoverMode: _watchdogAutoRecoverMode,
|
|
3768
|
+
leakyBucket: _watchdogLeakyBucket,
|
|
3769
|
+
broadcaster: hostStream,
|
|
3770
|
+
persister: { read: readLastDispatch },
|
|
3771
|
+
// TODO: wire real replay once operator has run the B3 idempotence probe
|
|
3772
|
+
// and confirmed dispatch is idempotent for all substrates in use.
|
|
3773
|
+
// See docs/architecture/world-watchdog.md Recovery > Idempotence probe.
|
|
3774
|
+
// For now: log + emit replay_stub breadcrumb so the stub path is visible.
|
|
3775
|
+
replay: async ({ worldId, prompt }) => {
|
|
3776
|
+
console.warn(
|
|
3777
|
+
`[world-watchdog-recovery] replay stub: worldId=${worldId} prompt="${prompt.slice(0, 80)}..." — real replay deferred pending B3 sign-off`,
|
|
3778
|
+
);
|
|
3779
|
+
// breadcrumb already emitted by createRecovery before calling replay
|
|
3780
|
+
},
|
|
3781
|
+
});
|
|
3782
|
+
|
|
3783
|
+
const worldWatchdog = SERVE_ONLY
|
|
3784
|
+
? null
|
|
3785
|
+
: startWorldWatchdog({
|
|
3786
|
+
broadcaster: hostStream,
|
|
3787
|
+
recovery: _watchdogRecovery,
|
|
3788
|
+
listActiveWorlds: async () => {
|
|
3789
|
+
// Reuse the same worlds.db query as worldActivityTracker: return all
|
|
3790
|
+
// non-destroyed/failed world IDs for probing.
|
|
3791
|
+
let Database;
|
|
3792
|
+
try {
|
|
3793
|
+
const { createRequire } = await import('node:module');
|
|
3794
|
+
const req = createRequire(import.meta.url);
|
|
3795
|
+
Database = req('better-sqlite3');
|
|
3796
|
+
} catch {
|
|
3797
|
+
return [];
|
|
3798
|
+
}
|
|
3799
|
+
let db;
|
|
3800
|
+
try {
|
|
3801
|
+
db = new Database(WORLDS_DB_PATH, { fileMustExist: true });
|
|
3802
|
+
} catch {
|
|
3803
|
+
return [];
|
|
3804
|
+
}
|
|
3805
|
+
try {
|
|
3806
|
+
const rows = db
|
|
3807
|
+
.prepare("SELECT id FROM worlds WHERE status NOT IN ('destroyed', 'failed')")
|
|
3808
|
+
.all();
|
|
3809
|
+
return rows.map((r) => r.id).filter((id) => typeof id === 'string');
|
|
3810
|
+
} catch {
|
|
3811
|
+
return [];
|
|
3812
|
+
} finally {
|
|
3813
|
+
try { db.close(); } catch { /* ignore */ }
|
|
3814
|
+
}
|
|
3815
|
+
},
|
|
3816
|
+
getClaudePidForWorld: async (worldId) =>
|
|
3817
|
+
findClaudePid({ containerId: `olam-${worldId}-devbox` }),
|
|
3818
|
+
});
|
|
3819
|
+
|
|
3820
|
+
/**
 * C1 — Build the `socketHealth` sub-object from a watchdog state entry.
 *
 * Copies the entry's last verdict, signals, PID and tick timestamp. When a
 * recovery controller exists (`_watchdogRecovery` is truthy) a `recovery`
 * sub-object is added with the configured mode and the per-world restart
 * count peeked from the leaky bucket.
 *
 * NOTE(review): the earlier inline comment said recovery is attached only
 * when OLAM_AUTO_RECOVER is non-false, but the code gates on
 * `_watchdogRecovery` — confirm what createRecovery returns in that mode.
 *
 * @param {string} worldId  Used to peek the per-world leaky-bucket count.
 * @param {import('./world-watchdog.mjs').WorldWatchdogState} entry
 * @returns {object}
 */
function buildSocketHealthPayload(worldId, entry) {
  const { lastVerdict: verdict, lastSignals: signals, lastPid: pid, lastTickAt } = entry;
  const base = { verdict, signals, pid, lastTickAt };

  if (!_watchdogRecovery) {
    return base;
  }

  let restartsInWindow = 0;
  if (_watchdogLeakyBucket) {
    restartsInWindow = _watchdogLeakyBucket.peek(worldId).totalInWindow;
  }

  return {
    ...base,
    recovery: {
      mode: _watchdogAutoRecoverMode,
      restartsInWindow,
      lastRestartAt: null, // tracking per-world last-restart-at is a future enhancement
    },
  };
}
|
|
3847
|
+
|
|
3677
3848
|
// ── Phase 1a / B1 (PR3): engine-select + await-before-listen ─────
|
|
3678
3849
|
//
|
|
3679
3850
|
// Decision 15: the async KubernetesEngine factory MUST be fully awaited
|
|
@@ -3774,12 +3945,13 @@ for (const sig of ['SIGTERM', 'SIGINT']) {
|
|
|
3774
3945
|
console.log(`received ${sig}, shutting down`);
|
|
3775
3946
|
stopEvents();
|
|
3776
3947
|
prPoller.stop();
|
|
3777
|
-
// worldsDbReconciler + worldActivityTracker are null in SERVE-ONLY mode.
|
|
3948
|
+
// worldsDbReconciler + worldActivityTracker + worldWatchdog are null in SERVE-ONLY mode.
|
|
3778
3949
|
worldsDbReconciler?.stop();
|
|
3779
3950
|
stopWorldsSnapshotLoop();
|
|
3780
3951
|
stopTunnelsSnapshotLoop();
|
|
3781
3952
|
stopListeningSnapshotLoop();
|
|
3782
3953
|
worldActivityTracker?.stop();
|
|
3954
|
+
worldWatchdog?.stop();
|
|
3783
3955
|
if (serversSnapshotTimer) { clearTimeout(serversSnapshotTimer); serversSnapshotTimer = null; }
|
|
3784
3956
|
hostStream.close();
|
|
3785
3957
|
if (ndjsonSpanSink) ndjsonSpanSink.close().catch(() => {});
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* world-watchdog-pid-lookup.mjs — host-visible PID lookup for the world watchdog.
|
|
3
|
+
*
|
|
4
|
+
* Uses `docker top <containerId>` to enumerate processes inside a world's
|
|
5
|
+
* container and returns the host-visible PID of the claude process.
|
|
6
|
+
*
|
|
7
|
+
* `docker top` output format (Linux Docker / Colima):
|
|
8
|
+
* UID PID PPID C STIME TTY TIME CMD
|
|
9
|
+
* root 1234 1 0 10:00 ? 00:00:00 node /usr/local/bin/claude ...
|
|
10
|
+
*
|
|
11
|
+
* The PID column (index 1 in default ps output) is already the host-visible
|
|
12
|
+
* PID. On Mac/Colima the container runs inside a Linux VM so `docker top`
|
|
13
|
+
* returns PIDs within the VM's PID namespace — these are NOT the macOS host
|
|
14
|
+
* PIDs, but they ARE the PIDs visible from within the Linux layer (where
|
|
15
|
+
* /proc reads happen). This is the same namespace the watchdog probes use
|
|
16
|
+
* when reading /proc/<pid>/wchan etc., so the PIDs are correct for probe use.
|
|
17
|
+
*
|
|
18
|
+
* Inject `docker` for tests (avoids spawning real docker processes).
|
|
19
|
+
*
|
|
20
|
+
* @see docs/architecture/world-watchdog.md
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
import { execFile } from 'node:child_process';
|
|
24
|
+
import { promisify } from 'node:util';
|
|
25
|
+
|
|
26
|
+
// Promise-returning execFile, used to run the docker CLI without a shell.
const execFileAsync = promisify(execFile);

/**
 * Default docker executor — runs the real `docker top <containerId>` and
 * hands back whatever it printed on stdout. The call is given a 5 second
 * timeout; any failure surfaces as a rejected promise.
 *
 * @param {string} containerId
 * @returns {Promise<string>} stdout from `docker top <containerId>`
 */
async function defaultDockerTop(containerId) {
  const args = ['top', containerId];
  const result = await execFileAsync('docker', args, { timeout: 5_000 });
  return result.stdout;
}
|
|
40
|
+
|
|
41
|
+
/**
 * Extract the PIDs of claude processes from `docker top` stdout.
 *
 * Expected layout (ps -ef style), one process per line after a header:
 *   UID  PID  PPID  C  STIME  TTY  TIME  CMD
 * Field 1 is the PID; fields 7 and onward form the command line. Rows with
 * fewer than 8 whitespace-separated fields or a non-positive / non-numeric
 * PID are ignored.
 *
 * A row counts as claude when its command line contains `claude` as a whole
 * path component (at the start or right after a `/`), or is a `node…`
 * invocation whose arguments include a path ending in `claude`.
 *
 * @param {string} stdout  Raw output from `docker top <id>`
 * @returns {number[]}  PIDs of matching claude processes, sorted ascending.
 */
export function parseDockerTopOutput(stdout) {
  const rows = stdout.split('\n').filter((row) => row.trim().length > 0);
  // Nothing to parse when only the header (or nothing at all) is present.
  if (rows.length < 2) return [];

  // Returns the PID for a claude row, or null for anything else.
  const claudePidOf = (row) => {
    // Fields are whitespace-separated; the command may itself contain spaces,
    // so everything from field 7 on is glued back together with single spaces.
    const fields = row.trim().split(/\s+/);
    if (fields.length < 8) return null;

    const pid = parseInt(fields[1], 10);
    if (!Number.isFinite(pid) || pid <= 0) return null;

    const command = fields.slice(7).join(' ');
    const isStandaloneClaude = /(?:^|\/)claude(\s|$)/.test(command);
    const isNodeRunningClaude = /node[^\s]*\s+.*[/\\]claude(?:\s|$)/.test(command);
    return isStandaloneClaude || isNodeRunningClaude ? pid : null;
  };

  // rows[0] is the column header.
  return rows
    .slice(1)
    .map(claudePidOf)
    .filter((pid) => pid !== null)
    .sort((a, b) => a - b);
}
|
|
80
|
+
|
|
81
|
+
/**
 * Look up the PID of the claude process running inside a container.
 *
 * Runs `dockerTop(containerId)`, parses the listing, and returns the lowest
 * matching PID (heuristic: the supervising claude process is assumed to have
 * a lower PID than any workers it spawned).
 *
 * Fail-soft:
 *   - empty/missing containerId                → null (silent)
 *   - dockerTop rejects (docker unreachable,
 *     container not found, …)                  → null + log
 *   - no claude process in the listing         → null (silent)
 *   - several claude processes                 → the lowest PID
 *
 * @param {{
 *   containerId: string,
 *   dockerTop?: (containerId: string) => Promise<string>,
 *   log?: (msg: string) => void,
 * }} opts
 * @returns {Promise<number | null>}
 */
export async function findClaudePid({
  containerId,
  dockerTop = defaultDockerTop,
  log = (m) => console.log(`[world-watchdog-pid-lookup] ${m}`),
}) {
  if (!containerId) {
    return null;
  }

  let listing;
  try {
    listing = await dockerTop(containerId);
  } catch (err) {
    log(`docker top ${containerId} failed: ${err?.message ?? err}`);
    return null;
  }

  // The parser returns PIDs sorted ascending, so the first one is the lowest.
  const [lowestPid = null] = parseDockerTopOutput(listing);
  return lowestPid;
}
|