jishushell 0.0.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +202 -0
- package/README.md +36 -0
- package/THIRD-PARTY-NOTICES +387 -0
- package/dist/auth.d.ts +6 -0
- package/dist/auth.js +88 -0
- package/dist/auth.js.map +1 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +290 -0
- package/dist/cli.js.map +1 -0
- package/dist/config.d.ts +24 -0
- package/dist/config.js +226 -0
- package/dist/config.js.map +1 -0
- package/dist/constants.d.ts +3 -0
- package/dist/constants.js +15 -0
- package/dist/constants.js.map +1 -0
- package/dist/control.d.ts +44 -0
- package/dist/control.js +1359 -0
- package/dist/control.js.map +1 -0
- package/dist/crypto-shim.d.ts +1 -0
- package/dist/crypto-shim.js +2 -0
- package/dist/crypto-shim.js.map +1 -0
- package/dist/doctor.d.ts +46 -0
- package/dist/doctor.js +937 -0
- package/dist/doctor.js.map +1 -0
- package/dist/install.d.ts +27 -0
- package/dist/install.js +570 -0
- package/dist/install.js.map +1 -0
- package/dist/routes/auth.d.ts +4 -0
- package/dist/routes/auth.js +151 -0
- package/dist/routes/auth.js.map +1 -0
- package/dist/routes/instances.d.ts +2 -0
- package/dist/routes/instances.js +1303 -0
- package/dist/routes/instances.js.map +1 -0
- package/dist/routes/setup.d.ts +2 -0
- package/dist/routes/setup.js +139 -0
- package/dist/routes/setup.js.map +1 -0
- package/dist/routes/system.d.ts +2 -0
- package/dist/routes/system.js +102 -0
- package/dist/routes/system.js.map +1 -0
- package/dist/server.d.ts +6 -0
- package/dist/server.js +392 -0
- package/dist/server.js.map +1 -0
- package/dist/services/instance-manager.d.ts +67 -0
- package/dist/services/instance-manager.js +1319 -0
- package/dist/services/instance-manager.js.map +1 -0
- package/dist/services/llm-proxy/adapters.d.ts +3 -0
- package/dist/services/llm-proxy/adapters.js +309 -0
- package/dist/services/llm-proxy/adapters.js.map +1 -0
- package/dist/services/llm-proxy/circuit-breaker.d.ts +9 -0
- package/dist/services/llm-proxy/circuit-breaker.js +73 -0
- package/dist/services/llm-proxy/circuit-breaker.js.map +1 -0
- package/dist/services/llm-proxy/encryption.d.ts +6 -0
- package/dist/services/llm-proxy/encryption.js +61 -0
- package/dist/services/llm-proxy/encryption.js.map +1 -0
- package/dist/services/llm-proxy/index.d.ts +24 -0
- package/dist/services/llm-proxy/index.js +708 -0
- package/dist/services/llm-proxy/index.js.map +1 -0
- package/dist/services/llm-proxy/rate-limiter.d.ts +1 -0
- package/dist/services/llm-proxy/rate-limiter.js +39 -0
- package/dist/services/llm-proxy/rate-limiter.js.map +1 -0
- package/dist/services/llm-proxy/sse.d.ts +10 -0
- package/dist/services/llm-proxy/sse.js +378 -0
- package/dist/services/llm-proxy/sse.js.map +1 -0
- package/dist/services/llm-proxy/ssrf.d.ts +16 -0
- package/dist/services/llm-proxy/ssrf.js +185 -0
- package/dist/services/llm-proxy/ssrf.js.map +1 -0
- package/dist/services/llm-proxy/types.d.ts +52 -0
- package/dist/services/llm-proxy/types.js +2 -0
- package/dist/services/llm-proxy/types.js.map +1 -0
- package/dist/services/llm-proxy/usage.d.ts +12 -0
- package/dist/services/llm-proxy/usage.js +108 -0
- package/dist/services/llm-proxy/usage.js.map +1 -0
- package/dist/services/nomad-manager.d.ts +22 -0
- package/dist/services/nomad-manager.js +828 -0
- package/dist/services/nomad-manager.js.map +1 -0
- package/dist/services/plugin-installer.d.ts +22 -0
- package/dist/services/plugin-installer.js +102 -0
- package/dist/services/plugin-installer.js.map +1 -0
- package/dist/services/process-manager.d.ts +25 -0
- package/dist/services/process-manager.js +531 -0
- package/dist/services/process-manager.js.map +1 -0
- package/dist/services/setup-manager.d.ts +93 -0
- package/dist/services/setup-manager.js +1922 -0
- package/dist/services/setup-manager.js.map +1 -0
- package/dist/services/system-monitor.d.ts +1 -0
- package/dist/services/system-monitor.js +79 -0
- package/dist/services/system-monitor.js.map +1 -0
- package/dist/services/telemetry/activation.d.ts +12 -0
- package/dist/services/telemetry/activation.js +75 -0
- package/dist/services/telemetry/activation.js.map +1 -0
- package/dist/services/telemetry/client.d.ts +21 -0
- package/dist/services/telemetry/client.js +47 -0
- package/dist/services/telemetry/client.js.map +1 -0
- package/dist/services/telemetry/device-fingerprint.d.ts +18 -0
- package/dist/services/telemetry/device-fingerprint.js +123 -0
- package/dist/services/telemetry/device-fingerprint.js.map +1 -0
- package/dist/services/telemetry/heartbeat.d.ts +13 -0
- package/dist/services/telemetry/heartbeat.js +81 -0
- package/dist/services/telemetry/heartbeat.js.map +1 -0
- package/dist/services/telemetry/index.d.ts +3 -0
- package/dist/services/telemetry/index.js +4 -0
- package/dist/services/telemetry/index.js.map +1 -0
- package/dist/types.d.ts +51 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/safe-json.d.ts +2 -0
- package/dist/utils/safe-json.js +80 -0
- package/dist/utils/safe-json.js.map +1 -0
- package/dist/utils/ttl-cache.d.ts +29 -0
- package/dist/utils/ttl-cache.js +77 -0
- package/dist/utils/ttl-cache.js.map +1 -0
- package/install/jishu-install.sh +2920 -0
- package/install/jishu-uninstall.sh +811 -0
- package/install/post-install.sh +110 -0
- package/install/post-uninstall.sh +46 -0
- package/package.json +57 -8
- package/public/assets/Dashboard-CAOQDYDR.js +1 -0
- package/public/assets/InitPassword-CkehIkJG.js +1 -0
- package/public/assets/InstanceDetail-CzW2S95J.js +14 -0
- package/public/assets/Login-RkjzTNWg.js +1 -0
- package/public/assets/NewInstance-DdbErdjA.js +1 -0
- package/public/assets/Settings-BUD7zwv9.js +1 -0
- package/public/assets/Setup-RRTIERGG.js +1 -0
- package/public/assets/index-77Ug7feY.css +1 -0
- package/public/assets/index-DfRnVUQR.js +16 -0
- package/public/assets/providers-lBSOjUWy.js +1 -0
- package/public/assets/usePolling-CqQ8hrNc.js +1 -0
- package/public/assets/vendor-i18n-Bvxxh8Di.js +9 -0
- package/public/assets/vendor-react-DONn7uBV.js +59 -0
- package/public/index.html +15 -0
- package/scripts/build-image.sh +55 -0
- package/scripts/run.sh +310 -0
- package/scripts/setup-pi.sh +80 -0
- package/scripts/start-feishu1.js +46 -0
- package/index.js +0 -0
- package/jishushell-0.0.1.tgz +0 -0
|
@@ -0,0 +1,828 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Nomad-based service manager for OpenClaw instances.
|
|
3
|
+
* Communicates with Nomad via its HTTP API.
|
|
4
|
+
*/
|
|
5
|
+
import { execFile as execFileCb, execFileSync } from "child_process";
|
|
6
|
+
import { chmodSync, existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from "fs";
|
|
7
|
+
import { homedir, platform, userInfo } from "os";
|
|
8
|
+
import { dirname } from "path";
|
|
9
|
+
import { join } from "path";
|
|
10
|
+
import { promisify } from "util";
|
|
11
|
+
import { getNomadAddr, getNomadDriver, getNomadToken, getOpenclawDockerImage, isOfficialImage, JISHUSHELL_HOME } from "../config.js";
|
|
12
|
+
import { TtlCache } from "../utils/ttl-cache.js";
|
|
13
|
+
import { findInstancesSharingGatewayPort, findInstancesSharingOpenclawHome, getGatewayPort, getInstanceRuntime, getOpenclawConfigPath, getOpenclawHome, getRuntimeEnv } from "./instance-manager.js";
|
|
14
|
+
import { getLegacyStatus, stopInstance as stopLegacyInstance } from "./process-manager.js";
|
|
15
|
+
// Docker image names must match this pattern to prevent command injection.
|
|
16
|
+
export const DOCKER_IMAGE_RE = /^[a-zA-Z0-9][a-zA-Z0-9\-_.:/@]*$/;
|
|
17
|
+
// Maximum allowed length for a Docker image reference.
|
|
18
|
+
export const MAX_DOCKER_IMAGE_NAME_LEN = 256;
|
|
19
|
+
const JOB_PREFIX = "openclaw-";
|
|
20
|
+
// Port the panel listens on, kept so bridge-mode containers can reach the panel
// via host.docker.internal. Starts at 8090 until setPanelPort() is called.
let _panelPort = 8090;
/**
 * Record the port the panel is listening on.
 * @param {number} port - Stored as-is; no validation happens here.
 */
export function setPanelPort(port) {
    _panelPort = port;
}
|
|
23
|
+
/**
 * Rewrite loopback jsproxy URLs in an OpenClaw config file for Docker bridge mode.
 *
 * Inside a bridge-mode container 127.0.0.1 is the container's own loopback, not
 * the host. Every `http://127.0.0.1:<port>/proxy` occurrence in the file is
 * therefore replaced with `http://host.docker.internal:<port>/proxy`. The file
 * is rewritten only when the text actually changed. Read/write failures are
 * logged as warnings and never thrown.
 *
 * @param {string} configPath - Path of the config file to patch in place.
 */
function patchJsproxyBaseUrl(configPath) {
    const loopbackProxyUrl = /http:\/\/127\.0\.0\.1:(\d+)\/proxy/g;
    try {
        const before = readFileSync(configPath, "utf-8");
        // $1 carries the original port over to the rewritten URL.
        const after = before.replace(loopbackProxyUrl, "http://host.docker.internal:$1/proxy");
        if (after === before) {
            return;
        }
        writeFileSync(configPath, after, "utf-8");
        console.log(`[nomad] Patched jsproxy baseUrl in ${configPath} (127.0.0.1 → host.docker.internal)`);
    }
    catch (err) {
        console.warn(`[nomad] Failed to patch jsproxy baseUrl in ${configPath}: ${err.message}`);
    }
}
|
|
42
|
+
/**
 * Persist `gateway.bind = "lan"` in an OpenClaw config file when the bind mode
 * is unset, empty, or "loopback".
 *
 * Docker bridge port publishing cannot reach a process that only binds the
 * container loopback, so default/loopback binds are normalized to `lan` to let
 * Nomad's published host port reach the gateway. Any other explicit bind value
 * is left untouched.
 *
 * The file is re-serialized with 2-space indentation (keeping a trailing
 * newline if the original had one) and written only when the output differs.
 * A top-level value that is not a plain object is ignored. Parse and I/O
 * failures are logged as warnings and never thrown.
 *
 * @param {string} configPath - Path of the JSON config file to patch in place.
 */
function patchDockerBridgeGatewayBind(configPath) {
    const isPlainObject = (value) => Boolean(value) && typeof value === "object" && !Array.isArray(value);
    try {
        const raw = readFileSync(configPath, "utf-8");
        const config = JSON.parse(raw);
        if (!isPlainObject(config)) {
            return;
        }
        // A missing or malformed gateway section is replaced by a fresh object.
        if (!isPlainObject(config.gateway)) {
            config.gateway = {};
        }
        const gateway = config.gateway;
        const currentBind = typeof gateway.bind === "string" ? gateway.bind.trim() : "";
        const needsLanBind = currentBind === "" || currentBind === "loopback";
        if (!needsLanBind) {
            return;
        }
        gateway.bind = "lan";
        let serialized = JSON.stringify(config, null, 2);
        if (raw.endsWith("\n")) {
            serialized += "\n";
        }
        if (serialized === raw) {
            return;
        }
        writeFileSync(configPath, serialized, "utf-8");
        console.log(`[nomad] Normalized gateway.bind to "lan" in ${configPath} for Docker bridge networking`);
    }
    catch (err) {
        console.warn(`[nomad] Failed to patch gateway.bind in ${configPath}: ${err.message}`);
    }
}
|
|
76
|
+
const DEFAULT_COMMAND = "/usr/bin/openclaw";
|
|
77
|
+
const DEFAULT_PIDS_LIMIT = 512;
|
|
78
|
+
export const VALID_LOG_TYPES = new Set(["stdout", "stderr"]);
|
|
79
|
+
const memoryOversubscriptionCache = new TtlCache(30_000);
|
|
80
|
+
/**
 * Build the authentication headers for Nomad HTTP API requests.
 * @returns {Object} `{ "X-Nomad-Token": token }` when getNomadToken() returns
 *   a truthy value, otherwise an empty object.
 */
function nomadAuthHeaders() {
    const headers = {};
    const token = getNomadToken();
    if (token) {
        headers["X-Nomad-Token"] = token;
    }
    return headers;
}
|
|
84
|
+
// Gateway arguments used when an instance runtime does not provide its own args array.
const DEFAULT_ARGS = ["gateway", "run", "--port", "18789", "--allow-unconfigured"];
// User name and working directory taken from the OS account running this process.
const DEFAULT_USER = userInfo().username;
const DEFAULT_CWD = homedir();
// Base environment for instances: per-user tool directories under the home
// directory come first on PATH, followed by the system binary directories.
const DEFAULT_ENV = (() => {
    const home = homedir();
    const userBinDirs = [
        ".local/bin",
        ".npm-global/bin",
        "bin",
        ".volta/bin",
        ".asdf/shims",
        ".bun/bin",
        ".nvm/current/bin",
        ".fnm/current/bin",
        ".local/share/pnpm",
    ].map((relative) => `${home}/${relative}`);
    const systemBinDirs = ["/usr/local/bin", "/usr/bin", "/bin"];
    return {
        HOME: home,
        TMPDIR: "/tmp",
        PATH: [...userBinDirs, ...systemBinDirs].join(":"),
    };
})();
// Resource request used when the instance runtime does not override it.
const DEFAULT_RESOURCES = { CPU: 500, MemoryMB: 512 };
|
|
95
|
+
// Hard upper bounds applied before submitting any Nomad job. Prevents a
|
|
96
|
+
// misconfigured or malicious instance config from exhausting scheduler
|
|
97
|
+
// resources on the host (no Nomad Enterprise Resource Quotas in OSS).
|
|
98
|
+
const MAX_CPU_MHZ = 4000; // 4 GHz — sane ceiling for a single task
|
|
99
|
+
const MAX_MEMORY_MB = 4096; // 4 GB reservation
|
|
100
|
+
const MAX_MEMORY_MAX_MB = 4096; // 4 GB hard limit (memory_max)
|
|
101
|
+
/**
 * Derive the Nomad job ID for an instance by prefixing it with JOB_PREFIX.
 * @param {string} instanceId - Instance identifier (converted to a string).
 * @returns {string} The prefixed job ID.
 */
function jobId(instanceId) {
    return JOB_PREFIX.concat(instanceId);
}
|
|
104
|
+
// Characters that must never appear in a value interpolated into a Nomad
// EmbeddedTmpl: braces, double quotes and backslashes. This is defense-in-depth
// (instanceId is already validated by the route layer) so that the
// template-building code stays self-contained.
export const NOMAD_TEMPLATE_UNSAFE_RE = /[{}"\\]/;
/**
 * Reject IDs that contain Nomad Template metacharacters.
 * @param {string} id - Job ID about to be interpolated into a template.
 * @throws {Error} When `id` matches NOMAD_TEMPLATE_UNSAFE_RE.
 */
function assertSafeTemplateId(id) {
    const isSafe = !NOMAD_TEMPLATE_UNSAFE_RE.test(id);
    if (isSafe) {
        return;
    }
    throw new Error(`Job ID "${id}" contains characters unsafe for Nomad Template interpolation`);
}
|
|
113
|
+
/**
 * Issue a GET request against the Nomad HTTP API with a 10 s timeout.
 *
 * A 404 is treated as a normal outcome and returned to the caller; any other
 * non-2xx status raises.
 *
 * @param {string} path - API path appended to getNomadAddr().
 * @returns {Promise<Response>} The response (ok, or status 404).
 * @throws {Error} `Nomad <path>: <status>` for non-ok statuses other than 404;
 *   fetch/timeout errors propagate unchanged.
 */
async function nomadGet(path) {
    const url = `${getNomadAddr()}${path}`;
    const response = await fetch(url, {
        headers: nomadAuthHeaders(),
        signal: AbortSignal.timeout(10000),
    });
    const acceptable = response.ok || response.status === 404;
    if (!acceptable) {
        throw new Error(`Nomad ${path}: ${response.status}`);
    }
    return response;
}
|
|
122
|
+
/**
 * Shared implementation of nomadPost and nomadPut: send `body` as JSON to the
 * Nomad HTTP API with a 10 s timeout.
 * @param {string} method - HTTP method ("POST" or "PUT").
 * @param {string} path - API path appended to getNomadAddr().
 * @param {*} body - Value serialized with JSON.stringify as the request body.
 * @returns {Promise<Response>} The raw fetch response; status is NOT checked.
 */
function nomadSendJson(method, path, body) {
    return fetch(`${getNomadAddr()}${path}`, {
        method,
        headers: { "Content-Type": "application/json", ...nomadAuthHeaders() },
        body: JSON.stringify(body),
        signal: AbortSignal.timeout(10000),
    });
}
/**
 * POST a JSON body to the Nomad HTTP API.
 * @param {string} path - API path appended to getNomadAddr().
 * @param {*} body - JSON-serializable payload.
 * @returns {Promise<Response>} The raw response; callers must inspect `ok`/`status`.
 */
async function nomadPost(path, body) {
    return nomadSendJson("POST", path, body);
}
/**
 * Send a DELETE request (no body) to the Nomad HTTP API with a 10 s timeout.
 * @param {string} path - API path appended to getNomadAddr().
 * @returns {Promise<Response>} The raw response; callers must inspect `ok`/`status`.
 */
async function nomadDelete(path) {
    return fetch(`${getNomadAddr()}${path}`, {
        method: "DELETE",
        headers: nomadAuthHeaders(),
        signal: AbortSignal.timeout(10000),
    });
}
/**
 * PUT a JSON body to the Nomad HTTP API.
 * @param {string} path - API path appended to getNomadAddr().
 * @param {*} body - JSON-serializable payload.
 * @returns {Promise<Response>} The raw response; callers must inspect `ok`/`status`.
 */
async function nomadPut(path, body) {
    return nomadSendJson("PUT", path, body);
}
|
|
145
|
+
/**
 * Report whether Nomad's scheduler has memory oversubscription enabled.
 *
 * A truthy value in memoryOversubscriptionCache is returned directly.
 * Otherwise the scheduler configuration endpoint is queried; "enabled" or
 * "disabled" is cached and returned. Any failure (non-ok response, network
 * error, bad JSON) yields "unknown", which is not cached.
 *
 * @returns {Promise<"enabled"|"disabled"|"unknown">}
 */
async function getMemoryOversubscriptionState() {
    const hit = memoryOversubscriptionCache.peek();
    if (hit) {
        return hit;
    }
    let state = "unknown";
    try {
        const resp = await nomadGet("/v1/operator/scheduler/configuration");
        if (resp.ok) {
            const payload = await resp.json();
            const enabled = payload?.SchedulerConfig?.MemoryOversubscriptionEnabled === true;
            state = enabled ? "enabled" : "disabled";
            memoryOversubscriptionCache.set(state);
        }
    }
    catch {
        state = "unknown";
    }
    return state;
}
|
|
162
|
+
// ── Nomad Variables (secrets) ──
/**
 * Store the instance's proxy token (JSPROXY_API_KEY from its runtime env) as a
 * Nomad Variable at `nomad/jobs/<job-id>/openclaw/gateway`.
 *
 * Short-term mitigation: the path follows Nomad's workload-identity
 * convention, where a job's workload identity implicitly has access only to
 * variables under its own `nomad/jobs/<job-id>/` prefix. This gives per-job
 * secret isolation inside the shared "default" namespace. Per-instance
 * namespaces remain a planned future improvement.
 *
 * Writes use check-and-set. The ModifyIndex is re-read before every attempt,
 * and a 409 (CAS conflict) is retried with exponential back-off (100 ms,
 * 200 ms) up to 3 attempts, so concurrent startInstance calls do not silently
 * discard the latest token. Does nothing when no proxy token is configured.
 *
 * @param {string} instanceId - Instance whose token is stored.
 * @returns {Promise<void>}
 * @throws {Error} When the write fails with a non-409 status, or still
 *   conflicts on the last attempt.
 */
async function writeInstanceVariables(instanceId) {
    const MAX_ATTEMPTS = 3;
    const namespace = "default";
    const varPath = `nomad/jobs/${jobId(instanceId)}/openclaw/gateway`;
    const endpoint = `/v1/var/${encodeURIComponent(varPath)}`;
    const proxyToken = getRuntimeEnv(instanceId).JSPROXY_API_KEY || "";
    // Nothing to store when the proxy token is unconfigured.
    if (!proxyToken) {
        return;
    }
    const payload = {
        Namespace: namespace,
        Path: varPath,
        Items: { JSPROXY_API_KEY: proxyToken },
    };
    // Current ModifyIndex of the variable, or 0 when it is absent or unreadable
    // (cas=0 asks Nomad to create a new variable).
    const readModifyIndex = async () => {
        try {
            const existing = await nomadGet(`${endpoint}?namespace=${namespace}`);
            if (!existing.ok) {
                return 0;
            }
            const data = await existing.json();
            return data.ModifyIndex || 0;
        }
        catch {
            return 0;
        }
    };
    for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
        const cas = await readModifyIndex();
        const resp = await nomadPut(`${endpoint}?cas=${cas}&namespace=${namespace}`, payload);
        if (resp.ok) {
            return;
        }
        const text = await resp.text();
        // 409 Conflict = CAS mismatch: another writer won the race.
        const canRetry = resp.status === 409 && attempt < MAX_ATTEMPTS - 1;
        if (!canRetry) {
            throw new Error(`Failed to write Nomad Variables for ${instanceId}` +
                ` (attempt ${attempt + 1}/${MAX_ATTEMPTS}): HTTP ${resp.status} ${text}`);
        }
        await new Promise((resolve) => setTimeout(resolve, 100 * 2 ** attempt));
    }
}
|
|
213
|
+
/**
 * Delete the Nomad Variable stored at `nomad/jobs/<job-id>/openclaw/gateway`
 * for an instance.
 *
 * Best-effort: a 404 counts as success, and any other failure (HTTP error or
 * thrown request error) is only logged as a warning.
 *
 * @param {string} instanceId - Instance whose variables are removed.
 * @returns {Promise<void>}
 */
export async function purgeInstanceVariables(instanceId) {
    const varPath = `nomad/jobs/${jobId(instanceId)}/openclaw/gateway`;
    const endpoint = `/v1/var/${encodeURIComponent(varPath)}`;
    try {
        const resp = await nomadDelete(endpoint);
        const alreadyGone = resp.status === 404;
        if (!(resp.ok || alreadyGone)) {
            console.warn(`[nomad] Failed to purge variables for ${instanceId}: HTTP ${resp.status}`);
        }
    }
    catch (err) {
        console.warn(`[nomad] Failed to purge variables for ${instanceId}: ${err.message}`);
    }
}
|
|
227
|
+
export const VALID_USER_RE = /^[a-z0-9._-]{1,32}$/;
|
|
228
|
+
/**
 * Resolve a username to a numeric "uid:gid" string using /etc/passwd.
 *
 * Only the first line starting with `<username>:` is considered. If the file
 * cannot be read, no line matches, or the uid/gid fields are not numeric, the
 * current process's `process.getuid():process.getgid()` is returned instead.
 *
 * @param {string} username - Account name to look up.
 * @returns {string} `"<uid>:<gid>"`.
 */
function resolveUidGid(username) {
    const currentProcessIds = () => `${process.getuid()}:${process.getgid()}`;
    let entry;
    try {
        const prefix = username + ":";
        entry = readFileSync("/etc/passwd", "utf-8")
            .split("\n")
            .find((row) => row.startsWith(prefix));
    }
    catch {
        return currentProcessIds();
    }
    if (!entry) {
        return currentProcessIds();
    }
    // passwd layout: name:password:uid:gid:...
    const [, , uidField, gidField] = entry.split(":");
    const uid = parseInt(uidField, 10);
    const gid = parseInt(gidField, 10);
    if (Number.isNaN(uid) || Number.isNaN(gid)) {
        return currentProcessIds();
    }
    return `${uid}:${gid}`;
}
|
|
248
|
+
/**
 * Assemble the effective runtime settings for an instance by layering its
 * configured runtime over the module defaults.
 *
 * - `user` is validated against VALID_USER_RE to prevent injection via the
 *   Nomad job spec.
 * - `env` is DEFAULT_ENV overlaid with the instance runtime env.
 *   JSPROXY_API_KEY is removed (it is injected by the Nomad template from
 *   Variables). OPENCLAW_HOME and OPENCLAW_INSTANCE_ID are always set.
 * - `resources` starts from DEFAULT_RESOURCES. Each non-null configured value
 *   is converted with Number(). Values that are not numbers (NaN) are ignored
 *   with a warning so the default stays in place. Previously NaN slipped
 *   through the clamp below, because Math.min/Math.max propagate NaN.
 * - CPU and MemoryMB are clamped to [1, MAX_CPU_MHZ] and [1, MAX_MEMORY_MB].
 *
 * @param {string} instanceId - Instance to build the runtime for.
 * @returns {{command: string, args: string[], user: string, cwd: string, env: Object, resources: Object}}
 * @throws {Error} `Invalid runtime user: <user>` when the configured user is malformed.
 */
function buildRuntime(instanceId) {
    const runtime = getInstanceRuntime(instanceId);
    const openclawHome = getOpenclawHome(instanceId);
    // Validate user to prevent injection via Nomad job spec
    if (runtime.user && !VALID_USER_RE.test(runtime.user)) {
        throw new Error(`Invalid runtime user: ${runtime.user}`);
    }
    const command = runtime.command || DEFAULT_COMMAND;
    const args = Array.isArray(runtime.args) ? runtime.args.map(String) : [...DEFAULT_ARGS];
    const env = { ...DEFAULT_ENV, ...getRuntimeEnv(instanceId) };
    delete env.JSPROXY_API_KEY; // Injected by Nomad template from Variables
    env.OPENCLAW_HOME = openclawHome;
    env.OPENCLAW_INSTANCE_ID = instanceId;
    const resources = { ...DEFAULT_RESOURCES };
    for (const [key, value] of Object.entries(runtime.resources || {})) {
        if (value == null) {
            continue;
        }
        const numeric = Number(value);
        if (Number.isNaN(numeric)) {
            // NaN would survive the clamp below and reach Nomad as `null`.
            console.warn(`[nomad] ${instanceId}: ignoring non-numeric resource "${key}"`);
            continue;
        }
        resources[key] = numeric;
    }
    // Clamp to sane upper bounds — guards against arbitrarily large values that
    // would exhaust Nomad scheduler capacity or system memory.
    resources.CPU = Math.max(1, Math.min(resources.CPU, MAX_CPU_MHZ));
    resources.MemoryMB = Math.max(1, Math.min(resources.MemoryMB, MAX_MEMORY_MB));
    return {
        command: String(command),
        args,
        user: runtime.user || DEFAULT_USER,
        cwd: runtime.cwd || DEFAULT_CWD,
        env,
        resources,
    };
}
|
|
284
|
+
/**
 * Reconcile MemoryMB (reservation) and MemoryMaxMB (hard limit) for a Docker
 * task.
 *
 * - MemoryMaxMB defaults to and is capped at MAX_MEMORY_MAX_MB.
 * - A MemoryMaxMB below MemoryMB is raised to MemoryMB (with a warning).
 * - When memory oversubscription is "disabled", MemoryMB is promoted to
 *   MemoryMaxMB so the Docker limit matches the configured maximum.
 * - Non-finite inputs (NaN from a non-numeric setting, ±Infinity) fall back to
 *   the defaults. Previously NaN passed through every comparison and was
 *   returned unchanged.
 *
 * NOTE(review): with oversubscription disabled and MemoryMaxMB unset, the
 * reservation is promoted to MAX_MEMORY_MAX_MB even though no max was
 * configured — confirm this is intended.
 *
 * @param {string} instanceId - Used only in warning messages.
 * @param {{resources: Object}} runtime - Runtime whose `resources` are normalized.
 * @param {string} oversubState - "enabled", "disabled" or "unknown".
 * @returns {Object} Copy of `runtime.resources` with normalized MemoryMB and MemoryMaxMB.
 */
function normalizeDockerResources(instanceId, runtime, oversubState) {
    // Number(value) with a fallback for null/undefined and non-finite results.
    const finiteOr = (value, fallback) => {
        const numeric = Number(value ?? fallback);
        return Number.isFinite(numeric) ? numeric : fallback;
    };
    let effectiveMemoryMB = finiteOr(runtime.resources.MemoryMB, DEFAULT_RESOURCES.MemoryMB);
    let effectiveMemoryMaxMB = Math.min(finiteOr(runtime.resources.MemoryMaxMB, MAX_MEMORY_MAX_MB), MAX_MEMORY_MAX_MB);
    if (effectiveMemoryMaxMB < effectiveMemoryMB) {
        console.warn(`[nomad] ${instanceId}: MemoryMaxMB (${effectiveMemoryMaxMB}) is below MemoryMB (${effectiveMemoryMB}); clamping max to reservation.`);
        effectiveMemoryMaxMB = effectiveMemoryMB;
    }
    if (oversubState === "disabled" && effectiveMemoryMaxMB > effectiveMemoryMB) {
        console.warn(`[nomad] ${instanceId}: memory oversubscription is disabled; promoting MemoryMB ` +
            `from ${effectiveMemoryMB}MB to ${effectiveMemoryMaxMB}MB so the Docker limit matches MemoryMaxMB.`);
        effectiveMemoryMB = effectiveMemoryMaxMB;
    }
    return {
        ...runtime.resources,
        MemoryMB: effectiveMemoryMB,
        MemoryMaxMB: effectiveMemoryMaxMB,
    };
}
|
|
303
|
+
/**
 * Build the Nomad task definition (docker driver) for an instance's gateway.
 *
 * Image handling:
 *   - Official image (isOfficialImage): binary baked in. `openclaw` is
 *     prepended to the args for the image entrypoint, `<home>/app` is mounted
 *     at /app, and the rootfs stays writable.
 *   - Slim base image (`jishushell-base:*`, legacy): the host's openclaw
 *     node_modules is bind-mounted read-only and only the gateway args are
 *     passed.
 *   - Any other image: `/usr/local/bin/openclaw` is used as the command. The
 *     former separate "local image" and "other" branches were identical and
 *     have been merged.
 *
 * @param {string} instanceId - Instance the task belongs to.
 * @param {{user: string, args: string[], env: Object, resources: Object}} runtime - Result of buildRuntime().
 * @param {string} oversubState - Memory oversubscription state passed to normalizeDockerResources().
 * @returns {Object} Nomad task object (Name, Driver, User, Config, Env, Resources, LogConfig, Templates).
 * @throws {Error} When the job ID contains Nomad Template metacharacters.
 */
function buildTaskDocker(instanceId, runtime, oversubState) {
    // Guard against Nomad Template injection: validate the job ID before it is
    // interpolated into EmbeddedTmpl.
    const safeJobId = jobId(instanceId);
    assertSafeTemplateId(safeJobId);
    const openclawHome = getOpenclawHome(instanceId);
    const image = getOpenclawDockerImage();
    const officialImage = isOfficialImage(image);
    const slimBaseImage = /^jishushell-base:/i.test(image);
    // node_modules parent dir — contains the openclaw package and its sibling dependencies.
    const openclawNmDir = join(JISHUSHELL_HOME, "packages", "openclaw", "node_modules");
    const openclawAppDir = join(openclawHome, "app");
    const volumes = [`${openclawHome}:${openclawHome}:rw`];
    if (slimBaseImage) {
        // Only the slim base needs the bind-mounted binary.
        volumes.push(`${openclawNmDir}:/usr/lib/node_modules:ro`);
    }
    if (officialImage) {
        // Persist /app so OpenClaw can self-upgrade and install plugins.
        volumes.push(`${openclawAppDir}:/app:rw`);
    }
    // HOME is always /home/node: the host HOME does not exist inside the container.
    // NOTE(review): the task User comes from resolveUidGid(runtime.user), which is
    // not necessarily the image's node user — confirm /home/node is writable for it.
    const containerEnv = {
        ...runtime.env,
        PATH: "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
        HOME: "/home/node",
    };
    // Plugins (e.g. openclaw-weixin) use OPENCLAW_STATE_DIR to find credentials.
    // Inside Docker, HOME differs from the host, so set this explicitly.
    if (!containerEnv.OPENCLAW_STATE_DIR) {
        containerEnv.OPENCLAW_STATE_DIR = `${openclawHome}/.openclaw`;
    }
    const runtimeArgs = [...(runtime.args || [])];
    let commandConfig;
    if (officialImage) {
        // ENTRYPOINT is docker-entrypoint.sh (from the node base image);
        // prepend "openclaw" so it execs: openclaw gateway run ...
        commandConfig = { args: ["openclaw", ...runtimeArgs] };
    }
    else if (slimBaseImage) {
        // ENTRYPOINT already runs the bind-mounted openclaw.mjs; pass args only.
        commandConfig = { args: runtimeArgs };
    }
    else {
        // Local and custom images are assumed to ship /usr/local/bin/openclaw.
        commandConfig = { command: "/usr/local/bin/openclaw", args: runtimeArgs };
    }
    // Only the gateway port is published to the host; all other container ports
    // stay hidden behind bridge networking.
    const gatewayPort = getGatewayPort(instanceId);
    const normalizedResources = normalizeDockerResources(instanceId, runtime, oversubState);
    const variablePath = `nomad/jobs/${safeJobId}/openclaw/gateway`;
    return {
        Name: "gateway",
        Driver: "docker",
        // Task-level User field — Nomad passes this as --user to docker run.
        User: resolveUidGid(runtime.user),
        Config: {
            image,
            // Never force pull; use the local image cache.
            force_pull: false,
            ...commandConfig,
            volumes,
            // Bridge mode: host.docker.internal resolves to the host's gateway IP
            // so the container can reach the JishuShell LLM proxy on the host
            // without host-mode networking.
            extra_hosts: ["host.docker.internal:host-gateway"],
            cap_drop: ["ALL"],
            security_opt: ["no-new-privileges"], // block setuid/setgid escalation
            pids_limit: DEFAULT_PIDS_LIMIT, // prevent fork bomb
            // Official image: writable rootfs (packages/upgrades inside the container).
            // Other images: read-only rootfs for security.
            readonly_rootfs: !officialImage,
            // Writable /tmp via mount config (older Nomad docker drivers lack a
            // top-level "tmpfs" field). Size is 64 MiB.
            mounts: [{ type: "tmpfs", target: "/tmp", tmpfs_options: { size: 67108864 } }],
        },
        Env: containerEnv,
        Resources: {
            // With oversubscription enabled, MemoryMB is the scheduler reservation
            // and MemoryMaxMB the hard limit. When disabled, Nomad enforces
            // MemoryMB, so normalizeDockerResources() promotes it to the max.
            ...normalizedResources,
            // Statically reserve the gateway port on the host so Nomad can detect
            // conflicts across instances. In bridge mode it maps to the same
            // container port.
            Networks: [{ ReservedPorts: [{ Label: "gateway", Value: gatewayPort }] }],
        },
        LogConfig: { MaxFiles: 3, MaxFileSizeMB: 10 },
        Templates: [{
                DestPath: "secrets/instance.env",
                Envvars: true,
                EmbeddedTmpl: [
                    `{{ if nomadVarExists "${variablePath}" }}`,
                    `JSPROXY_API_KEY={{ with nomadVar "${variablePath}" }}{{ .JSPROXY_API_KEY }}{{ end }}`,
                    `{{ end }}`,
                ].join("\n"),
                ChangeMode: "restart",
            }],
    };
}
|
|
415
|
+
/**
 * Build the complete Nomad job submission payload for an instance.
 *
 * The job is a single-group, single-task "service" job in the "default"
 * namespace. Only the "docker" driver is supported.
 *
 * @param {string} instanceId - Instance to build the job for.
 * @returns {Promise<{Job: Object}>} Payload suitable for the Nomad jobs API.
 * @throws {Error} When getNomadDriver() returns anything other than "docker",
 *   or when buildRuntime()/buildTaskDocker() reject the instance config.
 */
async function buildJob(instanceId) {
    // Nomad expresses durations in nanoseconds.
    const NS_PER_SECOND = 1_000_000_000;
    const seconds = (count) => count * NS_PER_SECOND;
    const jid = jobId(instanceId);
    const runtime = buildRuntime(instanceId);
    const driver = getNomadDriver();
    const oversubState = await getMemoryOversubscriptionState();
    if (driver !== "docker") {
        throw new Error(`Unsupported Nomad driver: ${driver}. Only "docker" is supported.`);
    }
    const task = buildTaskDocker(instanceId, runtime, oversubState);
    // "fail" mode: once attempts are exhausted the alloc is marked failed, making
    // failures visible. "delay" (old default) silently retries forever.
    const restartPolicy = {
        Attempts: 3,
        Interval: seconds(5 * 60),
        Delay: seconds(15),
        Mode: "fail",
    };
    // Single-node (Raspberry Pi) environment: there is no second node to
    // reschedule onto, so rescheduling is explicitly disabled.
    const reschedule = {
        Attempts: 0,
        Unlimited: false,
    };
    // No service checks are registered, so health is derived from task states.
    // Nomad's default health_check="checks" would wait forever for a signal
    // that never comes, hanging every job re-submission.
    const update = {
        MaxParallel: 1,
        HealthCheck: "task_states",
        MinHealthyTime: seconds(5),
        HealthyDeadline: seconds(60),
        AutoRevert: false,
    };
    const taskGroup = {
        Name: "openclaw",
        Count: 1,
        RestartPolicy: restartPolicy,
        Reschedule: reschedule,
        Update: update,
        Tasks: [task],
    };
    return {
        Job: {
            ID: jid,
            Name: jid,
            Namespace: "default",
            Type: "service",
            Datacenters: ["*"],
            TaskGroups: [taskGroup],
        },
    };
}
|
|
467
|
+
/**
 * Find the active allocation of an instance's Nomad job.
 *
 * A "running" allocation is preferred; otherwise the first "pending" one is
 * returned.
 *
 * @param {string} instanceId - Instance whose job allocations are inspected.
 * @returns {Promise<Object|null>} The allocation, or null when the job is
 *   unknown (404), has no running/pending allocation, or the request fails.
 */
async function getRunningAlloc(instanceId) {
    const jid = jobId(instanceId);
    try {
        const resp = await nomadGet(`/v1/job/${jid}/allocations`);
        if (resp.status === 404) {
            return null;
        }
        const allocs = await resp.json();
        const firstWithStatus = (wanted) => {
            for (const candidate of allocs) {
                if (candidate.ClientStatus === wanted) {
                    return candidate;
                }
            }
            return undefined;
        };
        return firstWithStatus("running") ?? firstWithStatus("pending") ?? null;
    }
    catch {
        return null;
    }
}
|
|
486
|
+
/**
 * Decide whether an instance should be auto-started on jishushell startup,
 * i.e. whether it was running before a reboot.
 *
 * @param {string} instanceId - Instance to check.
 * @returns {Promise<boolean>} true only when the Nomad job exists, was not
 *   explicitly stopped (`Stop === false`) and is not "dead". A dead job means
 *   all allocations failed, so resubmitting would fail again. Returns false on
 *   404, any non-ok response, or any request/parse error.
 */
export async function shouldAutoStart(instanceId) {
    const jid = jobId(instanceId);
    try {
        const resp = await nomadGet(`/v1/job/${jid}`);
        const jobFound = resp.ok && resp.status !== 404;
        if (!jobFound) {
            return false;
        }
        const job = await resp.json();
        // Stop=true means the user explicitly stopped it; Stop=false means it was running.
        const explicitlyStopped = job.Stop !== false;
        const dead = job.Status === "dead";
        return !explicitlyStopped && !dead;
    }
    catch {
        return false;
    }
}
|
|
503
|
+
/**
 * Report the runtime status of an instance as seen by Nomad.
 *
 * Flow:
 *  1. Read the job. 404 or Stop=true => "stopped"; any other non-2xx or a
 *     transport/parse failure => "unknown" with an `error` message.
 *  2. Look up the running/pending allocation. None => "pending".
 *  3. Fill in restarts/uptime from the "gateway" task state and memory/CPU
 *     from the allocation stats endpoint.
 *  4. If Nomad reports no memory usage, fall back to `docker stats`.
 *
 * @param {string} instanceId - Instance identifier.
 * @returns {Promise<object>} Status object with `status`, `pid` (never set
 *   here, always null), `uptime` (seconds or null), `memory_mb`,
 *   `cpu_percent`, and - when an allocation exists - `alloc_id` and `restarts`.
 */
export async function getStatus(instanceId) {
    const jid = jobId(instanceId);
    const stopped = { status: "stopped", pid: null, uptime: null, memory_mb: null, cpu_percent: null };
    try {
        const resp = await nomadGet(`/v1/job/${jid}`);
        if (resp.status === 404)
            return stopped;
        if (!resp.ok) {
            // Nomad answered, but with an error: do not try to parse the body
            // as a job and do not mislabel the failure as a connectivity issue.
            return { ...stopped, status: "unknown", error: `Nomad returned HTTP ${resp.status}` };
        }
        const job = await resp.json();
        if (job.Stop)
            return stopped;
    }
    catch {
        return { ...stopped, status: "unknown", error: "Nomad unreachable" };
    }
    const alloc = await getRunningAlloc(instanceId);
    if (!alloc)
        return { ...stopped, status: "pending" };
    const allocId = alloc.ID;
    const gwState = alloc.TaskStates?.gateway || {};
    const result = {
        status: alloc.ClientStatus || "unknown",
        alloc_id: allocId,
        pid: null,
        uptime: null,
        memory_mb: null,
        cpu_percent: null,
        restarts: gwState.Restarts || 0,
    };
    if (gwState.StartedAt) {
        // `new Date()` never throws on bad input - it yields an Invalid Date
        // whose getTime() is NaN - so validate explicitly. Timestamps at or
        // before the Unix epoch (e.g. a Go zero time, year 0001) mean "not
        // started yet" and must not be turned into a multi-millennium uptime.
        const startedMs = new Date(gwState.StartedAt).getTime();
        if (Number.isFinite(startedMs) && startedMs > 0) {
            result.uptime = Math.max(0, Math.floor((Date.now() - startedMs) / 1000));
        }
    }
    try {
        const statsResp = await nomadGet(`/v1/client/allocation/${allocId}/stats`);
        if (statsResp.ok) {
            const stats = await statsResp.json();
            // raw_exec: stats nested under Tasks.gateway; docker: top-level ResourceUsage
            const taskStats = stats.Tasks?.gateway?.ResourceUsage || stats.ResourceUsage || {};
            const memStats = taskStats.MemoryStats || {};
            const cpuStats = taskStats.CpuStats || {};
            const memBytes = memStats.RSS || memStats.Usage || 0;
            // Both figures are rounded to one decimal place.
            result.memory_mb = Math.round(memBytes / (1024 * 1024) * 10) / 10;
            result.cpu_percent = Math.round((cpuStats.Percent || 0) * 10) / 10;
        }
    }
    catch { /* stats are optional - keep the nulls */ }
    // Fallback: Nomad cgroup stats are often zero on cgroup v2 (e.g. Raspberry Pi).
    // Use `docker stats` directly when Nomad reports 0 (or nothing at all).
    if (!result.memory_mb && allocId) {
        try {
            // Validate allocId before using it as a container name (Nomad UUIDs are hex + hyphens)
            if (!/^[a-f0-9-]+$/i.test(allocId))
                throw new Error("invalid allocId");
            const containerName = `gateway-${allocId}`;
            const { execFile } = await import("child_process");
            const { promisify } = await import("util");
            const execFileAsync = promisify(execFile);
            const { stdout } = await execFileAsync("docker", ["stats", "--no-stream", "--format", "{{.MemUsage}}", containerName], { timeout: 5000 });
            // Format: "499.6MiB / 3GiB" or "123.4MB / 2GB" - only the first figure (usage) is parsed.
            const match = stdout.trim().match(/^([\d.]+)\s*(MiB|GiB|MB|GB|KiB|KB)/i);
            if (match) {
                let mb = parseFloat(match[1]);
                const unit = match[2].toLowerCase();
                if (unit === "gib" || unit === "gb")
                    mb *= 1024;
                else if (unit === "kib" || unit === "kb")
                    mb /= 1024;
                result.memory_mb = Math.round(mb * 10) / 10;
            }
        }
        catch { /* docker not available / container not found - keep Nomad's value */ }
    }
    return result;
}
|
|
583
|
+
/**
 * Start an instance by submitting its job definition to Nomad.
 *
 * Pre-flight steps, in order:
 *  - refuse when the instance is already running;
 *  - refuse when a running peer shares its OPENCLAW_HOME or gateway port;
 *  - stop a still-running legacy (non-Nomad) process;
 *  - require the OpenClaw config file to exist;
 *  - for the docker driver: fix permissions, patch the config for bridge
 *    networking, validate the image name, seed the persistent /app directory
 *    for official images, and make sure the image exists locally (otherwise a
 *    background build is started and the call returns `building: true`);
 *  - write the instance secrets to Nomad Variables.
 *
 * @param {string} instanceId - Instance identifier.
 * @returns {Promise<object>} `{ ok: true, eval_id }` on success, otherwise
 *   `{ ok: false, error }` (plus `building`/`taskId` when a build was started).
 */
export async function startInstance(instanceId) {
    const current = await getStatus(instanceId);
    if (current.status === "running") {
        return { ok: false, error: "Instance is already running" };
    }
    // Probe peers one at a time and keep the ones Nomad reports as running.
    const keepRunning = async (peerIds) => {
        const running = [];
        for (const peerId of peerIds) {
            const peer = await getStatus(peerId);
            if (peer.status === "running") {
                running.push(peerId);
            }
        }
        return running;
    };
    const homeConflicts = await keepRunning(findInstancesSharingOpenclawHome(instanceId));
    if (homeConflicts.length > 0) {
        return {
            ok: false,
            error: `This instance shares OPENCLAW_HOME with running instance(s): ${homeConflicts.join(", ")}. Move it to its own instance directory before starting it.`,
        };
    }
    const portConflicts = await keepRunning(findInstancesSharingGatewayPort(instanceId));
    if (portConflicts.length > 0) {
        const port = getGatewayPort(instanceId);
        return {
            ok: false,
            error: `Gateway port ${port} is already in use by running instance(s): ${portConflicts.join(", ")}. Assign a different port before starting this instance.`,
        };
    }
    const legacyStatus = await getLegacyStatus(instanceId);
    if (legacyStatus.status === "running") {
        console.log(`[nomad] Stopping legacy process for ${instanceId} (pid=${legacyStatus.pid}) before Nomad start...`);
        await stopLegacyInstance(instanceId);
        // Give it a moment to exit
        await new Promise((resolve) => {
            setTimeout(resolve, 2000);
        });
    }
    const configPath = getOpenclawConfigPath(instanceId);
    if (!existsSync(configPath)) {
        return { ok: false, error: "Config file not found" };
    }
    const stateDir = dirname(configPath);
    mkdirSync(stateDir, { recursive: true, mode: 0o750 });
    // Ensure Docker image exists when using docker driver
    if (getNomadDriver() === "docker") {
        // Docker Desktop on macOS uses virtio-fs: the container process isn't recognized
        // as the file owner even when uid matches. The .openclaw dir needs 0o777 so the
        // container can create .tmp files for atomic config writes, and files need 0o644
        // so the container can read the config. Applied on every start so existing
        // instances get fixed too. Elsewhere the tighter 0o750 / 0o600 modes are used.
        const onMac = platform() === "darwin";
        chmodSync(stateDir, onMac ? 0o777 : 0o750);
        chmodSync(configPath, onMac ? 0o644 : 0o600);
        // Bridge mode needs a non-loopback gateway bind inside the container.
        patchDockerBridgeGatewayBind(configPath);
        // Bridge mode: rewrite 127.0.0.1 -> host.docker.internal in jsproxy baseUrl
        // so the container can reach the JishuShell LLM proxy on the host.
        patchJsproxyBaseUrl(configPath);
        const image = getOpenclawDockerImage();
        // Validate image name format and length.
        const imageNameOk = DOCKER_IMAGE_RE.test(image) && image.length <= MAX_DOCKER_IMAGE_NAME_LEN;
        if (!imageNameOk) {
            return { ok: false, error: `Invalid Docker image name: "${image}"` };
        }
        // Silent, time-boxed docker CLI invocation.
        const docker = (args, timeout) => execFileSync("docker", args, { timeout, stdio: "ignore" });
        // Initialize persistent /app directory for official images on first start:
        // copy the image's /app contents to the host so OpenClaw can self-upgrade.
        if (isOfficialImage(image)) {
            const appDir = join(getOpenclawHome(instanceId), "app");
            mkdirSync(appDir, { recursive: true, mode: 0o755 });
            if (readdirSync(appDir).length === 0) {
                console.log(`[nomad] Initializing /app for ${instanceId} from image ${image}...`);
                const tmpName = `jishushell-init-${instanceId}-${Date.now()}`;
                try {
                    docker(["create", "--name", tmpName, image], 30000);
                    docker(["cp", `${tmpName}:/app/.`, appDir], 120000);
                    docker(["rm", tmpName], 10000);
                }
                catch (e) {
                    // Best-effort removal of the scratch container before reporting.
                    try {
                        docker(["rm", "-f", tmpName], 10000);
                    }
                    catch { }
                    console.error(`[nomad] Failed to initialize /app for ${instanceId}: ${e.message}`);
                    return { ok: false, error: `Failed to initialize OpenClaw app directory: ${e.message}` };
                }
            }
        }
        let imagePresent = true;
        try {
            docker(["image", "inspect", image], 10000);
        }
        catch {
            imagePresent = false;
        }
        if (!imagePresent) {
            // Image not found locally - kick off a background build and return
            // immediately so the API doesn't block for 5-10 minutes on RPi.
            console.log(`[nomad] Docker image ${image} not found, starting background build...`);
            try {
                const setupManager = await import("./setup-manager.js");
                let build;
                if (isOfficialImage(image)) {
                    build = setupManager.startBuildCustomOpenclawImage(image);
                }
                else {
                    build = setupManager.startBuildOpenclawDockerImage(image);
                }
                return {
                    ok: false,
                    error: `Docker image ${image} not found. Build started in background.`,
                    building: true,
                    taskId: build.taskId,
                };
            }
            catch (e) {
                return { ok: false, error: `Docker image ${image} not available: ${e.message}` };
            }
        }
    }
    // Write instance secrets to Nomad Variables before starting the job.
    // Failure is propagated: a missing proxy token causes 401 on every LLM
    // request, so it is better to surface the error here than start a broken instance.
    try {
        await writeInstanceVariables(instanceId);
    }
    catch (e) {
        return { ok: false, error: `Failed to store instance secrets in Nomad Variables: ${e.message}` };
    }
    const jobDef = await buildJob(instanceId);
    try {
        const submit = await nomadPost("/v1/jobs", jobDef);
        if (!submit.ok) {
            return { ok: false, error: await submit.text() };
        }
        const { EvalID } = await submit.json();
        return { ok: true, eval_id: EvalID };
    }
    catch (e) {
        const isNetErr = e?.message === "fetch failed" || e?.cause?.code === "ECONNREFUSED";
        return { ok: false, error: isNetErr ? `Nomad 服务不可达 (${getNomadAddr()}),请先启动 Nomad` : e.message };
    }
}
|
|
716
|
+
/**
 * Stop (deregister) the Nomad job that backs an instance.
 *
 * @param {string} instanceId - Instance identifier.
 * @param {boolean} [purge=false] - Passed through as the `purge` query
 *   parameter; when truthy the instance's Nomad Variables are removed as well
 *   (best-effort, failures are ignored).
 * @returns {Promise<object>} `{ ok: true }` on success, otherwise
 *   `{ ok: false, error }` - a 404 is reported as "Instance is not running".
 */
export async function stopInstance(instanceId, purge = false) {
    const jid = jobId(instanceId);
    try {
        const response = await nomadDelete(`/v1/job/${jid}?purge=${purge}`);
        if (!response.ok) {
            const error = response.status === 404
                ? "Instance is not running"
                : await response.text();
            return { ok: false, error };
        }
        if (purge) {
            try {
                await purgeInstanceVariables(instanceId);
            }
            catch { /* ignore */ }
        }
        return { ok: true };
    }
    catch (e) {
        const isNetErr = e?.message === "fetch failed" || e?.cause?.code === "ECONNREFUSED";
        if (isNetErr) {
            return { ok: false, error: `Nomad 服务不可达 (${getNomadAddr()}),请先启动 Nomad` };
        }
        return { ok: false, error: e.message };
    }
}
|
|
738
|
+
/**
 * Restart an instance.
 *
 * The native Nomad allocation restart endpoint
 * (PUT /v1/client/allocation/{allocID}/restart) is tried first because it
 * preserves alloc history and EvalID and avoids rescheduling. When there is no
 * running/pending allocation, or the native restart fails, the job is stopped
 * and re-submitted after a two-second pause.
 *
 * @param {string} instanceId - Instance identifier.
 * @returns {Promise<object>} `{ ok: true, alloc_id }` after a native restart,
 *   otherwise whatever startInstance() returns.
 */
export async function restartInstance(instanceId) {
    const liveAlloc = await getRunningAlloc(instanceId);
    if (liveAlloc) {
        const restartRequest = {
            TaskName: "gateway",
            AllTasks: false,
        };
        try {
            const restartResp = await nomadPut(`/v1/client/allocation/${liveAlloc.ID}/restart`, restartRequest);
            if (restartResp.ok) {
                return { ok: true, alloc_id: liveAlloc.ID };
            }
            // Non-2xx from the restart endpoint falls through to stop+start
            const detail = await restartResp.text();
            console.warn(`[nomad] Native restart failed for ${instanceId} (HTTP ${restartResp.status}): ${detail} — falling back to stop+start`);
        }
        catch (err) {
            console.warn(`[nomad] Native restart error for ${instanceId}: ${err.message} — falling back to stop+start`);
        }
    }
    // Fallback: stop the job and re-submit it
    await stopInstance(instanceId);
    await new Promise((resolve) => {
        setTimeout(resolve, 2000);
    });
    return startInstance(instanceId);
}
|
|
765
|
+
/**
 * Fetch the tail of the "gateway" task log for an instance.
 *
 * The running/pending allocation is used when there is one; otherwise the
 * allocation with the highest CreateIndex is used so logs of a crashed or
 * stopped instance remain readable.
 *
 * @param {string} instanceId - Instance identifier.
 * @param {number} [lines=200] - Maximum number of lines to return. Values that
 *   are not finite numbers fall back to 200; zero or negative yields [].
 * @param {string} [logType="stderr"] - Log stream; anything not in
 *   VALID_LOG_TYPES is replaced by "stderr".
 * @returns {Promise<string[]>} Up to `lines` log lines, oldest first. Empty
 *   when no allocation exists or the log cannot be fetched.
 */
export async function getLogs(instanceId, lines = 200, logType = "stderr") {
    // Defense-in-depth: only allow known log types to prevent path/query injection
    const safeLogType = VALID_LOG_TYPES.has(logType) ? logType : "stderr";
    // Normalize the line count: `slice(-0)` would return the whole buffer and a
    // NaN would end up verbatim in the `offset` query parameter.
    const requested = Math.trunc(Number(lines));
    const lineCount = Number.isFinite(requested) ? requested : 200;
    if (lineCount <= 0)
        return [];
    let alloc = await getRunningAlloc(instanceId);
    if (!alloc) {
        const jid = jobId(instanceId);
        try {
            const resp = await nomadGet(`/v1/job/${jid}/allocations`);
            if (resp.ok) {
                const allocs = await resp.json();
                if (Array.isArray(allocs) && allocs.length) {
                    // Pick the allocation with the highest CreateIndex (first one wins on ties).
                    alloc = allocs.reduce((newest, candidate) => ((candidate.CreateIndex || 0) > (newest.CreateIndex || 0) ? candidate : newest), allocs[0]);
                }
            }
        }
        catch { /* ignore - handled by the !alloc check below */ }
    }
    if (!alloc)
        return [];
    try {
        const params = new URLSearchParams({
            task: "gateway",
            type: safeLogType,
            plain: "true",
            origin: "end",
            // Read back ~512 bytes per requested line, but never less than 100000 bytes.
            offset: String(Math.max(lineCount * 512, 100000)),
            follow: "false",
        });
        const resp = await nomadGet(`/v1/client/fs/logs/${alloc.ID}?${params}`);
        if (resp.ok) {
            const text = await resp.text();
            const rows = text.split("\n");
            // A newline-terminated log yields one trailing empty element; drop
            // it so it neither shows up as a blank line nor uses up a slot.
            if (rows[rows.length - 1] === "")
                rows.pop();
            return rows.slice(-lineCount);
        }
    }
    catch { /* ignore - logs are best-effort */ }
    return [];
}
|
|
803
|
+
// Promise-returning wrapper around child_process.execFile (no shell involved).
const execFileAsync = promisify(execFileCb);
/**
 * Run a command inside the instance's gateway container via `docker exec`.
 *
 * The command runs as the uid:gid resolved from the instance runtime's `user`
 * setting, and is passed to docker as an argument vector, so no shell
 * interpretation happens on the host.
 *
 * @param {string} instanceId - Instance identifier.
 * @param {string[]} command - Argument vector to execute in the container.
 * @param {number} [timeoutMs=120000] - Timeout handed to execFile.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   Resolves for both success (exitCode 0) and failure of the command itself.
 * @throws {TypeError} When `command` is not an array.
 * @throws {Error} "Instance is not running" when no running allocation exists,
 *   "invalid allocId" when the allocation id is not hex + hyphens.
 */
export async function exec(instanceId, command, timeoutMs = 120_000) {
    // A string would be spread into one argument per character below.
    if (!Array.isArray(command))
        throw new TypeError("command must be an array of arguments");
    const alloc = await getRunningAlloc(instanceId);
    // getRunningAlloc may also hand back a pending alloc; exec needs a live container.
    if (!alloc || alloc.ClientStatus !== "running") {
        throw new Error("Instance is not running");
    }
    const allocId = alloc.ID;
    if (!/^[a-f0-9-]+$/i.test(allocId))
        throw new Error("invalid allocId");
    const containerName = `gateway-${allocId}`;
    // Use the same user as the container's main process (runtime.user uid:gid)
    const runtime = getInstanceRuntime(instanceId);
    const userFlag = resolveUidGid(runtime.user);
    try {
        const { stdout, stderr } = await execFileAsync("docker", ["exec", "--user", userFlag, containerName, ...command], { timeout: timeoutMs });
        return { stdout, stderr, exitCode: 0 };
    }
    catch (e) {
        return {
            stdout: e.stdout || "",
            stderr: e.stderr || e.message,
            // execFile sets `code` to the numeric exit status when the process
            // exited, but to a string errno name (e.g. "ENOENT") when spawning
            // failed and to null when it was killed (timeout). Keep exitCode numeric.
            exitCode: typeof e.code === "number" ? e.code : 1,
        };
    }
}
|
|
828
|
+
//# sourceMappingURL=nomad-manager.js.map
|