@yemi33/minions 0.1.1966 → 0.1.1967
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/minions.js +6 -6
- package/dashboard/js/refresh.js +5 -0
- package/dashboard/js/render-managed.js +261 -0
- package/dashboard/pages/engine.html +6 -0
- package/dashboard-build.js +1 -1
- package/dashboard.js +250 -1
- package/docs/README.md +10 -13
- package/docs/managed-spawn.md +259 -0
- package/docs/watches.md +47 -20
- package/engine/cli.js +39 -0
- package/engine/managed-spawn.js +1325 -0
- package/engine/playbook.js +34 -0
- package/engine/projects.js +13 -0
- package/engine/shared.js +118 -0
- package/engine.js +264 -14
- package/package.json +2 -1
|
@@ -0,0 +1,1325 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* engine/managed-spawn.js — Item 1 of plan P-7a3b1c92.
|
|
3
|
+
*
|
|
4
|
+
* Schema, validator, sidecar reader, acceptance helper, and playbook hint for
|
|
5
|
+
* the managed-spawn primitive. Mirrors engine/keep-process-sweep.js
|
|
6
|
+
* beat-for-beat: an agent writes agents/<id>/managed-spawn.json describing
|
|
7
|
+
* long-running services it wants the engine to spawn + healthcheck on its
|
|
8
|
+
* behalf; this module validates the file, returns a structured acceptance
|
|
9
|
+
* decision the engine's onAgentClose handler uses to gate the dispatch, and
|
|
10
|
+
* exposes a hint block the playbook injects so the agent knows the contract.
|
|
11
|
+
*
|
|
12
|
+
* Engine spawn / state-file / healthcheck loop / sweep wiring comes from later
* plan items (P-7a3b1c93+); the Item 2 spawn + state-file helpers are appended
* further down in this file. The Item 1 helpers themselves are pure: no spawns,
* no network, no mutation of shared state.
|
|
15
|
+
*
|
|
16
|
+
* Design template: engine/keep-process-sweep.js. Divergences are justified
|
|
17
|
+
* inline; symmetry with keep-pids is the convention.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
const fs = require('fs');
|
|
21
|
+
const path = require('path');
|
|
22
|
+
const http = require('http');
|
|
23
|
+
const https = require('https');
|
|
24
|
+
const { spawn, exec } = require('child_process');
|
|
25
|
+
const shared = require('./shared');
|
|
26
|
+
const queries = require('./queries');
|
|
27
|
+
|
|
28
|
+
const { log, ENGINE_DEFAULTS } = shared;
|
|
29
|
+
|
|
30
|
+
const MANAGED_SPAWN_FILENAME = 'managed-spawn.json';
|
|
31
|
+
const MANAGED_PROCESSES_STATE_FILE = 'managed-processes.json';
|
|
32
|
+
const MANAGED_LOGS_DIR = 'managed-logs';
|
|
33
|
+
const INVALID_WORKDIR_REASON_PREFIX = 'invalid-workdir: ';
|
|
34
|
+
|
|
35
|
+
const HEALTHCHECK_TYPES = ['http', 'command'];
|
|
36
|
+
|
|
37
|
+
// Kebab-case service names: lowercase letters, digits, single internal hyphens,
|
|
38
|
+
// no leading/trailing hyphen. Mirrors the spawn-name conventions used elsewhere
|
|
39
|
+
// in the codebase (worktree-pool, dispatch IDs).
|
|
40
|
+
const _KEBAB_RE = /^[a-z0-9]+(?:-[a-z0-9]+)*$/;
|
|
41
|
+
|
|
42
|
+
/**
 * Resolve the directory that holds the per-agent folders.
 *
 * @returns {string} `queries.AGENTS_DIR` when it is truthy, otherwise
 *   `<shared.MINIONS_DIR>/agents`.
 */
function _agentsDir() {
  const configured = queries.AGENTS_DIR;
  if (configured) return configured;
  return path.join(shared.MINIONS_DIR, 'agents');
}
|
|
45
|
+
|
|
46
|
+
/**
 * Decide whether a spec's `cwd` must be a valid git worktree.
 *
 * Precedence: an explicit own `requireGitWorkdir` property on `opts` wins
 * (coerced to boolean); otherwise the check is on unless the engine default
 * `managedSpawn.requireGitWorkdir` is exactly `false`.
 *
 * @param {object} [opts] Caller options; only `requireGitWorkdir` is read.
 * @returns {boolean}
 */
function _resolveRequireGitWorkdir(opts) {
  const defaults = ENGINE_DEFAULTS.managedSpawn || {};
  const callerDecides = Boolean(opts)
    && Object.prototype.hasOwnProperty.call(opts, 'requireGitWorkdir');
  if (callerDecides) return Boolean(opts.requireGitWorkdir);
  return defaults.requireGitWorkdir !== false;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Case-insensitive executable allowlist check.
 *
 * Only the basename of `name` is compared, both as-is and with one trailing
 * `.exe` / `.cmd` / `.bat` / `.ps1` / `.sh` extension removed. Non-string
 * allowlist entries are ignored.
 *
 * @param {string} name Executable name or path.
 * @param {Array} allowlist Allowed executable names.
 * @returns {boolean} true when the basename (or its extension-less stem)
 *   equals an allowlist entry.
 */
function _isOnAllowlist(name, allowlist) {
  if (!Array.isArray(allowlist)) return false;
  if (typeof name !== 'string' || name.length === 0) return false;

  const fileName = path.basename(name).toLowerCase();
  const stem = fileName.replace(/\.(exe|cmd|bat|ps1|sh)$/, '');

  return allowlist.some((entry) => {
    if (typeof entry !== 'string') return false;
    const wanted = entry.toLowerCase();
    return wanted === fileName || wanted === stem;
  });
}
|
|
66
|
+
|
|
67
|
+
/**
 * Check whether an environment-variable name may be set by a sidecar spec.
 *
 * A key is allowed when it appears verbatim in `limits.envKeyAllowlist` or
 * starts with any non-empty string listed in `limits.envKeyAllowlistPrefixes`.
 * Matching is case-sensitive.
 *
 * @param {string} key Environment variable name.
 * @param {object} limits Limits object carrying the two allowlists.
 * @returns {boolean}
 */
function _envKeyAllowed(key, limits) {
  if (typeof key !== 'string' || key.length === 0) return false;

  const exactNames = Array.isArray(limits.envKeyAllowlist) ? limits.envKeyAllowlist : [];
  if (exactNames.includes(key)) return true;

  const prefixes = Array.isArray(limits.envKeyAllowlistPrefixes)
    ? limits.envKeyAllowlistPrefixes
    : [];
  return prefixes.some(
    (prefix) => typeof prefix === 'string' && prefix.length > 0 && key.startsWith(prefix)
  );
}
|
|
77
|
+
|
|
78
|
+
/**
 * Extract the executable token from a shell-style healthcheck command string
 * so the caller can check it against the executable allowlist.
 *
 * - A leading, properly closed quoted token (`"..."` or `'...'`) is returned
 *   whole without its quotes, so quoted paths containing spaces (e.g.
 *   `"C:/Program Files/nodejs/node.exe" -v`) are not cut at the first space.
 * - Otherwise the first run of non-whitespace, non-quote characters is
 *   returned (an unterminated leading quote is ignored).
 *
 * Any directory part is left in place; callers reduce it to a basename.
 *
 * @param {*} cmd Command line; non-strings yield ''.
 * @returns {string} The executable token, or '' when no candidate is found.
 */
function _firstExecutable(cmd) {
  if (typeof cmd !== 'string') return '';
  const trimmed = cmd.trim();
  if (trimmed.length === 0) return '';

  // Closed quoted token first: the shell treats everything inside the quotes
  // as the program name, spaces included.
  const quoted = /^"([^"]+)"/.exec(trimmed) || /^'([^']+)'/.exec(trimmed);
  if (quoted) return quoted[1];

  const bare = /^["']?([^\s"']+)["']?/.exec(trimmed);
  return bare ? bare[1] : '';
}
|
|
90
|
+
|
|
91
|
+
/**
 * Validate and normalise a spec's `healthcheck` object.
 *
 * Common fields: `type` ('http' | 'command'), `timeout_s` (0 < n <= 3600) and
 * optional `interval_s` (0 < n <= 600, defaulting to
 * `limits.defaultHealthIntervalSec` or 1).
 * - http: requires an absolute http(s) `url` and an integer `expect_status`
 *   in 100..599.
 * - command: requires `cmd` (<= 1000 chars), a boolean `shell`, a compilable
 *   `expect_regex` (<= 500 chars), and the command's first token must be on
 *   `limits.executableAllowlist`.
 *
 * @param {*} hc Raw healthcheck value from the sidecar.
 * @param {object} limits Managed-spawn limits.
 * @returns {{ok: true, value: object} | {ok: false, reason: string}}
 */
function _validateHealthcheck(hc, limits) {
  const reject = (reason) => ({ ok: false, reason: reason });

  const isRecord = Boolean(hc) && typeof hc === 'object' && !Array.isArray(hc);
  if (!isRecord) return reject('healthcheck-missing');

  if (!HEALTHCHECK_TYPES.includes(hc.type)) {
    return reject('healthcheck-type-invalid (' + hc.type + ')');
  }

  const timeoutS = Number(hc.timeout_s);
  const timeoutOk = Number.isFinite(timeoutS) && timeoutS > 0 && timeoutS <= 3600;
  if (!timeoutOk) return reject('healthcheck-timeout-invalid');

  let intervalS;
  if (hc.interval_s == null) {
    intervalS = Number(limits.defaultHealthIntervalSec) || 1;
  } else {
    intervalS = Number(hc.interval_s);
  }
  const intervalOk = Number.isFinite(intervalS) && intervalS > 0 && intervalS <= 600;
  if (!intervalOk) return reject('healthcheck-interval-invalid');

  if (hc.type === 'http') {
    const url = hc.url;
    if (typeof url !== 'string' || url.length === 0) return reject('healthcheck-url-missing');
    if (!/^https?:\/\//i.test(url)) return reject('healthcheck-url-not-absolute');
    if (hc.expect_status == null) return reject('healthcheck-expect-status-missing');

    const status = Number(hc.expect_status);
    const statusOk = Number.isInteger(status) && status >= 100 && status <= 599;
    if (!statusOk) return reject('healthcheck-expect-status-invalid');

    return {
      ok: true,
      value: {
        type: 'http',
        url: url,
        expect_status: status,
        interval_s: intervalS,
        timeout_s: timeoutS,
      },
    };
  }

  // Remaining type is 'command'.
  const command = hc.cmd;
  if (typeof command !== 'string' || command.length === 0) return reject('healthcheck-cmd-missing');
  if (command.length > 1000) return reject('healthcheck-cmd-too-long');
  if (typeof hc.shell !== 'boolean') return reject('healthcheck-shell-not-boolean');

  const pattern = hc.expect_regex;
  if (typeof pattern !== 'string' || pattern.length === 0) {
    return reject('healthcheck-expect-regex-missing');
  }
  if (pattern.length > 500) return reject('healthcheck-expect-regex-too-long');
  try {
    new RegExp(pattern);
  } catch (e) {
    return reject('healthcheck-expect-regex-invalid (' + e.message + ')');
  }

  // The healthcheck command goes through the same executable allowlist as the
  // spec's own `cmd`: only its first token is inspected.
  const execName = _firstExecutable(command);
  if (!_isOnAllowlist(execName, limits.executableAllowlist)) {
    return reject('healthcheck-cmd-not-on-allowlist (' + execName + ')');
  }

  return {
    ok: true,
    value: {
      type: 'command',
      cmd: command,
      shell: hc.shell,
      expect_regex: pattern,
      interval_s: intervalS,
      timeout_s: timeoutS,
    },
  };
}
|
|
175
|
+
|
|
176
|
+
/**
 * Validate and normalise one entry of the sidecar's `specs` array.
 *
 * Checks run in a fixed order and the first failure wins: name, cmd, args,
 * cwd, env, ports, ttl_minutes, attrs, healthcheck.
 *
 * @param {*} spec Raw spec object from the sidecar.
 * @param {number} index Position of the spec in the file (currently unused;
 *   the caller prefixes reasons with the index itself).
 * @param {object} limits Managed-spawn limits (caps and allowlists).
 * @param {object} [opts] Options; `requireGitWorkdir` toggles the cwd check.
 * @returns {{ok: true, value: object} | {ok: false, reason: string}} On
 *   success `value` is a sanitised copy holding only the known fields.
 */
function _validateSpec(spec, index, limits, opts) {
  if (!spec || typeof spec !== 'object' || Array.isArray(spec)) {
    return { ok: false, reason: 'not-an-object' };
  }

  // name
  if (typeof spec.name !== 'string' || spec.name.length === 0) {
    return { ok: false, reason: 'name-missing' };
  }
  const maxName = Math.max(8, Number(limits.maxNameLength) || 64);
  if (spec.name.length > maxName) {
    return { ok: false, reason: 'name-too-long (>' + maxName + ')' };
  }
  if (!_KEBAB_RE.test(spec.name)) {
    return { ok: false, reason: 'name-not-kebab-case (' + spec.name + ')' };
  }

  // cmd
  if (typeof spec.cmd !== 'string' || spec.cmd.length === 0) {
    return { ok: false, reason: 'cmd-missing' };
  }
  if (spec.cmd.length > 200) {
    return { ok: false, reason: 'cmd-too-long' };
  }
  if (!_isOnAllowlist(spec.cmd, limits.executableAllowlist)) {
    return { ok: false, reason: 'cmd-not-on-allowlist (' + spec.cmd + ')' };
  }

  // args
  const argsRaw = spec.args == null ? [] : spec.args;
  if (!Array.isArray(argsRaw)) return { ok: false, reason: 'args-not-array' };
  const maxArgs = Math.max(1, Number(limits.maxArgsCount) || 64);
  if (argsRaw.length > maxArgs) {
    return { ok: false, reason: 'args-too-many (>' + maxArgs + ')' };
  }
  const args = [];
  for (const a of argsRaw) {
    if (typeof a !== 'string') return { ok: false, reason: 'arg-not-string' };
    if (a.length > 500) return { ok: false, reason: 'arg-too-long' };
    args.push(a);
  }

  // cwd (optional; validated when present + requireGitWorkdir is on)
  if (spec.cwd != null && typeof spec.cwd !== 'string') {
    return { ok: false, reason: 'cwd-not-string' };
  }
  if (typeof spec.cwd === 'string' && spec.cwd.length > 500) {
    return { ok: false, reason: 'cwd-too-long' };
  }
  if (_resolveRequireGitWorkdir(opts) && typeof spec.cwd === 'string' && spec.cwd.length > 0) {
    const wt = shared.isValidGitWorktree(spec.cwd);
    if (!wt.ok) {
      // Keep the prefix intact: callers key off it to classify the rejection.
      return { ok: false, reason: INVALID_WORKDIR_REASON_PREFIX + wt.reason };
    }
  }

  // env (optional)
  const envRaw = spec.env == null ? {} : spec.env;
  if (typeof envRaw !== 'object' || Array.isArray(envRaw)) {
    return { ok: false, reason: 'env-not-object' };
  }
  const envKeys = Object.keys(envRaw);
  const maxEnv = Math.max(1, Number(limits.maxEnvVars) || 32);
  if (envKeys.length > maxEnv) {
    return { ok: false, reason: 'env-too-many (>' + maxEnv + ')' };
  }
  const env = {};
  for (const k of envKeys) {
    if (!_envKeyAllowed(k, limits)) {
      return { ok: false, reason: 'env-key-not-on-allowlist (' + k + ')' };
    }
    const v = envRaw[k];
    if (typeof v !== 'string') return { ok: false, reason: 'env-value-not-string (' + k + ')' };
    if (v.length > 1000) return { ok: false, reason: 'env-value-too-long (' + k + ')' };
    env[k] = v;
  }

  // ports (optional)
  const portsRaw = spec.ports == null ? [] : spec.ports;
  if (!Array.isArray(portsRaw)) return { ok: false, reason: 'ports-not-array' };
  if (portsRaw.length > 20) return { ok: false, reason: 'ports-too-many (>20)' };
  const ports = [];
  for (const p of portsRaw) {
    const n = Number(p);
    if (!Number.isInteger(n) || n < 1024 || n > 65535) {
      return { ok: false, reason: 'port-invalid (' + p + ')' };
    }
    ports.push(n);
  }

  // ttl_minutes (optional, defaults to defaultTtlMinutes; capped at maxTtlMinutes)
  const maxTtl = Math.max(1, Number(limits.maxTtlMinutes) || 1440);
  const defaultTtl = Math.max(1, Number(limits.defaultTtlMinutes) || 240);
  let ttlMinutes;
  if (spec.ttl_minutes == null) {
    ttlMinutes = defaultTtl;
  } else {
    const n = Number(spec.ttl_minutes);
    if (!Number.isFinite(n) || n <= 0) return { ok: false, reason: 'ttl-invalid' };
    if (n > maxTtl) return { ok: false, reason: 'ttl-too-long (>' + maxTtl + 'min)' };
    // Floor to whole minutes but never below 1. A fractional request such as
    // 0.5 would otherwise floor to 0, which _toStateRecord treats as "unset"
    // and replaces with the (much longer) default TTL.
    ttlMinutes = Math.max(1, Math.floor(n));
  }

  // attrs (optional, opaque to the engine but capped at maxAttrsBytes serialized)
  const attrsRaw = spec.attrs == null ? {} : spec.attrs;
  if (typeof attrsRaw !== 'object' || Array.isArray(attrsRaw)) {
    return { ok: false, reason: 'attrs-not-object' };
  }
  let attrsSerialized;
  try {
    attrsSerialized = JSON.stringify(attrsRaw);
  } catch (e) {
    return { ok: false, reason: 'attrs-not-serializable (' + e.message + ')' };
  }
  const maxAttrsBytes = Math.max(64, Number(limits.maxAttrsBytes) || 2048);
  if (Buffer.byteLength(attrsSerialized, 'utf8') > maxAttrsBytes) {
    return { ok: false, reason: 'attrs-too-large (>' + maxAttrsBytes + 'B)' };
  }

  // healthcheck (required)
  const hc = _validateHealthcheck(spec.healthcheck, limits);
  if (!hc.ok) return { ok: false, reason: hc.reason };

  return {
    ok: true,
    value: {
      name: spec.name,
      cmd: spec.cmd,
      args: args,
      cwd: typeof spec.cwd === 'string' ? spec.cwd : '',
      env: env,
      ports: ports,
      ttl_minutes: ttlMinutes,
      // Round-trip through JSON so the stored attrs are a detached plain copy.
      attrs: JSON.parse(attrsSerialized),
      healthcheck: hc.value,
    },
  };
}
|
|
311
|
+
|
|
312
|
+
/**
 * Validate a parsed managed-spawn sidecar document.
 *
 * The document must be a plain object with a non-empty `specs` array of at
 * most `maxSpecsPerFile` entries (default 5), each passing `_validateSpec`
 * and carrying a unique name. Optional `written_by` / `wi_id` must be
 * strings when present.
 *
 * Failure reasons from a spec are prefixed with `spec[i]: `; workdir
 * rejections keep INVALID_WORKDIR_REASON_PREFIX at the very front so callers
 * can still detect them by prefix.
 *
 * @param {*} parsed Result of JSON.parse on the sidecar file.
 * @param {object} [opts] Forwarded to `_validateSpec`.
 * @returns {{ok: true, value: {specs: object[], written_by: string, wi_id: string}}
 *   | {ok: false, reason: string}}
 */
function validateManagedSpawnRecord(parsed, opts) {
  const options = opts || {};
  const limits = ENGINE_DEFAULTS.managedSpawn || {};
  const specCap = Math.max(1, Number(limits.maxSpecsPerFile) || 5);
  const fail = (reason) => ({ ok: false, reason: reason });

  const isRecord = Boolean(parsed) && typeof parsed === 'object' && !Array.isArray(parsed);
  if (!isRecord) return fail('not-an-object');

  const rawSpecs = parsed.specs;
  if (!Array.isArray(rawSpecs)) return fail('specs-missing');
  if (rawSpecs.length === 0) return fail('specs-empty');
  if (rawSpecs.length > specCap) return fail('specs-too-many (>' + specCap + ')');

  const namesSeen = new Set();
  const accepted = [];
  for (const [position, rawSpec] of rawSpecs.entries()) {
    const label = 'spec[' + position + ']: ';
    const verdict = _validateSpec(rawSpec, position, limits, options);

    if (!verdict.ok) {
      const why = verdict.reason;
      const isWorkdir = typeof why === 'string' && why.startsWith(INVALID_WORKDIR_REASON_PREFIX);
      if (isWorkdir) {
        // Re-order so the workdir prefix stays first and the spec label follows.
        const detail = why.slice(INVALID_WORKDIR_REASON_PREFIX.length);
        return fail(INVALID_WORKDIR_REASON_PREFIX + label + detail);
      }
      return fail(label + why);
    }

    const specName = verdict.value.name;
    if (namesSeen.has(specName)) return fail(label + 'duplicate-name (' + specName + ')');
    namesSeen.add(specName);
    accepted.push(verdict.value);
  }

  const writtenBy = parsed.written_by;
  if (writtenBy != null && typeof writtenBy !== 'string') return fail('written_by-not-string');
  const wiId = parsed.wi_id;
  if (wiId != null && typeof wiId !== 'string') return fail('wi_id-not-string');

  return {
    ok: true,
    value: {
      specs: accepted,
      written_by: typeof writtenBy === 'string' ? writtenBy : '',
      wi_id: typeof wiId === 'string' ? wiId : '',
    },
  };
}
|
|
362
|
+
|
|
363
|
+
/**
 * Read, parse and validate `<agents dir>/<agentId>/managed-spawn.json`.
 *
 * @param {string} agentId Agent folder name.
 * @param {object} [opts] Forwarded to `validateManagedSpawnRecord`.
 * @returns {null | object} `null` when the file cannot be read at all;
 *   otherwise `{agentId, filePath, valid, parsed}` plus `value` (valid) or
 *   `reason` (invalid). `parsed` is `null` when the JSON itself is malformed.
 */
function readManagedSpawnFile(agentId, opts) {
  const options = opts || {};
  const filePath = path.join(_agentsDir(), String(agentId || ''), MANAGED_SPAWN_FILENAME);
  const describe = (rest) => Object.assign({ agentId: agentId, filePath: filePath }, rest);

  let text;
  try {
    text = fs.readFileSync(filePath, 'utf8');
  } catch (_e) {
    // Any read failure is reported as "no record".
    return null;
  }

  let parsed = null;
  try {
    parsed = JSON.parse(text);
  } catch (e) {
    return describe({ valid: false, reason: 'json-parse: ' + e.message, parsed: null });
  }

  const verdict = validateManagedSpawnRecord(parsed, options);
  if (verdict.ok) {
    return describe({ valid: true, value: verdict.value, parsed: parsed });
  }
  return describe({ valid: false, reason: verdict.reason, parsed: parsed });
}
|
|
378
|
+
|
|
379
|
+
/**
 * Summarise whether an agent's managed-spawn sidecar is acceptable.
 *
 * The function only reads the file; acting on the verdict (failing the
 * dispatch, alerting, unlinking the sidecar, killing processes) is left to
 * the caller.
 *
 * Outcomes:
 * - no file (or it vanished between the existence check and the read):
 *   `{exists: false, accepted: false}`
 * - valid file: `{exists: true, accepted: true, record}`
 * - invalid file: `{exists: true, accepted: false, reason, parsedRaw}` with
 *   `isWorkdirRejection` set when the reason starts with
 *   INVALID_WORKDIR_REASON_PREFIX.
 *
 * @param {string} agentId Agent folder name.
 * @param {object} [opts] Forwarded to `readManagedSpawnFile`.
 * @returns {object} Structured acceptance summary; always includes `filePath`.
 */
function evaluateManagedSpawnAcceptance(agentId, opts) {
  const options = opts || {};
  const filePath = path.join(_agentsDir(), String(agentId || ''), MANAGED_SPAWN_FILENAME);
  const absent = () => ({
    exists: false,
    accepted: false,
    isWorkdirRejection: false,
    reason: null,
    record: null,
    filePath: filePath,
  });

  if (!fs.existsSync(filePath)) return absent();

  const rec = readManagedSpawnFile(agentId, options);
  // The file can disappear between existsSync and the read; treat as absent.
  if (!rec) return absent();

  if (rec.valid) {
    return {
      exists: true,
      accepted: true,
      isWorkdirRejection: false,
      reason: null,
      record: rec.value,
      filePath: rec.filePath,
    };
  }

  const reason = rec.reason || 'unknown';
  return {
    exists: true,
    accepted: false,
    isWorkdirRejection: typeof reason === 'string' && reason.startsWith(INVALID_WORKDIR_REASON_PREFIX),
    reason: reason,
    record: null,
    filePath: rec.filePath,
    parsedRaw: rec.parsed || null,
  };
}
|
|
418
|
+
|
|
419
|
+
/**
 * Build the markdown hint block that tells an agent how to write its
 * managed-spawn sidecar.
 *
 * Every cap quoted in the text is derived from ENGINE_DEFAULTS.managedSpawn
 * with the same fallbacks the validator uses (including the name-length and
 * attrs-size caps), so the hint cannot drift from what is enforced.
 *
 * @param {object} [opts]
 * @param {number} [opts.ttlMinutes] TTL shown in the sample; clamped to the
 *   max TTL, falls back to the default TTL when missing or invalid.
 * @param {string} [opts.agentId] Agent id used in the sample path / JSON.
 * @param {string} [opts.workItemId] Work item id used in the sample JSON.
 * @param {string} [opts.minionsDir] Root directory shown in the sample path.
 * @param {number} [opts.dashboardPort] Port for the verify command (default 7331).
 * @returns {string} Newline-joined markdown text.
 */
function buildManagedSpawnHint(opts) {
  opts = opts || {};
  const limits = ENGINE_DEFAULTS.managedSpawn || {};
  const maxTtl = Math.max(1, Number(limits.maxTtlMinutes) || 1440);
  const defaultTtl = Math.max(1, Number(limits.defaultTtlMinutes) || 240);
  const maxSpecs = Math.max(1, Number(limits.maxSpecsPerFile) || 5);
  // Same derivations as _validateSpec, so the advertised caps match enforcement.
  const maxName = Math.max(8, Number(limits.maxNameLength) || 64);
  const maxAttrsBytes = Math.max(64, Number(limits.maxAttrsBytes) || 2048);
  const ttlIn = Number(opts.ttlMinutes);
  const ttl = Number.isFinite(ttlIn) && ttlIn > 0
    ? Math.min(maxTtl, Math.floor(ttlIn))
    : defaultTtl;
  const agentId = opts.agentId || '<your-agent-id>';
  const wiId = opts.workItemId || '<this-work-item-id>';
  const minionsDir = opts.minionsDir || '<minions-dir>';
  const portIn = Number(opts.dashboardPort);
  const dashboardPort = Number.isFinite(portIn) && portIn > 0 ? portIn : 7331;

  const lines = [
    '',
    '',
    '---',
    '',
    '## Engine-managed long-running services (managed_spawn flag)',
    '',
    'This work item permits you to **describe** long-running services in a sidecar file; the engine will **own the spawn** and the lifecycle. Use this instead of hand-rolling detached-spawn pipelines yourself.',
    '',
    'BEFORE you write your completion report, write the sidecar:',
    '',
    ' `' + minionsDir + '/agents/' + agentId + '/managed-spawn.json`',
    '',
    'with this exact JSON shape (one or more specs, max ' + maxSpecs + ' per file):',
    '',
    '```json',
    '{',
    ' "specs": [',
    ' {',
    ' "name": "constellation-host",',
    ' "cmd": "bun",',
    ' "args": ["run", "dev"],',
    ' "cwd": "D:/repos/constellation",',
    ' "env": { "VITE_HOST": "127.0.0.1" },',
    ' "ports": [3001],',
    ' "ttl_minutes": ' + ttl + ',',
    ' "attrs": { "base_url": "http://localhost:3001", "framework": "vite" },',
    ' "healthcheck": {',
    ' "type": "http",',
    ' "url": "http://localhost:3001/health",',
    ' "expect_status": 200,',
    ' "interval_s": 1,',
    ' "timeout_s": 60',
    ' }',
    ' }',
    ' ],',
    ' "written_by": "' + agentId + '",',
    ' "wi_id": "' + wiId + '"',
    '}',
    '```',
    '',
    'Healthcheck types: `http` (GET URL, asserts status code) or `command` (runs a shell command, asserts stdout matches `expect_regex`). For `command`, set `shell: true` and `expect_regex` to a JavaScript regex string.',
    '',
    '### What the engine does for you',
    '',
    '1. Reads your sidecar after you exit.',
    '2. Spawns each spec detached (the working Windows pattern is centralised in the engine — you do **not** need to write `Start-Process` or `spawn({ detached: true })` yourself).',
    '3. Drives the healthcheck loop until each spec passes its first check (within `timeout_s`).',
    '4. **Fails this dispatch (ERROR) if any spec fails its healthcheck.** Surviving siblings stay alive; failing PIDs are killed.',
    '5. Auto-injects a `## Live managed processes` block into downstream agents\' prompts (scoped to your project) so the next dispatch can find the service without you telling it.',
    '6. Sweeps dead PIDs / TTL-expired specs every ' + (limits.sweepEvery || 30) + ' ticks; kills + unlinks at TTL.',
    '',
    '### Caps the engine enforces (validator rejects anything over)',
    '',
    '- Specs per file: ≤ ' + maxSpecs,
    '- Name: kebab-case, ≤ ' + maxName + ' chars, unique within file',
    '- Executable (`cmd` and any `command` healthcheck cmd): on the engine\'s allowlist (node, bun, npm, npx, python, docker, adb, gradle, mvn, pwsh, …)',
    '- Env keys: on the engine\'s allowlist or matching a known prefix (e.g. `VITE_`, `NEXT_`, `REACT_APP_`, `npm_config_`)',
    '- Ports: 1024–65535, ≤ 20 per spec',
    '- TTL: ≤ ' + maxTtl + ' minutes (hard cap), defaults to ' + defaultTtl + ' if omitted',
    '- `attrs` serialized: ≤ ' + maxAttrsBytes + ' bytes (opaque blob the engine surfaces to downstream agents)',
    '',
    'If your file is invalid the engine marks this dispatch ERROR with `failure_class: invalid-managed-spawn` (non-retryable) — fix the file shape, don\'t retry blindly.',
    '',
    '### Verify before exit',
    '',
    'After you write the file, query the engine to confirm acceptance:',
    '',
    ' curl -s http://localhost:' + dashboardPort + '/api/managed-processes',
    '',
    'Each of your specs should appear with `healthy: true` once the engine\'s healthcheck loop fires (this happens after your agent process exits — the engine drives it). You don\'t need to wait for `healthy: true` yourself; just confirm the file is valid by re-reading it locally.',
    '',
  ];
  return lines.join('\n');
}
|
|
510
|
+
|
|
511
|
+
// ─── Item 2 (P-2d5e8f04) ─────────────────────────────────────────────────────
|
|
512
|
+
//
|
|
513
|
+
// Engine-side spawn + locked state file. The engine calls these from its
|
|
514
|
+
// onAgentClose handler after the managed-spawn sidecar is accepted by
|
|
515
|
+
// evaluateManagedSpawnAcceptance above:
|
|
516
|
+
//
|
|
517
|
+
// 1. openManagedLog(name) → opens an append fd under MINIONS_DIR/
|
|
518
|
+
// engine/managed-logs/<name>.log. Child
|
|
519
|
+
// stdio is wired to this fd (NOT a pipe)
|
|
520
|
+
// so the detached process survives our
|
|
521
|
+
// exit on Windows.
|
|
522
|
+
// 2. spawnManagedSpec(spec, ctx) → uses the proven bin/minions.js
|
|
523
|
+
// spawnDashboard detached-spawn pattern.
|
|
524
|
+
// Returns {pid, started_at, log_path}.
|
|
525
|
+
// 3. recordManagedSpec(spec, → writes one entry to
|
|
526
|
+
// runtime, ctx) engine/managed-processes.json via
|
|
527
|
+
// mutateJsonFileLocked. Replaces any
|
|
528
|
+
// existing entry with the same name
|
|
529
|
+
// (idempotent under retry).
|
|
530
|
+
// 4. recordManagedBatch(items, → same but one lock for N specs (the
|
|
531
|
+
// ctx) close-handler call site spawns each
|
|
532
|
+
// then persists them together).
|
|
533
|
+
// 5. removeManagedSpec(name) → locked unlink of the entry; best-effort
|
|
534
|
+
// process.kill of the recorded PID
|
|
535
|
+
// OUTSIDE the lock callback. No-op when
|
|
536
|
+
// the entry is missing.
|
|
537
|
+
// 6. listManagedSpecs({project}) → reads the state file, optionally
|
|
538
|
+
// filters by owner_project. Used later
|
|
539
|
+
// by item 4 (dashboard) + item 6
|
|
540
|
+
// (playbook injection) + items 3/7
|
|
541
|
+
// (sweep + boot reconcile).
|
|
542
|
+
//
|
|
543
|
+
// All state writes go through mutateJsonFileLocked per the repo convention
|
|
544
|
+
// ('Key conventions' in copilot-instructions.md). Callbacks stay synchronous
|
|
545
|
+
// and fast — no kill / no spawn inside the lock callback. Healthcheck loops,
|
|
546
|
+
// dispatch ERROR gating on healthcheck failure, per-tick sweep, boot
|
|
547
|
+
// reconcile, and dashboard endpoints are deferred to items 3/4/5/7.
|
|
548
|
+
|
|
549
|
+
/**
 * Path of the managed-processes state file:
 * `<MINIONS_DIR>/engine/<MANAGED_PROCESSES_STATE_FILE>`.
 *
 * @returns {string}
 */
function _getStatePath() {
  const engineDir = path.join(shared.MINIONS_DIR, 'engine');
  return path.join(engineDir, MANAGED_PROCESSES_STATE_FILE);
}
|
|
552
|
+
|
|
553
|
+
/**
 * Directory that holds per-service log files:
 * `<MINIONS_DIR>/engine/<MANAGED_LOGS_DIR>`.
 *
 * @returns {string}
 */
function _getLogsDir() {
  const engineDir = path.join(shared.MINIONS_DIR, 'engine');
  return path.join(engineDir, MANAGED_LOGS_DIR);
}
|
|
556
|
+
|
|
557
|
+
/**
 * Open the log file for a managed service via `shared.openAppendLogFd`,
 * using `<name>.log` inside the managed-logs directory.
 *
 * The name is re-validated as kebab-case here because it becomes part of a
 * file name.
 *
 * @param {string} name Kebab-case service name.
 * @returns {{fd: *, logPath: *}} The `fd` and `logPath` reported by
 *   `shared.openAppendLogFd`.
 * @throws {Error} When `name` is not a non-empty kebab-case string.
 */
function openManagedLog(name) {
  const nameOk = typeof name === 'string' && name.length > 0 && _KEBAB_RE.test(name);
  if (!nameOk) {
    throw new Error('openManagedLog: name must be a non-empty kebab-case string');
  }
  // P-8a4d6f29 — log rotation on open is delegated to shared.openAppendLogFd
  // so this module and bin/minions.js share one implementation.
  const logsDir = _getLogsDir();
  const opened = shared.openAppendLogFd(name + '.log', logsDir);
  return { fd: opened.fd, logPath: opened.logPath };
}
|
|
569
|
+
|
|
570
|
+
// Host environment variables copied into every managed child when the spec
// does not set them itself. Apart from PATH, nothing else from the host
// environment is forwarded: a spec must declare what it needs in spec.env.
const _WIN_ESSENTIAL_ENV_KEYS = [
  'SYSTEMROOT', 'SYSTEMDRIVE', 'WINDIR',
  'USERPROFILE', 'APPDATA', 'LOCALAPPDATA',
  'TEMP', 'TMP', 'HOMEDRIVE', 'HOMEPATH', 'HOME',
  'PROCESSOR_ARCHITECTURE', 'PATHEXT', 'COMSPEC', 'OS',
];

/**
 * Build the environment for a managed child process.
 *
 * Starts from a shallow copy of the spec's env, then fills in PATH (when the
 * spec's PATH is missing or empty) and each key in _WIN_ESSENTIAL_ENV_KEYS
 * (when the spec leaves it null/undefined) from `process.env`.
 *
 * @param {object} [specEnv] Env map from the validated spec.
 * @returns {object} A new env object; `specEnv` is not modified.
 */
function _buildChildEnv(specEnv) {
  const childEnv = Object.assign({}, specEnv || {});
  const hostEnv = process.env;

  if (!childEnv.PATH && hostEnv.PATH) {
    childEnv.PATH = hostEnv.PATH;
  }

  _WIN_ESSENTIAL_ENV_KEYS
    .filter((key) => childEnv[key] == null && hostEnv[key] != null)
    .forEach((key) => {
      childEnv[key] = hostEnv[key];
    });

  return childEnv;
}
|
|
590
|
+
|
|
591
|
+
/**
 * Spawn one managed service as a detached child whose stdout/stderr go to its
 * own log file.
 *
 * @param {object} spec Validated spec (`name`, `cmd`, optional `args`, `cwd`,
 *   `env`).
 * @param {object} [ctx] Ownership context; `owner_project` / `owner_wi` are
 *   only used for the log line.
 * @returns {{pid: number, started_at: number, log_path: *}} Child pid, spawn
 *   timestamp (ms since epoch) and the log path reported by `openManagedLog`.
 * @throws {Error} When the spec is malformed, the executable is not on the
 *   allowlist, the log cannot be opened, or the spawn yields no pid.
 */
function spawnManagedSpec(spec, ctx) {
  if (!spec || typeof spec !== 'object') throw new Error('spawnManagedSpec: spec required');
  if (typeof spec.cmd !== 'string' || spec.cmd.length === 0) {
    throw new Error('spawnManagedSpec: spec.cmd required');
  }
  const limits = ENGINE_DEFAULTS.managedSpawn || {};
  // Defensive re-check of the executable allowlist. The validator gates this
  // at sidecar-read time too; this guards future direct callers and tests.
  if (!_isOnAllowlist(spec.cmd, limits.executableAllowlist)) {
    throw new Error('spawnManagedSpec: cmd not-on-allowlist (' + spec.cmd + ')');
  }
  ctx = ctx || {};
  const { fd: logFd, logPath } = openManagedLog(spec.name);
  const cwd = (typeof spec.cwd === 'string' && spec.cwd.length > 0) ? spec.cwd : undefined;
  const env = _buildChildEnv(spec.env);
  const argv = Array.isArray(spec.args) ? spec.args : [];

  let child;
  try {
    // Working Windows-correct detached spawn — same shape as
    // bin/minions.js spawnDashboard. DO NOT switch stdio to 'pipe' or
    // 'inherit': pipes die on EPIPE once the parent exits.
    child = spawn(spec.cmd, argv, {
      cwd: cwd,
      env: env,
      detached: true,
      stdio: ['ignore', logFd, logFd],
      windowsHide: true,
    });
  } finally {
    // Close our copy of the fd in the parent whether or not spawn threw —
    // the child holds its own dup.
    try { fs.closeSync(logFd); } catch (_e) {}
  }

  // Spawn failures such as ENOENT (missing binary, bad cwd) are delivered
  // asynchronously as an 'error' event on the child. Without a listener that
  // event is rethrown as an uncaught exception in the engine process, so
  // attach one before returning to the event loop.
  if (child) {
    child.on('error', (err) => {
      log('warn', 'managed-spawn error: name=' + spec.name + ' cmd=' + spec.cmd
        + ' error=' + (err && err.message ? err.message : String(err)));
    });
  }

  if (!child || !child.pid) {
    throw new Error('spawnManagedSpec: spawn failed for ' + spec.cmd);
  }
  // Let the engine's event loop exit without waiting for the child.
  child.unref();
  const startedAt = Date.now();
  log('info', 'managed-spawn born: name=' + spec.name + ' pid=' + child.pid
    + ' owner_project=' + (ctx.owner_project || '')
    + ' owner_wi=' + (ctx.owner_wi || ''));
  return { pid: child.pid, started_at: startedAt, log_path: logPath };
}
|
|
635
|
+
|
|
636
|
+
/**
 * Create the empty managed-spawn state document.
 *
 * @returns {{specs: Array}} A new object with an empty `specs` list; every
 *   call returns a distinct object and array.
 */
function _initialStateShape() {
  const emptyState = { specs: [] };
  return emptyState;
}
|
|
639
|
+
|
|
640
|
+
/**
 * Project a sidecar spec plus its runtime facts into the row persisted in the
 * state file.
 *
 * @param {object} spec - Sidecar spec (`name`, `cmd`, optional `args`, `cwd`,
 *   `env`, `ports`, `attrs`, `healthcheck`, `ttl_minutes`).
 * @param {object} [runtime] - `pid`, `started_at`, `log_path` from the spawn.
 * @param {object} [ctx] - `owner_agent`, `owner_wi`, `owner_project`.
 * @returns {object} A new record. Arrays and `env` are shallow-copied; `attrs`
 *   and `healthcheck` are copied through a JSON round-trip. `healthy` starts
 *   false, `alive` starts true, `last_health_at` starts null.
 */
function _toStateRecord(spec, runtime, ctx) {
  const cfg = ENGINE_DEFAULTS.managedSpawn || {};
  const fallbackTtlMin = Math.max(1, Number(cfg.defaultTtlMinutes) || 240);
  const specHasTtl = Number.isFinite(spec.ttl_minutes) && spec.ttl_minutes > 0;
  const ttlMinutes = specHasTtl ? spec.ttl_minutes : fallbackTtlMin;

  // Missing runtime/ctx behave like empty objects for every lookup below.
  const rt = runtime || {};
  const owner = ctx || {};
  const startedAt = Number.isFinite(rt.started_at) ? rt.started_at : Date.now();

  const isObject = (value) => Boolean(value) && typeof value === 'object';
  const jsonClone = (value) => JSON.parse(JSON.stringify(value));
  const copyList = (value) => (Array.isArray(value) ? value.slice() : []);

  // Key order is kept stable because the record is serialised to JSON.
  return {
    name: spec.name,
    pid: Number.isInteger(rt.pid) ? rt.pid : null,
    owner_agent: owner.owner_agent || '',
    owner_wi: owner.owner_wi || '',
    owner_project: owner.owner_project || '',
    cmd: spec.cmd,
    args: copyList(spec.args),
    cwd: typeof spec.cwd === 'string' ? spec.cwd : '',
    env: Object.assign({}, spec.env || {}),
    ports: copyList(spec.ports),
    attrs: isObject(spec.attrs) ? jsonClone(spec.attrs) : {},
    healthcheck: isObject(spec.healthcheck) ? jsonClone(spec.healthcheck) : null,
    started_at: startedAt,
    ttl_expires_at: startedAt + (ttlMinutes * 60 * 1000),
    last_health_at: null,
    healthy: false,
    alive: true,
    log_path: rt.log_path || '',
  };
}
|
|
669
|
+
|
|
670
|
+
/**
 * Insert or replace (by `name`) the state row for one spec in a single locked
 * write of the state file.
 *
 * @param {object} spec - Spec to persist; `spec.name` is required.
 * @param {object} [runtime] - Runtime facts forwarded to `_toStateRecord`.
 * @param {object} [ctx] - Ownership context forwarded to `_toStateRecord`.
 * @throws {Error} When `spec` or `spec.name` is missing.
 */
function recordManagedSpec(spec, runtime, ctx) {
  if (!spec || !spec.name) throw new Error('recordManagedSpec: spec.name required');

  const upsert = (state) => {
    const usable = state && typeof state === 'object'
      && !Array.isArray(state) && Array.isArray(state.specs);
    // An unusable document is replaced wholesale with a fresh empty one.
    const doc = usable ? state : _initialStateShape();
    const at = doc.specs.findIndex((row) => row && row.name === spec.name);
    const fresh = _toStateRecord(spec, runtime, ctx);
    if (at < 0) doc.specs.push(fresh);
    else doc.specs[at] = fresh;
    return doc;
  };

  shared.mutateJsonFileLocked(_getStatePath(), upsert, { defaultValue: _initialStateShape() });
}
|
|
684
|
+
|
|
685
|
+
/**
 * Insert or replace (by `name`) the state rows for several specs in ONE
 * locked write of the state file.
 *
 * @param {Array<{spec: object, runtime: object}>} items - Entries to persist.
 *   Entries without a `spec.name` are skipped. A non-array or empty list is a
 *   no-op (the state file is not touched).
 * @param {object} [ctx] - Ownership context applied to every entry.
 */
function recordManagedBatch(items, ctx) {
  const hasWork = Array.isArray(items) && items.length > 0;
  if (!hasWork) return;

  const upsertAll = (state) => {
    const usable = state && typeof state === 'object'
      && !Array.isArray(state) && Array.isArray(state.specs);
    const doc = usable ? state : _initialStateShape();
    items.forEach((entry) => {
      const spec = entry && entry.spec;
      const runtime = entry && entry.runtime;
      if (!spec || !spec.name) return;
      const at = doc.specs.findIndex((row) => row && row.name === spec.name);
      const fresh = _toStateRecord(spec, runtime, ctx);
      if (at < 0) doc.specs.push(fresh);
      else doc.specs[at] = fresh;
    });
    return doc;
  };

  shared.mutateJsonFileLocked(_getStatePath(), upsertAll, { defaultValue: _initialStateShape() });
}
|
|
704
|
+
|
|
705
|
+
/**
 * Drop the named spec from the state file and, if it recorded a positive
 * integer pid, kill that process afterwards.
 *
 * @param {string} name - Spec name; a non-string or empty value is a no-op.
 */
function removeManagedSpec(name) {
  if (typeof name !== 'string' || name.length === 0) return;

  let pidToKill = null;
  shared.mutateJsonFileLocked(_getStatePath(), (state) => {
    if (!state || !Array.isArray(state.specs)) return state;
    const at = state.specs.findIndex((row) => row && row.name === name);
    if (at >= 0) {
      const removed = state.specs.splice(at, 1)[0];
      if (removed && Number.isInteger(removed.pid) && removed.pid > 0) pidToKill = removed.pid;
    }
    return state;
  }, { defaultValue: _initialStateShape() });

  if (pidToKill === null) return;
  // Kill OUTSIDE the lock — never run process ops inside a lock callback
  // (copilot-instructions.md "Keep lock callbacks synchronous and fast").
  try {
    shared.killByPidImmediate(pidToKill);
  } catch (e) {
    log('warn', 'managed-spawn removeManagedSpec: kill ' + pidToKill + ' failed: ' + e.message);
  }
}
|
|
725
|
+
|
|
726
|
+
/**
 * Read the state file (without taking the lock) and return its spec rows.
 *
 * @param {object} [opts]
 * @param {string} [opts.project] - When truthy, only rows whose
 *   `owner_project` equals it are returned.
 * @returns {Array} A new array. Empty when the file cannot be read, is not
 *   valid JSON, or has no `specs` array. Without a project filter the rows
 *   are returned as stored, including any null entries.
 */
function listManagedSpecs(opts) {
  const wantedProject = (opts || {}).project;
  const statePath = _getStatePath();

  let doc;
  try {
    doc = JSON.parse(fs.readFileSync(statePath, 'utf8'));
  } catch (_e) {
    // Unreadable and unparsable files are both treated as "no specs".
    return [];
  }

  const rows = (doc && Array.isArray(doc.specs)) ? doc.specs : [];
  if (!wantedProject) return rows.slice();
  return rows.filter((row) => row && row.owner_project === wantedProject);
}
|
|
739
|
+
|
|
740
|
+
// ─── Item 3 (P-9c1f47a6) ─────────────────────────────────────────────────────
|
|
741
|
+
//
|
|
742
|
+
// Healthcheck implementations + dispatch SUCCESS/ERROR gate.
|
|
743
|
+
//
|
|
744
|
+
// runHealthcheck(spec) → fires one probe (http or command), resolves
|
|
745
|
+
// {healthy, error, lastCheckAt}. Pure — no
|
|
746
|
+
// state mutation, no PID kills. Used by both
|
|
747
|
+
// waitForFirstHealth and the per-spec
|
|
748
|
+
// liveness loop a future item wires in.
|
|
749
|
+
// waitForFirstHealth(spec,opts)→ self-scheduled async loop. Polls
|
|
750
|
+
// runHealthcheck every spec.healthcheck
|
|
751
|
+
// .interval_s. On first healthy → flips the
|
|
752
|
+
// state file's `healthy: true` +
|
|
753
|
+
// `last_health_at` (locked write) and
|
|
754
|
+
// resolves {healthy:true}. On
|
|
755
|
+
// spec.healthcheck.timeout_s elapsed
|
|
756
|
+
// without a pass → resolves
|
|
757
|
+
// {healthy:false, error:'timeout: ...'}
|
|
758
|
+
// without throwing (caller decides what to
|
|
759
|
+
// do with the failure result — the engine close-
|
|
760
|
+
// handler maps it to dispatch ERROR with
|
|
761
|
+
// FAILURE_CLASS.MANAGED_SPAWN_HEALTHCHECK_FAILED).
|
|
762
|
+
// tailManagedLog(name, lines) → reads the named log
|
|
763
|
+
// and returns the last `lines` lines joined.
|
|
764
|
+
// Used by the engine close-handler to attach
|
|
765
|
+
// log evidence to inbox alerts on
|
|
766
|
+
// healthcheck failure.
|
|
767
|
+
//
|
|
768
|
+
// Per the plan, healthcheck loops are PER-SPEC and self-scheduled — NEVER
|
|
769
|
+
// driven from the tick cycle. The tick coupling regression was the original
|
|
770
|
+
// design constraint that drove the entire architecture choice.
|
|
771
|
+
// (No state writes inside the lock callback except `healthy` + `last_health_at`
|
|
772
|
+
// at first-pass; idle liveness updates batch every healthBackoffSec and land
|
|
773
|
+
// in a follow-up item.)
|
|
774
|
+
|
|
775
|
+
// Hard ceiling, in milliseconds, for one healthcheck probe. runHealthcheck
// clamps the per-probe timeout to this value for both the http and the
// command probe types, despite the CMD in the name.
const _HC_CMD_TIMEOUT_MS = 5000; // hard ceiling for one healthcheck probe
|
|
776
|
+
|
|
777
|
+
/**
 * Fire one HTTP(S) GET and report whether the response status matches.
 *
 * The promise always resolves (never rejects). Besides the socket-inactivity
 * timeout passed to the request, a wall-clock deadline of `timeoutMs` bounds
 * the whole probe, so a response that keeps trickling bytes cannot hold the
 * probe open indefinitely.
 *
 * @param {string} url - Target URL; `https:` URLs use the https client.
 * @param {number} expectStatus - Status code that counts as healthy.
 * @param {number} timeoutMs - Inactivity timeout and overall deadline.
 * @returns {Promise<{healthy: boolean, error: (string|null)}>}
 */
function _httpProbe(url, expectStatus, timeoutMs) {
  return new Promise((resolve) => {
    let settled = false;
    let deadlineTimer = null;
    let req;
    const finish = (result) => {
      if (settled) return;
      settled = true;
      if (deadlineTimer !== null) clearTimeout(deadlineTimer);
      resolve(result);
    };
    // Shared by the socket 'timeout' event and the wall-clock deadline.
    const onTimeout = () => {
      try { if (req) req.destroy(new Error('timeout')); } catch (_e) {}
      finish({ healthy: false, error: 'http request timeout after ' + timeoutMs + 'ms' });
    };
    try {
      const client = url.startsWith('https:') ? https : http;
      req = client.get(url, { timeout: timeoutMs }, (res) => {
        // Consume the body so the socket can close even on no listeners.
        res.resume();
        res.on('end', () => {
          if (res.statusCode === expectStatus) finish({ healthy: true, error: null });
          else finish({ healthy: false, error: 'http status ' + res.statusCode + ' (expected ' + expectStatus + ')' });
        });
        res.on('error', (e) => finish({ healthy: false, error: 'http response error: ' + e.message }));
      });
      req.on('error', (e) => finish({ healthy: false, error: 'http request error: ' + e.message }));
      req.on('timeout', onTimeout);
      // The `timeout` option only fires after socket inactivity; enforce the
      // ceiling on total probe duration separately.
      if (Number.isFinite(timeoutMs) && timeoutMs > 0) {
        deadlineTimer = setTimeout(onTimeout, timeoutMs);
      }
    } catch (e) {
      finish({ healthy: false, error: 'http probe threw: ' + e.message });
    }
  });
}
|
|
803
|
+
|
|
804
|
+
/**
 * Run one healthcheck command and match its stdout against a regex.
 *
 * The promise always resolves (never rejects): an invalid regex, a
 * synchronous failure to launch the command, a timeout, a non-zero exit and a
 * stdout mismatch all resolve `{healthy: false, error}`.
 *
 * @param {string} cmd - Command line handed to `exec`.
 * @param {boolean} useShell - Forwarded into the `shell` option (see note).
 * @param {string} expectRegex - Pattern source tested against stdout.
 * @param {number} timeoutMs - Per-probe timeout passed to `exec`.
 * @returns {Promise<{healthy: boolean, error: (string|null)}>}
 */
function _commandProbe(cmd, useShell, expectRegex, timeoutMs) {
  return new Promise((resolve) => {
    let regex;
    try { regex = new RegExp(expectRegex); }
    catch (e) { return resolve({ healthy: false, error: 'expect_regex invalid: ' + e.message }); }
    try {
      exec(cmd, {
        timeout: timeoutMs,
        // NOTE(review): per the Node.js docs, exec() always runs the command
        // through a shell and its `shell` option only selects WHICH shell (a
        // string). Passing `false` therefore does not appear to disable the
        // shell, so `useShell` likely has no effect here — confirm, and move
        // to execFile with an argv if shell-less execution is required.
        shell: useShell ? undefined : false,
        windowsHide: true,
        maxBuffer: 1024 * 1024,
      }, (err, stdout, stderr) => {
        if (err && err.killed) {
          return resolve({ healthy: false, error: 'healthcheck command timeout after ' + timeoutMs + 'ms' });
        }
        if (err) {
          // Non-zero exit. The plan says match against stdout, so a non-zero
          // exit with matching stdout is still "unhealthy" because the
          // process errored — treat command errors as unhealthy with the
          // stderr tail for diagnostics.
          const tail = (stderr || '').trim().slice(-200);
          return resolve({ healthy: false, error: 'healthcheck command exit ' + (err.code != null ? err.code : '?') + (tail ? ': ' + tail : '') });
        }
        const out = String(stdout || '');
        if (regex.test(out)) return resolve({ healthy: true, error: null });
        return resolve({ healthy: false, error: 'healthcheck stdout did not match regex /' + expectRegex + '/' });
      });
    } catch (e) {
      // exec() validates its arguments synchronously (e.g. a non-string
      // command). Report that as an unhealthy probe instead of letting the
      // promise reject.
      resolve({ healthy: false, error: 'healthcheck command threw: ' + (e && e.message ? e.message : String(e)) });
    }
  });
}
|
|
832
|
+
|
|
833
|
+
/**
 * Fire a single healthcheck probe for a spec.
 *
 * @param {object} spec - Must carry a `healthcheck` object whose `type` is
 *   `'http'` (uses `url`, `expect_status`) or `'command'` (uses `cmd`,
 *   `shell`, `expect_regex`).
 * @returns {Promise<{healthy: boolean, error: (string|null), lastCheckAt: number}>}
 *   The probe outcome stamped with the time it completed. A missing
 *   healthcheck or an unknown type yields `healthy: false`.
 */
async function runHealthcheck(spec) {
  const hc = spec && spec.healthcheck;
  if (!hc || typeof hc !== 'object') {
    return { healthy: false, error: 'spec.healthcheck missing', lastCheckAt: Date.now() };
  }

  // Per-probe timeout: timeout_s (default 5s) clamped to [500ms, ceiling].
  const requestedMs = (Number(hc.timeout_s) || 5) * 1000;
  const probeTimeoutMs = Math.min(_HC_CMD_TIMEOUT_MS, Math.max(500, requestedMs));

  let outcome;
  switch (hc.type) {
    case 'http':
      outcome = await _httpProbe(hc.url, Number(hc.expect_status), probeTimeoutMs);
      break;
    case 'command':
      outcome = await _commandProbe(hc.cmd, !!hc.shell, hc.expect_regex, probeTimeoutMs);
      break;
    default:
      outcome = { healthy: false, error: 'healthcheck type unknown: ' + hc.type };
  }
  outcome.lastCheckAt = Date.now();
  return outcome;
}
|
|
850
|
+
|
|
851
|
+
// Record a passing healthcheck for `name`: sets healthy=true and
// last_health_at=now on the matching state row in a single locked write. Used
// by waitForFirstHealth on the first pass. Callers MUST be outside any other
// lock — this acquires its own. An unusable state document or an unknown name
// is written back unchanged.
function _markHealthy(name, now) {
  const stampHealthy = (state) => {
    if (state && Array.isArray(state.specs)) {
      const row = state.specs.find((s) => s && s.name === name);
      if (row) {
        row.healthy = true;
        row.last_health_at = now;
      }
    }
    return state;
  };
  shared.mutateJsonFileLocked(_getStatePath(), stampHealthy, { defaultValue: _initialStateShape() });
}
|
|
865
|
+
|
|
866
|
+
/**
 * Poll a spec's healthcheck until it first passes or the deadline elapses.
 *
 * The first probe fires immediately; later probes are scheduled
 * `healthcheck.interval_s` seconds apart (default 1s, minimum 100ms). The
 * deadline is `healthcheck.timeout_s` seconds (default 30s, never shorter than
 * one interval) and is checked after each failed probe. On the first pass the
 * state row is marked healthy via `_markHealthy`; a failure of that write is
 * logged and does not change the result.
 *
 * The returned promise always resolves. A probe that throws or rejects is
 * treated as a failed probe so the loop still ends at the deadline instead of
 * leaving the promise pending behind an unhandled rejection.
 *
 * @param {object} spec - Spec with `name` and `healthcheck`.
 * @param {object} [opts] - Accepted for interface compatibility; not read.
 * @returns {Promise<{healthy: boolean, error: (string|null), lastCheckAt: number}>}
 */
function waitForFirstHealth(spec, opts) {
  const intervalMs = Math.max(100, (Number(spec.healthcheck && spec.healthcheck.interval_s) || 1) * 1000);
  const timeoutMs = Math.max(intervalMs, (Number(spec.healthcheck && spec.healthcheck.timeout_s) || 30) * 1000);
  const deadline = Date.now() + timeoutMs;
  return new Promise((resolve) => {
    let stopped = false;
    let lastError = null;
    const tick = async () => {
      if (stopped) return;
      let result;
      try {
        result = await runHealthcheck(spec);
      } catch (e) {
        // Fold an unexpected probe failure into the normal "unhealthy" path.
        result = {
          healthy: false,
          error: 'healthcheck probe threw: ' + (e && e.message ? e.message : String(e)),
          lastCheckAt: Date.now(),
        };
      }
      if (stopped) return;
      if (result.healthy) {
        stopped = true;
        try { _markHealthy(spec.name, result.lastCheckAt); }
        catch (e) { log('warn', 'managed-spawn waitForFirstHealth: state write failed for ' + spec.name + ': ' + e.message); }
        return resolve({ healthy: true, error: null, lastCheckAt: result.lastCheckAt });
      }
      lastError = result.error;
      if (Date.now() >= deadline) {
        stopped = true;
        return resolve({
          healthy: false,
          error: 'timeout: spec ' + spec.name + ' did not become healthy within ' + Math.round(timeoutMs / 1000) + 's (last: ' + (lastError || 'no probes ran') + ')',
          lastCheckAt: result.lastCheckAt,
        });
      }
      setTimeout(tick, intervalMs);
    };
    // First probe fires immediately so a fast service doesn't pay an
    // interval_s delay.
    tick();
  });
}
|
|
900
|
+
|
|
901
|
+
/**
 * Return the last `lines` lines of the named managed-process log.
 *
 * The file is read backwards in fixed-size chunks until enough newlines have
 * been seen, so only the tail of a large log is loaded into memory. A single
 * trailing newline is not counted as an extra (empty) line, so `lines = 1`
 * returns the last written line rather than an empty string.
 *
 * @param {string} name - Spec name; the log is `<logs dir>/<name>.log`.
 * @param {number} [lines=100] - Lines wanted, clamped to 1..1000.
 * @returns {string} The tail joined with `\n` (CRLF is normalised), or `''`
 *   when the log cannot be opened or read.
 */
function tailManagedLog(name, lines) {
  const linesN = Math.max(1, Math.min(1000, Number(lines) || 100));
  const logPath = path.join(_getLogsDir(), name + '.log');
  const CHUNK_BYTES = 64 * 1024;
  let fd;
  try { fd = fs.openSync(logPath, 'r'); }
  catch (_e) { return ''; }
  try {
    let pos = fs.fstatSync(fd).size;
    const chunks = [];
    let newlines = 0;
    // More than linesN newlines guarantees the window holds linesN complete
    // lines even when the file ends with a newline; the partial first line
    // (and any multi-byte character cut at the window start) is discarded by
    // the final slice.
    while (pos > 0 && newlines <= linesN) {
      const len = Math.min(CHUNK_BYTES, pos);
      pos -= len;
      const buf = Buffer.allocUnsafe(len);
      const got = fs.readSync(fd, buf, 0, len, pos);
      const part = buf.subarray(0, got);
      for (let i = 0; i < part.length; i++) {
        if (part[i] === 0x0a) newlines++;
      }
      chunks.unshift(part);
    }
    const arr = Buffer.concat(chunks).toString('utf8').split(/\r?\n/);
    // A file ending in a newline splits into a final empty element.
    if (arr.length > 0 && arr[arr.length - 1] === '') arr.pop();
    return arr.slice(-linesN).join('\n');
  } catch (_e) {
    return '';
  } finally {
    try { fs.closeSync(fd); } catch (_e) {}
  }
}
|
|
910
|
+
|
|
911
|
+
// ─── Item 6 (P-1f9c3a45) ─────────────────────────────────────────────────────
|
|
912
|
+
//
|
|
913
|
+
// Playbook auto-inject of the live-managed-processes block. Called by
|
|
914
|
+
// engine/playbook.js renderPlaybook (project-scoped, computed once per render).
|
|
915
|
+
// Filters listManagedSpecs() to specs where:
|
|
916
|
+
// - owner_project === opts.project
|
|
917
|
+
// - healthy === true
|
|
918
|
+
// - alive === true
|
|
919
|
+
//
|
|
920
|
+
// Returns '' when nothing matches — playbook caller can append unconditionally.
|
|
921
|
+
// Serialized payload is capped at ENGINE_DEFAULTS.managedSpawn.promptContextMaxBytes
|
|
922
|
+
// (default 2048). When the full block exceeds the cap, falls back to a compact
|
|
923
|
+
// list (name + base_url + ports), keeping the dashboard endpoint footer so the
|
|
924
|
+
// dispatched agent can still discover the rest via /api/managed-processes.
|
|
925
|
+
//
|
|
926
|
+
// Project arg is required — never inject "all specs" to avoid cross-project
|
|
927
|
+
// port/URL leakage (see plan §"Auto-injected prompt context could leak…").
|
|
928
|
+
/**
 * Render the markdown "live managed processes" block for one project.
 *
 * Only specs owned by `opts.project` that are both `healthy === true` and
 * `alive === true` are listed. The detailed rendering is returned when its
 * UTF-8 size is within the byte cap (`promptContextMaxBytes`, default 2048,
 * minimum 256); otherwise a compact one-line-per-service list is returned.
 * Note the compact list is returned as-is and is not itself checked against
 * the cap.
 *
 * @param {object} [opts]
 * @param {string} opts.project - Required project name; anything else → ''.
 * @param {number} [opts.dashboardPort=7331] - Port used in the footer URL.
 * @returns {string} The block, or '' when there is nothing to inject or the
 *   state could not be listed.
 */
function buildLiveManagedProcessesBlock(opts) {
  const options = opts || {};
  const project = typeof options.project === 'string' ? options.project : '';
  if (!project) return '';

  let live;
  try {
    live = listManagedSpecs({ project: project });
  } catch (_e) {
    return '';
  }
  if (!Array.isArray(live)) return '';
  live = live.filter((s) => s && s.healthy === true && s.alive === true);
  if (live.length === 0) return '';

  const cfg = ENGINE_DEFAULTS.managedSpawn || {};
  const byteCap = Math.max(256, Number(cfg.promptContextMaxBytes) || 2048);
  const requestedPort = Number(options.dashboardPort);
  const port = (Number.isFinite(requestedPort) && requestedPort > 0) ? requestedPort : 7331;
  const dashboardUrl = 'http://localhost:' + port + '/api/managed-processes?project='
    + encodeURIComponent(project);

  const preamble = [
    '',
    '',
    '---',
    '',
    '## Live managed processes for project ' + project,
    '',
    'These services were spawned by earlier work items in this project and are currently healthy. Reuse them — do not re-spawn duplicates. Ports, URLs, and PIDs are owned by the engine; query `/api/managed-processes` if you need fresh state.',
    '',
  ];

  // Detailed section for one spec: heading, then whichever facts are present.
  const describe = (s) => {
    const out = ['### ' + s.name, ''];
    if (Number.isInteger(s.pid) && s.pid > 0) out.push('- pid: ' + s.pid);
    if (Array.isArray(s.ports) && s.ports.length) out.push('- ports: ' + s.ports.join(', '));
    if (s.attrs && typeof s.attrs === 'object') {
      const pairs = Object.keys(s.attrs).map((k) => k + ': ' + JSON.stringify(s.attrs[k]));
      if (pairs.length) out.push('- attrs: ' + pairs.join('; '));
    }
    if (typeof s.log_path === 'string' && s.log_path) out.push('- log: ' + s.log_path);
    if (Number.isFinite(s.ttl_expires_at)) {
      out.push('- ttl_expires_at: ' + new Date(s.ttl_expires_at).toISOString());
    }
    out.push('');
    return out;
  };

  // One-line summary for the compact fallback.
  const summarize = (s) => {
    const baseUrl = (s.attrs && typeof s.attrs.base_url === 'string') ? s.attrs.base_url : '';
    const portStr = (Array.isArray(s.ports) && s.ports.length) ? ' (ports ' + s.ports.join(',') + ')' : '';
    return '- **' + s.name + '**' + (baseUrl ? ' — ' + baseUrl : '') + portStr;
  };

  let detailed = preamble.slice();
  for (let i = 0; i < live.length; i++) detailed = detailed.concat(describe(live[i]));
  detailed.push('Full details / kill / restart: `curl -s ' + dashboardUrl + '`', '');
  const full = detailed.join('\n');
  if (Buffer.byteLength(full, 'utf8') <= byteCap) return full;

  const compact = preamble.concat(live.map(summarize), [
    '',
    '_' + live.length + ' live service(s) — full details truncated above the '
      + byteCap + '-byte prompt cap. Query `' + dashboardUrl + '` for attrs, logs, and TTL._',
    '',
  ]);
  return compact.join('\n');
}
|
|
997
|
+
|
|
998
|
+
// ─── Item 4 (P-4b8d2e57) ─────────────────────────────────────────────────────
|
|
999
|
+
//
|
|
1000
|
+
// Discovery API helpers: ETag fingerprinting for list endpoints and
|
|
1001
|
+
// state-driven respawn from a previously recorded spec.
|
|
1002
|
+
//
|
|
1003
|
+
// computeManagedSpecsEtag({project}) → returns a stable 16-char sha1 prefix
|
|
1004
|
+
// over the JSON-serialised list (filtered
|
|
1005
|
+
// by project if supplied). Same content
|
|
1006
|
+
// → same etag; ANY field change → new
|
|
1007
|
+
// etag. Honored by the dashboard
|
|
1008
|
+
// endpoints via If-None-Match → 304.
|
|
1009
|
+
// restartManagedSpec(name) → looks up the spec by name, kills the
|
|
1010
|
+
// old PID (if alive), re-spawns it using
|
|
1011
|
+
// the saved spec shape (cmd/args/cwd/
|
|
1012
|
+
// env/healthcheck), and replaces the
|
|
1013
|
+
// state row with healthy:false +
|
|
1014
|
+
// alive:true + new pid + new
|
|
1015
|
+
// started_at. Throws if the name is
|
|
1016
|
+
// unknown — caller maps to HTTP 404.
|
|
1017
|
+
// Healthcheck loop for the new PID is
|
|
1018
|
+
// the caller's job (the dashboard
|
|
1019
|
+
// endpoint kicks it off async; the
|
|
1020
|
+
// boot-reconcile path / item 7 does too
|
|
1021
|
+
// on engine restart).
|
|
1022
|
+
|
|
1023
|
+
const crypto = require('crypto');
|
|
1024
|
+
|
|
1025
|
+
/**
 * Fingerprint the current spec list for ETag / If-None-Match handling.
 *
 * @param {object} [opts] - Forwarded to `listManagedSpecs` (e.g. `project`).
 * @returns {string} First 16 hex characters of the SHA-1 of the JSON
 *   serialisation of the rows sorted by name.
 */
function computeManagedSpecsEtag(opts) {
  const specs = listManagedSpecs(opts || {});
  // The unfiltered list can contain null rows; read the sort key null-safely
  // so one malformed entry cannot make the comparator throw.
  const nameOf = (rec) => String(rec ? rec.name : undefined);
  // Deterministic serialization: sort by name so the etag isn't sensitive to
  // insertion order in the state file.
  const sorted = specs.slice().sort((a, b) => nameOf(a).localeCompare(nameOf(b)));
  const json = JSON.stringify(sorted);
  return crypto.createHash('sha1').update(json).digest('hex').slice(0, 16);
}
|
|
1033
|
+
|
|
1034
|
+
/**
 * Kill and re-spawn a previously recorded spec, then replace its state row.
 *
 * The spec is rebuilt from the persisted row. The old pid (when recorded) is
 * killed best-effort before the new child is spawned. The new row starts with
 * a fresh `started_at` and a TTL window of the same length as the previous
 * one; the engine default is only used when the previous row has no usable
 * `started_at` / `ttl_expires_at` pair. Re-attaching a healthcheck loop is
 * left to the caller.
 *
 * @param {string} name - Name of the recorded spec.
 * @returns {{pid: number, started_at: number, log_path: string}} Runtime facts
 *   returned by `spawnManagedSpec`.
 * @throws {Error} When `name` is empty or unknown, or when the spawn fails.
 */
function restartManagedSpec(name) {
  if (typeof name !== 'string' || name.length === 0) {
    throw new Error('restartManagedSpec: name required');
  }
  const existing = listManagedSpecs().find(s => s && s.name === name);
  if (!existing) {
    throw new Error('restartManagedSpec: spec not found: ' + name);
  }
  // The state row stores the absolute expiry, not the requested minutes.
  // Recover the original window so a restart does not silently swap a custom
  // TTL for the engine default.
  const priorTtlMs = (Number.isFinite(existing.ttl_expires_at) && Number.isFinite(existing.started_at))
    ? existing.ttl_expires_at - existing.started_at
    : NaN;
  const ttlMinutes = priorTtlMs > 0 ? priorTtlMs / 60000 : undefined;
  // Reconstruct a sidecar-shaped spec from the persisted state row. The
  // state shape keeps cmd/args/cwd/env/ports/attrs/healthcheck verbatim
  // (see _toStateRecord), so this is a direct projection.
  const spec = {
    name: existing.name,
    cmd: existing.cmd,
    args: Array.isArray(existing.args) ? existing.args.slice() : [],
    cwd: existing.cwd || '',
    env: Object.assign({}, existing.env || {}),
    ports: Array.isArray(existing.ports) ? existing.ports.slice() : [],
    ttl_minutes: ttlMinutes, // undefined → _toStateRecord applies the default
    attrs: existing.attrs && typeof existing.attrs === 'object' ? existing.attrs : {},
    healthcheck: existing.healthcheck || null,
  };
  // Kill the old PID before respawn (best-effort; outside of any lock).
  if (Number.isInteger(existing.pid) && existing.pid > 0) {
    try { shared.killByPidImmediate(existing.pid); }
    catch (e) { log('warn', 'restartManagedSpec: kill of old PID ' + existing.pid + ' failed: ' + e.message); }
  }
  const ctx = {
    owner_agent: existing.owner_agent || '',
    owner_wi: existing.owner_wi || '',
    owner_project: existing.owner_project || '',
  };
  const runtime = spawnManagedSpec(spec, ctx);
  // recordManagedSpec replaces by name (item 2 idempotency contract) and
  // resets healthy:false / alive:true / new started_at / new ttl_expires_at.
  recordManagedSpec(spec, runtime, ctx);
  log('info', 'managed-spawn restart: name=' + name + ' old_pid=' + existing.pid + ' new_pid=' + runtime.pid);
  return runtime;
}
|
|
1073
|
+
|
|
1074
|
+
// ─── Item 7 (P-8a4d6f29) ─────────────────────────────────────────────────────
|
|
1075
|
+
//
|
|
1076
|
+
// TTL sweep + boot reconcile + project-removal cleanup + log rotation.
|
|
1077
|
+
//
|
|
1078
|
+
// sweepManagedSpawn(opts) → tick-driven walk of the state file:
|
|
1079
|
+
// 1. probe each pid (process.kill 0);
|
|
1080
|
+
// dead-but-not-expired entries are
|
|
1081
|
+
// dropped from state (no kill —
|
|
1082
|
+
// the OS already reaped them).
|
|
1083
|
+
// 2. ttl_expires_at past now → batch
|
|
1084
|
+
// kill via killByPidsImmediate +
|
|
1085
|
+
// drop from state.
|
|
1086
|
+
// 3. rotate log_path when size >
|
|
1087
|
+
// logRotateBytes (rename to .1,
|
|
1088
|
+
// overwrite any prior .1).
|
|
1089
|
+
// Returns {scanned, ttlExpired,
|
|
1090
|
+
// deadDropped, killedPids,
|
|
1091
|
+
// rotatedLogs, malformed}.
|
|
1092
|
+
// bootReconcileManagedSpawn(opts) → one-shot equivalent for the engine
|
|
1093
|
+
// boot path. Same drop-dead + kill-
|
|
1094
|
+
// expired pass, plus a single
|
|
1095
|
+
// runHealthcheck() probe per
|
|
1096
|
+
// surviving spec to refresh
|
|
1097
|
+
// `healthy` / `last_health_at`.
|
|
1098
|
+
// Returns a Promise so callers can
|
|
1099
|
+
// Promise.race it against the
|
|
1100
|
+
// bootReconcileMaxMs ceiling.
|
|
1101
|
+
// removeManagedSpecsForProject(name) → centralised project-removal hook.
|
|
1102
|
+
// Kills + drops every spec whose
|
|
1103
|
+
// owner_project matches, unlinks the
|
|
1104
|
+
// log + log.1. Returns {killed,
|
|
1105
|
+
// unlinked}. engine/projects.js
|
|
1106
|
+
// removeProject calls this — no
|
|
1107
|
+
// managed-process awareness elsewhere.
|
|
1108
|
+
//
|
|
1109
|
+
// Per the plan, healthcheck loops stay PER-SPEC and self-scheduled — the tick
|
|
1110
|
+
// cycle never iterates all specs to drive a probe. The sweep ONLY handles
|
|
1111
|
+
// liveness/TTL/log-rotation; it does not re-attach healthcheck timers
|
|
1112
|
+
// (boot reconcile does that once at startup; the engine close-handler does it
|
|
1113
|
+
// at first spawn).
|
|
1114
|
+
|
|
1115
|
+
/**
 * Rotate one log file to `<logPath>.1` when it is larger than `cap` bytes,
 * replacing any previous `.1` file.
 *
 * @param {string} logPath - Log file to inspect.
 * @param {number} cap - Size threshold in bytes; must be finite and > 0.
 * @returns {boolean} True only when the file was renamed. False when the path
 *   is not a non-empty string, the file cannot be stat'ed, the cap is
 *   unusable, the file is within the cap, or the rotation failed (logged).
 */
function _rotateManagedLog(logPath, cap) {
  if (typeof logPath !== 'string' || !logPath) return false;

  let bytes;
  try {
    bytes = fs.statSync(logPath).size;
  } catch (_e) {
    return false;
  }

  const capUsable = Number.isFinite(cap) && cap > 0;
  if (!capUsable || bytes <= cap) return false;

  const archive = logPath + '.1';
  try {
    try {
      fs.unlinkSync(archive);
    } catch (e) {
      // A missing archive is expected; anything else aborts the rotation.
      if (e && e.code !== 'ENOENT') throw e;
    }
    // NOTE(review): a running child presumably still holds the renamed file
    // open — confirm how the writer picks up the new log path.
    fs.renameSync(logPath, archive);
  } catch (e) {
    log('warn', 'managed-spawn rotate: ' + logPath + ' failed: ' + e.message);
    return false;
  }
  return true;
}
|
|
1132
|
+
|
|
1133
|
+
/**
 * One reconcile pass over the state file, shared by the sweep and the boot
 * reconcile.
 *
 * Inside a single locked write each row is classified: malformed rows and
 * TTL-expired rows are dropped, rows whose pid is not alive are dropped, the
 * rest are kept. After the lock is released, pids of expired-but-alive rows
 * are killed in one batch and each survivor's log is offered for rotation.
 *
 * @param {object} [opts]
 * @param {number} [opts.now] - Clock override (ms); defaults to Date.now().
 * @param {number} [opts.rotateBytes] - Rotation threshold override; defaults
 *   to `logRotateBytes` from the engine defaults, else 10 MiB.
 * @param {function(number): boolean} [opts.isAlive] - Liveness probe override.
 * @param {function(number[]): number} [opts.killBatch] - Batch-kill override;
 *   its return value is stored as `stats.killedPids`.
 * @returns {{stats: object, survivors: Array<{name: string, log_path: string,
 *   healthy: boolean, last_health_at: number}>}}
 */
function _runManagedReconcile(opts) {
  const options = opts || {};
  const cfg = ENGINE_DEFAULTS.managedSpawn || {};
  const now = Number.isFinite(options.now) ? options.now : Date.now();
  let rotateBytes = options.rotateBytes;
  if (!Number.isFinite(rotateBytes)) {
    rotateBytes = Number(cfg.logRotateBytes) || 10 * 1024 * 1024;
  }
  const isAlive = typeof options.isAlive === 'function' ? options.isAlive : shared.isPidAlive;
  const killBatch = typeof options.killBatch === 'function' ? options.killBatch : shared.killByPidsImmediate;

  const stats = {
    scanned: 0,
    ttlExpired: 0,
    deadDropped: 0,
    killedPids: 0,
    rotatedLogs: 0,
    malformed: 0,
  };
  const expiredLivePids = [];
  // Snapshot of kept rows, used after the lock for log rotation and by the
  // boot reconcile to decide which specs to re-probe.
  const survivors = [];

  const reconcile = (state) => {
    const wellFormed = state && typeof state === 'object'
      && !Array.isArray(state) && Array.isArray(state.specs);
    if (!wellFormed) {
      stats.malformed++;
      return _initialStateShape();
    }
    const rows = state.specs;
    const kept = [];
    for (let i = 0; i < rows.length; i++) {
      const rec = rows[i];
      if (!rec || typeof rec !== 'object' || typeof rec.name !== 'string') {
        stats.malformed++;
        continue;
      }
      stats.scanned++;
      const expired = Number.isFinite(rec.ttl_expires_at) && rec.ttl_expires_at <= now;
      const running = Number.isInteger(rec.pid) && rec.pid > 0 && isAlive(rec.pid);
      if (expired) {
        // Dropped from state either way; only a live pid needs killing.
        stats.ttlExpired++;
        if (running) expiredLivePids.push(rec.pid);
      } else if (!running) {
        stats.deadDropped++;
      } else {
        kept.push(rec);
        survivors.push({
          name: rec.name,
          log_path: rec.log_path || '',
          healthy: rec.healthy === true,
          last_health_at: rec.last_health_at || 0,
        });
      }
    }
    state.specs = kept;
    return state;
  };

  shared.mutateJsonFileLocked(_getStatePath(), reconcile, { defaultValue: _initialStateShape() });

  // Process kills + log rotation OUTSIDE the lock callback. The copilot-
  // instructions key conventions explicitly forbid kills/network/awaits inside
  // a lock callback.
  if (expiredLivePids.length > 0) {
    try {
      stats.killedPids = killBatch(expiredLivePids);
    } catch (e) {
      log('warn', 'managed-spawn sweep: kill batch failed: ' + e.message);
    }
  }
  survivors.forEach((surv) => {
    if (_rotateManagedLog(surv.log_path, rotateBytes)) stats.rotatedLogs++;
  });
  return { stats: stats, survivors: survivors };
}
|
|
1198
|
+
|
|
1199
|
+
/**
 * Run one reconcile pass and return only its counters.
 *
 * @param {object} [opts] - Forwarded unchanged to `_runManagedReconcile`.
 * @returns {object} The `stats` object of that pass.
 */
function sweepManagedSpawn(opts) {
  const { stats } = _runManagedReconcile(opts);
  return stats;
}
|
|
1202
|
+
|
|
1203
|
+
// Persist the outcome of a boot-time probe on the named state row in a single
// locked write. Unlike _markHealthy this also records failures: a passing
// result sets healthy=true and stamps last_health_at (the probe's lastCheckAt
// when finite, otherwise the current time); any other result clears healthy
// and leaves last_health_at untouched. An unusable state document or an
// unknown name is written back unchanged.
function _markBootProbe(name, result) {
  const applyProbe = (state) => {
    if (!state || !Array.isArray(state.specs)) return state;
    const row = state.specs.find((s) => s && s.name === name);
    if (!row) return state;

    const passed = result && result.healthy === true;
    if (!passed) {
      // Survivor failed its boot probe — clear healthy so the dashboard
      // reflects what the probe saw.
      row.healthy = false;
      return state;
    }
    row.healthy = true;
    row.last_health_at = Number.isFinite(result.lastCheckAt) ? result.lastCheckAt : Date.now();
    return state;
  };
  shared.mutateJsonFileLocked(_getStatePath(), applyProbe, { defaultValue: _initialStateShape() });
}
|
|
1226
|
+
|
|
1227
|
+
/**
 * One-shot reconcile for the engine boot path: runs the shared reconcile
 * pass, then fires a single healthcheck per surviving spec and writes the
 * outcome back to the state file.
 *
 * Survivors that are already healthy and were checked less than
 * `healthBackoffSec` (default 30s) before `now` are not re-probed, and
 * survivors without a `healthcheck` block are skipped. Probes run one after
 * another. The state file is read at most once for the whole pass rather than
 * once per survivor.
 *
 * @param {object} [opts] - Forwarded to `_runManagedReconcile`; `opts.now`
 *   also overrides the clock used for the backoff comparison.
 * @returns {Promise<{stats: object, probed: Array<{name: string, healthy: boolean}>}>}
 */
async function bootReconcileManagedSpawn(opts) {
  opts = opts || {};
  const limits = ENGINE_DEFAULTS.managedSpawn || {};
  const backoffMs = Math.max(0, (Number(limits.healthBackoffSec) || 30) * 1000);
  const now = Number.isFinite(opts.now) ? opts.now : Date.now();
  const { stats, survivors } = _runManagedReconcile(opts);
  // Re-probe survivors with a single healthcheck each. Skip ones already
  // healthy within healthBackoffSec — boot reconcile must stay bounded.
  const probed = [];
  // Full rows indexed by name, loaded lazily on the first survivor that needs
  // a probe. survivors[] omits the `healthcheck` block to keep the lock
  // callback small, so it has to come from the state file.
  let rowsByName = null;
  for (const surv of survivors) {
    if (surv.healthy && (now - surv.last_health_at) < backoffMs) continue;
    if (rowsByName === null) {
      rowsByName = new Map();
      for (const row of listManagedSpecs()) {
        // First row wins on duplicate names, matching a linear find().
        if (row && !rowsByName.has(row.name)) rowsByName.set(row.name, row);
      }
    }
    const rec = rowsByName.get(surv.name);
    if (!rec || !rec.healthcheck) continue;
    try {
      const r = await runHealthcheck({ name: rec.name, healthcheck: rec.healthcheck });
      try { _markBootProbe(rec.name, r); }
      catch (e) { log('warn', 'managed-spawn bootReconcile state write failed for ' + rec.name + ': ' + e.message); }
      probed.push({ name: rec.name, healthy: !!r.healthy });
    } catch (e) {
      log('warn', 'managed-spawn bootReconcile probe failed for ' + rec.name + ': ' + e.message);
    }
  }
  return { stats: stats, probed: probed };
}
|
|
1255
|
+
|
|
1256
|
+
/**
 * Drop every managed-spawn state record owned by `projectName`, then kill the
 * recorded processes and delete their log files.
 *
 * The state file is rewritten under shared.mutateJsonFileLocked; killing and
 * unlinking happen afterwards, outside the mutate callback. A state payload
 * that is not an object with a `specs` array is replaced by
 * _initialStateShape(). Kill and unlink failures are logged at 'warn' and do
 * not throw; a missing log file (ENOENT) is ignored silently.
 *
 * @param {string} projectName Value compared against each record's
 *   `owner_project`. Anything other than a non-empty string is a no-op.
 * @returns {{killed: number, unlinked: number, scanned: number}}
 *   `killed` is the value returned by shared.killByPidsImmediate (0 when it
 *   threw or there was nothing to kill), `unlinked` the number of files
 *   removed, and `scanned` the number of positive-integer pids collected from
 *   the dropped records.
 */
function removeManagedSpecsForProject(projectName) {
  const hasUsableName = typeof projectName === 'string' && projectName.length > 0;
  if (!hasUsableName) {
    return { killed: 0, unlinked: 0, scanned: 0 };
  }

  const pids = [];
  const logFiles = [];
  shared.mutateJsonFileLocked(_getStatePath(), (state) => {
    const wellFormed = state && typeof state === 'object' && Array.isArray(state.specs);
    if (!wellFormed) {
      return _initialStateShape();
    }
    state.specs = state.specs.filter((entry) => {
      // Entries that are falsy or belong to another project are retained.
      if (!entry || entry.owner_project !== projectName) return true;
      if (Number.isInteger(entry.pid) && entry.pid > 0) pids.push(entry.pid);
      if (entry.log_path) logFiles.push(entry.log_path);
      return false;
    });
    return state;
  }, { defaultValue: _initialStateShape() });

  let killed = 0;
  if (pids.length > 0) {
    try {
      killed = shared.killByPidsImmediate(pids);
    } catch (e) {
      log('warn', 'managed-spawn removeForProject: kill batch failed for ' + projectName + ': ' + e.message);
    }
  }

  // Each log may have a sibling with a '.1' suffix; try to remove both.
  const targets = [];
  logFiles.forEach((base) => { targets.push(base, base + '.1'); });

  let unlinked = 0;
  targets.forEach((target) => {
    try {
      fs.unlinkSync(target);
      unlinked += 1;
    } catch (e) {
      if (e && e.code !== 'ENOENT') {
        log('warn', 'managed-spawn removeForProject: unlink ' + target + ' failed: ' + e.message);
      }
    }
  });

  return { killed: killed, unlinked: unlinked, scanned: pids.length };
}
|
|
1293
|
+
|
|
1294
|
+
module.exports = {
|
|
1295
|
+
MANAGED_SPAWN_FILENAME: MANAGED_SPAWN_FILENAME,
|
|
1296
|
+
MANAGED_PROCESSES_STATE_FILE: MANAGED_PROCESSES_STATE_FILE,
|
|
1297
|
+
MANAGED_LOGS_DIR: MANAGED_LOGS_DIR,
|
|
1298
|
+
INVALID_WORKDIR_REASON_PREFIX: INVALID_WORKDIR_REASON_PREFIX,
|
|
1299
|
+
HEALTHCHECK_TYPES: HEALTHCHECK_TYPES,
|
|
1300
|
+
validateManagedSpawnRecord: validateManagedSpawnRecord,
|
|
1301
|
+
readManagedSpawnFile: readManagedSpawnFile,
|
|
1302
|
+
evaluateManagedSpawnAcceptance: evaluateManagedSpawnAcceptance,
|
|
1303
|
+
buildManagedSpawnHint: buildManagedSpawnHint,
|
|
1304
|
+
// Item 2 (P-2d5e8f04): engine spawn + locked state file.
|
|
1305
|
+
openManagedLog: openManagedLog,
|
|
1306
|
+
spawnManagedSpec: spawnManagedSpec,
|
|
1307
|
+
recordManagedSpec: recordManagedSpec,
|
|
1308
|
+
recordManagedBatch: recordManagedBatch,
|
|
1309
|
+
removeManagedSpec: removeManagedSpec,
|
|
1310
|
+
listManagedSpecs: listManagedSpecs,
|
|
1311
|
+
getStatePath: _getStatePath,
|
|
1312
|
+
// Item 3 (P-9c1f47a6): healthcheck implementations + first-pass waiter.
|
|
1313
|
+
runHealthcheck: runHealthcheck,
|
|
1314
|
+
waitForFirstHealth: waitForFirstHealth,
|
|
1315
|
+
tailManagedLog: tailManagedLog,
|
|
1316
|
+
// Item 6 (P-1f9c3a45): playbook auto-inject of live managed processes.
|
|
1317
|
+
buildLiveManagedProcessesBlock: buildLiveManagedProcessesBlock,
|
|
1318
|
+
// Item 4 (P-4b8d2e57): discovery API (etag + state-driven respawn).
|
|
1319
|
+
computeManagedSpecsEtag: computeManagedSpecsEtag,
|
|
1320
|
+
restartManagedSpec: restartManagedSpec,
|
|
1321
|
+
// Item 7 (P-8a4d6f29): TTL sweep + boot reconcile + project cleanup.
|
|
1322
|
+
sweepManagedSpawn: sweepManagedSpawn,
|
|
1323
|
+
bootReconcileManagedSpawn: bootReconcileManagedSpawn,
|
|
1324
|
+
removeManagedSpecsForProject: removeManagedSpecsForProject,
|
|
1325
|
+
};
|