@yemi33/minions 0.1.1966 → 0.1.1967

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1325 @@
1
+ /**
2
+ * engine/managed-spawn.js — Item 1 of plan P-7a3b1c92.
3
+ *
4
+ * Schema, validator, sidecar reader, acceptance helper, and playbook hint for
5
+ * the managed-spawn primitive. Mirrors engine/keep-process-sweep.js
6
+ * beat-for-beat: an agent writes agents/<id>/managed-spawn.json describing
7
+ * long-running services it wants the engine to spawn + healthcheck on its
8
+ * behalf; this module validates the file, returns a structured acceptance
9
+ * decision the engine's onAgentClose handler uses to gate the dispatch, and
10
+ * exposes a hint block the playbook injects so the agent knows the contract.
11
+ *
12
+ * Engine spawn / state-file / healthcheck loop / sweep wiring lives in later
13
+ * plan items (P-7a3b1c93+). This module is pure: no spawns, no network, no
14
+ * mutation of shared state.
15
+ *
16
+ * Design template: engine/keep-process-sweep.js. Divergences are justified
17
+ * inline; symmetry with keep-pids is the convention.
18
+ */
19
+
20
+ const fs = require('fs');
21
+ const path = require('path');
22
+ const http = require('http');
23
+ const https = require('https');
24
+ const { spawn, exec } = require('child_process');
25
+ const shared = require('./shared');
26
+ const queries = require('./queries');
27
+
28
+ const { log, ENGINE_DEFAULTS } = shared;
29
+
30
+ const MANAGED_SPAWN_FILENAME = 'managed-spawn.json';
31
+ const MANAGED_PROCESSES_STATE_FILE = 'managed-processes.json';
32
+ const MANAGED_LOGS_DIR = 'managed-logs';
33
+ const INVALID_WORKDIR_REASON_PREFIX = 'invalid-workdir: ';
34
+
35
+ const HEALTHCHECK_TYPES = ['http', 'command'];
36
+
37
+ // Kebab-case service names: lowercase letters, digits, single internal hyphens,
38
+ // no leading/trailing hyphen. Mirrors the spawn-name conventions used elsewhere
39
+ // in the codebase (worktree-pool, dispatch IDs).
40
+ const _KEBAB_RE = /^[a-z0-9]+(?:-[a-z0-9]+)*$/;
41
+
42
// Directory that holds one sub-folder per agent. Uses queries.AGENTS_DIR when
// it is set (truthy); otherwise falls back to <MINIONS_DIR>/agents.
function _agentsDir() {
  if (queries.AGENTS_DIR) {
    return queries.AGENTS_DIR;
  }
  return path.join(shared.MINIONS_DIR, 'agents');
}
45
+
46
// Decide whether a spec's cwd must be a valid git worktree.
// Precedence: an own `requireGitWorkdir` property on opts (coerced to boolean)
// wins; otherwise the check is on unless the engine default is exactly false.
function _resolveRequireGitWorkdir(opts) {
  const configured = (ENGINE_DEFAULTS.managedSpawn || {}).requireGitWorkdir;
  const hasOverride = Boolean(opts)
    && Object.prototype.hasOwnProperty.call(opts, 'requireGitWorkdir');
  return hasOverride ? Boolean(opts.requireGitWorkdir) : configured !== false;
}
54
+
55
// Case-insensitive allowlist membership test for an executable name.
// Only the basename of `name` is compared, both with and without a trailing
// .exe/.cmd/.bat/.ps1/.sh extension. Non-string allowlist entries are ignored.
function _isOnAllowlist(name, allowlist) {
  if (!Array.isArray(allowlist)) return false;
  if (typeof name !== 'string' || name.length === 0) return false;

  const fileName = path.basename(name).toLowerCase();
  const stem = fileName.replace(/\.(exe|cmd|bat|ps1|sh)$/, '');

  return allowlist.some((entry) => {
    if (typeof entry !== 'string') return false;
    const wanted = entry.toLowerCase();
    return wanted === fileName || wanted === stem;
  });
}
66
+
67
// True when an env var name may be forwarded to a managed child: either an
// exact hit in limits.envKeyAllowlist or it starts with one of the non-empty
// string prefixes in limits.envKeyAllowlistPrefixes.
function _envKeyAllowed(key, limits) {
  if (typeof key !== 'string' || key.length === 0) return false;

  const exactKeys = Array.isArray(limits.envKeyAllowlist) ? limits.envKeyAllowlist : [];
  if (exactKeys.includes(key)) return true;

  const prefixes = Array.isArray(limits.envKeyAllowlistPrefixes)
    ? limits.envKeyAllowlistPrefixes
    : [];
  return prefixes.some(
    (prefix) => typeof prefix === 'string' && prefix.length > 0 && key.startsWith(prefix),
  );
}
77
+
78
// Extract the first executable-like token from a shell-style healthcheck cmd
// string so it can be checked against the executable allowlist.
//
// - A token wrapped in a matching pair of quotes is returned whole, without
//   the quotes, so quoted paths that contain spaces
//   ("C:\Program Files\nodejs\node.exe" -v) are not cut at the first space.
// - Otherwise the leading run of non-space, non-quote characters is returned
//   (an unterminated leading quote is skipped).
// - Returns '' for non-strings, blank input, or when no candidate is found.
//
// Any leading directory is NOT removed here; the allowlist check compares the
// basename.
function _firstExecutable(cmd) {
  if (typeof cmd !== 'string') return '';
  const trimmed = cmd.trim();
  if (trimmed.length === 0) return '';

  // Properly quoted first token: keep everything between the matching quotes.
  const quoted = trimmed.match(/^(["'])(.*?)\1/);
  if (quoted) return quoted[2];

  // Bare token (or unterminated quote): take up to the next space/quote.
  const bare = trimmed.match(/^["']?([^\s"']+)["']?/);
  return bare ? bare[1] : '';
}
90
+
91
/**
 * Validate and normalise the `healthcheck` object of a managed-spawn spec.
 *
 * Common fields: `type` ('http' | 'command'), `timeout_s` (0 < n <= 3600) and
 * optional `interval_s` (0 < n <= 600, defaults to
 * limits.defaultHealthIntervalSec or 1).
 *
 * http:    `url` must be an absolute http(s) URL that parses with the WHATWG
 *          URL parser, and `expect_status` must be an integer in 100..599.
 * command: `cmd` (<= 1000 chars), boolean `shell`, and `expect_regex`
 *          (<= 500 chars, must compile). The first token of `cmd` must be on
 *          limits.executableAllowlist.
 *
 * @param {object} hc      Raw healthcheck value from the sidecar.
 * @param {object} limits  Managed-spawn limits object.
 * @returns {{ok: true, value: object} | {ok: false, reason: string}}
 */
function _validateHealthcheck(hc, limits) {
  if (!hc || typeof hc !== 'object' || Array.isArray(hc)) {
    return { ok: false, reason: 'healthcheck-missing' };
  }
  if (HEALTHCHECK_TYPES.indexOf(hc.type) < 0) {
    return { ok: false, reason: 'healthcheck-type-invalid (' + hc.type + ')' };
  }
  const timeoutS = Number(hc.timeout_s);
  if (!Number.isFinite(timeoutS) || timeoutS <= 0 || timeoutS > 3600) {
    return { ok: false, reason: 'healthcheck-timeout-invalid' };
  }
  const intervalS = hc.interval_s == null
    ? Number(limits.defaultHealthIntervalSec) || 1
    : Number(hc.interval_s);
  if (!Number.isFinite(intervalS) || intervalS <= 0 || intervalS > 600) {
    return { ok: false, reason: 'healthcheck-interval-invalid' };
  }

  if (hc.type === 'http') {
    if (typeof hc.url !== 'string' || hc.url.length === 0) {
      return { ok: false, reason: 'healthcheck-url-missing' };
    }
    if (!/^https?:\/\//i.test(hc.url)) {
      return { ok: false, reason: 'healthcheck-url-not-absolute' };
    }
    // The prefix test alone accepts strings such as "http://" (no host).
    // Reject them here so a malformed URL is reported as an invalid sidecar
    // instead of surfacing later as a probe-time failure.
    try { new URL(hc.url); }
    catch (_e) { return { ok: false, reason: 'healthcheck-url-invalid' }; }
    if (hc.expect_status == null) {
      return { ok: false, reason: 'healthcheck-expect-status-missing' };
    }
    const status = Number(hc.expect_status);
    if (!Number.isInteger(status) || status < 100 || status > 599) {
      return { ok: false, reason: 'healthcheck-expect-status-invalid' };
    }
    return {
      ok: true,
      value: {
        type: 'http',
        url: hc.url,
        expect_status: status,
        interval_s: intervalS,
        timeout_s: timeoutS,
      },
    };
  }

  // command
  if (typeof hc.cmd !== 'string' || hc.cmd.length === 0) {
    return { ok: false, reason: 'healthcheck-cmd-missing' };
  }
  if (hc.cmd.length > 1000) {
    return { ok: false, reason: 'healthcheck-cmd-too-long' };
  }
  if (typeof hc.shell !== 'boolean') {
    return { ok: false, reason: 'healthcheck-shell-not-boolean' };
  }
  if (typeof hc.expect_regex !== 'string' || hc.expect_regex.length === 0) {
    return { ok: false, reason: 'healthcheck-expect-regex-missing' };
  }
  if (hc.expect_regex.length > 500) {
    return { ok: false, reason: 'healthcheck-expect-regex-too-long' };
  }
  // Compile once purely to prove the pattern is syntactically valid.
  try { new RegExp(hc.expect_regex); }
  catch (e) { return { ok: false, reason: 'healthcheck-expect-regex-invalid (' + e.message + ')' }; }

  // Same allowlist gate as the spec's `cmd` field. Healthcheck commands run
  // through child_process.exec under engine ownership; without the gate this
  // would be an arbitrary-code-execution path back into the engine process
  // tree. Reject anything whose first token is not on the allowlist.
  // NOTE(review): only the FIRST token is gated; with `shell: true` the rest
  // of the string is still interpreted by the shell — confirm that is intended.
  const execName = _firstExecutable(hc.cmd);
  if (!_isOnAllowlist(execName, limits.executableAllowlist)) {
    return { ok: false, reason: 'healthcheck-cmd-not-on-allowlist (' + execName + ')' };
  }

  return {
    ok: true,
    value: {
      type: 'command',
      cmd: hc.cmd,
      shell: hc.shell,
      expect_regex: hc.expect_regex,
      interval_s: intervalS,
      timeout_s: timeoutS,
    },
  };
}
175
+
176
/**
 * Validate and normalise one entry of a sidecar's `specs` array.
 *
 * Checks, in order: name (kebab-case, length cap), cmd (allowlisted
 * executable), args, cwd (git-worktree check when required), env (allowlisted
 * keys, string values), ports (1024..65535), ttl_minutes, attrs (serialized
 * size cap) and healthcheck. The first failing check wins.
 *
 * @param {object} spec    Raw spec object from the sidecar.
 * @param {number} index   Position of the spec in the file (currently unused;
 *                         the caller adds the `spec[i]:` prefix itself).
 * @param {object} limits  Managed-spawn limits object.
 * @param {object} opts    Caller options (may override requireGitWorkdir).
 * @returns {{ok: true, value: object} | {ok: false, reason: string}}
 */
function _validateSpec(spec, index, limits, opts) {
  if (!spec || typeof spec !== 'object' || Array.isArray(spec)) {
    return { ok: false, reason: 'not-an-object' };
  }

  // name
  if (typeof spec.name !== 'string' || spec.name.length === 0) {
    return { ok: false, reason: 'name-missing' };
  }
  const maxName = Math.max(8, Number(limits.maxNameLength) || 64);
  if (spec.name.length > maxName) {
    return { ok: false, reason: 'name-too-long (>' + maxName + ')' };
  }
  if (!_KEBAB_RE.test(spec.name)) {
    return { ok: false, reason: 'name-not-kebab-case (' + spec.name + ')' };
  }

  // cmd
  if (typeof spec.cmd !== 'string' || spec.cmd.length === 0) {
    return { ok: false, reason: 'cmd-missing' };
  }
  if (spec.cmd.length > 200) {
    return { ok: false, reason: 'cmd-too-long' };
  }
  if (!_isOnAllowlist(spec.cmd, limits.executableAllowlist)) {
    return { ok: false, reason: 'cmd-not-on-allowlist (' + spec.cmd + ')' };
  }

  // args
  const argsRaw = spec.args == null ? [] : spec.args;
  if (!Array.isArray(argsRaw)) return { ok: false, reason: 'args-not-array' };
  const maxArgs = Math.max(1, Number(limits.maxArgsCount) || 64);
  if (argsRaw.length > maxArgs) {
    return { ok: false, reason: 'args-too-many (>' + maxArgs + ')' };
  }
  const args = [];
  for (const a of argsRaw) {
    if (typeof a !== 'string') return { ok: false, reason: 'arg-not-string' };
    if (a.length > 500) return { ok: false, reason: 'arg-too-long' };
    args.push(a);
  }

  // cwd (optional; validated when present + requireGitWorkdir is on)
  if (spec.cwd != null && typeof spec.cwd !== 'string') {
    return { ok: false, reason: 'cwd-not-string' };
  }
  if (typeof spec.cwd === 'string' && spec.cwd.length > 500) {
    return { ok: false, reason: 'cwd-too-long' };
  }
  if (_resolveRequireGitWorkdir(opts) && typeof spec.cwd === 'string' && spec.cwd.length > 0) {
    const wt = shared.isValidGitWorktree(spec.cwd);
    if (!wt.ok) {
      return { ok: false, reason: INVALID_WORKDIR_REASON_PREFIX + wt.reason };
    }
  }

  // env (optional)
  const envRaw = spec.env == null ? {} : spec.env;
  if (typeof envRaw !== 'object' || Array.isArray(envRaw)) {
    return { ok: false, reason: 'env-not-object' };
  }
  const envKeys = Object.keys(envRaw);
  const maxEnv = Math.max(1, Number(limits.maxEnvVars) || 32);
  if (envKeys.length > maxEnv) {
    return { ok: false, reason: 'env-too-many (>' + maxEnv + ')' };
  }
  const env = {};
  for (const k of envKeys) {
    if (!_envKeyAllowed(k, limits)) {
      return { ok: false, reason: 'env-key-not-on-allowlist (' + k + ')' };
    }
    const v = envRaw[k];
    if (typeof v !== 'string') return { ok: false, reason: 'env-value-not-string (' + k + ')' };
    if (v.length > 1000) return { ok: false, reason: 'env-value-too-long (' + k + ')' };
    env[k] = v;
  }

  // ports (optional)
  const portsRaw = spec.ports == null ? [] : spec.ports;
  if (!Array.isArray(portsRaw)) return { ok: false, reason: 'ports-not-array' };
  if (portsRaw.length > 20) return { ok: false, reason: 'ports-too-many (>20)' };
  const ports = [];
  for (const p of portsRaw) {
    const n = Number(p);
    if (!Number.isInteger(n) || n < 1024 || n > 65535) {
      return { ok: false, reason: 'port-invalid (' + p + ')' };
    }
    ports.push(n);
  }

  // ttl_minutes (optional, defaults to defaultTtlMinutes; capped at maxTtlMinutes)
  const maxTtl = Math.max(1, Number(limits.maxTtlMinutes) || 1440);
  const defaultTtl = Math.max(1, Number(limits.defaultTtlMinutes) || 240);
  let ttlMinutes;
  if (spec.ttl_minutes == null) {
    ttlMinutes = defaultTtl;
  } else {
    const n = Number(spec.ttl_minutes);
    if (!Number.isFinite(n) || n <= 0) return { ok: false, reason: 'ttl-invalid' };
    if (n > maxTtl) return { ok: false, reason: 'ttl-too-long (>' + maxTtl + 'min)' };
    // Whole minutes, but never below 1: a fractional request such as 0.5
    // would otherwise floor to 0, which the state-record builder treats as
    // "unset" and silently replaces with the (much longer) default TTL.
    ttlMinutes = Math.max(1, Math.floor(n));
  }

  // attrs (optional, opaque to the engine but capped at maxAttrsBytes serialized)
  const attrsRaw = spec.attrs == null ? {} : spec.attrs;
  if (typeof attrsRaw !== 'object' || Array.isArray(attrsRaw)) {
    return { ok: false, reason: 'attrs-not-object' };
  }
  let attrsSerialized;
  try { attrsSerialized = JSON.stringify(attrsRaw); }
  catch (e) { return { ok: false, reason: 'attrs-not-serializable (' + e.message + ')' }; }
  const maxAttrsBytes = Math.max(64, Number(limits.maxAttrsBytes) || 2048);
  if (Buffer.byteLength(attrsSerialized, 'utf8') > maxAttrsBytes) {
    return { ok: false, reason: 'attrs-too-large (>' + maxAttrsBytes + 'B)' };
  }

  // healthcheck (required)
  const hc = _validateHealthcheck(spec.healthcheck, limits);
  if (!hc.ok) return { ok: false, reason: hc.reason };

  return {
    ok: true,
    value: {
      name: spec.name,
      cmd: spec.cmd,
      args: args,
      cwd: typeof spec.cwd === 'string' ? spec.cwd : '',
      env: env,
      ports: ports,
      ttl_minutes: ttlMinutes,
      // Round-trip through JSON so the returned attrs is a detached copy.
      attrs: JSON.parse(attrsSerialized),
      healthcheck: hc.value,
    },
  };
}
311
+
312
/**
 * Validate a parsed managed-spawn sidecar as a whole.
 *
 * Requires a non-empty `specs` array within the per-file cap, validates each
 * spec, rejects duplicate names, and checks that `written_by` / `wi_id` are
 * strings when present. Workdir rejections keep INVALID_WORKDIR_REASON_PREFIX
 * at the very start of the reason so callers can detect them by prefix.
 *
 * @param {*} parsed       Result of JSON.parse on the sidecar.
 * @param {object} [opts]  Passed through to the per-spec validator.
 * @returns {{ok: true, value: object} | {ok: false, reason: string}}
 */
function validateManagedSpawnRecord(parsed, opts) {
  const options = opts || {};
  const limits = ENGINE_DEFAULTS.managedSpawn || {};
  const maxSpecs = Math.max(1, Number(limits.maxSpecsPerFile) || 5);
  const reject = (reason) => ({ ok: false, reason: reason });

  if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
    return reject('not-an-object');
  }
  const rawSpecs = parsed.specs;
  if (!Array.isArray(rawSpecs)) return reject('specs-missing');
  if (rawSpecs.length === 0) return reject('specs-empty');
  if (rawSpecs.length > maxSpecs) return reject('specs-too-many (>' + maxSpecs + ')');

  const namesSeen = new Set();
  const accepted = [];
  for (const [i, rawSpec] of rawSpecs.entries()) {
    const label = 'spec[' + i + ']: ';
    const verdict = _validateSpec(rawSpec, i, limits, options);
    if (!verdict.ok) {
      const why = verdict.reason;
      const isWorkdir = typeof why === 'string' && why.indexOf(INVALID_WORKDIR_REASON_PREFIX) === 0;
      // Hoist the workdir prefix in front of the spec label so the
      // close-handler gate can still match on it.
      return isWorkdir
        ? reject(INVALID_WORKDIR_REASON_PREFIX + label + why.slice(INVALID_WORKDIR_REASON_PREFIX.length))
        : reject(label + why);
    }
    const specName = verdict.value.name;
    if (namesSeen.has(specName)) {
      return reject(label + 'duplicate-name (' + specName + ')');
    }
    namesSeen.add(specName);
    accepted.push(verdict.value);
  }

  const writtenBy = parsed.written_by;
  if (writtenBy != null && typeof writtenBy !== 'string') return reject('written_by-not-string');
  const wiId = parsed.wi_id;
  if (wiId != null && typeof wiId !== 'string') return reject('wi_id-not-string');

  return {
    ok: true,
    value: {
      specs: accepted,
      written_by: typeof writtenBy === 'string' ? writtenBy : '',
      wi_id: typeof wiId === 'string' ? wiId : '',
    },
  };
}
362
+
363
/**
 * Read and validate agents/<agentId>/managed-spawn.json.
 *
 * @param {string} agentId
 * @param {object} [opts]  Passed through to validateManagedSpawnRecord.
 * @returns {null | object} null when the file cannot be read; otherwise
 *   { agentId, filePath, valid, parsed } plus `value` (valid) or `reason`
 *   (invalid JSON / failed validation).
 */
function readManagedSpawnFile(agentId, opts) {
  const options = opts || {};
  const filePath = path.join(_agentsDir(), String(agentId || ''), MANAGED_SPAWN_FILENAME);
  const invalid = (reason, parsed) => ({
    agentId: agentId, filePath: filePath, valid: false, reason: reason, parsed: parsed,
  });

  let text;
  try {
    text = fs.readFileSync(filePath, 'utf8');
  } catch (_e) {
    // Any read failure is reported to the caller as "no sidecar".
    return null;
  }

  let parsed = null;
  try {
    parsed = JSON.parse(text);
  } catch (e) {
    return invalid('json-parse: ' + e.message, null);
  }

  const verdict = validateManagedSpawnRecord(parsed, options);
  if (verdict.ok) {
    return { agentId: agentId, filePath: filePath, valid: true, value: verdict.value, parsed: parsed };
  }
  return invalid(verdict.reason, parsed);
}
378
+
379
// Engine-side acceptance helper for the managed-spawn sidecar. It only reads
// the file; the caller owns kill + unlink + work-item mutation. The returned
// summary always has { exists, accepted, isWorkdirRejection, reason, record,
// filePath }:
//   - no file (or it vanished between the existence check and the read):
//       exists=false, accepted=false
//   - valid file:     exists=true, accepted=true, record=<validated value>
//   - invalid file:   exists=true, accepted=false, reason set, plus
//       parsedRaw; isWorkdirRejection is true when the reason starts with
//       INVALID_WORKDIR_REASON_PREFIX.
function evaluateManagedSpawnAcceptance(agentId, opts) {
  const options = opts || {};
  const filePath = path.join(_agentsDir(), String(agentId || ''), MANAGED_SPAWN_FILENAME);
  const absent = () => ({
    exists: false,
    accepted: false,
    isWorkdirRejection: false,
    reason: null,
    record: null,
    filePath: filePath,
  });

  if (!fs.existsSync(filePath)) return absent();

  const rec = readManagedSpawnFile(agentId, options);
  // Race: the file was there for existsSync but unreadable by the time we read.
  if (!rec) return absent();

  if (!rec.valid) {
    const reason = rec.reason || 'unknown';
    return {
      exists: true,
      accepted: false,
      isWorkdirRejection: typeof reason === 'string'
        && reason.indexOf(INVALID_WORKDIR_REASON_PREFIX) === 0,
      reason: reason,
      record: null,
      filePath: rec.filePath,
      parsedRaw: rec.parsed || null,
    };
  }

  return {
    exists: true,
    accepted: true,
    isWorkdirRejection: false,
    reason: null,
    record: rec.value,
    filePath: rec.filePath,
  };
}
418
+
419
/**
 * Build the markdown hint block appended to an agent prompt, explaining the
 * managed-spawn sidecar contract (file location, JSON shape, caps, and how to
 * verify).
 *
 * All caps shown to the agent are derived from ENGINE_DEFAULTS.managedSpawn
 * with the same fallbacks the validator uses, so the hint cannot advertise a
 * limit the validator does not enforce.
 *
 * @param {object} [opts]
 * @param {number} [opts.ttlMinutes]     TTL shown in the example; floored to
 *   whole minutes and clamped to 1..maxTtlMinutes. Defaults to defaultTtlMinutes.
 * @param {string} [opts.agentId]        Placeholder used when absent.
 * @param {string} [opts.workItemId]     Placeholder used when absent.
 * @param {string} [opts.minionsDir]     Placeholder used when absent.
 * @param {number} [opts.dashboardPort]  Defaults to 7331.
 * @returns {string} Newline-joined markdown.
 */
function buildManagedSpawnHint(opts) {
  opts = opts || {};
  const limits = ENGINE_DEFAULTS.managedSpawn || {};
  const maxTtl = Math.max(1, Number(limits.maxTtlMinutes) || 1440);
  const defaultTtl = Math.max(1, Number(limits.defaultTtlMinutes) || 240);
  const maxSpecs = Math.max(1, Number(limits.maxSpecsPerFile) || 5);
  // Same derivations as the spec validator, so the advertised caps match it.
  const maxName = Math.max(8, Number(limits.maxNameLength) || 64);
  const maxAttrsBytes = Math.max(64, Number(limits.maxAttrsBytes) || 2048);
  const ttlIn = Number(opts.ttlMinutes);
  // Clamp to >= 1 after flooring: a fractional input like 0.5 must not render
  // as `"ttl_minutes": 0`, which the validator rejects as ttl-invalid.
  const ttl = Number.isFinite(ttlIn) && ttlIn > 0
    ? Math.min(maxTtl, Math.max(1, Math.floor(ttlIn)))
    : defaultTtl;
  const agentId = opts.agentId || '<your-agent-id>';
  const wiId = opts.workItemId || '<this-work-item-id>';
  const minionsDir = opts.minionsDir || '<minions-dir>';
  const portIn = Number(opts.dashboardPort);
  const dashboardPort = Number.isFinite(portIn) && portIn > 0 ? portIn : 7331;

  const lines = [
    '',
    '',
    '---',
    '',
    '## Engine-managed long-running services (managed_spawn flag)',
    '',
    'This work item permits you to **describe** long-running services in a sidecar file; the engine will **own the spawn** and the lifecycle. Use this instead of hand-rolling detached-spawn pipelines yourself.',
    '',
    'BEFORE you write your completion report, write the sidecar:',
    '',
    ' `' + minionsDir + '/agents/' + agentId + '/managed-spawn.json`',
    '',
    'with this exact JSON shape (one or more specs, max ' + maxSpecs + ' per file):',
    '',
    '```json',
    '{',
    ' "specs": [',
    ' {',
    ' "name": "constellation-host",',
    ' "cmd": "bun",',
    ' "args": ["run", "dev"],',
    ' "cwd": "D:/repos/constellation",',
    ' "env": { "VITE_HOST": "127.0.0.1" },',
    ' "ports": [3001],',
    ' "ttl_minutes": ' + ttl + ',',
    ' "attrs": { "base_url": "http://localhost:3001", "framework": "vite" },',
    ' "healthcheck": {',
    ' "type": "http",',
    ' "url": "http://localhost:3001/health",',
    ' "expect_status": 200,',
    ' "interval_s": 1,',
    ' "timeout_s": 60',
    ' }',
    ' }',
    ' ],',
    // JSON.stringify keeps the example valid JSON even if an id contains
    // quotes or backslashes; for ordinary ids the output is unchanged.
    ' "written_by": ' + JSON.stringify(String(agentId)) + ',',
    ' "wi_id": ' + JSON.stringify(String(wiId)),
    '}',
    '```',
    '',
    'Healthcheck types: `http` (GET URL, asserts status code) or `command` (runs a shell command, asserts stdout matches `expect_regex`). For `command`, set `shell: true` and `expect_regex` to a JavaScript regex string.',
    '',
    '### What the engine does for you',
    '',
    '1. Reads your sidecar after you exit.',
    '2. Spawns each spec detached (the working Windows pattern is centralised in the engine — you do **not** need to write `Start-Process` or `spawn({ detached: true })` yourself).',
    '3. Drives the healthcheck loop until each spec passes its first check (within `timeout_s`).',
    '4. **Fails this dispatch (ERROR) if any spec fails its healthcheck.** Surviving siblings stay alive; failing PIDs are killed.',
    '5. Auto-injects a `## Live managed processes` block into downstream agents\' prompts (scoped to your project) so the next dispatch can find the service without you telling it.',
    '6. Sweeps dead PIDs / TTL-expired specs every ' + (limits.sweepEvery || 30) + ' ticks; kills + unlinks at TTL.',
    '',
    '### Caps the engine enforces (validator rejects anything over)',
    '',
    '- Specs per file: ≤ ' + maxSpecs,
    '- Name: kebab-case, ≤ ' + maxName + ' chars, unique within file',
    '- Executable (`cmd` and any `command` healthcheck cmd): on the engine\'s allowlist (node, bun, npm, npx, python, docker, adb, gradle, mvn, pwsh, …)',
    '- Env keys: on the engine\'s allowlist or matching a known prefix (e.g. `VITE_`, `NEXT_`, `REACT_APP_`, `npm_config_`)',
    '- Ports: 1024–65535, ≤ 20 per spec',
    '- TTL: ≤ ' + maxTtl + ' minutes (hard cap), defaults to ' + defaultTtl + ' if omitted',
    '- `attrs` serialized: ≤ ' + maxAttrsBytes + ' bytes (opaque blob the engine surfaces to downstream agents)',
    '',
    'If your file is invalid the engine marks this dispatch ERROR with `failure_class: invalid-managed-spawn` (non-retryable) — fix the file shape, don\'t retry blindly.',
    '',
    '### Verify before exit',
    '',
    'After you write the file, query the engine to confirm acceptance:',
    '',
    ' curl -s http://localhost:' + dashboardPort + '/api/managed-processes',
    '',
    'Each of your specs should appear with `healthy: true` once the engine\'s healthcheck loop fires (this happens after your agent process exits — the engine drives it). You don\'t need to wait for `healthy: true` yourself; just confirm the file is valid by re-reading it locally.',
    '',
  ];
  return lines.join('\n');
}
510
+
511
+ // ─── Item 2 (P-2d5e8f04) ─────────────────────────────────────────────────────
512
+ //
513
+ // Engine-side spawn + locked state file. The engine calls these from its
514
+ // onAgentClose handler after the managed-spawn sidecar is accepted by
515
+ // evaluateManagedSpawnAcceptance above:
516
+ //
517
+ // 1. openManagedLog(name) → opens an append fd under MINIONS_DIR/
518
+ // engine/managed-logs/<name>.log. Child
519
+ // stdio is wired to this fd (NOT a pipe)
520
+ // so the detached process survives our
521
+ // exit on Windows.
522
+ // 2. spawnManagedSpec(spec, ctx) → uses the proven bin/minions.js
523
+ // spawnDashboard detached-spawn pattern.
524
+ // Returns {pid, started_at, log_path}.
525
+ // 3. recordManagedSpec(spec, → writes one entry to
526
+ // runtime, ctx) engine/managed-processes.json via
527
+ // mutateJsonFileLocked. Replaces any
528
+ // existing entry with the same name
529
+ // (idempotent under retry).
530
+ // 4. recordManagedBatch(items, → same but one lock for N specs (the
531
+ // ctx) close-handler call site spawns each
532
+ // then persists them together).
533
+ // 5. removeManagedSpec(name) → locked unlink of the entry; best-effort
534
+ // process.kill of the recorded PID
535
+ // OUTSIDE the lock callback. No-op when
536
+ // the entry is missing.
537
+ // 6. listManagedSpecs({project}) → reads the state file, optionally
538
+ // filters by owner_project. Used later
539
+ // by item 4 (dashboard) + item 6
540
+ // (playbook injection) + items 3/7
541
+ // (sweep + boot reconcile).
542
+ //
543
+ // All state writes go through mutateJsonFileLocked per the repo convention
544
+ // ('Key conventions' in copilot-instructions.md). Callbacks stay synchronous
545
+ // and fast — no kill / no spawn inside the lock callback. Healthcheck loops,
546
+ // dispatch ERROR gating on healthcheck failure, per-tick sweep, boot
547
+ // reconcile, and dashboard endpoints are deferred to items 3/4/5/7.
548
+
549
// Absolute location of the managed-processes state file:
// <MINIONS_DIR>/engine/<MANAGED_PROCESSES_STATE_FILE>.
function _getStatePath() {
  const engineDir = path.join(shared.MINIONS_DIR, 'engine');
  return path.join(engineDir, MANAGED_PROCESSES_STATE_FILE);
}
552
+
553
// Directory that receives per-service log files:
// <MINIONS_DIR>/engine/<MANAGED_LOGS_DIR>.
function _getLogsDir() {
  const engineDir = path.join(shared.MINIONS_DIR, 'engine');
  return path.join(engineDir, MANAGED_LOGS_DIR);
}
556
+
557
/**
 * Open the log file for a managed service via shared.openAppendLogFd.
 *
 * @param {string} name  Service name; must be non-empty kebab-case.
 * @returns {{fd: *, logPath: *}} Values returned by shared.openAppendLogFd
 *   for "<name>.log" inside the managed-logs directory.
 * @throws {Error} When `name` is not a non-empty kebab-case string.
 */
function openManagedLog(name) {
  const isKebabName = typeof name === 'string' && name.length > 0 && _KEBAB_RE.test(name);
  if (!isKebabName) {
    throw new Error('openManagedLog: name must be a non-empty kebab-case string');
  }
  // P-8a4d6f29 — opening (and the rotation policy that goes with it) is
  // delegated to the shared helper so every engine log is opened the same way.
  const opened = shared.openAppendLogFd(name + '.log', _getLogsDir());
  return { fd: opened.fd, logPath: opened.logPath };
}
569
+
570
// Host variables that are copied into every managed child's environment when
// the spec does not set them itself. Per the original design note, detached
// node/bun children fail on Windows without these. Host variables outside
// this list (other than PATH) are deliberately not forwarded — a spec must
// declare what it needs in spec.env.
const _WIN_ESSENTIAL_ENV_KEYS = [
  'SYSTEMROOT', 'SYSTEMDRIVE', 'WINDIR',
  'USERPROFILE', 'APPDATA', 'LOCALAPPDATA',
  'TEMP', 'TMP', 'HOMEDRIVE', 'HOMEPATH', 'HOME',
  'PROCESSOR_ARCHITECTURE', 'PATHEXT', 'COMSPEC', 'OS',
];

// Compose the environment for a managed child: a copy of the spec's env,
// plus the host PATH when the spec leaves PATH empty, plus each essential
// host variable the spec did not define. The input object is not modified.
function _buildChildEnv(specEnv) {
  const childEnv = Object.assign({}, specEnv || {});
  const hostEnv = process.env;

  if (!childEnv.PATH && hostEnv.PATH) {
    childEnv.PATH = hostEnv.PATH;
  }
  _WIN_ESSENTIAL_ENV_KEYS.forEach((key) => {
    const missingInSpec = childEnv[key] == null;
    const presentOnHost = hostEnv[key] != null;
    if (missingInSpec && presentOnHost) {
      childEnv[key] = hostEnv[key];
    }
  });
  return childEnv;
}
590
+
591
/**
 * Spawn one validated managed-spawn spec as a detached child whose stdout and
 * stderr go to the spec's log file.
 *
 * @param {object} spec   Validated spec (uses name, cmd, args, cwd, env).
 * @param {object} [ctx]  Ownership info used only for the log line
 *                        (owner_project, owner_wi).
 * @returns {{pid: number, started_at: number, log_path: string}}
 * @throws {Error} When spec/cmd is missing, cmd is not on the executable
 *   allowlist, the log cannot be opened, or the child has no pid after spawn.
 */
function spawnManagedSpec(spec, ctx) {
  if (!spec || typeof spec !== 'object') throw new Error('spawnManagedSpec: spec required');
  if (typeof spec.cmd !== 'string' || spec.cmd.length === 0) {
    throw new Error('spawnManagedSpec: spec.cmd required');
  }
  const limits = ENGINE_DEFAULTS.managedSpawn || {};
  // Defensive re-check of the executable allowlist. The validator gates this
  // at sidecar-read time too; this guards future direct callers and tests.
  if (!_isOnAllowlist(spec.cmd, limits.executableAllowlist)) {
    throw new Error('spawnManagedSpec: cmd not-on-allowlist (' + spec.cmd + ')');
  }
  ctx = ctx || {};
  const { fd: logFd, logPath } = openManagedLog(spec.name);
  const cwd = (typeof spec.cwd === 'string' && spec.cwd.length > 0) ? spec.cwd : undefined;
  const env = _buildChildEnv(spec.env);
  const argv = Array.isArray(spec.args) ? spec.args : [];
  let child;
  try {
    // Working Windows-correct detached spawn — same shape as
    // bin/minions.js spawnDashboard. DO NOT switch stdio to 'pipe' or
    // 'inherit': pipes die on EPIPE once the parent exits.
    child = spawn(spec.cmd, argv, {
      cwd: cwd,
      env: env,
      detached: true,
      stdio: ['ignore', logFd, logFd],
      windowsHide: true,
    });
  } catch (e) {
    try { fs.closeSync(logFd); } catch (_e) {}
    throw e;
  }
  // A failed spawn (e.g. ENOENT) is reported asynchronously through the
  // child's 'error' event. With no listener Node treats that event as an
  // uncaught exception, which would take the whole engine down even though
  // the missing pid is already handled below. Log it instead.
  if (child && typeof child.on === 'function') {
    child.on('error', (err) => {
      log('warn', 'managed-spawn child error: name=' + spec.name + ' cmd=' + spec.cmd
        + ': ' + (err && err.message ? err.message : String(err)));
    });
  }
  // Close our copy of the fd in the parent — the child holds its own dup.
  try { fs.closeSync(logFd); } catch (_e) {}
  if (!child || !child.pid) {
    throw new Error('spawnManagedSpec: spawn failed for ' + spec.cmd);
  }
  // Detach from our event loop so the engine can exit without the child.
  child.unref();
  const startedAt = Date.now();
  log('info', 'managed-spawn born: name=' + spec.name + ' pid=' + child.pid
    + ' owner_project=' + (ctx.owner_project || '')
    + ' owner_wi=' + (ctx.owner_wi || ''));
  return { pid: child.pid, started_at: startedAt, log_path: logPath };
}
635
+
636
// Fresh, empty state-file contents. A new object (and a new array) is built
// on every call so callers never share mutable state.
function _initialStateShape() {
  const emptyState = { specs: [] };
  return emptyState;
}
639
+
640
/**
 * Build the state-file entry for a spawned spec.
 *
 * @param {object} spec       Validated spec.
 * @param {object} [runtime]  { pid, started_at, log_path } from the spawn.
 * @param {object} [ctx]      { owner_agent, owner_wi, owner_project }.
 * @returns {object} A record whose array/object fields are copies of the
 *   spec's, with `healthy: false`, `alive: true`, `last_health_at: null` and
 *   `ttl_expires_at` = start time + TTL in milliseconds.
 */
function _toStateRecord(spec, runtime, ctx) {
  const limits = ENGINE_DEFAULTS.managedSpawn || {};
  const fallbackTtl = Math.max(1, Number(limits.defaultTtlMinutes) || 240);
  const rt = runtime || {};
  const owner = ctx || {};

  // A missing or non-positive TTL falls back to the engine default.
  const hasTtl = Number.isFinite(spec.ttl_minutes) && spec.ttl_minutes > 0;
  const ttlMinutes = hasTtl ? spec.ttl_minutes : fallbackTtl;
  const startedAt = Number.isFinite(rt.started_at) ? rt.started_at : Date.now();

  // JSON round-trip yields a detached deep copy of plain data.
  const deepCopy = (value) => JSON.parse(JSON.stringify(value));
  const isObjectLike = (value) => Boolean(value) && typeof value === 'object';

  return {
    name: spec.name,
    pid: Number.isInteger(rt.pid) ? rt.pid : null,
    owner_agent: owner.owner_agent || '',
    owner_wi: owner.owner_wi || '',
    owner_project: owner.owner_project || '',
    cmd: spec.cmd,
    args: Array.isArray(spec.args) ? spec.args.slice() : [],
    cwd: typeof spec.cwd === 'string' ? spec.cwd : '',
    env: Object.assign({}, spec.env || {}),
    ports: Array.isArray(spec.ports) ? spec.ports.slice() : [],
    attrs: isObjectLike(spec.attrs) ? deepCopy(spec.attrs) : {},
    healthcheck: isObjectLike(spec.healthcheck) ? deepCopy(spec.healthcheck) : null,
    started_at: startedAt,
    ttl_expires_at: startedAt + (ttlMinutes * 60 * 1000),
    last_health_at: null,
    healthy: false,
    alive: true,
    log_path: rt.log_path || '',
  };
}
669
+
670
/**
 * Upsert a single spec's record into the managed-processes state file under
 * the file lock. An existing entry with the same name is replaced in place;
 * otherwise the record is appended. Unusable existing state is reset.
 *
 * @throws {Error} When spec or spec.name is missing.
 */
function recordManagedSpec(spec, runtime, ctx) {
  if (!spec || !spec.name) throw new Error('recordManagedSpec: spec.name required');

  const upsert = (current) => {
    const usable = Boolean(current) && typeof current === 'object'
      && !Array.isArray(current) && Array.isArray(current.specs);
    const state = usable ? current : _initialStateShape();
    const position = state.specs.findIndex((existing) => existing && existing.name === spec.name);
    const record = _toStateRecord(spec, runtime, ctx);
    if (position < 0) {
      state.specs.push(record);
    } else {
      state.specs[position] = record;
    }
    return state;
  };

  shared.mutateJsonFileLocked(_getStatePath(), upsert, { defaultValue: _initialStateShape() });
}
684
+
685
/**
 * Upsert several spec records under a single lock acquisition.
 *
 * @param {Array<{spec: object, runtime: object}>} items  Entries without a
 *   named spec are skipped. A non-array or empty list is a no-op (the state
 *   file is not touched).
 * @param {object} [ctx]  Ownership info applied to every record.
 */
function recordManagedBatch(items, ctx) {
  if (!Array.isArray(items) || items.length === 0) return;

  const upsertAll = (current) => {
    const usable = Boolean(current) && typeof current === 'object'
      && !Array.isArray(current) && Array.isArray(current.specs);
    const state = usable ? current : _initialStateShape();

    items.forEach((item) => {
      const spec = item && item.spec;
      const runtime = item && item.runtime;
      if (!spec || !spec.name) return;
      const position = state.specs.findIndex((existing) => existing && existing.name === spec.name);
      const record = _toStateRecord(spec, runtime, ctx);
      if (position < 0) {
        state.specs.push(record);
      } else {
        state.specs[position] = record;
      }
    });
    return state;
  };

  shared.mutateJsonFileLocked(_getStatePath(), upsertAll, { defaultValue: _initialStateShape() });
}
704
+
705
/**
 * Remove the named entry from the state file and then try to kill the PID it
 * recorded. Does nothing for an empty/non-string name or an unknown entry.
 * A failed kill is logged as a warning, never thrown.
 */
function removeManagedSpec(name) {
  if (typeof name !== 'string' || name.length === 0) return;

  let pidToKill = null;
  shared.mutateJsonFileLocked(_getStatePath(), (data) => {
    const entries = data && Array.isArray(data.specs) ? data.specs : null;
    if (entries === null) return data;
    const position = entries.findIndex((entry) => entry && entry.name === name);
    if (position >= 0) {
      const [removed] = entries.splice(position, 1);
      // Only remember the PID here; the kill itself happens after the lock.
      if (removed && Number.isInteger(removed.pid) && removed.pid > 0) {
        pidToKill = removed.pid;
      }
    }
    return data;
  }, { defaultValue: _initialStateShape() });

  if (pidToKill === null) return;
  // Kill OUTSIDE the lock — never run process ops inside a lock callback
  // (copilot-instructions.md "Keep lock callbacks synchronous and fast").
  try {
    shared.killByPidImmediate(pidToKill);
  } catch (e) {
    log('warn', 'managed-spawn removeManagedSpec: kill ' + pidToKill + ' failed: ' + e.message);
  }
}
725
+
726
/**
 * Read the managed-processes state file and return its entries.
 *
 * @param {object} [opts]
 * @param {string} [opts.project]  When truthy, only entries whose
 *   owner_project equals it are returned.
 * @returns {Array} A new array; empty when the file is missing, unreadable,
 *   not valid JSON, or has no `specs` array.
 */
function listManagedSpecs(opts) {
  const project = (opts || {}).project;

  let parsed;
  try {
    // Read and parse failures are both reported as "no managed specs".
    parsed = JSON.parse(fs.readFileSync(_getStatePath(), 'utf8'));
  } catch (_e) {
    return [];
  }

  const entries = parsed && Array.isArray(parsed.specs) ? parsed.specs : [];
  if (!project) return entries.slice();
  return entries.filter((entry) => entry && entry.owner_project === project);
}
739
+
740
+ // ─── Item 3 (P-9c1f47a6) ─────────────────────────────────────────────────────
741
+ //
742
+ // Healthcheck implementations + dispatch SUCCESS/ERROR gate.
743
+ //
744
+ // runHealthcheck(spec) → fires one probe (http or command), resolves
745
+ // {healthy, error, lastCheckAt}. Pure — no
746
+ // state mutation, no PID kills. Used by both
747
+ // waitForFirstHealth and the per-spec
748
+ // liveness loop a future item wires in.
749
+ // waitForFirstHealth(spec,opts)→ self-scheduled async loop. Polls
750
+ // runHealthcheck every spec.healthcheck
751
+ // .interval_s. On first healthy → flips the
752
+ // state file's `healthy: true` +
753
+ // `last_health_at` (locked write) and
754
+ // resolves {healthy:true}. On
755
+ // spec.healthcheck.timeout_s elapsed
756
+ // without a pass → resolves
757
+ // {healthy:false, error:'timeout: ...'}
758
+ // without throwing (caller decides what to
759
+ // do with the rejection — the engine close-
760
+ // handler maps it to dispatch ERROR with
761
+ // FAILURE_CLASS.MANAGED_SPAWN_HEALTHCHECK_FAILED).
762
+ // tailManagedLog(name, lines) → reads up to N tail bytes of the named log
763
+ // and returns the last `lines` lines joined.
764
+ // Used by the engine close-handler to attach
765
+ // log evidence to inbox alerts on
766
+ // healthcheck failure.
767
+ //
768
+ // Per the plan, healthcheck loops are PER-SPEC and self-scheduled — NEVER
769
+ // driven from the tick cycle. The tick coupling regression was the original
770
+ // design constraint that drove the entire architecture choice.
771
+ // (No state writes inside the lock callback except `healthy` + `last_health_at`
772
+ // at first-pass; idle liveness updates batch every healthBackoffSec and land
773
+ // in a follow-up item.)
774
+
775
+ const _HC_CMD_TIMEOUT_MS = 5000; // hard ceiling for one healthcheck probe
776
+
777
/**
 * Fire one HTTP(S) GET and report whether the response status matches.
 *
 * The returned promise always resolves (never rejects) with
 * `{healthy, error}`. The status is compared once the response body has been
 * fully consumed.
 *
 * @param {string} url - absolute http: or https: URL.
 * @param {number} expectStatus - status code that counts as healthy.
 * @param {number} timeoutMs - budget for the whole probe in milliseconds.
 * @returns {Promise<{healthy: boolean, error: (string|null)}>}
 */
function _httpProbe(url, expectStatus, timeoutMs) {
  return new Promise((resolve) => {
    let settled = false;
    let req = null;
    let deadlineTimer = null;

    const finish = (result) => {
      if (settled) return;
      settled = true;
      if (deadlineTimer !== null) clearTimeout(deadlineTimer);
      resolve(result);
    };

    const failOnTimeout = () => {
      if (req) {
        try { req.destroy(new Error('timeout')); } catch (_e) {}
      }
      finish({ healthy: false, error: 'http request timeout after ' + timeoutMs + 'ms' });
    };

    try {
      const client = url.startsWith('https:') ? https : http;
      req = client.get(url, { timeout: timeoutMs }, (res) => {
        // Consume the body so the socket can close even on no listeners.
        res.resume();
        res.on('end', () => {
          if (res.statusCode === expectStatus) finish({ healthy: true, error: null });
          else finish({ healthy: false, error: 'http status ' + res.statusCode + ' (expected ' + expectStatus + ')' });
        });
        res.on('error', (e) => finish({ healthy: false, error: 'http response error: ' + e.message }));
      });
      req.on('error', (e) => finish({ healthy: false, error: 'http request error: ' + e.message }));
      // The `timeout` option only fires after the socket has been idle for
      // timeoutMs, so it cannot bound a response that keeps sending bytes.
      req.on('timeout', failOnTimeout);
      // Hard wall-clock ceiling: without it a never-ending (streaming or
      // slow-drip) response would leave this promise pending forever.
      if (Number.isFinite(timeoutMs) && timeoutMs > 0) {
        deadlineTimer = setTimeout(failOnTimeout, timeoutMs);
      }
    } catch (e) {
      finish({ healthy: false, error: 'http probe threw: ' + e.message });
    }
  });
}
803
+
804
/**
 * Run one healthcheck command and match its stdout against a regex.
 *
 * The returned promise always resolves (never rejects) with
 * `{healthy, error}`. A probe is healthy only when the command exits without
 * error AND stdout matches `expectRegex`.
 *
 * @param {string} cmd - command line handed to child_process.exec.
 * @param {boolean} useShell - see the NOTE on the `shell` option below.
 * @param {string} expectRegex - pattern source compiled with `new RegExp`.
 * @param {number} timeoutMs - exec timeout in milliseconds.
 * @returns {Promise<{healthy: boolean, error: (string|null)}>}
 */
function _commandProbe(cmd, useShell, expectRegex, timeoutMs) {
  return new Promise((resolve) => {
    let regex;
    try { regex = new RegExp(expectRegex); }
    catch (e) { return resolve({ healthy: false, error: 'expect_regex invalid: ' + e.message }); }

    const onDone = (err, stdout, stderr) => {
      if (err && err.killed) {
        return resolve({ healthy: false, error: 'healthcheck command timeout after ' + timeoutMs + 'ms' });
      }
      if (err) {
        // Non-zero exit. Even if stdout would match, an errored command is
        // reported unhealthy, with the stderr tail attached for diagnostics.
        const tail = (stderr || '').trim().slice(-200);
        return resolve({ healthy: false, error: 'healthcheck command exit ' + (err.code != null ? err.code : '?') + (tail ? ': ' + tail : '') });
      }
      const out = String(stdout || '');
      if (regex.test(out)) return resolve({ healthy: true, error: null });
      return resolve({ healthy: false, error: 'healthcheck stdout did not match regex /' + expectRegex + '/' });
    };

    try {
      exec(cmd, {
        timeout: timeoutMs,
        // NOTE(review): child_process.exec always runs the command through a
        // shell and only documents a string (shell path) for this option, so
        // `false` here probably does NOT disable the shell. Confirm, and move
        // to execFile/spawn if shell-less probes are required.
        shell: useShell ? undefined : false,
        windowsHide: true,
        maxBuffer: 1024 * 1024,
      }, onDone);
    } catch (e) {
      // exec validates its arguments synchronously (e.g. a non-string cmd);
      // report that as an unhealthy probe instead of rejecting, mirroring
      // _httpProbe's "threw" path.
      resolve({ healthy: false, error: 'healthcheck command threw: ' + e.message });
    }
  });
}
832
+
833
/**
 * Run a single healthcheck probe for `spec` and stamp the result.
 *
 * Dispatches on `spec.healthcheck.type` ('http' or 'command'); any other
 * type yields an unhealthy result. The per-probe timeout is
 * `healthcheck.timeout_s` (default 5) in milliseconds, clamped to the range
 * 500 .. _HC_CMD_TIMEOUT_MS.
 *
 * @param {object} spec - record carrying a `healthcheck` object.
 * @returns {Promise<{healthy: boolean, error: (string|null), lastCheckAt: number}>}
 */
async function runHealthcheck(spec) {
  const hc = spec ? spec.healthcheck : null;
  if (!hc || typeof hc !== 'object') {
    return { healthy: false, error: 'spec.healthcheck missing', lastCheckAt: Date.now() };
  }

  const requestedMs = (Number(hc.timeout_s) || 5) * 1000;
  const timeoutMs = Math.min(_HC_CMD_TIMEOUT_MS, Math.max(500, requestedMs));

  let outcome;
  switch (hc.type) {
    case 'http':
      outcome = await _httpProbe(hc.url, Number(hc.expect_status), timeoutMs);
      break;
    case 'command':
      outcome = await _commandProbe(hc.cmd, !!hc.shell, hc.expect_regex, timeoutMs);
      break;
    default:
      outcome = { healthy: false, error: 'healthcheck type unknown: ' + hc.type };
      break;
  }

  outcome.lastCheckAt = Date.now();
  return outcome;
}
850
+
851
// Set `healthy: true` and `last_health_at` on the state row called `name`
// through one shared.mutateJsonFileLocked call. waitForFirstHealth uses this
// on the first passing probe. Call it from outside any other lock callback;
// it performs its own locked mutation.
function _markHealthy(name, now) {
  shared.mutateJsonFileLocked(_getStatePath(), (state) => {
    const rows = (state && Array.isArray(state.specs)) ? state.specs : null;
    const row = rows ? rows.find((candidate) => candidate && candidate.name === name) : undefined;
    if (row) {
      row.healthy = true;
      row.last_health_at = now;
    }
    // Unknown names and malformed state are passed back untouched.
    return state;
  }, { defaultValue: _initialStateShape() });
}
865
+
866
/**
 * Poll runHealthcheck(spec) until it first reports healthy or the spec's
 * healthcheck timeout elapses.
 *
 * Timing comes from `spec.healthcheck`: `interval_s` (default 1, floor
 * 100 ms) between probes and `timeout_s` (default 30, never below one
 * interval) for the whole wait. The first probe runs immediately. On the
 * first healthy probe the state row is updated via _markHealthy (a failed
 * state write is only logged).
 *
 * The returned promise never rejects because of a probe: a probe that throws
 * or rejects is treated as an unhealthy result and polling continues.
 *
 * @param {object} spec - needs `name` and `healthcheck`.
 * @param {object} [opts] - accepted for interface compatibility; unused.
 * @returns {Promise<{healthy: boolean, error: (string|null), lastCheckAt: number}>}
 */
function waitForFirstHealth(spec, opts) {
  const hc = spec.healthcheck;
  const intervalMs = Math.max(100, (Number(hc && hc.interval_s) || 1) * 1000);
  const timeoutMs = Math.max(intervalMs, (Number(hc && hc.timeout_s) || 30) * 1000);
  const deadline = Date.now() + timeoutMs;

  return new Promise((resolve) => {
    let stopped = false;
    let lastError = null;

    const tick = async () => {
      if (stopped) return;

      let result;
      try {
        result = await runHealthcheck(spec);
      } catch (e) {
        // Without this catch a rejected probe would leave the outer promise
        // pending forever and surface as an unhandled rejection.
        result = {
          healthy: false,
          error: 'healthcheck probe threw: ' + (e && e.message ? e.message : String(e)),
          lastCheckAt: Date.now(),
        };
      }
      if (stopped) return;

      if (result.healthy) {
        stopped = true;
        try { _markHealthy(spec.name, result.lastCheckAt); }
        catch (e) { log('warn', 'managed-spawn waitForFirstHealth: state write failed for ' + spec.name + ': ' + e.message); }
        return resolve({ healthy: true, error: null, lastCheckAt: result.lastCheckAt });
      }

      lastError = result.error;
      const now = Date.now();
      if (now >= deadline) {
        stopped = true;
        return resolve({
          healthy: false,
          error: 'timeout: spec ' + spec.name + ' did not become healthy within ' + Math.round(timeoutMs / 1000) + 's (last: ' + (lastError || 'no probes ran') + ')',
          lastCheckAt: result.lastCheckAt,
        });
      }

      // Never sleep past the deadline: the last probe runs at the deadline
      // instead of up to one full interval after it.
      setTimeout(tick, Math.min(intervalMs, deadline - now));
    };

    // First probe fires immediately so a fast service doesn't pay an
    // interval_s delay.
    tick();
  });
}
900
+
901
/**
 * Return the last `lines` lines of the managed log `<logsDir>/<name>.log`.
 *
 * @param {string} name - spec name used to build the log file name.
 * @param {number} [lines=100] - number of lines wanted, clamped to 1..1000.
 * @returns {string} The requested lines joined with '\n', without a trailing
 *   newline; '' when the log cannot be read or is empty.
 */
function tailManagedLog(name, lines) {
  const wanted = Math.max(1, Math.min(1000, Number(lines) || 100));
  const logPath = path.join(_getLogsDir(), name + '.log');

  let raw;
  try { raw = fs.readFileSync(logPath, 'utf8'); }
  catch (_e) { return ''; }

  // Drop the final line terminator first. Otherwise split() yields an empty
  // last element, so every tail came back one line short and a request for
  // one line returned ''.
  const body = raw.replace(/\r?\n$/, '');
  if (body === '') return '';
  return body.split(/\r?\n/).slice(-wanted).join('\n');
}
910
+
911
+ // ─── Item 6 (P-1f9c3a45) ─────────────────────────────────────────────────────
912
+ //
913
+ // Playbook auto-inject of the live-managed-processes block. Called by
914
+ // engine/playbook.js renderPlaybook (project-scoped, computed once per render).
915
+ // Filters listManagedSpecs() to specs where:
916
+ // - owner_project === opts.project
917
+ // - healthy === true
918
+ // - alive === true
919
+ //
920
+ // Returns '' when nothing matches — playbook caller can append unconditionally.
921
+ // Serialized payload is capped at ENGINE_DEFAULTS.managedSpawn.promptContextMaxBytes
922
+ // (default 2048). When the full block exceeds the cap, falls back to a compact
923
+ // list (name + base_url + ports), keeping the dashboard endpoint footer so the
924
+ // dispatched agent can still discover the rest via /api/managed-processes.
925
+ //
926
+ // Project arg is required — never inject "all specs" to avoid cross-project
927
+ // port/URL leakage (see plan §"Auto-injected prompt context could leak…").
928
/**
 * Build the markdown block listing a project's live managed processes.
 *
 * Only specs returned by listManagedSpecs({project}) with `healthy === true`
 * and `alive === true` are listed. The full rendering is returned when it
 * fits within the byte cap (ENGINE_DEFAULTS.managedSpawn.promptContextMaxBytes,
 * default 2048, floor 256). Otherwise a compact one-line-per-spec list is
 * used, and trailing entries are dropped (with a "+N more" marker) until it
 * fits as well. The header and the footer pointing at the dashboard endpoint
 * are always kept, so with a very small cap the result can still exceed it.
 *
 * @param {{project: string, dashboardPort?: number}} opts - `project` is
 *   required; `dashboardPort` defaults to 7331.
 * @returns {string} The block, or '' when there is no project, the state
 *   cannot be listed, or nothing is live.
 */
function buildLiveManagedProcessesBlock(opts) {
  opts = opts || {};
  const project = typeof opts.project === 'string' ? opts.project : '';
  if (!project) return '';
  let specs;
  try { specs = listManagedSpecs({ project: project }); }
  catch (_e) { return ''; }
  if (!Array.isArray(specs)) return '';
  specs = specs.filter(s => s && s.healthy === true && s.alive === true);
  if (specs.length === 0) return '';

  const limits = ENGINE_DEFAULTS.managedSpawn || {};
  const cap = Math.max(256, Number(limits.promptContextMaxBytes) || 2048);
  const portIn = Number(opts.dashboardPort);
  const dashboardPort = Number.isFinite(portIn) && portIn > 0 ? portIn : 7331;
  const dashboardUrl = 'http://localhost:' + dashboardPort + '/api/managed-processes?project='
    + encodeURIComponent(project);

  const header = [
    '',
    '',
    '---',
    '',
    '## Live managed processes for project ' + project,
    '',
    'These services were spawned by earlier work items in this project and are currently healthy. Reuse them — do not re-spawn duplicates. Ports, URLs, and PIDs are owned by the engine; query `/api/managed-processes` if you need fresh state.',
    '',
  ];

  // ── Full rendering — name + pid + ports + attrs + log + ttl per spec ───
  const fullLines = header.slice();
  for (const s of specs) {
    fullLines.push('### ' + s.name);
    fullLines.push('');
    if (Number.isInteger(s.pid) && s.pid > 0) fullLines.push('- pid: ' + s.pid);
    if (Array.isArray(s.ports) && s.ports.length) {
      fullLines.push('- ports: ' + s.ports.join(', '));
    }
    if (s.attrs && typeof s.attrs === 'object') {
      const attrKeys = Object.keys(s.attrs);
      if (attrKeys.length) {
        const parts = attrKeys.map(k => k + ': ' + JSON.stringify(s.attrs[k]));
        fullLines.push('- attrs: ' + parts.join('; '));
      }
    }
    if (typeof s.log_path === 'string' && s.log_path) fullLines.push('- log: ' + s.log_path);
    if (Number.isFinite(s.ttl_expires_at)) {
      fullLines.push('- ttl_expires_at: ' + new Date(s.ttl_expires_at).toISOString());
    }
    fullLines.push('');
  }
  fullLines.push('Full details / kill / restart: `curl -s ' + dashboardUrl + '`');
  fullLines.push('');
  const full = fullLines.join('\n');
  if (Buffer.byteLength(full, 'utf8') <= cap) return full;

  // ── Compact fallback — name + base_url + ports ─────────────────────────
  const compactEntries = specs.map((s) => {
    const baseUrl = (s.attrs && typeof s.attrs.base_url === 'string') ? s.attrs.base_url : '';
    const portStr = (Array.isArray(s.ports) && s.ports.length) ? ' (ports ' + s.ports.join(',') + ')' : '';
    return '- **' + s.name + '**' + (baseUrl ? ' — ' + baseUrl : '') + portStr;
  });
  const footer = '_' + specs.length + ' live service(s) — full details truncated above the '
    + cap + '-byte prompt cap. Query `' + dashboardUrl + '` for attrs, logs, and TTL._';

  // Render the compact block with only the first `shown` entries listed.
  const renderCompact = (shown) => {
    const lines = header.concat(compactEntries.slice(0, shown));
    const hidden = compactEntries.length - shown;
    if (hidden > 0) lines.push('- (+' + hidden + ' more not listed)');
    lines.push('', footer, '');
    return lines.join('\n');
  };

  // The compact list can itself overflow the cap when many services are
  // live, so shed trailing entries until it fits. When everything fits the
  // output is the plain compact list with no "+N more" marker.
  let shown = compactEntries.length;
  let compact = renderCompact(shown);
  while (shown > 0 && Buffer.byteLength(compact, 'utf8') > cap) {
    shown -= 1;
    compact = renderCompact(shown);
  }
  return compact;
}
997
+
998
+ // ─── Item 4 (P-4b8d2e57) ─────────────────────────────────────────────────────
999
+ //
1000
+ // Discovery API helpers: ETag fingerprinting for list endpoints and
1001
+ // state-driven respawn from a previously recorded spec.
1002
+ //
1003
+ // computeManagedSpecsEtag({project}) → returns a stable 16-char sha1 prefix
1004
+ // over the JSON-serialised list (filtered
1005
+ // by project if supplied). Same content
1006
+ // → same etag; ANY field change → new
1007
+ // etag. Honored by the dashboard
1008
+ // endpoints via If-None-Match → 304.
1009
+ // restartManagedSpec(name) → looks up the spec by name, kills the
1010
+ // old PID (if alive), re-spawns it using
1011
+ // the saved spec shape (cmd/args/cwd/
1012
+ // env/healthcheck), and replaces the
1013
+ // state row with healthy:false +
1014
+ // alive:true + new pid + new
1015
+ // started_at. Throws if the name is
1016
+ // unknown — caller maps to HTTP 404.
1017
+ // Healthcheck loop for the new PID is
1018
+ // the caller's job (the dashboard
1019
+ // endpoint kicks it off async; the
1020
+ // boot-reconcile path / item 7 does too
1021
+ // on engine restart).
1022
+
1023
+ const crypto = require('crypto');
1024
+
1025
/**
 * Fingerprint the current spec list for use as an ETag.
 *
 * @param {{project?: string}} [opts] - forwarded to listManagedSpecs.
 * @returns {string} First 16 hex characters of the SHA-1 of the JSON-
 *   serialised list, sorted by name so the order of rows in the state file
 *   does not affect the result.
 */
function computeManagedSpecsEtag(opts) {
  const specs = listManagedSpecs(opts || {});
  // The unfiltered list can contain non-object rows (e.g. null), so read the
  // sort key defensively instead of dereferencing `.name` directly.
  const keyOf = (row) => ((row && typeof row.name === 'string') ? row.name : '');
  // Plain code-unit comparison keeps the ordering, and therefore the etag,
  // independent of the process locale (localeCompare is not).
  const sorted = specs.slice().sort((a, b) => {
    const left = keyOf(a);
    const right = keyOf(b);
    if (left < right) return -1;
    if (left > right) return 1;
    return 0;
  });
  const json = JSON.stringify(sorted);
  return crypto.createHash('sha1').update(json).digest('hex').slice(0, 16);
}
1033
+
1034
/**
 * Respawn a previously recorded spec from its persisted state row.
 *
 * Looks the row up by name, kills the recorded PID (best effort), rebuilds a
 * spec from the row, spawns it via spawnManagedSpec and records the new
 * runtime via recordManagedSpec under the same ownership context.
 *
 * @param {string} name - name of the recorded spec.
 * @returns {object} The runtime object returned by spawnManagedSpec.
 * @throws {Error} When `name` is not a non-empty string or no row matches.
 */
function restartManagedSpec(name) {
  const validName = typeof name === 'string' && name.length > 0;
  if (!validName) {
    throw new Error('restartManagedSpec: name required');
  }

  const prior = listManagedSpecs().find((row) => row && row.name === name);
  if (!prior) {
    throw new Error('restartManagedSpec: spec not found: ' + name);
  }

  // Stop the previous process first; a failed kill is logged, not fatal.
  const oldPid = prior.pid;
  if (Number.isInteger(oldPid) && oldPid > 0) {
    try {
      shared.killByPidImmediate(oldPid);
    } catch (e) {
      log('warn', 'restartManagedSpec: kill of old PID ' + oldPid + ' failed: ' + e.message);
    }
  }

  // Project the state row back onto a sidecar-shaped spec. Arrays and env
  // are copied; attrs and healthcheck are passed through by reference.
  const spec = {
    name: prior.name,
    cmd: prior.cmd,
    args: Array.isArray(prior.args) ? prior.args.slice() : [],
    cwd: prior.cwd || '',
    env: Object.assign({}, prior.env || {}),
    ports: Array.isArray(prior.ports) ? prior.ports.slice() : [],
    ttl_minutes: undefined, // left unset; whatever records the row applies its own TTL handling
    attrs: prior.attrs && typeof prior.attrs === 'object' ? prior.attrs : {},
    healthcheck: prior.healthcheck || null,
  };
  const ctx = {
    owner_agent: prior.owner_agent || '',
    owner_wi: prior.owner_wi || '',
    owner_project: prior.owner_project || '',
  };

  const runtime = spawnManagedSpec(spec, ctx);
  recordManagedSpec(spec, runtime, ctx);
  log('info', 'managed-spawn restart: name=' + name + ' old_pid=' + oldPid + ' new_pid=' + runtime.pid);
  return runtime;
}
1073
+
1074
+ // ─── Item 7 (P-8a4d6f29) ─────────────────────────────────────────────────────
1075
+ //
1076
+ // TTL sweep + boot reconcile + project-removal cleanup + log rotation.
1077
+ //
1078
+ // sweepManagedSpawn(opts) → tick-driven walk of the state file:
1079
+ // 1. probe each pid (process.kill 0);
1080
+ // dead-but-not-expired entries are
1081
+ // dropped from state (no kill —
1082
+ // the OS already reaped them).
1083
+ // 2. ttl_expires_at past now → batch
1084
+ // kill via killByPidsImmediate +
1085
+ // drop from state.
1086
+ // 3. rotate log_path when size >
1087
+ // logRotateBytes (rename to .1,
1088
+ // overwrite any prior .1).
1089
+ // Returns {scanned, ttlExpired,
1090
+ // deadDropped, killedPids,
1091
+ // rotatedLogs, malformed}.
1092
+ // bootReconcileManagedSpawn(opts) → one-shot equivalent for the engine
1093
+ // boot path. Same drop-dead + kill-
1094
+ // expired pass, plus a single
1095
+ // runHealthcheck() probe per
1096
+ // surviving spec to refresh
1097
+ // `healthy` / `last_health_at`.
1098
+ // Returns a Promise so callers can
1099
+ // Promise.race it against the
1100
+ // bootReconcileMaxMs ceiling.
1101
+ // removeManagedSpecsForProject(name) → centralised project-removal hook.
1102
+ // Kills + drops every spec whose
1103
+ // owner_project matches, unlinks the
1104
+ // log + log.1. Returns {killed,
1105
+ // unlinked}. engine/projects.js
1106
+ // removeProject calls this — no
1107
+ // managed-process awareness elsewhere.
1108
+ //
1109
+ // Per the plan, healthcheck loops stay PER-SPEC and self-scheduled — the tick
1110
+ // cycle never iterates all specs to drive a probe. The sweep ONLY handles
1111
+ // liveness/TTL/log-rotation; it does not re-attach healthcheck timers
1112
+ // (boot reconcile does that once at startup; the engine close-handler does it
1113
+ // at first spawn).
1114
+
1115
/**
 * Rotate `logPath` to `logPath + '.1'` when it is larger than `cap` bytes.
 *
 * @param {string} logPath - log file to inspect.
 * @param {number} cap - size threshold in bytes; must be finite and > 0.
 * @returns {boolean} true only when the file was renamed; false when the
 *   path is unusable, the file cannot be stat'ed, it is within the cap, or
 *   the rotation failed (failures are logged as warnings).
 */
function _rotateManagedLog(logPath, cap) {
  if (typeof logPath !== 'string' || !logPath) return false;

  let bytes;
  try {
    bytes = fs.statSync(logPath).size;
  } catch (_e) {
    return false;
  }

  const capUsable = Number.isFinite(cap) && cap > 0;
  if (!capUsable || bytes <= cap) return false;

  const archivePath = logPath + '.1';
  try {
    // Remove any earlier archive; a missing one is fine, anything else aborts.
    try {
      fs.unlinkSync(archivePath);
    } catch (e) {
      if (e && e.code !== 'ENOENT') throw e;
    }
    fs.renameSync(logPath, archivePath);
  } catch (e) {
    log('warn', 'managed-spawn rotate: ' + logPath + ' failed: ' + e.message);
    return false;
  }
  return true;
}
1132
+
1133
/**
 * Walk the state file once: drop TTL-expired and dead rows, kill the PIDs of
 * expired rows that are still alive, and rotate oversized logs of the rows
 * that remain.
 *
 * @param {object} [opts]
 * @param {number} [opts.now] - clock override in ms (default Date.now()).
 * @param {number} [opts.rotateBytes] - log size threshold (default
 *   ENGINE_DEFAULTS.managedSpawn.logRotateBytes, else 10 MiB).
 * @param {function(number): boolean} [opts.isAlive] - PID liveness probe
 *   (default shared.isPidAlive).
 * @param {function(number[]): number} [opts.killBatch] - batch killer whose
 *   return value is stored in stats.killedPids (default
 *   shared.killByPidsImmediate).
 * @returns {{stats: object, survivors: Array<object>}} Counters plus a
 *   summary ({name, log_path, healthy, last_health_at}) of every kept row.
 */
function _runManagedReconcile(opts) {
  const options = opts || {};
  const limits = ENGINE_DEFAULTS.managedSpawn || {};
  const now = Number.isFinite(options.now) ? options.now : Date.now();

  let rotateBytes = options.rotateBytes;
  if (!Number.isFinite(rotateBytes)) {
    rotateBytes = Number(limits.logRotateBytes) || 10 * 1024 * 1024;
  }
  const isAlive = typeof options.isAlive === 'function' ? options.isAlive : shared.isPidAlive;
  const killBatch = typeof options.killBatch === 'function' ? options.killBatch : shared.killByPidsImmediate;

  const stats = {
    scanned: 0,
    ttlExpired: 0,
    deadDropped: 0,
    killedPids: 0,
    rotatedLogs: 0,
    malformed: 0,
  };
  const expiredLivePids = [];
  const survivors = []; // summaries of kept rows, consumed after the state mutation

  const statePath = _getStatePath();
  shared.mutateJsonFileLocked(statePath, (state) => {
    const wellFormed = state
      && typeof state === 'object'
      && !Array.isArray(state)
      && Array.isArray(state.specs);
    if (!wellFormed) {
      stats.malformed++;
      return _initialStateShape();
    }

    const kept = [];
    for (let i = 0; i < state.specs.length; i += 1) {
      const row = state.specs[i];
      const usable = row && typeof row === 'object' && typeof row.name === 'string';
      if (!usable) {
        stats.malformed++;
        continue;
      }
      stats.scanned++;

      const expired = Number.isFinite(row.ttl_expires_at) && row.ttl_expires_at <= now;
      const running = Number.isInteger(row.pid) && row.pid > 0 && isAlive(row.pid);

      if (expired) {
        // Expired rows leave the state either way; live ones are killed later.
        stats.ttlExpired++;
        if (running) expiredLivePids.push(row.pid);
      } else if (!running) {
        // Dead but not expired: nothing to kill, just forget the row.
        stats.deadDropped++;
      } else {
        kept.push(row);
        survivors.push({
          name: row.name,
          log_path: row.log_path || '',
          healthy: row.healthy === true,
          last_health_at: row.last_health_at || 0,
        });
      }
    }
    state.specs = kept;
    return state;
  }, { defaultValue: _initialStateShape() });

  // Kills and log rotation run only after the state mutation has returned,
  // keeping process and filesystem work out of the callback.
  if (expiredLivePids.length > 0) {
    try {
      stats.killedPids = killBatch(expiredLivePids);
    } catch (e) {
      log('warn', 'managed-spawn sweep: kill batch failed: ' + e.message);
    }
  }
  survivors.forEach((survivor) => {
    if (_rotateManagedLog(survivor.log_path, rotateBytes)) stats.rotatedLogs++;
  });

  return { stats: stats, survivors: survivors };
}
1198
+
1199
/**
 * Run one reconcile pass and return only its counters.
 *
 * @param {object} [opts] - forwarded unchanged to _runManagedReconcile.
 * @returns {object} The `stats` object produced by _runManagedReconcile.
 */
function sweepManagedSpawn(opts) {
  const { stats } = _runManagedReconcile(opts);
  return stats;
}
1202
+
1203
// Write the outcome of a boot-time probe onto the state row called `name`.
// Unlike _markHealthy this also handles a failed probe: a passing probe sets
// `healthy: true` and refreshes `last_health_at`; any other result sets
// `healthy: false` and leaves `last_health_at` untouched. Unknown names and
// malformed state are left as they are.
function _markBootProbe(name, result) {
  const passed = Boolean(result) && result.healthy === true;
  shared.mutateJsonFileLocked(_getStatePath(), (state) => {
    if (!state || !Array.isArray(state.specs)) return state;
    const row = state.specs.find((candidate) => candidate && candidate.name === name);
    if (!row) return state;

    if (!passed) {
      // The survivor failed its boot probe: record that so readers of the
      // state file see it as unhealthy.
      row.healthy = false;
      return state;
    }
    row.healthy = true;
    row.last_health_at = Number.isFinite(result.lastCheckAt) ? result.lastCheckAt : Date.now();
    return state;
  }, { defaultValue: _initialStateShape() });
}
1226
+
1227
/**
 * Boot-time reconcile: run one _runManagedReconcile pass, then re-probe each
 * surviving spec once and write the outcome back with _markBootProbe.
 *
 * Survivors already marked healthy within the last
 * ENGINE_DEFAULTS.managedSpawn.healthBackoffSec seconds (default 30) are not
 * probed. Probes run sequentially. Probe and state-write failures are logged
 * and do not abort the pass.
 *
 * @param {object} [opts] - forwarded to _runManagedReconcile; `opts.now`
 *   (ms) also overrides the clock used for the backoff comparison.
 * @returns {Promise<{stats: object, probed: Array<{name: string, healthy: boolean}>}>}
 */
async function bootReconcileManagedSpawn(opts) {
  opts = opts || {};
  const limits = ENGINE_DEFAULTS.managedSpawn || {};
  const backoffMs = Math.max(0, (Number(limits.healthBackoffSec) || 30) * 1000);
  const now = Number.isFinite(opts.now) ? opts.now : Date.now();
  const { stats, survivors } = _runManagedReconcile(opts);

  // survivors[] omits the `healthcheck` block, so the full rows are needed.
  // Load them lazily and only once: re-reading and re-parsing the state file
  // for every survivor made this loop quadratic in the number of specs.
  let rowsByName = null;
  const lookup = (name) => {
    if (rowsByName === null) {
      rowsByName = new Map();
      for (const row of listManagedSpecs()) {
        // Keep the first row per name, matching what Array#find would return.
        if (row && !rowsByName.has(row.name)) rowsByName.set(row.name, row);
      }
    }
    return rowsByName.get(name);
  };

  const probed = [];
  for (const surv of survivors) {
    if (surv.healthy && (now - surv.last_health_at) < backoffMs) continue;
    const rec = lookup(surv.name);
    if (!rec || !rec.healthcheck) continue;
    try {
      const r = await runHealthcheck({ name: rec.name, healthcheck: rec.healthcheck });
      try { _markBootProbe(rec.name, r); }
      catch (e) { log('warn', 'managed-spawn bootReconcile state write failed for ' + rec.name + ': ' + e.message); }
      probed.push({ name: rec.name, healthy: !!r.healthy });
    } catch (e) {
      log('warn', 'managed-spawn bootReconcile probe failed for ' + rec.name + ': ' + e.message);
    }
  }
  return { stats: stats, probed: probed };
}
1255
+
1256
/**
 * Remove every state row owned by `projectName`, kill the PIDs those rows
 * recorded, and delete their log files (`log_path` and `log_path + '.1'`).
 *
 * @param {string} projectName - value compared against `owner_project`.
 * @returns {{killed: number, unlinked: number, scanned: number}} `killed` is
 *   whatever shared.killByPidsImmediate returned (0 if it threw or nothing
 *   was killed), `unlinked` counts files actually deleted, and `scanned`
 *   counts the PIDs collected for killing. All zero for a non-string or
 *   empty `projectName`.
 */
function removeManagedSpecsForProject(projectName) {
  const usableName = typeof projectName === 'string' && projectName.length > 0;
  if (!usableName) {
    return { killed: 0, unlinked: 0, scanned: 0 };
  }

  const pids = [];
  const logFiles = [];
  const statePath = _getStatePath();
  shared.mutateJsonFileLocked(statePath, (state) => {
    if (!state || typeof state !== 'object' || !Array.isArray(state.specs)) {
      return _initialStateShape();
    }
    const remaining = [];
    for (let i = 0; i < state.specs.length; i += 1) {
      const row = state.specs[i];
      const owned = row && row.owner_project === projectName;
      if (!owned) {
        remaining.push(row);
        continue;
      }
      // Owned rows are dropped; remember what must be cleaned up afterwards.
      if (Number.isInteger(row.pid) && row.pid > 0) pids.push(row.pid);
      if (row.log_path) logFiles.push(row.log_path);
    }
    state.specs = remaining;
    return state;
  }, { defaultValue: _initialStateShape() });

  let killed = 0;
  if (pids.length > 0) {
    try {
      killed = shared.killByPidsImmediate(pids);
    } catch (e) {
      log('warn', 'managed-spawn removeForProject: kill batch failed for ' + projectName + ': ' + e.message);
    }
  }

  let unlinked = 0;
  logFiles.forEach((logFile) => {
    [logFile, logFile + '.1'].forEach((target) => {
      try {
        fs.unlinkSync(target);
        unlinked++;
      } catch (e) {
        // A file that is already gone is expected; anything else is logged.
        if (e && e.code !== 'ENOENT') log('warn', 'managed-spawn removeForProject: unlink ' + target + ' failed: ' + e.message);
      }
    });
  });

  return { killed: killed, unlinked: unlinked, scanned: pids.length };
}
1293
+
1294
+ module.exports = {
1295
+ MANAGED_SPAWN_FILENAME: MANAGED_SPAWN_FILENAME,
1296
+ MANAGED_PROCESSES_STATE_FILE: MANAGED_PROCESSES_STATE_FILE,
1297
+ MANAGED_LOGS_DIR: MANAGED_LOGS_DIR,
1298
+ INVALID_WORKDIR_REASON_PREFIX: INVALID_WORKDIR_REASON_PREFIX,
1299
+ HEALTHCHECK_TYPES: HEALTHCHECK_TYPES,
1300
+ validateManagedSpawnRecord: validateManagedSpawnRecord,
1301
+ readManagedSpawnFile: readManagedSpawnFile,
1302
+ evaluateManagedSpawnAcceptance: evaluateManagedSpawnAcceptance,
1303
+ buildManagedSpawnHint: buildManagedSpawnHint,
1304
+ // Item 2 (P-2d5e8f04): engine spawn + locked state file.
1305
+ openManagedLog: openManagedLog,
1306
+ spawnManagedSpec: spawnManagedSpec,
1307
+ recordManagedSpec: recordManagedSpec,
1308
+ recordManagedBatch: recordManagedBatch,
1309
+ removeManagedSpec: removeManagedSpec,
1310
+ listManagedSpecs: listManagedSpecs,
1311
+ getStatePath: _getStatePath,
1312
+ // Item 3 (P-9c1f47a6): healthcheck implementations + first-pass waiter.
1313
+ runHealthcheck: runHealthcheck,
1314
+ waitForFirstHealth: waitForFirstHealth,
1315
+ tailManagedLog: tailManagedLog,
1316
+ // Item 6 (P-1f9c3a45): playbook auto-inject of live managed processes.
1317
+ buildLiveManagedProcessesBlock: buildLiveManagedProcessesBlock,
1318
+ // Item 4 (P-4b8d2e57): discovery API (etag + state-driven respawn).
1319
+ computeManagedSpecsEtag: computeManagedSpecsEtag,
1320
+ restartManagedSpec: restartManagedSpec,
1321
+ // Item 7 (P-8a4d6f29): TTL sweep + boot reconcile + project cleanup.
1322
+ sweepManagedSpawn: sweepManagedSpawn,
1323
+ bootReconcileManagedSpawn: bootReconcileManagedSpawn,
1324
+ removeManagedSpecsForProject: removeManagedSpecsForProject,
1325
+ };