@worca/ui 0.22.0 → 0.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1149 @@
1
+ /**
2
+ * Fleet REST endpoints (W-040 §13.6).
3
+ *
4
+ * Manifests live at ~/.worca/fleet-runs/<fleet_id>.json.
5
+ * Uploaded guide files land under ~/.worca/fleet-runs/<fleet_id>/guides/.
6
+ */
7
+
8
+ import { execFileSync } from 'node:child_process';
9
+ import {
10
+ existsSync,
11
+ mkdirSync,
12
+ readdirSync,
13
+ readFileSync,
14
+ renameSync,
15
+ unlinkSync,
16
+ writeFileSync,
17
+ } from 'node:fs';
18
+ import { basename, join } from 'node:path';
19
+ import { Router } from 'express';
20
+ import { fleetRunsDir as resolveFleetRunsDir } from './paths.js';
21
+
22
// Default ceiling, in bytes, for the combined size of uploaded guide files (64 KiB).
const GUIDE_CAP_BYTES_DEFAULT = 1024 * 64;
23
+
24
// A fleet ID is `f_`, twelve digits, `_`, then 1-32 lowercase hex characters.
// The pattern is anchored at both ends, so an accepted ID can never contain a
// path separator or a dot — it is safe to embed in a file name.
const FLEET_ID_RE = /^f_\d{12}_[0-9a-f]{1,32}$/;

// ─── helpers ───────────────────────────────────────────────────────────────

/**
 * Report whether `id` is a well-formed fleet ID.
 *
 * @param {unknown} id candidate value (typically a route parameter)
 * @returns {boolean} true only for strings matching FLEET_ID_RE
 */
function validateFleetId(id) {
  if (typeof id !== 'string') return false;
  return FLEET_ID_RE.test(id);
}
32
+
33
/**
 * Build the path of a fleet's manifest file: `<fleetRunsDir>/<fleetId>.json`.
 *
 * @param {string} fleetRunsDir directory holding the manifests
 * @param {string} fleetId fleet identifier used as the file stem
 * @returns {string} joined manifest path
 */
function manifestFilePath(fleetRunsDir, fleetId) {
  const fileName = `${fleetId}.json`;
  return join(fleetRunsDir, fileName);
}
36
+
37
/**
 * Load and parse a fleet manifest.
 *
 * @param {string} fleetRunsDir directory holding the manifests
 * @param {string} fleetId fleet identifier
 * @returns {any|null} the parsed JSON, or null when the file is missing,
 *   unreadable, or not valid JSON
 */
function readManifest(fleetRunsDir, fleetId) {
  const manifestPath = manifestFilePath(fleetRunsDir, fleetId);
  if (existsSync(manifestPath)) {
    try {
      const text = readFileSync(manifestPath, 'utf8');
      return JSON.parse(text);
    } catch {
      // Unreadable or malformed manifest: reported the same as a missing one.
    }
  }
  return null;
}
46
+
47
/**
 * Persist a manifest as pretty-printed JSON and return the path written.
 *
 * The write is atomic (temp file, then rename), mirroring
 * write_fleet_manifest() in src/worca/orchestrator/fleet_manifest.py, so a
 * concurrent reader (WS watcher, listManifests) never observes a half-written
 * file and swallows a parse error. The temp name does not end in `.json`, so
 * it stays outside the watcher's `.json` filename filter.
 *
 * @param {string} fleetRunsDir directory holding the manifests (created if absent)
 * @param {{ fleet_id: string }} manifest manifest to serialise
 * @returns {string} path of the manifest file
 * @throws whatever the serialise/write/rename step threw, after a best-effort
 *   removal of the temp file
 */
function saveManifest(fleetRunsDir, manifest) {
  mkdirSync(fleetRunsDir, { recursive: true });
  const target = manifestFilePath(fleetRunsDir, manifest.fleet_id);
  const scratch = [target, 'tmp', process.pid, Date.now()].join('.');
  try {
    const serialized = JSON.stringify(manifest, null, 2);
    writeFileSync(scratch, `${serialized}\n`, 'utf8');
    renameSync(scratch, target);
    return target;
  } catch (err) {
    try {
      unlinkSync(scratch);
    } catch {
      /* best-effort temp cleanup */
    }
    throw err;
  }
}
69
+
70
/**
 * Read every `*.json` file in `fleetRunsDir` and return those that parse to a
 * value with a truthy `fleet_id`. Malformed or unreadable files are skipped.
 *
 * @param {string} fleetRunsDir directory holding the manifests
 * @returns {object[]} parsed manifests, in directory-listing order; empty when
 *   the directory does not exist
 */
function listManifests(fleetRunsDir) {
  if (!existsSync(fleetRunsDir)) return [];
  return readdirSync(fleetRunsDir)
    .filter((name) => name.endsWith('.json'))
    .flatMap((name) => {
      try {
        const parsed = JSON.parse(
          readFileSync(join(fleetRunsDir, name), 'utf8'),
        );
        return parsed?.fleet_id ? [parsed] : [];
      } catch {
        // skip malformed
        return [];
      }
    });
}
84
+
85
/**
 * Load a child's pipeline-registry entry from
 * `<project_path>/.worca/multi/pipelines.d/<run_id>.json`.
 *
 * @param {{ project_path?: string, run_id?: string }} child
 * @returns {any|null} the parsed entry, or null when the child lacks a
 *   project path or run id, the file is missing, or it cannot be parsed
 */
function _readChildRegistry(child) {
  const projectPath = child.project_path;
  const runId = child.run_id;
  if (!(projectPath && runId)) return null;

  const registryPath = join(
    projectPath,
    '.worca',
    'multi',
    'pipelines.d',
    `${runId}.json`,
  );
  if (!existsSync(registryPath)) return null;

  try {
    const text = readFileSync(registryPath, 'utf8');
    return JSON.parse(text);
  } catch {
    return null;
  }
}
102
+
103
/**
 * Live status of a child, taken from its registry entry.
 *
 * @param {{ project_path?: string, run_id?: string }} child
 * @returns {string} the registry entry's `status`, or 'running' when there is
 *   no readable entry or the entry carries no status
 */
function resolveChildStatus(child) {
  const registry = _readChildRegistry(child);
  const liveStatus = registry?.status;
  return liveStatus ?? 'running';
}
106
+
107
/**
 * Reverse lookup: walk every project registered under
 * `<prefsDir>/projects.d/*.json` and collect the runs in that project's
 * `.worca/multi/pipelines.d/` whose entry references `fleetId`.
 *
 * Serves as a self-healing fallback when the manifest's children array was
 * never populated (older dispatcher path) or when a child wrote its registry
 * entry before the manifest update landed.
 *
 * Missing directories, unreadable directories, and malformed JSON files are
 * skipped silently.
 *
 * @param {string} fleetId fleet to look for
 * @param {string|null} prefsDir preferences directory containing projects.d/
 * @returns {{ project_path: string, run_id: string }[]} discovered children
 */
function _discoverChildrenFromRegistry(fleetId, prefsDir) {
  if (!(fleetId && prefsDir)) return [];

  // Names of the *.json files in `dir`; null when it is missing or unreadable.
  const jsonFilesIn = (dir) => {
    if (!existsSync(dir)) return null;
    try {
      return readdirSync(dir).filter((name) => name.endsWith('.json'));
    } catch {
      return null;
    }
  };

  // Parsed contents of a JSON file; undefined when it cannot be read or parsed.
  const parseJsonFile = (path) => {
    try {
      return JSON.parse(readFileSync(path, 'utf8'));
    } catch {
      return undefined;
    }
  };

  const projectsDir = join(prefsDir, 'projects.d');
  const found = [];

  for (const projectFile of jsonFilesIn(projectsDir) ?? []) {
    const project = parseJsonFile(join(projectsDir, projectFile));
    if (!project?.path) continue;

    const pipelinesDir = join(project.path, '.worca', 'multi', 'pipelines.d');
    for (const runFile of jsonFilesIn(pipelinesDir) ?? []) {
      const entry = parseJsonFile(join(pipelinesDir, runFile));
      if (entry?.fleet_id !== fleetId || !entry?.run_id) continue;
      found.push({ project_path: project.path, run_id: entry.run_id });
    }
  }

  return found;
}
158
+
159
/**
 * Merge the manifest's children with the children discovered by reverse
 * lookup, dropping duplicates. Two records are the same child when they share
 * both `project_path` and `run_id`; the first occurrence wins, so manifest
 * records take precedence over discovered ones.
 *
 * Both inputs are treated defensively because they originate from JSON files
 * on disk: a list that is not an array contributes nothing, and entries that
 * are not objects (e.g. `null`) are skipped instead of throwing here or in the
 * callers that read properties off each child.
 *
 * @param {object[]|null|undefined} manifestChildren children listed in the manifest
 * @param {object[]|null|undefined} discoveredChildren children found via registry scan
 * @returns {object[]} de-duplicated children, manifest entries first
 */
function _mergeChildren(manifestChildren, discoveredChildren) {
  const seen = new Set();
  const merged = [];

  for (const list of [manifestChildren, discoveredChildren]) {
    if (!Array.isArray(list)) continue;
    for (const child of list) {
      if (child === null || typeof child !== 'object') continue;
      // NUL cannot appear in a path, so it is a collision-free separator.
      const key = `${child.project_path ?? ''}\0${child.run_id ?? ''}`;
      if (seen.has(key)) continue;
      seen.add(key);
      merged.push(child);
    }
  }

  return merged;
}
177
+
178
/**
 * Full child list for a fleet: the manifest's own `children` merged with any
 * runs discovered through the project registries under `prefsDir`.
 *
 * @param {{ fleet_id: string, children?: object[] }} manifest
 * @param {string|null} prefsDir preferences directory used for reverse lookup
 * @returns {object[]} merged, de-duplicated children
 */
function _resolveChildren(manifest, prefsDir) {
  const { fleet_id: fleetId, children: listed } = manifest;
  return _mergeChildren(
    listed,
    _discoverChildrenFromRegistry(fleetId, prefsDir),
  );
}
182
+
183
/**
 * Resolve a fleet's children and overlay each with its live status.
 *
 * @param {{ fleet_id: string, children?: object[] }} manifest
 * @param {string|null} prefsDir preferences directory used for reverse lookup
 * @returns {object[]} copies of the child records with `status` replaced by
 *   the value from resolveChildStatus()
 */
function enrichChildren(manifest, prefsDir) {
  const enriched = [];
  for (const child of _resolveChildren(manifest, prefsDir)) {
    enriched.push({ ...child, status: resolveChildStatus(child) });
  }
  return enriched;
}
189
+
190
/**
 * Aggregate fleet-level metrics from the manifest plus each child's live
 * pipeline-registry entry.
 *
 * - `children`: compact `{ project_path, run_id, status }` records. `status`
 *   prefers the registry entry's status, then the manifest record's status,
 *   then 'pending'.
 * - `cost_usd`: sum of `cost_usd` over every iteration of every stage of every
 *   child that has a registry entry. Registry files are external JSON, so
 *   values are coerced with Number() and non-finite results are ignored —
 *   a string cost must not turn the running total into a concatenated string.
 * - `last_activity_at`: the greatest of the manifest's `updated_at` and each
 *   child's `last_event_at ?? completed_at ?? started_at`. Timestamps are
 *   compared as strings.
 *
 * Each child's registry file is read once.
 *
 * @param {{ fleet_id: string, children?: object[], updated_at?: string }} manifest
 * @param {string|null} prefsDir preferences directory used for reverse lookup
 * @returns {{ children: object[], cost_usd: number, last_activity_at: string|null }}
 */
function aggregateFleetMetrics(manifest, prefsDir) {
  let cost_usd = 0;
  let last_activity_at = manifest.updated_at ?? null;

  const children = _resolveChildren(manifest, prefsDir).map((c) => {
    const reg = _readChildRegistry(c);

    if (reg) {
      for (const stage of Object.values(reg.stages ?? {})) {
        // Tolerate null stages and non-array `iterations` in a damaged entry.
        const iterations = Array.isArray(stage?.iterations)
          ? stage.iterations
          : [];
        for (const iter of iterations) {
          const cost = Number(iter?.cost_usd ?? 0);
          if (Number.isFinite(cost)) cost_usd += cost;
        }
      }

      const ts =
        reg.last_event_at ?? reg.completed_at ?? reg.started_at ?? null;
      if (ts && (!last_activity_at || ts > last_activity_at)) {
        last_activity_at = ts;
      }
    }

    return {
      project_path: c.project_path,
      run_id: c.run_id ?? null,
      status: reg?.status ?? c.status ?? 'pending',
    };
  });

  return { children, cost_usd, last_activity_at };
}
233
+
234
+ // ─── fleet status derivation ───────────────────────────────────────────────
235
+ //
236
+ // JS port of derive_fleet_status / poll_and_update_fleet_manifest from
237
+ // src/worca/orchestrator/fleet_manifest.py. The Python poller is only wired
238
+ // into tests — in production nothing re-derives a fleet's status after
239
+ // run_fleet.py exits (it launches detached children and returns within
240
+ // seconds, long before any child finishes). Without this server-side
241
+ // reconcile the manifest's stored `status` is frozen at "running" forever.
242
+
243
// Child statuses that count as still in flight.
const _RUNNING_STATES = new Set(['running', 'resuming', 'paused']);
// Child statuses that count toward the circuit-breaker failure ratio.
const _FAILURE_STATES = new Set(['failed', 'setup_failed', 'unrecoverable']);
// Every status a child cannot leave on its own. `interrupted` and `cancelled`
// are terminal but deliberately NOT failures: a deliberate stop must not
// inflate the circuit-breaker ratio.
const _TERMINAL_STATES = new Set([
  'completed',
  'interrupted',
  'cancelled',
  ..._FAILURE_STATES,
]);

/**
 * Pure derivation of a fleet's status from its children's statuses.
 * Mirrors derive_fleet_status() in fleet_manifest.py. Exported for tests.
 *
 * Rules, in order:
 *  - no children → running
 *  - some child in flight → halted/circuit_breaker when at least
 *    min(3, total) children are terminal, at least one failed, and
 *    failed / terminal >= threshold; otherwise running
 *  - every child terminal → completed if all completed, else failed
 *  - anything else (e.g. children in a status outside the sets above) → running
 *
 * @param {string[]} childStatuses
 * @param {number} threshold circuit-breaker failure ratio (default 0.30)
 * @returns {{ status: string, halt_reason: string|null }}
 */
export function deriveFleetStatus(childStatuses, threshold = 0.3) {
  const total = childStatuses.length;
  if (!total) return { status: 'running', halt_reason: null };

  const tally = { running: 0, completed: 0, failed: 0, terminal: 0 };
  for (const s of childStatuses) {
    if (_RUNNING_STATES.has(s)) tally.running += 1;
    if (s === 'completed') tally.completed += 1;
    if (_FAILURE_STATES.has(s)) tally.failed += 1;
    if (_TERMINAL_STATES.has(s)) tally.terminal += 1;
  }

  if (tally.running === 0) {
    // Pending / untracked children not yet dispatched.
    if (tally.terminal !== total) {
      return { status: 'running', halt_reason: null };
    }
    // All dispatched children are terminal.
    if (tally.completed === total) {
      return { status: 'completed', halt_reason: null };
    }
    return { status: 'failed', halt_reason: null };
  }

  // Circuit breaker fires only while in-flight children still exist.
  const breakerTripped =
    tally.terminal >= Math.min(3, total) &&
    tally.failed > 0 &&
    tally.failed / tally.terminal >= threshold;

  return breakerTripped
    ? { status: 'halted', halt_reason: 'circuit_breaker' }
    : { status: 'running', halt_reason: null };
}
300
+
301
/**
 * Pure decision: the { status, halt_reason } the API/WS should report for a
 * manifest given its children's live statuses. Nothing is persisted, so it is
 * safe in side-effect-free contexts (WS watcher).
 *
 * Only a stored status of `running` or `resuming` (or a missing status, which
 * is treated as `running`) is re-derived. Every other stored status is sticky
 * and returned as-is:
 *   - halted / paused — operator actions, held until an explicit resume
 *   - completed / failed — terminal; only resume / relaunch leaves them
 *
 * From `resuming` the status may only advance to `running`, never straight to
 * a terminal one: a just-resumed fleet's children may still carry their
 * pre-resume terminal registry state for a beat before the resumed runners
 * flip them back.
 *
 * @param {object} manifest
 * @param {string[]} childStatuses
 * @returns {{ status: string, halt_reason: string|null }}
 */
export function effectiveFleetStatus(manifest, childStatuses) {
  const stored = manifest.status ?? 'running';
  const storedReason = manifest.halt_reason ?? null;

  const reconciles = stored === 'running' || stored === 'resuming';
  if (!reconciles) {
    return { status: stored, halt_reason: storedReason };
  }

  const derived = deriveFleetStatus(
    childStatuses,
    manifest.fleet_failure_threshold ?? 0.3,
  );

  if (stored === 'resuming' && derived.status !== 'running') {
    return { status: 'resuming', halt_reason: storedReason };
  }
  return { status: derived.status, halt_reason: derived.halt_reason };
}
333
+
334
/**
 * effectiveFleetStatus() plus persistence: compute the effective status and,
 * when it (or the halt reason) differs from what the manifest stores, update
 * the manifest object in place, stamp `updated_at`, and write it back.
 *
 * The WS watcher should call effectiveFleetStatus() instead, to avoid a
 * write→watch→write loop.
 *
 * @param {object} manifest mutated in place when the status changed
 * @param {string[]} childStatuses live child statuses
 * @param {string} fleetRunsDir directory holding the manifests
 * @returns {{ status: string, halt_reason: string|null }} the effective values,
 *   returned even when persisting them failed
 */
function reconcileFleetStatus(manifest, childStatuses, fleetRunsDir) {
  const storedStatus = manifest.status ?? 'running';
  const storedReason = manifest.halt_reason ?? null;
  const { status, halt_reason } = effectiveFleetStatus(manifest, childStatuses);

  const unchanged = status === storedStatus && halt_reason === storedReason;
  if (unchanged) return { status, halt_reason };

  manifest.status = status;
  // A halted fleet keeps whatever reason is already stored when none was derived.
  if (halt_reason != null) manifest.halt_reason = halt_reason;
  else if (status !== 'halted') manifest.halt_reason = null;
  manifest.updated_at = new Date().toISOString();

  try {
    saveManifest(fleetRunsDir, manifest);
  } catch {
    // Best-effort persistence — the derived value is still returned, and
    // the next read re-derives it anyway.
  }
  return { status, halt_reason };
}
362
+
363
/**
 * Mint a new fleet ID of the form `f_<UTC YYYYMMDDHHmm>_<8 hex chars>`.
 *
 * @returns {{ fleet_id: string, fleet_id_short: string }} the full ID and its
 *   random hex suffix
 */
function generateFleetId() {
  const now = new Date();
  const pad2 = (value) => String(value).padStart(2, '0');
  const stamp =
    `${now.getUTCFullYear()}` +
    pad2(now.getUTCMonth() + 1) +
    pad2(now.getUTCDate()) +
    pad2(now.getUTCHours()) +
    pad2(now.getUTCMinutes());

  // Hex digits of a random fraction; left-padded when the expansion is short.
  const suffix = Math.random().toString(16).slice(2, 10).padStart(8, '0');

  return { fleet_id: `f_${stamp}_${suffix}`, fleet_id_short: suffix };
}
375
+
376
/**
 * Reduce an uploaded file name to a safe single path component.
 *
 * Steps: take the last path segment, drop any remaining slash or backslash,
 * and replace every character outside `A-Za-z0-9._-` with `_`.
 *
 * Falls back to 'guide' when the input is missing, empty, or not a string,
 * and when the result would be empty, `.` or `..`. The dot names are not file
 * names: joined onto the guides directory they resolve to a directory, so
 * writing to them fails.
 *
 * @param {unknown} raw client-supplied file name
 * @returns {string} a non-empty name containing only `A-Za-z0-9._-`
 */
function sanitizeFilename(raw) {
  const input = typeof raw === 'string' && raw !== '' ? raw : 'guide';
  const name = basename(input)
    .replace(/[/\\]/g, '')
    .replace(/[^A-Za-z0-9._-]/g, '_');
  if (name === '' || name === '.' || name === '..') return 'guide';
  return name;
}
382
+
383
+ // ─── multipart parser ──────────────────────────────────────────────────────
384
+
385
/**
 * Collect a request stream into a single Buffer.
 *
 * @param {import('node:stream').Readable} req stream emitting 'data' chunks
 * @returns {Promise<Buffer>} resolves with the concatenated chunks on 'end';
 *   rejects with the error emitted on 'error'
 */
function readRawBody(req) {
  const received = [];
  return new Promise((resolve, reject) => {
    req.on('error', reject);
    req.on('data', (chunk) => {
      received.push(chunk);
    });
    req.on('end', () => {
      resolve(Buffer.concat(received));
    });
  });
}
393
+
394
/**
 * Split a multipart/form-data body into its parts.
 *
 * The boundary is taken from the `boundary=` parameter of `contentType`
 * (surrounding quotes removed). Parts are the byte ranges between
 * `--<boundary>\r\n` and each following `\r\n--<boundary>`; parsing stops at
 * the closing `--<boundary>--` or when no further delimiter is found. A part
 * without a blank line separating headers from content is dropped.
 *
 * @param {Buffer} body raw request body
 * @param {string} contentType value of the Content-Type header
 * @returns {{ name: string|null, filename: string|null, content: Buffer }[]|null}
 *   the parts (possibly empty), or null when `contentType` has no boundary
 */
function parseMultipart(body, contentType) {
  const boundaryMatch = /boundary=([^\s;,]+)/.exec(contentType);
  if (boundaryMatch === null) return null;

  const boundary = boundaryMatch[1].replace(/^["']|["']$/g, '');
  const opener = `--${boundary}\r\n`;
  const separator = Buffer.from(`\r\n--${boundary}`);
  const parsed = [];

  // Locate the opening delimiter
  const start = body.indexOf(opener);
  if (start === -1) return parsed;

  let cursor = start + opener.length;
  while (cursor < body.length) {
    const sepAt = body.indexOf(separator, cursor);
    if (sepAt === -1) break;

    const raw = body.slice(cursor, sepAt);
    const split = raw.indexOf('\r\n\r\n');
    if (split !== -1) {
      // Header names are lower-cased so the lookup below is case-insensitive.
      const headers = {};
      raw
        .slice(0, split)
        .toString('utf8')
        .split('\r\n')
        .forEach((line) => {
          const colon = line.indexOf(':');
          if (colon === -1) return;
          const key = line.slice(0, colon).toLowerCase().trim();
          headers[key] = line.slice(colon + 1).trim();
        });

      const disposition = headers['content-disposition'] ?? '';
      const nameMatch = /\bname="([^"]+)"/.exec(disposition);
      const fileMatch = /\bfilename="([^"]+)"/.exec(disposition);

      parsed.push({
        name: nameMatch?.[1] ?? null,
        filename: fileMatch?.[1] ?? null,
        content: raw.slice(split + 4),
      });
    }

    // Step over the delimiter; `--` right after it marks the final boundary,
    // otherwise skip the two bytes (CRLF) that end the delimiter line.
    cursor = sepAt + separator.length;
    if (body.slice(cursor, cursor + 2).toString() === '--') break;
    cursor += 2;
  }

  return parsed;
}
451
+
452
+ // ─── default injectable implementations ────────────────────────────────────
453
+
454
/**
 * Default base-branch check: does the local branch `branch` exist in the git
 * repository at `project`?
 *
 * Uses `git show-ref --verify --quiet refs/heads/<branch>`, which exits 0 only
 * for an exact ref match. `git branch --list <branch>` is unsuitable here
 * because it treats its argument as a glob pattern (so `*` would "exist" in
 * any repository with a branch) and parses a leading `-` as an option;
 * prefixing `refs/heads/` rules out both.
 *
 * @param {string} project path of the repository working tree
 * @param {string} branch local branch name to look for
 * @returns {boolean} true when the branch exists; false when it does not, when
 *   `branch` is empty or not a string, or when git cannot be run
 */
function defaultValidateBaseBranch(project, branch) {
  if (typeof branch !== 'string' || branch === '') return false;
  try {
    // A non-zero exit (ref missing, not a repository) throws and lands below.
    execFileSync(
      'git',
      ['-C', project, 'show-ref', '--verify', '--quiet', `refs/heads/${branch}`],
      { encoding: 'utf8' },
    );
    return true;
  } catch {
    return false;
  }
}
466
+
467
/**
 * Default cleanup: run `worca cleanup --fleet-id <fleetId> --all` synchronously.
 *
 * @param {string} fleetId fleet whose resources should be cleaned
 * @returns {{}} always an empty object
 * @throws when the `worca` command cannot be run or exits non-zero
 */
function defaultRunCleanup(fleetId) {
  const args = ['cleanup', '--fleet-id', fleetId, '--all'];
  execFileSync('worca', args);
  return {};
}
471
+
472
/**
 * Run `python3 -m worca.scripts.run_fleet <flag> <fleetId>` and return the
 * first integer found in its output (0 when there is none).
 *
 * run_fleet.py --pause / --stop fan a control file out to every in-flight
 * child and stamp the manifest (paused / halted+stopped). They print
 * "paused N in-flight…" / "stopped N…"; N is parsed back out for the response.
 *
 * fleetId is validated against FLEET_ID_RE before reaching here, and
 * execFileSync spawns without a shell, so there is no injection surface.
 *
 * @param {string} flag lifecycle flag, e.g. '--pause' or '--stop'
 * @param {string} fleetId fleet to act on
 * @returns {number} the count reported by the script, or 0
 * @throws when the command cannot be run or exits non-zero
 */
function _runFleetLifecycle(flag, fleetId) {
  const argv = ['-m', 'worca.scripts.run_fleet', flag, fleetId];
  const stdout = execFileSync('python3', argv, { encoding: 'utf8' });
  const firstNumber = /(\d+)/.exec(stdout || '');
  if (firstNumber === null) return 0;
  return Number(firstNumber[1]);
}
486
+
487
/**
 * Default pause action: invoke the fleet runner with `--pause`.
 *
 * @param {string} fleetId fleet to pause
 * @returns {{ paused_count: number }} count parsed from the runner's output
 */
function defaultPauseFleet(fleetId) {
  const paused_count = _runFleetLifecycle('--pause', fleetId);
  return { paused_count };
}
490
+
491
/**
 * Default stop action: invoke the fleet runner with `--stop`.
 *
 * @param {string} fleetId fleet to stop
 * @returns {{ stopped_count: number }} count parsed from the runner's output
 */
function defaultStopFleet(fleetId) {
  const stopped_count = _runFleetLifecycle('--stop', fleetId);
  return { stopped_count };
}
494
+
495
+ // ─── router factory ────────────────────────────────────────────────────────
496
+
497
+ /**
498
+ * @param {{
499
+ * fleetRunsDir?: string,
500
+ * prefsDir?: string,
501
+ * dispatchFleet?: (args: object) => Promise<object>,
502
+ * runCleanup?: (fleetId: string) => Promise<object>,
503
+ * pauseFleet?: (fleetId: string) => object,
504
+ * stopFleet?: (fleetId: string) => object,
505
+ * validateBaseBranch?: (project: string, branch: string) => Promise<boolean>,
506
+ * guideCapBytes?: number,
507
+ * }} opts
508
+ *
509
+ * `prefsDir` enables the reverse-lookup path: when a fleet manifest's
510
+ * `children` array is empty (older dispatcher) or out-of-sync, the router
511
+ * scans every project under `<prefsDir>/projects.d/` for pipelines.d/
512
+ * entries that reference the fleet_id and includes them in the response.
513
+ */
514
+ export function createFleetRouter({
515
+ fleetRunsDir: fleetRunsDirArg,
516
+ prefsDir = null,
517
+ dispatchFleet = null,
518
+ runCleanup = defaultRunCleanup,
519
+ pauseFleet = defaultPauseFleet,
520
+ stopFleet = defaultStopFleet,
521
+ validateBaseBranch = defaultValidateBaseBranch,
522
+ guideCapBytes = GUIDE_CAP_BYTES_DEFAULT,
523
+ } = {}) {
524
+ // Lazy resolution honors $WORCA_HOME at router-construction time, falling
525
+ // back to ~/.worca/fleet-runs. Issue #162.
526
+ const fleetRunsDir = resolveFleetRunsDir(fleetRunsDirArg);
527
+ const router = Router();
528
+
529
+ // ── GET /api/fleet-runs ─────────────────────────────────────────────────
530
+ //
531
+ // Returns a list of fleet summaries. The payload includes a compact
532
+ // `children` array (one slim record per dispatched child) so the UI can
533
+ // render `fleetCardView` with the children strip without an extra
534
+ // round-trip per fleet. The full per-child registry entry is still
535
+ // fetched lazily via GET /api/fleet-runs/:id when the user drills in.
536
+ router.get('/', (_req, res) => {
537
+ try {
538
+ const fleets = listManifests(fleetRunsDir).map((m) => {
539
+ const agg = aggregateFleetMetrics(m, prefsDir);
540
+ // Reconcile the stored status against the children's live registry
541
+ // statuses — run_fleet.py never writes a terminal status back.
542
+ const { status, halt_reason } = reconcileFleetStatus(
543
+ m,
544
+ agg.children.map((c) => c.status),
545
+ fleetRunsDir,
546
+ );
547
+ return {
548
+ fleet_id: m.fleet_id,
549
+ fleet_id_short: m.fleet_id_short,
550
+ work_request: m.work_request,
551
+ status,
552
+ halt_reason,
553
+ halted_at: m.halted_at ?? null,
554
+ archived: m.archived === true,
555
+ archived_at: m.archived_at ?? null,
556
+ children_count: agg.children.length,
557
+ children: agg.children,
558
+ head_template: m.head_template ?? null,
559
+ base_branch: m.base_branch ?? null,
560
+ plan: m.plan ? { mode: m.plan.mode ?? 'none' } : { mode: 'none' },
561
+ created_at: m.created_at,
562
+ updated_at: m.updated_at ?? null,
563
+ last_activity_at: agg.last_activity_at,
564
+ cost_usd: agg.cost_usd,
565
+ guide: m.guide
566
+ ? {
567
+ bytes: m.guide.bytes,
568
+ filenames: m.guide.filenames,
569
+ uploaded: m.guide.uploaded,
570
+ }
571
+ : null,
572
+ };
573
+ });
574
+ res.json({ ok: true, fleets });
575
+ } catch (err) {
576
+ res.status(500).json({ ok: false, error: err.message });
577
+ }
578
+ });
579
+
580
+ // ── POST /api/fleet-runs/validate-base ──────────────────────────────────
581
+ router.post('/validate-base', async (req, res) => {
582
+ const { projects, base_branch } = req.body ?? {};
583
+ if (!base_branch || typeof base_branch !== 'string') {
584
+ return res
585
+ .status(400)
586
+ .json({ ok: false, error: 'base_branch is required' });
587
+ }
588
+ if (!Array.isArray(projects) || projects.length === 0) {
589
+ return res
590
+ .status(400)
591
+ .json({ ok: false, error: 'projects must be a non-empty array' });
592
+ }
593
+
594
+ try {
595
+ const missing_in = [];
596
+ for (const proj of projects) {
597
+ const exists = await validateBaseBranch(proj, base_branch);
598
+ if (!exists) missing_in.push(proj);
599
+ }
600
+ res.json({ ok: missing_in.length === 0, missing_in });
601
+ } catch (err) {
602
+ res.status(500).json({ ok: false, error: err.message });
603
+ }
604
+ });
605
+
606
+ // ── POST /api/fleet-runs ────────────────────────────────────────────────
607
+ router.post('/', async (req, res) => {
608
+ try {
609
+ const contentType = req.headers['content-type'] ?? '';
610
+ const isMultipart = contentType.includes('multipart/form-data');
611
+
612
+ let fields = {};
613
+ const guideFiles = []; // [{ filename: string, content: Buffer }]
614
+
615
+ if (isMultipart) {
616
+ const rawBody = await readRawBody(req);
617
+ const parts = parseMultipart(rawBody, contentType);
618
+ if (!parts) {
619
+ return res
620
+ .status(400)
621
+ .json({ ok: false, error: 'Failed to parse multipart body' });
622
+ }
623
+ for (const part of parts) {
624
+ if (part.filename != null) {
625
+ guideFiles.push({
626
+ filename: part.filename,
627
+ content: part.content,
628
+ });
629
+ } else if (part.name) {
630
+ fields[part.name] = part.content.toString('utf8');
631
+ }
632
+ }
633
+ if (typeof fields.projects === 'string') {
634
+ try {
635
+ fields.projects = JSON.parse(fields.projects);
636
+ } catch {
637
+ // leave as string
638
+ }
639
+ }
640
+ } else {
641
+ fields = req.body ?? {};
642
+ }
643
+
644
+ const {
645
+ projects = [],
646
+ prompt,
647
+ source,
648
+ head_template,
649
+ base_branch,
650
+ plan_mode,
651
+ max_parallel = 5,
652
+ fleet_failure_threshold = 0.3,
653
+ } = fields;
654
+
655
+ if (!prompt && !source) {
656
+ return res
657
+ .status(400)
658
+ .json({ ok: false, error: 'prompt or source is required' });
659
+ }
660
+
661
+ const { fleet_id, fleet_id_short } = generateFleetId();
662
+
663
+ // Save uploaded guide files.
664
+ // Validate total size BEFORE writing anything to disk — partial writes
665
+ // would leave orphan files under <fleet_id>/guides/ that no manifest
666
+ // points at (cleanup wouldn't find them).
667
+ let guideEntry = null;
668
+ if (guideFiles.length > 0) {
669
+ const totalBytes = guideFiles.reduce(
670
+ (sum, f) => sum + f.content.length,
671
+ 0,
672
+ );
673
+ if (totalBytes > guideCapBytes) {
674
+ return res.status(400).json({
675
+ ok: false,
676
+ error: `Guide files exceed size cap of ${guideCapBytes} bytes`,
677
+ guide_bytes: totalBytes,
678
+ cap_bytes: guideCapBytes,
679
+ });
680
+ }
681
+
682
+ // Resolve every filename to a unique sanitized name first — no I/O yet.
683
+ const usedNames = new Set();
684
+ const planned = guideFiles.map(({ filename, content }) => {
685
+ let safe = sanitizeFilename(filename);
686
+ if (usedNames.has(safe)) {
687
+ const dot = safe.lastIndexOf('.');
688
+ const nameBase = dot !== -1 ? safe.slice(0, dot) : safe;
689
+ const ext = dot !== -1 ? safe.slice(dot) : '';
690
+ let counter = 1;
691
+ while (usedNames.has(`${nameBase}-${counter}${ext}`)) counter++;
692
+ safe = `${nameBase}-${counter}${ext}`;
693
+ }
694
+ usedNames.add(safe);
695
+ return { safe, content };
696
+ });
697
+
698
+ // Now write — total size is validated, no orphan files possible.
699
+ const guidesDir = join(fleetRunsDir, fleet_id, 'guides');
700
+ mkdirSync(guidesDir, { recursive: true });
701
+ const savedPaths = [];
702
+ const savedFilenames = [];
703
+ for (const { safe, content } of planned) {
704
+ writeFileSync(join(guidesDir, safe), content);
705
+ savedPaths.push(join(guidesDir, safe));
706
+ savedFilenames.push(safe);
707
+ }
708
+
709
+ guideEntry = {
710
+ paths: savedPaths,
711
+ bytes: totalBytes,
712
+ filenames: savedFilenames,
713
+ uploaded: true,
714
+ };
715
+ }
716
+
717
+ const manifest = {
718
+ fleet_id,
719
+ fleet_id_short,
720
+ created_at: new Date().toISOString(),
721
+ work_request: {
722
+ title: (prompt || source || '').slice(0, 80),
723
+ description: prompt ?? '',
724
+ source: source ?? null,
725
+ },
726
+ guide: guideEntry,
727
+ plan: { mode: plan_mode ?? 'none', path: null },
728
+ head_template: head_template ?? 'migration/{slug}/{project}',
729
+ base_branch: base_branch ?? null,
730
+ max_parallel: Number(max_parallel) || 5,
731
+ fleet_failure_threshold: Number(fleet_failure_threshold) || 0.3,
732
+ status: 'running',
733
+ halt_reason: null,
734
+ children: [],
735
+ };
736
+
737
+ const manifest_path = saveManifest(fleetRunsDir, manifest);
738
+
739
+ if (dispatchFleet) {
740
+ try {
741
+ await dispatchFleet({
742
+ fleet_id,
743
+ manifest,
744
+ manifest_path,
745
+ projects,
746
+ });
747
+ } catch (err) {
748
+ manifest.status = 'failed';
749
+ saveManifest(fleetRunsDir, manifest);
750
+ return res.status(500).json({
751
+ ok: false,
752
+ error: `Fleet dispatch failed: ${err.message}`,
753
+ });
754
+ }
755
+ }
756
+
757
+ res.status(201).json({ ok: true, fleet_id, manifest_path });
758
+ } catch (err) {
759
+ res.status(500).json({ ok: false, error: err.message });
760
+ }
761
+ });
762
+
763
+ // ── GET /api/fleet-runs/:id ─────────────────────────────────────────────
764
+ router.get('/:id', (req, res) => {
765
+ const { id } = req.params;
766
+ if (!validateFleetId(id)) {
767
+ return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
768
+ }
769
+ const manifest = readManifest(fleetRunsDir, id);
770
+ if (!manifest) {
771
+ return res
772
+ .status(404)
773
+ .json({ ok: false, error: `Fleet "${id}" not found` });
774
+ }
775
+ const children = enrichChildren(manifest, prefsDir);
776
+ // Reconcile the stored status against the children's live registry
777
+ // statuses — run_fleet.py never writes a terminal status back.
778
+ const { status, halt_reason } = reconcileFleetStatus(
779
+ manifest,
780
+ children.map((c) => c.status),
781
+ fleetRunsDir,
782
+ );
783
+ res.json({
784
+ ok: true,
785
+ fleet: { ...manifest, status, halt_reason, children },
786
+ });
787
+ });
788
+
789
+ // ── POST /api/fleet-runs/:id/archive ────────────────────────────────────
790
+ // Hides a terminal fleet from the default list. Mirrors the pipeline
791
+ // run archive endpoint: refuses to archive an in-flight fleet, idempotent
792
+ // when already archived, stamps `archived` + `archived_at` on the manifest.
793
+ router.post('/:id/archive', (req, res) => {
794
+ const { id } = req.params;
795
+ if (!validateFleetId(id)) {
796
+ return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
797
+ }
798
+ const manifest = readManifest(fleetRunsDir, id);
799
+ if (!manifest) {
800
+ return res
801
+ .status(404)
802
+ .json({ ok: false, error: `Fleet "${id}" not found` });
803
+ }
804
+ if (manifest.status === 'running' || manifest.status === 'resuming') {
805
+ return res
806
+ .status(409)
807
+ .json({ ok: false, error: 'Cannot archive an in-flight fleet' });
808
+ }
809
+ if (manifest.archived === true) {
810
+ return res.json({ ok: true, archived_at: manifest.archived_at ?? null });
811
+ }
812
+ manifest.archived = true;
813
+ manifest.archived_at = new Date().toISOString();
814
+ saveManifest(fleetRunsDir, manifest);
815
+ res.json({ ok: true, archived_at: manifest.archived_at });
816
+ });
817
+
818
+ // ── POST /api/fleet-runs/:id/unarchive ──────────────────────────────────
819
+ router.post('/:id/unarchive', (req, res) => {
820
+ const { id } = req.params;
821
+ if (!validateFleetId(id)) {
822
+ return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
823
+ }
824
+ const manifest = readManifest(fleetRunsDir, id);
825
+ if (!manifest) {
826
+ return res
827
+ .status(404)
828
+ .json({ ok: false, error: `Fleet "${id}" not found` });
829
+ }
830
+ if (manifest.archived !== true) {
831
+ return res.json({ ok: true });
832
+ }
833
+ delete manifest.archived;
834
+ delete manifest.archived_at;
835
+ saveManifest(fleetRunsDir, manifest);
836
+ res.json({ ok: true });
837
+ });
838
+
839
+ // ── DELETE /api/fleet-runs/:id ──────────────────────────────────────────
840
+ router.delete('/:id', async (req, res) => {
841
+ const { id } = req.params;
842
+ if (!validateFleetId(id)) {
843
+ return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
844
+ }
845
+
846
+ const manifest = readManifest(fleetRunsDir, id);
847
+ if (!manifest) {
848
+ return res
849
+ .status(404)
850
+ .json({ ok: false, error: `Fleet "${id}" not found` });
851
+ }
852
+
853
+ const { cleanup, force } = req.query;
854
+ const currentStatus = manifest.status;
855
+ // "Already stopped" — a non-in-flight state that is resumable, so cleanup
856
+ // forfeits resume and a plain DELETE (Halt) is a no-op. `paused` joins
857
+ // halted/failed here now that fleets can be paused.
858
+ const alreadyHalted =
859
+ currentStatus === 'halted' ||
860
+ currentStatus === 'failed' ||
861
+ currentStatus === 'paused';
862
+
863
+ // Resume-loss gate (412) applies ONLY to the cleanup path. Plain DELETE
864
+ // on a running fleet halts unstarted children; plain DELETE on an
865
+ // already-stopped fleet is an idempotent no-op (no worktree deletion,
866
+ // no resume-loss to warn about). See W-040 §13.6.
867
+ if (cleanup === '1' && alreadyHalted && force !== '1') {
868
+ return res.status(412).json({
869
+ ok: false,
870
+ error:
871
+ 'Fleet is in a resumable state. Pass ?force=1 to confirm cleanup will block future --resume attempts.',
872
+ current_status: currentStatus,
873
+ });
874
+ }
875
+
876
+ // Plain DELETE on an already-stopped fleet: no-op (200).
877
+ if (cleanup !== '1' && alreadyHalted) {
878
+ return res.json({
879
+ ok: true,
880
+ halted_count: 0,
881
+ already_halted: true,
882
+ });
883
+ }
884
+
885
+ const enriched = enrichChildren(manifest, prefsDir);
886
+ const halted_count = enriched.filter((c) => c.status === 'pending').length;
887
+
888
+ manifest.status = 'halted';
889
+ manifest.halt_reason = 'user';
890
+ manifest.halted_at = new Date().toISOString();
891
+ saveManifest(fleetRunsDir, manifest);
892
+
893
+ if (cleanup === '1') {
894
+ let cleanResult = {};
895
+ try {
896
+ cleanResult = (await runCleanup(id)) ?? {};
897
+ } catch (err) {
898
+ return res
899
+ .status(500)
900
+ .json({ ok: false, error: `Cleanup failed: ${err.message}` });
901
+ }
902
+ return res.json({ ok: true, halted_count, ...cleanResult });
903
+ }
904
+
905
+ res.json({ ok: true, halted_count });
906
+ });
907
+
908
// ── POST /api/fleet-runs/:id/resume ────────────────────────────────────
// Re-dispatch a stopped fleet with `resume: true`.
//
// Responses:
//   400  `:id` does not pass validateFleetId
//   404  readManifest returned no manifest
//   409  fleet is already in flight (`running` or `resuming`)
//   410  a launched child has lost its registry entry
//   500  dispatchFleet threw (manifest is left as it was)
//   200  { ok: true, relaunched_count }
router.post('/:id/resume', async (req, res) => {
  const { id } = req.params;
  if (!validateFleetId(id)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const manifest = readManifest(fleetRunsDir, id);
  if (!manifest) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${id}" not found` });
  }

  // `resuming` counts as in flight here, matching the pause and stop routes.
  // Rejecting only `running` would let a second resume request dispatch the
  // same fleet again (presumably while an earlier resume is still underway).
  if (manifest.status === 'running' || manifest.status === 'resuming') {
    return res.status(409).json({
      ok: false,
      error: 'Fleet is already running',
      current_status: manifest.status,
    });
  }

  // 410 when any launched child (has run_id) is missing its registry entry
  // at <project_path>/.worca/multi/pipelines.d/<run_id>.json. Children
  // without a run_id are skipped: there is no registry file to look for.
  const children = manifest.children ?? [];
  const cleanedChildren = children.filter((child) => {
    if (!child.run_id) return false;
    const reg = join(
      child.project_path,
      '.worca',
      'multi',
      'pipelines.d',
      `${child.run_id}.json`,
    );
    return !existsSync(reg);
  });

  if (cleanedChildren.length > 0) {
    return res.status(410).json({
      ok: false,
      error:
        'Resume is unavailable — one or more child worktrees have been cleaned.',
      cleaned_run_ids: cleanedChildren.map((c) => c.run_id),
    });
  }

  // Only flip the manifest to `running` AFTER dispatch succeeds, so a
  // throwing dispatcher cannot leave the manifest stuck at `running` with
  // zero live children. The previous status (halted/failed/paused) is
  // preserved on failure, leaving the user free to retry resume.
  // When no dispatcher is configured the count stays 0 and the manifest is
  // still flipped below.
  let relaunched_count = 0;
  if (dispatchFleet) {
    try {
      const result = await dispatchFleet({
        fleet_id: id,
        manifest,
        resume: true,
      });
      relaunched_count = result?.relaunched_count ?? 0;
    } catch (err) {
      return res
        .status(500)
        .json({ ok: false, error: `Resume failed: ${err.message}` });
    }
  }

  manifest.status = 'running';
  manifest.halt_reason = null;
  saveManifest(fleetRunsDir, manifest);

  res.json({ ok: true, relaunched_count });
});
977
+
978
// ── POST /api/fleet-runs/:id/pause ──────────────────────────────────────
// Pause an in-flight fleet. The route only validates the request and the
// current manifest status; the pause itself is delegated to pauseFleet(id),
// which fans a `pause` control file out to every in-flight child (each exits
// cleanly at its next iteration) and stamps the manifest status="paused".
// The paused state is sticky until an explicit resume.
//
// Responses:
//   400  `:id` does not pass validateFleetId
//   404  readManifest returned no manifest
//   409  fleet is not `running`/`resuming` — nothing to pause
//   500  pauseFleet threw
//   200  { ok: true, ...whatever pauseFleet returned }
router.post('/:id/pause', (req, res) => {
  const fleetId = req.params.id;

  if (!validateFleetId(fleetId)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const manifest = readManifest(fleetRunsDir, fleetId);
  if (!manifest) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${fleetId}" not found` });
  }

  // Only an in-flight fleet can be paused.
  const { status } = manifest;
  const inFlight = status === 'running' || status === 'resuming';
  if (!inFlight) {
    return res.status(409).json({
      ok: false,
      error: `Cannot pause a fleet in "${status}" state`,
      current_status: status,
    });
  }

  try {
    // A null/undefined return from pauseFleet contributes no extra fields.
    const extra = pauseFleet(fleetId) ?? {};
    return res.json({ ok: true, ...extra });
  } catch (err) {
    return res
      .status(500)
      .json({ ok: false, error: `Pause failed: ${err.message}` });
  }
});
1010
+
1011
// ── POST /api/fleet-runs/:id/stop ───────────────────────────────────────
// Stop an in-flight fleet. The route only validates the request and the
// current manifest status; the stop itself is delegated to stopFleet(id),
// which fans a `stop` control file out to every in-flight child AND SIGTERMs
// each child process, then stamps the manifest status="halted" with
// halt_reason="stopped" (distinct from a plain Halt, where in-flight
// children finish naturally). The stopped state is sticky until resume.
//
// Responses:
//   400  `:id` does not pass validateFleetId
//   404  readManifest returned no manifest
//   409  fleet is not `running`/`resuming` — nothing to stop
//   500  stopFleet threw
//   200  { ok: true, ...whatever stopFleet returned }
router.post('/:id/stop', (req, res) => {
  const fleetId = req.params.id;

  if (!validateFleetId(fleetId)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const manifest = readManifest(fleetRunsDir, fleetId);
  if (!manifest) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${fleetId}" not found` });
  }

  // Only an in-flight fleet can be stopped.
  const { status } = manifest;
  const inFlight = status === 'running' || status === 'resuming';
  if (!inFlight) {
    return res.status(409).json({
      ok: false,
      error: `Cannot stop a fleet in "${status}" state`,
      current_status: status,
    });
  }

  try {
    // A null/undefined return from stopFleet contributes no extra fields.
    const extra = stopFleet(fleetId) ?? {};
    return res.json({ ok: true, ...extra });
  } catch (err) {
    return res
      .status(500)
      .json({ ok: false, error: `Stop failed: ${err.message}` });
  }
});
1043
+
1044
// ── POST /api/fleet-runs/:id/relaunch ───────────────────────────────────
// Copy an existing fleet's manifest under a freshly generated fleet ID,
// save it with status "running" and no children, then dispatch it. The
// source manifest is only read, never written.
//
// Optional JSON body overrides:
//   prompt         string; replaces work_request.description and, cut to
//                  80 characters, work_request.title. Falsy values are
//                  ignored; any other non-string is rejected with 400.
//   head_template  applied when it is neither null nor undefined
//   base_branch    applied whenever it is not undefined (explicit null kept)
//
// Responses:
//   400  invalid `:id`, or `prompt` is a truthy non-string
//   404  readManifest returned no manifest for the source fleet
//   500  dispatchFleet threw; the new manifest is re-saved as "failed"
//   200  { ok: true, new_fleet_id, manifest_path }
router.post('/:id/relaunch', async (req, res) => {
  const { id } = req.params;
  if (!validateFleetId(id)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const manifest = readManifest(fleetRunsDir, id);
  if (!manifest) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${id}" not found` });
  }

  const overrides = req.body ?? {};
  const { prompt } = overrides;

  // `prompt` comes straight from the request body, so it can be any JSON
  // value. It is used as a string below (`.slice(0, 80)`): a number or
  // object would throw inside this async handler, and an array would
  // silently write arrays into description/title. Reject those up front,
  // before a new fleet ID is generated or anything is written to disk.
  if (prompt && typeof prompt !== 'string') {
    return res
      .status(400)
      .json({ ok: false, error: 'prompt must be a string' });
  }

  const { fleet_id: newId, fleet_id_short: newShort } = generateFleetId();

  const newManifest = {
    ...manifest,
    fleet_id: newId,
    fleet_id_short: newShort,
    created_at: new Date().toISOString(),
    status: 'running',
    halt_reason: null,
    children: [],
    work_request: {
      ...manifest.work_request,
      ...(prompt
        ? {
            description: prompt,
            title: prompt.slice(0, 80),
          }
        : {}),
    },
    ...(overrides.head_template != null
      ? { head_template: overrides.head_template }
      : {}),
    ...(overrides.base_branch !== undefined
      ? { base_branch: overrides.base_branch }
      : {}),
  };

  const manifest_path = saveManifest(fleetRunsDir, newManifest);

  if (dispatchFleet) {
    try {
      await dispatchFleet({
        fleet_id: newId,
        manifest: newManifest,
        manifest_path,
      });
    } catch (err) {
      // Record the failed launch so the new fleet is not left at "running".
      newManifest.status = 'failed';
      saveManifest(fleetRunsDir, newManifest);
      return res
        .status(500)
        .json({ ok: false, error: `Relaunch failed: ${err.message}` });
    }
  }

  res.json({ ok: true, new_fleet_id: newId, manifest_path });
});
1106
+
1107
// ── GET /api/fleet-runs/:id/guide ───────────────────────────────────────
// Serve the fleet's guide as a single markdown document: every file listed
// in manifest.guide.paths is read as UTF-8, in order, and the contents are
// joined with a horizontal-rule separator.
//
// Responses:
//   400  `:id` does not pass validateFleetId
//   404  no manifest, no guide paths, or a guide file is missing/unreadable
//        (ENOENT / EACCES → error "guide_not_retrievable")
//   500  any other read error
//   200  text/markdown body
router.get('/:id/guide', (req, res) => {
  const fleetId = req.params.id;

  if (!validateFleetId(fleetId)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const manifest = readManifest(fleetRunsDir, fleetId);
  if (!manifest) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${fleetId}" not found` });
  }

  const guidePaths = manifest.guide?.paths;
  if (!guidePaths?.length) {
    return res
      .status(404)
      .json({ ok: false, error: 'No guide attached to this fleet' });
  }

  const sections = [];
  for (const filePath of guidePaths) {
    try {
      sections.push(readFileSync(filePath, 'utf8'));
    } catch (err) {
      // The first failing file ends the request; nothing partial is sent.
      const notRetrievable = ['ENOENT', 'EACCES'].includes(err.code);
      if (!notRetrievable) {
        return res.status(500).json({ ok: false, error: err.message });
      }
      return res.status(404).json({
        ok: false,
        error: 'guide_not_retrievable',
        hint: 'Guide was supplied via CLI from a path the UI server cannot read. View the original file on the launching machine.',
      });
    }
  }

  const body = sections.join('\n\n---\n\n');
  res.setHeader('Content-Type', 'text/markdown; charset=utf-8');
  res.send(body);
});
1147
+
1148
+ return router;
1149
+ }