@worca/ui 0.21.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1147 @@
1
+ /**
2
+ * Fleet REST endpoints (W-040 §13.6).
3
+ *
4
+ * Manifests live at ~/.worca/fleet-runs/<fleet_id>.json.
5
+ * Uploaded guide files land under ~/.worca/fleet-runs/<fleet_id>/guides/.
6
+ */
7
+
8
+ import { execFileSync } from 'node:child_process';
9
+ import {
10
+ existsSync,
11
+ mkdirSync,
12
+ readdirSync,
13
+ readFileSync,
14
+ renameSync,
15
+ unlinkSync,
16
+ writeFileSync,
17
+ } from 'node:fs';
18
+ import { homedir } from 'node:os';
19
+ import { basename, join } from 'node:path';
20
+ import { Router } from 'express';
21
+
22
// Default manifest directory: <home>/.worca/fleet-runs.
const DEFAULT_FLEET_RUNS_DIR = join(homedir(), '.worca', 'fleet-runs');
// Default cap on the combined size of uploaded guide files (64 KB).
const GUIDE_CAP_BYTES_DEFAULT = 1024 * 64;

// A fleet ID is `f_`, twelve digits, `_`, then 1–32 lowercase hex characters.
// The anchored pattern admits no separators or dots, which rules out path
// traversal when the ID is used to build a file path.
const FLEET_ID_RE = /^f_\d{12}_[0-9a-f]{1,32}$/;
27
+
28
+ // ─── helpers ───────────────────────────────────────────────────────────────
29
+
30
/**
 * Check that a value is a well-formed fleet ID.
 *
 * @param {unknown} id candidate value
 * @returns {boolean} true only for strings matching FLEET_ID_RE
 */
function validateFleetId(id) {
  if (typeof id !== 'string') return false;
  return FLEET_ID_RE.test(id);
}
33
+
34
/**
 * Build the path of a fleet's manifest file.
 *
 * @param {string} fleetRunsDir directory holding the manifests
 * @param {string} fleetId fleet identifier
 * @returns {string} `<fleetRunsDir>/<fleetId>.json`
 */
function manifestFilePath(fleetRunsDir, fleetId) {
  const fileName = `${fleetId}.json`;
  return join(fleetRunsDir, fileName);
}
37
+
38
/**
 * Load and parse a fleet manifest.
 *
 * @param {string} fleetRunsDir directory holding the manifests
 * @param {string} fleetId fleet identifier
 * @returns {object|null} the parsed manifest, or null when the file is
 *   missing or cannot be read/parsed
 */
function readManifest(fleetRunsDir, fleetId) {
  const file = manifestFilePath(fleetRunsDir, fleetId);
  if (existsSync(file)) {
    try {
      return JSON.parse(readFileSync(file, 'utf8'));
    } catch {
      // Unreadable or malformed manifests are reported as absent.
    }
  }
  return null;
}
47
+
48
/**
 * Persist a manifest to `<fleetRunsDir>/<manifest.fleet_id>.json`.
 *
 * The write is atomic: the JSON goes to a temp file which is then renamed
 * over the target, so a concurrent reader never observes a half-written
 * manifest. The temp name does not end in `.json`, so readers that filter on
 * that suffix skip it.
 *
 * @param {string} fleetRunsDir directory holding the manifests (created if needed)
 * @param {object} manifest manifest to write; must carry `fleet_id`
 * @returns {string} path of the written manifest file
 * @throws rethrows any write/rename error after a best-effort temp cleanup
 */
function saveManifest(fleetRunsDir, manifest) {
  mkdirSync(fleetRunsDir, { recursive: true });
  const target = manifestFilePath(fleetRunsDir, manifest.fleet_id);
  const scratch = `${target}.tmp.${process.pid}.${Date.now()}`;

  try {
    const serialized = `${JSON.stringify(manifest, null, 2)}\n`;
    writeFileSync(scratch, serialized, 'utf8');
    renameSync(scratch, target);
    return target;
  } catch (err) {
    try {
      unlinkSync(scratch);
    } catch {
      /* best-effort temp cleanup */
    }
    throw err;
  }
}
70
+
71
/**
 * Read every manifest in a directory.
 *
 * Only `*.json` entries are considered. Files that fail to read or parse, and
 * parsed values without a truthy `fleet_id`, are skipped.
 *
 * @param {string} fleetRunsDir directory holding the manifests
 * @returns {object[]} parsed manifests (empty when the directory is missing)
 */
function listManifests(fleetRunsDir) {
  if (!existsSync(fleetRunsDir)) return [];

  const manifests = [];
  const jsonNames = readdirSync(fleetRunsDir).filter((name) =>
    name.endsWith('.json'),
  );
  for (const name of jsonNames) {
    let parsed;
    try {
      parsed = JSON.parse(readFileSync(join(fleetRunsDir, name), 'utf8'));
    } catch {
      continue; // skip malformed
    }
    if (parsed?.fleet_id) manifests.push(parsed);
  }
  return manifests;
}
85
+
86
/**
 * Load a child's pipeline-registry entry from
 * `<project_path>/.worca/multi/pipelines.d/<run_id>.json`.
 *
 * @param {{ project_path?: string, run_id?: string }} child
 * @returns {object|null} the parsed entry, or null when the child lacks a
 *   project path or run ID, or the file is missing or unparseable
 */
function _readChildRegistry(child) {
  const projectPath = child.project_path;
  const runId = child.run_id;
  if (!(projectPath && runId)) return null;

  const registryFile = join(
    projectPath,
    '.worca',
    'multi',
    'pipelines.d',
    `${runId}.json`,
  );
  if (!existsSync(registryFile)) return null;

  let parsed = null;
  try {
    parsed = JSON.parse(readFileSync(registryFile, 'utf8'));
  } catch {
    parsed = null;
  }
  return parsed;
}
103
+
104
/**
 * Resolve a child's live status.
 *
 * Precedence: the status in the child's pipeline-registry entry, then the
 * status recorded on the manifest child itself, then `'pending'`. This is the
 * same fallback chain aggregateFleetMetrics applies, so the list and detail
 * endpoints report the same status for a child that has no registry entry.
 *
 * @param {{ project_path?: string, run_id?: string, status?: string }} child
 * @returns {string} the resolved status
 */
function resolveChildStatus(child) {
  return _readChildRegistry(child)?.status ?? child.status ?? 'pending';
}
107
+
108
/**
 * Reverse-lookup of a fleet's children.
 *
 * Walks every project listed under `<prefsDir>/projects.d/*.json` (each file
 * is expected to carry a `path`) and scans that project's
 * `.worca/multi/pipelines.d/*.json` entries for ones whose `fleet_id` matches.
 * Serves as a fallback when the manifest's own `children` array is empty or
 * out of sync. Unreadable directories and malformed files are skipped.
 *
 * @param {string} fleetId fleet to look up
 * @param {string|null} prefsDir preferences directory containing `projects.d`
 * @returns {{ project_path: string, run_id: string }[]} discovered children
 */
function _discoverChildrenFromRegistry(fleetId, prefsDir) {
  if (!fleetId || !prefsDir) return [];

  // Parse a JSON file; undefined signals "unreadable or malformed".
  const loadJson = (file) => {
    try {
      return JSON.parse(readFileSync(file, 'utf8'));
    } catch {
      return undefined;
    }
  };
  // List the `.json` entries of a directory; null signals "could not list".
  const jsonEntries = (dir) => {
    try {
      return readdirSync(dir).filter((name) => name.endsWith('.json'));
    } catch {
      return null;
    }
  };

  const projectsDir = join(prefsDir, 'projects.d');
  if (!existsSync(projectsDir)) return [];
  const projectNames = jsonEntries(projectsDir);
  if (projectNames === null) return [];

  const found = [];
  for (const projectName of projectNames) {
    const project = loadJson(join(projectsDir, projectName));
    if (!project?.path) continue;

    const pipelinesDir = join(project.path, '.worca', 'multi', 'pipelines.d');
    if (!existsSync(pipelinesDir)) continue;
    const runNames = jsonEntries(pipelinesDir);
    if (runNames === null) continue;

    for (const runName of runNames) {
      const entry = loadJson(join(pipelinesDir, runName));
      if (entry?.fleet_id !== fleetId || !entry?.run_id) continue;
      found.push({ project_path: project.path, run_id: entry.run_id });
    }
  }
  return found;
}
159
+
160
/**
 * Merge manifest children with discovered children, de-duplicating on the
 * (project_path, run_id) pair. Manifest entries come first and win over a
 * discovered entry with the same key; first-seen order is preserved.
 *
 * @param {object[]|null|undefined} manifestChildren children from the manifest
 * @param {object[]} discoveredChildren children found by reverse lookup
 * @returns {object[]} the merged, de-duplicated list
 */
function _mergeChildren(manifestChildren, discoveredChildren) {
  const identity = (child) =>
    `${child.project_path ?? ''}\0${child.run_id ?? ''}`;

  // A Map keeps insertion order, so the first occurrence of a key both wins
  // and fixes the position in the result.
  const byKey = new Map();
  for (const child of [...(manifestChildren ?? []), ...discoveredChildren]) {
    const key = identity(child);
    if (!byKey.has(key)) byKey.set(key, child);
  }
  return [...byKey.values()];
}
178
+
179
/**
 * Produce the full child list for a fleet: the manifest's `children` merged
 * with any children discovered through the project registries.
 *
 * @param {object} manifest fleet manifest
 * @param {string|null} prefsDir preferences directory used for discovery
 * @returns {object[]} merged child records
 */
function _resolveChildren(manifest, prefsDir) {
  const fromRegistry = _discoverChildrenFromRegistry(
    manifest.fleet_id,
    prefsDir,
  );
  const fromManifest = manifest.children;
  return _mergeChildren(fromManifest, fromRegistry);
}
183
+
184
/**
 * Return the fleet's children, each copied with its `status` replaced by the
 * value resolveChildStatus reports for it.
 *
 * @param {object} manifest fleet manifest
 * @param {string|null} prefsDir preferences directory used for discovery
 * @returns {object[]} child records with a resolved `status`
 */
function enrichChildren(manifest, prefsDir) {
  const enriched = [];
  for (const child of _resolveChildren(manifest, prefsDir)) {
    enriched.push({ ...child, status: resolveChildStatus(child) });
  }
  return enriched;
}
190
+
191
/**
 * Aggregate fleet-level metrics from the manifest plus each child's live
 * pipeline-registry entry. Returns:
 *   { children, cost_usd, last_activity_at }
 *
 * - `children`: compact records carrying the live status (registry status,
 *   else the manifest child's status, else 'pending').
 * - `cost_usd`: sum of `iter.cost_usd` across every stage iteration of every
 *   child that has a registry entry. Only finite numbers are added, so a
 *   malformed entry (string cost, null stage, non-array `iterations`) cannot
 *   turn the sum into a string or throw and fail the whole listing.
 * - `last_activity_at`: the latest `last_event_at` / `completed_at` /
 *   `started_at` observed across children, falling back to the manifest's
 *   `updated_at` when no child has reported yet. Timestamps are compared as
 *   strings — NOTE(review): this assumes they share one sortable format
 *   (e.g. ISO-8601 UTC); confirm against the writers.
 *
 * @param {object} manifest fleet manifest
 * @param {string|null} prefsDir preferences directory used for discovery
 * @returns {{ children: object[], cost_usd: number, last_activity_at: string|null }}
 */
function aggregateFleetMetrics(manifest, prefsDir) {
  let cost_usd = 0;
  let last_activity_at = manifest.updated_at ?? null;

  const children = _resolveChildren(manifest, prefsDir).map((c) => {
    const reg = _readChildRegistry(c);
    const compact = {
      project_path: c.project_path,
      run_id: c.run_id ?? null,
      status: reg?.status ?? c.status ?? 'pending',
    };
    if (!reg) return compact;

    for (const stage of Object.values(reg.stages ?? {})) {
      const iterations = Array.isArray(stage?.iterations)
        ? stage.iterations
        : [];
      for (const iter of iterations) {
        const cost = iter?.cost_usd;
        if (typeof cost === 'number' && Number.isFinite(cost)) {
          cost_usd += cost;
        }
      }
    }

    const ts = reg.last_event_at ?? reg.completed_at ?? reg.started_at ?? null;
    if (ts && (!last_activity_at || ts > last_activity_at)) {
      last_activity_at = ts;
    }
    return compact;
  });

  return { children, cost_usd, last_activity_at };
}
234
+
235
+ // ─── fleet status derivation ───────────────────────────────────────────────
236
+ //
237
+ // JS port of derive_fleet_status / poll_and_update_fleet_manifest from
238
+ // src/worca/orchestrator/fleet_manifest.py. The Python poller is only wired
239
+ // into tests — in production nothing re-derives a fleet's status after
240
+ // run_fleet.py exits (it launches detached children and returns within
241
+ // seconds, long before any child finishes). Without this server-side
242
+ // reconcile the manifest's stored `status` is frozen at "running" forever.
243
+
244
// Child statuses that count as still in flight.
const _RUNNING_STATES = new Set(['running', 'resuming', 'paused']);
// Child statuses that count as failures for the circuit breaker.
const _FAILURE_STATES = new Set(['failed', 'setup_failed', 'unrecoverable']);
// `interrupted` / `cancelled` are terminal-but-not-completed (and NOT
// failures — a deliberate stop must not inflate the circuit-breaker ratio).
const _TERMINAL_STATES = new Set([
  'completed',
  'interrupted',
  'cancelled',
  ..._FAILURE_STATES,
]);

/**
 * Pure derivation of a fleet status from its children's pipeline statuses.
 *
 * Rules, in order:
 *  - no children → running;
 *  - any in-flight child → `halted` / `circuit_breaker` when at least
 *    min(3, total) children are terminal, at least one failed, and
 *    failed / terminal >= threshold; otherwise running;
 *  - every child terminal → `completed` if all completed, else `failed`;
 *  - anything else (children in neither set, e.g. pending) → running.
 *
 * @param {string[]} childStatuses
 * @param {number} threshold circuit-breaker failure ratio (default 0.30)
 * @returns {{ status: string, halt_reason: string|null }}
 */
export function deriveFleetStatus(childStatuses, threshold = 0.3) {
  const verdict = (status, halt_reason = null) => ({ status, halt_reason });

  const total = childStatuses.length;
  if (!total) return verdict('running');

  // Single pass tallying each category.
  let inFlight = 0;
  let completed = 0;
  let failed = 0;
  let terminal = 0;
  for (const state of childStatuses) {
    if (_RUNNING_STATES.has(state)) inFlight += 1;
    if (state === 'completed') completed += 1;
    if (_FAILURE_STATES.has(state)) failed += 1;
    if (_TERMINAL_STATES.has(state)) terminal += 1;
  }

  // Circuit breaker fires only while in-flight children still exist.
  if (inFlight > 0) {
    const enoughEvidence = terminal >= Math.min(3, total);
    const tripped =
      enoughEvidence && failed > 0 && failed / terminal >= threshold;
    return tripped ? verdict('halted', 'circuit_breaker') : verdict('running');
  }

  // Nothing in flight: all-terminal fleets settle, anything else keeps
  // waiting on children that have not been dispatched yet.
  if (terminal !== total) return verdict('running');
  return verdict(completed === total ? 'completed' : 'failed');
}
301
+
302
/**
 * Pure decision: given a manifest and live child statuses, return the
 * effective { status, halt_reason } the API/WS should report. Writes nothing.
 *
 * Only a stored status of `running` or `resuming` (or a missing status, which
 * is read as `running`) is re-derived from the children. Every other stored
 * status is sticky and returned unchanged with the stored halt reason.
 *
 * From `resuming` the status may only advance to `running`: if the derivation
 * yields anything else, `resuming` is kept, since just-resumed children may
 * still show their pre-resume terminal state for a moment.
 *
 * @param {object} manifest
 * @param {string[]} childStatuses
 * @returns {{ status: string, halt_reason: string|null }}
 */
export function effectiveFleetStatus(manifest, childStatuses) {
  const stored = manifest.status ?? 'running';
  const storedReason = manifest.halt_reason ?? null;

  const reconcilable = stored === 'running' || stored === 'resuming';
  if (!reconcilable) {
    return { status: stored, halt_reason: storedReason };
  }

  const derived = deriveFleetStatus(
    childStatuses,
    manifest.fleet_failure_threshold ?? 0.3,
  );

  const heldAtResuming = stored === 'resuming' && derived.status !== 'running';
  if (heldAtResuming) {
    return { status: 'resuming', halt_reason: storedReason };
  }
  return { status: derived.status, halt_reason: derived.halt_reason };
}
334
+
335
/**
 * Reconcile a manifest's stored status against the live child statuses and
 * persist the manifest when the status or halt reason changed.
 *
 * On a change the manifest object is mutated in place (`status`,
 * `halt_reason`, `updated_at`) before saving. A save failure is swallowed:
 * the derived value is still returned. Callers that must not write (the WS
 * watcher) should call effectiveFleetStatus directly to avoid a
 * write→watch→write loop.
 *
 * @param {object} manifest fleet manifest (mutated on change)
 * @param {string[]} childStatuses live child statuses
 * @param {string} fleetRunsDir directory the manifest is saved to
 * @returns {{ status: string, halt_reason: string|null }} effective status
 */
function reconcileFleetStatus(manifest, childStatuses, fleetRunsDir) {
  const previousStatus = manifest.status ?? 'running';
  const previousReason = manifest.halt_reason ?? null;
  const { status, halt_reason } = effectiveFleetStatus(manifest, childStatuses);

  const unchanged = status === previousStatus && halt_reason === previousReason;
  if (unchanged) return { status, halt_reason };

  manifest.status = status;
  // A null reason only clears the stored one when the fleet is not halted.
  if (halt_reason != null) {
    manifest.halt_reason = halt_reason;
  } else if (status !== 'halted') {
    manifest.halt_reason = null;
  }
  manifest.updated_at = new Date().toISOString();

  try {
    saveManifest(fleetRunsDir, manifest);
  } catch {
    // Best-effort persistence — the derived value is still returned.
  }
  return { status, halt_reason };
}
363
+
364
/**
 * Create a new fleet identifier.
 *
 * The ID is `f_<UTC YYYYMMDDHHMM>_<8 hex chars>`; the hex part comes from
 * Math.random() and is also returned on its own as the short ID.
 *
 * @returns {{ fleet_id: string, fleet_id_short: string }}
 */
function generateFleetId() {
  const twoDigits = (value) => String(value).padStart(2, '0');
  const now = new Date();

  const stamp =
    `${now.getUTCFullYear()}` +
    twoDigits(now.getUTCMonth() + 1) +
    twoDigits(now.getUTCDate()) +
    twoDigits(now.getUTCHours()) +
    twoDigits(now.getUTCMinutes());

  // Drop the leading "0." of the hex fraction and keep at most 8 digits,
  // left-padding in case the fraction is shorter.
  const hexFraction = Math.random().toString(16);
  const suffix = hexFraction.slice(2, 10).padStart(8, '0');

  return { fleet_id: `f_${stamp}_${suffix}`, fleet_id_short: suffix };
}
376
+
377
/**
 * Reduce a client-supplied filename to a safe single path segment.
 *
 * Takes the basename, strips any remaining slashes/backslashes, and replaces
 * every character outside `[A-Za-z0-9._-]` with `_`. Names that end up empty
 * or made only of dots (`.`, `..`, …) fall back to `guide`: joined onto a
 * directory, `.` and `..` would resolve to a directory rather than a file.
 *
 * @param {string|null|undefined} raw filename as sent by the client
 * @returns {string} a non-empty name that is safe to join onto a directory
 */
function sanitizeFilename(raw) {
  const name = basename(raw || 'guide')
    .replace(/[/\\]/g, '')
    .replace(/[^A-Za-z0-9._-]/g, '_');
  if (!name || /^\.+$/.test(name)) return 'guide';
  return name;
}
383
+
384
+ // ─── multipart parser ──────────────────────────────────────────────────────
385
+
386
/**
 * Collect a request stream into a single Buffer.
 *
 * Note that no size limit is applied here; the entire body is buffered.
 *
 * @param {import('node:events').EventEmitter} req stream emitting
 *   `data` / `end` / `error`
 * @returns {Promise<Buffer>} resolves with the concatenated chunks on `end`,
 *   rejects on `error`
 */
function readRawBody(req) {
  return new Promise((resolve, reject) => {
    const received = [];
    const onData = (chunk) => {
      received.push(chunk);
    };
    const onEnd = () => {
      resolve(Buffer.concat(received));
    };
    req.on('data', onData);
    req.on('end', onEnd);
    req.on('error', reject);
  });
}
394
+
395
/**
 * Parse a multipart/form-data body into an array of parts.
 * Each part: { name: string|null, filename: string|null, content: Buffer }
 *
 * The boundary is taken from the `boundary=` parameter of the content type
 * (surrounding quotes stripped). Parts without a blank line separating
 * headers from content are dropped. `name` / `filename` come from the
 * quoted parameters of the part's Content-Disposition header; an empty
 * quoted value yields null.
 *
 * @param {Buffer} body raw request body
 * @param {string} contentType value of the Content-Type header
 * @returns {object[]|null} the parts, or null when no boundary is declared
 */
function parseMultipart(body, contentType) {
  const boundaryMatch = /boundary=([^\s;,]+)/.exec(contentType);
  if (boundaryMatch === null) return null;
  const boundary = boundaryMatch[1].replace(/^["']|["']$/g, '');

  const opener = `--${boundary}\r\n`;
  const separator = Buffer.from(`\r\n--${boundary}`);
  const collected = [];

  // Turn one raw part (headers + blank line + content) into a record.
  const decodePart = (raw) => {
    const split = raw.indexOf('\r\n\r\n');
    if (split === -1) return null;

    const headers = new Map();
    for (const line of raw.slice(0, split).toString('utf8').split('\r\n')) {
      const colon = line.indexOf(':');
      if (colon === -1) continue;
      headers.set(
        line.slice(0, colon).toLowerCase().trim(),
        line.slice(colon + 1).trim(),
      );
    }

    const disposition = headers.get('content-disposition') ?? '';
    const nameMatch = /\bname="([^"]+)"/.exec(disposition);
    const fileMatch = /\bfilename="([^"]+)"/.exec(disposition);
    return {
      name: nameMatch?.[1] ?? null,
      filename: fileMatch?.[1] ?? null,
      content: raw.slice(split + 4),
    };
  };

  // Locate the opening delimiter; everything before it is ignored.
  const start = body.indexOf(opener);
  if (start === -1) return collected;

  let cursor = start + opener.length;
  while (cursor < body.length) {
    const stop = body.indexOf(separator, cursor);
    if (stop === -1) break;

    const part = decodePart(body.slice(cursor, stop));
    if (part !== null) collected.push(part);

    // After a delimiter comes either "--" (closing) or CRLF (another part).
    cursor = stop + separator.length;
    if (body.slice(cursor, cursor + 2).toString() === '--') break;
    cursor += 2;
  }

  return collected;
}
452
+
453
+ // ─── default injectable implementations ────────────────────────────────────
454
+
455
/**
 * Check whether a local branch exists in a project's git repository.
 *
 * Uses `git show-ref --verify --quiet refs/heads/<branch>`, which matches the
 * exact ref only. The previous `git branch --list <branch>` treated the value
 * as a glob pattern (so `*` "existed" in any repository with a branch) and
 * let a value starting with `-` be parsed as a git option; prefixing with
 * `refs/heads/` removes both problems.
 *
 * @param {string} project path of the repository working tree
 * @param {string} branch branch name to look for
 * @returns {boolean} true when the branch exists; false when it does not, the
 *   inputs are not non-empty strings, or git fails for any reason
 */
function defaultValidateBaseBranch(project, branch) {
  if (typeof project !== 'string' || project === '') return false;
  if (typeof branch !== 'string' || branch === '') return false;
  try {
    // Exit status 0 means the ref exists; any non-zero status throws.
    execFileSync(
      'git',
      ['-C', project, 'show-ref', '--verify', '--quiet', `refs/heads/${branch}`],
      { stdio: 'ignore' },
    );
    return true;
  } catch {
    return false;
  }
}
467
+
468
/**
 * Default cleanup implementation: runs
 * `worca cleanup --fleet-id <fleetId> --all` synchronously.
 *
 * @param {string} fleetId fleet to clean up
 * @returns {object} always an empty object
 * @throws when the command cannot be spawned or exits non-zero
 */
function defaultRunCleanup(fleetId) {
  const cleanupArgs = ['cleanup', '--fleet-id', fleetId, '--all'];
  execFileSync('worca', cleanupArgs);
  return {};
}
472
+
473
/**
 * Run `python3 -m worca.scripts.run_fleet <flag> <fleetId>` and return the
 * number of children it reports having acted on.
 *
 * run_fleet.py --pause / --stop fan a control file out to every in-flight
 * child and stamp the manifest; they print "paused N in-flight…" /
 * "stopped N…". The count is read from the number following "paused" or
 * "stopped", so digits appearing elsewhere in the output (an echoed fleet ID,
 * a timestamp) are not mistaken for it. If no such phrase is present, the
 * first number in the output is used, as before.
 *
 * execFileSync spawns without a shell, so the arguments are passed verbatim.
 *
 * @param {string} flag lifecycle flag, e.g. `--pause` or `--stop`
 * @param {string} fleetId fleet to act on
 * @returns {number} the reported count, or 0 when none can be parsed
 * @throws when the command cannot be spawned or exits non-zero
 */
function _runFleetLifecycle(flag, fleetId) {
  const out =
    execFileSync('python3', ['-m', 'worca.scripts.run_fleet', flag, fleetId], {
      encoding: 'utf8',
    }) || '';
  const labelled = /\b(?:paused|stopped)\s+(\d+)\b/i.exec(out);
  const m = labelled ?? /(\d+)/.exec(out);
  return m ? Number(m[1]) : 0;
}
487
+
488
/**
 * Default pause implementation: invokes the fleet runner with `--pause`.
 *
 * @param {string} fleetId fleet to pause
 * @returns {{ paused_count: number }} count reported by the runner
 */
function defaultPauseFleet(fleetId) {
  const paused_count = _runFleetLifecycle('--pause', fleetId);
  return { paused_count };
}
491
+
492
/**
 * Default stop implementation: invokes the fleet runner with `--stop`.
 *
 * @param {string} fleetId fleet to stop
 * @returns {{ stopped_count: number }} count reported by the runner
 */
function defaultStopFleet(fleetId) {
  const stopped_count = _runFleetLifecycle('--stop', fleetId);
  return { stopped_count };
}
495
+
496
+ // ─── router factory ────────────────────────────────────────────────────────
497
+
498
+ /**
499
+ * @param {{
500
+ * fleetRunsDir?: string,
501
+ * prefsDir?: string,
502
+ * dispatchFleet?: (args: object) => Promise<object>,
503
+ * runCleanup?: (fleetId: string) => Promise<object>,
504
+ * pauseFleet?: (fleetId: string) => object,
505
+ * stopFleet?: (fleetId: string) => object,
506
+ * validateBaseBranch?: (project: string, branch: string) => Promise<boolean>,
507
+ * guideCapBytes?: number,
508
+ * }} opts
509
+ *
510
+ * `prefsDir` enables the reverse-lookup path: when a fleet manifest's
511
+ * `children` array is empty (older dispatcher) or out-of-sync, the router
512
+ * scans every project under `<prefsDir>/projects.d/` for pipelines.d/
513
+ * entries that reference the fleet_id and includes them in the response.
514
+ */
515
+ export function createFleetRouter({
516
+ fleetRunsDir = DEFAULT_FLEET_RUNS_DIR,
517
+ prefsDir = null,
518
+ dispatchFleet = null,
519
+ runCleanup = defaultRunCleanup,
520
+ pauseFleet = defaultPauseFleet,
521
+ stopFleet = defaultStopFleet,
522
+ validateBaseBranch = defaultValidateBaseBranch,
523
+ guideCapBytes = GUIDE_CAP_BYTES_DEFAULT,
524
+ } = {}) {
525
+ const router = Router();
526
+
527
// ── GET /api/fleet-runs ─────────────────────────────────────────────────
//
// Lists a summary per fleet. Each summary embeds a compact `children`
// array so the UI can render the fleet card without a request per fleet;
// the full manifest is served by GET /api/fleet-runs/:id.
router.get('/', (_req, res) => {
  // Build the summary for one manifest. The stored status is reconciled
  // against the children's live statuses first (this may rewrite the
  // manifest, including `updated_at`, before the summary is assembled).
  const summarize = (m) => {
    const agg = aggregateFleetMetrics(m, prefsDir);
    const liveStatuses = agg.children.map((child) => child.status);
    const { status, halt_reason } = reconcileFleetStatus(
      m,
      liveStatuses,
      fleetRunsDir,
    );
    const guide = m.guide
      ? {
          bytes: m.guide.bytes,
          filenames: m.guide.filenames,
          uploaded: m.guide.uploaded,
        }
      : null;
    const plan = m.plan ? { mode: m.plan.mode ?? 'none' } : { mode: 'none' };

    return {
      fleet_id: m.fleet_id,
      fleet_id_short: m.fleet_id_short,
      work_request: m.work_request,
      status,
      halt_reason,
      halted_at: m.halted_at ?? null,
      archived: m.archived === true,
      archived_at: m.archived_at ?? null,
      children_count: agg.children.length,
      children: agg.children,
      head_template: m.head_template ?? null,
      base_branch: m.base_branch ?? null,
      plan,
      created_at: m.created_at,
      updated_at: m.updated_at ?? null,
      last_activity_at: agg.last_activity_at,
      cost_usd: agg.cost_usd,
      guide,
    };
  };

  try {
    const fleets = listManifests(fleetRunsDir).map((m) => summarize(m));
    res.json({ ok: true, fleets });
  } catch (err) {
    res.status(500).json({ ok: false, error: err.message });
  }
});
577
+
578
// ── POST /api/fleet-runs/validate-base ──────────────────────────────────
// Body: { projects: string[], base_branch: string }. Responds with the
// projects in which the branch is missing; `ok` is true when none are.
router.post('/validate-base', async (req, res) => {
  const reject = (error) => res.status(400).json({ ok: false, error });

  const payload = req.body ?? {};
  const { projects, base_branch } = payload;
  if (typeof base_branch !== 'string' || !base_branch) {
    return reject('base_branch is required');
  }
  if (!Array.isArray(projects) || projects.length === 0) {
    return reject('projects must be a non-empty array');
  }

  try {
    // Checked one project at a time, in request order.
    const missing_in = [];
    for (const project of projects) {
      const found = await validateBaseBranch(project, base_branch);
      if (!found) missing_in.push(project);
    }
    res.json({ ok: missing_in.length === 0, missing_in });
  } catch (err) {
    res.status(500).json({ ok: false, error: err.message });
  }
});
603
+
604
// ── POST /api/fleet-runs ────────────────────────────────────────────────
// Creates a fleet. Accepts either a JSON body or multipart/form-data (form
// fields plus uploaded guide files). Writes the guide files and the
// manifest, then hands off to `dispatchFleet` when one is configured.
router.post('/', async (req, res) => {
  const badRequest = (error, extra = {}) =>
    res.status(400).json({ ok: false, error, ...extra });

  // Numeric fields arrive as numbers (JSON) or strings (multipart). Blank
  // strings, null and other types count as "not provided" instead of being
  // coerced to 0 by Number().
  const parseNumeric = (raw) => {
    if (typeof raw === 'number') return raw;
    if (typeof raw === 'string' && raw.trim() !== '') return Number(raw);
    return Number.NaN;
  };

  try {
    const contentType = req.headers['content-type'] ?? '';
    const isMultipart = contentType.includes('multipart/form-data');

    let fields = {};
    const guideFiles = []; // [{ filename: string, content: Buffer }]

    if (isMultipart) {
      const rawBody = await readRawBody(req);
      const parts = parseMultipart(rawBody, contentType);
      if (!parts) {
        return badRequest('Failed to parse multipart body');
      }
      for (const part of parts) {
        if (part.filename != null) {
          guideFiles.push({
            filename: part.filename,
            content: part.content,
          });
        } else if (part.name) {
          fields[part.name] = part.content.toString('utf8');
        }
      }
      if (typeof fields.projects === 'string') {
        try {
          fields.projects = JSON.parse(fields.projects);
        } catch {
          // leave as string
        }
      }
    } else {
      fields = req.body ?? {};
    }

    const {
      projects = [],
      prompt,
      source,
      head_template,
      base_branch,
      plan_mode,
      max_parallel = 5,
      fleet_failure_threshold = 0.3,
    } = fields;

    if (!prompt && !source) {
      return badRequest('prompt or source is required');
    }
    // The title below is built with String#slice; a non-string value from a
    // JSON body would otherwise throw and surface as a 500.
    const isOptionalString = (v) => v == null || typeof v === 'string';
    if (!isOptionalString(prompt) || !isOptionalString(source)) {
      return badRequest('prompt and source must be strings');
    }

    const { fleet_id, fleet_id_short } = generateFleetId();

    // Save uploaded guide files.
    // The total size is validated BEFORE anything is written, so an
    // oversized upload never leaves files under <fleet_id>/guides/.
    let guideEntry = null;
    if (guideFiles.length > 0) {
      const totalBytes = guideFiles.reduce(
        (sum, f) => sum + f.content.length,
        0,
      );
      if (totalBytes > guideCapBytes) {
        return badRequest(
          `Guide files exceed size cap of ${guideCapBytes} bytes`,
          { guide_bytes: totalBytes, cap_bytes: guideCapBytes },
        );
      }

      // Resolve every filename to a unique sanitized name first — no I/O yet.
      const usedNames = new Set();
      const planned = guideFiles.map(({ filename, content }) => {
        let safe = sanitizeFilename(filename);
        if (usedNames.has(safe)) {
          const dot = safe.lastIndexOf('.');
          const nameBase = dot !== -1 ? safe.slice(0, dot) : safe;
          const ext = dot !== -1 ? safe.slice(dot) : '';
          let counter = 1;
          while (usedNames.has(`${nameBase}-${counter}${ext}`)) counter++;
          safe = `${nameBase}-${counter}${ext}`;
        }
        usedNames.add(safe);
        return { safe, content };
      });

      const guidesDir = join(fleetRunsDir, fleet_id, 'guides');
      mkdirSync(guidesDir, { recursive: true });
      const savedPaths = [];
      const savedFilenames = [];
      for (const { safe, content } of planned) {
        const target = join(guidesDir, safe);
        writeFileSync(target, content);
        savedPaths.push(target);
        savedFilenames.push(safe);
      }

      guideEntry = {
        paths: savedPaths,
        bytes: totalBytes,
        filenames: savedFilenames,
        uploaded: true,
      };
    }

    // `|| default` used to discard a deliberate threshold of 0 (halt on the
    // first failure); only missing or invalid values fall back now.
    const parallel = parseNumeric(max_parallel);
    const threshold = parseNumeric(fleet_failure_threshold);

    const manifest = {
      fleet_id,
      fleet_id_short,
      created_at: new Date().toISOString(),
      work_request: {
        title: (prompt || source || '').slice(0, 80),
        description: prompt ?? '',
        source: source ?? null,
      },
      guide: guideEntry,
      plan: { mode: plan_mode ?? 'none', path: null },
      head_template: head_template ?? 'migration/{slug}/{project}',
      base_branch: base_branch ?? null,
      max_parallel: Number.isFinite(parallel) && parallel > 0 ? parallel : 5,
      fleet_failure_threshold:
        Number.isFinite(threshold) && threshold >= 0 ? threshold : 0.3,
      status: 'running',
      halt_reason: null,
      children: [],
    };

    const manifest_path = saveManifest(fleetRunsDir, manifest);

    if (dispatchFleet) {
      try {
        await dispatchFleet({
          fleet_id,
          manifest,
          manifest_path,
          projects,
        });
      } catch (err) {
        // Record the failure so the fleet does not linger as `running`.
        manifest.status = 'failed';
        saveManifest(fleetRunsDir, manifest);
        return res.status(500).json({
          ok: false,
          error: `Fleet dispatch failed: ${err.message}`,
        });
      }
    }

    res.status(201).json({ ok: true, fleet_id, manifest_path });
  } catch (err) {
    res.status(500).json({ ok: false, error: err.message });
  }
});
760
+
761
// ── GET /api/fleet-runs/:id ─────────────────────────────────────────────
// Returns the full manifest with live child statuses and a reconciled
// fleet status (reconciling may persist the manifest).
router.get('/:id', (req, res) => {
  const fleetId = req.params.id;
  if (!validateFleetId(fleetId)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const manifest = readManifest(fleetRunsDir, fleetId);
  if (!manifest) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${fleetId}" not found` });
  }

  const children = enrichChildren(manifest, prefsDir);
  const liveStatuses = children.map((child) => child.status);
  const { status, halt_reason } = reconcileFleetStatus(
    manifest,
    liveStatuses,
    fleetRunsDir,
  );

  const fleet = { ...manifest, status, halt_reason, children };
  res.json({ ok: true, fleet });
});
786
+
787
// ── POST /api/fleet-runs/:id/archive ────────────────────────────────────
// Marks a fleet as archived by stamping `archived` + `archived_at` on the
// manifest. Refuses (409) while the stored status is running/resuming and
// is idempotent when the fleet is already archived.
router.post('/:id/archive', (req, res) => {
  const fleetId = req.params.id;
  if (!validateFleetId(fleetId)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const manifest = readManifest(fleetRunsDir, fleetId);
  if (!manifest) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${fleetId}" not found` });
  }

  const inFlight = ['running', 'resuming'].includes(manifest.status);
  if (inFlight) {
    return res
      .status(409)
      .json({ ok: false, error: 'Cannot archive an in-flight fleet' });
  }

  if (manifest.archived !== true) {
    manifest.archived = true;
    manifest.archived_at = new Date().toISOString();
    saveManifest(fleetRunsDir, manifest);
    return res.json({ ok: true, archived_at: manifest.archived_at });
  }
  // Already archived: report the existing stamp without rewriting.
  res.json({ ok: true, archived_at: manifest.archived_at ?? null });
});
815
+
816
// ── POST /api/fleet-runs/:id/unarchive ──────────────────────────────────
// Removes the `archived` / `archived_at` stamps from the manifest. A fleet
// that is not archived is left untouched and still answers ok.
router.post('/:id/unarchive', (req, res) => {
  const fleetId = req.params.id;
  if (!validateFleetId(fleetId)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const manifest = readManifest(fleetRunsDir, fleetId);
  if (!manifest) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${fleetId}" not found` });
  }

  if (manifest.archived === true) {
    delete manifest.archived;
    delete manifest.archived_at;
    saveManifest(fleetRunsDir, manifest);
  }
  res.json({ ok: true });
});
836
+
837
// ── DELETE /api/fleet-runs/:id ──────────────────────────────────────────
// Halts a fleet and, with ?cleanup=1, also runs cleanup for it.
//
// Responses:
//   400 invalid ID · 404 unknown fleet
//   412 cleanup requested on a resumable fleet without ?force=1
//   200 { halted_count, already_halted } for a plain DELETE on a stopped fleet
//   200 { halted_count, ...cleanupResult } otherwise
//   500 when reading children, saving the manifest, or cleanup fails
router.delete('/:id', async (req, res) => {
  const { id } = req.params;
  if (!validateFleetId(id)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  // This handler is async, so the whole body is guarded like the other async
  // handlers in this router; an error thrown by the file I/O below would
  // otherwise reject the handler's promise instead of producing a response.
  try {
    const manifest = readManifest(fleetRunsDir, id);
    if (!manifest) {
      return res
        .status(404)
        .json({ ok: false, error: `Fleet "${id}" not found` });
    }

    const { cleanup, force } = req.query;
    const currentStatus = manifest.status;
    // "Already stopped" — a non-in-flight state that is resumable, so cleanup
    // forfeits resume and a plain DELETE (Halt) is a no-op.
    const alreadyHalted =
      currentStatus === 'halted' ||
      currentStatus === 'failed' ||
      currentStatus === 'paused';

    // Resume-loss gate (412) applies ONLY to the cleanup path.
    if (cleanup === '1' && alreadyHalted && force !== '1') {
      return res.status(412).json({
        ok: false,
        error:
          'Fleet is in a resumable state. Pass ?force=1 to confirm cleanup will block future --resume attempts.',
        current_status: currentStatus,
      });
    }

    // Plain DELETE on an already-stopped fleet: no-op (200).
    if (cleanup !== '1' && alreadyHalted) {
      return res.json({
        ok: true,
        halted_count: 0,
        already_halted: true,
      });
    }

    // Children whose resolved status is `pending` are the ones being halted.
    const enriched = enrichChildren(manifest, prefsDir);
    const halted_count = enriched.filter((c) => c.status === 'pending').length;

    manifest.status = 'halted';
    manifest.halt_reason = 'user';
    manifest.halted_at = new Date().toISOString();
    saveManifest(fleetRunsDir, manifest);

    if (cleanup === '1') {
      let cleanResult = {};
      try {
        cleanResult = (await runCleanup(id)) ?? {};
      } catch (err) {
        return res
          .status(500)
          .json({ ok: false, error: `Cleanup failed: ${err.message}` });
      }
      return res.json({ ok: true, halted_count, ...cleanResult });
    }

    res.json({ ok: true, halted_count });
  } catch (err) {
    res.status(500).json({ ok: false, error: err.message });
  }
});
905
+
906
// ── POST /api/fleet-runs/:id/resume ────────────────────────────────────
// Resume a previously stopped fleet by re-dispatching it with `resume: true`.
//
// Responses:
//   400 – `:id` does not match the fleet ID format.
//   404 – no readable manifest exists for that ID.
//   409 – the fleet is already in flight (`running` or `resuming`).
//   410 – a launched child (one with a `run_id`) has no registry entry at
//         <project_path>/.worca/multi/pipelines.d/<run_id>.json.
//   500 – the dispatcher threw; this handler does not write the manifest.
//   200 – `{ ok: true, relaunched_count }`; the manifest is saved with
//         status "running" and halt_reason null.
router.post('/:id/resume', async (req, res) => {
  const { id } = req.params;
  if (!validateFleetId(id)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const manifest = readManifest(fleetRunsDir, id);
  if (!manifest) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${id}" not found` });
  }

  // `resuming` counts as in flight as well (the pause and stop endpoints
  // accept it), so a second resume request must not dispatch again.
  if (manifest.status === 'running' || manifest.status === 'resuming') {
    return res
      .status(409)
      .json({ ok: false, error: 'Fleet is already running' });
  }

  // 410 when any launched child (has run_id) is missing its registry entry.
  const children = Array.isArray(manifest.children) ? manifest.children : [];
  const cleanedChildren = children.filter((child) => {
    // Children that were never launched have nothing to verify.
    if (!child?.run_id) return false;
    // path.join() throws on a non-string segment, and a throw inside this
    // async handler would surface as an unhandled rejection. A launched
    // child whose registry cannot even be located is reported as missing.
    if (typeof child.project_path !== 'string') return true;
    const registryEntry = join(
      child.project_path,
      '.worca',
      'multi',
      'pipelines.d',
      `${child.run_id}.json`,
    );
    return !existsSync(registryEntry);
  });

  if (cleanedChildren.length > 0) {
    return res.status(410).json({
      ok: false,
      error:
        'Resume is unavailable — one or more child worktrees have been cleaned.',
      cleaned_run_ids: cleanedChildren.map((c) => c.run_id),
    });
  }

  // Only flip the manifest to `running` AFTER dispatch succeeds, so a
  // throwing dispatcher cannot leave the manifest stuck at `running` with
  // zero live children. The previous status is preserved on failure,
  // leaving the user free to retry resume.
  let relaunched_count = 0;
  if (dispatchFleet) {
    try {
      const result = await dispatchFleet({
        fleet_id: id,
        manifest,
        resume: true,
      });
      relaunched_count = result?.relaunched_count ?? 0;
    } catch (err) {
      return res
        .status(500)
        .json({ ok: false, error: `Resume failed: ${err.message}` });
    }
  }

  manifest.status = 'running';
  manifest.halt_reason = null;
  saveManifest(fleetRunsDir, manifest);

  res.json({ ok: true, relaunched_count });
});
975
+
976
// ── POST /api/fleet-runs/:id/pause ──────────────────────────────────────
// Pause an in-flight fleet. The request is rejected unless the manifest
// status is "running" or "resuming"; otherwise the work is delegated to
// pauseFleet(id), whose result fields are merged into the success payload.
// pauseFleet is expected to signal each in-flight child with a `pause`
// control file and mark the manifest "paused" until an explicit resume —
// that behavior lives outside this handler.
//
// Responses: 400 invalid ID · 404 unknown fleet · 409 not in flight ·
// 500 pauseFleet (or the response write) threw · 200 `{ ok: true, ... }`.
router.post('/:id/pause', (req, res) => {
  const fleetId = req.params.id;
  if (!validateFleetId(fleetId)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const fleet = readManifest(fleetRunsDir, fleetId);
  if (!fleet) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${fleetId}" not found` });
  }

  const inFlight = fleet.status === 'running' || fleet.status === 'resuming';
  if (!inFlight) {
    return res.status(409).json({
      ok: false,
      error: `Cannot pause a fleet in "${fleet.status}" state`,
      current_status: fleet.status,
    });
  }

  try {
    const outcome = pauseFleet(fleetId) ?? {};
    return res.json({ ok: true, ...outcome });
  } catch (err) {
    return res
      .status(500)
      .json({ ok: false, error: `Pause failed: ${err.message}` });
  }
});
1008
+
1009
// ── POST /api/fleet-runs/:id/stop ───────────────────────────────────────
// Stop an in-flight fleet. Only fleets whose manifest status is "running"
// or "resuming" qualify; the actual work is handed to stopFleet(id) and
// whatever object it returns is spread into the success payload.
// stopFleet is expected to write a `stop` control file for every in-flight
// child, SIGTERM the child processes, and mark the manifest "halted" with
// halt_reason "stopped" — none of that is performed by this handler itself.
//
// Responses: 400 invalid ID · 404 unknown fleet · 409 not in flight ·
// 500 stopFleet (or the response write) threw · 200 `{ ok: true, ... }`.
router.post('/:id/stop', (req, res) => {
  const { id: fleetId } = req.params;

  if (!validateFleetId(fleetId)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const current = readManifest(fleetRunsDir, fleetId);
  if (!current) {
    const notFound = { ok: false, error: `Fleet "${fleetId}" not found` };
    return res.status(404).json(notFound);
  }

  const stoppableStates = ['running', 'resuming'];
  if (!stoppableStates.includes(current.status)) {
    const conflict = {
      ok: false,
      error: `Cannot stop a fleet in "${current.status}" state`,
      current_status: current.status,
    };
    return res.status(409).json(conflict);
  }

  try {
    const stopResult = stopFleet(fleetId) ?? {};
    return res.json({ ok: true, ...stopResult });
  } catch (err) {
    const failure = { ok: false, error: `Stop failed: ${err.message}` };
    return res.status(500).json(failure);
  }
});
1041
+
1042
// ── POST /api/fleet-runs/:id/relaunch ───────────────────────────────────
// Clone an existing fleet's manifest under a freshly generated fleet ID,
// save it, and dispatch it as a new run. The source manifest is not
// modified.
//
// Optional JSON body overrides:
//   prompt        – string; replaces work_request.description and sets
//                   work_request.title to its first 80 characters. Falsy
//                   values are ignored.
//   head_template – applied when not null/undefined.
//   base_branch   – applied whenever the key is present (null included).
//
// Responses:
//   400 – invalid fleet ID, or `prompt` is truthy but not a string.
//   404 – no readable manifest for the source fleet.
//   500 – the dispatcher threw; the new manifest is re-saved as "failed".
//   200 – `{ ok: true, new_fleet_id, manifest_path }`.
router.post('/:id/relaunch', async (req, res) => {
  const { id } = req.params;
  if (!validateFleetId(id)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const manifest = readManifest(fleetRunsDir, id);
  if (!manifest) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${id}" not found` });
  }

  const overrides = req.body ?? {};

  // The prompt comes straight from the request body. Reject non-string
  // values before anything is written: `.slice()` throws on numbers and
  // plain objects, and would silently yield an array title for arrays.
  if (overrides.prompt && typeof overrides.prompt !== 'string') {
    return res
      .status(400)
      .json({ ok: false, error: 'prompt must be a string' });
  }

  const { fleet_id: newId, fleet_id_short: newShort } = generateFleetId();

  const workRequest = { ...manifest.work_request };
  if (overrides.prompt) {
    workRequest.description = overrides.prompt;
    workRequest.title = overrides.prompt.slice(0, 80);
  }

  // Start from the source manifest, then reset identity and run state so
  // the clone begins as a fresh, child-less running fleet.
  const newManifest = {
    ...manifest,
    fleet_id: newId,
    fleet_id_short: newShort,
    created_at: new Date().toISOString(),
    status: 'running',
    halt_reason: null,
    children: [],
    work_request: workRequest,
  };
  if (overrides.head_template != null) {
    newManifest.head_template = overrides.head_template;
  }
  if (overrides.base_branch !== undefined) {
    newManifest.base_branch = overrides.base_branch;
  }

  const manifest_path = saveManifest(fleetRunsDir, newManifest);

  if (dispatchFleet) {
    try {
      await dispatchFleet({
        fleet_id: newId,
        manifest: newManifest,
        manifest_path,
      });
    } catch (err) {
      // Record the failed launch so the new fleet is not left marked as
      // running.
      newManifest.status = 'failed';
      saveManifest(fleetRunsDir, newManifest);
      return res
        .status(500)
        .json({ ok: false, error: `Relaunch failed: ${err.message}` });
    }
  }

  res.json({ ok: true, new_fleet_id: newId, manifest_path });
});
1104
+
1105
// ── GET /api/fleet-runs/:id/guide ───────────────────────────────────────
// Serve a fleet's guide as a single Markdown document. Each file listed in
// manifest.guide.paths is read as UTF-8, in order, and the pieces are
// joined with a "---" separator.
//
// Responses:
//   400 – invalid fleet ID.
//   404 – unknown fleet, no guide paths recorded, or a guide file that is
//         missing/unreadable (ENOENT / EACCES → `guide_not_retrievable`).
//   500 – any other read error.
//   200 – text/markdown body.
router.get('/:id/guide', (req, res) => {
  const fleetId = req.params.id;
  if (!validateFleetId(fleetId)) {
    return res.status(400).json({ ok: false, error: 'Invalid fleet ID' });
  }

  const fleet = readManifest(fleetRunsDir, fleetId);
  if (!fleet) {
    return res
      .status(404)
      .json({ ok: false, error: `Fleet "${fleetId}" not found` });
  }

  const guidePaths = fleet.guide?.paths;
  if (!guidePaths?.length) {
    return res
      .status(404)
      .json({ ok: false, error: 'No guide attached to this fleet' });
  }

  const sections = [];
  for (const file of guidePaths) {
    let text;
    try {
      text = readFileSync(file, 'utf8');
    } catch (err) {
      // Missing or permission-denied files are reported as "not
      // retrievable"; every other failure is reported as a server error.
      const unreadable = err.code === 'ENOENT' || err.code === 'EACCES';
      if (!unreadable) {
        return res.status(500).json({ ok: false, error: err.message });
      }
      return res.status(404).json({
        ok: false,
        error: 'guide_not_retrievable',
        hint: 'Guide was supplied via CLI from a path the UI server cannot read. View the original file on the launching machine.',
      });
    }
    sections.push(text);
  }

  const markdown = sections.join('\n\n---\n\n');
  res.setHeader('Content-Type', 'text/markdown; charset=utf-8');
  res.send(markdown);
});
1145
+
1146
+ return router;
1147
+ }