@triflux/remote 10.0.0-alpha.2 → 10.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,554 +0,0 @@
1
- // hub/team/swarm-hypervisor.mjs — Multi-model swarm orchestration hypervisor
2
- // Consumes a SwarmPlan (from swarm-planner.mjs) and orchestrates parallel
3
- // conductor sessions with file-lease enforcement, result validation,
4
- // and ordered integration.
5
- //
6
- // Failure modes handled:
7
- // F1: Worker crash → conductor auto-restart (maxRestarts)
8
- // F2: Rate limit → account-broker cooldown + fallback agent
9
- // F3: Stall → health probe L1 detection + kill + restart
10
- // F4: File lease violation → revert worker changes, flag shard as failed
11
- // F5: Merge conflict → retry integration with conflict resolution
12
-
13
- import { EventEmitter } from 'node:events';
14
- import { join } from 'node:path';
15
- import { mkdirSync, readFileSync, existsSync } from 'node:fs';
16
- import { execSync } from 'node:child_process';
17
-
18
- import { createConductor, STATES } from './conductor.mjs';
19
- import { createSwarmLocks } from './swarm-locks.mjs';
20
- import { createEventLog } from './event-log.mjs';
21
-
22
- // ── Swarm states ──────────────────────────────────────────────
23
-
24
// Lifecycle states for the swarm as a whole. Transitions happen only through
// setState() inside createSwarmHypervisor(); every transition is appended to
// the event log and re-emitted as a 'stateChange' event.
export const SWARM_STATES = Object.freeze({
  PLANNING: 'planning',       // initial state; launch() is only legal here
  LAUNCHING: 'launching',     // shards being started in dependency order
  RUNNING: 'running',         // workers active; completion checks are live
  INTEGRATING: 'integrating', // all shards terminal; merging results in order
  VALIDATING: 'validating',   // NOTE(review): declared but never set by this module — confirm
  COMPLETED: 'completed',     // terminal: integration finished
  FAILED: 'failed',           // terminal: fatal failure or forced shutdown
});
33
-
34
- // ── Failure mode classification ───────────────────────────────
35
-
36
// Classification labels for shard failures (assigned by classifyFailure()).
// Mirrors the F1–F5 failure modes listed in the file header.
const FAILURE_MODES = Object.freeze({
  F1_CRASH: 'F1_crash',                     // default when no other pattern matches
  F2_RATE_LIMIT: 'F2_rate_limit',           // triggers a fallback-agent relaunch
  F3_STALL: 'F3_stall',                     // health-probe stall / timeout
  F4_LEASE_VIOLATION: 'F4_lease_violation', // shard touched files outside its lease
  F5_MERGE_CONFLICT: 'F5_merge_conflict',   // merge/conflict text in the death reason
});
43
-
44
// Agent substitution table, used both for rate-limit (F2) relaunches and for
// the redundant twin of critical shards.
// NOTE(review): the mapping is asymmetric — no agent falls back to 'claude';
// confirm that is intentional.
const FALLBACK_AGENTS = Object.freeze({
  codex: 'gemini',
  gemini: 'codex',
  claude: 'codex',
});
49
-
50
/**
 * Create a swarm hypervisor.
 * @param {object} opts
 * @param {string} opts.workdir — repository root / working directory
 * @param {string} opts.logsDir — base directory for all logs
 * @param {number} [opts.maxRestarts=2] — per-shard max restarts
 * @param {number} [opts.graceMs=10000] — conductor shutdown grace period
 * @param {number} [opts.integrationTimeoutMs=60000] — max time for integration phase
 * @param {object} [opts.probeOpts] — health probe overrides
 * @param {object} [opts.deps] — dependency injection for testing
 * @returns {SwarmHypervisor}
 */
export function createSwarmHypervisor(opts) {
  const {
    workdir,
    logsDir,
    maxRestarts = 2,
    graceMs = 10_000,
    // NOTE(review): integrationTimeoutMs and deps are destructured here but
    // never read anywhere else in this module — confirm whether timeout
    // enforcement / dependency injection was dropped accidentally.
    integrationTimeoutMs = 60_000,
    probeOpts = {},
    deps = {},
  } = opts;

  if (!workdir) throw new Error('workdir is required');
  if (!logsDir) throw new Error('logsDir is required');

  // The logs directory must exist before the event log is opened below.
  mkdirSync(logsDir, { recursive: true });

  const emitter = new EventEmitter();
  const eventLog = createEventLog(join(logsDir, 'swarm-events.jsonl'));

  // Swarm-wide mutable state, shared by the closures below.
  let state = SWARM_STATES.PLANNING;
  let plan = null;        // SwarmPlan, set by launch()
  let lockManager = null; // file-lease manager, created in launch()

  /** @type {Map<string, { conductor, shardConfig, result, status }>} */
  const workers = new Map();

  /** @type {Map<string, { conductor, shardConfig }>} redundant workers for critical shards */
  const redundantWorkers = new Map();

  const results = new Map();  // shardName → validated result
  const failures = new Map(); // shardName → failure info
94
- // ── State machine ───────────────────────────────────────────
95
-
96
- function setState(next, reason = '') {
97
- const prev = state;
98
- state = next;
99
- eventLog.append('swarm_state', { from: prev, to: next, reason });
100
- emitter.emit('stateChange', { from: prev, to: next, reason });
101
- }
102
-
103
- // ── Worker lifecycle ────────────────────────────────────────
104
-
105
- function buildSessionConfig(shard) {
106
- return {
107
- id: `swarm-${shard.name}-${Date.now()}`,
108
- agent: shard.agent,
109
- prompt: shard.prompt,
110
- workdir,
111
- mcpServers: shard.mcp,
112
- };
113
- }
114
-
115
- function launchShard(shard, isRedundant = false) {
116
- const shardLogsDir = join(logsDir, isRedundant ? `${shard.name}-redundant` : shard.name);
117
- mkdirSync(shardLogsDir, { recursive: true });
118
-
119
- const conductor = createConductor({
120
- logsDir: shardLogsDir,
121
- maxRestarts,
122
- graceMs,
123
- probeOpts,
124
- onCompleted: (sessionId) => handleShardCompleted(shard.name, sessionId, isRedundant),
125
- });
126
-
127
- const sessionConfig = buildSessionConfig(shard);
128
-
129
- // Acquire file leases
130
- if (!isRedundant) {
131
- const leaseResult = lockManager.acquire(shard.name, shard.files);
132
- if (!leaseResult.ok) {
133
- eventLog.append('lease_denied', {
134
- shard: shard.name,
135
- conflicts: leaseResult.conflicts,
136
- });
137
- failures.set(shard.name, {
138
- mode: FAILURE_MODES.F4_LEASE_VIOLATION,
139
- conflicts: leaseResult.conflicts,
140
- });
141
- return null;
142
- }
143
- }
144
-
145
- conductor.spawnSession(sessionConfig);
146
-
147
- eventLog.append('shard_launched', {
148
- shard: shard.name,
149
- agent: shard.agent,
150
- sessionId: sessionConfig.id,
151
- isRedundant,
152
- files: shard.files,
153
- });
154
-
155
- const entry = { conductor, shardConfig: shard, sessionConfig, startedAt: Date.now() };
156
-
157
- if (isRedundant) {
158
- redundantWorkers.set(shard.name, entry);
159
- } else {
160
- workers.set(shard.name, entry);
161
- }
162
-
163
- // Listen for dead events (F1/F2/F3)
164
- conductor.on('dead', ({ sessionId, reason }) => {
165
- handleShardFailed(shard.name, sessionId, reason, isRedundant);
166
- });
167
-
168
- return entry;
169
- }
170
-
171
- // ── Completion handling ─────────────────────────────────────
172
-
173
- function handleShardCompleted(shardName, sessionId, isRedundant) {
174
- eventLog.append('shard_completed', { shard: shardName, sessionId, isRedundant });
175
-
176
- if (isRedundant) {
177
- // Redundant worker completed first — kill primary if still running
178
- const primary = workers.get(shardName);
179
- if (primary && !isTerminal(primary)) {
180
- eventLog.append('redundant_wins', { shard: shardName });
181
- void primary.conductor.shutdown('redundant_completed_first');
182
- }
183
- } else {
184
- // Primary completed — kill redundant if exists
185
- const redundant = redundantWorkers.get(shardName);
186
- if (redundant) {
187
- void redundant.conductor.shutdown('primary_completed_first');
188
- }
189
- }
190
-
191
- emitter.emit('shardCompleted', { shardName, sessionId, isRedundant });
192
- checkAllShardsCompleted();
193
- }
194
-
195
- function handleShardFailed(shardName, sessionId, reason, isRedundant) {
196
- const failureMode = classifyFailure(reason);
197
-
198
- eventLog.append('shard_failed', {
199
- shard: shardName,
200
- sessionId,
201
- reason,
202
- failureMode,
203
- isRedundant,
204
- });
205
-
206
- if (isRedundant) return; // redundant failure is non-critical
207
-
208
- // F2: Rate limit — try fallback agent
209
- if (failureMode === FAILURE_MODES.F2_RATE_LIMIT) {
210
- const shard = plan.shards.find((s) => s.name === shardName);
211
- if (shard) {
212
- const fallbackAgent = FALLBACK_AGENTS[shard.agent];
213
- if (fallbackAgent) {
214
- eventLog.append('fallback_agent', {
215
- shard: shardName,
216
- from: shard.agent,
217
- to: fallbackAgent,
218
- });
219
- const fallbackShard = { ...shard, agent: fallbackAgent };
220
- lockManager.release(shardName);
221
- launchShard(fallbackShard);
222
- return;
223
- }
224
- }
225
- }
226
-
227
- failures.set(shardName, { mode: failureMode, reason, sessionId });
228
- lockManager.release(shardName);
229
-
230
- emitter.emit('shardFailed', { shardName, failureMode, reason });
231
- checkAllShardsCompleted();
232
- }
233
-
234
- function classifyFailure(reason) {
235
- if (!reason) return FAILURE_MODES.F1_CRASH;
236
- const r = String(reason).toLowerCase();
237
- if (/rate.?limit|cooldown/u.test(r)) return FAILURE_MODES.F2_RATE_LIMIT;
238
- if (/stall|l1_stall|timeout/u.test(r)) return FAILURE_MODES.F3_STALL;
239
- if (/lease|violation/u.test(r)) return FAILURE_MODES.F4_LEASE_VIOLATION;
240
- if (/merge|conflict/u.test(r)) return FAILURE_MODES.F5_MERGE_CONFLICT;
241
- return FAILURE_MODES.F1_CRASH;
242
- }
243
-
244
- function isTerminal(entry) {
245
- const snap = entry.conductor.getSnapshot();
246
- return snap.every((s) => s.state === STATES.COMPLETED || s.state === STATES.DEAD);
247
- }
248
-
249
- // ── Integration ─────────────────────────────────────────────
250
-
251
- function checkAllShardsCompleted() {
252
- if (state !== SWARM_STATES.RUNNING) return;
253
-
254
- const allDone = plan.mergeOrder.every((name) => {
255
- const w = workers.get(name);
256
- return (w && isTerminal(w)) || failures.has(name);
257
- });
258
-
259
- if (allDone) {
260
- void integrateResults();
261
- }
262
- }
263
-
264
- /**
265
- * Validate a shard's output — check for file lease violations.
266
- * @param {string} shardName
267
- * @param {string[]} changedFiles — files the shard actually modified
268
- * @returns {{ ok: boolean, violations: Array }}
269
- */
270
- function validateResult(shardName, changedFiles) {
271
- const violations = lockManager.validateChanges(shardName, changedFiles);
272
-
273
- eventLog.append('validate_result', {
274
- shard: shardName,
275
- changedFiles,
276
- violations,
277
- ok: violations.length === 0,
278
- });
279
-
280
- return {
281
- ok: violations.length === 0,
282
- violations,
283
- };
284
- }
285
-
286
  /**
   * Integrate results from all completed shards in merge order.
   * NOTE(review): the original docstring claimed "uses git operations for
   * conflict detection", but no git call happens here (execSync is imported
   * at file top yet never used) — confirm whether that step was removed.
   */
  async function integrateResults() {
    setState(SWARM_STATES.INTEGRATING, 'all_shards_done');

    const integrated = [];
    const integrationFailures = [];

    for (const shardName of plan.mergeOrder) {
      // Shards that already failed (crash, stall, lease denial, ...) are
      // skipped outright.
      if (failures.has(shardName)) {
        eventLog.append('skip_failed_shard', { shard: shardName });
        continue;
      }

      const worker = workers.get(shardName);
      if (!worker) continue;

      // Read shard output log for changed files
      const changedFiles = detectChangedFiles(shardName, worker);

      // Validate against lease map (F4). NOTE(review): despite the event
      // name below, no working-tree revert happens here — confirm whether
      // the revert promised by the header ("F4 ... revert worker changes")
      // happens elsewhere.
      const validation = validateResult(shardName, changedFiles);
      if (!validation.ok) {
        failures.set(shardName, {
          mode: FAILURE_MODES.F4_LEASE_VIOLATION,
          violations: validation.violations,
        });
        eventLog.append('lease_violation_revert', {
          shard: shardName,
          violations: validation.violations,
        });
        integrationFailures.push(shardName);
        continue;
      }

      results.set(shardName, {
        shard: shardName,
        changedFiles,
        completedAt: Date.now(),
      });
      integrated.push(shardName);
    }

    eventLog.append('integration_complete', {
      integrated,
      failed: integrationFailures,
      skipped: [...failures.keys()].filter((n) => !integrationFailures.includes(n)),
    });

    // NOTE(review): the factory accepts integrationTimeoutMs but this phase
    // never enforces it. Also, a swarm where every shard failed *before*
    // integration (both lists empty) still lands in COMPLETED with
    // "0/N integrated" — confirm both behaviors are intended.
    if (integrationFailures.length > 0 && integrated.length === 0) {
      setState(SWARM_STATES.FAILED, 'all_shards_failed_integration');
    } else {
      setState(SWARM_STATES.COMPLETED, `${integrated.length}/${plan.shards.length} integrated`);
    }

    emitter.emit('integrationComplete', {
      integrated,
      failed: integrationFailures,
      results: [...results.values()],
    });
  }
349
-
350
- /**
351
- * Detect which files a shard modified by reading its output logs.
352
- * Falls back to an empty list if detection fails.
353
- * @param {string} shardName
354
- * @param {object} worker
355
- * @returns {string[]}
356
- */
357
- function detectChangedFiles(shardName, worker) {
358
- // Best-effort: parse output log for file paths
359
- const outPath = join(logsDir, shardName);
360
- try {
361
- const snap = worker.conductor.getSnapshot();
362
- for (const session of snap) {
363
- if (session.outPath && existsSync(session.outPath)) {
364
- const output = readFileSync(session.outPath, 'utf8');
365
- return extractFilePathsFromOutput(output, plan.leaseMap.get(shardName) || []);
366
- }
367
- }
368
- } catch { /* best-effort */ }
369
-
370
- // Fallback: trust the lease map (shard was allowed these files)
371
- return plan.leaseMap.get(shardName) || [];
372
- }
373
-
374
- /**
375
- * Extract modified file paths from worker output text.
376
- * Looks for common patterns: "wrote file.mjs", "modified file.mjs", diff headers.
377
- * @param {string} output
378
- * @param {string[]} allowedFiles — lease map files to match against
379
- * @returns {string[]}
380
- */
381
- function extractFilePathsFromOutput(output, allowedFiles) {
382
- if (!output) return allowedFiles;
383
-
384
- const found = new Set();
385
- const lines = output.split(/\r?\n/);
386
-
387
- for (const line of lines) {
388
- // Match common patterns
389
- const patterns = [
390
- /(?:wrote|created|modified|updated|edited)\s+['"]?([^\s'"]+\.\w+)/i,
391
- /^[+-]{3}\s+[ab]\/(.+)/, // diff headers
392
- /^diff --git a\/(.+)\s+b\//, // git diff headers
393
- ];
394
-
395
- for (const re of patterns) {
396
- const match = line.match(re);
397
- if (match) found.add(match[1]);
398
- }
399
- }
400
-
401
- // Intersect with allowed files if we found anything
402
- if (found.size > 0) {
403
- return [...found].filter((f) => allowedFiles.some(
404
- (a) => f.endsWith(a) || a.endsWith(f) || f === a,
405
- ));
406
- }
407
-
408
- return allowedFiles;
409
- }
410
-
411
- // ── Status monitor ──────────────────────────────────────────
412
-
413
- /**
414
- * Get current swarm status snapshot.
415
- * @returns {SwarmStatus}
416
- */
417
- function getStatus() {
418
- const workerStatuses = [];
419
-
420
- for (const [name, w] of workers) {
421
- const snap = w.conductor.getSnapshot();
422
- workerStatuses.push({
423
- shard: name,
424
- agent: w.shardConfig.agent,
425
- sessions: snap,
426
- failed: failures.has(name),
427
- failureInfo: failures.get(name) || null,
428
- integrated: results.has(name),
429
- });
430
- }
431
-
432
- return Object.freeze({
433
- state,
434
- totalShards: plan?.shards.length || 0,
435
- completedShards: results.size,
436
- failedShards: failures.size,
437
- workers: workerStatuses,
438
- mergeOrder: plan?.mergeOrder || [],
439
- criticalShards: plan?.criticalShards || [],
440
- locks: lockManager?.snapshot() || [],
441
- });
442
- }
443
-
444
- // ── Public API ──────────────────────────────────────────────
445
-
446
- /**
447
- * Launch the swarm from a pre-built plan.
448
- * @param {SwarmPlan} swarmPlan — from planSwarm()
449
- * @returns {SwarmStatus}
450
- */
451
- function launch(swarmPlan) {
452
- if (state !== SWARM_STATES.PLANNING) {
453
- throw new Error(`Cannot launch in state "${state}"`);
454
- }
455
-
456
- plan = swarmPlan;
457
-
458
- // Warn about file conflicts but don't block
459
- if (plan.conflicts.length > 0) {
460
- eventLog.append('file_conflicts_warning', { conflicts: plan.conflicts });
461
- emitter.emit('warning', {
462
- type: 'file_conflicts',
463
- conflicts: plan.conflicts,
464
- });
465
- }
466
-
467
- // Initialize lock manager
468
- lockManager = createSwarmLocks({
469
- repoRoot: workdir,
470
- persistPath: join(workdir, '.triflux', 'swarm-locks.json'),
471
- });
472
-
473
- setState(SWARM_STATES.LAUNCHING, `${plan.shards.length} shards`);
474
-
475
- // Launch shards respecting dependency order
476
- const launched = new Set();
477
- const pending = new Set(plan.mergeOrder);
478
-
479
- function launchReady() {
480
- for (const name of pending) {
481
- const shard = plan.shards.find((s) => s.name === name);
482
- if (!shard) continue;
483
-
484
- // Check all dependencies are launched (not necessarily completed)
485
- const depsReady = shard.depends.every((d) => launched.has(d));
486
- if (!depsReady) continue;
487
-
488
- pending.delete(name);
489
- launched.add(name);
490
- launchShard(shard);
491
-
492
- // Launch redundant worker for critical shards
493
- if (shard.critical) {
494
- const redundantShard = {
495
- ...shard,
496
- agent: FALLBACK_AGENTS[shard.agent] || shard.agent,
497
- };
498
- launchShard(redundantShard, true);
499
- }
500
- }
501
- }
502
-
503
- launchReady();
504
-
505
- // Re-check pending on each shard completion (dependency chains)
506
- emitter.on('shardCompleted', () => {
507
- if (pending.size > 0) launchReady();
508
- });
509
-
510
- setState(SWARM_STATES.RUNNING, `${launched.size} launched, ${pending.size} pending deps`);
511
-
512
- return getStatus();
513
- }
514
-
515
- /**
516
- * Graceful shutdown — kill all workers and release locks.
517
- * @param {string} [reason]
518
- */
519
- async function shutdown(reason = 'shutdown') {
520
- eventLog.append('swarm_shutdown', { reason, state });
521
-
522
- const shutdowns = [];
523
- for (const [, w] of workers) {
524
- shutdowns.push(w.conductor.shutdown(reason));
525
- }
526
- for (const [, w] of redundantWorkers) {
527
- shutdowns.push(w.conductor.shutdown(reason));
528
- }
529
-
530
- await Promise.allSettled(shutdowns);
531
-
532
- lockManager?.releaseAll();
533
- await eventLog.flush();
534
- await eventLog.close();
535
-
536
- if (state !== SWARM_STATES.COMPLETED && state !== SWARM_STATES.FAILED) {
537
- setState(SWARM_STATES.FAILED, reason);
538
- }
539
-
540
- emitter.emit('shutdown', { reason });
541
- }
542
-
543
  // Frozen public surface. `state`, `plan` and `eventLogPath` are live
  // getters so callers always observe current values; `on`/`off` are bound
  // so they can be detached from the returned object safely.
  return Object.freeze({
    launch,
    shutdown,
    getStatus,
    validateResult,
    on: emitter.on.bind(emitter),
    off: emitter.off.bind(emitter),
    get state() { return state; },
    get plan() { return plan; },
    get eventLogPath() { return eventLog.filePath; },
  });
}