@covibes/zeroshot 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/CHANGELOG.md +167 -0
  2. package/LICENSE +21 -0
  3. package/README.md +364 -0
  4. package/cli/index.js +3990 -0
  5. package/cluster-templates/base-templates/debug-workflow.json +181 -0
  6. package/cluster-templates/base-templates/full-workflow.json +455 -0
  7. package/cluster-templates/base-templates/single-worker.json +48 -0
  8. package/cluster-templates/base-templates/worker-validator.json +131 -0
  9. package/cluster-templates/conductor-bootstrap.json +122 -0
  10. package/cluster-templates/conductor-junior-bootstrap.json +69 -0
  11. package/docker/zeroshot-cluster/Dockerfile +132 -0
  12. package/lib/completion.js +174 -0
  13. package/lib/id-detector.js +53 -0
  14. package/lib/settings.js +97 -0
  15. package/lib/stream-json-parser.js +236 -0
  16. package/package.json +121 -0
  17. package/src/agent/agent-config.js +121 -0
  18. package/src/agent/agent-context-builder.js +241 -0
  19. package/src/agent/agent-hook-executor.js +329 -0
  20. package/src/agent/agent-lifecycle.js +555 -0
  21. package/src/agent/agent-stuck-detector.js +256 -0
  22. package/src/agent/agent-task-executor.js +1034 -0
  23. package/src/agent/agent-trigger-evaluator.js +67 -0
  24. package/src/agent-wrapper.js +459 -0
  25. package/src/agents/git-pusher-agent.json +20 -0
  26. package/src/attach/attach-client.js +438 -0
  27. package/src/attach/attach-server.js +543 -0
  28. package/src/attach/index.js +35 -0
  29. package/src/attach/protocol.js +220 -0
  30. package/src/attach/ring-buffer.js +121 -0
  31. package/src/attach/socket-discovery.js +242 -0
  32. package/src/claude-task-runner.js +468 -0
  33. package/src/config-router.js +80 -0
  34. package/src/config-validator.js +598 -0
  35. package/src/github.js +103 -0
  36. package/src/isolation-manager.js +1042 -0
  37. package/src/ledger.js +429 -0
  38. package/src/logic-engine.js +223 -0
  39. package/src/message-bus-bridge.js +139 -0
  40. package/src/message-bus.js +202 -0
  41. package/src/name-generator.js +232 -0
  42. package/src/orchestrator.js +1938 -0
  43. package/src/schemas/sub-cluster.js +156 -0
  44. package/src/sub-cluster-wrapper.js +545 -0
  45. package/src/task-runner.js +28 -0
  46. package/src/template-resolver.js +347 -0
  47. package/src/tui/CHANGES.txt +133 -0
  48. package/src/tui/LAYOUT.md +261 -0
  49. package/src/tui/README.txt +192 -0
  50. package/src/tui/TWO-LEVEL-NAVIGATION.md +186 -0
  51. package/src/tui/data-poller.js +325 -0
  52. package/src/tui/demo.js +208 -0
  53. package/src/tui/formatters.js +123 -0
  54. package/src/tui/index.js +193 -0
  55. package/src/tui/keybindings.js +383 -0
  56. package/src/tui/layout.js +317 -0
  57. package/src/tui/renderer.js +194 -0
@@ -0,0 +1,1938 @@
1
+ /**
2
+ * Orchestrator - Manages cluster lifecycle
3
+ *
4
+ * Provides:
5
+ * - Cluster initialization and configuration
6
+ * - Agent lifecycle management
7
+ * - GitHub issue integration
8
+ * - Cluster state tracking
9
+ * - Crash recovery
10
+ */
11
+
12
+ const fs = require('fs');
13
+ const path = require('path');
14
+ const os = require('os');
15
+ const lockfile = require('proper-lockfile');
16
+ const AgentWrapper = require('./agent-wrapper');
17
+ const SubClusterWrapper = require('./sub-cluster-wrapper');
18
+ const MessageBus = require('./message-bus');
19
+ const Ledger = require('./ledger');
20
+ const GitHub = require('./github');
21
+ const IsolationManager = require('./isolation-manager');
22
+ const { generateName } = require('./name-generator');
23
+ const configValidator = require('./config-validator');
24
+ const TemplateResolver = require('./template-resolver');
25
+
26
+ /**
27
+ * Operation Chain Schema
28
+ * Conductor (or any agent) can publish CLUSTER_OPERATIONS to dynamically modify cluster
29
+ *
30
+ * Supported operations:
31
+ * - add_agents: Spawn new agents with given configs
32
+ * - remove_agents: Stop and remove agents by ID
33
+ * - update_agent: Modify existing agent config
34
+ * - publish: Publish a message to the bus
35
+ * - load_config: Load agents from a named cluster config template
36
+ */
37
+ const VALID_OPERATIONS = ['add_agents', 'remove_agents', 'update_agent', 'publish', 'load_config'];
38
+
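For illustration, an operation chain published by a conductor could look like the sketch below (the agent IDs, topics, update fields, and template parameters are hypothetical; the payload shape follows what the CLUSTER_OPERATIONS subscriber and the _op* handlers further down expect):

    messageBus.publish({
      cluster_id: clusterId,
      topic: 'CLUSTER_OPERATIONS',
      sender: 'conductor',
      content: {
        data: {
          reasoning: 'Validation failed twice; bring in a debugger agent and re-run',
          operations: [
            { action: 'add_agents', agents: [{ id: 'debugger', role: 'implementation', triggers: [{ topic: 'VALIDATION_RESULT' }] }] },
            { action: 'update_agent', agentId: 'worker', updates: { prompt: 'Focus on the failing test suite' } },
            { action: 'publish', topic: 'DEBUG_REQUESTED', content: { text: 'Investigate the failing tests' } },
            { action: 'load_config', config: { base: 'worker-validator', params: { workers: 2 } } },
          ],
        },
      },
    });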
39
+ /**
40
+ * Workflow-triggering topics that indicate cluster state progression
41
+ * These are the topics that MATTER for resume - not AGENT_OUTPUT noise
42
+ */
43
+ const WORKFLOW_TRIGGERS = Object.freeze([
44
+ 'ISSUE_OPENED',
45
+ 'PLAN_READY',
46
+ 'IMPLEMENTATION_READY',
47
+ 'VALIDATION_RESULT',
48
+ 'CONDUCTOR_ESCALATE',
49
+ ]);
50
+
51
+ class Orchestrator {
52
+ constructor(options = {}) {
53
+ this.clusters = new Map(); // cluster_id -> cluster object
54
+ this.quiet = options.quiet || false; // Suppress verbose logging
55
+
56
+ // TaskRunner DI - allows injecting MockTaskRunner for testing
57
+ // When set, passed to all AgentWrappers to control task execution
58
+ this.taskRunner = options.taskRunner || null;
59
+
60
+ // Set up persistent storage directory (can be overridden for testing)
61
+ this.storageDir = options.storageDir || path.join(os.homedir(), '.zeroshot');
62
+ if (!fs.existsSync(this.storageDir)) {
63
+ fs.mkdirSync(this.storageDir, { recursive: true });
64
+ }
65
+
66
+ // Load existing clusters from disk (skip if explicitly disabled)
67
+ if (options.skipLoad !== true) {
68
+ this._loadClusters();
69
+ }
70
+ }
71
+
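A minimal construction sketch, e.g. for tests (the option names are the ones handled above; MockTaskRunner stands in for whatever test double is injected via the taskRunner hook):

    const Orchestrator = require('./orchestrator');

    const orchestrator = new Orchestrator({
      storageDir: '/tmp/zeroshot-test', // keep test state out of ~/.zeroshot
      skipLoad: true,                   // don't load existing clusters from disk
      quiet: true,                      // suppress verbose logging
      taskRunner: new MockTaskRunner(), // DI: agents delegate task execution to this double
    });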
72
+ /**
73
+ * Log message (respects quiet mode)
74
+ * @private
75
+ */
76
+ _log(...args) {
77
+ if (!this.quiet) {
78
+ console.log(...args);
79
+ }
80
+ }
81
+
82
+ /**
83
+ * Load clusters from persistent storage
84
+ * Uses file locking for consistent reads
85
+ * @private
86
+ */
87
+ _loadClusters() {
88
+ const clustersFile = path.join(this.storageDir, 'clusters.json');
89
+ this._log(`[Orchestrator] Loading clusters from: ${clustersFile}`);
90
+
91
+ if (!fs.existsSync(clustersFile)) {
92
+ this._log(`[Orchestrator] No clusters file found at ${clustersFile}`);
93
+ return;
94
+ }
95
+
96
+ const lockfilePath = path.join(this.storageDir, 'clusters.json.lock');
97
+ let release;
98
+
99
+ try {
100
+ // Acquire lock (sync API doesn't support retries, so we retry manually)
101
+ const maxAttempts = 20;
102
+ const retryDelayMs = 100;
103
+
104
+ for (let attempt = 0; attempt < maxAttempts; attempt++) {
105
+ try {
106
+ release = lockfile.lockSync(clustersFile, {
107
+ lockfilePath,
108
+ stale: 30000,
109
+ });
110
+ break; // Lock acquired
111
+ } catch (lockErr) {
112
+ if (lockErr.code === 'ELOCKED' && attempt < maxAttempts - 1) {
113
+ // Wait and retry
114
+ const waitMs = retryDelayMs + Math.random() * retryDelayMs;
115
+ const start = Date.now();
116
+ while (Date.now() - start < waitMs) {
117
+ /* spin wait */
118
+ }
119
+ continue;
120
+ }
121
+ throw lockErr;
122
+ }
123
+ }
124
+
125
+ const data = JSON.parse(fs.readFileSync(clustersFile, 'utf8'));
126
+ const clusterIds = Object.keys(data);
127
+ this._log(`[Orchestrator] Found ${clusterIds.length} clusters in file:`, clusterIds);
128
+
129
+ for (const [clusterId, clusterData] of Object.entries(data)) {
130
+ this._log(`[Orchestrator] Loading cluster: ${clusterId}`);
131
+ this._loadSingleCluster(clusterId, clusterData);
132
+ }
133
+
134
+ this._log(`[Orchestrator] Total clusters loaded: ${this.clusters.size}`);
135
+ } catch (error) {
136
+ console.error('[Orchestrator] Failed to load clusters:', error.message);
137
+ console.error(error.stack);
138
+ } finally {
139
+ if (release) {
140
+ release();
141
+ }
142
+ }
143
+ }
144
+
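The manual retry loop with a spin wait exists because lockSync cannot retry on its own. For reference, proper-lockfile's promise-based API can express the same policy directly (a sketch using the same stale and lockfilePath options; not how this module is written, since these code paths are synchronous):

    const release = await lockfile.lock(clustersFile, {
      lockfilePath,
      stale: 30000,
      retries: { retries: 20, minTimeout: 100, maxTimeout: 300 },
    });
    try {
      const data = JSON.parse(fs.readFileSync(clustersFile, 'utf8'));
      // ... use data ...
    } finally {
      await release();
    }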
145
+ /**
146
+ * Load a single cluster from data
147
+ * @private
148
+ */
149
+ _loadSingleCluster(clusterId, clusterData) {
150
+ // Skip if already loaded
151
+ if (this.clusters.has(clusterId)) {
152
+ return this.clusters.get(clusterId);
153
+ }
154
+
155
+ // Restore ledger and message bus
156
+ const dbPath = path.join(this.storageDir, `${clusterId}.db`);
157
+ const ledger = new Ledger(dbPath);
158
+ const messageBus = new MessageBus(ledger);
159
+
160
+ // Restore isolation manager FIRST if cluster was running in isolation mode
161
+ let isolation = clusterData.isolation || null;
162
+ let isolationManager = null;
163
+ if (isolation?.enabled && isolation.containerId) {
164
+ isolationManager = new IsolationManager({ image: isolation.image });
165
+ // Restore the container mapping so cleanup works
166
+ isolationManager.containers.set(clusterId, isolation.containerId);
167
+ isolation = {
168
+ ...isolation,
169
+ manager: isolationManager,
170
+ };
171
+ this._log(
172
+ `[Orchestrator] Restored isolation manager for ${clusterId} (container: ${isolation.containerId})`
173
+ );
174
+ }
175
+
176
+ // Reconstruct agent metadata from config (processes are ephemeral)
177
+ // CRITICAL: Pass isolation context to agents if cluster was running in isolation
178
+ const agents = [];
179
+ if (clusterData.config?.agents) {
180
+ for (const agentConfig of clusterData.config.agents) {
181
+ const agentOptions = {
182
+ id: clusterId,
183
+ quiet: this.quiet,
184
+ };
185
+
186
+ // Inject isolation context if enabled (MUST be done during agent creation)
187
+ if (isolation?.enabled && isolationManager) {
188
+ agentOptions.isolation = {
189
+ enabled: true,
190
+ manager: isolationManager,
191
+ clusterId,
192
+ };
193
+ }
194
+
195
+ // Create agent or subcluster wrapper based on type
196
+ let agent;
197
+ if (agentConfig.type === 'subcluster') {
198
+ agent = new SubClusterWrapper(agentConfig, messageBus, { id: clusterId }, agentOptions);
199
+ } else {
200
+ agent = new AgentWrapper(agentConfig, messageBus, { id: clusterId }, agentOptions);
201
+ }
202
+
203
+ agents.push(agent);
204
+ }
205
+ }
206
+
207
+ const cluster = {
208
+ ...clusterData,
209
+ ledger,
210
+ messageBus,
211
+ agents,
212
+ isolation,
213
+ };
214
+
215
+ this.clusters.set(clusterId, cluster);
216
+ this._log(`[Orchestrator] Loaded cluster: ${clusterId} with ${agents.length} agents`);
217
+
218
+ return cluster;
219
+ }
220
+
221
+ /**
222
+ * Ensure clusters file exists (required for file locking)
223
+ * @private
224
+ */
225
+ _ensureClustersFile() {
226
+ const clustersFile = path.join(this.storageDir, 'clusters.json');
227
+ if (!fs.existsSync(clustersFile)) {
228
+ fs.writeFileSync(clustersFile, '{}');
229
+ }
230
+ return clustersFile;
231
+ }
232
+
233
+ /**
234
+ * Save clusters to persistent storage
235
+ * Uses file locking to prevent race conditions with other processes
236
+ * @private
237
+ */
238
+ _saveClusters() {
239
+ const clustersFile = this._ensureClustersFile();
240
+ const lockfilePath = path.join(this.storageDir, 'clusters.json.lock');
241
+ let release;
242
+
243
+ try {
244
+ // Acquire exclusive lock (sync API doesn't support retries, so we retry manually)
245
+ const maxAttempts = 50;
246
+ const retryDelayMs = 100;
247
+
248
+ for (let attempt = 0; attempt < maxAttempts; attempt++) {
249
+ try {
250
+ release = lockfile.lockSync(clustersFile, {
251
+ lockfilePath,
252
+ stale: 30000, // Lock expires after 30s (in case process dies)
253
+ });
254
+ break; // Lock acquired
255
+ } catch (lockErr) {
256
+ if (lockErr.code === 'ELOCKED' && attempt < maxAttempts - 1) {
257
+ // Wait and retry with jitter
258
+ const waitMs = retryDelayMs + Math.random() * retryDelayMs * 2;
259
+ const start = Date.now();
260
+ while (Date.now() - start < waitMs) {
261
+ /* spin wait */
262
+ }
263
+ continue;
264
+ }
265
+ throw lockErr;
266
+ }
267
+ }
268
+
269
+ // Read existing clusters from file (other processes may have added clusters)
270
+ let existingClusters = {};
271
+ try {
272
+ const content = fs.readFileSync(clustersFile, 'utf8');
273
+ existingClusters = JSON.parse(content);
274
+ } catch (error) {
275
+ console.error('[Orchestrator] Failed to read existing clusters:', error.message);
276
+ }
277
+
278
+ // Merge: update/add clusters from this process
279
+ for (const [clusterId, cluster] of this.clusters.entries()) {
280
+ // CRITICAL: Only update clusters this process actually owns or has modified
281
+ // A process owns a cluster if: it started it (pid matches) OR it explicitly stopped/killed it
282
+ const isOwnedByThisProcess = cluster.pid === process.pid;
283
+ const wasModifiedByThisProcess = cluster.state === 'stopped' || cluster.state === 'killed';
284
+
285
+ // Skip clusters we don't own and haven't modified - prevents race condition
286
+ // where a running cluster overwrites another process's stop/kill operation
287
+ if (!isOwnedByThisProcess && !wasModifiedByThisProcess) {
288
+ // Preserve existing state from file for clusters we don't own
289
+ continue;
290
+ }
291
+
292
+ existingClusters[clusterId] = {
293
+ id: cluster.id,
294
+ config: cluster.config,
295
+ state: cluster.state,
296
+ createdAt: cluster.createdAt,
297
+ // Track PID for zombie detection (null if cluster is stopped/killed)
298
+ pid: cluster.state === 'running' ? cluster.pid : null,
299
+ // Persist failure info for resume capability
300
+ failureInfo: cluster.failureInfo || null,
301
+ // Persist isolation info (excluding manager instance which can't be serialized)
302
+ isolation: cluster.isolation
303
+ ? {
304
+ enabled: cluster.isolation.enabled,
305
+ containerId: cluster.isolation.containerId,
306
+ image: cluster.isolation.image,
307
+ }
308
+ : null,
309
+ };
310
+ }
311
+
312
+ // Write merged data
313
+ fs.writeFileSync(clustersFile, JSON.stringify(existingClusters, null, 2));
314
+ this._log(
315
+ `[Orchestrator] Saved ${this.clusters.size} cluster(s), file now has ${Object.keys(existingClusters).length} total`
316
+ );
317
+ } finally {
318
+ // Always release lock
319
+ if (release) {
320
+ release();
321
+ }
322
+ }
323
+ }
324
+
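Given the fields written above, one persisted entry in clusters.json looks roughly like this (illustrative values, including the generated cluster name; config trimmed):

    {
      "cluster-brave-otter": {
        "id": "cluster-brave-otter",
        "config": { "agents": [] },
        "state": "running",
        "createdAt": 1719923000000,
        "pid": 48213,
        "failureInfo": null,
        "isolation": { "enabled": true, "containerId": "a1b2c3d4e5f6", "image": "zeroshot-cluster-base" }
      }
    }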
325
+ /**
326
+ * Watch for new clusters and call callback when found
327
+ * Polls the clusters file for changes with file locking
328
+ * @param {Function} onNewCluster - Callback(cluster) for each new cluster
329
+ * @param {Number} intervalMs - Poll interval in ms (default: 2000)
330
+ * @returns {Function} Stop function to cancel watching
331
+ */
332
+ watchForNewClusters(onNewCluster, intervalMs = 2000) {
333
+ const clustersFile = path.join(this.storageDir, 'clusters.json');
334
+ const lockfilePath = path.join(this.storageDir, 'clusters.json.lock');
335
+ const knownClusterIds = new Set(this.clusters.keys());
336
+
337
+ const intervalId = setInterval(() => {
338
+ let release;
339
+ try {
340
+ if (!fs.existsSync(clustersFile)) return;
341
+
342
+ // Try to acquire lock once (polling is best-effort, will retry on next cycle)
343
+ try {
344
+ release = lockfile.lockSync(clustersFile, {
345
+ lockfilePath,
346
+ stale: 30000,
347
+ });
348
+ } catch (lockErr) {
349
+ // Lock busy - skip this poll cycle, try again next interval
350
+ if (lockErr.code === 'ELOCKED') return;
351
+ throw lockErr;
352
+ }
353
+
354
+ const data = JSON.parse(fs.readFileSync(clustersFile, 'utf8'));
355
+
356
+ for (const [clusterId, clusterData] of Object.entries(data)) {
357
+ if (!knownClusterIds.has(clusterId)) {
358
+ // New cluster found
359
+ knownClusterIds.add(clusterId);
360
+ const cluster = this._loadSingleCluster(clusterId, clusterData);
361
+ if (cluster && onNewCluster) {
362
+ onNewCluster(cluster);
363
+ }
364
+ }
365
+ }
366
+ } catch (error) {
367
+ // File access during polling can fail transiently - log and continue
368
+ console.error(`[Orchestrator] watchForNewClusters error (will retry): ${error.message}`);
369
+ } finally {
370
+ if (release) {
371
+ release();
372
+ }
373
+ }
374
+ }, intervalMs);
375
+
376
+ // Return stop function
377
+ return () => clearInterval(intervalId);
378
+ }
379
+
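Usage sketch, assuming an orchestrator instance as above (the callback receives the loaded cluster object; the returned function cancels the polling interval):

    const stopWatching = orchestrator.watchForNewClusters((cluster) => {
      console.log(`New cluster detected: ${cluster.id} (${cluster.agents.length} agents)`);
    }, 5000);

    // ... later, e.g. on shutdown
    stopWatching();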
380
+ /**
381
+ * Start a new cluster with mocked agent executors (TESTING ONLY)
382
+ *
383
+ * CRITICAL: This method PREVENTS real Claude API calls.
384
+ * All agent behaviors must be defined in mockExecutor.
385
+ *
386
+ * @param {Object} config - Cluster configuration
387
+ * @param {Object} input - Input source { issue or text }
388
+ * @param {MockAgentExecutor} mockExecutor - Mock executor with agent behaviors
389
+ * @returns {Promise<Object>} Cluster object
390
+ */
391
+ startWithMock(config, input, mockExecutor) {
392
+ if (!mockExecutor) {
393
+ throw new Error('Orchestrator.startWithMock: mockExecutor is required');
394
+ }
395
+
396
+ // Validate all agents that execute tasks have mock behaviors defined
397
+ // Orchestrator agents (action: 'stop_cluster') don't execute tasks, so don't need mocks
398
+ for (const agentConfig of config.agents) {
399
+ const agentId = agentConfig.id;
400
+
401
+ // Check if agent has any triggers that execute tasks
402
+ const executesTask = agentConfig.triggers?.some(
403
+ (trigger) => !trigger.action || trigger.action === 'execute_task'
404
+ );
405
+
406
+ if (executesTask && !mockExecutor.behaviors[agentId]) {
407
+ throw new Error(
408
+ `Orchestrator.startWithMock: No behavior defined for agent '${agentId}'. ` +
409
+ `This would cause real Claude API calls. ABORTING.\n` +
410
+ `Available behaviors: ${Object.keys(mockExecutor.behaviors).join(', ')}`
411
+ );
412
+ }
413
+ }
414
+
415
+ return this._startInternal(config, input, {
416
+ mockExecutor,
417
+ testMode: true,
418
+ });
419
+ }
420
+
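Test usage sketch (config is a cluster config with an agents array; mockExecutor is whatever test double exposes a behaviors map keyed by agent ID plus a createMockSpawnFn(agentId) method, as the checks above require; the text input avoids any GitHub call):

    const cluster = await orchestrator.startWithMock(
      config,
      { text: 'Add a /health endpoint to the API' },
      mockExecutor
    );
    console.log(cluster.id, cluster.state); // 'running' once all agents have started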
421
+ /**
422
+ * Start a new cluster
423
+ * @param {Object} config - Cluster configuration
424
+ * @param {Object} input - Input source { issue or text }
425
+ * @param {Object} options - Start options
426
+ * @param {boolean} options.isolation - Run in Docker container
427
+ * @param {string} options.isolationImage - Docker image to use
428
+ * @returns {Promise<Object>} Cluster object
429
+ */
430
+ start(config, input = {}, options = {}) {
431
+ return this._startInternal(config, input, {
432
+ testMode: false,
433
+ cwd: options.cwd || process.cwd(), // Target working directory for agents
434
+ isolation: options.isolation || false,
435
+ isolationImage: options.isolationImage,
436
+ autoPr: process.env.CREW_PR === '1',
437
+ });
438
+ }
439
+
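Typical usage sketch (the config would normally come from a cluster template; whether input.issue takes a number or a full URL depends on GitHub.fetchIssue, so the value below is illustrative):

    const fs = require('fs');

    const config = JSON.parse(
      fs.readFileSync('cluster-templates/base-templates/single-worker.json', 'utf8')
    );

    const cluster = await orchestrator.start(
      config,
      { issue: 42 }, // or { text: 'free-form task description' }
      { cwd: '/path/to/repo', isolation: true, isolationImage: 'zeroshot-cluster-base' }
    );
    console.log(`Started ${cluster.id} with ${cluster.agents.length} agent(s)`);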
440
+ /**
441
+ * Internal start implementation (shared by start and startWithMock)
442
+ * @private
443
+ */
444
+ async _startInternal(config, input = {}, options = {}) {
445
+ // Use pre-generated ID from parent process, or generate new one
446
+ const clusterId = process.env.CREW_CLUSTER_ID || generateName('cluster');
447
+
448
+ // Create ledger and message bus with persistent storage
449
+ const dbPath = config.dbPath || path.join(this.storageDir, `${clusterId}.db`);
450
+ const ledger = new Ledger(dbPath);
451
+ const messageBus = new MessageBus(ledger);
452
+
453
+ // Handle isolation mode (Docker container)
454
+ let isolationManager = null;
455
+ let containerId = null;
456
+
457
+ if (options.isolation) {
458
+ // Check Docker availability
459
+ if (!IsolationManager.isDockerAvailable()) {
460
+ throw new Error('Docker is not available. Install Docker to use --isolation mode.');
461
+ }
462
+
463
+ // Ensure image exists (auto-build if missing)
464
+ const image = options.isolationImage || 'zeroshot-cluster-base';
465
+ await IsolationManager.ensureImage(image);
466
+
467
+ isolationManager = new IsolationManager({ image });
468
+ this._log(`[Orchestrator] Starting cluster in isolation mode (image: ${image})`);
469
+
470
+ // Create container with workspace mounted
471
+ // CRITICAL: Use options.cwd (git repo root) instead of process.cwd()
472
+ const workDir = options.cwd || process.cwd();
473
+ containerId = await isolationManager.createContainer(clusterId, {
474
+ workDir,
475
+ image,
476
+ });
477
+ this._log(`[Orchestrator] Container created: ${containerId} (workDir: ${workDir})`);
478
+ }
479
+
480
+ // Build cluster object
481
+ const cluster = {
482
+ id: clusterId,
483
+ config,
484
+ state: 'initializing',
485
+ messageBus,
486
+ ledger,
487
+ agents: [],
488
+ createdAt: Date.now(),
489
+ // Track PID for zombie detection (this process owns the cluster)
490
+ pid: process.pid,
491
+ // Isolation state (only if enabled)
492
+ isolation: options.isolation
493
+ ? {
494
+ enabled: true,
495
+ containerId,
496
+ image: options.isolationImage || 'zeroshot-cluster-base',
497
+ manager: isolationManager,
498
+ }
499
+ : null,
500
+ };
501
+
502
+ this.clusters.set(clusterId, cluster);
503
+
504
+ try {
505
+ // Fetch input (GitHub issue or text)
506
+ let inputData;
507
+ if (input.issue) {
508
+ inputData = await GitHub.fetchIssue(input.issue);
509
+ // Log clickable issue link
510
+ if (inputData.url) {
511
+ this._log(`[Orchestrator] Issue: ${inputData.url}`);
512
+ }
513
+ } else if (input.text) {
514
+ inputData = GitHub.createTextInput(input.text);
515
+ } else {
516
+ throw new Error('Either issue or text input is required');
517
+ }
518
+
519
+ // Inject git-pusher agent if --pr is set (replaces completion-detector)
520
+ if (options.autoPr) {
521
+ // Remove completion-detector by ID (git-pusher handles completion + PR)
522
+ config.agents = config.agents.filter((a) => a.id !== 'completion-detector');
523
+
524
+ // Load and configure git-pusher agent (use fs.readFileSync to avoid require cache)
525
+ const gitPusherPath = path.join(__dirname, 'agents', 'git-pusher-agent.json');
526
+ const gitPusherConfig = JSON.parse(fs.readFileSync(gitPusherPath, 'utf8'));
527
+
528
+ // Inject issue context placeholders
529
+ gitPusherConfig.prompt = gitPusherConfig.prompt.replace(
530
+ /\{\{issue_number\}\}/g,
531
+ inputData.number || 'unknown'
532
+ );
533
+ gitPusherConfig.prompt = gitPusherConfig.prompt.replace(
534
+ /\{\{issue_title\}\}/g,
535
+ inputData.title || 'Implementation'
536
+ );
537
+
538
+ config.agents.push(gitPusherConfig);
539
+ this._log(`[Orchestrator] Injected git-pusher agent (creates PR and auto-merges)`);
540
+ }
541
+
542
+ // Inject workers instruction if --workers explicitly provided and > 1
543
+ const workersCount = process.env.CREW_WORKERS ? parseInt(process.env.CREW_WORKERS, 10) : 0;
544
+ if (workersCount > 1) {
545
+ const workerAgent = config.agents.find((a) => a.id === 'worker');
546
+ if (workerAgent) {
547
+ const instruction = `PARALLELIZATION: Use up to ${workersCount} sub-agents to parallelize your work where appropriate.\n\n`;
548
+
549
+ if (!workerAgent.prompt) {
550
+ workerAgent.prompt = instruction;
551
+ } else if (typeof workerAgent.prompt === 'string') {
552
+ workerAgent.prompt = instruction + workerAgent.prompt;
553
+ } else if (workerAgent.prompt.system) {
554
+ workerAgent.prompt.system = instruction + workerAgent.prompt.system;
555
+ }
556
+ this._log(
557
+ `[Orchestrator] Injected parallelization instruction (workers=${workersCount})`
558
+ );
559
+ }
560
+ }
561
+
562
+ // Initialize agents with optional mock injection
563
+ // Check agent type: regular agent or subcluster
564
+ // CRITICAL: Inject cwd into each agent config for proper working directory
565
+ const agentCwd = options.cwd || process.cwd();
566
+ for (const agentConfig of config.agents) {
567
+ // Inject cwd if not already set (config may override)
568
+ if (!agentConfig.cwd) {
569
+ agentConfig.cwd = agentCwd;
570
+ }
571
+
572
+ const agentOptions = {
573
+ testMode: options.testMode || !!this.taskRunner, // Enable testMode if taskRunner provided
574
+ quiet: this.quiet,
575
+ };
576
+
577
+ // Inject mock spawn function if provided (legacy mockExecutor API)
578
+ if (options.mockExecutor) {
579
+ agentOptions.mockSpawnFn = options.mockExecutor.createMockSpawnFn(agentConfig.id);
580
+ }
581
+
582
+ // TaskRunner DI - new pattern for mocking task execution
583
+ // Creates a mockSpawnFn wrapper that delegates to the taskRunner
584
+ if (this.taskRunner) {
585
+ agentOptions.mockSpawnFn = (args, { context }) => {
586
+ return this.taskRunner.run(context, {
587
+ agentId: agentConfig.id,
588
+ });
589
+ };
590
+ }
591
+
592
+ // Pass isolation context if enabled
593
+ if (cluster.isolation) {
594
+ agentOptions.isolation = {
595
+ enabled: true,
596
+ manager: isolationManager,
597
+ clusterId,
598
+ };
599
+ }
600
+
601
+ // Create agent or subcluster wrapper based on type
602
+ let agent;
603
+ if (agentConfig.type === 'subcluster') {
604
+ agent = new SubClusterWrapper(agentConfig, messageBus, cluster, agentOptions);
605
+ } else {
606
+ agent = new AgentWrapper(agentConfig, messageBus, cluster, agentOptions);
607
+ }
608
+
609
+ cluster.agents.push(agent);
610
+ }
611
+
612
+ // Start all agents
613
+ for (const agent of cluster.agents) {
614
+ await agent.start();
615
+ }
616
+
617
+ cluster.state = 'running';
618
+
619
+ // Publish ISSUE_OPENED message to bootstrap workflow
620
+ messageBus.publish({
621
+ cluster_id: clusterId,
622
+ topic: 'ISSUE_OPENED',
623
+ sender: 'system',
624
+ receiver: 'broadcast',
625
+ content: {
626
+ text: inputData.context,
627
+ data: {
628
+ issue_number: inputData.number,
629
+ title: inputData.title,
630
+ },
631
+ },
632
+ metadata: {
633
+ source: input.issue ? 'github' : 'text',
634
+ },
635
+ });
636
+
637
+ this._log(`Cluster ${clusterId} started with ${cluster.agents.length} agents`);
638
+
639
+ // Watch for CLUSTER_COMPLETE message to auto-stop
640
+ messageBus.subscribe((message) => {
641
+ if (message.topic === 'CLUSTER_COMPLETE' && message.cluster_id === clusterId) {
642
+ this._log(`\n${'='.repeat(80)}`);
643
+ this._log(`✅ CLUSTER COMPLETED SUCCESSFULLY: ${clusterId}`);
644
+ this._log(`${'='.repeat(80)}`);
645
+ this._log(`Reason: ${message.content?.data?.reason || 'unknown'}`);
646
+ this._log(`Initiated by: ${message.sender}`);
647
+ this._log(`${'='.repeat(80)}\n`);
648
+
649
+ // Auto-stop cluster
650
+ this.stop(clusterId).catch((err) => {
651
+ console.error(`Failed to auto-stop cluster ${clusterId}:`, err.message);
652
+ });
653
+ }
654
+ });
655
+
656
+ // Watch for CLUSTER_FAILED message to auto-stop (e.g., max iterations reached)
657
+ messageBus.subscribe((message) => {
658
+ if (message.topic === 'CLUSTER_FAILED' && message.cluster_id === clusterId) {
659
+ this._log(`\n${'='.repeat(80)}`);
660
+ this._log(`❌ CLUSTER FAILED: ${clusterId}`);
661
+ this._log(`${'='.repeat(80)}`);
662
+ this._log(`Reason: ${message.content?.data?.reason || 'unknown'}`);
663
+ this._log(`Agent: ${message.sender}`);
664
+ if (message.content?.text) {
665
+ this._log(`Details: ${message.content.text}`);
666
+ }
667
+ this._log(`${'='.repeat(80)}\n`);
668
+
669
+ // Auto-stop cluster
670
+ this.stop(clusterId).catch((err) => {
671
+ console.error(`Failed to auto-stop cluster ${clusterId}:`, err.message);
672
+ });
673
+ }
674
+ });
675
+
676
+ // Watch for AGENT_ERROR - if critical agent (worker/implementation) fails, stop cluster
677
+ // Validators auto-approve after retries (see agent-wrapper retry logic)
678
+ messageBus.subscribe((message) => {
679
+ if (message.topic === 'AGENT_ERROR' && message.cluster_id === clusterId) {
680
+ const agentRole = message.content?.data?.role;
681
+ const attempts = message.content?.data?.attempts || 1;
682
+
683
+ // Save cluster state to persist failureInfo (set by agent-wrapper on failure)
684
+ // This ensures resume capability even if cluster doesn't stop
685
+ this._saveClusters();
686
+
687
+ // Only stop cluster if non-validator agent exhausted retries
688
+ if (agentRole === 'implementation' && attempts >= 3) {
689
+ this._log(`\n${'='.repeat(80)}`);
690
+ this._log(`❌ WORKER AGENT FAILED: ${clusterId}`);
691
+ this._log(`${'='.repeat(80)}`);
692
+ this._log(`Worker agent ${message.sender} failed after ${attempts} attempts`);
693
+ this._log(`Error: ${message.content?.data?.error || 'unknown'}`);
694
+ this._log(`Stopping cluster - worker cannot continue`);
695
+ this._log(`${'='.repeat(80)}\n`);
696
+
697
+ // Auto-stop cluster
698
+ this.stop(clusterId).catch((err) => {
699
+ console.error(`Failed to auto-stop cluster ${clusterId}:`, err.message);
700
+ });
701
+ }
702
+ }
703
+ });
704
+
705
+ // Watch for stale agent detection (informational only - stale agents are reported, never killed)
707
707
+ messageBus.on('topic:AGENT_LIFECYCLE', (message) => {
708
+ if (message.content?.data?.event !== 'AGENT_STALE_WARNING') return;
709
+
710
+ const agentId = message.content?.data?.agent;
711
+ const timeSinceLastOutput = message.content?.data?.timeSinceLastOutput;
712
+ const analysis = message.content?.data?.analysis || 'No analysis available';
713
+
714
+ this._log(
715
+ `⚠️ Orchestrator: Agent ${agentId} appears stale (${Math.round(timeSinceLastOutput / 1000)}s no output) but will NOT be killed`
716
+ );
717
+ this._log(` Analysis: ${analysis}`);
718
+ this._log(
719
+ ` Manual intervention may be needed - use 'zeroshot resume ${clusterId}' if stuck`
720
+ );
721
+ });
722
+
723
+ // Watch for CLUSTER_OPERATIONS - dynamic agent spawn/removal/update
724
+ // Conductor (or any agent) can publish operation chains to modify the cluster
725
+ messageBus.subscribe((message) => {
726
+ if (message.topic === 'CLUSTER_OPERATIONS' && message.cluster_id === clusterId) {
727
+ let operations = message.content?.data?.operations;
728
+
729
+ // Parse operations if they came as a JSON string (template variable serialization)
730
+ if (typeof operations === 'string') {
731
+ try {
732
+ operations = JSON.parse(operations);
733
+ } catch (e) {
734
+ this._log(`⚠️ CLUSTER_OPERATIONS has invalid operations JSON: ${e.message}`);
735
+ return;
736
+ }
737
+ }
738
+
739
+ if (!operations || !Array.isArray(operations)) {
740
+ this._log(`⚠️ CLUSTER_OPERATIONS missing operations array, ignoring`);
741
+ return;
742
+ }
743
+
744
+ this._log(`\n${'='.repeat(80)}`);
745
+ this._log(`🔧 CLUSTER_OPERATIONS received from ${message.sender}`);
746
+ this._log(`${'='.repeat(80)}`);
747
+ if (message.content?.data?.reasoning) {
748
+ this._log(`Reasoning: ${message.content.data.reasoning}`);
749
+ }
750
+ this._log(`Operations: ${operations.length}`);
751
+ this._log(`${'='.repeat(80)}\n`);
752
+
753
+ // Execute operation chain
754
+ this._handleOperations(clusterId, operations, message.sender, {
755
+ isolationManager,
756
+ containerId,
757
+ }).catch((err) => {
758
+ console.error(`Failed to execute CLUSTER_OPERATIONS:`, err.message);
759
+ // Publish failure for conductor to retry
760
+ messageBus.publish({
761
+ cluster_id: clusterId,
762
+ topic: 'CLUSTER_OPERATIONS_FAILED',
763
+ sender: 'orchestrator',
764
+ content: {
765
+ text: `Operation chain failed: ${err.message}`,
766
+ data: {
767
+ error: err.message,
768
+ operations: operations,
769
+ },
770
+ },
771
+ });
772
+ });
773
+ }
774
+ });
775
+
776
+ // DISABLED: Idle timeout auto-stop mechanism
777
+ // WHY DISABLED: Clusters should only stop on explicit signals:
778
+ // - User `zeroshot kill` command
779
+ // - CLUSTER_COMPLETE message (successful completion)
780
+ // - CLUSTER_FAILED message (failure/abort)
781
+ // Being "idle" is NOT a reason to auto-stop - agents may be legitimately
782
+ // waiting for external events, user input (in interactive mode), or
783
+ // processing that doesn't show as "executing" (e.g., polling, monitoring).
784
+ //
785
+ // Previous behavior: Stopped cluster after 2 minutes of all agents idle
786
+ // Result: Clusters were killed while legitimately waiting, causing confusion
787
+ //
788
+ // cluster.idleCheckInterval = setInterval(() => { ... }, 30000);
789
+ // ^^^^^^ REMOVED - clusters run until explicitly stopped or completed
790
+
791
+ // Save cluster to disk
792
+ this._saveClusters();
793
+
794
+ return {
795
+ id: clusterId,
796
+ state: cluster.state,
797
+ agents: cluster.agents.map((a) => a.getState()),
798
+ ledger: cluster.ledger, // Expose ledger for testing
799
+ messageBus: cluster.messageBus, // Expose messageBus for testing
800
+ };
801
+ } catch (error) {
802
+ cluster.state = 'failed';
803
+ console.error(`Cluster ${clusterId} failed to start:`, error);
804
+ throw error;
805
+ }
806
+ }
807
+
808
+ /**
809
+ * Stop a cluster
810
+ * @param {String} clusterId - Cluster ID
811
+ */
812
+ async stop(clusterId) {
813
+ const cluster = this.clusters.get(clusterId);
814
+ if (!cluster) {
815
+ throw new Error(`Cluster ${clusterId} not found`);
816
+ }
817
+
818
+ cluster.state = 'stopping';
819
+
820
+ // Stop all agents (including subclusters which handle their own children)
821
+ for (const agent of cluster.agents) {
822
+ await agent.stop();
823
+ }
824
+
825
+ // Clean up isolation container if enabled
826
+ if (cluster.isolation?.manager) {
827
+ this._log(`[Orchestrator] Cleaning up isolation container for ${clusterId}...`);
828
+ await cluster.isolation.manager.cleanup(clusterId);
829
+ this._log(`[Orchestrator] Container removed`);
830
+ }
831
+
832
+ cluster.state = 'stopped';
833
+ cluster.pid = null; // Clear PID - cluster is no longer running
834
+ this._log(`Cluster ${clusterId} stopped`);
835
+
836
+ // Save updated state
837
+ this._saveClusters();
838
+ }
839
+
840
+ /**
841
+ * Kill a cluster (force stop)
842
+ * @param {String} clusterId - Cluster ID
843
+ */
844
+ async kill(clusterId) {
845
+ const cluster = this.clusters.get(clusterId);
846
+ if (!cluster) {
847
+ throw new Error(`Cluster ${clusterId} not found`);
848
+ }
849
+
850
+ cluster.state = 'stopping';
851
+
852
+ // Force stop all agents
853
+ for (const agent of cluster.agents) {
854
+ await agent.stop();
855
+ }
856
+
857
+ // Force remove isolation container if enabled
858
+ if (cluster.isolation?.manager) {
859
+ this._log(`[Orchestrator] Force removing isolation container for ${clusterId}...`);
860
+ await cluster.isolation.manager.removeContainer(clusterId, true); // force=true
861
+ this._log(`[Orchestrator] Container removed`);
862
+ }
863
+
864
+ // Close message bus and ledger
865
+ cluster.messageBus.close();
866
+
867
+ cluster.state = 'killed';
868
+ cluster.pid = null; // Clear PID - cluster is no longer running
869
+ // DON'T delete from memory - keep it so it gets saved with 'killed' state
870
+ // this.clusters.delete(clusterId);
871
+
872
+ this._log(`Cluster ${clusterId} killed`);
873
+
874
+ // Save updated state (will be marked as 'killed' in file)
875
+ this._saveClusters();
876
+
877
+ // Now remove from memory after persisting
878
+ this.clusters.delete(clusterId);
879
+ }
880
+
881
+ /**
882
+ * Kill all running clusters
883
+ * @returns {Object} { killed: Array<string>, errors: Array<{id, error}> }
884
+ */
885
+ async killAll() {
886
+ const results = { killed: [], errors: [] };
887
+ const runningClusters = Array.from(this.clusters.values()).filter(
888
+ (c) => c.state === 'running' || c.state === 'initializing'
889
+ );
890
+
891
+ for (const cluster of runningClusters) {
892
+ try {
893
+ await this.kill(cluster.id);
894
+ results.killed.push(cluster.id);
895
+ } catch (error) {
896
+ results.errors.push({ id: cluster.id, error: error.message });
897
+ }
898
+ }
899
+
900
+ return results;
901
+ }
902
+
903
+ /**
904
+ * Find the last workflow-triggering message in the ledger
905
+ * Workflow triggers indicate cluster state progression (not AGENT_OUTPUT noise)
906
+ * @param {Array} messages - All messages from ledger
907
+ * @returns {Object|null} - Last workflow trigger message or null
908
+ * @private
909
+ */
910
+ _findLastWorkflowTrigger(messages) {
911
+ for (let i = messages.length - 1; i >= 0; i--) {
912
+ if (WORKFLOW_TRIGGERS.includes(messages[i].topic)) {
913
+ return messages[i];
914
+ }
915
+ }
916
+ return null;
917
+ }
918
+
919
+ /**
920
+ * Resume a stopped cluster from where it left off
921
+ * Handles both failed clusters (with error context) and cleanly stopped clusters
922
+ * @param {String} clusterId - Cluster ID
923
+ * @param {String} prompt - Optional custom resume prompt
924
+ * @returns {Promise<Object>} Resumed cluster info
925
+ */
926
+ async resume(clusterId, prompt) {
927
+ const cluster = this.clusters.get(clusterId);
928
+ if (!cluster) {
929
+ throw new Error(`Cluster not found: ${clusterId}`);
930
+ }
931
+
932
+ if (cluster.state === 'running') {
933
+ throw new Error(
934
+ `Cluster ${clusterId} is still running. Use 'zeroshot stop' first if you want to restart it.`
935
+ );
936
+ }
937
+
938
+ // Get failure info - either from saved state or from ledger
939
+ let failureInfo = cluster.failureInfo;
940
+
941
+ if (!failureInfo) {
942
+ // Query ledger for AGENT_ERROR messages to find failed agent
943
+ const errors = cluster.messageBus.query({
944
+ cluster_id: clusterId,
945
+ topic: 'AGENT_ERROR',
946
+ limit: 10,
947
+ });
948
+
949
+ if (errors.length > 0) {
950
+ // Use the first error found
951
+ const firstError = errors[0];
952
+ failureInfo = {
953
+ agentId: firstError.sender,
954
+ taskId: firstError.content?.data?.taskId || null,
955
+ iteration: firstError.content?.data?.iteration || 0,
956
+ error: firstError.content?.data?.error || firstError.content?.text,
957
+ timestamp: firstError.timestamp,
958
+ };
959
+ this._log(`[Orchestrator] Found failure from ledger: ${failureInfo.agentId}`);
960
+ }
961
+ }
962
+
963
+ // CRITICAL: Recreate isolation container if needed
964
+ if (cluster.isolation?.enabled) {
965
+ const { spawn } = require('child_process');
966
+ const oldContainerId = cluster.isolation.containerId;
967
+
968
+ // Check if container exists
969
+ const checkContainer = spawn('docker', ['inspect', oldContainerId], {
970
+ stdio: 'ignore',
971
+ });
972
+ const containerExists = await new Promise((resolve) => {
973
+ checkContainer.on('close', (code) => resolve(code === 0));
974
+ });
975
+
976
+ if (!containerExists) {
977
+ this._log(`[Orchestrator] Container ${oldContainerId} not found, recreating...`);
978
+
979
+ // Create new container
980
+ const newContainerId = await cluster.isolation.manager.createContainer(clusterId, {
981
+ workDir: process.cwd(),
982
+ image: cluster.isolation.image,
983
+ });
984
+
985
+ this._log(`[Orchestrator] New container created: ${newContainerId}`);
986
+
987
+ // Update cluster isolation state
988
+ cluster.isolation.containerId = newContainerId;
989
+
990
+ // CRITICAL: Update all agents' isolation context with new container ID
991
+ for (const agent of cluster.agents) {
992
+ if (agent.isolation?.enabled) {
993
+ agent.isolation.containerId = newContainerId;
994
+ agent.isolation.manager = cluster.isolation.manager;
995
+ }
996
+ }
997
+
998
+ this._log(`[Orchestrator] All agents updated with new container ID`);
999
+ } else {
1000
+ this._log(`[Orchestrator] Container ${oldContainerId} still exists, reusing`);
1001
+ }
1002
+ }
1003
+
1004
+ // Restart all agents
1005
+ cluster.state = 'running';
1006
+ for (const agent of cluster.agents) {
1007
+ if (!agent.running) {
1008
+ await agent.start();
1009
+ }
1010
+ }
1011
+
1012
+ // Query recent messages from ledger to provide context
1013
+ const recentMessages = cluster.messageBus.query({
1014
+ cluster_id: clusterId,
1015
+ limit: 50,
1016
+ });
1017
+
1018
+ // CASE 1: Failed cluster - Resume the failed agent with error context
1019
+ if (failureInfo) {
1020
+ const { agentId, iteration, error } = failureInfo;
1021
+ this._log(
1022
+ `[Orchestrator] Resuming failed cluster ${clusterId} from agent ${agentId} iteration ${iteration}`
1023
+ );
1024
+ this._log(`[Orchestrator] Previous error: ${error}`);
1025
+
1026
+ // Find the failed agent
1027
+ const failedAgent = cluster.agents.find((a) => a.id === agentId);
1028
+ if (!failedAgent) {
1029
+ throw new Error(`Failed agent '${agentId}' not found in cluster`);
1030
+ }
1031
+
1032
+ // Build failure resume context
1033
+ const resumePrompt = prompt || 'Continue from where you left off. Complete the task.';
1034
+ let context = `You are resuming from a previous failed attempt.\n\n`;
1035
+ context += `Previous error: ${error}\n\n`;
1036
+ context += `## Recent Context\n\n`;
1037
+
1038
+ for (const msg of recentMessages.slice(-10)) {
1039
+ if (msg.topic === 'AGENT_OUTPUT' || msg.topic === 'VALIDATION_RESULT') {
1040
+ context += `[${msg.sender}] ${msg.content?.text?.slice(0, 200) || ''}\n`;
1041
+ }
1042
+ }
1043
+
1044
+ context += `\n## Resume Instructions\n\n${resumePrompt}\n`;
1045
+
1046
+ // Clear failure info since we're resuming
1047
+ cluster.failureInfo = null;
1048
+
1049
+ // Save updated state
1050
+ this._saveClusters();
1051
+
1052
+ // Resume the failed agent
1053
+ failedAgent.resume(context).catch((err) => {
1054
+ console.error(`[Orchestrator] Resume failed for agent ${agentId}:`, err.message);
1055
+ });
1056
+
1057
+ this._log(`[Orchestrator] Cluster ${clusterId} resumed from failure`);
1058
+
1059
+ return {
1060
+ id: clusterId,
1061
+ state: cluster.state,
1062
+ resumeType: 'failure',
1063
+ resumedAgent: agentId,
1064
+ previousError: error,
1065
+ };
1066
+ }
1067
+
1068
+ // CASE 2: Cleanly stopped cluster - Resume by re-triggering based on ledger state
1069
+ this._log(`[Orchestrator] Resuming stopped cluster ${clusterId} (no failure)`);
1070
+
1071
+ // Build generic resume context
1072
+ const resumePrompt = prompt || 'Continue from where you left off. Complete the task.';
1073
+ let context = `Resuming cluster from previous session.\n\n`;
1074
+ context += `## Recent Context\n\n`;
1075
+
1076
+ for (const msg of recentMessages.slice(-10)) {
1077
+ if (
1078
+ msg.topic === 'AGENT_OUTPUT' ||
1079
+ msg.topic === 'VALIDATION_RESULT' ||
1080
+ msg.topic === 'ISSUE_OPENED'
1081
+ ) {
1082
+ context += `[${msg.sender}] ${msg.content?.text?.slice(0, 200) || ''}\n`;
1083
+ }
1084
+ }
1085
+
1086
+ context += `\n## Resume Instructions\n\n${resumePrompt}\n`;
1087
+
1088
+ // Find the LAST workflow trigger - not arbitrary last 5 messages
1089
+ // This is the message that indicates current workflow state
1090
+ const lastTrigger = this._findLastWorkflowTrigger(recentMessages);
1091
+ const agentsToResume = [];
1092
+
1093
+ if (lastTrigger) {
1094
+ this._log(
1095
+ `[Orchestrator] Last workflow trigger: ${lastTrigger.topic} (${new Date(lastTrigger.timestamp).toISOString()})`
1096
+ );
1097
+
1098
+ for (const agent of cluster.agents) {
1099
+ if (!agent.config.triggers) continue;
1100
+
1101
+ const matchingTrigger = agent.config.triggers.find((trigger) => {
1102
+ // Exact match
1103
+ if (trigger.topic === lastTrigger.topic) return true;
1104
+ // Wildcard match
1105
+ if (trigger.topic === '*') return true;
1106
+ // Prefix match (e.g., "VALIDATION_*")
1107
+ if (trigger.topic.endsWith('*')) {
1108
+ const prefix = trigger.topic.slice(0, -1);
1109
+ return lastTrigger.topic.startsWith(prefix);
1110
+ }
1111
+ return false;
1112
+ });
1113
+
1114
+ if (matchingTrigger) {
1115
+ // Evaluate logic script if present
1116
+ if (matchingTrigger.logic?.script) {
1117
+ const shouldTrigger = agent._evaluateTrigger(matchingTrigger, lastTrigger);
1118
+ if (!shouldTrigger) continue;
1119
+ }
1120
+ agentsToResume.push({ agent, message: lastTrigger, trigger: matchingTrigger });
1121
+ }
1122
+ }
1123
+ } else {
1124
+ this._log(`[Orchestrator] No workflow triggers found in ledger`);
1125
+ }
1126
+
1127
+ if (agentsToResume.length === 0) {
1128
+ if (!lastTrigger) {
1129
+ // No workflow activity - cluster never really started
1130
+ this._log(
1131
+ `[Orchestrator] WARNING: No workflow triggers in ledger. Cluster may not have started properly.`
1132
+ );
1133
+ this._log(`[Orchestrator] Publishing ISSUE_OPENED to bootstrap workflow...`);
1134
+
1135
+ // Re-publish the original issue if we have it
1136
+ const issueMessage = recentMessages.find((m) => m.topic === 'ISSUE_OPENED');
1137
+ if (issueMessage) {
1138
+ cluster.messageBus.publish({
1139
+ cluster_id: clusterId,
1140
+ topic: 'ISSUE_OPENED',
1141
+ sender: 'system',
1142
+ receiver: 'broadcast',
1143
+ content: issueMessage.content,
1144
+ metadata: { _resumed: true, _originalId: issueMessage.id },
1145
+ });
1146
+ } else {
1147
+ throw new Error(
1148
+ `Cannot resume cluster ${clusterId}: No workflow triggers found and no ISSUE_OPENED message. ` +
1149
+ `The cluster may not have started properly. Try: zeroshot run <issue> instead.`
1150
+ );
1151
+ }
1152
+ } else {
1153
+ // Had a trigger but no agents matched - something is wrong with agent configs
1154
+ throw new Error(
1155
+ `Cannot resume cluster ${clusterId}: Found trigger ${lastTrigger.topic} but no agents handle it. ` +
1156
+ `Check agent trigger configurations.`
1157
+ );
1158
+ }
1159
+ } else {
1160
+ // Resume agents that should run based on ledger state
1161
+ this._log(`[Orchestrator] Resuming ${agentsToResume.length} agent(s) based on ledger state`);
1162
+ for (const { agent, message } of agentsToResume) {
1163
+ this._log(`[Orchestrator] - Resuming agent ${agent.id} (triggered by ${message.topic})`);
1164
+ agent.resume(context).catch((err) => {
1165
+ console.error(`[Orchestrator] Resume failed for agent ${agent.id}:`, err.message);
1166
+ });
1167
+ }
1168
+ }
1169
+
1170
+ // Save updated state
1171
+ this._saveClusters();
1172
+
1173
+ this._log(`[Orchestrator] Cluster ${clusterId} resumed`);
1174
+
1175
+ return {
1176
+ id: clusterId,
1177
+ state: cluster.state,
1178
+ resumeType: 'clean',
1179
+ resumedAgents: agentsToResume.map((a) => a.agent.id),
1180
+ };
1181
+ }
1182
+
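The matching loop above means trigger topics can be exact, prefix patterns, or a wildcard. Illustrative trigger entries for a hypothetical agent config, plus a resume call with a custom instruction:

    const triggers = [
      { topic: 'PLAN_READY' },   // exact match
      { topic: 'VALIDATION_*' }, // prefix match: VALIDATION_RESULT, etc.
      { topic: '*' },            // wildcard: any workflow trigger
    ];

    const result = await orchestrator.resume(clusterId, 'Continue the implementation and rerun validation');
    console.log(result.resumeType, result.resumedAgents || result.resumedAgent);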
1183
+ /**
1184
+ * Force restart a stale agent with imperative prompt injection
1185
+ * @param {string} clusterId - Cluster ID
1186
+ * @param {string} agentId - Agent to restart
1187
+ * @param {number} staleDuration - How long agent was stale (ms)
1188
+ * @private
1189
+ */
1190
+ async _forceRestartAgent(clusterId, agentId, staleDuration) {
1191
+ const cluster = this.clusters.get(clusterId);
1192
+ if (!cluster) {
1193
+ throw new Error(`Cluster ${clusterId} not found`);
1194
+ }
1195
+
1196
+ const agent = cluster.agents.find((a) => a.id === agentId);
1197
+ if (!agent) {
1198
+ throw new Error(`Agent ${agentId} not found in cluster ${clusterId}`);
1199
+ }
1200
+
1201
+ // Kill current task
1202
+ try {
1203
+ agent._killTask();
1204
+ } catch (err) {
1205
+ this._log(`⚠️ Failed to kill agent ${agentId} task:`, err.message);
1206
+ }
1207
+
1208
+ // Build imperative restart context
1209
+ const staleMinutes = Math.round(staleDuration / 60000);
1210
+ const imperativePrompt = `
1211
+ 🔴 CRITICAL: Your previous session STOPPED PRODUCING OUTPUT for ${staleMinutes} minutes and was detected as STUCK.
1212
+
1213
+ ## What Happened
1214
+ - Last output timestamp: ${new Date(Date.now() - staleDuration).toISOString()}
1215
+ - Detected as stale after ${staleMinutes} minutes of silence
1216
+ - Process was forcefully restarted
1217
+
1218
+ ## Your Instructions
1219
+ You MUST complete your current task. DO NOT STOP until you either:
1220
+ 1. Successfully complete the task and publish your completion message, OR
1221
+ 2. Explicitly state WHY you cannot complete the task (missing files, impossible requirements, etc.)
1222
+
1223
+ If you discovered that files you need to modify don't exist:
1224
+ - CREATE them from scratch with the expected implementation
1225
+ - DO NOT silently give up
1226
+ - DO NOT stop working without explicit explanation
1227
+
1228
+ If you are stuck in an impossible situation:
1229
+ - EXPLAIN the problem clearly
1230
+ - PROPOSE alternative solutions
1231
+ - WAIT for guidance - do not exit
1232
+
1233
+ ## Resume Your Work
1234
+ Continue from where you left off. Review your previous output to understand what you were working on.
1235
+ `.trim();
1236
+
1237
+ // Get recent context from ledger
1238
+ const recentMessages = cluster.messageBus.query({
1239
+ cluster_id: cluster.id,
1240
+ limit: 10,
1241
+ });
1242
+
1243
+ const contextText = recentMessages
1244
+ .map((m) => `[${m.sender}] ${m.content?.text || JSON.stringify(m.content)}`)
1245
+ .join('\n\n');
1246
+
1247
+ const fullContext = `${imperativePrompt}\n\n## Recent Context\n${contextText}`;
1248
+
1249
+ // Resume agent with imperative prompt
1250
+ this._log(
1251
+ `🔄 Restarting agent ${agentId} with imperative prompt (${imperativePrompt.length} chars)`
1252
+ );
1253
+
1254
+ try {
1255
+ await agent.resume(fullContext);
1256
+ this._log(`✅ Agent ${agentId} successfully restarted`);
1257
+ } catch (err) {
1258
+ this._log(`❌ Failed to resume agent ${agentId}:`, err.message);
1259
+ throw err;
1260
+ }
1261
+ }
1262
+
1263
+ /**
1264
+ * Handle operation chain from CLUSTER_OPERATIONS message
1265
+ * Executes operations sequentially: add_agents, remove_agents, update_agent, publish
1266
+ *
1267
+ * Validation strategy:
1268
+ * 1. Pre-validate all agent configs before executing any operations
1269
+ * 2. Build a mock cluster config with proposed changes
1270
+ * 3. Run config-validator on the mock to catch structural issues
1271
+ * 4. Only execute operations if validation passes
1272
+ *
1273
+ * @param {string} clusterId - Cluster ID
1274
+ * @param {Array} operations - Array of operation objects
1275
+ * @param {string} sender - Who sent the operations (for attribution)
1276
+ * @param {Object} context - Isolation context { isolationManager, containerId }
1277
+ * @private
1278
+ */
1279
+ async _handleOperations(clusterId, operations, sender, context = {}) {
1280
+ const cluster = this.clusters.get(clusterId);
1281
+ if (!cluster) {
1282
+ throw new Error(`Cluster ${clusterId} not found`);
1283
+ }
1284
+
1285
+ this._log(`[Orchestrator] Validating ${operations.length} operation(s) from ${sender}`);
1286
+
1287
+ // Phase 1: Pre-validate operation structure
1288
+ const validationErrors = [];
1289
+ for (let i = 0; i < operations.length; i++) {
1290
+ const op = operations[i];
1291
+ if (!op.action) {
1292
+ validationErrors.push(`Operation ${i}: missing 'action' field`);
1293
+ continue;
1294
+ }
1295
+ if (!VALID_OPERATIONS.includes(op.action)) {
1296
+ validationErrors.push(
1297
+ `Operation ${i}: unknown action '${op.action}'. Valid: ${VALID_OPERATIONS.join(', ')}`
1298
+ );
1299
+ }
1300
+ }
1301
+
1302
+ if (validationErrors.length > 0) {
1303
+ const errorMsg = `Operation chain validation failed:\n - ${validationErrors.join('\n - ')}`;
1304
+ this._log(`[Orchestrator] ❌ ${errorMsg}`);
1305
+ throw new Error(errorMsg);
1306
+ }
1307
+
1308
+ // Phase 2: Build mock cluster config with proposed agents
1309
+ // Collect all agents that would exist after operations complete
1310
+ const existingAgentConfigs = cluster.config.agents || [];
1311
+ const proposedAgentConfigs = [...existingAgentConfigs];
1312
+
1313
+ for (const op of operations) {
1314
+ if (op.action === 'add_agents' && op.agents) {
1315
+ for (const agentConfig of op.agents) {
1316
+ // Check for duplicate before adding
1317
+ const existingIdx = proposedAgentConfigs.findIndex((a) => a.id === agentConfig.id);
1318
+ if (existingIdx === -1) {
1319
+ proposedAgentConfigs.push(agentConfig);
1320
+ }
1321
+ }
1322
+ } else if (op.action === 'remove_agents' && op.agentIds) {
1323
+ for (const agentId of op.agentIds) {
1324
+ const idx = proposedAgentConfigs.findIndex((a) => a.id === agentId);
1325
+ if (idx !== -1) {
1326
+ proposedAgentConfigs.splice(idx, 1);
1327
+ }
1328
+ }
1329
+ } else if (op.action === 'update_agent' && op.agentId && op.updates) {
1330
+ const agentConfig = proposedAgentConfigs.find((a) => a.id === op.agentId);
1331
+ if (agentConfig) {
1332
+ Object.assign(agentConfig, op.updates);
1333
+ }
1334
+ }
1335
+ }
1336
+
1337
+ // Phase 3: Validate proposed cluster config
1338
+ const mockConfig = { agents: proposedAgentConfigs };
1339
+ const validation = configValidator.validateConfig(mockConfig);
1340
+
1341
+ if (!validation.valid) {
1342
+ const errorMsg = `Proposed cluster configuration is invalid:\n - ${validation.errors.join('\n - ')}`;
1343
+ this._log(`[Orchestrator] ❌ ${errorMsg}`);
1344
+
1345
+ // Publish validation failure for conductor to see and retry
1346
+ cluster.messageBus.publish({
1347
+ cluster_id: clusterId,
1348
+ topic: 'CLUSTER_OPERATIONS_VALIDATION_FAILED',
1349
+ sender: 'orchestrator',
1350
+ content: {
1351
+ text: 'Operation chain would create invalid cluster configuration',
1352
+ data: {
1353
+ errors: validation.errors,
1354
+ warnings: validation.warnings,
1355
+ operations: operations,
1356
+ },
1357
+ },
1358
+ });
1359
+
1360
+ throw new Error(errorMsg);
1361
+ }
1362
+
1363
+ // Log warnings but proceed
1364
+ if (validation.warnings.length > 0) {
1365
+ this._log(`[Orchestrator] ⚠️ Warnings (proceeding anyway):`);
1366
+ for (const warning of validation.warnings) {
1367
+ this._log(` - ${warning}`);
1368
+ }
1369
+ }
1370
+
1371
+ // Phase 4: Execute validated operations
1372
+ this._log(`[Orchestrator] ✓ Validation passed, executing ${operations.length} operation(s)`);
1373
+
1374
+ for (let i = 0; i < operations.length; i++) {
1375
+ const op = operations[i];
1376
+ this._log(` [${i + 1}/${operations.length}] ${op.action}`);
1377
+
1378
+ switch (op.action) {
1379
+ case 'add_agents':
1380
+ await this._opAddAgents(cluster, op, context);
1381
+ break;
1382
+
1383
+ case 'remove_agents':
1384
+ await this._opRemoveAgents(cluster, op);
1385
+ break;
1386
+
1387
+ case 'update_agent':
1388
+ this._opUpdateAgent(cluster, op);
1389
+ break;
1390
+
1391
+ case 'publish':
1392
+ this._opPublish(cluster, op, sender);
1393
+ break;
1394
+
1395
+ case 'load_config':
1396
+ await this._opLoadConfig(cluster, op, context);
1397
+ break;
1398
+ }
1399
+ }
1400
+
1401
+ this._log(`[Orchestrator] All ${operations.length} operation(s) executed successfully`);
1402
+
1403
+ // Publish success notification
1404
+ cluster.messageBus.publish({
1405
+ cluster_id: clusterId,
1406
+ topic: 'CLUSTER_OPERATIONS_SUCCESS',
1407
+ sender: 'orchestrator',
1408
+ content: {
1409
+ text: `Executed ${operations.length} operation(s)`,
1410
+ data: {
1411
+ operationCount: operations.length,
1412
+ agentCount: cluster.agents.length,
1413
+ },
1414
+ },
1415
+ });
1416
+
1417
+ // Save updated cluster state to disk
1418
+ this._saveClusters();
1419
+ }
1420
+
1421
+ /**
1422
+ * Operation: add_agents - Spawn new agents dynamically
1423
+ * @private
1424
+ */
1425
+ async _opAddAgents(cluster, op, context) {
1426
+ const agents = op.agents;
1427
+ if (!agents || !Array.isArray(agents)) {
1428
+ throw new Error('add_agents operation missing agents array');
1429
+ }
1430
+
1431
+ for (const agentConfig of agents) {
1432
+ // Validate agent config has required fields
1433
+ if (!agentConfig.id) {
1434
+ throw new Error('Agent config missing required field: id');
1435
+ }
1436
+
1437
+ // Check for duplicate agent ID
1438
+ const existingAgent = cluster.agents.find((a) => a.id === agentConfig.id);
1439
+ if (existingAgent) {
1440
+ this._log(` ⚠️ Agent ${agentConfig.id} already exists, skipping`);
1441
+ continue;
1442
+ }
1443
+
1444
+ // Add to config agents array (for persistence)
1445
+ if (!cluster.config.agents) {
1446
+ cluster.config.agents = [];
1447
+ }
1448
+ cluster.config.agents.push(agentConfig);
1449
+
1450
+ // Build agent options
1451
+ const agentOptions = {
1452
+ testMode: false,
1453
+ quiet: this.quiet,
1454
+ };
1455
+
1456
+ // Pass isolation context if cluster is running in isolation mode
1457
+ if (cluster.isolation?.enabled && context.isolationManager) {
1458
+ agentOptions.isolation = {
1459
+ enabled: true,
1460
+ manager: context.isolationManager,
1461
+ clusterId: cluster.id,
1462
+ };
1463
+ }
1464
+
1465
+ // Create and start agent
1466
+ const agent = new AgentWrapper(agentConfig, cluster.messageBus, cluster, agentOptions);
1467
+ cluster.agents.push(agent);
1468
+ await agent.start();
1469
+
1470
+ this._log(
1471
+ ` ✓ Added agent: ${agentConfig.id} (role: ${agentConfig.role || 'unspecified'})`
1472
+ );
1473
+ }
1474
+ }
1475
+
1476
+ /**
1477
+ * Operation: remove_agents - Stop and remove agents by ID
1478
+ * @private
1479
+ */
1480
+ async _opRemoveAgents(cluster, op) {
1481
+ const agentIds = op.agentIds;
1482
+ if (!agentIds || !Array.isArray(agentIds)) {
1483
+ throw new Error('remove_agents operation missing agentIds array');
1484
+ }
1485
+
1486
+ for (const agentId of agentIds) {
1487
+ const agentIndex = cluster.agents.findIndex((a) => a.id === agentId);
1488
+ if (agentIndex === -1) {
1489
+ this._log(` ⚠️ Agent ${agentId} not found, skipping removal`);
1490
+ continue;
1491
+ }
1492
+
1493
+ const agent = cluster.agents[agentIndex];
1494
+ await agent.stop();
1495
+
1496
+ // Remove from cluster.agents
1497
+ cluster.agents.splice(agentIndex, 1);
1498
+
1499
+ // Remove from config.agents
1500
+ if (cluster.config.agents) {
1501
+ const configIndex = cluster.config.agents.findIndex((a) => a.id === agentId);
1502
+ if (configIndex !== -1) {
1503
+ cluster.config.agents.splice(configIndex, 1);
1504
+ }
1505
+ }
1506
+
1507
+ this._log(` ✓ Removed agent: ${agentId}`);
1508
+ }
1509
+ }
1510
+
1511
+ /**
1512
+ * Operation: update_agent - Modify existing agent config at runtime
1513
+ * Note: updates are applied as a shallow merge (Object.assign); some fields may require an agent restart to take effect
1514
+ * @private
1515
+ */
1516
+ _opUpdateAgent(cluster, op) {
1517
+ const { agentId, updates } = op;
1518
+ if (!agentId) {
1519
+ throw new Error('update_agent operation missing agentId');
1520
+ }
1521
+ if (!updates || typeof updates !== 'object') {
1522
+ throw new Error('update_agent operation missing updates object');
1523
+ }
1524
+
1525
+ const agent = cluster.agents.find((a) => a.id === agentId);
1526
+ if (!agent) {
1527
+ throw new Error(`update_agent: Agent ${agentId} not found`);
1528
+ }
1529
+
1530
+ // Apply updates to agent config
1531
+ Object.assign(agent.config, updates);
1532
+
1533
+ // Also update in cluster.config.agents for persistence
1534
+ if (cluster.config.agents) {
1535
+ const configAgent = cluster.config.agents.find((a) => a.id === agentId);
1536
+ if (configAgent) {
1537
+ Object.assign(configAgent, updates);
1538
+ }
1539
+ }
1540
+
1541
+ this._log(` ✓ Updated agent: ${agentId} (fields: ${Object.keys(updates).join(', ')})`);
1542
+ }
1543
+
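// Editorial sketch (not package source): an update_agent operation. The updates
// object is shallow-merged into both the live agent config and the persisted
// cluster.config entry; the field names below are hypothetical examples, not a
// documented schema.
const updateAgentOp = {
  agentId: 'worker-2',
  updates: { maxRetries: 3, promptSuffix: 'Prefer small, reviewable diffs.' },
};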
1544
+ /**
1545
+ * Operation: publish - Publish a message to the bus
1546
+ * @private
1547
+ */
1548
+ _opPublish(cluster, op, sender) {
1549
+ const { topic, content } = op;
1550
+ if (!topic) {
1551
+ throw new Error('publish operation missing topic');
1552
+ }
1553
+
1554
+ cluster.messageBus.publish({
1555
+ cluster_id: cluster.id,
1556
+ topic,
1557
+ sender: op.sender || sender,
1558
+ receiver: op.receiver || 'broadcast',
1559
+ content: content || {},
1560
+ });
1561
+
1562
+ this._log(` ✓ Published to topic: ${topic}`);
1563
+ }
1564
+
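// Editorial sketch (not package source): a publish operation. sender and
// receiver fall back to the calling sender and 'broadcast' when omitted; the
// message text is hypothetical.
const publishOp = {
  topic: 'ISSUE_OPENED',
  receiver: 'broadcast',
  content: { text: 'Add retry logic to the message bus bridge' },
};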
1565
+ /**
1566
+ * Operation: load_config - Load agents from a cluster config
1567
+ *
1568
+ * Supports two formats:
1569
+ * 1. Static config: { config: 'config-name' } - loads from cluster-templates/config-name.json
1570
+ * 2. Parameterized: { config: { base: 'template-name', params: {...} } } - resolves base template with params
1571
+ *
1572
+ * @private
1573
+ */
1574
+ async _opLoadConfig(cluster, op, context) {
1575
+ const { config } = op;
1576
+ if (!config) {
1577
+ throw new Error('load_config operation missing config');
1578
+ }
1579
+
1580
+ const templatesDir = path.join(__dirname, '..', 'cluster-templates');
1581
+ let loadedConfig;
1582
+
1583
+ // Check if config is parameterized ({ base, params }) or static (string)
1584
+ if (typeof config === 'object' && config.base) {
1585
+ // Parameterized template - resolve with TemplateResolver
1586
+ const { base, params } = config;
1587
+ this._log(` Loading parameterized template: ${base}`);
1588
+ this._log(` Params: ${JSON.stringify(params)}`);
1589
+
1590
+ const resolver = new TemplateResolver(templatesDir);
1591
+ loadedConfig = resolver.resolve(base, params);
1592
+
1593
+ this._log(` ✓ Resolved template: ${base} → ${loadedConfig.agents?.length || 0} agent(s)`);
1594
+ } else if (typeof config === 'string') {
1595
+ // Static config - load directly from file
1596
+ const configPath = path.join(templatesDir, `${config}.json`);
1597
+
1598
+ if (!fs.existsSync(configPath)) {
1599
+ throw new Error(`Config not found: ${config} (looked in ${configPath})`);
1600
+ }
1601
+
1602
+ this._log(` Loading static config: ${config}`);
1603
+
1604
+ const configContent = fs.readFileSync(configPath, 'utf8');
1605
+ loadedConfig = JSON.parse(configContent);
1606
+ } else {
1607
+ throw new Error(
1608
+ `Invalid config format: expected string or {base, params}, got ${typeof config}`
1609
+ );
1610
+ }
1611
+
1612
+ if (!loadedConfig.agents || !Array.isArray(loadedConfig.agents)) {
1613
+ throw new Error(`Config has no agents array`);
1614
+ }
1615
+
1616
+ this._log(` Found ${loadedConfig.agents.length} agent(s)`);
1617
+
1618
+ // Add agents from loaded config (reuse existing add_agents logic)
1619
+ await this._opAddAgents(cluster, { agents: loadedConfig.agents }, context);
1620
+
1621
+ this._log(` ✓ Config loaded (${loadedConfig.agents.length} agents)`);
1622
+ }
1623
+
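// Editorial sketch (not package source): the two accepted load_config shapes.
// The static form reads cluster-templates/<name>.json directly; the
// parameterized form is resolved by TemplateResolver. How a base name maps to
// a file, and which params a base template accepts, is not shown in this
// excerpt, so the params below are hypothetical.
const staticLoad = { config: 'conductor-bootstrap' };
const parameterizedLoad = {
  config: { base: 'worker-validator', params: { taskHint: 'refactor' } },
};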
1624
+ /**
1625
+ * Check if a process with given PID is running
1626
+ * @param {Number} pid - Process ID
1627
+ * @returns {Boolean} True if process exists
1628
+ * @private
1629
+ */
1630
+ _isProcessRunning(pid) {
1631
+ if (!pid) return false;
1632
+ try {
1633
+ // Signal 0 doesn't kill, just checks if process exists
1634
+ process.kill(pid, 0);
1635
+ return true;
1636
+ } catch (e) {
1637
+ // ESRCH = no such process; EPERM = process exists but we lack permission to signal it (still counts as running)
1638
+ return e.code === 'EPERM';
1639
+ }
1640
+ }
1641
+
1642
+ /**
1643
+ * Get cluster status
1644
+ * @param {String} clusterId - Cluster ID
1645
+ * @returns {Object} Cluster status
1646
+ */
1647
+ getStatus(clusterId) {
1648
+ const cluster = this.clusters.get(clusterId);
1649
+ if (!cluster) {
1650
+ throw new Error(`Cluster ${clusterId} not found`);
1651
+ }
1652
+
1653
+ // Detect zombie clusters: state=running but no backing process
1654
+ let state = cluster.state;
1655
+ let isZombie = false;
1656
+ if (state === 'running') {
1657
+ if (cluster.pid) {
1658
+ // PID recorded - check if process is running
1659
+ if (!this._isProcessRunning(cluster.pid)) {
1660
+ state = 'zombie';
1661
+ isZombie = true;
1662
+ this._log(
1663
+ `[Orchestrator] Detected zombie cluster ${clusterId} (PID ${cluster.pid} not running)`
1664
+ );
1665
+ }
1666
+ } else {
1667
+ // No PID recorded (legacy cluster or bug) - definitely a zombie
1668
+ // New code always records PID for running clusters
1669
+ state = 'zombie';
1670
+ isZombie = true;
1671
+ this._log(
1672
+ `[Orchestrator] Detected zombie cluster ${clusterId} (no PID recorded - legacy or killed cluster)`
1673
+ );
1674
+ }
1675
+ }
1676
+
1677
+ return {
1678
+ id: clusterId,
1679
+ state: state,
1680
+ isZombie: isZombie,
1681
+ pid: cluster.pid || null,
1682
+ createdAt: cluster.createdAt,
1683
+ agents: cluster.agents.map((a) => a.getState()),
1684
+ messageCount: cluster.messageBus.count({ cluster_id: clusterId }),
1685
+ };
1686
+ }
1687
+
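// Editorial sketch (not package source): how a caller might react to the zombie
// detection above. The cluster id is hypothetical.
const status = orchestrator.getStatus('cluster-1234');
if (status.isZombie) {
  // state was persisted as 'running' but no live process backs it (missing or
  // dead PID), so the cluster should be cleaned up or restarted
  console.warn(`Cluster ${status.id} is a zombie (pid: ${status.pid})`);
}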
1688
+ /**
1689
+ * List all clusters
1690
+ * @returns {Array} List of cluster summaries
1691
+ */
1692
+ listClusters() {
1693
+ return Array.from(this.clusters.values()).map((cluster) => {
1694
+ // Detect zombie clusters (state=running but no backing process)
1695
+ let state = cluster.state;
1696
+ if (state === 'running') {
1697
+ if (cluster.pid) {
1698
+ if (!this._isProcessRunning(cluster.pid)) {
1699
+ state = 'zombie';
1700
+ }
1701
+ } else {
1702
+ // No PID recorded - definitely a zombie
1703
+ state = 'zombie';
1704
+ }
1705
+ }
1706
+
1707
+ return {
1708
+ id: cluster.id,
1709
+ state: state,
1710
+ createdAt: cluster.createdAt,
1711
+ agentCount: cluster.agents.length,
1712
+ messageCount: cluster.messageBus.getAll(cluster.id).length,
1713
+ };
1714
+ });
1715
+ }
1716
+
1717
+ /**
1718
+ * Get cluster object (for internal use)
1719
+ * @param {String} clusterId - Cluster ID
1720
+ * @returns {Object} Full cluster object
1721
+ */
1722
+ getCluster(clusterId) {
1723
+ return this.clusters.get(clusterId);
1724
+ }
1725
+
1726
+ /**
1727
+ * Export cluster conversation
1728
+ * @param {String} clusterId - Cluster ID
1729
+ * @param {String} format - Export format ('json' or 'markdown')
1730
+ * @returns {String} Exported data
1731
+ */
1732
+ export(clusterId, format = 'json') {
1733
+ const cluster = this.clusters.get(clusterId);
1734
+ if (!cluster) {
1735
+ throw new Error(`Cluster ${clusterId} not found`);
1736
+ }
1737
+
1738
+ const messages = cluster.messageBus.getAll(clusterId);
1739
+
1740
+ if (format === 'json') {
1741
+ return JSON.stringify(
1742
+ {
1743
+ cluster_id: clusterId,
1744
+ state: cluster.state,
1745
+ created_at: cluster.createdAt,
1746
+ agents: cluster.agents.map((a) => a.getState()),
1747
+ messages,
1748
+ },
1749
+ null,
1750
+ 2
1751
+ );
1752
+ } else if (format === 'markdown') {
1753
+ return this._exportMarkdown(cluster, clusterId, messages);
1754
+ } else {
1755
+ throw new Error(`Unknown export format: ${format}`);
1756
+ }
1757
+ }
1758
+
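// Editorial sketch (not package source): exporting a finished cluster in both
// supported formats. The cluster id and output paths are hypothetical.
const fs = require('fs');
fs.writeFileSync('cluster-1234.json', orchestrator.export('cluster-1234', 'json'));
fs.writeFileSync('cluster-1234.md', orchestrator.export('cluster-1234', 'markdown'));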
1759
+ /**
1760
+ * Export cluster as nicely formatted markdown
1761
+ * @private
1762
+ */
1763
+ _exportMarkdown(cluster, clusterId, messages) {
1764
+ const { parseChunk } = require('../lib/stream-json-parser');
1765
+
1766
+ // Find task info
1767
+ const issueOpened = messages.find((m) => m.topic === 'ISSUE_OPENED');
1768
+ const taskText = issueOpened?.content?.text || 'Unknown task';
1769
+
1770
+ // Calculate duration
1771
+ const firstMsg = messages[0];
1772
+ const lastMsg = messages[messages.length - 1];
1773
+ const durationMs = lastMsg ? lastMsg.timestamp - firstMsg.timestamp : 0;
1774
+ const durationMin = Math.round(durationMs / 60000);
1775
+
1776
+ // Header
1777
+ let md = `# Cluster: ${clusterId}\n\n`;
1778
+ md += `| Property | Value |\n|----------|-------|\n`;
1779
+ md += `| State | ${cluster.state} |\n`;
1780
+ md += `| Created | ${new Date(cluster.createdAt).toLocaleString()} |\n`;
1781
+ md += `| Duration | ${durationMin} minutes |\n`;
1782
+ md += `| Agents | ${cluster.agents.map((a) => a.id).join(', ')} |\n\n`;
1783
+
1784
+ // Task
1785
+ md += `## Task\n\n${taskText}\n\n`;
1786
+
1787
+ // Group messages by agent for cleaner output
1788
+ const agentOutputs = new Map();
1789
+
1790
+ for (const msg of messages) {
1791
+ if (msg.topic === 'AGENT_OUTPUT') {
1792
+ if (!agentOutputs.has(msg.sender)) {
1793
+ agentOutputs.set(msg.sender, []);
1794
+ }
1795
+ agentOutputs.get(msg.sender).push(msg);
1796
+ }
1797
+ }
1798
+
1799
+ // Agent sections
1800
+ for (const [agentId, agentMsgs] of agentOutputs) {
1801
+ md += `## Agent: ${agentId}\n\n`;
1802
+
1803
+ let text = '';
1804
+ let tools = [];
1805
+
1806
+ for (const msg of agentMsgs) {
1807
+ const content = msg.content?.data?.line || msg.content?.data?.chunk || msg.content?.text;
1808
+ if (!content) continue;
1809
+
1810
+ const events = parseChunk(content);
1811
+ for (const event of events) {
1812
+ switch (event.type) {
1813
+ case 'text':
1814
+ text += event.text;
1815
+ break;
1816
+ case 'tool_call':
1817
+ tools.push({ name: event.toolName, input: event.input });
1818
+ break;
1819
+ case 'tool_result':
1820
+ if (tools.length > 0) {
1821
+ const lastTool = tools[tools.length - 1];
1822
+ lastTool.result = event.content;
1823
+ lastTool.isError = event.isError;
1824
+ }
1825
+ break;
1826
+ }
1827
+ }
1828
+ }
1829
+
1830
+ // Output text
1831
+ if (text.trim()) {
1832
+ md += `### Output\n\n${text.trim()}\n\n`;
1833
+ }
1834
+
1835
+ // Tools used
1836
+ if (tools.length > 0) {
1837
+ md += `### Tools Used\n\n`;
1838
+ for (const tool of tools) {
1839
+ const status = tool.isError ? '❌' : '✓';
1840
+ md += `- **${tool.name}** ${status}\n`;
1841
+ if (tool.input) {
1842
+ const inputStr =
1843
+ typeof tool.input === 'string' ? tool.input : JSON.stringify(tool.input);
1844
+ if (inputStr.length < 100) {
1845
+ md += ` - Input: \`${inputStr}\`\n`;
1846
+ }
1847
+ }
1848
+ }
1849
+ md += '\n';
1850
+ }
1851
+ }
1852
+
1853
+ // Validation results
1854
+ const validations = messages.filter((m) => m.topic === 'VALIDATION_RESULT');
1855
+ if (validations.length > 0) {
1856
+ md += `## Validation Results\n\n`;
1857
+ for (const v of validations) {
1858
+ const data = v.content?.data || {};
1859
+ const approved = data.approved === true || data.approved === 'true';
1860
+ const icon = approved ? '✅' : '❌';
1861
+ md += `### ${v.sender} ${icon}\n\n`;
1862
+ if (data.summary) {
1863
+ md += `${data.summary}\n\n`;
1864
+ }
1865
+ if (!approved && data.issues) {
1866
+ const issues = typeof data.issues === 'string' ? JSON.parse(data.issues) : data.issues;
1867
+ if (Array.isArray(issues) && issues.length > 0) {
1868
+ md += `**Issues:**\n`;
1869
+ for (const issue of issues) {
1870
+ md += `- ${issue}\n`;
1871
+ }
1872
+ md += '\n';
1873
+ }
1874
+ }
1875
+ }
1876
+ }
1877
+
1878
+ // Final status
1879
+ const clusterComplete = messages.find((m) => m.topic === 'CLUSTER_COMPLETE');
1880
+ if (clusterComplete) {
1881
+ md += `## Result\n\n✅ **Cluster completed successfully**\n`;
1882
+ }
1883
+
1884
+ return md;
1885
+ }
1886
+
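// Editorial sketch (not package source): the markdown built above follows this
// rough skeleton; sections only appear when matching messages exist.
//
//   # Cluster: <id>
//   | Property | Value |    <- state, created, duration, agent list
//   ## Task                 <- text of the ISSUE_OPENED message
//   ## Agent: <agentId>     <- ### Output and ### Tools Used per agent
//   ## Validation Results   <- one "### <sender> ✅/❌" block per VALIDATION_RESULT
//   ## Result               <- only when a CLUSTER_COMPLETE message is present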
1887
+ /**
1888
+ * Validate cluster configuration (delegates to config-validator module)
1889
+ * @param {Object} config - Cluster configuration
1890
+ * @param {Object} options - Validation options
1891
+ * @param {boolean} options.strict - Treat warnings as errors (default: false)
1892
+ * @returns {Object} { valid: Boolean, errors: Array, warnings: Array }
1893
+ */
1894
+ validateConfig(config, options = {}) {
1895
+ const result = configValidator.validateConfig(config);
1896
+
1897
+ // In strict mode, warnings become errors
1898
+ if (options.strict && result.warnings.length > 0) {
1899
+ result.errors.push(...result.warnings.map((w) => `[strict] ${w}`));
1900
+ result.valid = false;
1901
+ }
1902
+
1903
+ return result;
1904
+ }
1905
+
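// Editorial sketch (not package source): strict mode promotes warnings to
// errors, so a config that merely warns will fail validation.
const lenient = orchestrator.validateConfig(config);                  // { valid, errors, warnings }
const strict = orchestrator.validateConfig(config, { strict: true }); // warnings repeated in errors, valid=false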
1906
+ /**
1907
+ * Load cluster configuration from file
1908
+ * @param {String} configPath - Path to config JSON file
1909
+ * @param {Object} options - Load options
1910
+ * @param {boolean} options.strict - Treat warnings as errors
1911
+ * @returns {Object} Parsed configuration
1912
+ */
1913
+ loadConfig(configPath, options = {}) {
1914
+ const fullPath = path.resolve(configPath);
1915
+ const content = fs.readFileSync(fullPath, 'utf8');
1916
+ const config = JSON.parse(content);
1917
+
1918
+ const validation = this.validateConfig(config, options);
1919
+
1920
+ // Show warnings (they only cause failure when strict mode is enabled)
1921
+ if (validation.warnings && validation.warnings.length > 0 && !this.quiet) {
1922
+ console.warn('\n⚠️ Configuration warnings:');
1923
+ for (const warning of validation.warnings) {
1924
+ console.warn(` ${warning}`);
1925
+ }
1926
+ console.warn('');
1927
+ }
1928
+
1929
+ if (!validation.valid) {
1930
+ const errorMsg = validation.errors.join('\n ');
1931
+ throw new Error(`Invalid config:\n ${errorMsg}`);
1932
+ }
1933
+
1934
+ return config;
1935
+ }
1936
+ }
1937
+
1938
+ module.exports = Orchestrator;
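// Editorial sketch (not package source): a minimal consumer of the API shown in
// this excerpt. Constructor options and the cluster start/stop surface are not
// part of this excerpt, so only the read-side calls are illustrated; the config
// path is hypothetical.
const Orchestrator = require('./src/orchestrator');
const orchestrator = new Orchestrator();
const config = orchestrator.loadConfig('./cluster-templates/conductor-bootstrap.json');
for (const summary of orchestrator.listClusters()) {
  console.log(summary.id, summary.state, `${summary.agentCount} agent(s)`);
}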