@covibes/zeroshot 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +167 -0
- package/LICENSE +21 -0
- package/README.md +364 -0
- package/cli/index.js +3990 -0
- package/cluster-templates/base-templates/debug-workflow.json +181 -0
- package/cluster-templates/base-templates/full-workflow.json +455 -0
- package/cluster-templates/base-templates/single-worker.json +48 -0
- package/cluster-templates/base-templates/worker-validator.json +131 -0
- package/cluster-templates/conductor-bootstrap.json +122 -0
- package/cluster-templates/conductor-junior-bootstrap.json +69 -0
- package/docker/zeroshot-cluster/Dockerfile +132 -0
- package/lib/completion.js +174 -0
- package/lib/id-detector.js +53 -0
- package/lib/settings.js +97 -0
- package/lib/stream-json-parser.js +236 -0
- package/package.json +121 -0
- package/src/agent/agent-config.js +121 -0
- package/src/agent/agent-context-builder.js +241 -0
- package/src/agent/agent-hook-executor.js +329 -0
- package/src/agent/agent-lifecycle.js +555 -0
- package/src/agent/agent-stuck-detector.js +256 -0
- package/src/agent/agent-task-executor.js +1034 -0
- package/src/agent/agent-trigger-evaluator.js +67 -0
- package/src/agent-wrapper.js +459 -0
- package/src/agents/git-pusher-agent.json +20 -0
- package/src/attach/attach-client.js +438 -0
- package/src/attach/attach-server.js +543 -0
- package/src/attach/index.js +35 -0
- package/src/attach/protocol.js +220 -0
- package/src/attach/ring-buffer.js +121 -0
- package/src/attach/socket-discovery.js +242 -0
- package/src/claude-task-runner.js +468 -0
- package/src/config-router.js +80 -0
- package/src/config-validator.js +598 -0
- package/src/github.js +103 -0
- package/src/isolation-manager.js +1042 -0
- package/src/ledger.js +429 -0
- package/src/logic-engine.js +223 -0
- package/src/message-bus-bridge.js +139 -0
- package/src/message-bus.js +202 -0
- package/src/name-generator.js +232 -0
- package/src/orchestrator.js +1938 -0
- package/src/schemas/sub-cluster.js +156 -0
- package/src/sub-cluster-wrapper.js +545 -0
- package/src/task-runner.js +28 -0
- package/src/template-resolver.js +347 -0
- package/src/tui/CHANGES.txt +133 -0
- package/src/tui/LAYOUT.md +261 -0
- package/src/tui/README.txt +192 -0
- package/src/tui/TWO-LEVEL-NAVIGATION.md +186 -0
- package/src/tui/data-poller.js +325 -0
- package/src/tui/demo.js +208 -0
- package/src/tui/formatters.js +123 -0
- package/src/tui/index.js +193 -0
- package/src/tui/keybindings.js +383 -0
- package/src/tui/layout.js +317 -0
- package/src/tui/renderer.js +194 -0
|
@@ -0,0 +1,1938 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Orchestrator - Manages cluster lifecycle
|
|
3
|
+
*
|
|
4
|
+
* Provides:
|
|
5
|
+
* - Cluster initialization and configuration
|
|
6
|
+
* - Agent lifecycle management
|
|
7
|
+
* - GitHub issue integration
|
|
8
|
+
* - Cluster state tracking
|
|
9
|
+
* - Crash recovery
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
const fs = require('fs');
|
|
13
|
+
const path = require('path');
|
|
14
|
+
const os = require('os');
|
|
15
|
+
const lockfile = require('proper-lockfile');
|
|
16
|
+
const AgentWrapper = require('./agent-wrapper');
|
|
17
|
+
const SubClusterWrapper = require('./sub-cluster-wrapper');
|
|
18
|
+
const MessageBus = require('./message-bus');
|
|
19
|
+
const Ledger = require('./ledger');
|
|
20
|
+
const GitHub = require('./github');
|
|
21
|
+
const IsolationManager = require('./isolation-manager');
|
|
22
|
+
const { generateName } = require('./name-generator');
|
|
23
|
+
const configValidator = require('./config-validator');
|
|
24
|
+
const TemplateResolver = require('./template-resolver');
|
|
25
|
+
|
|
26
|
+
/**
|
|
27
|
+
* Operation Chain Schema
|
|
28
|
+
* Conductor (or any agent) can publish CLUSTER_OPERATIONS to dynamically modify cluster
|
|
29
|
+
*
|
|
30
|
+
* Supported operations:
|
|
31
|
+
* - add_agents: Spawn new agents with given configs
|
|
32
|
+
* - remove_agents: Stop and remove agents by ID
|
|
33
|
+
* - update_agent: Modify existing agent config
|
|
34
|
+
* - publish: Publish a message to the bus
|
|
35
|
+
* - load_config: Load agents from a named cluster config template
|
|
36
|
+
*/
|
|
37
|
+
const VALID_OPERATIONS = ['add_agents', 'remove_agents', 'update_agent', 'publish', 'load_config'];
|
|
38
|
+
|
|
39
|
+
/**
|
|
40
|
+
* Workflow-triggering topics that indicate cluster state progression
|
|
41
|
+
* These are the topics that MATTER for resume - not AGENT_OUTPUT noise
|
|
42
|
+
*/
|
|
43
|
+
const WORKFLOW_TRIGGERS = Object.freeze([
|
|
44
|
+
'ISSUE_OPENED',
|
|
45
|
+
'PLAN_READY',
|
|
46
|
+
'IMPLEMENTATION_READY',
|
|
47
|
+
'VALIDATION_RESULT',
|
|
48
|
+
'CONDUCTOR_ESCALATE',
|
|
49
|
+
]);
|
|
50
|
+
|
|
51
|
+
class Orchestrator {
|
|
52
|
+
constructor(options = {}) {
|
|
53
|
+
this.clusters = new Map(); // cluster_id -> cluster object
|
|
54
|
+
this.quiet = options.quiet || false; // Suppress verbose logging
|
|
55
|
+
|
|
56
|
+
// TaskRunner DI - allows injecting MockTaskRunner for testing
|
|
57
|
+
// When set, passed to all AgentWrappers to control task execution
|
|
58
|
+
this.taskRunner = options.taskRunner || null;
|
|
59
|
+
|
|
60
|
+
// Set up persistent storage directory (can be overridden for testing)
|
|
61
|
+
this.storageDir = options.storageDir || path.join(os.homedir(), '.zeroshot');
|
|
62
|
+
if (!fs.existsSync(this.storageDir)) {
|
|
63
|
+
fs.mkdirSync(this.storageDir, { recursive: true });
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// Load existing clusters from disk (skip if explicitly disabled)
|
|
67
|
+
if (options.skipLoad !== true) {
|
|
68
|
+
this._loadClusters();
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* Log message (respects quiet mode)
|
|
74
|
+
* @private
|
|
75
|
+
*/
|
|
76
|
+
_log(...args) {
|
|
77
|
+
if (!this.quiet) {
|
|
78
|
+
console.log(...args);
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/**
|
|
83
|
+
* Load clusters from persistent storage
|
|
84
|
+
* Uses file locking for consistent reads
|
|
85
|
+
* @private
|
|
86
|
+
*/
|
|
87
|
+
_loadClusters() {
|
|
88
|
+
const clustersFile = path.join(this.storageDir, 'clusters.json');
|
|
89
|
+
this._log(`[Orchestrator] Loading clusters from: ${clustersFile}`);
|
|
90
|
+
|
|
91
|
+
if (!fs.existsSync(clustersFile)) {
|
|
92
|
+
this._log(`[Orchestrator] No clusters file found at ${clustersFile}`);
|
|
93
|
+
return;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
const lockfilePath = path.join(this.storageDir, 'clusters.json.lock');
|
|
97
|
+
let release;
|
|
98
|
+
|
|
99
|
+
try {
|
|
100
|
+
// Acquire lock (sync API doesn't support retries, so we retry manually)
|
|
101
|
+
const maxAttempts = 20;
|
|
102
|
+
const retryDelayMs = 100;
|
|
103
|
+
|
|
104
|
+
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
105
|
+
try {
|
|
106
|
+
release = lockfile.lockSync(clustersFile, {
|
|
107
|
+
lockfilePath,
|
|
108
|
+
stale: 30000,
|
|
109
|
+
});
|
|
110
|
+
break; // Lock acquired
|
|
111
|
+
} catch (lockErr) {
|
|
112
|
+
if (lockErr.code === 'ELOCKED' && attempt < maxAttempts - 1) {
|
|
113
|
+
// Wait and retry
|
|
114
|
+
const waitMs = retryDelayMs + Math.random() * retryDelayMs;
|
|
115
|
+
const start = Date.now();
|
|
116
|
+
while (Date.now() - start < waitMs) {
|
|
117
|
+
/* spin wait */
|
|
118
|
+
}
|
|
119
|
+
continue;
|
|
120
|
+
}
|
|
121
|
+
throw lockErr;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
const data = JSON.parse(fs.readFileSync(clustersFile, 'utf8'));
|
|
126
|
+
const clusterIds = Object.keys(data);
|
|
127
|
+
this._log(`[Orchestrator] Found ${clusterIds.length} clusters in file:`, clusterIds);
|
|
128
|
+
|
|
129
|
+
for (const [clusterId, clusterData] of Object.entries(data)) {
|
|
130
|
+
this._log(`[Orchestrator] Loading cluster: ${clusterId}`);
|
|
131
|
+
this._loadSingleCluster(clusterId, clusterData);
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
this._log(`[Orchestrator] Total clusters loaded: ${this.clusters.size}`);
|
|
135
|
+
} catch (error) {
|
|
136
|
+
console.error('[Orchestrator] Failed to load clusters:', error.message);
|
|
137
|
+
console.error(error.stack);
|
|
138
|
+
} finally {
|
|
139
|
+
if (release) {
|
|
140
|
+
release();
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
/**
|
|
146
|
+
* Load a single cluster from data
|
|
147
|
+
* @private
|
|
148
|
+
*/
|
|
149
|
+
_loadSingleCluster(clusterId, clusterData) {
|
|
150
|
+
// Skip if already loaded
|
|
151
|
+
if (this.clusters.has(clusterId)) {
|
|
152
|
+
return this.clusters.get(clusterId);
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// Restore ledger and message bus
|
|
156
|
+
const dbPath = path.join(this.storageDir, `${clusterId}.db`);
|
|
157
|
+
const ledger = new Ledger(dbPath);
|
|
158
|
+
const messageBus = new MessageBus(ledger);
|
|
159
|
+
|
|
160
|
+
// Restore isolation manager FIRST if cluster was running in isolation mode
|
|
161
|
+
let isolation = clusterData.isolation || null;
|
|
162
|
+
let isolationManager = null;
|
|
163
|
+
if (isolation?.enabled && isolation.containerId) {
|
|
164
|
+
isolationManager = new IsolationManager({ image: isolation.image });
|
|
165
|
+
// Restore the container mapping so cleanup works
|
|
166
|
+
isolationManager.containers.set(clusterId, isolation.containerId);
|
|
167
|
+
isolation = {
|
|
168
|
+
...isolation,
|
|
169
|
+
manager: isolationManager,
|
|
170
|
+
};
|
|
171
|
+
this._log(
|
|
172
|
+
`[Orchestrator] Restored isolation manager for ${clusterId} (container: ${isolation.containerId})`
|
|
173
|
+
);
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// Reconstruct agent metadata from config (processes are ephemeral)
|
|
177
|
+
// CRITICAL: Pass isolation context to agents if cluster was running in isolation
|
|
178
|
+
const agents = [];
|
|
179
|
+
if (clusterData.config?.agents) {
|
|
180
|
+
for (const agentConfig of clusterData.config.agents) {
|
|
181
|
+
const agentOptions = {
|
|
182
|
+
id: clusterId,
|
|
183
|
+
quiet: this.quiet,
|
|
184
|
+
};
|
|
185
|
+
|
|
186
|
+
// Inject isolation context if enabled (MUST be done during agent creation)
|
|
187
|
+
if (isolation?.enabled && isolationManager) {
|
|
188
|
+
agentOptions.isolation = {
|
|
189
|
+
enabled: true,
|
|
190
|
+
manager: isolationManager,
|
|
191
|
+
clusterId,
|
|
192
|
+
};
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
// Create agent or subcluster wrapper based on type
|
|
196
|
+
let agent;
|
|
197
|
+
if (agentConfig.type === 'subcluster') {
|
|
198
|
+
agent = new SubClusterWrapper(agentConfig, messageBus, { id: clusterId }, agentOptions);
|
|
199
|
+
} else {
|
|
200
|
+
agent = new AgentWrapper(agentConfig, messageBus, { id: clusterId }, agentOptions);
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
agents.push(agent);
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
const cluster = {
|
|
208
|
+
...clusterData,
|
|
209
|
+
ledger,
|
|
210
|
+
messageBus,
|
|
211
|
+
agents,
|
|
212
|
+
isolation,
|
|
213
|
+
};
|
|
214
|
+
|
|
215
|
+
this.clusters.set(clusterId, cluster);
|
|
216
|
+
this._log(`[Orchestrator] Loaded cluster: ${clusterId} with ${agents.length} agents`);
|
|
217
|
+
|
|
218
|
+
return cluster;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
/**
|
|
222
|
+
* Ensure clusters file exists (required for file locking)
|
|
223
|
+
* @private
|
|
224
|
+
*/
|
|
225
|
+
_ensureClustersFile() {
|
|
226
|
+
const clustersFile = path.join(this.storageDir, 'clusters.json');
|
|
227
|
+
if (!fs.existsSync(clustersFile)) {
|
|
228
|
+
fs.writeFileSync(clustersFile, '{}');
|
|
229
|
+
}
|
|
230
|
+
return clustersFile;
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
/**
|
|
234
|
+
* Save clusters to persistent storage
|
|
235
|
+
* Uses file locking to prevent race conditions with other processes
|
|
236
|
+
* @private
|
|
237
|
+
*/
|
|
238
|
+
_saveClusters() {
|
|
239
|
+
const clustersFile = this._ensureClustersFile();
|
|
240
|
+
const lockfilePath = path.join(this.storageDir, 'clusters.json.lock');
|
|
241
|
+
let release;
|
|
242
|
+
|
|
243
|
+
try {
|
|
244
|
+
// Acquire exclusive lock (sync API doesn't support retries, so we retry manually)
|
|
245
|
+
const maxAttempts = 50;
|
|
246
|
+
const retryDelayMs = 100;
|
|
247
|
+
|
|
248
|
+
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
249
|
+
try {
|
|
250
|
+
release = lockfile.lockSync(clustersFile, {
|
|
251
|
+
lockfilePath,
|
|
252
|
+
stale: 30000, // Lock expires after 30s (in case process dies)
|
|
253
|
+
});
|
|
254
|
+
break; // Lock acquired
|
|
255
|
+
} catch (lockErr) {
|
|
256
|
+
if (lockErr.code === 'ELOCKED' && attempt < maxAttempts - 1) {
|
|
257
|
+
// Wait and retry with jitter
|
|
258
|
+
const waitMs = retryDelayMs + Math.random() * retryDelayMs * 2;
|
|
259
|
+
const start = Date.now();
|
|
260
|
+
while (Date.now() - start < waitMs) {
|
|
261
|
+
/* spin wait */
|
|
262
|
+
}
|
|
263
|
+
continue;
|
|
264
|
+
}
|
|
265
|
+
throw lockErr;
|
|
266
|
+
}
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
// Read existing clusters from file (other processes may have added clusters)
|
|
270
|
+
let existingClusters = {};
|
|
271
|
+
try {
|
|
272
|
+
const content = fs.readFileSync(clustersFile, 'utf8');
|
|
273
|
+
existingClusters = JSON.parse(content);
|
|
274
|
+
} catch (error) {
|
|
275
|
+
console.error('[Orchestrator] Failed to read existing clusters:', error.message);
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
// Merge: update/add clusters from this process
|
|
279
|
+
for (const [clusterId, cluster] of this.clusters.entries()) {
|
|
280
|
+
// CRITICAL: Only update clusters this process actually owns or has modified
|
|
281
|
+
// A process owns a cluster if: it started it (pid matches) OR it explicitly stopped/killed it
|
|
282
|
+
const isOwnedByThisProcess = cluster.pid === process.pid;
|
|
283
|
+
const wasModifiedByThisProcess = cluster.state === 'stopped' || cluster.state === 'killed';
|
|
284
|
+
|
|
285
|
+
// Skip clusters we don't own and haven't modified - prevents race condition
|
|
286
|
+
// where a running cluster overwrites another process's stop/kill operation
|
|
287
|
+
if (!isOwnedByThisProcess && !wasModifiedByThisProcess) {
|
|
288
|
+
// Preserve existing state from file for clusters we don't own
|
|
289
|
+
continue;
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
existingClusters[clusterId] = {
|
|
293
|
+
id: cluster.id,
|
|
294
|
+
config: cluster.config,
|
|
295
|
+
state: cluster.state,
|
|
296
|
+
createdAt: cluster.createdAt,
|
|
297
|
+
// Track PID for zombie detection (null if cluster is stopped/killed)
|
|
298
|
+
pid: cluster.state === 'running' ? cluster.pid : null,
|
|
299
|
+
// Persist failure info for resume capability
|
|
300
|
+
failureInfo: cluster.failureInfo || null,
|
|
301
|
+
// Persist isolation info (excluding manager instance which can't be serialized)
|
|
302
|
+
isolation: cluster.isolation
|
|
303
|
+
? {
|
|
304
|
+
enabled: cluster.isolation.enabled,
|
|
305
|
+
containerId: cluster.isolation.containerId,
|
|
306
|
+
image: cluster.isolation.image,
|
|
307
|
+
}
|
|
308
|
+
: null,
|
|
309
|
+
};
|
|
310
|
+
}
|
|
311
|
+
|
|
312
|
+
// Write merged data
|
|
313
|
+
fs.writeFileSync(clustersFile, JSON.stringify(existingClusters, null, 2));
|
|
314
|
+
this._log(
|
|
315
|
+
`[Orchestrator] Saved ${this.clusters.size} cluster(s), file now has ${Object.keys(existingClusters).length} total`
|
|
316
|
+
);
|
|
317
|
+
} finally {
|
|
318
|
+
// Always release lock
|
|
319
|
+
if (release) {
|
|
320
|
+
release();
|
|
321
|
+
}
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
|
|
325
|
+
/**
|
|
326
|
+
* Watch for new clusters and call callback when found
|
|
327
|
+
* Polls the clusters file for changes with file locking
|
|
328
|
+
* @param {Function} onNewCluster - Callback(cluster) for each new cluster
|
|
329
|
+
* @param {Number} intervalMs - Poll interval in ms (default: 2000)
|
|
330
|
+
* @returns {Function} Stop function to cancel watching
|
|
331
|
+
*/
|
|
332
|
+
watchForNewClusters(onNewCluster, intervalMs = 2000) {
|
|
333
|
+
const clustersFile = path.join(this.storageDir, 'clusters.json');
|
|
334
|
+
const lockfilePath = path.join(this.storageDir, 'clusters.json.lock');
|
|
335
|
+
const knownClusterIds = new Set(this.clusters.keys());
|
|
336
|
+
|
|
337
|
+
const intervalId = setInterval(() => {
|
|
338
|
+
let release;
|
|
339
|
+
try {
|
|
340
|
+
if (!fs.existsSync(clustersFile)) return;
|
|
341
|
+
|
|
342
|
+
// Try to acquire lock once (polling is best-effort, will retry on next cycle)
|
|
343
|
+
try {
|
|
344
|
+
release = lockfile.lockSync(clustersFile, {
|
|
345
|
+
lockfilePath,
|
|
346
|
+
stale: 30000,
|
|
347
|
+
});
|
|
348
|
+
} catch (lockErr) {
|
|
349
|
+
// Lock busy - skip this poll cycle, try again next interval
|
|
350
|
+
if (lockErr.code === 'ELOCKED') return;
|
|
351
|
+
throw lockErr;
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
const data = JSON.parse(fs.readFileSync(clustersFile, 'utf8'));
|
|
355
|
+
|
|
356
|
+
for (const [clusterId, clusterData] of Object.entries(data)) {
|
|
357
|
+
if (!knownClusterIds.has(clusterId)) {
|
|
358
|
+
// New cluster found
|
|
359
|
+
knownClusterIds.add(clusterId);
|
|
360
|
+
const cluster = this._loadSingleCluster(clusterId, clusterData);
|
|
361
|
+
if (cluster && onNewCluster) {
|
|
362
|
+
onNewCluster(cluster);
|
|
363
|
+
}
|
|
364
|
+
}
|
|
365
|
+
}
|
|
366
|
+
} catch (error) {
|
|
367
|
+
// File access during polling can fail transiently - log and continue
|
|
368
|
+
console.error(`[Orchestrator] watchForNewClusters error (will retry): ${error.message}`);
|
|
369
|
+
} finally {
|
|
370
|
+
if (release) {
|
|
371
|
+
release();
|
|
372
|
+
}
|
|
373
|
+
}
|
|
374
|
+
}, intervalMs);
|
|
375
|
+
|
|
376
|
+
// Return stop function
|
|
377
|
+
return () => clearInterval(intervalId);
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
/**
|
|
381
|
+
* Start a new cluster with mocked agent executors (TESTING ONLY)
|
|
382
|
+
*
|
|
383
|
+
* CRITICAL: This method PREVENTS real Claude API calls.
|
|
384
|
+
* All agent behaviors must be defined in mockExecutor.
|
|
385
|
+
*
|
|
386
|
+
* @param {Object} config - Cluster configuration
|
|
387
|
+
* @param {Object} input - Input source { issue, text, or config }
|
|
388
|
+
* @param {MockAgentExecutor} mockExecutor - Mock executor with agent behaviors
|
|
389
|
+
* @returns {Object} Cluster object
|
|
390
|
+
*/
|
|
391
|
+
startWithMock(config, input, mockExecutor) {
|
|
392
|
+
if (!mockExecutor) {
|
|
393
|
+
throw new Error('Orchestrator.startWithMock: mockExecutor is required');
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
// Validate all agents that execute tasks have mock behaviors defined
|
|
397
|
+
// Orchestrator agents (action: 'stop_cluster') don't execute tasks, so don't need mocks
|
|
398
|
+
for (const agentConfig of config.agents) {
|
|
399
|
+
const agentId = agentConfig.id;
|
|
400
|
+
|
|
401
|
+
// Check if agent has any triggers that execute tasks
|
|
402
|
+
const executesTask = agentConfig.triggers?.some(
|
|
403
|
+
(trigger) => !trigger.action || trigger.action === 'execute_task'
|
|
404
|
+
);
|
|
405
|
+
|
|
406
|
+
if (executesTask && !mockExecutor.behaviors[agentId]) {
|
|
407
|
+
throw new Error(
|
|
408
|
+
`Orchestrator.startWithMock: No behavior defined for agent '${agentId}'. ` +
|
|
409
|
+
`This would cause real Claude API calls. ABORTING.\n` +
|
|
410
|
+
`Available behaviors: ${Object.keys(mockExecutor.behaviors).join(', ')}`
|
|
411
|
+
);
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
return this._startInternal(config, input, {
|
|
416
|
+
mockExecutor,
|
|
417
|
+
testMode: true,
|
|
418
|
+
});
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
/**
|
|
422
|
+
* Start a new cluster
|
|
423
|
+
* @param {Object} config - Cluster configuration
|
|
424
|
+
* @param {Object} input - Input source { issue, text, or config }
|
|
425
|
+
* @param {Object} options - Start options
|
|
426
|
+
* @param {boolean} options.isolation - Run in Docker container
|
|
427
|
+
* @param {string} options.isolationImage - Docker image to use
|
|
428
|
+
* @returns {Object} Cluster object
|
|
429
|
+
*/
|
|
430
|
+
start(config, input = {}, options = {}) {
|
|
431
|
+
return this._startInternal(config, input, {
|
|
432
|
+
testMode: false,
|
|
433
|
+
cwd: options.cwd || process.cwd(), // Target working directory for agents
|
|
434
|
+
isolation: options.isolation || false,
|
|
435
|
+
isolationImage: options.isolationImage,
|
|
436
|
+
autoPr: process.env.CREW_PR === '1',
|
|
437
|
+
});
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
/**
|
|
441
|
+
* Internal start implementation (shared by start and startWithMock)
|
|
442
|
+
* @private
|
|
443
|
+
*/
|
|
444
|
+
async _startInternal(config, input = {}, options = {}) {
|
|
445
|
+
// Use pre-generated ID from parent process, or generate new one
|
|
446
|
+
const clusterId = process.env.CREW_CLUSTER_ID || generateName('cluster');
|
|
447
|
+
|
|
448
|
+
// Create ledger and message bus with persistent storage
|
|
449
|
+
const dbPath = config.dbPath || path.join(this.storageDir, `${clusterId}.db`);
|
|
450
|
+
const ledger = new Ledger(dbPath);
|
|
451
|
+
const messageBus = new MessageBus(ledger);
|
|
452
|
+
|
|
453
|
+
// Handle isolation mode (Docker container)
|
|
454
|
+
let isolationManager = null;
|
|
455
|
+
let containerId = null;
|
|
456
|
+
|
|
457
|
+
if (options.isolation) {
|
|
458
|
+
// Check Docker availability
|
|
459
|
+
if (!IsolationManager.isDockerAvailable()) {
|
|
460
|
+
throw new Error('Docker is not available. Install Docker to use --isolation mode.');
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
// Ensure image exists (auto-build if missing)
|
|
464
|
+
const image = options.isolationImage || 'zeroshot-cluster-base';
|
|
465
|
+
await IsolationManager.ensureImage(image);
|
|
466
|
+
|
|
467
|
+
isolationManager = new IsolationManager({ image });
|
|
468
|
+
this._log(`[Orchestrator] Starting cluster in isolation mode (image: ${image})`);
|
|
469
|
+
|
|
470
|
+
// Create container with workspace mounted
|
|
471
|
+
// CRITICAL: Use options.cwd (git repo root) instead of process.cwd()
|
|
472
|
+
const workDir = options.cwd || process.cwd();
|
|
473
|
+
containerId = await isolationManager.createContainer(clusterId, {
|
|
474
|
+
workDir,
|
|
475
|
+
image,
|
|
476
|
+
});
|
|
477
|
+
this._log(`[Orchestrator] Container created: ${containerId} (workDir: ${workDir})`);
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
// Build cluster object
|
|
481
|
+
const cluster = {
|
|
482
|
+
id: clusterId,
|
|
483
|
+
config,
|
|
484
|
+
state: 'initializing',
|
|
485
|
+
messageBus,
|
|
486
|
+
ledger,
|
|
487
|
+
agents: [],
|
|
488
|
+
createdAt: Date.now(),
|
|
489
|
+
// Track PID for zombie detection (this process owns the cluster)
|
|
490
|
+
pid: process.pid,
|
|
491
|
+
// Isolation state (only if enabled)
|
|
492
|
+
isolation: options.isolation
|
|
493
|
+
? {
|
|
494
|
+
enabled: true,
|
|
495
|
+
containerId,
|
|
496
|
+
image: options.isolationImage || 'zeroshot-cluster-base',
|
|
497
|
+
manager: isolationManager,
|
|
498
|
+
}
|
|
499
|
+
: null,
|
|
500
|
+
};
|
|
501
|
+
|
|
502
|
+
this.clusters.set(clusterId, cluster);
|
|
503
|
+
|
|
504
|
+
try {
|
|
505
|
+
// Fetch input (GitHub issue or text)
|
|
506
|
+
let inputData;
|
|
507
|
+
if (input.issue) {
|
|
508
|
+
inputData = await GitHub.fetchIssue(input.issue);
|
|
509
|
+
// Log clickable issue link
|
|
510
|
+
if (inputData.url) {
|
|
511
|
+
this._log(`[Orchestrator] Issue: ${inputData.url}`);
|
|
512
|
+
}
|
|
513
|
+
} else if (input.text) {
|
|
514
|
+
inputData = GitHub.createTextInput(input.text);
|
|
515
|
+
} else {
|
|
516
|
+
throw new Error('Either issue or text input is required');
|
|
517
|
+
}
|
|
518
|
+
|
|
519
|
+
// Inject git-pusher agent if --pr is set (replaces completion-detector)
|
|
520
|
+
if (options.autoPr) {
|
|
521
|
+
// Remove completion-detector by ID (git-pusher handles completion + PR)
|
|
522
|
+
config.agents = config.agents.filter((a) => a.id !== 'completion-detector');
|
|
523
|
+
|
|
524
|
+
// Load and configure git-pusher agent (use fs.readFileSync to avoid require cache)
|
|
525
|
+
const gitPusherPath = path.join(__dirname, 'agents', 'git-pusher-agent.json');
|
|
526
|
+
const gitPusherConfig = JSON.parse(fs.readFileSync(gitPusherPath, 'utf8'));
|
|
527
|
+
|
|
528
|
+
// Inject issue context placeholders
|
|
529
|
+
gitPusherConfig.prompt = gitPusherConfig.prompt.replace(
|
|
530
|
+
/\{\{issue_number\}\}/g,
|
|
531
|
+
inputData.number || 'unknown'
|
|
532
|
+
);
|
|
533
|
+
gitPusherConfig.prompt = gitPusherConfig.prompt.replace(
|
|
534
|
+
/\{\{issue_title\}\}/g,
|
|
535
|
+
inputData.title || 'Implementation'
|
|
536
|
+
);
|
|
537
|
+
|
|
538
|
+
config.agents.push(gitPusherConfig);
|
|
539
|
+
this._log(`[Orchestrator] Injected git-pusher agent (creates PR and auto-merges)`);
|
|
540
|
+
}
|
|
541
|
+
|
|
542
|
+
// Inject workers instruction if --workers explicitly provided and > 1
|
|
543
|
+
const workersCount = process.env.CREW_WORKERS ? parseInt(process.env.CREW_WORKERS) : 0;
|
|
544
|
+
if (workersCount > 1) {
|
|
545
|
+
const workerAgent = config.agents.find((a) => a.id === 'worker');
|
|
546
|
+
if (workerAgent) {
|
|
547
|
+
const instruction = `PARALLELIZATION: Use up to ${workersCount} sub-agents to parallelize your work where appropriate.\n\n`;
|
|
548
|
+
|
|
549
|
+
if (!workerAgent.prompt) {
|
|
550
|
+
workerAgent.prompt = instruction;
|
|
551
|
+
} else if (typeof workerAgent.prompt === 'string') {
|
|
552
|
+
workerAgent.prompt = instruction + workerAgent.prompt;
|
|
553
|
+
} else if (workerAgent.prompt.system) {
|
|
554
|
+
workerAgent.prompt.system = instruction + workerAgent.prompt.system;
|
|
555
|
+
}
|
|
556
|
+
this._log(
|
|
557
|
+
`[Orchestrator] Injected parallelization instruction (workers=${workersCount})`
|
|
558
|
+
);
|
|
559
|
+
}
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
// Initialize agents with optional mock injection
|
|
563
|
+
// Check agent type: regular agent or subcluster
|
|
564
|
+
// CRITICAL: Inject cwd into each agent config for proper working directory
|
|
565
|
+
const agentCwd = options.cwd || process.cwd();
|
|
566
|
+
for (const agentConfig of config.agents) {
|
|
567
|
+
// Inject cwd if not already set (config may override)
|
|
568
|
+
if (!agentConfig.cwd) {
|
|
569
|
+
agentConfig.cwd = agentCwd;
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
const agentOptions = {
|
|
573
|
+
testMode: options.testMode || !!this.taskRunner, // Enable testMode if taskRunner provided
|
|
574
|
+
quiet: this.quiet,
|
|
575
|
+
};
|
|
576
|
+
|
|
577
|
+
// Inject mock spawn function if provided (legacy mockExecutor API)
|
|
578
|
+
if (options.mockExecutor) {
|
|
579
|
+
agentOptions.mockSpawnFn = options.mockExecutor.createMockSpawnFn(agentConfig.id);
|
|
580
|
+
}
|
|
581
|
+
|
|
582
|
+
// TaskRunner DI - new pattern for mocking task execution
|
|
583
|
+
// Creates a mockSpawnFn wrapper that delegates to the taskRunner
|
|
584
|
+
if (this.taskRunner) {
|
|
585
|
+
agentOptions.mockSpawnFn = (args, { context }) => {
|
|
586
|
+
return this.taskRunner.run(context, {
|
|
587
|
+
agentId: agentConfig.id,
|
|
588
|
+
});
|
|
589
|
+
};
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
// Pass isolation context if enabled
|
|
593
|
+
if (cluster.isolation) {
|
|
594
|
+
agentOptions.isolation = {
|
|
595
|
+
enabled: true,
|
|
596
|
+
manager: isolationManager,
|
|
597
|
+
clusterId,
|
|
598
|
+
};
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
// Create agent or subcluster wrapper based on type
|
|
602
|
+
let agent;
|
|
603
|
+
if (agentConfig.type === 'subcluster') {
|
|
604
|
+
agent = new SubClusterWrapper(agentConfig, messageBus, cluster, agentOptions);
|
|
605
|
+
} else {
|
|
606
|
+
agent = new AgentWrapper(agentConfig, messageBus, cluster, agentOptions);
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
cluster.agents.push(agent);
|
|
610
|
+
}
|
|
611
|
+
|
|
612
|
+
// Start all agents
|
|
613
|
+
for (const agent of cluster.agents) {
|
|
614
|
+
await agent.start();
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
cluster.state = 'running';
|
|
618
|
+
|
|
619
|
+
// Publish ISSUE_OPENED message to bootstrap workflow
|
|
620
|
+
messageBus.publish({
|
|
621
|
+
cluster_id: clusterId,
|
|
622
|
+
topic: 'ISSUE_OPENED',
|
|
623
|
+
sender: 'system',
|
|
624
|
+
receiver: 'broadcast',
|
|
625
|
+
content: {
|
|
626
|
+
text: inputData.context,
|
|
627
|
+
data: {
|
|
628
|
+
issue_number: inputData.number,
|
|
629
|
+
title: inputData.title,
|
|
630
|
+
},
|
|
631
|
+
},
|
|
632
|
+
metadata: {
|
|
633
|
+
source: input.issue ? 'github' : 'text',
|
|
634
|
+
},
|
|
635
|
+
});
|
|
636
|
+
|
|
637
|
+
this._log(`Cluster ${clusterId} started with ${cluster.agents.length} agents`);
|
|
638
|
+
|
|
639
|
+
// Watch for CLUSTER_COMPLETE message to auto-stop
|
|
640
|
+
messageBus.subscribe((message) => {
|
|
641
|
+
if (message.topic === 'CLUSTER_COMPLETE' && message.cluster_id === clusterId) {
|
|
642
|
+
this._log(`\n${'='.repeat(80)}`);
|
|
643
|
+
this._log(`✅ CLUSTER COMPLETED SUCCESSFULLY: ${clusterId}`);
|
|
644
|
+
this._log(`${'='.repeat(80)}`);
|
|
645
|
+
this._log(`Reason: ${message.content?.data?.reason || 'unknown'}`);
|
|
646
|
+
this._log(`Initiated by: ${message.sender}`);
|
|
647
|
+
this._log(`${'='.repeat(80)}\n`);
|
|
648
|
+
|
|
649
|
+
// Auto-stop cluster
|
|
650
|
+
this.stop(clusterId).catch((err) => {
|
|
651
|
+
console.error(`Failed to auto-stop cluster ${clusterId}:`, err.message);
|
|
652
|
+
});
|
|
653
|
+
}
|
|
654
|
+
});
|
|
655
|
+
|
|
656
|
+
// Watch for CLUSTER_FAILED message to auto-stop (e.g., max iterations reached)
|
|
657
|
+
messageBus.subscribe((message) => {
|
|
658
|
+
if (message.topic === 'CLUSTER_FAILED' && message.cluster_id === clusterId) {
|
|
659
|
+
this._log(`\n${'='.repeat(80)}`);
|
|
660
|
+
this._log(`❌ CLUSTER FAILED: ${clusterId}`);
|
|
661
|
+
this._log(`${'='.repeat(80)}`);
|
|
662
|
+
this._log(`Reason: ${message.content?.data?.reason || 'unknown'}`);
|
|
663
|
+
this._log(`Agent: ${message.sender}`);
|
|
664
|
+
if (message.content?.text) {
|
|
665
|
+
this._log(`Details: ${message.content.text}`);
|
|
666
|
+
}
|
|
667
|
+
this._log(`${'='.repeat(80)}\n`);
|
|
668
|
+
|
|
669
|
+
// Auto-stop cluster
|
|
670
|
+
this.stop(clusterId).catch((err) => {
|
|
671
|
+
console.error(`Failed to auto-stop cluster ${clusterId}:`, err.message);
|
|
672
|
+
});
|
|
673
|
+
}
|
|
674
|
+
});
|
|
675
|
+
|
|
676
|
+
// Watch for AGENT_ERROR - if critical agent (worker/implementation) fails, stop cluster
|
|
677
|
+
// Validators auto-approve after retries (see agent-wrapper retry logic)
|
|
678
|
+
messageBus.subscribe((message) => {
|
|
679
|
+
if (message.topic === 'AGENT_ERROR' && message.cluster_id === clusterId) {
|
|
680
|
+
const agentRole = message.content?.data?.role;
|
|
681
|
+
const attempts = message.content?.data?.attempts || 1;
|
|
682
|
+
|
|
683
|
+
// Save cluster state to persist failureInfo (set by agent-wrapper on failure)
|
|
684
|
+
// This ensures resume capability even if cluster doesn't stop
|
|
685
|
+
this._saveClusters();
|
|
686
|
+
|
|
687
|
+
// Only stop cluster if non-validator agent exhausted retries
|
|
688
|
+
if (agentRole === 'implementation' && attempts >= 3) {
|
|
689
|
+
this._log(`\n${'='.repeat(80)}`);
|
|
690
|
+
this._log(`❌ WORKER AGENT FAILED: ${clusterId}`);
|
|
691
|
+
this._log(`${'='.repeat(80)}`);
|
|
692
|
+
this._log(`Worker agent ${message.sender} failed after ${attempts} attempts`);
|
|
693
|
+
this._log(`Error: ${message.content?.data?.error || 'unknown'}`);
|
|
694
|
+
this._log(`Stopping cluster - worker cannot continue`);
|
|
695
|
+
this._log(`${'='.repeat(80)}\n`);
|
|
696
|
+
|
|
697
|
+
// Auto-stop cluster
|
|
698
|
+
this.stop(clusterId).catch((err) => {
|
|
699
|
+
console.error(`Failed to auto-stop cluster ${clusterId}:`, err.message);
|
|
700
|
+
});
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
});
|
|
704
|
+
|
|
705
|
+
// Watch for stale agent detection (informational only - NEVER kills tasks)
|
|
706
|
+
// CHANGED: Stale detection is informational only - never kills tasks
|
|
707
|
+
messageBus.on('topic:AGENT_LIFECYCLE', (message) => {
|
|
708
|
+
if (message.content?.data?.event !== 'AGENT_STALE_WARNING') return;
|
|
709
|
+
|
|
710
|
+
const agentId = message.content?.data?.agent;
|
|
711
|
+
const timeSinceLastOutput = message.content?.data?.timeSinceLastOutput;
|
|
712
|
+
const analysis = message.content?.data?.analysis || 'No analysis available';
|
|
713
|
+
|
|
714
|
+
this._log(
|
|
715
|
+
`⚠️ Orchestrator: Agent ${agentId} appears stale (${Math.round(timeSinceLastOutput / 1000)}s no output) but will NOT be killed`
|
|
716
|
+
);
|
|
717
|
+
this._log(` Analysis: ${analysis}`);
|
|
718
|
+
this._log(
|
|
719
|
+
` Manual intervention may be needed - use 'zeroshot resume ${clusterId}' if stuck`
|
|
720
|
+
);
|
|
721
|
+
});
|
|
722
|
+
|
|
723
|
+
// Watch for CLUSTER_OPERATIONS - dynamic agent spawn/removal/update
|
|
724
|
+
// Conductor (or any agent) can publish operation chains to modify the cluster
|
|
725
|
+
messageBus.subscribe((message) => {
|
|
726
|
+
if (message.topic === 'CLUSTER_OPERATIONS' && message.cluster_id === clusterId) {
|
|
727
|
+
let operations = message.content?.data?.operations;
|
|
728
|
+
|
|
729
|
+
// Parse operations if they came as a JSON string (template variable serialization)
|
|
730
|
+
if (typeof operations === 'string') {
|
|
731
|
+
try {
|
|
732
|
+
operations = JSON.parse(operations);
|
|
733
|
+
} catch (e) {
|
|
734
|
+
this._log(`⚠️ CLUSTER_OPERATIONS has invalid operations JSON: ${e.message}`);
|
|
735
|
+
return;
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
|
|
739
|
+
if (!operations || !Array.isArray(operations)) {
|
|
740
|
+
this._log(`⚠️ CLUSTER_OPERATIONS missing operations array, ignoring`);
|
|
741
|
+
return;
|
|
742
|
+
}
|
|
743
|
+
|
|
744
|
+
this._log(`\n${'='.repeat(80)}`);
|
|
745
|
+
this._log(`🔧 CLUSTER_OPERATIONS received from ${message.sender}`);
|
|
746
|
+
this._log(`${'='.repeat(80)}`);
|
|
747
|
+
if (message.content?.data?.reasoning) {
|
|
748
|
+
this._log(`Reasoning: ${message.content.data.reasoning}`);
|
|
749
|
+
}
|
|
750
|
+
this._log(`Operations: ${operations.length}`);
|
|
751
|
+
this._log(`${'='.repeat(80)}\n`);
|
|
752
|
+
|
|
753
|
+
// Execute operation chain
|
|
754
|
+
this._handleOperations(clusterId, operations, message.sender, {
|
|
755
|
+
isolationManager,
|
|
756
|
+
containerId,
|
|
757
|
+
}).catch((err) => {
|
|
758
|
+
console.error(`Failed to execute CLUSTER_OPERATIONS:`, err.message);
|
|
759
|
+
// Publish failure for conductor to retry
|
|
760
|
+
messageBus.publish({
|
|
761
|
+
cluster_id: clusterId,
|
|
762
|
+
topic: 'CLUSTER_OPERATIONS_FAILED',
|
|
763
|
+
sender: 'orchestrator',
|
|
764
|
+
content: {
|
|
765
|
+
text: `Operation chain failed: ${err.message}`,
|
|
766
|
+
data: {
|
|
767
|
+
error: err.message,
|
|
768
|
+
operations: operations,
|
|
769
|
+
},
|
|
770
|
+
},
|
|
771
|
+
});
|
|
772
|
+
});
|
|
773
|
+
}
|
|
774
|
+
});
|
|
775
|
+
|
|
776
|
+
// DISABLED: Idle timeout auto-stop mechanism
|
|
777
|
+
// WHY DISABLED: Clusters should only stop on explicit signals:
|
|
778
|
+
// - User `zeroshot kill` command
|
|
779
|
+
// - CLUSTER_COMPLETE message (successful completion)
|
|
780
|
+
// - CLUSTER_FAILED message (failure/abort)
|
|
781
|
+
// Being "idle" is NOT a reason to auto-stop - agents may be legitimately
|
|
782
|
+
// waiting for external events, user input (in interactive mode), or
|
|
783
|
+
// processing that doesn't show as "executing" (e.g., polling, monitoring).
|
|
784
|
+
//
|
|
785
|
+
// Previous behavior: Stopped cluster after 2 minutes of all agents idle
|
|
786
|
+
// Result: Clusters were killed while legitimately waiting, causing confusion
|
|
787
|
+
//
|
|
788
|
+
// cluster.idleCheckInterval = setInterval(() => { ... }, 30000);
|
|
789
|
+
// ^^^^^^ REMOVED - clusters run until explicitly stopped or completed
|
|
790
|
+
|
|
791
|
+
// Save cluster to disk
|
|
792
|
+
this._saveClusters();
|
|
793
|
+
|
|
794
|
+
return {
|
|
795
|
+
id: clusterId,
|
|
796
|
+
state: cluster.state,
|
|
797
|
+
agents: cluster.agents.map((a) => a.getState()),
|
|
798
|
+
ledger: cluster.ledger, // Expose ledger for testing
|
|
799
|
+
messageBus: cluster.messageBus, // Expose messageBus for testing
|
|
800
|
+
};
|
|
801
|
+
} catch (error) {
|
|
802
|
+
cluster.state = 'failed';
|
|
803
|
+
console.error(`Cluster ${clusterId} failed to start:`, error);
|
|
804
|
+
throw error;
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
|
|
808
|
+
/**
|
|
809
|
+
* Stop a cluster
|
|
810
|
+
* @param {String} clusterId - Cluster ID
|
|
811
|
+
*/
|
|
812
|
+
async stop(clusterId) {
|
|
813
|
+
const cluster = this.clusters.get(clusterId);
|
|
814
|
+
if (!cluster) {
|
|
815
|
+
throw new Error(`Cluster ${clusterId} not found`);
|
|
816
|
+
}
|
|
817
|
+
|
|
818
|
+
cluster.state = 'stopping';
|
|
819
|
+
|
|
820
|
+
// Stop all agents (including subclusters which handle their own children)
|
|
821
|
+
for (const agent of cluster.agents) {
|
|
822
|
+
await agent.stop();
|
|
823
|
+
}
|
|
824
|
+
|
|
825
|
+
// Clean up isolation container if enabled
|
|
826
|
+
if (cluster.isolation?.manager) {
|
|
827
|
+
this._log(`[Orchestrator] Cleaning up isolation container for ${clusterId}...`);
|
|
828
|
+
await cluster.isolation.manager.cleanup(clusterId);
|
|
829
|
+
this._log(`[Orchestrator] Container removed`);
|
|
830
|
+
}
|
|
831
|
+
|
|
832
|
+
cluster.state = 'stopped';
|
|
833
|
+
cluster.pid = null; // Clear PID - cluster is no longer running
|
|
834
|
+
this._log(`Cluster ${clusterId} stopped`);
|
|
835
|
+
|
|
836
|
+
// Save updated state
|
|
837
|
+
this._saveClusters();
|
|
838
|
+
}
|
|
839
|
+
|
|
840
|
+
/**
|
|
841
|
+
* Kill a cluster (force stop)
|
|
842
|
+
* @param {String} clusterId - Cluster ID
|
|
843
|
+
*/
|
|
844
|
+
async kill(clusterId) {
|
|
845
|
+
const cluster = this.clusters.get(clusterId);
|
|
846
|
+
if (!cluster) {
|
|
847
|
+
throw new Error(`Cluster ${clusterId} not found`);
|
|
848
|
+
}
|
|
849
|
+
|
|
850
|
+
cluster.state = 'stopping';
|
|
851
|
+
|
|
852
|
+
// Force stop all agents
|
|
853
|
+
for (const agent of cluster.agents) {
|
|
854
|
+
await agent.stop();
|
|
855
|
+
}
|
|
856
|
+
|
|
857
|
+
// Force remove isolation container if enabled
|
|
858
|
+
if (cluster.isolation?.manager) {
|
|
859
|
+
this._log(`[Orchestrator] Force removing isolation container for ${clusterId}...`);
|
|
860
|
+
await cluster.isolation.manager.removeContainer(clusterId, true); // force=true
|
|
861
|
+
this._log(`[Orchestrator] Container removed`);
|
|
862
|
+
}
|
|
863
|
+
|
|
864
|
+
// Close message bus and ledger
|
|
865
|
+
cluster.messageBus.close();
|
|
866
|
+
|
|
867
|
+
cluster.state = 'killed';
|
|
868
|
+
cluster.pid = null; // Clear PID - cluster is no longer running
|
|
869
|
+
// DON'T delete from memory - keep it so it gets saved with 'killed' state
|
|
870
|
+
// this.clusters.delete(clusterId);
|
|
871
|
+
|
|
872
|
+
this._log(`Cluster ${clusterId} killed`);
|
|
873
|
+
|
|
874
|
+
// Save updated state (will be marked as 'killed' in file)
|
|
875
|
+
this._saveClusters();
|
|
876
|
+
|
|
877
|
+
// Now remove from memory after persisting
|
|
878
|
+
this.clusters.delete(clusterId);
|
|
879
|
+
}
|
|
880
|
+
|
|
881
|
+
/**
|
|
882
|
+
* Kill all running clusters
|
|
883
|
+
* @returns {Object} { killed: Array<string>, errors: Array<{id, error}> }
|
|
884
|
+
*/
|
|
885
|
+
async killAll() {
|
|
886
|
+
const results = { killed: [], errors: [] };
|
|
887
|
+
const runningClusters = Array.from(this.clusters.values()).filter(
|
|
888
|
+
(c) => c.state === 'running' || c.state === 'initializing'
|
|
889
|
+
);
|
|
890
|
+
|
|
891
|
+
for (const cluster of runningClusters) {
|
|
892
|
+
try {
|
|
893
|
+
await this.kill(cluster.id);
|
|
894
|
+
results.killed.push(cluster.id);
|
|
895
|
+
} catch (error) {
|
|
896
|
+
results.errors.push({ id: cluster.id, error: error.message });
|
|
897
|
+
}
|
|
898
|
+
}
|
|
899
|
+
|
|
900
|
+
return results;
|
|
901
|
+
}
|
|
902
|
+
|
|
903
|
+
/**
|
|
904
|
+
* Find the last workflow-triggering message in the ledger
|
|
905
|
+
* Workflow triggers indicate cluster state progression (not AGENT_OUTPUT noise)
|
|
906
|
+
* @param {Array} messages - All messages from ledger
|
|
907
|
+
* @returns {Object|null} - Last workflow trigger message or null
|
|
908
|
+
* @private
|
|
909
|
+
*/
|
|
910
|
+
_findLastWorkflowTrigger(messages) {
|
|
911
|
+
for (let i = messages.length - 1; i >= 0; i--) {
|
|
912
|
+
if (WORKFLOW_TRIGGERS.includes(messages[i].topic)) {
|
|
913
|
+
return messages[i];
|
|
914
|
+
}
|
|
915
|
+
}
|
|
916
|
+
return null;
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
/**
|
|
920
|
+
* Resume a stopped cluster from where it left off
|
|
921
|
+
* Handles both failed clusters (with error context) and cleanly stopped clusters
|
|
922
|
+
* @param {String} clusterId - Cluster ID
|
|
923
|
+
* @param {String} prompt - Optional custom resume prompt
|
|
924
|
+
* @returns {Object} Resumed cluster info
|
|
925
|
+
*/
|
|
926
|
+
async resume(clusterId, prompt) {
|
|
927
|
+
const cluster = this.clusters.get(clusterId);
|
|
928
|
+
if (!cluster) {
|
|
929
|
+
throw new Error(`Cluster not found: ${clusterId}`);
|
|
930
|
+
}
|
|
931
|
+
|
|
932
|
+
if (cluster.state === 'running') {
|
|
933
|
+
throw new Error(
|
|
934
|
+
`Cluster ${clusterId} is still running. Use 'zeroshot stop' first if you want to restart it.`
|
|
935
|
+
);
|
|
936
|
+
}
|
|
937
|
+
|
|
938
|
+
// Get failure info - either from saved state or from ledger
|
|
939
|
+
let failureInfo = cluster.failureInfo;
|
|
940
|
+
|
|
941
|
+
if (!failureInfo) {
|
|
942
|
+
// Query ledger for AGENT_ERROR messages to find failed agent
|
|
943
|
+
const errors = cluster.messageBus.query({
|
|
944
|
+
cluster_id: clusterId,
|
|
945
|
+
topic: 'AGENT_ERROR',
|
|
946
|
+
limit: 10,
|
|
947
|
+
});
|
|
948
|
+
|
|
949
|
+
if (errors.length > 0) {
|
|
950
|
+
// Use the first error found
|
|
951
|
+
const firstError = errors[0];
|
|
952
|
+
failureInfo = {
|
|
953
|
+
agentId: firstError.sender,
|
|
954
|
+
taskId: firstError.content?.data?.taskId || null,
|
|
955
|
+
iteration: firstError.content?.data?.iteration || 0,
|
|
956
|
+
error: firstError.content?.data?.error || firstError.content?.text,
|
|
957
|
+
timestamp: firstError.timestamp,
|
|
958
|
+
};
|
|
959
|
+
this._log(`[Orchestrator] Found failure from ledger: ${failureInfo.agentId}`);
|
|
960
|
+
}
|
|
961
|
+
}
|
|
962
|
+
|
|
963
|
+
// CRITICAL: Recreate isolation container if needed
|
|
964
|
+
if (cluster.isolation?.enabled) {
|
|
965
|
+
const { spawn } = require('child_process');
|
|
966
|
+
const oldContainerId = cluster.isolation.containerId;
|
|
967
|
+
|
|
968
|
+
// Check if container exists
|
|
969
|
+
const checkContainer = spawn('docker', ['inspect', oldContainerId], {
|
|
970
|
+
stdio: 'ignore',
|
|
971
|
+
});
|
|
972
|
+
const containerExists = await new Promise((resolve) => {
|
|
973
|
+
checkContainer.on('close', (code) => resolve(code === 0));
|
|
974
|
+
});
|
|
975
|
+
|
|
976
|
+
if (!containerExists) {
|
|
977
|
+
this._log(`[Orchestrator] Container ${oldContainerId} not found, recreating...`);
|
|
978
|
+
|
|
979
|
+
// Create new container
|
|
980
|
+
const newContainerId = await cluster.isolation.manager.createContainer(clusterId, {
|
|
981
|
+
workDir: process.cwd(),
|
|
982
|
+
image: cluster.isolation.image,
|
|
983
|
+
});
|
|
984
|
+
|
|
985
|
+
this._log(`[Orchestrator] New container created: ${newContainerId}`);
|
|
986
|
+
|
|
987
|
+
// Update cluster isolation state
|
|
988
|
+
cluster.isolation.containerId = newContainerId;
|
|
989
|
+
|
|
990
|
+
// CRITICAL: Update all agents' isolation context with new container ID
|
|
991
|
+
for (const agent of cluster.agents) {
|
|
992
|
+
if (agent.isolation?.enabled) {
|
|
993
|
+
agent.isolation.containerId = newContainerId;
|
|
994
|
+
agent.isolation.manager = cluster.isolation.manager;
|
|
995
|
+
}
|
|
996
|
+
}
|
|
997
|
+
|
|
998
|
+
this._log(`[Orchestrator] All agents updated with new container ID`);
|
|
999
|
+
} else {
|
|
1000
|
+
this._log(`[Orchestrator] Container ${oldContainerId} still exists, reusing`);
|
|
1001
|
+
}
|
|
1002
|
+
}
|
|
1003
|
+
|
|
1004
|
+
// Restart all agents
|
|
1005
|
+
cluster.state = 'running';
|
|
1006
|
+
for (const agent of cluster.agents) {
|
|
1007
|
+
if (!agent.running) {
|
|
1008
|
+
await agent.start();
|
|
1009
|
+
}
|
|
1010
|
+
}
|
|
1011
|
+
|
|
1012
|
+
// Query recent messages from ledger to provide context
|
|
1013
|
+
const recentMessages = cluster.messageBus.query({
|
|
1014
|
+
cluster_id: clusterId,
|
|
1015
|
+
limit: 50,
|
|
1016
|
+
});
|
|
1017
|
+
|
|
1018
|
+
// CASE 1: Failed cluster - Resume the failed agent with error context
|
|
1019
|
+
if (failureInfo) {
|
|
1020
|
+
const { agentId, iteration, error } = failureInfo;
|
|
1021
|
+
this._log(
|
|
1022
|
+
`[Orchestrator] Resuming failed cluster ${clusterId} from agent ${agentId} iteration ${iteration}`
|
|
1023
|
+
);
|
|
1024
|
+
this._log(`[Orchestrator] Previous error: ${error}`);
|
|
1025
|
+
|
|
1026
|
+
// Find the failed agent
|
|
1027
|
+
const failedAgent = cluster.agents.find((a) => a.id === agentId);
|
|
1028
|
+
if (!failedAgent) {
|
|
1029
|
+
throw new Error(`Failed agent '${agentId}' not found in cluster`);
|
|
1030
|
+
}
|
|
1031
|
+
|
|
1032
|
+
// Build failure resume context
|
|
1033
|
+
const resumePrompt = prompt || 'Continue from where you left off. Complete the task.';
|
|
1034
|
+
let context = `You are resuming from a previous failed attempt.\n\n`;
|
|
1035
|
+
context += `Previous error: ${error}\n\n`;
|
|
1036
|
+
context += `## Recent Context\n\n`;
|
|
1037
|
+
|
|
1038
|
+
for (const msg of recentMessages.slice(-10)) {
|
|
1039
|
+
if (msg.topic === 'AGENT_OUTPUT' || msg.topic === 'VALIDATION_RESULT') {
|
|
1040
|
+
context += `[${msg.sender}] ${msg.content?.text?.slice(0, 200) || ''}\n`;
|
|
1041
|
+
}
|
|
1042
|
+
}
|
|
1043
|
+
|
|
1044
|
+
context += `\n## Resume Instructions\n\n${resumePrompt}\n`;
|
|
1045
|
+
|
|
1046
|
+
// Clear failure info since we're resuming
|
|
1047
|
+
cluster.failureInfo = null;
|
|
1048
|
+
|
|
1049
|
+
// Save updated state
|
|
1050
|
+
this._saveClusters();
|
|
1051
|
+
|
|
1052
|
+
// Resume the failed agent
|
|
1053
|
+
failedAgent.resume(context).catch((err) => {
|
|
1054
|
+
console.error(`[Orchestrator] Resume failed for agent ${agentId}:`, err.message);
|
|
1055
|
+
});
|
|
1056
|
+
|
|
1057
|
+
this._log(`[Orchestrator] Cluster ${clusterId} resumed from failure`);
|
|
1058
|
+
|
|
1059
|
+
return {
|
|
1060
|
+
id: clusterId,
|
|
1061
|
+
state: cluster.state,
|
|
1062
|
+
resumeType: 'failure',
|
|
1063
|
+
resumedAgent: agentId,
|
|
1064
|
+
previousError: error,
|
|
1065
|
+
};
|
|
1066
|
+
}
|
|
1067
|
+
|
|
1068
|
+
// CASE 2: Cleanly stopped cluster - Resume by re-triggering based on ledger state
|
|
1069
|
+
this._log(`[Orchestrator] Resuming stopped cluster ${clusterId} (no failure)`);
|
|
1070
|
+
|
|
1071
|
+
// Build generic resume context
|
|
1072
|
+
const resumePrompt = prompt || 'Continue from where you left off. Complete the task.';
|
|
1073
|
+
let context = `Resuming cluster from previous session.\n\n`;
|
|
1074
|
+
context += `## Recent Context\n\n`;
|
|
1075
|
+
|
|
1076
|
+
for (const msg of recentMessages.slice(-10)) {
|
|
1077
|
+
if (
|
|
1078
|
+
msg.topic === 'AGENT_OUTPUT' ||
|
|
1079
|
+
msg.topic === 'VALIDATION_RESULT' ||
|
|
1080
|
+
msg.topic === 'ISSUE_OPENED'
|
|
1081
|
+
) {
|
|
1082
|
+
context += `[${msg.sender}] ${msg.content?.text?.slice(0, 200) || ''}\n`;
|
|
1083
|
+
}
|
|
1084
|
+
}
|
|
1085
|
+
|
|
1086
|
+
context += `\n## Resume Instructions\n\n${resumePrompt}\n`;
|
|
1087
|
+
|
|
1088
|
+
// Find the LAST workflow trigger - not arbitrary last 5 messages
|
|
1089
|
+
// This is the message that indicates current workflow state
|
|
1090
|
+
const lastTrigger = this._findLastWorkflowTrigger(recentMessages);
|
|
1091
|
+
const agentsToResume = [];
|
|
1092
|
+
|
|
1093
|
+
if (lastTrigger) {
|
|
1094
|
+
this._log(
|
|
1095
|
+
`[Orchestrator] Last workflow trigger: ${lastTrigger.topic} (${new Date(lastTrigger.timestamp).toISOString()})`
|
|
1096
|
+
);
|
|
1097
|
+
|
|
1098
|
+
for (const agent of cluster.agents) {
|
|
1099
|
+
if (!agent.config.triggers) continue;
|
|
1100
|
+
|
|
1101
|
+
const matchingTrigger = agent.config.triggers.find((trigger) => {
|
|
1102
|
+
// Exact match
|
|
1103
|
+
if (trigger.topic === lastTrigger.topic) return true;
|
|
1104
|
+
// Wildcard match
|
|
1105
|
+
if (trigger.topic === '*') return true;
|
|
1106
|
+
// Prefix match (e.g., "VALIDATION_*")
|
|
1107
|
+
if (trigger.topic.endsWith('*')) {
|
|
1108
|
+
const prefix = trigger.topic.slice(0, -1);
|
|
1109
|
+
return lastTrigger.topic.startsWith(prefix);
|
|
1110
|
+
}
|
|
1111
|
+
return false;
|
|
1112
|
+
});
|
|
1113
|
+
|
|
1114
|
+
if (matchingTrigger) {
|
|
1115
|
+
// Evaluate logic script if present
|
|
1116
|
+
if (matchingTrigger.logic?.script) {
|
|
1117
|
+
const shouldTrigger = agent._evaluateTrigger(matchingTrigger, lastTrigger);
|
|
1118
|
+
if (!shouldTrigger) continue;
|
|
1119
|
+
}
|
|
1120
|
+
agentsToResume.push({ agent, message: lastTrigger, trigger: matchingTrigger });
|
|
1121
|
+
}
|
|
1122
|
+
}
|
|
1123
|
+
} else {
|
|
1124
|
+
this._log(`[Orchestrator] No workflow triggers found in ledger`);
|
|
1125
|
+
}
|
|
1126
|
+
|
|
1127
|
+
if (agentsToResume.length === 0) {
|
|
1128
|
+
if (!lastTrigger) {
|
|
1129
|
+
// No workflow activity - cluster never really started
|
|
1130
|
+
this._log(
|
|
1131
|
+
`[Orchestrator] WARNING: No workflow triggers in ledger. Cluster may not have started properly.`
|
|
1132
|
+
);
|
|
1133
|
+
this._log(`[Orchestrator] Publishing ISSUE_OPENED to bootstrap workflow...`);
|
|
1134
|
+
|
|
1135
|
+
// Re-publish the original issue if we have it
|
|
1136
|
+
const issueMessage = recentMessages.find((m) => m.topic === 'ISSUE_OPENED');
|
|
1137
|
+
if (issueMessage) {
|
|
1138
|
+
cluster.messageBus.publish({
|
|
1139
|
+
cluster_id: clusterId,
|
|
1140
|
+
topic: 'ISSUE_OPENED',
|
|
1141
|
+
sender: 'system',
|
|
1142
|
+
receiver: 'broadcast',
|
|
1143
|
+
content: issueMessage.content,
|
|
1144
|
+
metadata: { _resumed: true, _originalId: issueMessage.id },
|
|
1145
|
+
});
|
|
1146
|
+
} else {
|
|
1147
|
+
throw new Error(
|
|
1148
|
+
`Cannot resume cluster ${clusterId}: No workflow triggers found and no ISSUE_OPENED message. ` +
|
|
1149
|
+
`The cluster may not have started properly. Try: zeroshot run <issue> instead.`
|
|
1150
|
+
);
|
|
1151
|
+
}
|
|
1152
|
+
} else {
|
|
1153
|
+
// Had a trigger but no agents matched - something is wrong with agent configs
|
|
1154
|
+
throw new Error(
|
|
1155
|
+
`Cannot resume cluster ${clusterId}: Found trigger ${lastTrigger.topic} but no agents handle it. ` +
|
|
1156
|
+
`Check agent trigger configurations.`
|
|
1157
|
+
);
|
|
1158
|
+
}
|
|
1159
|
+
} else {
|
|
1160
|
+
// Resume agents that should run based on ledger state
|
|
1161
|
+
this._log(`[Orchestrator] Resuming ${agentsToResume.length} agent(s) based on ledger state`);
|
|
1162
|
+
for (const { agent, message } of agentsToResume) {
|
|
1163
|
+
this._log(`[Orchestrator] - Resuming agent ${agent.id} (triggered by ${message.topic})`);
|
|
1164
|
+
agent.resume(context).catch((err) => {
|
|
1165
|
+
console.error(`[Orchestrator] Resume failed for agent ${agent.id}:`, err.message);
|
|
1166
|
+
});
|
|
1167
|
+
}
|
|
1168
|
+
}
|
|
1169
|
+
|
|
1170
|
+
// Save updated state
|
|
1171
|
+
this._saveClusters();
|
|
1172
|
+
|
|
1173
|
+
this._log(`[Orchestrator] Cluster ${clusterId} resumed`);
|
|
1174
|
+
|
|
1175
|
+
return {
|
|
1176
|
+
id: clusterId,
|
|
1177
|
+
state: cluster.state,
|
|
1178
|
+
resumeType: 'clean',
|
|
1179
|
+
resumedAgents: agentsToResume.map((a) => a.agent.id),
|
|
1180
|
+
};
|
|
1181
|
+
}
|
|
1182
|
+
|
|
1183
|
+
/**
|
|
1184
|
+
* Force restart a stale agent with imperative prompt injection
|
|
1185
|
+
* @param {string} clusterId - Cluster ID
|
|
1186
|
+
* @param {string} agentId - Agent to restart
|
|
1187
|
+
* @param {number} staleDuration - How long agent was stale (ms)
|
|
1188
|
+
* @private
|
|
1189
|
+
*/
|
|
1190
|
+
async _forceRestartAgent(clusterId, agentId, staleDuration) {
|
|
1191
|
+
const cluster = this.clusters.get(clusterId);
|
|
1192
|
+
if (!cluster) {
|
|
1193
|
+
throw new Error(`Cluster ${clusterId} not found`);
|
|
1194
|
+
}
|
|
1195
|
+
|
|
1196
|
+
const agent = cluster.agents.find((a) => a.id === agentId);
|
|
1197
|
+
if (!agent) {
|
|
1198
|
+
throw new Error(`Agent ${agentId} not found in cluster ${clusterId}`);
|
|
1199
|
+
}
|
|
1200
|
+
|
|
1201
|
+
// Kill current task
|
|
1202
|
+
try {
|
|
1203
|
+
agent._killTask();
|
|
1204
|
+
} catch (err) {
|
|
1205
|
+
this._log(`⚠️ Failed to kill agent ${agentId} task:`, err.message);
|
|
1206
|
+
}
|
|
1207
|
+
|
|
1208
|
+
// Build imperative restart context
|
|
1209
|
+
const staleMinutes = Math.round(staleDuration / 60000);
|
|
1210
|
+
const imperativePrompt = `
|
|
1211
|
+
🔴 CRITICAL: Your previous session STOPPED PRODUCING OUTPUT for ${staleMinutes} minutes and was detected as STUCK.
|
|
1212
|
+
|
|
1213
|
+
## What Happened
|
|
1214
|
+
- Last output timestamp: ${new Date(Date.now() - staleDuration).toISOString()}
|
|
1215
|
+
- Detected as stale after ${staleMinutes} minutes of silence
|
|
1216
|
+
- Process was forcefully restarted
|
|
1217
|
+
|
|
1218
|
+
## Your Instructions
|
|
1219
|
+
You MUST complete your current task. DO NOT STOP until you either:
|
|
1220
|
+
1. Successfully complete the task and publish your completion message, OR
|
|
1221
|
+
2. Explicitly state WHY you cannot complete the task (missing files, impossible requirements, etc.)
|
|
1222
|
+
|
|
1223
|
+
If you discovered that files you need to modify don't exist:
|
|
1224
|
+
- CREATE them from scratch with the expected implementation
|
|
1225
|
+
- DO NOT silently give up
|
|
1226
|
+
- DO NOT stop working without explicit explanation
|
|
1227
|
+
|
|
1228
|
+
If you are stuck in an impossible situation:
|
|
1229
|
+
- EXPLAIN the problem clearly
|
|
1230
|
+
- PROPOSE alternative solutions
|
|
1231
|
+
- WAIT for guidance - do not exit
|
|
1232
|
+
|
|
1233
|
+
## Resume Your Work
|
|
1234
|
+
Continue from where you left off. Review your previous output to understand what you were working on.
|
|
1235
|
+
`.trim();
|
|
1236
|
+
|
|
1237
|
+
// Get recent context from ledger
|
|
1238
|
+
const recentMessages = cluster.messageBus.query({
|
|
1239
|
+
cluster_id: cluster.id,
|
|
1240
|
+
limit: 10,
|
|
1241
|
+
});
|
|
1242
|
+
|
|
1243
|
+
const contextText = recentMessages
|
|
1244
|
+
.map((m) => `[${m.sender}] ${m.content?.text || JSON.stringify(m.content)}`)
|
|
1245
|
+
.join('\n\n');
|
|
1246
|
+
|
|
1247
|
+
const fullContext = `${imperativePrompt}\n\n## Recent Context\n${contextText}`;
|
|
1248
|
+
|
|
1249
|
+
// Resume agent with imperative prompt
|
|
1250
|
+
this._log(
|
|
1251
|
+
`🔄 Restarting agent ${agentId} with imperative prompt (${imperativePrompt.length} chars)`
|
|
1252
|
+
);
|
|
1253
|
+
|
|
1254
|
+
try {
|
|
1255
|
+
await agent.resume(fullContext);
|
|
1256
|
+
this._log(`✅ Agent ${agentId} successfully restarted`);
|
|
1257
|
+
} catch (err) {
|
|
1258
|
+
this._log(`❌ Failed to resume agent ${agentId}:`, err.message);
|
|
1259
|
+
throw err;
|
|
1260
|
+
}
|
|
1261
|
+
}

  /**
   * Handle operation chain from CLUSTER_OPERATIONS message
   * Executes operations sequentially: add_agents, remove_agents, update_agent, publish
   *
   * Validation strategy:
   * 1. Pre-validate all agent configs before executing any operations
   * 2. Build a mock cluster config with proposed changes
   * 3. Run config-validator on the mock to catch structural issues
   * 4. Only execute operations if validation passes
   *
   * @param {string} clusterId - Cluster ID
   * @param {Array} operations - Array of operation objects
   * @param {string} sender - Who sent the operations (for attribution)
   * @param {Object} context - Isolation context { isolationManager, containerId }
   * @private
   */
  async _handleOperations(clusterId, operations, sender, context = {}) {
    const cluster = this.clusters.get(clusterId);
    if (!cluster) {
      throw new Error(`Cluster ${clusterId} not found`);
    }

    this._log(`[Orchestrator] Validating ${operations.length} operation(s) from ${sender}`);

    // Phase 1: Pre-validate operation structure
    const validationErrors = [];
    for (let i = 0; i < operations.length; i++) {
      const op = operations[i];
      if (!op.action) {
        validationErrors.push(`Operation ${i}: missing 'action' field`);
        continue;
      }
      if (!VALID_OPERATIONS.includes(op.action)) {
        validationErrors.push(
          `Operation ${i}: unknown action '${op.action}'. Valid: ${VALID_OPERATIONS.join(', ')}`
        );
      }
    }

    if (validationErrors.length > 0) {
      const errorMsg = `Operation chain validation failed:\n - ${validationErrors.join('\n - ')}`;
      this._log(`[Orchestrator] ❌ ${errorMsg}`);
      throw new Error(errorMsg);
    }

    // Phase 2: Build mock cluster config with proposed agents
    // Collect all agents that would exist after operations complete
    const existingAgentConfigs = cluster.config.agents || [];
    const proposedAgentConfigs = [...existingAgentConfigs];

    for (const op of operations) {
      if (op.action === 'add_agents' && op.agents) {
        for (const agentConfig of op.agents) {
          // Check for duplicate before adding
          const existingIdx = proposedAgentConfigs.findIndex((a) => a.id === agentConfig.id);
          if (existingIdx === -1) {
            proposedAgentConfigs.push(agentConfig);
          }
        }
      } else if (op.action === 'remove_agents' && op.agentIds) {
        for (const agentId of op.agentIds) {
          const idx = proposedAgentConfigs.findIndex((a) => a.id === agentId);
          if (idx !== -1) {
            proposedAgentConfigs.splice(idx, 1);
          }
        }
      } else if (op.action === 'update_agent' && op.agentId && op.updates) {
        const agentConfig = proposedAgentConfigs.find((a) => a.id === op.agentId);
        if (agentConfig) {
          Object.assign(agentConfig, op.updates);
        }
      }
    }

    // Phase 3: Validate proposed cluster config
    const mockConfig = { agents: proposedAgentConfigs };
    const validation = configValidator.validateConfig(mockConfig);

    if (!validation.valid) {
      const errorMsg = `Proposed cluster configuration is invalid:\n - ${validation.errors.join('\n - ')}`;
      this._log(`[Orchestrator] ❌ ${errorMsg}`);

      // Publish validation failure for conductor to see and retry
      cluster.messageBus.publish({
        cluster_id: clusterId,
        topic: 'CLUSTER_OPERATIONS_VALIDATION_FAILED',
        sender: 'orchestrator',
        content: {
          text: 'Operation chain would create invalid cluster configuration',
          data: {
            errors: validation.errors,
            warnings: validation.warnings,
            operations: operations,
          },
        },
      });

      throw new Error(errorMsg);
    }

    // Log warnings but proceed
    if (validation.warnings.length > 0) {
      this._log(`[Orchestrator] ⚠️ Warnings (proceeding anyway):`);
      for (const warning of validation.warnings) {
        this._log(`  - ${warning}`);
      }
    }

    // Phase 4: Execute validated operations
    this._log(`[Orchestrator] ✓ Validation passed, executing ${operations.length} operation(s)`);

    for (let i = 0; i < operations.length; i++) {
      const op = operations[i];
      this._log(` [${i + 1}/${operations.length}] ${op.action}`);

      switch (op.action) {
        case 'add_agents':
          await this._opAddAgents(cluster, op, context);
          break;

        case 'remove_agents':
          await this._opRemoveAgents(cluster, op);
          break;

        case 'update_agent':
          this._opUpdateAgent(cluster, op);
          break;

        case 'publish':
          this._opPublish(cluster, op, sender);
          break;

        case 'load_config':
          await this._opLoadConfig(cluster, op, context);
          break;
      }
    }

    this._log(`[Orchestrator] All ${operations.length} operation(s) executed successfully`);

    // Publish success notification
    cluster.messageBus.publish({
      cluster_id: clusterId,
      topic: 'CLUSTER_OPERATIONS_SUCCESS',
      sender: 'orchestrator',
      content: {
        text: `Executed ${operations.length} operation(s)`,
        data: {
          operationCount: operations.length,
          agentCount: cluster.agents.length,
        },
      },
    });

    // Save updated cluster state to disk
    this._saveClusters();
  }
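
  // Illustrative operations payload for _handleOperations() above. The agent ids,
  // role values, topic name, and the `orchestrator` instance are hypothetical
  // placeholders; only the 'action' keys and field names come from the handlers below.
  //
  //   await orchestrator._handleOperations(clusterId, [
  //     { action: 'add_agents', agents: [{ id: 'reviewer-1', role: 'validator' }] },
  //     { action: 'update_agent', agentId: 'worker-1', updates: { role: 'lead-worker' } },
  //     { action: 'remove_agents', agentIds: ['scout-1'] },
  //     { action: 'publish', topic: 'REVIEW_REQUESTED', content: { text: 'Please review' } },
  //   ], 'conductor');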

  /**
   * Operation: add_agents - Spawn new agents dynamically
   * @private
   */
  async _opAddAgents(cluster, op, context) {
    const agents = op.agents;
    if (!agents || !Array.isArray(agents)) {
      throw new Error('add_agents operation missing agents array');
    }

    for (const agentConfig of agents) {
      // Validate agent config has required fields
      if (!agentConfig.id) {
        throw new Error('Agent config missing required field: id');
      }

      // Check for duplicate agent ID
      const existingAgent = cluster.agents.find((a) => a.id === agentConfig.id);
      if (existingAgent) {
        this._log(` ⚠️ Agent ${agentConfig.id} already exists, skipping`);
        continue;
      }

      // Add to config agents array (for persistence)
      if (!cluster.config.agents) {
        cluster.config.agents = [];
      }
      cluster.config.agents.push(agentConfig);

      // Build agent options
      const agentOptions = {
        testMode: false,
        quiet: this.quiet,
      };

      // Pass isolation context if cluster is running in isolation mode
      if (cluster.isolation?.enabled && context.isolationManager) {
        agentOptions.isolation = {
          enabled: true,
          manager: context.isolationManager,
          clusterId: cluster.id,
        };
      }

      // Create and start agent
      const agent = new AgentWrapper(agentConfig, cluster.messageBus, cluster, agentOptions);
      cluster.agents.push(agent);
      await agent.start();

      this._log(
        ` ✓ Added agent: ${agentConfig.id} (role: ${agentConfig.role || 'unspecified'})`
      );
    }
  }
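
  // Sketch of a minimal agent config accepted by _opAddAgents(). Only 'id' is
  // enforced as required here, and 'role' is read for logging; any other fields
  // are schema-dependent, and the values shown are hypothetical.
  //
  //   { id: 'worker-2', role: 'worker' }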

  /**
   * Operation: remove_agents - Stop and remove agents by ID
   * @private
   */
  async _opRemoveAgents(cluster, op) {
    const agentIds = op.agentIds;
    if (!agentIds || !Array.isArray(agentIds)) {
      throw new Error('remove_agents operation missing agentIds array');
    }

    for (const agentId of agentIds) {
      const agentIndex = cluster.agents.findIndex((a) => a.id === agentId);
      if (agentIndex === -1) {
        this._log(` ⚠️ Agent ${agentId} not found, skipping removal`);
        continue;
      }

      const agent = cluster.agents[agentIndex];
      await agent.stop();

      // Remove from cluster.agents
      cluster.agents.splice(agentIndex, 1);

      // Remove from config.agents
      if (cluster.config.agents) {
        const configIndex = cluster.config.agents.findIndex((a) => a.id === agentId);
        if (configIndex !== -1) {
          cluster.config.agents.splice(configIndex, 1);
        }
      }

      this._log(` ✓ Removed agent: ${agentId}`);
    }
  }

  /**
   * Operation: update_agent - Modify existing agent config at runtime
   * Note: Some updates may require agent restart to take effect
   * @private
   */
  _opUpdateAgent(cluster, op) {
    const { agentId, updates } = op;
    if (!agentId) {
      throw new Error('update_agent operation missing agentId');
    }
    if (!updates || typeof updates !== 'object') {
      throw new Error('update_agent operation missing updates object');
    }

    const agent = cluster.agents.find((a) => a.id === agentId);
    if (!agent) {
      throw new Error(`update_agent: Agent ${agentId} not found`);
    }

    // Apply updates to agent config
    Object.assign(agent.config, updates);

    // Also update in cluster.config.agents for persistence
    if (cluster.config.agents) {
      const configAgent = cluster.config.agents.find((a) => a.id === agentId);
      if (configAgent) {
        Object.assign(configAgent, updates);
      }
    }

    this._log(` ✓ Updated agent: ${agentId} (fields: ${Object.keys(updates).join(', ')})`);
  }

  /**
   * Operation: publish - Publish a message to the bus
   * @private
   */
  _opPublish(cluster, op, sender) {
    const { topic, content } = op;
    if (!topic) {
      throw new Error('publish operation missing topic');
    }

    cluster.messageBus.publish({
      cluster_id: cluster.id,
      topic,
      sender: op.sender || sender,
      receiver: op.receiver || 'broadcast',
      content: content || {},
    });

    this._log(` ✓ Published to topic: ${topic}`);
  }

  /**
   * Operation: load_config - Load agents from a cluster config
   *
   * Supports two formats:
   * 1. Static config: { config: 'config-name' } - loads from cluster-templates/config-name.json
   * 2. Parameterized: { config: { base: 'template-name', params: {...} } } - resolves base template with params
   *
   * @private
   */
  async _opLoadConfig(cluster, op, context) {
    const { config } = op;
    if (!config) {
      throw new Error('load_config operation missing config');
    }

    const templatesDir = path.join(__dirname, '..', 'cluster-templates');
    let loadedConfig;

    // Check if config is parameterized ({ base, params }) or static (string)
    if (typeof config === 'object' && config.base) {
      // Parameterized template - resolve with TemplateResolver
      const { base, params } = config;
      this._log(` Loading parameterized template: ${base}`);
      this._log(` Params: ${JSON.stringify(params)}`);

      const resolver = new TemplateResolver(templatesDir);
      loadedConfig = resolver.resolve(base, params);

      this._log(` ✓ Resolved template: ${base} → ${loadedConfig.agents?.length || 0} agent(s)`);
    } else if (typeof config === 'string') {
      // Static config - load directly from file
      const configPath = path.join(templatesDir, `${config}.json`);

      if (!fs.existsSync(configPath)) {
        throw new Error(`Config not found: ${config} (looked in ${configPath})`);
      }

      this._log(` Loading static config: ${config}`);

      const configContent = fs.readFileSync(configPath, 'utf8');
      loadedConfig = JSON.parse(configContent);
    } else {
      throw new Error(
        `Invalid config format: expected string or {base, params}, got ${typeof config}`
      );
    }

    if (!loadedConfig.agents || !Array.isArray(loadedConfig.agents)) {
      throw new Error(`Config has no agents array`);
    }

    this._log(` Found ${loadedConfig.agents.length} agent(s)`);

    // Add agents from loaded config (reuse existing add_agents logic)
    await this._opAddAgents(cluster, { agents: loadedConfig.agents }, context);

    this._log(` ✓ Config loaded (${loadedConfig.agents.length} agents)`);
  }
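
  // The two accepted shapes for a load_config operation, as handled above.
  // The template names and params shown are illustrative and not guaranteed
  // to exist under cluster-templates/.
  //
  //   { action: 'load_config', config: 'debug-workflow' }
  //   { action: 'load_config', config: { base: 'worker-validator', params: { workerId: 'worker-2' } } }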

  /**
   * Check if a process with given PID is running
   * @param {Number} pid - Process ID
   * @returns {Boolean} True if process exists
   * @private
   */
  _isProcessRunning(pid) {
    if (!pid) return false;
    try {
      // Signal 0 doesn't kill, just checks if process exists
      process.kill(pid, 0);
      return true;
    } catch (e) {
      // ESRCH = No such process, EPERM = process exists but no permission
      return e.code === 'EPERM';
    }
  }
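
  // Note: process.kill(pid, 0) sends no signal; it only performs the existence and
  // permission check. For example, _isProcessRunning(process.pid) returns true for
  // the current process, while a PID that no longer exists makes process.kill throw
  // ESRCH and the method returns false.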

  /**
   * Get cluster status
   * @param {String} clusterId - Cluster ID
   * @returns {Object} Cluster status
   */
  getStatus(clusterId) {
    const cluster = this.clusters.get(clusterId);
    if (!cluster) {
      throw new Error(`Cluster ${clusterId} not found`);
    }

    // Detect zombie clusters: state=running but no backing process
    let state = cluster.state;
    let isZombie = false;
    if (state === 'running') {
      if (cluster.pid) {
        // PID recorded - check if process is running
        if (!this._isProcessRunning(cluster.pid)) {
          state = 'zombie';
          isZombie = true;
          this._log(
            `[Orchestrator] Detected zombie cluster ${clusterId} (PID ${cluster.pid} not running)`
          );
        }
      } else {
        // No PID recorded (legacy cluster or bug) - definitely a zombie
        // New code always records PID for running clusters
        state = 'zombie';
        isZombie = true;
        this._log(
          `[Orchestrator] Detected zombie cluster ${clusterId} (no PID recorded - legacy or killed cluster)`
        );
      }
    }

    return {
      id: clusterId,
      state: state,
      isZombie: isZombie,
      pid: cluster.pid || null,
      createdAt: cluster.createdAt,
      agents: cluster.agents.map((a) => a.getState()),
      messageCount: cluster.messageBus.count({ cluster_id: clusterId }),
    };
  }

  /**
   * List all clusters
   * @returns {Array} List of cluster summaries
   */
  listClusters() {
    return Array.from(this.clusters.values()).map((cluster) => {
      // Detect zombie clusters (state=running but no backing process)
      let state = cluster.state;
      if (state === 'running') {
        if (cluster.pid) {
          if (!this._isProcessRunning(cluster.pid)) {
            state = 'zombie';
          }
        } else {
          // No PID recorded - definitely a zombie
          state = 'zombie';
        }
      }

      return {
        id: cluster.id,
        state: state,
        createdAt: cluster.createdAt,
        agentCount: cluster.agents.length,
        messageCount: cluster.messageBus.getAll(cluster.id).length,
      };
    });
  }

  /**
   * Get cluster object (for internal use)
   * @param {String} clusterId - Cluster ID
   * @returns {Object} Full cluster object
   */
  getCluster(clusterId) {
    return this.clusters.get(clusterId);
  }

  /**
   * Export cluster conversation
   * @param {String} clusterId - Cluster ID
   * @param {String} format - Export format ('json' or 'markdown')
   * @returns {String} Exported data
   */
  export(clusterId, format = 'json') {
    const cluster = this.clusters.get(clusterId);
    if (!cluster) {
      throw new Error(`Cluster ${clusterId} not found`);
    }

    const messages = cluster.messageBus.getAll(clusterId);

    if (format === 'json') {
      return JSON.stringify(
        {
          cluster_id: clusterId,
          state: cluster.state,
          created_at: cluster.createdAt,
          agents: cluster.agents.map((a) => a.getState()),
          messages,
        },
        null,
        2
      );
    } else if (format === 'markdown') {
      return this._exportMarkdown(cluster, clusterId, messages);
    } else {
      throw new Error(`Unknown export format: ${format}`);
    }
  }
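
  // Example usage (hypothetical cluster id, output path, and orchestrator instance):
  //
  //   const md = orchestrator.export('cluster-abc123', 'markdown');
  //   fs.writeFileSync('/tmp/cluster-abc123.md', md);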

  /**
   * Export cluster as nicely formatted markdown
   * @private
   */
  _exportMarkdown(cluster, clusterId, messages) {
    const { parseChunk } = require('../lib/stream-json-parser');

    // Find task info
    const issueOpened = messages.find((m) => m.topic === 'ISSUE_OPENED');
    const taskText = issueOpened?.content?.text || 'Unknown task';

    // Calculate duration
    const firstMsg = messages[0];
    const lastMsg = messages[messages.length - 1];
    const durationMs = lastMsg ? lastMsg.timestamp - firstMsg.timestamp : 0;
    const durationMin = Math.round(durationMs / 60000);

    // Header
    let md = `# Cluster: ${clusterId}\n\n`;
    md += `| Property | Value |\n|----------|-------|\n`;
    md += `| State | ${cluster.state} |\n`;
    md += `| Created | ${new Date(cluster.createdAt).toLocaleString()} |\n`;
    md += `| Duration | ${durationMin} minutes |\n`;
    md += `| Agents | ${cluster.agents.map((a) => a.id).join(', ')} |\n\n`;

    // Task
    md += `## Task\n\n${taskText}\n\n`;

    // Group messages by agent for cleaner output
    const agentOutputs = new Map();

    for (const msg of messages) {
      if (msg.topic === 'AGENT_OUTPUT') {
        if (!agentOutputs.has(msg.sender)) {
          agentOutputs.set(msg.sender, []);
        }
        agentOutputs.get(msg.sender).push(msg);
      }
    }

    // Agent sections
    for (const [agentId, agentMsgs] of agentOutputs) {
      md += `## Agent: ${agentId}\n\n`;

      let text = '';
      let tools = [];

      for (const msg of agentMsgs) {
        const content = msg.content?.data?.line || msg.content?.data?.chunk || msg.content?.text;
        if (!content) continue;

        const events = parseChunk(content);
        for (const event of events) {
          switch (event.type) {
            case 'text':
              text += event.text;
              break;
            case 'tool_call':
              tools.push({ name: event.toolName, input: event.input });
              break;
            case 'tool_result':
              if (tools.length > 0) {
                const lastTool = tools[tools.length - 1];
                lastTool.result = event.content;
                lastTool.isError = event.isError;
              }
              break;
          }
        }
      }

      // Output text
      if (text.trim()) {
        md += `### Output\n\n${text.trim()}\n\n`;
      }

      // Tools used
      if (tools.length > 0) {
        md += `### Tools Used\n\n`;
        for (const tool of tools) {
          const status = tool.isError ? '❌' : '✓';
          md += `- **${tool.name}** ${status}\n`;
          if (tool.input) {
            const inputStr =
              typeof tool.input === 'string' ? tool.input : JSON.stringify(tool.input);
            if (inputStr.length < 100) {
              md += `  - Input: \`${inputStr}\`\n`;
            }
          }
        }
        md += '\n';
      }
    }

    // Validation results
    const validations = messages.filter((m) => m.topic === 'VALIDATION_RESULT');
    if (validations.length > 0) {
      md += `## Validation Results\n\n`;
      for (const v of validations) {
        const data = v.content?.data || {};
        const approved = data.approved === true || data.approved === 'true';
        const icon = approved ? '✅' : '❌';
        md += `### ${v.sender} ${icon}\n\n`;
        if (data.summary) {
          md += `${data.summary}\n\n`;
        }
        if (!approved && data.issues) {
          const issues = typeof data.issues === 'string' ? JSON.parse(data.issues) : data.issues;
          if (Array.isArray(issues) && issues.length > 0) {
            md += `**Issues:**\n`;
            for (const issue of issues) {
              md += `- ${issue}\n`;
            }
            md += '\n';
          }
        }
      }
    }

    // Final status
    const clusterComplete = messages.find((m) => m.topic === 'CLUSTER_COMPLETE');
    if (clusterComplete) {
      md += `## Result\n\n✅ **Cluster completed successfully**\n`;
    }

    return md;
  }

  /**
   * Validate cluster configuration (delegates to config-validator module)
   * @param {Object} config - Cluster configuration
   * @param {Object} options - Validation options
   * @param {boolean} options.strict - Treat warnings as errors (default: false)
   * @returns {Object} { valid: Boolean, errors: Array, warnings: Array }
   */
  validateConfig(config, options = {}) {
    const result = configValidator.validateConfig(config);

    // In strict mode, warnings become errors
    if (options.strict && result.warnings.length > 0) {
      result.errors.push(...result.warnings.map((w) => `[strict] ${w}`));
      result.valid = false;
    }

    return result;
  }
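
  // Example (hypothetical config object and orchestrator instance): strict mode
  // promotes warnings to errors, so a config that only produces warnings still
  // fails validation.
  //
  //   const result = orchestrator.validateConfig(config, { strict: true });
  //   if (!result.valid) console.error(result.errors.join('\n'));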

  /**
   * Load cluster configuration from file
   * @param {String} configPath - Path to config JSON file
   * @param {Object} options - Load options
   * @param {boolean} options.strict - Treat warnings as errors
   * @returns {Object} Parsed configuration
   */
  loadConfig(configPath, options = {}) {
    const fullPath = path.resolve(configPath);
    const content = fs.readFileSync(fullPath, 'utf8');
    const config = JSON.parse(content);

    const validation = this.validateConfig(config, options);

    // Show warnings (but don't fail unless strict mode)
    if (validation.warnings && validation.warnings.length > 0 && !this.quiet) {
      console.warn('\n⚠️ Configuration warnings:');
      for (const warning of validation.warnings) {
        console.warn(`  ${warning}`);
      }
      console.warn('');
    }

    if (!validation.valid) {
      const errorMsg = validation.errors.join('\n  ');
      throw new Error(`Invalid config:\n  ${errorMsg}`);
    }

    return config;
  }
}

module.exports = Orchestrator;