@covibes/zeroshot 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,6 +6,10 @@
6
6
 
7
7
  FROM node:20-slim
8
8
 
9
+ # Upgrade npm to fix Arborist isDescendantOf bug (npm 10.x crash on complex peer deps)
10
+ # See: https://github.com/npm/cli/issues/7682
11
+ RUN npm install -g npm@latest
12
+
9
13
  # Version pinning for infrastructure tools
10
14
  ARG AWS_CLI_VERSION=2.15.10
11
15
  ARG TERRAFORM_VERSION=1.6.6
@@ -19,6 +23,9 @@ ARG TFSEC_VERSION=1.28.4
19
23
  RUN apt-get update && apt-get install -y --no-install-recommends \
20
24
  git \
21
25
  curl \
26
+ # Build tools for native modules (node-gyp needs make, gcc, g++)
27
+ build-essential \
28
+ python3-dev \
22
29
  ca-certificates \
23
30
  gnupg \
24
31
  unzip \
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@covibes/zeroshot",
3
- "version": "1.3.0",
3
+ "version": "1.5.0",
4
4
  "description": "Multi-agent orchestration engine for Claude - cluster coordinator and CLI",
5
5
  "main": "src/orchestrator.js",
6
6
  "bin": {
@@ -24,6 +24,7 @@
24
24
  "deadcode:files": "unimported",
25
25
  "deadcode:deps": "depcheck",
26
26
  "deadcode:all": "npm run deadcode && npm run deadcode:files && npm run deadcode:deps",
27
+ "dupcheck": "jscpd src/ --min-lines 5 --min-tokens 50 --threshold 5",
27
28
  "check": "npm run typecheck && npm run lint",
28
29
  "check:all": "npm run check && npm run deadcode:all",
29
30
  "release": "semantic-release",
@@ -111,6 +112,7 @@
111
112
  "eslint-config-prettier": "^10.1.8",
112
113
  "eslint-plugin-unused-imports": "^4.3.0",
113
114
  "husky": "^9.1.7",
115
+ "jscpd": "^3.5.10",
114
116
  "mocha": "^11.7.5",
115
117
  "semantic-release": "^25.0.2",
116
118
  "sinon": "^21.0.0",
@@ -11,10 +11,9 @@
11
11
  // Default max iterations (high limit - let the user decide when to give up)
12
12
  const DEFAULT_MAX_ITERATIONS = 100;
13
13
 
14
- // Task timeout - DISABLED (tasks run until completion or explicit kill)
15
- // Originally: 2 hours - caused premature termination of long-running tasks
16
- // Now: Infinity - tasks only stop on completion, explicit kill, or external error
17
- const DEFAULT_TASK_TIMEOUT_MS = Infinity;
14
+ // Default timeout: 0 = no timeout (task runs until completion or explicit kill)
15
+ // Use positive number for timeout in milliseconds
16
+ const DEFAULT_TIMEOUT = 0;
18
17
 
19
18
  // Stale detection - ENABLED by default using multi-indicator analysis (safe from false positives)
20
19
  // Multi-indicator approach checks: process state, CPU usage, context switches, network I/O
@@ -85,13 +84,28 @@ function validateAgentConfig(config, options = {}) {
85
84
  throw new Error(`Agent "${config.id}": invalid prompt format`);
86
85
  }
87
86
 
87
+ // Default timeout to 0 (no timeout) if not specified
88
+ // Use positive number for timeout in milliseconds
89
+ // ROBUST: Handle undefined, null, AND string values from template resolution
90
+ if (config.timeout === undefined || config.timeout === null || config.timeout === '') {
91
+ config.timeout = DEFAULT_TIMEOUT;
92
+ } else {
93
+ // Coerce to number (handles string "0" from template resolution)
94
+ config.timeout = Number(config.timeout);
95
+ }
96
+ if (!Number.isFinite(config.timeout) || config.timeout < 0) {
97
+ throw new Error(
98
+ `Agent "${config.id}": timeout must be a non-negative number (got ${config.timeout}).`
99
+ );
100
+ }
101
+
88
102
  // Build normalized config
89
103
  const normalizedConfig = {
90
104
  ...config,
91
105
  modelConfig,
92
106
  promptConfig,
93
107
  maxIterations: config.maxIterations || DEFAULT_MAX_ITERATIONS,
94
- timeout: config.timeout || DEFAULT_TASK_TIMEOUT_MS,
108
+ timeout: config.timeout, // Defaults to 0 (no timeout) if not specified
95
109
  staleDuration: config.staleDuration || DEFAULT_STALE_DURATION_MS,
96
110
  enableLivenessCheck: config.enableLivenessCheck ?? DEFAULT_LIVENESS_CHECK_ENABLED, // On by default, opt-out with false
97
111
  };
@@ -115,7 +129,6 @@ function validateAgentConfig(config, options = {}) {
115
129
  module.exports = {
116
130
  validateAgentConfig,
117
131
  DEFAULT_MAX_ITERATIONS,
118
- DEFAULT_TASK_TIMEOUT_MS,
119
132
  DEFAULT_STALE_DURATION_MS,
120
133
  DEFAULT_LIVENESS_CHECK_ENABLED,
121
134
  };
@@ -54,6 +54,15 @@ function buildContext({
54
54
  context += `- If unsure between "fix the code" vs "relax the rules" → ALWAYS fix the code\n`;
55
55
  context += `- If unsure between "do more" vs "do less" → ALWAYS do what's required, nothing more\n\n`;
56
56
 
57
+ // MINIMAL OUTPUT - No verbose prose for background agents
58
+ context += `## 📝 OUTPUT STYLE - MINIMAL\n\n`;
59
+ context += `You are a background agent. The human CANNOT interact with you.\n`;
60
+ context += `- NO explanatory prose ("Let me explain...", "I'll now...")\n`;
61
+ context += `- NO step-by-step narration\n`;
62
+ context += `- YES: Brief status updates ("Implementing auth", "Fixed 3 errors")\n`;
63
+ context += `- YES: Error reports with actionable info\n`;
64
+ context += `- YES: Final summary of changes made\n\n`;
65
+
57
66
  // Add prompt from config (system prompt, instructions, output format)
58
67
  // If selectedPrompt is provided (iteration-based), use it directly
59
68
  // Otherwise fall back to legacy config.prompt handling
@@ -51,6 +51,72 @@ function sanitizeErrorMessage(error) {
51
51
  return error;
52
52
  }
53
53
 
54
/**
 * Strip the timestamp prefix from a log line.
 *
 * Log lines may have the format `[epochMs]{json...}` or `[epochMs]text`, where
 * `epochMs` is exactly 13 digits. Lines without such a prefix are returned
 * trimmed but otherwise unchanged.
 *
 * @param {string} line - Raw log line
 * @returns {string} Trimmed line content without the timestamp prefix; empty
 *   string for non-string, empty, or whitespace-only input
 */
function stripTimestampPrefix(line) {
  if (typeof line !== 'string') return '';

  // trim() strips all surrounding whitespace including a trailing "\r" from
  // CRLF logs, so no separate carriage-return handling is required.
  const trimmed = line.trim();
  if (!trimmed) return '';

  const match = trimmed.match(/^\[(\d{13})\](.*)$/);
  if (!match) return trimmed;

  // Drop whitespace between the prefix and the payload so callers that test
  // `content.startsWith('{')` still recognise "[ts] {json}" lines.
  return match[2].trimStart();
}
68
+
69
/**
 * Extract error context from task output.
 * Shared by both isolated and non-isolated modes.
 *
 * Resolution order:
 *   1. `isNotFound` is true: fixed "not found" message naming the task.
 *   2. First `Error: ...` occurrence in `statusOutput` (case-sensitive), trimmed.
 *   3. First match of a common error pattern (case-insensitive) within the
 *      last 500 characters of `output`, trimmed and capped at 200 characters.
 *      Patterns are tried in list order, not by position in the output.
 *   4. Generic message quoting the last 200 characters of `output`.
 *
 * Every message is passed through sanitizeErrorMessage() before being returned.
 *
 * @param {Object} params - Extraction parameters
 * @param {string} params.output - Full task output
 * @param {string} [params.statusOutput] - Status command output (non-isolated only)
 * @param {string} params.taskId - Task ID for error messages
 * @param {boolean} [params.isNotFound=false] - True if task was not found
 * @returns {string|null} Result of sanitizeErrorMessage() for the chosen
 *   message. This function always selects a non-empty message; a null result
 *   would have to come from sanitizeErrorMessage() itself (NOTE(review): confirm).
 */
function extractErrorContext({ output, statusOutput, taskId, isNotFound = false }) {
  // Task not found - explicit error
  if (isNotFound) {
    return sanitizeErrorMessage(`Task ${taskId} not found (may have crashed or been killed)`);
  }

  // Try status output first (only available in non-isolated mode)
  if (statusOutput) {
    const statusErrorMatch = statusOutput.match(/Error:\s*(.+)/);
    if (statusErrorMatch) {
      return sanitizeErrorMessage(statusErrorMatch[1].trim());
    }
  }

  // Fall back to extracting from output (last 500 chars)
  const lastOutput = (output || '').slice(-500).trim();
  if (!lastOutput) {
    return sanitizeErrorMessage('Task failed with no output (check if task was interrupted or timed out)');
  }

  // Common error patterns. All are case-insensitive, so a single "error:"
  // entry covers "Error:", "ERROR:", etc.
  const errorPatterns = [
    /error:\s*(.+)/i,
    /failed:\s*(.+)/i,
    /exception:\s*(.+)/i,
    /panic:\s*(.+)/i,
  ];

  for (const pattern of errorPatterns) {
    const match = lastOutput.match(pattern);
    if (match) {
      // Trim before truncating so trailing padding does not consume the budget
      // (matches the trimming applied to the status-output branch).
      return sanitizeErrorMessage(match[1].trim().slice(0, 200));
    }
  }

  // No pattern matched - include last portion of output
  return sanitizeErrorMessage(`Task failed. Last output: ${lastOutput.slice(-200)}`);
}
119
+
54
120
  // Track if we've already ensured the AskUserQuestion hook is installed
55
121
  let askUserQuestionHookInstalled = false;
56
122
 
@@ -68,10 +134,11 @@ function extractTokenUsage(output) {
68
134
 
69
135
  // Find the result line containing usage data
70
136
  for (const line of lines) {
71
- if (!line.trim()) continue;
137
+ const content = stripTimestampPrefix(line);
138
+ if (!content) continue;
72
139
 
73
140
  try {
74
- const event = JSON.parse(line.trim());
141
+ const event = JSON.parse(content);
75
142
  if (event.type === 'result') {
76
143
  const usage = event.usage || {};
77
144
  return {
@@ -527,14 +594,45 @@ function followClaudeTaskLogs(agent, taskId) {
527
594
  // Track exec failures - if status command keeps failing, something is wrong
528
595
  if (error) {
529
596
  consecutiveExecFailures++;
530
- if (consecutiveExecFailures === MAX_CONSECUTIVE_FAILURES) {
597
+ if (consecutiveExecFailures >= MAX_CONSECUTIVE_FAILURES) {
531
598
  console.error(
532
- `[Agent ${agent.id}] ⚠️ Status polling failed ${MAX_CONSECUTIVE_FAILURES} times consecutively!`
599
+ `[Agent ${agent.id}] ⚠️ Status polling failed ${MAX_CONSECUTIVE_FAILURES} times consecutively! STOPPING.`
533
600
  );
534
601
  console.error(` Command: ${ctPath} status ${taskId}`);
535
602
  console.error(` Error: ${error.message}`);
536
603
  console.error(` Stderr: ${stderr || 'none'}`);
537
604
  console.error(` This may indicate zeroshot is not in PATH or task storage is corrupted.`);
605
+
606
+ // Stop polling and resolve with failure
607
+ if (!resolved) {
608
+ resolved = true;
609
+ clearInterval(pollInterval);
610
+ clearInterval(statusCheckInterval);
611
+ agent.currentTask = null;
612
+
613
+ // Publish error for orchestrator/resume
614
+ agent._publish({
615
+ topic: 'AGENT_ERROR',
616
+ receiver: 'broadcast',
617
+ content: {
618
+ text: `Task ${taskId} polling failed after ${MAX_CONSECUTIVE_FAILURES} consecutive failures`,
619
+ data: {
620
+ taskId,
621
+ error: 'polling_timeout',
622
+ attempts: consecutiveExecFailures,
623
+ role: agent.role,
624
+ iteration: agent.iteration,
625
+ },
626
+ },
627
+ });
628
+
629
+ resolve({
630
+ success: false,
631
+ output,
632
+ error: `Status polling failed ${MAX_CONSECUTIVE_FAILURES} times - task may not exist`,
633
+ });
634
+ }
635
+ return;
538
636
  }
539
637
  return; // Keep polling - might be transient
540
638
  }
@@ -566,47 +664,15 @@ function followClaudeTaskLogs(agent, taskId) {
566
664
  clearInterval(statusCheckInterval);
567
665
  agent.currentTask = null;
568
666
 
569
- // Extract meaningful error context when task fails
570
- let errorContext = null;
571
- if (!success) {
572
- // Try to extract error from status output first
573
- const statusErrorMatch = stdout.match(/Error:\s*(.+)/);
574
- if (statusErrorMatch) {
575
- errorContext = statusErrorMatch[1].trim();
576
- } else {
577
- // Fall back to last 500 chars of output (likely contains the failure reason)
578
- const lastOutput = output.slice(-500).trim();
579
- if (lastOutput) {
580
- // Look for common error patterns in output
581
- const errorPatterns = [
582
- /Error:\s*(.+)/i,
583
- /error:\s*(.+)/i,
584
- /failed:\s*(.+)/i,
585
- /Exception:\s*(.+)/i,
586
- /panic:\s*(.+)/i,
587
- ];
588
- for (const pattern of errorPatterns) {
589
- const match = lastOutput.match(pattern);
590
- if (match) {
591
- errorContext = match[1].slice(0, 200);
592
- break;
593
- }
594
- }
595
- // If no pattern matched, include last portion of output
596
- if (!errorContext) {
597
- errorContext = `Task failed. Last output: ${lastOutput.slice(-200)}`;
598
- }
599
- } else {
600
- errorContext =
601
- 'Task failed with no output (check if task was interrupted or timed out)';
602
- }
603
- }
604
- }
667
+ // Extract error context using shared helper
668
+ const errorContext = !success
669
+ ? extractErrorContext({ output, statusOutput: stdout, taskId })
670
+ : null;
605
671
 
606
672
  resolve({
607
673
  success,
608
674
  output,
609
- error: sanitizeErrorMessage(errorContext),
675
+ error: errorContext,
610
676
  tokenUsage: extractTokenUsage(output),
611
677
  });
612
678
  }, 500);
@@ -912,12 +978,14 @@ function followClaudeTaskLogsIsolated(agent, taskId) {
912
978
  `zeroshot status ${taskId} 2>/dev/null || echo "not_found"`,
913
979
  ]);
914
980
 
915
- const statusOutput = statusResult.stdout.toLowerCase();
916
- if (
917
- statusOutput.includes('success') ||
918
- statusOutput.includes('error') ||
919
- statusOutput.includes('not_found')
920
- ) {
981
+ // Use same regex patterns as non-isolated mode (lines 649-650)
982
+ // CRITICAL: Don't use substring matching - it matches "error" in "is_error":false
983
+ const statusOutput = statusResult.stdout;
984
+ const isSuccess = /Status:\s+completed/i.test(statusOutput);
985
+ const isError = /Status:\s+failed/i.test(statusOutput);
986
+ const isNotFound = statusOutput.includes('not_found');
987
+
988
+ if (isSuccess || isError || isNotFound) {
921
989
  // Task finished - read final output and resolve
922
990
  const finalReadResult = await manager.execInContainer(clusterId, [
923
991
  'sh',
@@ -940,13 +1008,23 @@ function followClaudeTaskLogsIsolated(agent, taskId) {
940
1008
  cleanup();
941
1009
  taskExited = true;
942
1010
 
943
- // Parse result from output (same logic as non-isolated mode)
1011
+ // Determine success status
1012
+ const success = isSuccess && !isError;
1013
+
1014
+ // Extract error context using shared helper
1015
+ const errorContext = !success
1016
+ ? extractErrorContext({ output: fullOutput, taskId, isNotFound })
1017
+ : null;
1018
+
1019
+ // Parse result from output
944
1020
  const parsedResult = agent._parseResultOutput(fullOutput);
945
1021
 
946
1022
  resolve({
1023
+ success,
947
1024
  output: fullOutput,
948
1025
  taskId,
949
1026
  result: parsedResult,
1027
+ error: errorContext,
950
1028
  tokenUsage: extractTokenUsage(fullOutput),
951
1029
  });
952
1030
  }
@@ -956,18 +1034,19 @@ function followClaudeTaskLogsIsolated(agent, taskId) {
956
1034
  }
957
1035
  }, 500);
958
1036
 
959
- // Safety timeout (same as non-isolated mode)
960
- const timeoutMs = agent.timeout || 300000; // 5 minutes default
961
- setTimeout(() => {
962
- if (!taskExited) {
963
- cleanup();
964
- reject(
965
- new Error(
966
- `Task ${taskId} timeout after ${timeoutMs}ms (isolated mode)`
967
- )
968
- );
969
- }
970
- }, timeoutMs);
1037
+ // Safety timeout (0 = no timeout, task runs until completion)
1038
+ if (agent.timeout > 0) {
1039
+ setTimeout(() => {
1040
+ if (!taskExited) {
1041
+ cleanup();
1042
+ reject(
1043
+ new Error(
1044
+ `Task ${taskId} timeout after ${agent.timeout}ms (isolated mode)`
1045
+ )
1046
+ );
1047
+ }
1048
+ }, agent.timeout);
1049
+ }
971
1050
  })
972
1051
  .catch((err) => {
973
1052
  cleanup();
@@ -994,11 +1073,14 @@ function parseResultOutput(agent, output) {
994
1073
  let trimmedOutput = output.trim();
995
1074
 
996
1075
  // IMPORTANT: Output is NDJSON (one JSON object per line) from streaming log
1076
+ // Lines may have timestamp prefix: [epochMs]{json...}
997
1077
  // Find the line with "type":"result" which contains the actual result
998
1078
  const lines = trimmedOutput.split('\n');
999
1079
  const resultLine = lines.find((line) => {
1000
1080
  try {
1001
- const obj = JSON.parse(line.trim());
1081
+ const content = stripTimestampPrefix(line);
1082
+ if (!content.startsWith('{')) return false;
1083
+ const obj = JSON.parse(content);
1002
1084
  return obj.type === 'result';
1003
1085
  } catch {
1004
1086
  return false;
@@ -1006,13 +1088,15 @@ function parseResultOutput(agent, output) {
1006
1088
  });
1007
1089
 
1008
1090
  // Use the result line if found, otherwise use last non-empty line
1091
+ // CRITICAL: Strip timestamp prefix before assigning to trimmedOutput
1009
1092
  if (resultLine) {
1010
- trimmedOutput = resultLine.trim();
1093
+ trimmedOutput = stripTimestampPrefix(resultLine);
1011
1094
  } else if (lines.length > 1) {
1012
- // Fallback: use last non-empty line
1095
+ // Fallback: use last non-empty line (also strip timestamp)
1013
1096
  for (let i = lines.length - 1; i >= 0; i--) {
1014
- if (lines[i].trim()) {
1015
- trimmedOutput = lines[i].trim();
1097
+ const content = stripTimestampPrefix(lines[i]);
1098
+ if (content) {
1099
+ trimmedOutput = content;
1016
1100
  break;
1017
1101
  }
1018
1102
  }
@@ -415,6 +415,19 @@ function validateAgents(config) {
415
415
  }
416
416
  }
417
417
 
418
+ // Check for git operations in validator prompts (unreliable in agents)
419
+ if (agent.role === 'validator') {
420
+ const prompt = typeof agent.prompt === 'string' ? agent.prompt : agent.prompt?.system;
421
+ const gitPatterns = ['git diff', 'git status', 'git log', 'git show'];
422
+ for (const pattern of gitPatterns) {
423
+ if (prompt?.includes(pattern)) {
424
+ errors.push(
425
+ `Validator '${agent.id}' uses '${pattern}' - git state is unreliable in agents`
426
+ );
427
+ }
428
+ }
429
+ }
430
+
418
431
  // JSON output without schema
419
432
  if (agent.outputFormat === 'json' && !agent.jsonSchema) {
420
433
  warnings.push(
@@ -191,7 +191,7 @@ class IsolationManager {
191
191
  try {
192
192
  installResult = await this.execInContainer(
193
193
  clusterId,
194
- ['sh', '-c', 'npm install --no-audit --no-fund 2>&1'],
194
+ ['sh', '-c', 'npm_config_engine_strict=false npm install --no-audit --no-fund'],
195
195
  {}
196
196
  );
197
197
 
@@ -201,16 +201,18 @@ class IsolationManager {
201
201
  }
202
202
 
203
203
  // Failed - retry if not last attempt
204
+ // Use stderr if available, otherwise stdout (npm writes some errors to stdout)
205
+ const errorOutput = (installResult.stderr || installResult.stdout || '').slice(0, 500);
204
206
  if (attempt < maxRetries) {
205
207
  const delay = baseDelay * Math.pow(2, attempt - 1);
206
208
  console.warn(
207
209
  `[IsolationManager] ⚠️ npm install failed (attempt ${attempt}/${maxRetries}), retrying in ${delay}ms...`
208
210
  );
209
- console.warn(`[IsolationManager] Error: ${installResult.stderr.slice(0, 200)}`);
211
+ console.warn(`[IsolationManager] Error: ${errorOutput}`);
210
212
  await new Promise((_resolve) => setTimeout(_resolve, delay));
211
213
  } else {
212
214
  console.warn(
213
- `[IsolationManager] ⚠️ npm install failed after ${maxRetries} attempts (non-fatal): ${installResult.stderr.slice(0, 200)}`
215
+ `[IsolationManager] ⚠️ npm install failed after ${maxRetries} attempts (non-fatal): ${errorOutput}`
214
216
  );
215
217
  }
216
218
  } catch (execErr) {
@@ -342,8 +344,9 @@ class IsolationManager {
342
344
  * @param {number} [timeout=10] - Timeout in seconds before SIGKILL
343
345
  * @returns {Promise<void>}
344
346
  */
345
- stopContainer(clusterId, timeout = 10) {
346
- const containerId = this.containers.get(clusterId);
347
+ stopContainer(clusterId, timeout = 10, explicitContainerId = null) {
348
+ // Use explicit containerId (from restored state) or in-memory Map
349
+ const containerId = explicitContainerId || this.containers.get(clusterId);
347
350
  if (!containerId) {
348
351
  return; // Already stopped or never started
349
352
  }
@@ -369,8 +372,9 @@ class IsolationManager {
369
372
  * @param {boolean} [force=false] - Force remove running container
370
373
  * @returns {Promise<void>}
371
374
  */
372
- removeContainer(clusterId, force = false) {
373
- const containerId = this.containers.get(clusterId);
375
+ removeContainer(clusterId, force = false, explicitContainerId = null) {
376
+ // Use explicit containerId (from restored state) or in-memory Map
377
+ const containerId = explicitContainerId || this.containers.get(clusterId);
374
378
  if (!containerId) {
375
379
  return;
376
380
  }
@@ -126,9 +126,55 @@ class Orchestrator {
126
126
  const clusterIds = Object.keys(data);
127
127
  this._log(`[Orchestrator] Found ${clusterIds.length} clusters in file:`, clusterIds);
128
128
 
129
+ // Track clusters to remove (missing .db files or 0 messages)
130
+ const clustersToRemove = [];
131
+ // Track clusters with 0 messages (corrupted from SIGINT race condition)
132
+ const corruptedClusters = [];
133
+
129
134
  for (const [clusterId, clusterData] of Object.entries(data)) {
135
+ // Skip clusters whose .db file doesn't exist (orphaned registry entries)
136
+ const dbPath = path.join(this.storageDir, `${clusterId}.db`);
137
+ if (!fs.existsSync(dbPath)) {
138
+ console.warn(`[Orchestrator] Cluster ${clusterId} has no database file, removing from registry`);
139
+ clustersToRemove.push(clusterId);
140
+ continue;
141
+ }
142
+
130
143
  this._log(`[Orchestrator] Loading cluster: ${clusterId}`);
131
- this._loadSingleCluster(clusterId, clusterData);
144
+ const cluster = this._loadSingleCluster(clusterId, clusterData);
145
+
146
+ // VALIDATION: Detect 0-message clusters (corrupted from SIGINT during initialization)
147
+ // These clusters were created before the initCompletePromise fix was applied
148
+ if (cluster && cluster.messageBus) {
149
+ const messageCount = cluster.messageBus.count({ cluster_id: clusterId });
150
+ if (messageCount === 0) {
151
+ console.warn(`[Orchestrator] ⚠️ Cluster ${clusterId} has 0 messages (corrupted)`);
152
+ console.warn(`[Orchestrator] This likely occurred from SIGINT during initialization.`);
153
+ console.warn(`[Orchestrator] Marking as 'corrupted' - use 'zeroshot kill ${clusterId}' to remove.`);
154
+ corruptedClusters.push(clusterId);
155
+ // Mark cluster as corrupted for visibility in status/list commands
156
+ cluster.state = 'corrupted';
157
+ cluster.corruptedReason = 'SIGINT during initialization (0 messages in ledger)';
158
+ }
159
+ }
160
+ }
161
+
162
+ // Clean up orphaned entries from clusters.json
163
+ if (clustersToRemove.length > 0) {
164
+ for (const clusterId of clustersToRemove) {
165
+ delete data[clusterId];
166
+ }
167
+ fs.writeFileSync(clustersFile, JSON.stringify(data, null, 2));
168
+ this._log(`[Orchestrator] Removed ${clustersToRemove.length} orphaned cluster(s) from registry`);
169
+ }
170
+
171
+ // Log summary of corrupted clusters
172
+ if (corruptedClusters.length > 0) {
173
+ console.warn(`\n[Orchestrator] ⚠️ Found ${corruptedClusters.length} corrupted cluster(s):`);
174
+ for (const clusterId of corruptedClusters) {
175
+ console.warn(` - ${clusterId}`);
176
+ }
177
+ console.warn(`[Orchestrator] Run 'zeroshot clear' to remove all corrupted clusters.\n`);
132
178
  }
133
179
 
134
180
  this._log(`[Orchestrator] Total clusters loaded: ${this.clusters.size}`);
@@ -494,6 +540,13 @@ class Orchestrator {
494
540
  }
495
541
 
496
542
  // Build cluster object
543
+ // CRITICAL: initComplete promise ensures ISSUE_OPENED is published before stop() completes
544
+ // This prevents 0-message clusters from SIGINT during async initialization
545
+ let resolveInitComplete;
546
+ const initCompletePromise = new Promise((resolve) => {
547
+ resolveInitComplete = resolve;
548
+ });
549
+
497
550
  const cluster = {
498
551
  id: clusterId,
499
552
  config,
@@ -504,6 +557,9 @@ class Orchestrator {
504
557
  createdAt: Date.now(),
505
558
  // Track PID for zombie detection (this process owns the cluster)
506
559
  pid: process.pid,
560
+ // Initialization completion tracking (for safe SIGINT handling)
561
+ initCompletePromise,
562
+ _resolveInitComplete: resolveInitComplete,
507
563
  // Isolation state (only if enabled)
508
564
  // CRITICAL: Store workDir for resume capability - without this, resume() can't recreate container
509
565
  isolation: options.isolation
@@ -652,6 +708,12 @@ class Orchestrator {
652
708
  },
653
709
  });
654
710
 
711
+ // CRITICAL: Mark initialization complete AFTER ISSUE_OPENED is published
712
+ // This ensures stop() waits for at least 1 message before stopping
713
+ if (cluster._resolveInitComplete) {
714
+ cluster._resolveInitComplete();
715
+ }
716
+
655
717
  this._log(`Cluster ${clusterId} started with ${cluster.agents.length} agents`);
656
718
 
657
719
  // Watch for CLUSTER_COMPLETE message to auto-stop
@@ -818,6 +880,10 @@ class Orchestrator {
818
880
  };
819
881
  } catch (error) {
820
882
  cluster.state = 'failed';
883
+ // CRITICAL: Resolve the promise on failure too, so stop() doesn't hang
884
+ if (cluster._resolveInitComplete) {
885
+ cluster._resolveInitComplete();
886
+ }
821
887
  console.error(`Cluster ${clusterId} failed to start:`, error);
822
888
  throw error;
823
889
  }
@@ -833,6 +899,17 @@ class Orchestrator {
833
899
  throw new Error(`Cluster ${clusterId} not found`);
834
900
  }
835
901
 
902
+ // CRITICAL: Wait for initialization to complete before stopping
903
+ // This ensures ISSUE_OPENED is published, preventing 0-message clusters
904
+ // Timeout after 30s to prevent infinite hang if init truly fails
905
+ if (cluster.initCompletePromise && cluster.state === 'initializing') {
906
+ this._log(`[Orchestrator] Waiting for initialization to complete before stopping...`);
907
+ await Promise.race([
908
+ cluster.initCompletePromise,
909
+ new Promise((resolve) => setTimeout(resolve, 30000)),
910
+ ]);
911
+ }
912
+
836
913
  cluster.state = 'stopping';
837
914
 
838
915
  // Stop all agents (including subclusters which handle their own children)