npm - @covibes/zeroshot - Versions diffs - 1.3.0 → 1.5.0 - Mend

@covibes/zeroshot 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/CHANGELOG.md +50 -0
package/cli/index.js +131 -49
package/cli/message-formatters-normal.js +77 -38
package/cluster-templates/base-templates/debug-workflow.json +11 -2
package/cluster-templates/base-templates/full-workflow.json +20 -7
package/cluster-templates/base-templates/single-worker.json +8 -1
package/cluster-templates/base-templates/worker-validator.json +10 -2
package/docker/zeroshot-cluster/Dockerfile +7 -0
package/package.json +3 -1
package/src/agent/agent-config.js +19 -6
package/src/agent/agent-context-builder.js +9 -0
package/src/agent/agent-task-executor.js +149 -65
package/src/config-validator.js +13 -0
package/src/isolation-manager.js +11 -7
package/src/orchestrator.js +78 -1
package/src/status-footer.js +188 -42
package/src/template-resolver.js +23 -1

package/docker/zeroshot-cluster/Dockerfile CHANGED Viewed

@@ -6,6 +6,10 @@
 FROM node:20-slim
+# Upgrade npm to fix Arborist isDescendantOf bug (npm 10.x crash on complex peer deps)
+# See: https://github.com/npm/cli/issues/7682
+RUN npm install -g npm@latest
 # Version pinning for infrastructure tools
 ARG AWS_CLI_VERSION=2.15.10
 ARG TERRAFORM_VERSION=1.6.6
@@ -19,6 +23,9 @@ ARG TFSEC_VERSION=1.28.4
 RUN apt-get update && apt-get install -y --no-install-recommends \
     git \
     curl \
+    # Build tools for native modules (node-gyp needs make, gcc, g++)
+    build-essential \
+    python3-dev \
     ca-certificates \
     gnupg \
     unzip \

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@covibes/zeroshot",
-  "version": "1.3.0",
+  "version": "1.5.0",
   "description": "Multi-agent orchestration engine for Claude - cluster coordinator and CLI",
   "main": "src/orchestrator.js",
   "bin": {
@@ -24,6 +24,7 @@
     "deadcode:files": "unimported",
     "deadcode:deps": "depcheck",
     "deadcode:all": "npm run deadcode && npm run deadcode:files && npm run deadcode:deps",
+    "dupcheck": "jscpd src/ --min-lines 5 --min-tokens 50 --threshold 5",
     "check": "npm run typecheck && npm run lint",
     "check:all": "npm run check && npm run deadcode:all",
     "release": "semantic-release",
@@ -111,6 +112,7 @@
     "eslint-config-prettier": "^10.1.8",
     "eslint-plugin-unused-imports": "^4.3.0",
     "husky": "^9.1.7",
+    "jscpd": "^3.5.10",
     "mocha": "^11.7.5",
     "semantic-release": "^25.0.2",
     "sinon": "^21.0.0",

package/src/agent/agent-config.js CHANGED Viewed

@@ -11,10 +11,9 @@
 // Default max iterations (high limit - let the user decide when to give up)
 const DEFAULT_MAX_ITERATIONS = 100;
-// Task timeout - DISABLED (tasks run until completion or explicit kill)
-// Originally: 2 hours - caused premature termination of long-running tasks
-// Now: Infinity - tasks only stop on completion, explicit kill, or external error
-const DEFAULT_TASK_TIMEOUT_MS = Infinity;
+// Default timeout: 0 = no timeout (task runs until completion or explicit kill)
+// Use positive number for timeout in milliseconds
+const DEFAULT_TIMEOUT = 0;
 // Stale detection - ENABLED by default using multi-indicator analysis (safe from false positives)
 // Multi-indicator approach checks: process state, CPU usage, context switches, network I/O
@@ -85,13 +84,28 @@ function validateAgentConfig(config, options = {}) {
     throw new Error(`Agent "${config.id}": invalid prompt format`);
   }
+  // Default timeout to 0 (no timeout) if not specified
+  // Use positive number for timeout in milliseconds
+  // ROBUST: Handle undefined, null, AND string values from template resolution
+  if (config.timeout === undefined || config.timeout === null || config.timeout === '') {
+    config.timeout = DEFAULT_TIMEOUT;
+  } else {
+    // Coerce to number (handles string "0" from template resolution)
+    config.timeout = Number(config.timeout);
+  }
+  if (!Number.isFinite(config.timeout) || config.timeout < 0) {
+    throw new Error(
+      `Agent "${config.id}": timeout must be a non-negative number (got ${config.timeout}).`
+    );
+  }
   // Build normalized config
   const normalizedConfig = {
     ...config,
     modelConfig,
     promptConfig,
     maxIterations: config.maxIterations || DEFAULT_MAX_ITERATIONS,
-    timeout: config.timeout || DEFAULT_TASK_TIMEOUT_MS,
+    timeout: config.timeout, // Defaults to 0 (no timeout) if not specified
     staleDuration: config.staleDuration || DEFAULT_STALE_DURATION_MS,
     enableLivenessCheck: config.enableLivenessCheck ?? DEFAULT_LIVENESS_CHECK_ENABLED, // On by default, opt-out with false
   };
@@ -115,7 +129,6 @@ function validateAgentConfig(config, options = {}) {
 module.exports = {
   validateAgentConfig,
   DEFAULT_MAX_ITERATIONS,
-  DEFAULT_TASK_TIMEOUT_MS,
   DEFAULT_STALE_DURATION_MS,
   DEFAULT_LIVENESS_CHECK_ENABLED,
 };

package/src/agent/agent-context-builder.js CHANGED Viewed

@@ -54,6 +54,15 @@ function buildContext({
   context += `- If unsure between "fix the code" vs "relax the rules" → ALWAYS fix the code\n`;
   context += `- If unsure between "do more" vs "do less" → ALWAYS do what's required, nothing more\n\n`;
+  // MINIMAL OUTPUT - No verbose prose for background agents
+  context += `## 📝 OUTPUT STYLE - MINIMAL\n\n`;
+  context += `You are a background agent. The human CANNOT interact with you.\n`;
+  context += `- NO explanatory prose ("Let me explain...", "I'll now...")\n`;
+  context += `- NO step-by-step narration\n`;
+  context += `- YES: Brief status updates ("Implementing auth", "Fixed 3 errors")\n`;
+  context += `- YES: Error reports with actionable info\n`;
+  context += `- YES: Final summary of changes made\n\n`;
   // Add prompt from config (system prompt, instructions, output format)
   // If selectedPrompt is provided (iteration-based), use it directly
   // Otherwise fall back to legacy config.prompt handling

package/src/agent/agent-task-executor.js CHANGED Viewed

@@ -51,6 +51,72 @@ function sanitizeErrorMessage(error) {
   return error;
 }
+/**
+ * Strip timestamp prefix from log lines.
+ * Log lines may have format: [epochMs]{json...} or [epochMs]text
+ *
+ * The line is whitespace-trimmed first. A prefix is only recognized when it is
+ * exactly 13 digits wrapped in square brackets at the very start of the trimmed
+ * line; anything else (no prefix, or a different digit count) is returned
+ * trimmed but otherwise untouched. Text following a recognized prefix is
+ * returned as-is and is not trimmed a second time.
+ *
+ * @param {string} line - Raw log line
+ * @returns {string} Line content without timestamp prefix, empty string for invalid input
+ *   (falsy or non-string values, and lines that are empty after trimming)
+ */
+function stripTimestampPrefix(line) {
+  if (!line || typeof line !== 'string') return '';
+  const trimmed = line.trim().replace(/\r$/, ''); // drop surrounding whitespace and a trailing CR
+  if (!trimmed) return '';
+  const match = trimmed.match(/^\[(\d{13})\](.*)$/); // group 1 = timestamp digits, group 2 = remainder
+  return match ? match[2] : trimmed;
+}
+/**
+ * Extract error context from task output.
+ * Shared by both isolated and non-isolated modes.
+ *
+ * Resolution order (first hit wins):
+ *   1. isNotFound is true -> fixed "Task <taskId> not found ..." message.
+ *   2. statusOutput contains a case-sensitive "Error:" -> the text after it on
+ *      that line, trimmed.
+ *   3. The last 500 characters of output (trimmed) are empty -> fixed
+ *      "Task failed with no output ..." message.
+ *   4. One of the fallback patterns matches that tail -> the text after the
+ *      matched keyword on that line, cut to 200 characters.
+ *   5. Otherwise -> "Task failed. Last output: " plus the last 200 characters
+ *      of the tail.
+ * Every return value is passed through sanitizeErrorMessage (defined elsewhere
+ * in this file).
+ *
+ * @param {Object} params - Extraction parameters
+ * @param {string} params.output - Full task output (a falsy value is treated as empty)
+ * @param {string} [params.statusOutput] - Status command output (non-isolated only)
+ * @param {string} params.taskId - Task ID for error messages
+ * @param {boolean} [params.isNotFound=false] - True if task was not found
+ * @returns {string|null} Sanitized error context or null if extraction failed
+ *   NOTE(review): every path here returns the result of sanitizeErrorMessage on a
+ *   string; null presumably can only come from that helper - confirm.
+ */
+function extractErrorContext({ output, statusOutput, taskId, isNotFound = false }) {
+  // Task not found - explicit error (takes priority over any captured output)
+  if (isNotFound) {
+    return sanitizeErrorMessage(`Task ${taskId} not found (may have crashed or been killed)`);
+  }
+  // Try status output first (only available in non-isolated mode)
+  if (statusOutput) {
+    const statusErrorMatch = statusOutput.match(/Error:\s*(.+)/);
+    if (statusErrorMatch) {
+      return sanitizeErrorMessage(statusErrorMatch[1].trim());
+    }
+  }
+  // Fall back to extracting from output (last 500 chars)
+  const lastOutput = (output || '').slice(-500).trim();
+  if (!lastOutput) {
+    return sanitizeErrorMessage('Task failed with no output (check if task was interrupted or timed out)');
+  }
+  // Common error patterns, tried in order; the first pattern that matches wins.
+  // NOTE(review): the first two entries are equivalent because both carry the
+  // case-insensitive flag, so the second can never be the first to match.
+  const errorPatterns = [
+    /Error:\s*(.+)/i,
+    /error:\s*(.+)/i,
+    /failed:\s*(.+)/i,
+    /Exception:\s*(.+)/i,
+    /panic:\s*(.+)/i,
+  ];
+  for (const pattern of errorPatterns) {
+    const match = lastOutput.match(pattern);
+    if (match) {
+      return sanitizeErrorMessage(match[1].slice(0, 200));
+    }
+  }
+  // No pattern matched - include last portion of output (final 200 chars)
+  return sanitizeErrorMessage(`Task failed. Last output: ${lastOutput.slice(-200)}`);
+}
 // Track if we've already ensured the AskUserQuestion hook is installed
 let askUserQuestionHookInstalled = false;
@@ -68,10 +134,11 @@ function extractTokenUsage(output) {
   // Find the result line containing usage data
   for (const line of lines) {
-    if (!line.trim()) continue;
+    const content = stripTimestampPrefix(line);
+    if (!content) continue;
     try {
-      const event = JSON.parse(line.trim());
+      const event = JSON.parse(content);
       if (event.type === 'result') {
         const usage = event.usage || {};
         return {
@@ -527,14 +594,45 @@ function followClaudeTaskLogs(agent, taskId) {
         // Track exec failures - if status command keeps failing, something is wrong
         if (error) {
           consecutiveExecFailures++;
-          if (consecutiveExecFailures === MAX_CONSECUTIVE_FAILURES) {
+          if (consecutiveExecFailures >= MAX_CONSECUTIVE_FAILURES) {
             console.error(
-              `[Agent ${agent.id}] ⚠️ Status polling failed ${MAX_CONSECUTIVE_FAILURES} times consecutively!`
+              `[Agent ${agent.id}] ⚠️ Status polling failed ${MAX_CONSECUTIVE_FAILURES} times consecutively! STOPPING.`
             );
             console.error(`  Command: ${ctPath} status ${taskId}`);
             console.error(`  Error: ${error.message}`);
             console.error(`  Stderr: ${stderr || 'none'}`);
             console.error(`  This may indicate zeroshot is not in PATH or task storage is corrupted.`);
+            // Stop polling and resolve with failure
+            if (!resolved) {
+              resolved = true;
+              clearInterval(pollInterval);
+              clearInterval(statusCheckInterval);
+              agent.currentTask = null;
+              // Publish error for orchestrator/resume
+              agent._publish({
+                topic: 'AGENT_ERROR',
+                receiver: 'broadcast',
+                content: {
+                  text: `Task ${taskId} polling failed after ${MAX_CONSECUTIVE_FAILURES} consecutive failures`,
+                  data: {
+                    taskId,
+                    error: 'polling_timeout',
+                    attempts: consecutiveExecFailures,
+                    role: agent.role,
+                    iteration: agent.iteration,
+                  },
+                },
+              });
+              resolve({
+                success: false,
+                output,
+                error: `Status polling failed ${MAX_CONSECUTIVE_FAILURES} times - task may not exist`,
+              });
+            }
+            return;
           }
           return; // Keep polling - might be transient
         }
@@ -566,47 +664,15 @@ function followClaudeTaskLogs(agent, taskId) {
             clearInterval(statusCheckInterval);
             agent.currentTask = null;
-            // Extract meaningful error context when task fails
-            let errorContext = null;
-            if (!success) {
-              // Try to extract error from status output first
-              const statusErrorMatch = stdout.match(/Error:\s*(.+)/);
-              if (statusErrorMatch) {
-                errorContext = statusErrorMatch[1].trim();
-              } else {
-                // Fall back to last 500 chars of output (likely contains the failure reason)
-                const lastOutput = output.slice(-500).trim();
-                if (lastOutput) {
-                  // Look for common error patterns in output
-                  const errorPatterns = [
-                    /Error:\s*(.+)/i,
-                    /error:\s*(.+)/i,
-                    /failed:\s*(.+)/i,
-                    /Exception:\s*(.+)/i,
-                    /panic:\s*(.+)/i,
-                  ];
-                  for (const pattern of errorPatterns) {
-                    const match = lastOutput.match(pattern);
-                    if (match) {
-                      errorContext = match[1].slice(0, 200);
-                      break;
-                    }
-                  }
-                  // If no pattern matched, include last portion of output
-                  if (!errorContext) {
-                    errorContext = `Task failed. Last output: ${lastOutput.slice(-200)}`;
-                  }
-                } else {
-                  errorContext =
-                    'Task failed with no output (check if task was interrupted or timed out)';
-                }
-              }
-            }
+            // Extract error context using shared helper
+            const errorContext = !success
+              ? extractErrorContext({ output, statusOutput: stdout, taskId })
+              : null;
             resolve({
               success,
               output,
-              error: sanitizeErrorMessage(errorContext),
+              error: errorContext,
               tokenUsage: extractTokenUsage(output),
             });
           }, 500);
@@ -912,12 +978,14 @@ function followClaudeTaskLogsIsolated(agent, taskId) {
               `zeroshot status ${taskId} 2>/dev/null || echo "not_found"`,
             ]);
-            const statusOutput = statusResult.stdout.toLowerCase();
-            if (
-              statusOutput.includes('success') ||
-              statusOutput.includes('error') ||
-              statusOutput.includes('not_found')
-            ) {
+            // Use same regex patterns as non-isolated mode (lines 649-650)
+            // CRITICAL: Don't use substring matching - it matches "error" in "is_error":false
+            const statusOutput = statusResult.stdout;
+            const isSuccess = /Status:\s+completed/i.test(statusOutput);
+            const isError = /Status:\s+failed/i.test(statusOutput);
+            const isNotFound = statusOutput.includes('not_found');
+            if (isSuccess || isError || isNotFound) {
               // Task finished - read final output and resolve
               const finalReadResult = await manager.execInContainer(clusterId, [
                 'sh',
@@ -940,13 +1008,23 @@ function followClaudeTaskLogsIsolated(agent, taskId) {
               cleanup();
               taskExited = true;
-              // Parse result from output (same logic as non-isolated mode)
+              // Determine success status
+              const success = isSuccess && !isError;
+              // Extract error context using shared helper
+              const errorContext = !success
+                ? extractErrorContext({ output: fullOutput, taskId, isNotFound })
+                : null;
+              // Parse result from output
               const parsedResult = agent._parseResultOutput(fullOutput);
               resolve({
+                success,
                 output: fullOutput,
                 taskId,
                 result: parsedResult,
+                error: errorContext,
                 tokenUsage: extractTokenUsage(fullOutput),
               });
             }
@@ -956,18 +1034,19 @@ function followClaudeTaskLogsIsolated(agent, taskId) {
           }
         }, 500);
-        // Safety timeout (same as non-isolated mode)
-        const timeoutMs = agent.timeout || 300000; // 5 minutes default
-        setTimeout(() => {
-          if (!taskExited) {
-            cleanup();
-            reject(
-              new Error(
-                `Task ${taskId} timeout after ${timeoutMs}ms (isolated mode)`
-              )
-            );
-          }
-        }, timeoutMs);
+        // Safety timeout (0 = no timeout, task runs until completion)
+        if (agent.timeout > 0) {
+          setTimeout(() => {
+            if (!taskExited) {
+              cleanup();
+              reject(
+                new Error(
+                  `Task ${taskId} timeout after ${agent.timeout}ms (isolated mode)`
+                )
+              );
+            }
+          }, agent.timeout);
+        }
       })
       .catch((err) => {
         cleanup();
@@ -994,11 +1073,14 @@ function parseResultOutput(agent, output) {
   let trimmedOutput = output.trim();
   // IMPORTANT: Output is NDJSON (one JSON object per line) from streaming log
+  // Lines may have timestamp prefix: [epochMs]{json...}
   // Find the line with "type":"result" which contains the actual result
   const lines = trimmedOutput.split('\n');
   const resultLine = lines.find((line) => {
     try {
-      const obj = JSON.parse(line.trim());
+      const content = stripTimestampPrefix(line);
+      if (!content.startsWith('{')) return false;
+      const obj = JSON.parse(content);
       return obj.type === 'result';
     } catch {
       return false;
@@ -1006,13 +1088,15 @@ function parseResultOutput(agent, output) {
   });
   // Use the result line if found, otherwise use last non-empty line
+  // CRITICAL: Strip timestamp prefix before assigning to trimmedOutput
   if (resultLine) {
-    trimmedOutput = resultLine.trim();
+    trimmedOutput = stripTimestampPrefix(resultLine);
   } else if (lines.length > 1) {
-    // Fallback: use last non-empty line
+    // Fallback: use last non-empty line (also strip timestamp)
     for (let i = lines.length - 1; i >= 0; i--) {
-      if (lines[i].trim()) {
-        trimmedOutput = lines[i].trim();
+      const content = stripTimestampPrefix(lines[i]);
+      if (content) {
+        trimmedOutput = content;
         break;
       }
     }

package/src/config-validator.js CHANGED Viewed

@@ -415,6 +415,19 @@ function validateAgents(config) {
       }
     }
+    // Check for git operations in validator prompts (unreliable in agents)
+    if (agent.role === 'validator') {
+      const prompt = typeof agent.prompt === 'string' ? agent.prompt : agent.prompt?.system;
+      const gitPatterns = ['git diff', 'git status', 'git log', 'git show'];
+      for (const pattern of gitPatterns) {
+        if (prompt?.includes(pattern)) {
+          errors.push(
+            `Validator '${agent.id}' uses '${pattern}' - git state is unreliable in agents`
+          );
+        }
+      }
+    }
     // JSON output without schema
     if (agent.outputFormat === 'json' && !agent.jsonSchema) {
       warnings.push(

package/src/isolation-manager.js CHANGED Viewed

@@ -191,7 +191,7 @@ class IsolationManager {
                 try {
                   installResult = await this.execInContainer(
                     clusterId,
-                    ['sh', '-c', 'npm install --no-audit --no-fund 2>&1'],
+                    ['sh', '-c', 'npm_config_engine_strict=false npm install --no-audit --no-fund'],
                     {}
                   );
@@ -201,16 +201,18 @@ class IsolationManager {
                   }
                   // Failed - retry if not last attempt
+                  // Use stderr if available, otherwise stdout (npm writes some errors to stdout)
+                  const errorOutput = (installResult.stderr || installResult.stdout || '').slice(0, 500);
                   if (attempt < maxRetries) {
                     const delay = baseDelay * Math.pow(2, attempt - 1);
                     console.warn(
                       `[IsolationManager] ⚠️ npm install failed (attempt ${attempt}/${maxRetries}), retrying in ${delay}ms...`
                     );
-                    console.warn(`[IsolationManager] Error: ${installResult.stderr.slice(0, 200)}`);
+                    console.warn(`[IsolationManager] Error: ${errorOutput}`);
                     await new Promise((_resolve) => setTimeout(_resolve, delay));
                   } else {
                     console.warn(
-                      `[IsolationManager] ⚠️ npm install failed after ${maxRetries} attempts (non-fatal): ${installResult.stderr.slice(0, 200)}`
+                      `[IsolationManager] ⚠️ npm install failed after ${maxRetries} attempts (non-fatal): ${errorOutput}`
                     );
                   }
                 } catch (execErr) {
@@ -342,8 +344,9 @@ class IsolationManager {
    * @param {number} [timeout=10] - Timeout in seconds before SIGKILL
    * @returns {Promise<void>}
    */
-  stopContainer(clusterId, timeout = 10) {
-    const containerId = this.containers.get(clusterId);
+  stopContainer(clusterId, timeout = 10, explicitContainerId = null) {
+    // Use explicit containerId (from restored state) or in-memory Map
+    const containerId = explicitContainerId || this.containers.get(clusterId);
     if (!containerId) {
       return; // Already stopped or never started
     }
@@ -369,8 +372,9 @@ class IsolationManager {
    * @param {boolean} [force=false] - Force remove running container
    * @returns {Promise<void>}
    */
-  removeContainer(clusterId, force = false) {
-    const containerId = this.containers.get(clusterId);
+  removeContainer(clusterId, force = false, explicitContainerId = null) {
+    // Use explicit containerId (from restored state) or in-memory Map
+    const containerId = explicitContainerId || this.containers.get(clusterId);
     if (!containerId) {
       return;
     }

package/src/orchestrator.js CHANGED Viewed

@@ -126,9 +126,55 @@ class Orchestrator {
       const clusterIds = Object.keys(data);
       this._log(`[Orchestrator] Found ${clusterIds.length} clusters in file:`, clusterIds);
+      // Track clusters to remove (missing .db files or 0 messages)
+      const clustersToRemove = [];
+      // Track clusters with 0 messages (corrupted from SIGINT race condition)
+      const corruptedClusters = [];
       for (const [clusterId, clusterData] of Object.entries(data)) {
+        // Skip clusters whose .db file doesn't exist (orphaned registry entries)
+        const dbPath = path.join(this.storageDir, `${clusterId}.db`);
+        if (!fs.existsSync(dbPath)) {
+          console.warn(`[Orchestrator] Cluster ${clusterId} has no database file, removing from registry`);
+          clustersToRemove.push(clusterId);
+          continue;
+        }
         this._log(`[Orchestrator] Loading cluster: ${clusterId}`);
-        this._loadSingleCluster(clusterId, clusterData);
+        const cluster = this._loadSingleCluster(clusterId, clusterData);
+        // VALIDATION: Detect 0-message clusters (corrupted from SIGINT during initialization)
+        // These clusters were created before the initCompletePromise fix was applied
+        if (cluster && cluster.messageBus) {
+          const messageCount = cluster.messageBus.count({ cluster_id: clusterId });
+          if (messageCount === 0) {
+            console.warn(`[Orchestrator] ⚠️  Cluster ${clusterId} has 0 messages (corrupted)`);
+            console.warn(`[Orchestrator]    This likely occurred from SIGINT during initialization.`);
+            console.warn(`[Orchestrator]    Marking as 'corrupted' - use 'zeroshot kill ${clusterId}' to remove.`);
+            corruptedClusters.push(clusterId);
+            // Mark cluster as corrupted for visibility in status/list commands
+            cluster.state = 'corrupted';
+            cluster.corruptedReason = 'SIGINT during initialization (0 messages in ledger)';
+          }
+        }
+      }
+      // Clean up orphaned entries from clusters.json
+      if (clustersToRemove.length > 0) {
+        for (const clusterId of clustersToRemove) {
+          delete data[clusterId];
+        }
+        fs.writeFileSync(clustersFile, JSON.stringify(data, null, 2));
+        this._log(`[Orchestrator] Removed ${clustersToRemove.length} orphaned cluster(s) from registry`);
+      }
+      // Log summary of corrupted clusters
+      if (corruptedClusters.length > 0) {
+        console.warn(`\n[Orchestrator] ⚠️  Found ${corruptedClusters.length} corrupted cluster(s):`);
+        for (const clusterId of corruptedClusters) {
+          console.warn(`    - ${clusterId}`);
+        }
+        console.warn(`[Orchestrator] Run 'zeroshot clear' to remove all corrupted clusters.\n`);
       }
       this._log(`[Orchestrator] Total clusters loaded: ${this.clusters.size}`);
@@ -494,6 +540,13 @@ class Orchestrator {
     }
     // Build cluster object
+    // CRITICAL: initComplete promise ensures ISSUE_OPENED is published before stop() completes
+    // This prevents 0-message clusters from SIGINT during async initialization
+    let resolveInitComplete;
+    const initCompletePromise = new Promise((resolve) => {
+      resolveInitComplete = resolve;
+    });
     const cluster = {
       id: clusterId,
       config,
@@ -504,6 +557,9 @@ class Orchestrator {
       createdAt: Date.now(),
       // Track PID for zombie detection (this process owns the cluster)
       pid: process.pid,
+      // Initialization completion tracking (for safe SIGINT handling)
+      initCompletePromise,
+      _resolveInitComplete: resolveInitComplete,
       // Isolation state (only if enabled)
       // CRITICAL: Store workDir for resume capability - without this, resume() can't recreate container
       isolation: options.isolation
@@ -652,6 +708,12 @@ class Orchestrator {
         },
       });
+      // CRITICAL: Mark initialization complete AFTER ISSUE_OPENED is published
+      // This ensures stop() waits for at least 1 message before stopping
+      if (cluster._resolveInitComplete) {
+        cluster._resolveInitComplete();
+      }
       this._log(`Cluster ${clusterId} started with ${cluster.agents.length} agents`);
       // Watch for CLUSTER_COMPLETE message to auto-stop
@@ -818,6 +880,10 @@ class Orchestrator {
       };
     } catch (error) {
       cluster.state = 'failed';
+      // CRITICAL: Resolve the promise on failure too, so stop() doesn't hang
+      if (cluster._resolveInitComplete) {
+        cluster._resolveInitComplete();
+      }
       console.error(`Cluster ${clusterId} failed to start:`, error);
       throw error;
     }
@@ -833,6 +899,17 @@ class Orchestrator {
       throw new Error(`Cluster ${clusterId} not found`);
     }
+    // CRITICAL: Wait for initialization to complete before stopping
+    // This ensures ISSUE_OPENED is published, preventing 0-message clusters
+    // Timeout after 30s to prevent infinite hang if init truly fails
+    if (cluster.initCompletePromise && cluster.state === 'initializing') {
+      this._log(`[Orchestrator] Waiting for initialization to complete before stopping...`);
+      await Promise.race([
+        cluster.initCompletePromise,
+        new Promise((resolve) => setTimeout(resolve, 30000)),
+      ]);
+    }
     cluster.state = 'stopping';
     // Stop all agents (including subclusters which handle their own children)