@covibes/zeroshot 5.2.1 → 5.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. package/CHANGELOG.md +174 -189
  2. package/README.md +199 -248
  3. package/cli/commands/providers.js +150 -0
  4. package/cli/index.js +214 -58
  5. package/cli/lib/first-run.js +40 -3
  6. package/cluster-templates/base-templates/debug-workflow.json +24 -78
  7. package/cluster-templates/base-templates/full-workflow.json +44 -145
  8. package/cluster-templates/base-templates/single-worker.json +23 -15
  9. package/cluster-templates/base-templates/worker-validator.json +47 -34
  10. package/cluster-templates/conductor-bootstrap.json +7 -5
  11. package/lib/docker-config.js +6 -1
  12. package/lib/provider-detection.js +59 -0
  13. package/lib/provider-names.js +56 -0
  14. package/lib/settings.js +191 -6
  15. package/lib/stream-json-parser.js +4 -238
  16. package/package.json +21 -5
  17. package/scripts/validate-templates.js +100 -0
  18. package/src/agent/agent-config.js +37 -13
  19. package/src/agent/agent-context-builder.js +64 -2
  20. package/src/agent/agent-hook-executor.js +82 -9
  21. package/src/agent/agent-lifecycle.js +53 -14
  22. package/src/agent/agent-task-executor.js +196 -194
  23. package/src/agent/output-extraction.js +200 -0
  24. package/src/agent/output-reformatter.js +175 -0
  25. package/src/agent/schema-utils.js +111 -0
  26. package/src/agent-wrapper.js +102 -30
  27. package/src/agents/git-pusher-agent.json +1 -1
  28. package/src/claude-task-runner.js +80 -30
  29. package/src/config-router.js +13 -13
  30. package/src/config-validator.js +231 -10
  31. package/src/github.js +36 -0
  32. package/src/isolation-manager.js +243 -154
  33. package/src/ledger.js +28 -6
  34. package/src/orchestrator.js +391 -96
  35. package/src/preflight.js +85 -82
  36. package/src/providers/anthropic/cli-builder.js +45 -0
  37. package/src/providers/anthropic/index.js +134 -0
  38. package/src/providers/anthropic/models.js +23 -0
  39. package/src/providers/anthropic/output-parser.js +159 -0
  40. package/src/providers/base-provider.js +181 -0
  41. package/src/providers/capabilities.js +51 -0
  42. package/src/providers/google/cli-builder.js +55 -0
  43. package/src/providers/google/index.js +116 -0
  44. package/src/providers/google/models.js +24 -0
  45. package/src/providers/google/output-parser.js +92 -0
  46. package/src/providers/index.js +75 -0
  47. package/src/providers/openai/cli-builder.js +122 -0
  48. package/src/providers/openai/index.js +135 -0
  49. package/src/providers/openai/models.js +21 -0
  50. package/src/providers/openai/output-parser.js +129 -0
  51. package/src/sub-cluster-wrapper.js +18 -3
  52. package/src/task-runner.js +8 -6
  53. package/src/tui/layout.js +20 -3
  54. package/task-lib/attachable-watcher.js +80 -78
  55. package/task-lib/claude-recovery.js +119 -0
  56. package/task-lib/commands/list.js +1 -1
  57. package/task-lib/commands/resume.js +3 -2
  58. package/task-lib/commands/run.js +12 -3
  59. package/task-lib/runner.js +59 -38
  60. package/task-lib/scheduler.js +2 -2
  61. package/task-lib/store.js +43 -30
  62. package/task-lib/watcher.js +81 -62
@@ -13,6 +13,26 @@ const fs = require('fs');
13
13
  const path = require('path');
14
14
  const os = require('os');
15
15
  const lockfile = require('proper-lockfile');
16
+
17
// Stale lock timeout in ms - a lock whose mtime is older than this is treated as orphaned
const LOCK_STALE_MS = 5000;

/**
 * Remove a lock left behind by a crashed process.
 *
 * The lock is considered stale when its mtime is more than LOCK_STALE_MS in the past.
 * The lock may be a plain file or a directory (proper-lockfile creates its lock with
 * mkdir), so removal must work for both; unlinkSync alone fails on directories and the
 * failure would be silently swallowed below.
 *
 * Best effort: never throws. If cleanup fails, the subsequent lock acquisition is what
 * surfaces the real error to the caller.
 *
 * @param {string} lockPath - Path of the lock file or lock directory
 * @returns {void}
 */
function cleanStaleLock(lockPath) {
  try {
    // Single stat instead of existsSync + statSync: no window between check and read.
    const stats = fs.statSync(lockPath, { throwIfNoEntry: false });
    if (!stats) {
      return; // No lock present - nothing to clean
    }

    const age = Date.now() - stats.mtimeMs;
    if (age > LOCK_STALE_MS) {
      // recursive: handles directory locks; force: tolerate a concurrent removal
      fs.rmSync(lockPath, { recursive: true, force: true });
    }
  } catch {
    // Ignore - another process may have cleaned it, or the lock is not removable by us
  }
}
16
36
  const AgentWrapper = require('./agent-wrapper');
17
37
  const SubClusterWrapper = require('./sub-cluster-wrapper');
18
38
  const MessageBus = require('./message-bus');
@@ -22,6 +42,21 @@ const IsolationManager = require('./isolation-manager');
22
42
  const { generateName } = require('./name-generator');
23
43
  const configValidator = require('./config-validator');
24
44
  const TemplateResolver = require('./template-resolver');
45
+ const { loadSettings } = require('../lib/settings');
46
+ const { normalizeProviderName } = require('../lib/provider-names');
47
+ const crypto = require('crypto');
48
+
49
/**
 * Force an agent config onto a single model.
 *
 * Mutates `agentConfig` in place: sets `model` to the override and removes any truthy
 * `modelRules` / `modelConfig` entries. Does nothing when `modelOverride` is falsy.
 *
 * @param {Object} agentConfig - Agent configuration object (modified in place)
 * @param {string|null|undefined} modelOverride - Model to force; falsy means no override
 * @returns {void}
 */
function applyModelOverride(agentConfig, modelOverride) {
  const hasOverride = Boolean(modelOverride);
  if (!hasOverride) {
    return;
  }

  agentConfig.model = modelOverride;

  // Only truthy entries are removed; a key holding a falsy value is left untouched.
  for (const key of ['modelRules', 'modelConfig']) {
    if (agentConfig[key]) {
      delete agentConfig[key];
    }
  }
}
25
60
 
26
61
  /**
27
62
  * Operation Chain Schema
@@ -66,10 +101,23 @@ class Orchestrator {
66
101
  // Track if orchestrator is closed (prevents _saveClusters race conditions during cleanup)
67
102
  this.closed = false;
68
103
 
69
- // Load existing clusters from disk (skip if explicitly disabled)
104
+ // Track if clusters are loaded (for lazy loading pattern)
105
+ this._clustersLoaded = options.skipLoad === true;
106
+ }
107
+
108
+ /**
109
+ * Factory method for async initialization
110
+ * Use this instead of `new Orchestrator()` for proper async cluster loading
111
+ * @param {Object} options - Same options as constructor
112
+ * @returns {Promise<Orchestrator>}
113
+ */
114
+ static async create(options = {}) {
115
+ const instance = new Orchestrator({ ...options, skipLoad: true });
70
116
  if (options.skipLoad !== true) {
71
- this._loadClusters();
117
+ await instance._loadClusters();
118
+ instance._clustersLoaded = true;
72
119
  }
120
+ return instance;
73
121
  }
74
122
 
75
123
  /**
@@ -87,7 +135,7 @@ class Orchestrator {
87
135
  * Uses file locking for consistent reads
88
136
  * @private
89
137
  */
90
- _loadClusters() {
138
+ async _loadClusters() {
91
139
  const clustersFile = path.join(this.storageDir, 'clusters.json');
92
140
  this._log(`[Orchestrator] Loading clusters from: ${clustersFile}`);
93
141
 
@@ -100,30 +148,20 @@ class Orchestrator {
100
148
  let release;
101
149
 
102
150
  try {
103
- // Acquire lock (sync API doesn't support retries, so we retry manually)
104
- const maxAttempts = 20;
105
- const retryDelayMs = 100;
106
-
107
- for (let attempt = 0; attempt < maxAttempts; attempt++) {
108
- try {
109
- release = lockfile.lockSync(clustersFile, {
110
- lockfilePath,
111
- stale: 30000,
112
- });
113
- break; // Lock acquired
114
- } catch (lockErr) {
115
- if (lockErr.code === 'ELOCKED' && attempt < maxAttempts - 1) {
116
- // Wait and retry
117
- const waitMs = retryDelayMs + Math.random() * retryDelayMs;
118
- const start = Date.now();
119
- while (Date.now() - start < waitMs) {
120
- /* spin wait */
121
- }
122
- continue;
123
- }
124
- throw lockErr;
125
- }
126
- }
151
+ // Clean stale locks from crashed processes
152
+ cleanStaleLock(lockfilePath);
153
+
154
+ // Acquire lock with async API (proper retries without CPU spin-wait)
155
+ release = await lockfile.lock(clustersFile, {
156
+ lockfilePath,
157
+ stale: LOCK_STALE_MS,
158
+ retries: {
159
+ retries: 20,
160
+ minTimeout: 100,
161
+ maxTimeout: 200,
162
+ randomize: true,
163
+ },
164
+ });
127
165
 
128
166
  const data = JSON.parse(fs.readFileSync(clustersFile, 'utf8'));
129
167
  const clusterIds = Object.keys(data);
@@ -138,7 +176,9 @@ class Orchestrator {
138
176
  // Skip clusters whose .db file doesn't exist (orphaned registry entries)
139
177
  const dbPath = path.join(this.storageDir, `${clusterId}.db`);
140
178
  if (!fs.existsSync(dbPath)) {
141
- console.warn(`[Orchestrator] Cluster ${clusterId} has no database file, removing from registry`);
179
+ console.warn(
180
+ `[Orchestrator] Cluster ${clusterId} has no database file, removing from registry`
181
+ );
142
182
  clustersToRemove.push(clusterId);
143
183
  continue;
144
184
  }
@@ -152,8 +192,12 @@ class Orchestrator {
152
192
  const messageCount = cluster.messageBus.count({ cluster_id: clusterId });
153
193
  if (messageCount === 0) {
154
194
  console.warn(`[Orchestrator] ⚠️ Cluster ${clusterId} has 0 messages (corrupted)`);
155
- console.warn(`[Orchestrator] This likely occurred from SIGINT during initialization.`);
156
- console.warn(`[Orchestrator] Marking as 'corrupted' - use 'zeroshot kill ${clusterId}' to remove.`);
195
+ console.warn(
196
+ `[Orchestrator] This likely occurred from SIGINT during initialization.`
197
+ );
198
+ console.warn(
199
+ `[Orchestrator] Marking as 'corrupted' - use 'zeroshot kill ${clusterId}' to remove.`
200
+ );
157
201
  corruptedClusters.push(clusterId);
158
202
  // Mark cluster as corrupted for visibility in status/list commands
159
203
  cluster.state = 'corrupted';
@@ -168,12 +212,16 @@ class Orchestrator {
168
212
  delete data[clusterId];
169
213
  }
170
214
  fs.writeFileSync(clustersFile, JSON.stringify(data, null, 2));
171
- this._log(`[Orchestrator] Removed ${clustersToRemove.length} orphaned cluster(s) from registry`);
215
+ this._log(
216
+ `[Orchestrator] Removed ${clustersToRemove.length} orphaned cluster(s) from registry`
217
+ );
172
218
  }
173
219
 
174
220
  // Log summary of corrupted clusters
175
221
  if (corruptedClusters.length > 0) {
176
- console.warn(`\n[Orchestrator] ⚠️ Found ${corruptedClusters.length} corrupted cluster(s):`);
222
+ console.warn(
223
+ `\n[Orchestrator] ⚠️ Found ${corruptedClusters.length} corrupted cluster(s):`
224
+ );
177
225
  for (const clusterId of corruptedClusters) {
178
226
  console.warn(` - ${clusterId}`);
179
227
  }
@@ -186,7 +234,7 @@ class Orchestrator {
186
234
  console.error(error.stack);
187
235
  } finally {
188
236
  if (release) {
189
- release();
237
+ await release();
190
238
  }
191
239
  }
192
240
  }
@@ -247,9 +295,14 @@ class Orchestrator {
247
295
  this._log(`[Orchestrator] Fixed missing cwd for agent ${agentConfig.id}: ${agentCwd}`);
248
296
  }
249
297
 
298
+ if (clusterData.modelOverride) {
299
+ applyModelOverride(agentConfig, clusterData.modelOverride);
300
+ }
301
+
250
302
  const agentOptions = {
251
303
  id: clusterId,
252
304
  quiet: this.quiet,
305
+ modelOverride: clusterData.modelOverride || null,
253
306
  };
254
307
 
255
308
  // Inject isolation context if enabled (MUST be done during agent creation)
@@ -291,6 +344,7 @@ class Orchestrator {
291
344
  messageBus,
292
345
  agents,
293
346
  isolation,
347
+ autoPr: clusterData.autoPr || false,
294
348
  };
295
349
 
296
350
  this.clusters.set(clusterId, cluster);
@@ -316,7 +370,7 @@ class Orchestrator {
316
370
  * Uses file locking to prevent race conditions with other processes
317
371
  * @private
318
372
  */
319
- _saveClusters() {
373
+ async _saveClusters() {
320
374
  // Skip saving if orchestrator is closed (prevents race conditions during cleanup)
321
375
  if (this.closed) {
322
376
  return;
@@ -327,30 +381,20 @@ class Orchestrator {
327
381
  let release;
328
382
 
329
383
  try {
330
- // Acquire exclusive lock (sync API doesn't support retries, so we retry manually)
331
- const maxAttempts = 50;
332
- const retryDelayMs = 100;
333
-
334
- for (let attempt = 0; attempt < maxAttempts; attempt++) {
335
- try {
336
- release = lockfile.lockSync(clustersFile, {
337
- lockfilePath,
338
- stale: 30000, // Lock expires after 30s (in case process dies)
339
- });
340
- break; // Lock acquired
341
- } catch (lockErr) {
342
- if (lockErr.code === 'ELOCKED' && attempt < maxAttempts - 1) {
343
- // Wait and retry with jitter
344
- const waitMs = retryDelayMs + Math.random() * retryDelayMs * 2;
345
- const start = Date.now();
346
- while (Date.now() - start < waitMs) {
347
- /* spin wait */
348
- }
349
- continue;
350
- }
351
- throw lockErr;
352
- }
353
- }
384
+ // Clean stale locks from crashed processes
385
+ cleanStaleLock(lockfilePath);
386
+
387
+ // Acquire exclusive lock with async API (proper retries without CPU spin-wait)
388
+ release = await lockfile.lock(clustersFile, {
389
+ lockfilePath,
390
+ stale: LOCK_STALE_MS,
391
+ retries: {
392
+ retries: 50,
393
+ minTimeout: 100,
394
+ maxTimeout: 300,
395
+ randomize: true,
396
+ },
397
+ });
354
398
 
355
399
  // Read existing clusters from file (other processes may have added clusters)
356
400
  let existingClusters = {};
@@ -391,6 +435,10 @@ class Orchestrator {
391
435
  pid: cluster.state === 'running' ? cluster.pid : null,
392
436
  // Persist failure info for resume capability
393
437
  failureInfo: cluster.failureInfo || null,
438
+ // Persist PR mode for completion agent selection
439
+ autoPr: cluster.autoPr || false,
440
+ // Persist model override for consistent agent spawning on resume
441
+ modelOverride: cluster.modelOverride || null,
394
442
  // Persist isolation info (excluding manager instance which can't be serialized)
395
443
  // CRITICAL: workDir is required for resume() to recreate container with same workspace
396
444
  isolation: cluster.isolation
@@ -423,7 +471,7 @@ class Orchestrator {
423
471
  } finally {
424
472
  // Always release lock
425
473
  if (release) {
426
- release();
474
+ await release();
427
475
  }
428
476
  }
429
477
  }
@@ -445,11 +493,14 @@ class Orchestrator {
445
493
  try {
446
494
  if (!fs.existsSync(clustersFile)) return;
447
495
 
496
+ // Clean stale locks from crashed processes
497
+ cleanStaleLock(lockfilePath);
498
+
448
499
  // Try to acquire lock once (polling is best-effort, will retry on next cycle)
449
500
  try {
450
501
  release = lockfile.lockSync(clustersFile, {
451
502
  lockfilePath,
452
- stale: 30000,
503
+ stale: LOCK_STALE_MS,
453
504
  });
454
505
  } catch (lockErr) {
455
506
  // Lock busy - skip this poll cycle, try again next interval
@@ -541,7 +592,9 @@ class Orchestrator {
541
592
  isolation: options.isolation || false,
542
593
  isolationImage: options.isolationImage,
543
594
  worktree: options.worktree || false,
544
- autoPr: process.env.ZEROSHOT_PR === '1',
595
+ autoPr: options.autoPr || process.env.ZEROSHOT_PR === '1',
596
+ modelOverride: options.modelOverride, // Model override for all agents
597
+ clusterId: options.clusterId, // Explicit ID from CLI/daemon parent
545
598
  });
546
599
  }
547
600
 
@@ -550,8 +603,15 @@ class Orchestrator {
550
603
  * @private
551
604
  */
552
605
  async _startInternal(config, input = {}, options = {}) {
553
- // Use pre-generated ID from parent process, or generate new one
554
- const clusterId = process.env.ZEROSHOT_CLUSTER_ID || generateName('cluster');
606
+ // Generate a unique cluster ID for this process call.
607
+ // IMPORTANT: Do NOT implicitly reuse ZEROSHOT_CLUSTER_ID, because:
608
+ // - test harnesses may set it globally (breaking multi-start tests)
609
+ // - callers may start multiple clusters in one process
610
+ // Use it only when explicitly passed (CLI/daemon parent) via options.clusterId.
611
+ const clusterId = this._generateUniqueClusterId(
612
+ options.clusterId || null,
613
+ config?.dbPath || null
614
+ );
555
615
 
556
616
  // Create ledger and message bus with persistent storage
557
617
  const dbPath = config.dbPath || path.join(this.storageDir, `${clusterId}.db`);
@@ -579,6 +639,9 @@ class Orchestrator {
579
639
  // Create container with workspace mounted
580
640
  // CRITICAL: Use options.cwd (git repo root) instead of process.cwd()
581
641
  const workDir = options.cwd || process.cwd();
642
+ const providerName = normalizeProviderName(
643
+ config.forceProvider || config.defaultProvider || loadSettings().defaultProvider || 'claude'
644
+ );
582
645
  containerId = await isolationManager.createContainer(clusterId, {
583
646
  workDir,
584
647
  image,
@@ -586,6 +649,7 @@ class Orchestrator {
586
649
  noMounts: options.noMounts,
587
650
  mounts: options.mounts,
588
651
  containerHome: options.containerHome,
652
+ provider: providerName,
589
653
  });
590
654
  this._log(`[Orchestrator] Container created: ${containerId} (workDir: ${workDir})`);
591
655
  } else if (options.worktree) {
@@ -621,6 +685,9 @@ class Orchestrator {
621
685
  // Initialization completion tracking (for safe SIGINT handling)
622
686
  initCompletePromise,
623
687
  _resolveInitComplete: resolveInitComplete,
688
+ autoPr: options.autoPr || false,
689
+ // Model override for all agents (applied to dynamically added agents)
690
+ modelOverride: options.modelOverride || null,
624
691
  // Isolation state (only if enabled)
625
692
  // CRITICAL: Store workDir for resume capability - without this, resume() can't recreate container
626
693
  isolation: options.isolation
@@ -648,7 +715,7 @@ class Orchestrator {
648
715
  this.clusters.set(clusterId, cluster);
649
716
 
650
717
  try {
651
- // Fetch input (GitHub issue or text)
718
+ // Fetch input (GitHub issue, file, or text)
652
719
  let inputData;
653
720
  if (input.issue) {
654
721
  inputData = await GitHub.fetchIssue(input.issue);
@@ -656,10 +723,13 @@ class Orchestrator {
656
723
  if (inputData.url) {
657
724
  this._log(`[Orchestrator] Issue: ${inputData.url}`);
658
725
  }
726
+ } else if (input.file) {
727
+ inputData = GitHub.createFileInput(input.file);
728
+ this._log(`[Orchestrator] File: ${input.file}`);
659
729
  } else if (input.text) {
660
730
  inputData = GitHub.createTextInput(input.text);
661
731
  } else {
662
- throw new Error('Either issue or text input is required');
732
+ throw new Error('Either issue, file, or text input is required');
663
733
  }
664
734
 
665
735
  // Inject git-pusher agent if --pr is set (replaces completion-detector)
@@ -686,7 +756,9 @@ class Orchestrator {
686
756
  }
687
757
 
688
758
  // Inject workers instruction if --workers explicitly provided and > 1
689
- const workersCount = process.env.ZEROSHOT_WORKERS ? parseInt(process.env.ZEROSHOT_WORKERS) : 0;
759
+ const workersCount = process.env.ZEROSHOT_WORKERS
760
+ ? parseInt(process.env.ZEROSHOT_WORKERS)
761
+ : 0;
690
762
  if (workersCount > 1) {
691
763
  const workerAgent = config.agents.find((a) => a.id === 'worker');
692
764
  if (workerAgent) {
@@ -716,9 +788,16 @@ class Orchestrator {
716
788
  agentConfig.cwd = agentCwd;
717
789
  }
718
790
 
791
+ // Apply model override if set (for consistency across all agents)
792
+ if (options.modelOverride) {
793
+ applyModelOverride(agentConfig, options.modelOverride);
794
+ this._log(` [model] Overridden model for ${agentConfig.id}: ${options.modelOverride}`);
795
+ }
796
+
719
797
  const agentOptions = {
720
798
  testMode: options.testMode || !!this.taskRunner, // Enable testMode if taskRunner provided
721
799
  quiet: this.quiet,
800
+ modelOverride: options.modelOverride || null,
722
801
  };
723
802
 
724
803
  // Inject mock spawn function if provided (legacy mockExecutor API)
@@ -777,8 +856,8 @@ class Orchestrator {
777
856
  //
778
857
  // ORDER:
779
858
  // 1. Register subscriptions (lines below)
780
- // 2. Start agents (line ~XXX)
781
- // 3. Publish ISSUE_OPENED (line ~XXX)
859
+ // 2. Start agents
860
+ // 3. Publish ISSUE_OPENED
782
861
  //
783
862
  // DO NOT move subscriptions after agent.start() - this will reintroduce
784
863
  // the race condition fixed in issue #31.
@@ -827,12 +906,12 @@ class Orchestrator {
827
906
  });
828
907
 
829
908
  // Watch for AGENT_ERROR - if critical agent fails, stop cluster
830
- subscribeToClusterTopic('AGENT_ERROR', (message) => {
909
+ subscribeToClusterTopic('AGENT_ERROR', async (message) => {
831
910
  const agentRole = message.content?.data?.role;
832
911
  const attempts = message.content?.data?.attempts || 1;
833
912
 
834
913
  // Save cluster state to persist failureInfo
835
- this._saveClusters();
914
+ await await this._saveClusters();
836
915
 
837
916
  // Only stop cluster if non-validator agent exhausted retries
838
917
  if (agentRole === 'implementation' && attempts >= 3) {
@@ -852,15 +931,19 @@ class Orchestrator {
852
931
  });
853
932
 
854
933
  // Persist agent state changes for accurate status display
855
- messageBus.on('topic:AGENT_LIFECYCLE', (message) => {
934
+ messageBus.on('topic:AGENT_LIFECYCLE', async (message) => {
856
935
  const event = message.content?.data?.event;
857
936
  // Save on key state transitions that affect status display
858
937
  if (
859
- ['TASK_STARTED', 'TASK_COMPLETED', 'PROCESS_SPAWNED', 'TASK_ID_ASSIGNED', 'STARTED'].includes(
860
- event
861
- )
938
+ [
939
+ 'TASK_STARTED',
940
+ 'TASK_COMPLETED',
941
+ 'PROCESS_SPAWNED',
942
+ 'TASK_ID_ASSIGNED',
943
+ 'STARTED',
944
+ ].includes(event)
862
945
  ) {
863
- this._saveClusters();
946
+ await await this._saveClusters();
864
947
  }
865
948
  });
866
949
 
@@ -876,11 +959,75 @@ class Orchestrator {
876
959
  `⚠️ Orchestrator: Agent ${agentId} appears stale (${Math.round(timeSinceLastOutput / 1000)}s no output) but will NOT be killed`
877
960
  );
878
961
  this._log(` Analysis: ${analysis}`);
879
- this._log(` Manual intervention may be needed - use 'zeroshot resume ${clusterId}' if stuck`);
962
+ this._log(
963
+ ` Manual intervention may be needed - use 'zeroshot resume ${clusterId}' if stuck`
964
+ );
965
+ });
966
+
967
+ // CONDUCTOR WATCHDOG: If conductor completes but CLUSTER_OPERATIONS never arrives, FAIL FAST
968
+ // This catches the silent failure where conductor outputs result but hook fails to publish
969
+ const CONDUCTOR_WATCHDOG_TIMEOUT_MS = 30000; // 30 seconds
970
+ let conductorWatchdogTimer = null;
971
+ let conductorCompletedAt = null;
972
+
973
+ // Start watchdog when conductor completes
974
+ subscribeToClusterTopic('AGENT_LIFECYCLE', (message) => {
975
+ const event = message.content?.data?.event;
976
+ const role = message.content?.data?.role;
977
+
978
+ // Conductor completed - start watchdog
979
+ if (event === 'TASK_COMPLETED' && role === 'conductor') {
980
+ conductorCompletedAt = Date.now();
981
+ this._log(
982
+ `⏱️ Conductor completed. Watchdog started - expecting CLUSTER_OPERATIONS within ${CONDUCTOR_WATCHDOG_TIMEOUT_MS / 1000}s`
983
+ );
984
+
985
+ conductorWatchdogTimer = setTimeout(() => {
986
+ // Check if CLUSTER_OPERATIONS was received
987
+ const clusterOps = messageBus.query({ topic: 'CLUSTER_OPERATIONS', limit: 1 });
988
+ if (clusterOps.length === 0) {
989
+ console.error(`\n${'='.repeat(80)}`);
990
+ console.error(`🔴 CONDUCTOR WATCHDOG TRIGGERED - CLUSTER_OPERATIONS NEVER RECEIVED`);
991
+ console.error(`${'='.repeat(80)}`);
992
+ console.error(
993
+ `Conductor completed ${CONDUCTOR_WATCHDOG_TIMEOUT_MS / 1000}s ago but no CLUSTER_OPERATIONS`
994
+ );
995
+ console.error(`This indicates the conductor's onComplete hook FAILED SILENTLY`);
996
+ console.error(
997
+ `Check: 1) Result parsing 2) Transform script errors 3) Schema validation`
998
+ );
999
+ console.error(`${'='.repeat(80)}\n`);
1000
+
1001
+ // Publish CLUSTER_FAILED to stop the cluster
1002
+ messageBus.publish({
1003
+ cluster_id: clusterId,
1004
+ topic: 'CLUSTER_FAILED',
1005
+ sender: 'orchestrator',
1006
+ content: {
1007
+ text: `Conductor completed but CLUSTER_OPERATIONS never published - hook failure`,
1008
+ data: {
1009
+ reason: 'CONDUCTOR_WATCHDOG_TIMEOUT',
1010
+ conductorCompletedAt,
1011
+ timeoutMs: CONDUCTOR_WATCHDOG_TIMEOUT_MS,
1012
+ },
1013
+ },
1014
+ });
1015
+ }
1016
+ }, CONDUCTOR_WATCHDOG_TIMEOUT_MS);
1017
+ }
880
1018
  });
881
1019
 
882
1020
  // Watch for CLUSTER_OPERATIONS - dynamic agent spawn/removal/update
883
1021
  subscribeToClusterTopic('CLUSTER_OPERATIONS', (message) => {
1022
+ // Clear conductor watchdog - CLUSTER_OPERATIONS received successfully
1023
+ if (conductorWatchdogTimer) {
1024
+ clearTimeout(conductorWatchdogTimer);
1025
+ conductorWatchdogTimer = null;
1026
+ const elapsed = conductorCompletedAt ? Date.now() - conductorCompletedAt : 0;
1027
+ this._log(
1028
+ `✅ CLUSTER_OPERATIONS received (${elapsed}ms after conductor completed) - watchdog cleared`
1029
+ );
1030
+ }
884
1031
  let operations = message.content?.data?.operations;
885
1032
 
886
1033
  // Parse operations if they came as a JSON string
@@ -961,7 +1108,7 @@ class Orchestrator {
961
1108
  },
962
1109
  },
963
1110
  metadata: {
964
- source: input.issue ? 'github' : 'text',
1111
+ source: input.issue ? 'github' : input.file ? 'file' : 'text',
965
1112
  },
966
1113
  });
967
1114
 
@@ -989,7 +1136,7 @@ class Orchestrator {
989
1136
  // ^^^^^^ REMOVED - clusters run until explicitly stopped or completed
990
1137
 
991
1138
  // Save cluster to disk
992
- this._saveClusters();
1139
+ await this._saveClusters();
993
1140
 
994
1141
  return {
995
1142
  id: clusterId,
@@ -1009,6 +1156,42 @@ class Orchestrator {
1009
1156
  }
1010
1157
  }
1011
1158
 
1159
+ /**
1160
+ * Generate a unique cluster ID, safe for concurrent starts in-process.
1161
+ * If an explicit ID is provided, uses it as a base and suffixes on collision.
1162
+ * @private
1163
+ */
1164
+ _generateUniqueClusterId(explicitId, explicitDbPath) {
1165
+ const baseId = explicitId || generateName('cluster');
1166
+ const baseDbPath = explicitDbPath || path.join(this.storageDir, `${baseId}.db`);
1167
+
1168
+ // Fast path: base is unused.
1169
+ if (!this.clusters.has(baseId) && !fs.existsSync(baseDbPath)) {
1170
+ return baseId;
1171
+ }
1172
+
1173
+ // Collision: suffix with random bytes to avoid race conditions under concurrency.
1174
+ for (let attempt = 0; attempt < 50; attempt++) {
1175
+ const suffix = crypto.randomBytes(3).toString('hex');
1176
+ const candidateId = `${baseId}-${suffix}`;
1177
+ const candidateDbPath = explicitDbPath || path.join(this.storageDir, `${candidateId}.db`);
1178
+ if (!this.clusters.has(candidateId) && !fs.existsSync(candidateDbPath)) {
1179
+ return candidateId;
1180
+ }
1181
+ }
1182
+
1183
+ // Last resort: new generated name (should never happen).
1184
+ for (let attempt = 0; attempt < 50; attempt++) {
1185
+ const candidateId = generateName('cluster');
1186
+ const candidateDbPath = explicitDbPath || path.join(this.storageDir, `${candidateId}.db`);
1187
+ if (!this.clusters.has(candidateId) && !fs.existsSync(candidateDbPath)) {
1188
+ return candidateId;
1189
+ }
1190
+ }
1191
+
1192
+ throw new Error('Failed to generate unique cluster ID after many attempts');
1193
+ }
1194
+
1012
1195
  /**
1013
1196
  * Stop a cluster
1014
1197
  * @param {String} clusterId - Cluster ID
@@ -1040,7 +1223,9 @@ class Orchestrator {
1040
1223
  // Clean up isolation container if enabled
1041
1224
  // CRITICAL: Preserve workspace for resume capability - only delete on kill()
1042
1225
  if (cluster.isolation?.manager) {
1043
- this._log(`[Orchestrator] Stopping isolation container for ${clusterId} (preserving workspace for resume)...`);
1226
+ this._log(
1227
+ `[Orchestrator] Stopping isolation container for ${clusterId} (preserving workspace for resume)...`
1228
+ );
1044
1229
  await cluster.isolation.manager.cleanup(clusterId, { preserveWorkspace: true });
1045
1230
  this._log(`[Orchestrator] Container stopped, workspace preserved`);
1046
1231
  }
@@ -1058,7 +1243,7 @@ class Orchestrator {
1058
1243
  this._log(`Cluster ${clusterId} stopped`);
1059
1244
 
1060
1245
  // Save updated state
1061
- this._saveClusters();
1246
+ await this._saveClusters();
1062
1247
  }
1063
1248
 
1064
1249
  /**
@@ -1080,7 +1265,9 @@ class Orchestrator {
1080
1265
 
1081
1266
  // Force remove isolation container AND workspace (full cleanup, no resume)
1082
1267
  if (cluster.isolation?.manager) {
1083
- this._log(`[Orchestrator] Force removing isolation container and workspace for ${clusterId}...`);
1268
+ this._log(
1269
+ `[Orchestrator] Force removing isolation container and workspace for ${clusterId}...`
1270
+ );
1084
1271
  await cluster.isolation.manager.cleanup(clusterId, { preserveWorkspace: false });
1085
1272
  this._log(`[Orchestrator] Container and workspace removed`);
1086
1273
  }
@@ -1104,7 +1291,7 @@ class Orchestrator {
1104
1291
  this._log(`Cluster ${clusterId} killed`);
1105
1292
 
1106
1293
  // Save updated state (will be marked as 'killed' in file)
1107
- this._saveClusters();
1294
+ await this._saveClusters();
1108
1295
 
1109
1296
  // Now remove from memory after persisting
1110
1297
  this.clusters.delete(clusterId);
@@ -1184,6 +1371,7 @@ class Orchestrator {
1184
1371
  cluster_id: clusterId,
1185
1372
  topic: 'AGENT_ERROR',
1186
1373
  limit: 10,
1374
+ order: 'desc',
1187
1375
  });
1188
1376
 
1189
1377
  if (errors.length > 0) {
@@ -1220,7 +1408,9 @@ class Orchestrator {
1220
1408
  // The isolated workspace at /tmp/zeroshot-isolated/{clusterId} was preserved by stop()
1221
1409
  const workDir = cluster.isolation.workDir;
1222
1410
  if (!workDir) {
1223
- throw new Error(`Cannot resume cluster ${clusterId}: workDir not saved in isolation state`);
1411
+ throw new Error(
1412
+ `Cannot resume cluster ${clusterId}: workDir not saved in isolation state`
1413
+ );
1224
1414
  }
1225
1415
 
1226
1416
  // Check if isolated workspace still exists (it should, if stop() was used)
@@ -1232,10 +1422,17 @@ class Orchestrator {
1232
1422
  );
1233
1423
  }
1234
1424
 
1425
+ const providerName = normalizeProviderName(
1426
+ cluster.config?.forceProvider ||
1427
+ cluster.config?.defaultProvider ||
1428
+ loadSettings().defaultProvider ||
1429
+ 'claude'
1430
+ );
1235
1431
  const newContainerId = await cluster.isolation.manager.createContainer(clusterId, {
1236
1432
  workDir, // Use saved workDir, NOT process.cwd()
1237
1433
  image: cluster.isolation.image,
1238
1434
  reuseExistingWorkspace: true, // CRITICAL: Don't wipe existing work
1435
+ provider: providerName,
1239
1436
  });
1240
1437
 
1241
1438
  this._log(`[Orchestrator] New container created: ${newContainerId}`);
@@ -1280,10 +1477,13 @@ class Orchestrator {
1280
1477
  }
1281
1478
 
1282
1479
  // Query recent messages from ledger to provide context
1283
- const recentMessages = cluster.messageBus.query({
1284
- cluster_id: clusterId,
1285
- limit: 50,
1286
- });
1480
+ const recentMessages = cluster.messageBus
1481
+ .query({
1482
+ cluster_id: clusterId,
1483
+ limit: 50,
1484
+ order: 'desc',
1485
+ })
1486
+ .reverse();
1287
1487
 
1288
1488
  // CASE 1: Failed cluster - Resume the failed agent with error context
1289
1489
  if (failureInfo) {
@@ -1317,7 +1517,7 @@ class Orchestrator {
1317
1517
  cluster.failureInfo = null;
1318
1518
 
1319
1519
  // Save updated state
1320
- this._saveClusters();
1520
+ await this._saveClusters();
1321
1521
 
1322
1522
  // Resume the failed agent
1323
1523
  failedAgent.resume(context).catch((err) => {
@@ -1438,7 +1638,7 @@ class Orchestrator {
1438
1638
  }
1439
1639
 
1440
1640
  // Save updated state
1441
- this._saveClusters();
1641
+ await this._saveClusters();
1442
1642
 
1443
1643
  this._log(`[Orchestrator] Cluster ${clusterId} resumed`);
1444
1644
 
@@ -1505,10 +1705,13 @@ Continue from where you left off. Review your previous output to understand what
1505
1705
  `.trim();
1506
1706
 
1507
1707
  // Get recent context from ledger
1508
- const recentMessages = cluster.messageBus.query({
1509
- cluster_id: cluster.id,
1510
- limit: 10,
1511
- });
1708
+ const recentMessages = cluster.messageBus
1709
+ .query({
1710
+ cluster_id: cluster.id,
1711
+ limit: 10,
1712
+ order: 'desc',
1713
+ })
1714
+ .reverse();
1512
1715
 
1513
1716
  const contextText = recentMessages
1514
1717
  .map((m) => `[${m.sender}] ${m.content?.text || JSON.stringify(m.content)}`)
@@ -1685,7 +1888,7 @@ Continue from where you left off. Review your previous output to understand what
1685
1888
  });
1686
1889
 
1687
1890
  // Save updated cluster state to disk
1688
- this._saveClusters();
1891
+ await this._saveClusters();
1689
1892
  }
1690
1893
 
1691
1894
  /**
@@ -1709,6 +1912,12 @@ Continue from where you left off. Review your previous output to understand what
1709
1912
  agentConfig.cwd = agentCwd;
1710
1913
  this._log(` [cwd] Injected worktree cwd for ${agentConfig.id}: ${agentCwd}`);
1711
1914
  }
1915
+
1916
+ // Apply model override if set (for consistency with initial agents)
1917
+ if (cluster.modelOverride) {
1918
+ applyModelOverride(agentConfig, cluster.modelOverride);
1919
+ this._log(` [model] Overridden model for ${agentConfig.id}: ${cluster.modelOverride}`);
1920
+ }
1712
1921
  // Validate agent config has required fields
1713
1922
  if (!agentConfig.id) {
1714
1923
  throw new Error('Agent config missing required field: id');
@@ -1731,6 +1940,7 @@ Continue from where you left off. Review your previous output to understand what
1731
1940
  const agentOptions = {
1732
1941
  testMode: !!this.taskRunner, // Enable testMode if taskRunner provided
1733
1942
  quiet: this.quiet,
1943
+ modelOverride: cluster.modelOverride || null,
1734
1944
  };
1735
1945
 
1736
1946
  // TaskRunner DI - propagate to dynamically spawned agents
@@ -1905,6 +2115,88 @@ Continue from where you left off. Review your previous output to understand what
1905
2115
  await this._opAddAgents(cluster, { agents: loadedConfig.agents }, context);
1906
2116
 
1907
2117
  this._log(` ✓ Config loaded (${loadedConfig.agents.length} agents)`);
2118
+
2119
+ // Inject completion agent (templates don't include one - orchestrator controls termination)
2120
+ await this._injectCompletionAgent(cluster, context);
2121
+ }
2122
+
2123
+ /**
2124
+ * Inject appropriate completion agent based on mode
2125
+ * Templates define work, orchestrator controls termination strategy
2126
+ * @private
2127
+ */
2128
+ async _injectCompletionAgent(cluster, context) {
2129
+ // Skip if completion agent already exists
2130
+ const hasCompletionAgent = cluster.agents.some(
2131
+ (a) => a.config?.id === 'completion-detector' || a.config?.id === 'git-pusher'
2132
+ );
2133
+ if (hasCompletionAgent) {
2134
+ return;
2135
+ }
2136
+
2137
+ const isPrMode = cluster.autoPr || process.env.ZEROSHOT_PR === '1';
2138
+
2139
+ if (isPrMode) {
2140
+ // Load git-pusher for PR mode
2141
+ const gitPusherPath = path.join(__dirname, 'agents', 'git-pusher-agent.json');
2142
+ const gitPusherConfig = JSON.parse(fs.readFileSync(gitPusherPath, 'utf8'));
2143
+
2144
+ // Get issue context from ledger
2145
+ const issueMsg = cluster.messageBus.ledger.findLast({ topic: 'ISSUE_OPENED' });
2146
+ const issueNumber = issueMsg?.content?.data?.number || 'unknown';
2147
+ const issueTitle = issueMsg?.content?.data?.title || 'Implementation';
2148
+
2149
+ // Inject placeholders
2150
+ gitPusherConfig.prompt = gitPusherConfig.prompt
2151
+ .replace(/\{\{issue_number\}\}/g, issueNumber)
2152
+ .replace(/\{\{issue_title\}\}/g, issueTitle);
2153
+
2154
+ await this._opAddAgents(cluster, { agents: [gitPusherConfig] }, context);
2155
+ this._log(` [--pr mode] Injected git-pusher agent`);
2156
+ } else {
2157
+ // Default completion-detector
2158
+ const completionDetector = {
2159
+ id: 'completion-detector',
2160
+ role: 'orchestrator',
2161
+ model: 'haiku',
2162
+ timeout: 0,
2163
+ triggers: [
2164
+ {
2165
+ topic: 'VALIDATION_RESULT',
2166
+ logic: {
2167
+ engine: 'javascript',
2168
+ script: `const validators = cluster.getAgentsByRole('validator');
2169
+ const lastPush = ledger.findLast({ topic: 'IMPLEMENTATION_READY' });
2170
+ if (!lastPush) return false;
2171
+ if (validators.length === 0) return true;
2172
+
2173
+ const validatorIds = new Set(validators.map((v) => v.id));
2174
+ const results = ledger.query({ topic: 'VALIDATION_RESULT', since: lastPush.timestamp });
2175
+
2176
+ const latestByValidator = new Map();
2177
+ for (const msg of results) {
2178
+ if (!validatorIds.has(msg.sender)) continue;
2179
+ latestByValidator.set(msg.sender, msg);
2180
+ }
2181
+
2182
+ if (latestByValidator.size < validators.length) return false;
2183
+
2184
+ for (const validator of validators) {
2185
+ const msg = latestByValidator.get(validator.id);
2186
+ const approved = msg?.content?.data?.approved;
2187
+ if (!(approved === true || approved === 'true')) return false;
2188
+ }
2189
+
2190
+ return true;`,
2191
+ },
2192
+ action: 'stop_cluster',
2193
+ },
2194
+ ],
2195
+ };
2196
+
2197
+ await this._opAddAgents(cluster, { agents: [completionDetector] }, context);
2198
+ this._log(` Injected completion-detector agent`);
2199
+ }
1908
2200
  }
1909
2201
 
1910
2202
  /**
@@ -2047,7 +2339,7 @@ Continue from where you left off. Review your previous output to understand what
2047
2339
  * @private
2048
2340
  */
2049
2341
  _exportMarkdown(cluster, clusterId, messages) {
2050
- const { parseChunk } = require('../lib/stream-json-parser');
2342
+ const { parseProviderChunk } = require('./providers');
2051
2343
 
2052
2344
  // Find task info
2053
2345
  const issueOpened = messages.find((m) => m.topic === 'ISSUE_OPENED');
@@ -2093,7 +2385,10 @@ Continue from where you left off. Review your previous output to understand what
2093
2385
  const content = msg.content?.data?.line || msg.content?.data?.chunk || msg.content?.text;
2094
2386
  if (!content) continue;
2095
2387
 
2096
- const events = parseChunk(content);
2388
+ const provider = normalizeProviderName(
2389
+ msg.content?.data?.provider || msg.sender_provider || 'claude'
2390
+ );
2391
+ const events = parseProviderChunk(provider, content);
2097
2392
  for (const event of events) {
2098
2393
  switch (event.type) {
2099
2394
  case 'text':