@litmers/cursorflow-orchestrator 0.1.28 → 0.1.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
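Summary of the changes visible in the hunks below (0.1.28 → 0.1.30): the orchestrator adds a logFileTail() helper that prints the last lines of a failed lane's log, installs SIGINT/SIGTERM handlers that stop running lanes before exiting, pre-registers lanes in the running map before spawning, filters heartbeat output out of stall detection, and wraps the orchestration loop in try/finally so the monitor interval and signal handlers are always cleaned up. The sketch below is illustrative only, not code from the package; it shows the general signal-handling and cleanup pattern under those assumptions (the runWithCleanup name and the shape of the running map are hypothetical).

// Illustrative sketch only -- NOT code from @litmers/cursorflow-orchestrator.
// Shows the shutdown pattern the 0.1.30 diff introduces: signal handlers that
// terminate running lane processes, plus try/finally cleanup of timers/listeners.
async function runWithCleanup(running) {
    // `running` is assumed to be a Map of lane name -> { child: ChildProcess }.
    const monitor = setInterval(() => {
        console.log(`${running.size} lane(s) still running`);
    }, 60000);
    const onSignal = () => {
        console.warn('Interrupted! Stopping all lanes...');
        for (const [name, info] of running.entries()) {
            console.log(`Stopping lane: ${name}`);
            try {
                info.child.kill('SIGTERM');
            }
            catch {
                // Ignore kill errors; the process may already be gone.
            }
        }
        process.exit(130); // conventional exit code after SIGINT
    };
    process.on('SIGINT', onSignal);
    process.on('SIGTERM', onSignal);
    try {
        // The orchestration loop would run here.
        await new Promise(resolve => setTimeout(resolve, 1000));
    }
    finally {
        clearInterval(monitor);
        process.removeListener('SIGINT', onSignal);
        process.removeListener('SIGTERM', onSignal);
    }
}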
@@ -73,6 +73,27 @@ const DEFAULT_ORCHESTRATOR_STALL_CONFIG = {
  progressTimeoutMs: 10 * 60 * 1000, // 10 minutes (only triggers if no activity at all)
  maxRestarts: 2,
  };
+ /**
+ * Log the tail of a file
+ */
+ function logFileTail(filePath, lines = 10) {
+ if (!fs.existsSync(filePath))
+ return;
+ try {
+ const content = fs.readFileSync(filePath, 'utf8');
+ const allLines = content.split('\n');
+ const tail = allLines.slice(-lines).filter(l => l.trim());
+ if (tail.length > 0) {
+ logger.error(` Last ${tail.length} lines of log:`);
+ for (const line of tail) {
+ logger.error(` ${line}`);
+ }
+ }
+ }
+ catch (e) {
+ // Ignore log reading errors
+ }
+ }
  /**
  * Spawn a lane process
  */
@@ -524,417 +545,482 @@ async function orchestrate(tasksDir, options = {}) {
  const monitorInterval = setInterval(() => {
  printLaneStatus(lanes, laneRunDirs);
  }, options.pollInterval || 60000);
- while (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length || (blockedLanes.size > 0 && running.size === 0)) {
- // 1. Identify lanes ready to start
- const readyToStart = lanes.filter(lane => {
- // Not already running or completed or failed or blocked
- if (running.has(lane.name) || completedLanes.has(lane.name) || failedLanes.has(lane.name) || blockedLanes.has(lane.name)) {
- return false;
+ // Handle process interruption
+ const sigIntHandler = () => {
+ logger.warn('\n⚠️ Orchestration interrupted! Stopping all lanes...');
+ for (const [name, info] of running.entries()) {
+ logger.info(`Stopping lane: ${name}`);
+ try {
+ info.child.kill('SIGTERM');
  }
- // Check dependencies
- for (const dep of lane.dependsOn) {
- if (failedLanes.has(dep)) {
- logger.error(`Lane ${lane.name} will not start because dependency ${dep} failed`);
- failedLanes.add(lane.name);
- exitCodes[lane.name] = 1;
- return false;
- }
- if (blockedLanes.has(dep)) {
- // If a dependency is blocked, wait
- return false;
- }
- if (!completedLanes.has(dep)) {
+ catch {
+ // Ignore kill errors
+ }
+ }
+ printLaneStatus(lanes, laneRunDirs);
+ process.exit(130);
+ };
+ process.on('SIGINT', sigIntHandler);
+ process.on('SIGTERM', sigIntHandler);
+ let lastStallCheck = Date.now();
+ try {
+ while (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length || (blockedLanes.size > 0 && running.size === 0)) {
+ // 1. Identify lanes ready to start
+ const readyToStart = lanes.filter(lane => {
+ // Not already running or completed or failed or blocked
+ if (running.has(lane.name) || completedLanes.has(lane.name) || failedLanes.has(lane.name) || blockedLanes.has(lane.name)) {
  return false;
  }
- }
- return true;
- });
- // 2. Spawn ready lanes up to maxConcurrent
- for (const lane of readyToStart) {
- if (running.size >= maxConcurrent)
- break;
- const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[lane.name], 'state.json');
- // Validate and repair state before starting
- const validation = (0, state_1.validateLaneState)(laneStatePath, { autoRepair: true });
- if (!validation.valid && !validation.repaired) {
- logger.warn(`[${lane.name}] State validation issues: ${validation.issues.join(', ')}`);
- }
- logger.info(`Lane started: ${lane.name}${lane.startIndex ? ` (resuming from ${lane.startIndex})` : ''}`);
- let lastOutput = '';
- const spawnResult = spawnLane({
- laneName: lane.name,
- tasksFile: lane.path,
- laneRunDir: laneRunDirs[lane.name],
- executor: options.executor || 'cursor-agent',
- startIndex: lane.startIndex,
- pipelineBranch: `${pipelineBranch}/${lane.name}`,
- worktreeDir: laneWorktreeDirs[lane.name],
- enhancedLogConfig: options.enhancedLogging,
- noGit: options.noGit,
- onActivity: () => {
- const info = running.get(lane.name);
- if (info) {
- const now = Date.now();
- info.lastActivity = now;
- // Also reset progress tracking when there's activity (THNK/TOOL events)
- // This prevents STALL_NO_PROGRESS from firing when agent is actively working
- info.lastStateUpdate = now;
- info.stallPhase = 0; // Reset stall phase since agent is responding
+ // Check dependencies
+ for (const dep of lane.dependsOn) {
+ if (failedLanes.has(dep)) {
+ logger.error(`Lane ${lane.name} will not start because dependency ${dep} failed`);
+ failedLanes.add(lane.name);
+ exitCodes[lane.name] = 1;
+ return false;
+ }
+ if (blockedLanes.has(dep)) {
+ // If a dependency is blocked, wait
+ return false;
+ }
+ if (!completedLanes.has(dep)) {
+ return false;
  }
  }
+ return true;
  });
- // Track last output and bytes received for long operation and stall detection
- if (spawnResult.child.stdout) {
- spawnResult.child.stdout.on('data', (data) => {
- const info = running.get(lane.name);
- if (info) {
- info.lastOutput = data.toString().trim().split('\n').pop() || '';
- info.bytesReceived += data.length;
- // Update auto-recovery manager
- autoRecoveryManager.recordActivity(lane.name, data.length, info.lastOutput);
- }
+ // 2. Spawn ready lanes up to maxConcurrent
+ for (const lane of readyToStart) {
+ if (running.size >= maxConcurrent)
+ break;
+ const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[lane.name], 'state.json');
+ // Validate and repair state before starting
+ const validation = (0, state_1.validateLaneState)(laneStatePath, { autoRepair: true });
+ if (!validation.valid && !validation.repaired) {
+ logger.warn(`[${lane.name}] State validation issues: ${validation.issues.join(', ')}`);
+ }
+ logger.info(`Lane started: ${lane.name}${lane.startIndex ? ` (resuming from ${lane.startIndex})` : ''}`);
+ const now = Date.now();
+ // Pre-register lane in running map so onActivity can find it immediately
+ running.set(lane.name, {
+ child: {}, // Placeholder, will be replaced below
+ logManager: undefined,
+ logPath: '',
+ lastActivity: now,
+ lastStateUpdate: now,
+ stallPhase: 0,
+ taskStartTime: now,
+ lastOutput: '',
+ statePath: laneStatePath,
+ bytesReceived: 0,
+ lastBytesCheck: 0,
+ continueSignalsSent: 0,
  });
- }
- const now = Date.now();
- running.set(lane.name, {
- ...spawnResult,
- lastActivity: now,
- lastStateUpdate: now,
- stallPhase: 0,
- taskStartTime: now,
- lastOutput: '',
- statePath: laneStatePath,
- bytesReceived: 0,
- lastBytesCheck: 0,
- continueSignalsSent: 0,
- });
- // Register lane with auto-recovery manager
- autoRecoveryManager.registerLane(lane.name);
- // Update lane tracking
- lane.taskStartTime = now;
- events_1.events.emit('lane.started', {
- laneName: lane.name,
- pid: spawnResult.child.pid,
- logPath: spawnResult.logPath,
- });
- }
- // 3. Wait for any running lane to finish OR check for stalls
- if (running.size > 0) {
- // Polling timeout for stall detection
- let pollTimeout;
- const pollPromise = new Promise(resolve => {
- pollTimeout = setTimeout(() => resolve({ name: '__poll__', code: 0 }), 10000);
- });
- const promises = Array.from(running.entries()).map(async ([name, { child }]) => {
- const code = await waitChild(child);
- return { name, code };
- });
- const result = await Promise.race([...promises, pollPromise]);
- if (pollTimeout)
- clearTimeout(pollTimeout);
- if (result.name === '__poll__') {
- // Periodic stall check with multi-layer detection and escalating recovery
- for (const [laneName, info] of running.entries()) {
- const now = Date.now();
- const idleTime = now - info.lastActivity;
- const lane = lanes.find(l => l.name === laneName);
- // Check state file for progress updates
- let progressTime = 0;
- try {
- const stateStat = fs.statSync(info.statePath);
- const stateUpdateTime = stateStat.mtimeMs;
- if (stateUpdateTime > info.lastStateUpdate) {
- info.lastStateUpdate = stateUpdateTime;
+ let lastOutput = '';
+ const spawnResult = spawnLane({
+ laneName: lane.name,
+ tasksFile: lane.path,
+ laneRunDir: laneRunDirs[lane.name],
+ executor: options.executor || 'cursor-agent',
+ startIndex: lane.startIndex,
+ pipelineBranch: `${pipelineBranch}/${lane.name}`,
+ worktreeDir: laneWorktreeDirs[lane.name],
+ enhancedLogConfig: options.enhancedLogging,
+ noGit: options.noGit,
+ onActivity: () => {
+ const info = running.get(lane.name);
+ if (info) {
+ const actNow = Date.now();
+ info.lastActivity = actNow;
+ info.lastStateUpdate = actNow;
+ info.stallPhase = 0;
  }
- progressTime = now - info.lastStateUpdate;
- }
- catch {
- // State file might not exist yet
  }
- // Calculate bytes received since last check
- const bytesDelta = info.bytesReceived - info.lastBytesCheck;
- info.lastBytesCheck = info.bytesReceived;
- // Use multi-layer stall analysis with enhanced context
- const analysis = (0, failure_policy_1.analyzeStall)({
- stallPhase: info.stallPhase,
- idleTimeMs: idleTime,
- progressTimeMs: progressTime,
- lastOutput: info.lastOutput,
- restartCount: lane.restartCount || 0,
- taskStartTimeMs: info.taskStartTime,
- bytesReceived: bytesDelta, // Bytes since last check
- continueSignalsSent: info.continueSignalsSent,
- }, stallConfig);
- // Only act if action is not NONE
- if (analysis.action !== failure_policy_1.RecoveryAction.NONE) {
- (0, failure_policy_1.logFailure)(laneName, analysis);
- info.logManager?.log('error', analysis.message);
- if (analysis.action === failure_policy_1.RecoveryAction.CONTINUE_SIGNAL) {
- const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
- try {
- fs.writeFileSync(interventionPath, 'continue');
- info.stallPhase = 1;
- info.lastActivity = now;
- info.continueSignalsSent++;
- logger.info(`[${laneName}] Sent continue signal (#${info.continueSignalsSent})`);
- events_1.events.emit('recovery.continue_signal', {
- laneName,
- idleSeconds: Math.round(idleTime / 1000),
- signalCount: info.continueSignalsSent,
- });
+ });
+ // Update with actual spawn result
+ const existingInfo = running.get(lane.name);
+ Object.assign(existingInfo, spawnResult);
+ // Track last output and bytes received for long operation and stall detection
+ if (spawnResult.child.stdout) {
+ spawnResult.child.stdout.on('data', (data) => {
+ const info = running.get(lane.name);
+ if (info) {
+ const output = data.toString();
+ const lines = output.split('\n').filter(l => l.trim());
+ // Filter out heartbeats from activity tracking to avoid resetting stall detection
+ const realLines = lines.filter(line => !(line.includes('Heartbeat') && line.includes('bytes received')));
+ if (realLines.length > 0) {
+ // Real activity detected
+ const lastRealLine = realLines[realLines.length - 1];
+ info.lastOutput = lastRealLine;
+ info.bytesReceived += data.length;
+ // Update auto-recovery manager with real activity
+ autoRecoveryManager.recordActivity(lane.name, data.length, info.lastOutput);
  }
- catch (e) {
- logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
+ else if (lines.length > 0) {
+ // Only heartbeats received - update auto-recovery manager with 0 bytes to avoid resetting idle timer
+ autoRecoveryManager.recordActivity(lane.name, 0, info.lastOutput);
  }
  }
- else if (analysis.action === failure_policy_1.RecoveryAction.STRONGER_PROMPT) {
- const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
- const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck. Please continue with your current task immediately. If you're waiting for something, explain what you need and proceed with what you can do now. If you've completed the task, summarize your work and finish.`;
- try {
- fs.writeFileSync(interventionPath, strongerPrompt);
- info.stallPhase = 2;
- info.lastActivity = now;
- logger.warn(`[${laneName}] Sent stronger prompt after continue signal failed`);
- events_1.events.emit('recovery.stronger_prompt', { laneName });
- }
- catch (e) {
- logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
- }
+ });
+ }
+ // Register lane with auto-recovery manager
+ autoRecoveryManager.registerLane(lane.name);
+ // Update lane tracking
+ lane.taskStartTime = now;
+ events_1.events.emit('lane.started', {
+ laneName: lane.name,
+ pid: spawnResult.child.pid,
+ logPath: spawnResult.logPath,
+ });
+ }
+ // 3. Wait for any running lane to finish OR check for stalls
+ if (running.size > 0) {
+ // Polling timeout for stall detection
+ let pollTimeout;
+ const pollPromise = new Promise(resolve => {
+ pollTimeout = setTimeout(() => resolve({ name: '__poll__', code: 0 }), 10000);
+ });
+ const promises = Array.from(running.entries()).map(async ([name, { child }]) => {
+ const code = await waitChild(child);
+ return { name, code };
+ });
+ const result = await Promise.race([...promises, pollPromise]);
+ if (pollTimeout)
+ clearTimeout(pollTimeout);
+ const now = Date.now();
+ if (result.name === '__poll__' || (now - lastStallCheck >= 10000)) {
+ lastStallCheck = now;
+ // Periodic stall check with multi-layer detection and escalating recovery
+ for (const [laneName, info] of running.entries()) {
+ const idleTime = now - info.lastActivity;
+ const lane = lanes.find(l => l.name === laneName);
+ if (process.env['DEBUG_STALL']) {
+ logger.debug(`[${laneName}] Stall check: idle=${Math.round(idleTime / 1000)}s, bytesDelta=${info.bytesReceived - info.lastBytesCheck}, phase=${info.stallPhase}`);
  }
- else if (analysis.action === failure_policy_1.RecoveryAction.KILL_AND_RESTART ||
- analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE ||
- analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE_FROM_CHECKPOINT) {
- lane.restartCount = (lane.restartCount || 0) + 1;
- info.stallPhase = 3;
- // Try to get checkpoint info
- const checkpoint = (0, checkpoint_1.getLatestCheckpoint)(laneRunDirs[laneName]);
- if (checkpoint) {
- logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
- }
- // Kill the process
- try {
- info.child.kill('SIGKILL');
- }
- catch {
- // Process might already be dead
+ // Check state file for progress updates
+ let progressTime = 0;
+ try {
+ const stateStat = fs.statSync(info.statePath);
+ const stateUpdateTime = stateStat.mtimeMs;
+ if (stateUpdateTime > info.lastStateUpdate) {
+ info.lastStateUpdate = stateUpdateTime;
  }
- logger.warn(`[${laneName}] Killing and restarting lane (restart #${lane.restartCount})`);
- events_1.events.emit('recovery.restart', {
- laneName,
- restartCount: lane.restartCount,
- maxRestarts: stallConfig.maxRestarts,
- });
+ progressTime = now - info.lastStateUpdate;
  }
- else if (analysis.action === failure_policy_1.RecoveryAction.RUN_DOCTOR) {
- info.stallPhase = 4;
- // Run diagnostics
- logger.error(`[${laneName}] Running diagnostics due to persistent failures...`);
- // Import health check dynamically to avoid circular dependency
- const { checkAgentHealth, checkAuthHealth } = await Promise.resolve().then(() => __importStar(require('../utils/health')));
- const [agentHealth, authHealth] = await Promise.all([
- checkAgentHealth(),
- checkAuthHealth(),
- ]);
- const issues = [];
- if (!agentHealth.ok)
- issues.push(`Agent: ${agentHealth.message}`);
- if (!authHealth.ok)
- issues.push(`Auth: ${authHealth.message}`);
- if (issues.length > 0) {
- logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
- }
- else {
- logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
- }
- // Save diagnostic to file
- const diagnosticPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'diagnostic.json');
- fs.writeFileSync(diagnosticPath, JSON.stringify({
- timestamp: Date.now(),
- agentHealthy: agentHealth.ok,
- authHealthy: authHealth.ok,
- issues,
- analysis,
- }, null, 2));
- // Kill the process
- try {
- info.child.kill('SIGKILL');
+ catch {
+ // State file might not exist yet
+ }
+ // Calculate bytes received since last check
+ const bytesDelta = info.bytesReceived - info.lastBytesCheck;
+ info.lastBytesCheck = info.bytesReceived;
+ // Use multi-layer stall analysis with enhanced context
+ const analysis = (0, failure_policy_1.analyzeStall)({
+ stallPhase: info.stallPhase,
+ idleTimeMs: idleTime,
+ progressTimeMs: progressTime,
+ lastOutput: info.lastOutput,
+ restartCount: lane.restartCount || 0,
+ taskStartTimeMs: info.taskStartTime,
+ bytesReceived: bytesDelta, // Bytes since last check
+ continueSignalsSent: info.continueSignalsSent,
+ }, stallConfig);
+ // Only act if action is not NONE
+ if (analysis.action !== failure_policy_1.RecoveryAction.NONE) {
+ (0, failure_policy_1.logFailure)(laneName, analysis);
+ info.logManager?.log('error', analysis.message);
+ if (analysis.action === failure_policy_1.RecoveryAction.CONTINUE_SIGNAL) {
+ const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+ try {
+ fs.writeFileSync(interventionPath, 'continue');
+ info.stallPhase = 1;
+ info.lastActivity = now;
+ info.continueSignalsSent++;
+ logger.info(`[${laneName}] Sent continue signal (#${info.continueSignalsSent})`);
+ events_1.events.emit('recovery.continue_signal', {
+ laneName,
+ idleSeconds: Math.round(idleTime / 1000),
+ signalCount: info.continueSignalsSent,
+ });
+ }
+ catch (e) {
+ logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
+ }
  }
- catch {
- // Process might already be dead
+ else if (analysis.action === failure_policy_1.RecoveryAction.STRONGER_PROMPT) {
+ const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+ const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck. Please continue with your current task immediately. If you're waiting for something, explain what you need and proceed with what you can do now. If you've completed the task, summarize your work and finish.`;
+ try {
+ fs.writeFileSync(interventionPath, strongerPrompt);
+ info.stallPhase = 2;
+ info.lastActivity = now;
+ logger.warn(`[${laneName}] Sent stronger prompt after continue signal failed`);
+ events_1.events.emit('recovery.stronger_prompt', { laneName });
+ }
+ catch (e) {
+ logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
+ }
  }
- logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
- // Save POF for failed recovery
- const recoveryState = autoRecoveryManager.getState(laneName);
- if (recoveryState) {
+ else if (analysis.action === failure_policy_1.RecoveryAction.KILL_AND_RESTART ||
+ analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE ||
+ analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE_FROM_CHECKPOINT) {
+ lane.restartCount = (lane.restartCount || 0) + 1;
+ info.stallPhase = 3;
+ // Try to get checkpoint info
+ const checkpoint = (0, checkpoint_1.getLatestCheckpoint)(laneRunDirs[laneName]);
+ if (checkpoint) {
+ logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
+ }
+ // Kill the process
  try {
- const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
- const laneState = (0, state_1.loadState)(laneStatePath);
- const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
- const diagnosticInfo = {
- timestamp: Date.now(),
- agentHealthy: agentHealth.ok,
- authHealthy: authHealth.ok,
- systemHealthy: true,
- suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
- details: issues.join('\n') || 'No obvious issues found',
- };
- const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, diagnosticInfo);
- (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+ info.child.kill('SIGKILL');
  }
- catch (pofError) {
- logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+ catch {
+ // Process might already be dead
  }
+ logger.warn(`[${laneName}] Killing and restarting lane (restart #${lane.restartCount})`);
+ events_1.events.emit('recovery.restart', {
+ laneName,
+ restartCount: lane.restartCount,
+ maxRestarts: stallConfig.maxRestarts,
+ });
  }
- events_1.events.emit('recovery.diagnosed', {
- laneName,
- diagnostic: { agentHealthy: agentHealth.ok, authHealthy: authHealth.ok, issues },
- });
- }
- else if (analysis.action === failure_policy_1.RecoveryAction.ABORT_LANE) {
- info.stallPhase = 5;
- try {
- info.child.kill('SIGKILL');
+ else if (analysis.action === failure_policy_1.RecoveryAction.RUN_DOCTOR) {
+ info.stallPhase = 4;
+ // Run diagnostics
+ logger.error(`[${laneName}] Running diagnostics due to persistent failures...`);
+ // Import health check dynamically to avoid circular dependency
+ const { checkAgentHealth, checkAuthHealth } = await Promise.resolve().then(() => __importStar(require('../utils/health')));
+ const [agentHealth, authHealth] = await Promise.all([
+ checkAgentHealth(),
+ checkAuthHealth(),
+ ]);
+ const issues = [];
+ if (!agentHealth.ok)
+ issues.push(`Agent: ${agentHealth.message}`);
+ if (!authHealth.ok)
+ issues.push(`Auth: ${authHealth.message}`);
+ if (issues.length > 0) {
+ logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
+ }
+ else {
+ logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
+ }
+ // Save diagnostic to file
+ const diagnosticPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'diagnostic.json');
+ fs.writeFileSync(diagnosticPath, JSON.stringify({
+ timestamp: Date.now(),
+ agentHealthy: agentHealth.ok,
+ authHealthy: authHealth.ok,
+ issues,
+ analysis,
+ }, null, 2));
+ // Kill the process
+ try {
+ info.child.kill('SIGKILL');
+ }
+ catch {
+ // Process might already be dead
+ }
+ logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
+ // Save POF for failed recovery
+ const recoveryState = autoRecoveryManager.getState(laneName);
+ if (recoveryState) {
+ try {
+ const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
+ const laneState = (0, state_1.loadState)(laneStatePath);
+ const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
+ const diagnosticInfo = {
+ timestamp: Date.now(),
+ agentHealthy: agentHealth.ok,
+ authHealthy: authHealth.ok,
+ systemHealthy: true,
+ suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
+ details: issues.join('\n') || 'No obvious issues found',
+ };
+ const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, diagnosticInfo);
+ (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+ }
+ catch (pofError) {
+ logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+ }
+ }
+ events_1.events.emit('recovery.diagnosed', {
+ laneName,
+ diagnostic: { agentHealthy: agentHealth.ok, authHealthy: authHealth.ok, issues },
+ });
  }
- catch {
- // Process might already be dead
+ else if (analysis.action === failure_policy_1.RecoveryAction.ABORT_LANE) {
+ info.stallPhase = 5;
+ try {
+ info.child.kill('SIGKILL');
+ }
+ catch {
+ // Process might already be dead
+ }
+ logger.error(`[${laneName}] Aborting lane due to repeated stalls`);
+ // Save POF for failed recovery
+ const recoveryState = autoRecoveryManager.getState(laneName);
+ if (recoveryState) {
+ try {
+ const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
+ const laneState = (0, state_1.loadState)(laneStatePath);
+ const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
+ const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, recoveryState.diagnosticInfo);
+ (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+ }
+ catch (pofError) {
+ logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+ }
+ }
  }
- logger.error(`[${laneName}] Aborting lane due to repeated stalls`);
- // Save POF for failed recovery
- const recoveryState = autoRecoveryManager.getState(laneName);
- if (recoveryState) {
+ else if (analysis.action === failure_policy_1.RecoveryAction.SEND_GIT_GUIDANCE) {
+ // Send guidance message to agent for git issues
+ const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+ // Determine which guidance to send based on the failure type
+ let guidance;
+ if (analysis.type === failure_policy_1.FailureType.GIT_PUSH_REJECTED) {
+ guidance = (0, auto_recovery_1.getGitPushFailureGuidance)();
+ }
+ else if (analysis.type === failure_policy_1.FailureType.MERGE_CONFLICT) {
+ guidance = (0, auto_recovery_1.getMergeConflictGuidance)();
+ }
+ else {
+ guidance = (0, auto_recovery_1.getGitErrorGuidance)(analysis.message);
+ }
  try {
- const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
- const laneState = (0, state_1.loadState)(laneStatePath);
- const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
- const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, recoveryState.diagnosticInfo);
- (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+ fs.writeFileSync(interventionPath, guidance);
+ info.lastActivity = now;
+ logger.info(`[${laneName}] Sent git issue guidance to agent`);
  }
- catch (pofError) {
- logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+ catch (e) {
+ logger.error(`[${laneName}] Failed to send guidance: ${e.message}`);
  }
  }
  }
- else if (analysis.action === failure_policy_1.RecoveryAction.SEND_GIT_GUIDANCE) {
- // Send guidance message to agent for git issues
- const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
- // Determine which guidance to send based on the failure type
- let guidance;
- if (analysis.type === failure_policy_1.FailureType.GIT_PUSH_REJECTED) {
- guidance = (0, auto_recovery_1.getGitPushFailureGuidance)();
- }
- else if (analysis.type === failure_policy_1.FailureType.MERGE_CONFLICT) {
- guidance = (0, auto_recovery_1.getMergeConflictGuidance)();
- }
- else {
- guidance = (0, auto_recovery_1.getGitErrorGuidance)(analysis.message);
+ }
+ continue;
+ }
+ else {
+ const finished = result;
+ const info = running.get(finished.name);
+ running.delete(finished.name);
+ exitCodes[finished.name] = finished.code;
+ // Unregister from auto-recovery manager
+ autoRecoveryManager.unregisterLane(finished.name);
+ if (finished.code === 0) {
+ completedLanes.add(finished.name);
+ events_1.events.emit('lane.completed', {
+ laneName: finished.name,
+ exitCode: finished.code,
+ });
+ }
+ else if (finished.code === 2) {
+ // Blocked by dependency
+ const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
+ const state = (0, state_1.loadState)(statePath);
+ if (state && state.dependencyRequest) {
+ blockedLanes.set(finished.name, state.dependencyRequest);
+ const lane = lanes.find(l => l.name === finished.name);
+ if (lane) {
+ lane.startIndex = Math.max(0, state.currentTaskIndex - 1); // Task was blocked, retry it
  }
- try {
- fs.writeFileSync(interventionPath, guidance);
- info.lastActivity = now;
- logger.info(`[${laneName}] Sent git issue guidance to agent`);
+ events_1.events.emit('lane.blocked', {
+ laneName: finished.name,
+ dependencyRequest: state.dependencyRequest,
+ });
+ logger.warn(`Lane ${finished.name} is blocked on dependency change request`);
+ }
+ else {
+ failedLanes.add(finished.name);
+ logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
+ }
+ }
+ else {
+ // Check if it was a restart request
+ if (info.stallPhase === 2) {
+ logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
+ // Update startIndex from current state to resume from the same task
+ const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
+ const state = (0, state_1.loadState)(statePath);
+ if (state) {
+ const lane = lanes.find(l => l.name === finished.name);
+ if (lane) {
+ lane.startIndex = state.currentTaskIndex;
+ }
  }
- catch (e) {
- logger.error(`[${laneName}] Failed to send guidance: ${e.message}`);
+ // Note: we don't add to failedLanes or completedLanes,
+ // so it will be eligible to start again in the next iteration.
+ continue;
+ }
+ failedLanes.add(finished.name);
+ let errorMsg = 'Process exited with non-zero code';
+ if (info.stallPhase === 3) {
+ errorMsg = 'Stopped due to repeated stall';
+ }
+ else if (info.logManager) {
+ const lastError = info.logManager.getLastError();
+ if (lastError) {
+ errorMsg = `Process failed: ${lastError}`;
  }
  }
+ logger.error(`[${finished.name}] Lane failed with exit code ${finished.code}: ${errorMsg}`);
+ // Log log tail for visibility
+ if (info.logPath) {
+ logFileTail(info.logPath, 15);
+ }
+ events_1.events.emit('lane.failed', {
+ laneName: finished.name,
+ exitCode: finished.code,
+ error: errorMsg,
+ });
  }
- }
- continue;
- }
- const finished = result;
- const info = running.get(finished.name);
- running.delete(finished.name);
- exitCodes[finished.name] = finished.code;
- // Unregister from auto-recovery manager
- autoRecoveryManager.unregisterLane(finished.name);
- if (finished.code === 0) {
- completedLanes.add(finished.name);
- events_1.events.emit('lane.completed', {
- laneName: finished.name,
- exitCode: finished.code,
- });
- }
- else if (finished.code === 2) {
- // Blocked by dependency
- const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
- const state = (0, state_1.loadState)(statePath);
- if (state && state.dependencyRequest) {
- blockedLanes.set(finished.name, state.dependencyRequest);
- const lane = lanes.find(l => l.name === finished.name);
- if (lane) {
- lane.startIndex = Math.max(0, state.currentTaskIndex - 1); // Task was blocked, retry it
- }
- events_1.events.emit('lane.blocked', {
- laneName: finished.name,
- dependencyRequest: state.dependencyRequest,
- });
- logger.warn(`Lane ${finished.name} is blocked on dependency change request`);
- }
- else {
- failedLanes.add(finished.name);
- logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
+ printLaneStatus(lanes, laneRunDirs);
  }
  }
  else {
- // Check if it was a restart request
- if (info.stallPhase === 2) {
- logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
- // Update startIndex from current state to resume from the same task
- const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
- const state = (0, state_1.loadState)(statePath);
- if (state) {
- const lane = lanes.find(l => l.name === finished.name);
- if (lane) {
- lane.startIndex = state.currentTaskIndex;
+ // Nothing running. Are we blocked?
+ // Wait a bit to avoid busy-spin while waiting for dependencies or new slots
+ if (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length) {
+ await new Promise(resolve => setTimeout(resolve, 2000));
+ }
+ if (blockedLanes.size > 0 && autoResolve) {
+ logger.section('🛠 Auto-Resolving Dependencies');
+ try {
+ await resolveAllDependencies(blockedLanes, lanes, laneRunDirs, pipelineBranch, runRoot);
+ // Clear blocked status
+ blockedLanes.clear();
+ logger.success('Dependencies resolved and synced across all active lanes. Resuming...');
+ }
+ catch (error) {
+ logger.error(`Auto-resolution failed: ${error.message}`);
+ // Move blocked to failed
+ for (const name of blockedLanes.keys()) {
+ failedLanes.add(name);
  }
+ blockedLanes.clear();
  }
- // Note: we don't add to failedLanes or completedLanes,
- // so it will be eligible to start again in the next iteration.
- continue;
- }
- failedLanes.add(finished.name);
- events_1.events.emit('lane.failed', {
- laneName: finished.name,
- exitCode: finished.code,
- error: info.stallPhase === 3 ? 'Stopped due to repeated stall' : 'Process exited with non-zero code',
- });
- }
- printLaneStatus(lanes, laneRunDirs);
- }
- else {
- // Nothing running. Are we blocked?
- if (blockedLanes.size > 0 && autoResolve) {
- logger.section('🛠 Auto-Resolving Dependencies');
- try {
- await resolveAllDependencies(blockedLanes, lanes, laneRunDirs, pipelineBranch, runRoot);
- // Clear blocked status
- blockedLanes.clear();
- logger.success('Dependencies resolved and synced across all active lanes. Resuming...');
  }
- catch (error) {
- logger.error(`Auto-resolution failed: ${error.message}`);
- // Move blocked to failed
- for (const name of blockedLanes.keys()) {
- failedLanes.add(name);
+ else if (readyToStart.length === 0 && completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length) {
+ const remaining = lanes.filter(l => !completedLanes.has(l.name) && !failedLanes.has(l.name) && !blockedLanes.has(l.name));
+ logger.error(`Deadlock detected! Remaining lanes cannot start: ${remaining.map(l => l.name).join(', ')}`);
+ for (const l of remaining) {
+ failedLanes.add(l.name);
+ exitCodes[l.name] = 1;
  }
- blockedLanes.clear();
  }
- }
- else if (readyToStart.length === 0 && completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length) {
- const remaining = lanes.filter(l => !completedLanes.has(l.name) && !failedLanes.has(l.name) && !blockedLanes.has(l.name));
- logger.error(`Deadlock detected! Remaining lanes cannot start: ${remaining.map(l => l.name).join(', ')}`);
- for (const l of remaining) {
- failedLanes.add(l.name);
- exitCodes[l.name] = 1;
+ else {
+ // All finished
+ break;
  }
  }
- else {
- // All finished
- break;
- }
  }
  }
- clearInterval(monitorInterval);
+ finally {
+ clearInterval(monitorInterval);
+ process.removeListener('SIGINT', sigIntHandler);
+ process.removeListener('SIGTERM', sigIntHandler);
+ }
  printLaneStatus(lanes, laneRunDirs);
  // Check for failures
  const failed = Object.entries(exitCodes).filter(([, code]) => code !== 0 && code !== 2);