@litmers/cursorflow-orchestrator 0.1.29 → 0.1.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -73,6 +73,27 @@ const DEFAULT_ORCHESTRATOR_STALL_CONFIG = {
  progressTimeoutMs: 10 * 60 * 1000, // 10 minutes (only triggers if no activity at all)
  maxRestarts: 2,
  };
+ /**
+ * Log the tail of a file
+ */
+ function logFileTail(filePath, lines = 10) {
+ if (!fs.existsSync(filePath))
+ return;
+ try {
+ const content = fs.readFileSync(filePath, 'utf8');
+ const allLines = content.split('\n');
+ const tail = allLines.slice(-lines).filter(l => l.trim());
+ if (tail.length > 0) {
+ logger.error(` Last ${tail.length} lines of log:`);
+ for (const line of tail) {
+ logger.error(` ${line}`);
+ }
+ }
+ }
+ catch (e) {
+ // Ignore log reading errors
+ }
+ }
  /**
  * Spawn a lane process
  */
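
Note (editorial, not part of the published diff): the logFileTail helper added above is invoked further down in this diff when a lane fails, as logFileTail(info.logPath, 15). A minimal standalone usage sketch, with a hypothetical log path and assuming the module's own fs and logger imports:

logFileTail('/tmp/cursorflow/run-123/lane-a/agent.log', 15);
// Prints up to the last 15 non-empty lines of that file via logger.error(),
// and silently returns if the file does not exist or cannot be read.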
@@ -141,8 +162,11 @@ function spawnLane({ laneName, tasksFile, laneRunDir, executor, startIndex = 0,
  // or if it's NOT a noisy JSON line
  const hasTimestamp = /^\[\d{4}-\d{2}-\d{2}T|\^\[\d{2}:\d{2}:\d{2}\]/.test(trimmed);
  const isJson = trimmed.startsWith('{') || trimmed.includes('{"type"');
+ // Filter out heartbeats - they should NOT reset the idle timer
+ const isHeartbeat = trimmed.includes('Heartbeat') && trimmed.includes('bytes received');
  if (trimmed && !isJson) {
- if (onActivity)
+ // Only trigger activity for non-heartbeat lines
+ if (onActivity && !isHeartbeat)
  onActivity();
  // If line already has timestamp format, just add lane prefix
  if (hasTimestamp) {
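
Note (editorial, not part of the published diff): the heartbeat filter added above keeps periodic "Heartbeat ... bytes received" status lines from resetting the idle timer used for stall detection. Read as a standalone predicate, it behaves as sketched below; the sample log lines are hypothetical:

const isHeartbeat = (line) =>
  line.includes('Heartbeat') && line.includes('bytes received');

isHeartbeat('[12:00:05] Heartbeat: 2048 bytes received');    // true  -> does not count as activity
isHeartbeat('[12:00:06] TOOL call: read_file src/index.ts'); // false -> resets the idle timer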
@@ -524,417 +548,485 @@ async function orchestrate(tasksDir, options = {}) {
  const monitorInterval = setInterval(() => {
  printLaneStatus(lanes, laneRunDirs);
  }, options.pollInterval || 60000);
- while (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length || (blockedLanes.size > 0 && running.size === 0)) {
- // 1. Identify lanes ready to start
- const readyToStart = lanes.filter(lane => {
- // Not already running or completed or failed or blocked
- if (running.has(lane.name) || completedLanes.has(lane.name) || failedLanes.has(lane.name) || blockedLanes.has(lane.name)) {
- return false;
+ // Handle process interruption
+ const sigIntHandler = () => {
+ logger.warn('\n⚠️ Orchestration interrupted! Stopping all lanes...');
+ for (const [name, info] of running.entries()) {
+ logger.info(`Stopping lane: ${name}`);
+ try {
+ info.child.kill('SIGTERM');
  }
- // Check dependencies
- for (const dep of lane.dependsOn) {
- if (failedLanes.has(dep)) {
- logger.error(`Lane ${lane.name} will not start because dependency ${dep} failed`);
- failedLanes.add(lane.name);
- exitCodes[lane.name] = 1;
- return false;
- }
- if (blockedLanes.has(dep)) {
- // If a dependency is blocked, wait
- return false;
- }
- if (!completedLanes.has(dep)) {
+ catch {
+ // Ignore kill errors
+ }
+ }
+ printLaneStatus(lanes, laneRunDirs);
+ process.exit(130);
+ };
+ process.on('SIGINT', sigIntHandler);
+ process.on('SIGTERM', sigIntHandler);
+ let lastStallCheck = Date.now();
+ try {
+ while (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length || (blockedLanes.size > 0 && running.size === 0)) {
+ // 1. Identify lanes ready to start
+ const readyToStart = lanes.filter(lane => {
+ // Not already running or completed or failed or blocked
+ if (running.has(lane.name) || completedLanes.has(lane.name) || failedLanes.has(lane.name) || blockedLanes.has(lane.name)) {
  return false;
  }
- }
- return true;
- });
- // 2. Spawn ready lanes up to maxConcurrent
- for (const lane of readyToStart) {
- if (running.size >= maxConcurrent)
- break;
- const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[lane.name], 'state.json');
- // Validate and repair state before starting
- const validation = (0, state_1.validateLaneState)(laneStatePath, { autoRepair: true });
- if (!validation.valid && !validation.repaired) {
- logger.warn(`[${lane.name}] State validation issues: ${validation.issues.join(', ')}`);
- }
- logger.info(`Lane started: ${lane.name}${lane.startIndex ? ` (resuming from ${lane.startIndex})` : ''}`);
- let lastOutput = '';
- const spawnResult = spawnLane({
- laneName: lane.name,
- tasksFile: lane.path,
- laneRunDir: laneRunDirs[lane.name],
- executor: options.executor || 'cursor-agent',
- startIndex: lane.startIndex,
- pipelineBranch: `${pipelineBranch}/${lane.name}`,
- worktreeDir: laneWorktreeDirs[lane.name],
- enhancedLogConfig: options.enhancedLogging,
- noGit: options.noGit,
- onActivity: () => {
- const info = running.get(lane.name);
- if (info) {
- const now = Date.now();
- info.lastActivity = now;
- // Also reset progress tracking when there's activity (THNK/TOOL events)
- // This prevents STALL_NO_PROGRESS from firing when agent is actively working
- info.lastStateUpdate = now;
- info.stallPhase = 0; // Reset stall phase since agent is responding
+ // Check dependencies
+ for (const dep of lane.dependsOn) {
+ if (failedLanes.has(dep)) {
+ logger.error(`Lane ${lane.name} will not start because dependency ${dep} failed`);
+ failedLanes.add(lane.name);
+ exitCodes[lane.name] = 1;
+ return false;
+ }
+ if (blockedLanes.has(dep)) {
+ // If a dependency is blocked, wait
+ return false;
+ }
+ if (!completedLanes.has(dep)) {
+ return false;
  }
  }
+ return true;
  });
- // Track last output and bytes received for long operation and stall detection
- if (spawnResult.child.stdout) {
- spawnResult.child.stdout.on('data', (data) => {
- const info = running.get(lane.name);
- if (info) {
- info.lastOutput = data.toString().trim().split('\n').pop() || '';
- info.bytesReceived += data.length;
- // Update auto-recovery manager
- autoRecoveryManager.recordActivity(lane.name, data.length, info.lastOutput);
- }
+ // 2. Spawn ready lanes up to maxConcurrent
+ for (const lane of readyToStart) {
+ if (running.size >= maxConcurrent)
+ break;
+ const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[lane.name], 'state.json');
+ // Validate and repair state before starting
+ const validation = (0, state_1.validateLaneState)(laneStatePath, { autoRepair: true });
+ if (!validation.valid && !validation.repaired) {
+ logger.warn(`[${lane.name}] State validation issues: ${validation.issues.join(', ')}`);
+ }
+ logger.info(`Lane started: ${lane.name}${lane.startIndex ? ` (resuming from ${lane.startIndex})` : ''}`);
+ const now = Date.now();
+ // Pre-register lane in running map so onActivity can find it immediately
+ running.set(lane.name, {
+ child: {}, // Placeholder, will be replaced below
+ logManager: undefined,
+ logPath: '',
+ lastActivity: now,
+ lastStateUpdate: now,
+ stallPhase: 0,
+ taskStartTime: now,
+ lastOutput: '',
+ statePath: laneStatePath,
+ bytesReceived: 0,
+ lastBytesCheck: 0,
+ continueSignalsSent: 0,
  });
- }
- const now = Date.now();
- running.set(lane.name, {
- ...spawnResult,
- lastActivity: now,
- lastStateUpdate: now,
- stallPhase: 0,
- taskStartTime: now,
- lastOutput: '',
- statePath: laneStatePath,
- bytesReceived: 0,
- lastBytesCheck: 0,
- continueSignalsSent: 0,
- });
- // Register lane with auto-recovery manager
- autoRecoveryManager.registerLane(lane.name);
- // Update lane tracking
- lane.taskStartTime = now;
- events_1.events.emit('lane.started', {
- laneName: lane.name,
- pid: spawnResult.child.pid,
- logPath: spawnResult.logPath,
- });
- }
- // 3. Wait for any running lane to finish OR check for stalls
- if (running.size > 0) {
- // Polling timeout for stall detection
- let pollTimeout;
- const pollPromise = new Promise(resolve => {
- pollTimeout = setTimeout(() => resolve({ name: '__poll__', code: 0 }), 10000);
- });
- const promises = Array.from(running.entries()).map(async ([name, { child }]) => {
- const code = await waitChild(child);
- return { name, code };
- });
- const result = await Promise.race([...promises, pollPromise]);
- if (pollTimeout)
- clearTimeout(pollTimeout);
- if (result.name === '__poll__') {
- // Periodic stall check with multi-layer detection and escalating recovery
- for (const [laneName, info] of running.entries()) {
- const now = Date.now();
- const idleTime = now - info.lastActivity;
- const lane = lanes.find(l => l.name === laneName);
- // Check state file for progress updates
- let progressTime = 0;
- try {
- const stateStat = fs.statSync(info.statePath);
- const stateUpdateTime = stateStat.mtimeMs;
- if (stateUpdateTime > info.lastStateUpdate) {
- info.lastStateUpdate = stateUpdateTime;
+ let lastOutput = '';
+ const spawnResult = spawnLane({
+ laneName: lane.name,
+ tasksFile: lane.path,
+ laneRunDir: laneRunDirs[lane.name],
+ executor: options.executor || 'cursor-agent',
+ startIndex: lane.startIndex,
+ pipelineBranch: `${pipelineBranch}/${lane.name}`,
+ worktreeDir: laneWorktreeDirs[lane.name],
+ enhancedLogConfig: options.enhancedLogging,
+ noGit: options.noGit,
+ onActivity: () => {
+ const info = running.get(lane.name);
+ if (info) {
+ const actNow = Date.now();
+ info.lastActivity = actNow;
+ info.lastStateUpdate = actNow;
+ info.stallPhase = 0;
  }
- progressTime = now - info.lastStateUpdate;
- }
- catch {
- // State file might not exist yet
  }
- // Calculate bytes received since last check
- const bytesDelta = info.bytesReceived - info.lastBytesCheck;
- info.lastBytesCheck = info.bytesReceived;
- // Use multi-layer stall analysis with enhanced context
- const analysis = (0, failure_policy_1.analyzeStall)({
- stallPhase: info.stallPhase,
- idleTimeMs: idleTime,
- progressTimeMs: progressTime,
- lastOutput: info.lastOutput,
- restartCount: lane.restartCount || 0,
- taskStartTimeMs: info.taskStartTime,
- bytesReceived: bytesDelta, // Bytes since last check
- continueSignalsSent: info.continueSignalsSent,
- }, stallConfig);
- // Only act if action is not NONE
- if (analysis.action !== failure_policy_1.RecoveryAction.NONE) {
- (0, failure_policy_1.logFailure)(laneName, analysis);
- info.logManager?.log('error', analysis.message);
- if (analysis.action === failure_policy_1.RecoveryAction.CONTINUE_SIGNAL) {
- const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
- try {
- fs.writeFileSync(interventionPath, 'continue');
- info.stallPhase = 1;
- info.lastActivity = now;
- info.continueSignalsSent++;
- logger.info(`[${laneName}] Sent continue signal (#${info.continueSignalsSent})`);
- events_1.events.emit('recovery.continue_signal', {
- laneName,
- idleSeconds: Math.round(idleTime / 1000),
- signalCount: info.continueSignalsSent,
- });
+ });
+ // Update with actual spawn result
+ const existingInfo = running.get(lane.name);
+ Object.assign(existingInfo, spawnResult);
+ // Track last output and bytes received for long operation and stall detection
+ if (spawnResult.child.stdout) {
+ spawnResult.child.stdout.on('data', (data) => {
+ const info = running.get(lane.name);
+ if (info) {
+ const output = data.toString();
+ const lines = output.split('\n').filter(l => l.trim());
+ // Filter out heartbeats from activity tracking to avoid resetting stall detection
+ const realLines = lines.filter(line => !(line.includes('Heartbeat') && line.includes('bytes received')));
+ if (realLines.length > 0) {
+ // Real activity detected - update lastActivity to reset stall timer
+ const actNow = Date.now();
+ info.lastActivity = actNow;
+ info.stallPhase = 0; // Reset stall phase on real activity
+ const lastRealLine = realLines[realLines.length - 1];
+ info.lastOutput = lastRealLine;
+ info.bytesReceived += data.length;
+ // Update auto-recovery manager with real activity
+ autoRecoveryManager.recordActivity(lane.name, data.length, info.lastOutput);
  }
- catch (e) {
- logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
+ else if (lines.length > 0) {
+ // Only heartbeats received - do NOT update lastActivity (keep stall timer running)
+ autoRecoveryManager.recordActivity(lane.name, 0, info.lastOutput);
  }
  }
- else if (analysis.action === failure_policy_1.RecoveryAction.STRONGER_PROMPT) {
- const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
- const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck. Please continue with your current task immediately. If you're waiting for something, explain what you need and proceed with what you can do now. If you've completed the task, summarize your work and finish.`;
- try {
- fs.writeFileSync(interventionPath, strongerPrompt);
- info.stallPhase = 2;
- info.lastActivity = now;
- logger.warn(`[${laneName}] Sent stronger prompt after continue signal failed`);
- events_1.events.emit('recovery.stronger_prompt', { laneName });
- }
- catch (e) {
- logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
- }
+ });
+ }
+ // Register lane with auto-recovery manager
+ autoRecoveryManager.registerLane(lane.name);
+ // Update lane tracking
+ lane.taskStartTime = now;
+ events_1.events.emit('lane.started', {
+ laneName: lane.name,
+ pid: spawnResult.child.pid,
+ logPath: spawnResult.logPath,
+ });
+ }
+ // 3. Wait for any running lane to finish OR check for stalls
+ if (running.size > 0) {
+ // Polling timeout for stall detection
+ let pollTimeout;
+ const pollPromise = new Promise(resolve => {
+ pollTimeout = setTimeout(() => resolve({ name: '__poll__', code: 0 }), 10000);
+ });
+ const promises = Array.from(running.entries()).map(async ([name, { child }]) => {
+ const code = await waitChild(child);
+ return { name, code };
+ });
+ const result = await Promise.race([...promises, pollPromise]);
+ if (pollTimeout)
+ clearTimeout(pollTimeout);
+ const now = Date.now();
+ if (result.name === '__poll__' || (now - lastStallCheck >= 10000)) {
+ lastStallCheck = now;
+ // Periodic stall check with multi-layer detection and escalating recovery
+ for (const [laneName, info] of running.entries()) {
+ const idleTime = now - info.lastActivity;
+ const lane = lanes.find(l => l.name === laneName);
+ if (process.env['DEBUG_STALL']) {
+ logger.debug(`[${laneName}] Stall check: idle=${Math.round(idleTime / 1000)}s, bytesDelta=${info.bytesReceived - info.lastBytesCheck}, phase=${info.stallPhase}`);
  }
- else if (analysis.action === failure_policy_1.RecoveryAction.KILL_AND_RESTART ||
- analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE ||
- analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE_FROM_CHECKPOINT) {
- lane.restartCount = (lane.restartCount || 0) + 1;
- info.stallPhase = 3;
- // Try to get checkpoint info
- const checkpoint = (0, checkpoint_1.getLatestCheckpoint)(laneRunDirs[laneName]);
- if (checkpoint) {
- logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
- }
- // Kill the process
- try {
- info.child.kill('SIGKILL');
- }
- catch {
- // Process might already be dead
+ // Check state file for progress updates
+ let progressTime = 0;
+ try {
+ const stateStat = fs.statSync(info.statePath);
+ const stateUpdateTime = stateStat.mtimeMs;
+ if (stateUpdateTime > info.lastStateUpdate) {
+ info.lastStateUpdate = stateUpdateTime;
  }
- logger.warn(`[${laneName}] Killing and restarting lane (restart #${lane.restartCount})`);
- events_1.events.emit('recovery.restart', {
- laneName,
- restartCount: lane.restartCount,
- maxRestarts: stallConfig.maxRestarts,
- });
+ progressTime = now - info.lastStateUpdate;
  }
- else if (analysis.action === failure_policy_1.RecoveryAction.RUN_DOCTOR) {
- info.stallPhase = 4;
- // Run diagnostics
- logger.error(`[${laneName}] Running diagnostics due to persistent failures...`);
- // Import health check dynamically to avoid circular dependency
- const { checkAgentHealth, checkAuthHealth } = await Promise.resolve().then(() => __importStar(require('../utils/health')));
- const [agentHealth, authHealth] = await Promise.all([
- checkAgentHealth(),
- checkAuthHealth(),
- ]);
- const issues = [];
- if (!agentHealth.ok)
- issues.push(`Agent: ${agentHealth.message}`);
- if (!authHealth.ok)
- issues.push(`Auth: ${authHealth.message}`);
- if (issues.length > 0) {
- logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
- }
- else {
- logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
- }
- // Save diagnostic to file
- const diagnosticPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'diagnostic.json');
- fs.writeFileSync(diagnosticPath, JSON.stringify({
- timestamp: Date.now(),
- agentHealthy: agentHealth.ok,
- authHealthy: authHealth.ok,
- issues,
- analysis,
- }, null, 2));
- // Kill the process
- try {
- info.child.kill('SIGKILL');
+ catch {
+ // State file might not exist yet
+ }
+ // Calculate bytes received since last check
+ const bytesDelta = info.bytesReceived - info.lastBytesCheck;
+ info.lastBytesCheck = info.bytesReceived;
+ // Use multi-layer stall analysis with enhanced context
+ const analysis = (0, failure_policy_1.analyzeStall)({
+ stallPhase: info.stallPhase,
+ idleTimeMs: idleTime,
+ progressTimeMs: progressTime,
+ lastOutput: info.lastOutput,
+ restartCount: lane.restartCount || 0,
+ taskStartTimeMs: info.taskStartTime,
+ bytesReceived: bytesDelta, // Bytes since last check
+ continueSignalsSent: info.continueSignalsSent,
+ }, stallConfig);
+ // Only act if action is not NONE
+ if (analysis.action !== failure_policy_1.RecoveryAction.NONE) {
+ (0, failure_policy_1.logFailure)(laneName, analysis);
+ info.logManager?.log('error', analysis.message);
+ if (analysis.action === failure_policy_1.RecoveryAction.CONTINUE_SIGNAL) {
+ const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+ try {
+ fs.writeFileSync(interventionPath, 'continue');
+ info.stallPhase = 1;
+ info.lastActivity = now;
+ info.continueSignalsSent++;
+ logger.info(`[${laneName}] Sent continue signal (#${info.continueSignalsSent})`);
+ events_1.events.emit('recovery.continue_signal', {
+ laneName,
+ idleSeconds: Math.round(idleTime / 1000),
+ signalCount: info.continueSignalsSent,
+ });
+ }
+ catch (e) {
+ logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
+ }
  }
- catch {
- // Process might already be dead
+ else if (analysis.action === failure_policy_1.RecoveryAction.STRONGER_PROMPT) {
+ const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+ const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck. Please continue with your current task immediately. If you're waiting for something, explain what you need and proceed with what you can do now. If you've completed the task, summarize your work and finish.`;
+ try {
+ fs.writeFileSync(interventionPath, strongerPrompt);
+ info.stallPhase = 2;
+ info.lastActivity = now;
+ logger.warn(`[${laneName}] Sent stronger prompt after continue signal failed`);
+ events_1.events.emit('recovery.stronger_prompt', { laneName });
+ }
+ catch (e) {
+ logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
+ }
  }
- logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
- // Save POF for failed recovery
- const recoveryState = autoRecoveryManager.getState(laneName);
- if (recoveryState) {
+ else if (analysis.action === failure_policy_1.RecoveryAction.KILL_AND_RESTART ||
+ analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE ||
+ analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE_FROM_CHECKPOINT) {
+ lane.restartCount = (lane.restartCount || 0) + 1;
+ info.stallPhase = 3;
+ // Try to get checkpoint info
+ const checkpoint = (0, checkpoint_1.getLatestCheckpoint)(laneRunDirs[laneName]);
+ if (checkpoint) {
+ logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
+ }
+ // Kill the process
  try {
- const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
- const laneState = (0, state_1.loadState)(laneStatePath);
- const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
- const diagnosticInfo = {
- timestamp: Date.now(),
- agentHealthy: agentHealth.ok,
- authHealthy: authHealth.ok,
- systemHealthy: true,
- suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
- details: issues.join('\n') || 'No obvious issues found',
- };
- const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, diagnosticInfo);
- (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+ info.child.kill('SIGKILL');
  }
- catch (pofError) {
- logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+ catch {
+ // Process might already be dead
  }
+ logger.warn(`[${laneName}] Killing and restarting lane (restart #${lane.restartCount})`);
+ events_1.events.emit('recovery.restart', {
+ laneName,
+ restartCount: lane.restartCount,
+ maxRestarts: stallConfig.maxRestarts,
+ });
  }
- events_1.events.emit('recovery.diagnosed', {
- laneName,
- diagnostic: { agentHealthy: agentHealth.ok, authHealthy: authHealth.ok, issues },
- });
- }
- else if (analysis.action === failure_policy_1.RecoveryAction.ABORT_LANE) {
- info.stallPhase = 5;
- try {
- info.child.kill('SIGKILL');
+ else if (analysis.action === failure_policy_1.RecoveryAction.RUN_DOCTOR) {
+ info.stallPhase = 4;
+ // Run diagnostics
+ logger.error(`[${laneName}] Running diagnostics due to persistent failures...`);
+ // Import health check dynamically to avoid circular dependency
+ const { checkAgentHealth, checkAuthHealth } = await Promise.resolve().then(() => __importStar(require('../utils/health')));
+ const [agentHealth, authHealth] = await Promise.all([
+ checkAgentHealth(),
+ checkAuthHealth(),
+ ]);
+ const issues = [];
+ if (!agentHealth.ok)
+ issues.push(`Agent: ${agentHealth.message}`);
+ if (!authHealth.ok)
+ issues.push(`Auth: ${authHealth.message}`);
+ if (issues.length > 0) {
+ logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
+ }
+ else {
+ logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
+ }
+ // Save diagnostic to file
+ const diagnosticPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'diagnostic.json');
+ fs.writeFileSync(diagnosticPath, JSON.stringify({
+ timestamp: Date.now(),
+ agentHealthy: agentHealth.ok,
+ authHealthy: authHealth.ok,
+ issues,
+ analysis,
+ }, null, 2));
+ // Kill the process
+ try {
+ info.child.kill('SIGKILL');
+ }
+ catch {
+ // Process might already be dead
+ }
+ logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
+ // Save POF for failed recovery
+ const recoveryState = autoRecoveryManager.getState(laneName);
+ if (recoveryState) {
+ try {
+ const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
+ const laneState = (0, state_1.loadState)(laneStatePath);
+ const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
+ const diagnosticInfo = {
+ timestamp: Date.now(),
+ agentHealthy: agentHealth.ok,
+ authHealthy: authHealth.ok,
+ systemHealthy: true,
+ suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
+ details: issues.join('\n') || 'No obvious issues found',
+ };
+ const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, diagnosticInfo);
+ (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+ }
+ catch (pofError) {
+ logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+ }
+ }
+ events_1.events.emit('recovery.diagnosed', {
+ laneName,
+ diagnostic: { agentHealthy: agentHealth.ok, authHealthy: authHealth.ok, issues },
+ });
  }
- catch {
- // Process might already be dead
+ else if (analysis.action === failure_policy_1.RecoveryAction.ABORT_LANE) {
+ info.stallPhase = 5;
+ try {
+ info.child.kill('SIGKILL');
+ }
+ catch {
+ // Process might already be dead
+ }
+ logger.error(`[${laneName}] Aborting lane due to repeated stalls`);
+ // Save POF for failed recovery
+ const recoveryState = autoRecoveryManager.getState(laneName);
+ if (recoveryState) {
+ try {
+ const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
+ const laneState = (0, state_1.loadState)(laneStatePath);
+ const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
+ const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, recoveryState.diagnosticInfo);
+ (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+ }
+ catch (pofError) {
+ logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+ }
+ }
  }
- logger.error(`[${laneName}] Aborting lane due to repeated stalls`);
- // Save POF for failed recovery
- const recoveryState = autoRecoveryManager.getState(laneName);
- if (recoveryState) {
+ else if (analysis.action === failure_policy_1.RecoveryAction.SEND_GIT_GUIDANCE) {
+ // Send guidance message to agent for git issues
+ const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+ // Determine which guidance to send based on the failure type
+ let guidance;
+ if (analysis.type === failure_policy_1.FailureType.GIT_PUSH_REJECTED) {
+ guidance = (0, auto_recovery_1.getGitPushFailureGuidance)();
+ }
+ else if (analysis.type === failure_policy_1.FailureType.MERGE_CONFLICT) {
+ guidance = (0, auto_recovery_1.getMergeConflictGuidance)();
+ }
+ else {
+ guidance = (0, auto_recovery_1.getGitErrorGuidance)(analysis.message);
+ }
  try {
- const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
- const laneState = (0, state_1.loadState)(laneStatePath);
- const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
- const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, recoveryState.diagnosticInfo);
- (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+ fs.writeFileSync(interventionPath, guidance);
+ info.lastActivity = now;
+ logger.info(`[${laneName}] Sent git issue guidance to agent`);
  }
- catch (pofError) {
- logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+ catch (e) {
+ logger.error(`[${laneName}] Failed to send guidance: ${e.message}`);
  }
  }
  }
- else if (analysis.action === failure_policy_1.RecoveryAction.SEND_GIT_GUIDANCE) {
- // Send guidance message to agent for git issues
- const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
- // Determine which guidance to send based on the failure type
- let guidance;
- if (analysis.type === failure_policy_1.FailureType.GIT_PUSH_REJECTED) {
- guidance = (0, auto_recovery_1.getGitPushFailureGuidance)();
- }
- else if (analysis.type === failure_policy_1.FailureType.MERGE_CONFLICT) {
- guidance = (0, auto_recovery_1.getMergeConflictGuidance)();
- }
- else {
- guidance = (0, auto_recovery_1.getGitErrorGuidance)(analysis.message);
+ }
+ continue;
+ }
+ else {
+ const finished = result;
+ const info = running.get(finished.name);
+ running.delete(finished.name);
+ exitCodes[finished.name] = finished.code;
+ // Unregister from auto-recovery manager
+ autoRecoveryManager.unregisterLane(finished.name);
+ if (finished.code === 0) {
+ completedLanes.add(finished.name);
+ events_1.events.emit('lane.completed', {
+ laneName: finished.name,
+ exitCode: finished.code,
+ });
+ }
+ else if (finished.code === 2) {
+ // Blocked by dependency
+ const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
+ const state = (0, state_1.loadState)(statePath);
+ if (state && state.dependencyRequest) {
+ blockedLanes.set(finished.name, state.dependencyRequest);
+ const lane = lanes.find(l => l.name === finished.name);
+ if (lane) {
+ lane.startIndex = Math.max(0, state.currentTaskIndex - 1); // Task was blocked, retry it
  }
- try {
- fs.writeFileSync(interventionPath, guidance);
- info.lastActivity = now;
- logger.info(`[${laneName}] Sent git issue guidance to agent`);
+ events_1.events.emit('lane.blocked', {
+ laneName: finished.name,
+ dependencyRequest: state.dependencyRequest,
+ });
+ logger.warn(`Lane ${finished.name} is blocked on dependency change request`);
+ }
+ else {
+ failedLanes.add(finished.name);
+ logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
+ }
+ }
+ else {
+ // Check if it was a restart request
+ if (info.stallPhase === 2) {
+ logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
+ // Update startIndex from current state to resume from the same task
+ const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
+ const state = (0, state_1.loadState)(statePath);
+ if (state) {
+ const lane = lanes.find(l => l.name === finished.name);
+ if (lane) {
+ lane.startIndex = state.currentTaskIndex;
+ }
  }
- catch (e) {
- logger.error(`[${laneName}] Failed to send guidance: ${e.message}`);
+ // Note: we don't add to failedLanes or completedLanes,
+ // so it will be eligible to start again in the next iteration.
+ continue;
+ }
+ failedLanes.add(finished.name);
+ let errorMsg = 'Process exited with non-zero code';
+ if (info.stallPhase === 3) {
+ errorMsg = 'Stopped due to repeated stall';
+ }
+ else if (info.logManager) {
+ const lastError = info.logManager.getLastError();
+ if (lastError) {
+ errorMsg = `Process failed: ${lastError}`;
  }
  }
+ logger.error(`[${finished.name}] Lane failed with exit code ${finished.code}: ${errorMsg}`);
+ // Log log tail for visibility
+ if (info.logPath) {
+ logFileTail(info.logPath, 15);
+ }
+ events_1.events.emit('lane.failed', {
+ laneName: finished.name,
+ exitCode: finished.code,
+ error: errorMsg,
+ });
  }
- }
- continue;
- }
- const finished = result;
- const info = running.get(finished.name);
- running.delete(finished.name);
- exitCodes[finished.name] = finished.code;
- // Unregister from auto-recovery manager
- autoRecoveryManager.unregisterLane(finished.name);
- if (finished.code === 0) {
- completedLanes.add(finished.name);
- events_1.events.emit('lane.completed', {
- laneName: finished.name,
- exitCode: finished.code,
- });
- }
- else if (finished.code === 2) {
- // Blocked by dependency
- const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
- const state = (0, state_1.loadState)(statePath);
- if (state && state.dependencyRequest) {
- blockedLanes.set(finished.name, state.dependencyRequest);
- const lane = lanes.find(l => l.name === finished.name);
- if (lane) {
- lane.startIndex = Math.max(0, state.currentTaskIndex - 1); // Task was blocked, retry it
- }
- events_1.events.emit('lane.blocked', {
- laneName: finished.name,
- dependencyRequest: state.dependencyRequest,
- });
- logger.warn(`Lane ${finished.name} is blocked on dependency change request`);
- }
- else {
- failedLanes.add(finished.name);
- logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
+ printLaneStatus(lanes, laneRunDirs);
  }
  }
  else {
- // Check if it was a restart request
- if (info.stallPhase === 2) {
- logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
- // Update startIndex from current state to resume from the same task
- const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
- const state = (0, state_1.loadState)(statePath);
- if (state) {
- const lane = lanes.find(l => l.name === finished.name);
- if (lane) {
- lane.startIndex = state.currentTaskIndex;
+ // Nothing running. Are we blocked?
+ // Wait a bit to avoid busy-spin while waiting for dependencies or new slots
+ if (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length) {
+ await new Promise(resolve => setTimeout(resolve, 2000));
+ }
+ if (blockedLanes.size > 0 && autoResolve) {
+ logger.section('🛠 Auto-Resolving Dependencies');
+ try {
+ await resolveAllDependencies(blockedLanes, lanes, laneRunDirs, pipelineBranch, runRoot);
+ // Clear blocked status
+ blockedLanes.clear();
+ logger.success('Dependencies resolved and synced across all active lanes. Resuming...');
+ }
+ catch (error) {
+ logger.error(`Auto-resolution failed: ${error.message}`);
+ // Move blocked to failed
+ for (const name of blockedLanes.keys()) {
+ failedLanes.add(name);
  }
+ blockedLanes.clear();
  }
- // Note: we don't add to failedLanes or completedLanes,
- // so it will be eligible to start again in the next iteration.
- continue;
- }
- failedLanes.add(finished.name);
- events_1.events.emit('lane.failed', {
- laneName: finished.name,
- exitCode: finished.code,
- error: info.stallPhase === 3 ? 'Stopped due to repeated stall' : 'Process exited with non-zero code',
- });
- }
- printLaneStatus(lanes, laneRunDirs);
- }
- else {
- // Nothing running. Are we blocked?
- if (blockedLanes.size > 0 && autoResolve) {
- logger.section('🛠 Auto-Resolving Dependencies');
- try {
- await resolveAllDependencies(blockedLanes, lanes, laneRunDirs, pipelineBranch, runRoot);
- // Clear blocked status
- blockedLanes.clear();
- logger.success('Dependencies resolved and synced across all active lanes. Resuming...');
  }
- catch (error) {
- logger.error(`Auto-resolution failed: ${error.message}`);
- // Move blocked to failed
- for (const name of blockedLanes.keys()) {
- failedLanes.add(name);
+ else if (readyToStart.length === 0 && completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length) {
+ const remaining = lanes.filter(l => !completedLanes.has(l.name) && !failedLanes.has(l.name) && !blockedLanes.has(l.name));
+ logger.error(`Deadlock detected! Remaining lanes cannot start: ${remaining.map(l => l.name).join(', ')}`);
+ for (const l of remaining) {
+ failedLanes.add(l.name);
+ exitCodes[l.name] = 1;
  }
- blockedLanes.clear();
  }
- }
- else if (readyToStart.length === 0 && completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length) {
- const remaining = lanes.filter(l => !completedLanes.has(l.name) && !failedLanes.has(l.name) && !blockedLanes.has(l.name));
- logger.error(`Deadlock detected! Remaining lanes cannot start: ${remaining.map(l => l.name).join(', ')}`);
- for (const l of remaining) {
- failedLanes.add(l.name);
- exitCodes[l.name] = 1;
+ else {
+ // All finished
+ break;
  }
  }
- else {
- // All finished
- break;
- }
  }
  }
- clearInterval(monitorInterval);
+ finally {
+ clearInterval(monitorInterval);
+ process.removeListener('SIGINT', sigIntHandler);
+ process.removeListener('SIGTERM', sigIntHandler);
+ }
  printLaneStatus(lanes, laneRunDirs);
  // Check for failures
  const failed = Object.entries(exitCodes).filter(([, code]) => code !== 0 && code !== 2);
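
Note (editorial, not part of the published diff): the exit-code handling in this hunk reduces to a small mapping: code 0 marks a lane completed, code 2 marks it blocked when its state file carries a dependencyRequest (otherwise failed), and any other code marks it failed unless the process was deliberately killed for a restart. A condensed, illustrative restatement; the function name is hypothetical:

function classifyLaneExit(code, hasDependencyRequest, killedForRestart) {
  if (code === 0) return 'completed';
  if (code === 2) return hasDependencyRequest ? 'blocked' : 'failed';
  return killedForRestart ? 'restarting' : 'failed';
}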