@litmers/cursorflow-orchestrator 0.1.28 → 0.1.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -2
- package/dist/cli/clean.js +122 -0
- package/dist/cli/clean.js.map +1 -1
- package/dist/cli/prepare.js +0 -83
- package/dist/cli/prepare.js.map +1 -1
- package/dist/core/auto-recovery.js +3 -1
- package/dist/core/auto-recovery.js.map +1 -1
- package/dist/core/failure-policy.js +7 -1
- package/dist/core/failure-policy.js.map +1 -1
- package/dist/core/orchestrator.js +452 -366
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/utils/config.js +3 -1
- package/dist/utils/config.js.map +1 -1
- package/dist/utils/enhanced-logger.d.ts +5 -1
- package/dist/utils/enhanced-logger.js +65 -20
- package/dist/utils/enhanced-logger.js.map +1 -1
- package/dist/utils/git.d.ts +25 -0
- package/dist/utils/git.js +97 -2
- package/dist/utils/git.js.map +1 -1
- package/package.json +11 -3
- package/scripts/local-security-gate.sh +37 -7
- package/scripts/release.sh +15 -0
- package/src/cli/clean.ts +146 -0
- package/src/cli/prepare.ts +0 -93
- package/src/core/auto-recovery.ts +3 -1
- package/src/core/failure-policy.ts +8 -1
- package/src/core/orchestrator.ts +175 -82
- package/src/utils/config.ts +3 -1
- package/src/utils/enhanced-logger.ts +61 -20
- package/src/utils/git.ts +115 -2
@@ -73,6 +73,27 @@ const DEFAULT_ORCHESTRATOR_STALL_CONFIG = {
     progressTimeoutMs: 10 * 60 * 1000, // 10 minutes (only triggers if no activity at all)
     maxRestarts: 2,
 };
+/**
+ * Log the tail of a file
+ */
+function logFileTail(filePath, lines = 10) {
+    if (!fs.existsSync(filePath))
+        return;
+    try {
+        const content = fs.readFileSync(filePath, 'utf8');
+        const allLines = content.split('\n');
+        const tail = allLines.slice(-lines).filter(l => l.trim());
+        if (tail.length > 0) {
+            logger.error(` Last ${tail.length} lines of log:`);
+            for (const line of tail) {
+                logger.error(` ${line}`);
+            }
+        }
+    }
+    catch (e) {
+        // Ignore log reading errors
+    }
+}
 /**
  * Spawn a lane process
  */
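For orientation, the failure path added later in this diff calls the new helper as `logFileTail(info.logPath, 15)` when a lane exits non-zero. A minimal usage sketch, assuming the helper above is in scope; the lane record and log path below are illustrative only, not part of the package:

// Hypothetical lane-tracking record; the real one is built via running.set(...) in the orchestrate loop below.
const info = { logPath: '/tmp/cursorflow-run/lane-a/agent.log' };
if (info.logPath) {
    logFileTail(info.logPath, 15); // surfaces up to the last 15 non-empty log lines via logger.error
}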
@@ -524,417 +545,482 @@ async function orchestrate(tasksDir, options = {}) {
         const monitorInterval = setInterval(() => {
             printLaneStatus(lanes, laneRunDirs);
         }, options.pollInterval || 60000);
- [6 removed lines (old 527-532) not shown in this view]
+    // Handle process interruption
+    const sigIntHandler = () => {
+        logger.warn('\n⚠️ Orchestration interrupted! Stopping all lanes...');
+        for (const [name, info] of running.entries()) {
+            logger.info(`Stopping lane: ${name}`);
+            try {
+                info.child.kill('SIGTERM');
             }
- [13 removed lines (old 534-546) not shown in this view]
+            catch {
+                // Ignore kill errors
+            }
+        }
+        printLaneStatus(lanes, laneRunDirs);
+        process.exit(130);
+    };
+    process.on('SIGINT', sigIntHandler);
+    process.on('SIGTERM', sigIntHandler);
+    let lastStallCheck = Date.now();
+    try {
+        while (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length || (blockedLanes.size > 0 && running.size === 0)) {
+            // 1. Identify lanes ready to start
+            const readyToStart = lanes.filter(lane => {
+                // Not already running or completed or failed or blocked
+                if (running.has(lane.name) || completedLanes.has(lane.name) || failedLanes.has(lane.name) || blockedLanes.has(lane.name)) {
                     return false;
                 }
- [14 removed lines (old 549-562) not shown in this view]
-                let lastOutput = '';
-                const spawnResult = spawnLane({
-                    laneName: lane.name,
-                    tasksFile: lane.path,
-                    laneRunDir: laneRunDirs[lane.name],
-                    executor: options.executor || 'cursor-agent',
-                    startIndex: lane.startIndex,
-                    pipelineBranch: `${pipelineBranch}/${lane.name}`,
-                    worktreeDir: laneWorktreeDirs[lane.name],
-                    enhancedLogConfig: options.enhancedLogging,
-                    noGit: options.noGit,
-                    onActivity: () => {
-                        const info = running.get(lane.name);
-                        if (info) {
-                            const now = Date.now();
-                            info.lastActivity = now;
-                            // Also reset progress tracking when there's activity (THNK/TOOL events)
-                            // This prevents STALL_NO_PROGRESS from firing when agent is actively working
-                            info.lastStateUpdate = now;
-                            info.stallPhase = 0; // Reset stall phase since agent is responding
+                // Check dependencies
+                for (const dep of lane.dependsOn) {
+                    if (failedLanes.has(dep)) {
+                        logger.error(`Lane ${lane.name} will not start because dependency ${dep} failed`);
+                        failedLanes.add(lane.name);
+                        exitCodes[lane.name] = 1;
+                        return false;
+                    }
+                    if (blockedLanes.has(dep)) {
+                        // If a dependency is blocked, wait
+                        return false;
+                    }
+                    if (!completedLanes.has(dep)) {
+                        return false;
                     }
                 }
+                return true;
             });
-            //
- [9 removed lines (old 587-595) not shown in this view]
+            // 2. Spawn ready lanes up to maxConcurrent
+            for (const lane of readyToStart) {
+                if (running.size >= maxConcurrent)
+                    break;
+                const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[lane.name], 'state.json');
+                // Validate and repair state before starting
+                const validation = (0, state_1.validateLaneState)(laneStatePath, { autoRepair: true });
+                if (!validation.valid && !validation.repaired) {
+                    logger.warn(`[${lane.name}] State validation issues: ${validation.issues.join(', ')}`);
+                }
+                logger.info(`Lane started: ${lane.name}${lane.startIndex ? ` (resuming from ${lane.startIndex})` : ''}`);
+                const now = Date.now();
+                // Pre-register lane in running map so onActivity can find it immediately
+                running.set(lane.name, {
+                    child: {}, // Placeholder, will be replaced below
+                    logManager: undefined,
+                    logPath: '',
+                    lastActivity: now,
+                    lastStateUpdate: now,
+                    stallPhase: 0,
+                    taskStartTime: now,
+                    lastOutput: '',
+                    statePath: laneStatePath,
+                    bytesReceived: 0,
+                    lastBytesCheck: 0,
+                    continueSignalsSent: 0,
                 });
- [18 removed lines (old 597-614) not shown in this view]
-                events_1.events.emit('lane.started', {
-                    laneName: lane.name,
-                    pid: spawnResult.child.pid,
-                    logPath: spawnResult.logPath,
-                });
-            }
-            // 3. Wait for any running lane to finish OR check for stalls
-            if (running.size > 0) {
-                // Polling timeout for stall detection
-                let pollTimeout;
-                const pollPromise = new Promise(resolve => {
-                    pollTimeout = setTimeout(() => resolve({ name: '__poll__', code: 0 }), 10000);
-                });
-                const promises = Array.from(running.entries()).map(async ([name, { child }]) => {
-                    const code = await waitChild(child);
-                    return { name, code };
-                });
-                const result = await Promise.race([...promises, pollPromise]);
-                if (pollTimeout)
-                    clearTimeout(pollTimeout);
-                if (result.name === '__poll__') {
-                    // Periodic stall check with multi-layer detection and escalating recovery
-                    for (const [laneName, info] of running.entries()) {
-                        const now = Date.now();
-                        const idleTime = now - info.lastActivity;
-                        const lane = lanes.find(l => l.name === laneName);
-                        // Check state file for progress updates
-                        let progressTime = 0;
-                        try {
-                            const stateStat = fs.statSync(info.statePath);
-                            const stateUpdateTime = stateStat.mtimeMs;
-                            if (stateUpdateTime > info.lastStateUpdate) {
-                                info.lastStateUpdate = stateUpdateTime;
+                let lastOutput = '';
+                const spawnResult = spawnLane({
+                    laneName: lane.name,
+                    tasksFile: lane.path,
+                    laneRunDir: laneRunDirs[lane.name],
+                    executor: options.executor || 'cursor-agent',
+                    startIndex: lane.startIndex,
+                    pipelineBranch: `${pipelineBranch}/${lane.name}`,
+                    worktreeDir: laneWorktreeDirs[lane.name],
+                    enhancedLogConfig: options.enhancedLogging,
+                    noGit: options.noGit,
+                    onActivity: () => {
+                        const info = running.get(lane.name);
+                        if (info) {
+                            const actNow = Date.now();
+                            info.lastActivity = actNow;
+                            info.lastStateUpdate = actNow;
+                            info.stallPhase = 0;
                         }
-                            progressTime = now - info.lastStateUpdate;
-                        }
-                        catch {
-                            // State file might not exist yet
                     }
- [20 removed lines (old 654-673) not shown in this view]
-                                try {
-                                    fs.writeFileSync(interventionPath, 'continue');
-                                    info.stallPhase = 1;
-                                    info.lastActivity = now;
-                                    info.continueSignalsSent++;
-                                    logger.info(`[${laneName}] Sent continue signal (#${info.continueSignalsSent})`);
-                                    events_1.events.emit('recovery.continue_signal', {
-                                        laneName,
-                                        idleSeconds: Math.round(idleTime / 1000),
-                                        signalCount: info.continueSignalsSent,
-                                    });
+                });
+                // Update with actual spawn result
+                const existingInfo = running.get(lane.name);
+                Object.assign(existingInfo, spawnResult);
+                // Track last output and bytes received for long operation and stall detection
+                if (spawnResult.child.stdout) {
+                    spawnResult.child.stdout.on('data', (data) => {
+                        const info = running.get(lane.name);
+                        if (info) {
+                            const output = data.toString();
+                            const lines = output.split('\n').filter(l => l.trim());
+                            // Filter out heartbeats from activity tracking to avoid resetting stall detection
+                            const realLines = lines.filter(line => !(line.includes('Heartbeat') && line.includes('bytes received')));
+                            if (realLines.length > 0) {
+                                // Real activity detected
+                                const lastRealLine = realLines[realLines.length - 1];
+                                info.lastOutput = lastRealLine;
+                                info.bytesReceived += data.length;
+                                // Update auto-recovery manager with real activity
+                                autoRecoveryManager.recordActivity(lane.name, data.length, info.lastOutput);
                             }
- [2 removed lines (old 686-687) not shown in this view]
+                            else if (lines.length > 0) {
+                                // Only heartbeats received - update auto-recovery manager with 0 bytes to avoid resetting idle timer
+                                autoRecoveryManager.recordActivity(lane.name, 0, info.lastOutput);
                             }
                         }
- [13 removed lines (old 690-702) not shown in this view]
+                    });
+                }
+                // Register lane with auto-recovery manager
+                autoRecoveryManager.registerLane(lane.name);
+                // Update lane tracking
+                lane.taskStartTime = now;
+                events_1.events.emit('lane.started', {
+                    laneName: lane.name,
+                    pid: spawnResult.child.pid,
+                    logPath: spawnResult.logPath,
+                });
+            }
+            // 3. Wait for any running lane to finish OR check for stalls
+            if (running.size > 0) {
+                // Polling timeout for stall detection
+                let pollTimeout;
+                const pollPromise = new Promise(resolve => {
+                    pollTimeout = setTimeout(() => resolve({ name: '__poll__', code: 0 }), 10000);
+                });
+                const promises = Array.from(running.entries()).map(async ([name, { child }]) => {
+                    const code = await waitChild(child);
+                    return { name, code };
+                });
+                const result = await Promise.race([...promises, pollPromise]);
+                if (pollTimeout)
+                    clearTimeout(pollTimeout);
+                const now = Date.now();
+                if (result.name === '__poll__' || (now - lastStallCheck >= 10000)) {
+                    lastStallCheck = now;
+                    // Periodic stall check with multi-layer detection and escalating recovery
+                    for (const [laneName, info] of running.entries()) {
+                        const idleTime = now - info.lastActivity;
+                        const lane = lanes.find(l => l.name === laneName);
+                        if (process.env['DEBUG_STALL']) {
+                            logger.debug(`[${laneName}] Stall check: idle=${Math.round(idleTime / 1000)}s, bytesDelta=${info.bytesReceived - info.lastBytesCheck}, phase=${info.stallPhase}`);
                         }
- [7 removed lines (old 704-710) not shown in this view]
-                                if (checkpoint) {
-                                    logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
-                                }
-                                // Kill the process
-                                try {
-                                    info.child.kill('SIGKILL');
-                                }
-                                catch {
-                                    // Process might already be dead
+                        // Check state file for progress updates
+                        let progressTime = 0;
+                        try {
+                            const stateStat = fs.statSync(info.statePath);
+                            const stateUpdateTime = stateStat.mtimeMs;
+                            if (stateUpdateTime > info.lastStateUpdate) {
+                                info.lastStateUpdate = stateUpdateTime;
                             }
- [1 removed line (old 721) not shown in this view]
-                                events_1.events.emit('recovery.restart', {
-                                    laneName,
-                                    restartCount: lane.restartCount,
-                                    maxRestarts: stallConfig.maxRestarts,
-                                });
+                            progressTime = now - info.lastStateUpdate;
                         }
- [33 removed lines (old 728-760) not shown in this view]
+                        catch {
+                            // State file might not exist yet
+                        }
+                        // Calculate bytes received since last check
+                        const bytesDelta = info.bytesReceived - info.lastBytesCheck;
+                        info.lastBytesCheck = info.bytesReceived;
+                        // Use multi-layer stall analysis with enhanced context
+                        const analysis = (0, failure_policy_1.analyzeStall)({
+                            stallPhase: info.stallPhase,
+                            idleTimeMs: idleTime,
+                            progressTimeMs: progressTime,
+                            lastOutput: info.lastOutput,
+                            restartCount: lane.restartCount || 0,
+                            taskStartTimeMs: info.taskStartTime,
+                            bytesReceived: bytesDelta, // Bytes since last check
+                            continueSignalsSent: info.continueSignalsSent,
+                        }, stallConfig);
+                        // Only act if action is not NONE
+                        if (analysis.action !== failure_policy_1.RecoveryAction.NONE) {
+                            (0, failure_policy_1.logFailure)(laneName, analysis);
+                            info.logManager?.log('error', analysis.message);
+                            if (analysis.action === failure_policy_1.RecoveryAction.CONTINUE_SIGNAL) {
+                                const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+                                try {
+                                    fs.writeFileSync(interventionPath, 'continue');
+                                    info.stallPhase = 1;
+                                    info.lastActivity = now;
+                                    info.continueSignalsSent++;
+                                    logger.info(`[${laneName}] Sent continue signal (#${info.continueSignalsSent})`);
+                                    events_1.events.emit('recovery.continue_signal', {
+                                        laneName,
+                                        idleSeconds: Math.round(idleTime / 1000),
+                                        signalCount: info.continueSignalsSent,
+                                    });
+                                }
+                                catch (e) {
+                                    logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
+                                }
                             }
- [2 removed lines (old 762-763) not shown in this view]
+                            else if (analysis.action === failure_policy_1.RecoveryAction.STRONGER_PROMPT) {
+                                const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+                                const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck. Please continue with your current task immediately. If you're waiting for something, explain what you need and proceed with what you can do now. If you've completed the task, summarize your work and finish.`;
+                                try {
+                                    fs.writeFileSync(interventionPath, strongerPrompt);
+                                    info.stallPhase = 2;
+                                    info.lastActivity = now;
+                                    logger.warn(`[${laneName}] Sent stronger prompt after continue signal failed`);
+                                    events_1.events.emit('recovery.stronger_prompt', { laneName });
+                                }
+                                catch (e) {
+                                    logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
+                                }
                             }
- [4 removed lines (old 765-768) not shown in this view]
+                            else if (analysis.action === failure_policy_1.RecoveryAction.KILL_AND_RESTART ||
+                                analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE ||
+                                analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE_FROM_CHECKPOINT) {
+                                lane.restartCount = (lane.restartCount || 0) + 1;
+                                info.stallPhase = 3;
+                                // Try to get checkpoint info
+                                const checkpoint = (0, checkpoint_1.getLatestCheckpoint)(laneRunDirs[laneName]);
+                                if (checkpoint) {
+                                    logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
+                                }
+                                // Kill the process
                                 try {
- [1 removed line (old 770) not shown in this view]
-                                    const laneState = (0, state_1.loadState)(laneStatePath);
-                                    const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
-                                    const diagnosticInfo = {
-                                        timestamp: Date.now(),
-                                        agentHealthy: agentHealth.ok,
-                                        authHealthy: authHealth.ok,
-                                        systemHealthy: true,
-                                        suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
-                                        details: issues.join('\n') || 'No obvious issues found',
-                                    };
-                                    const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, diagnosticInfo);
-                                    (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+                                    info.child.kill('SIGKILL');
                                 }
-                                catch
- [1 removed line (old 785) not shown in this view]
+                                catch {
+                                    // Process might already be dead
                                 }
+                                logger.warn(`[${laneName}] Killing and restarting lane (restart #${lane.restartCount})`);
+                                events_1.events.emit('recovery.restart', {
+                                    laneName,
+                                    restartCount: lane.restartCount,
+                                    maxRestarts: stallConfig.maxRestarts,
+                                });
                             }
- [9 removed lines (old 788-796) not shown in this view]
+                            else if (analysis.action === failure_policy_1.RecoveryAction.RUN_DOCTOR) {
+                                info.stallPhase = 4;
+                                // Run diagnostics
+                                logger.error(`[${laneName}] Running diagnostics due to persistent failures...`);
+                                // Import health check dynamically to avoid circular dependency
+                                const { checkAgentHealth, checkAuthHealth } = await Promise.resolve().then(() => __importStar(require('../utils/health')));
+                                const [agentHealth, authHealth] = await Promise.all([
+                                    checkAgentHealth(),
+                                    checkAuthHealth(),
+                                ]);
+                                const issues = [];
+                                if (!agentHealth.ok)
+                                    issues.push(`Agent: ${agentHealth.message}`);
+                                if (!authHealth.ok)
+                                    issues.push(`Auth: ${authHealth.message}`);
+                                if (issues.length > 0) {
+                                    logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
+                                }
+                                else {
+                                    logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
+                                }
+                                // Save diagnostic to file
+                                const diagnosticPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'diagnostic.json');
+                                fs.writeFileSync(diagnosticPath, JSON.stringify({
+                                    timestamp: Date.now(),
+                                    agentHealthy: agentHealth.ok,
+                                    authHealthy: authHealth.ok,
+                                    issues,
+                                    analysis,
+                                }, null, 2));
+                                // Kill the process
+                                try {
+                                    info.child.kill('SIGKILL');
+                                }
+                                catch {
+                                    // Process might already be dead
+                                }
+                                logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
+                                // Save POF for failed recovery
+                                const recoveryState = autoRecoveryManager.getState(laneName);
+                                if (recoveryState) {
+                                    try {
+                                        const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
+                                        const laneState = (0, state_1.loadState)(laneStatePath);
+                                        const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
+                                        const diagnosticInfo = {
+                                            timestamp: Date.now(),
+                                            agentHealthy: agentHealth.ok,
+                                            authHealthy: authHealth.ok,
+                                            systemHealthy: true,
+                                            suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
+                                            details: issues.join('\n') || 'No obvious issues found',
+                                        };
+                                        const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, diagnosticInfo);
+                                        (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+                                    }
+                                    catch (pofError) {
+                                        logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+                                    }
+                                }
+                                events_1.events.emit('recovery.diagnosed', {
+                                    laneName,
+                                    diagnostic: { agentHealthy: agentHealth.ok, authHealthy: authHealth.ok, issues },
+                                });
                             }
- [2 removed lines (old 798-799) not shown in this view]
+                            else if (analysis.action === failure_policy_1.RecoveryAction.ABORT_LANE) {
+                                info.stallPhase = 5;
+                                try {
+                                    info.child.kill('SIGKILL');
+                                }
+                                catch {
+                                    // Process might already be dead
+                                }
+                                logger.error(`[${laneName}] Aborting lane due to repeated stalls`);
+                                // Save POF for failed recovery
+                                const recoveryState = autoRecoveryManager.getState(laneName);
+                                if (recoveryState) {
+                                    try {
+                                        const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
+                                        const laneState = (0, state_1.loadState)(laneStatePath);
+                                        const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
+                                        const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, recoveryState.diagnosticInfo);
+                                        (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+                                    }
+                                    catch (pofError) {
+                                        logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+                                    }
+                                }
                             }
- [4 removed lines (old 801-804) not shown in this view]
+                            else if (analysis.action === failure_policy_1.RecoveryAction.SEND_GIT_GUIDANCE) {
+                                // Send guidance message to agent for git issues
+                                const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+                                // Determine which guidance to send based on the failure type
+                                let guidance;
+                                if (analysis.type === failure_policy_1.FailureType.GIT_PUSH_REJECTED) {
+                                    guidance = (0, auto_recovery_1.getGitPushFailureGuidance)();
+                                }
+                                else if (analysis.type === failure_policy_1.FailureType.MERGE_CONFLICT) {
+                                    guidance = (0, auto_recovery_1.getMergeConflictGuidance)();
+                                }
+                                else {
+                                    guidance = (0, auto_recovery_1.getGitErrorGuidance)(analysis.message);
+                                }
                                 try {
- [3 removed lines (old 806-808) not shown in this view]
-                                    const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, recoveryState.diagnosticInfo);
-                                    (0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+                                    fs.writeFileSync(interventionPath, guidance);
+                                    info.lastActivity = now;
+                                    logger.info(`[${laneName}] Sent git issue guidance to agent`);
                                 }
-                                catch (
-                                    logger.
+                                catch (e) {
+                                    logger.error(`[${laneName}] Failed to send guidance: ${e.message}`);
                                 }
                             }
                         }
- [13 removed lines (old 817-829) not shown in this view]
+                    }
+                    continue;
+                }
+                else {
+                    const finished = result;
+                    const info = running.get(finished.name);
+                    running.delete(finished.name);
+                    exitCodes[finished.name] = finished.code;
+                    // Unregister from auto-recovery manager
+                    autoRecoveryManager.unregisterLane(finished.name);
+                    if (finished.code === 0) {
+                        completedLanes.add(finished.name);
+                        events_1.events.emit('lane.completed', {
+                            laneName: finished.name,
+                            exitCode: finished.code,
+                        });
+                    }
+                    else if (finished.code === 2) {
+                        // Blocked by dependency
+                        const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
+                        const state = (0, state_1.loadState)(statePath);
+                        if (state && state.dependencyRequest) {
+                            blockedLanes.set(finished.name, state.dependencyRequest);
+                            const lane = lanes.find(l => l.name === finished.name);
+                            if (lane) {
+                                lane.startIndex = Math.max(0, state.currentTaskIndex - 1); // Task was blocked, retry it
                             }
- [4 removed lines (old 831-834) not shown in this view]
+                            events_1.events.emit('lane.blocked', {
+                                laneName: finished.name,
+                                dependencyRequest: state.dependencyRequest,
+                            });
+                            logger.warn(`Lane ${finished.name} is blocked on dependency change request`);
+                        }
+                        else {
+                            failedLanes.add(finished.name);
+                            logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
+                        }
+                    }
+                    else {
+                        // Check if it was a restart request
+                        if (info.stallPhase === 2) {
+                            logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
+                            // Update startIndex from current state to resume from the same task
+                            const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
+                            const state = (0, state_1.loadState)(statePath);
+                            if (state) {
+                                const lane = lanes.find(l => l.name === finished.name);
+                                if (lane) {
+                                    lane.startIndex = state.currentTaskIndex;
+                                }
                             }
- [2 removed lines (old 836-837) not shown in this view]
+                            // Note: we don't add to failedLanes or completedLanes,
+                            // so it will be eligible to start again in the next iteration.
+                            continue;
+                        }
+                        failedLanes.add(finished.name);
+                        let errorMsg = 'Process exited with non-zero code';
+                        if (info.stallPhase === 3) {
+                            errorMsg = 'Stopped due to repeated stall';
+                        }
+                        else if (info.logManager) {
+                            const lastError = info.logManager.getLastError();
+                            if (lastError) {
+                                errorMsg = `Process failed: ${lastError}`;
                             }
                         }
+                        logger.error(`[${finished.name}] Lane failed with exit code ${finished.code}: ${errorMsg}`);
+                        // Log log tail for visibility
+                        if (info.logPath) {
+                            logFileTail(info.logPath, 15);
+                        }
+                        events_1.events.emit('lane.failed', {
+                            laneName: finished.name,
+                            exitCode: finished.code,
+                            error: errorMsg,
+                        });
                     }
- [1 removed line (old 841) not shown in this view]
-                    continue;
-                }
-                const finished = result;
-                const info = running.get(finished.name);
-                running.delete(finished.name);
-                exitCodes[finished.name] = finished.code;
-                // Unregister from auto-recovery manager
-                autoRecoveryManager.unregisterLane(finished.name);
-                if (finished.code === 0) {
-                    completedLanes.add(finished.name);
-                    events_1.events.emit('lane.completed', {
-                        laneName: finished.name,
-                        exitCode: finished.code,
-                    });
-                }
-                else if (finished.code === 2) {
-                    // Blocked by dependency
-                    const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
-                    const state = (0, state_1.loadState)(statePath);
-                    if (state && state.dependencyRequest) {
-                        blockedLanes.set(finished.name, state.dependencyRequest);
-                        const lane = lanes.find(l => l.name === finished.name);
-                        if (lane) {
-                            lane.startIndex = Math.max(0, state.currentTaskIndex - 1); // Task was blocked, retry it
-                        }
-                        events_1.events.emit('lane.blocked', {
-                            laneName: finished.name,
-                            dependencyRequest: state.dependencyRequest,
-                        });
-                        logger.warn(`Lane ${finished.name} is blocked on dependency change request`);
-                    }
-                    else {
-                        failedLanes.add(finished.name);
-                        logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
+                    printLaneStatus(lanes, laneRunDirs);
                 }
             }
             else {
-                //
- [9 removed lines (old 880-888) not shown in this view]
+                // Nothing running. Are we blocked?
+                // Wait a bit to avoid busy-spin while waiting for dependencies or new slots
+                if (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length) {
+                    await new Promise(resolve => setTimeout(resolve, 2000));
+                }
+                if (blockedLanes.size > 0 && autoResolve) {
+                    logger.section('🛠 Auto-Resolving Dependencies');
+                    try {
+                        await resolveAllDependencies(blockedLanes, lanes, laneRunDirs, pipelineBranch, runRoot);
+                        // Clear blocked status
+                        blockedLanes.clear();
+                        logger.success('Dependencies resolved and synced across all active lanes. Resuming...');
+                    }
+                    catch (error) {
+                        logger.error(`Auto-resolution failed: ${error.message}`);
+                        // Move blocked to failed
+                        for (const name of blockedLanes.keys()) {
+                            failedLanes.add(name);
                         }
+                        blockedLanes.clear();
                     }
-                        // Note: we don't add to failedLanes or completedLanes,
-                        // so it will be eligible to start again in the next iteration.
-                        continue;
-                    }
-                    failedLanes.add(finished.name);
-                    events_1.events.emit('lane.failed', {
-                        laneName: finished.name,
-                        exitCode: finished.code,
-                        error: info.stallPhase === 3 ? 'Stopped due to repeated stall' : 'Process exited with non-zero code',
-                    });
-                }
-                printLaneStatus(lanes, laneRunDirs);
-            }
-            else {
-                // Nothing running. Are we blocked?
-                if (blockedLanes.size > 0 && autoResolve) {
-                    logger.section('🛠 Auto-Resolving Dependencies');
-                    try {
-                        await resolveAllDependencies(blockedLanes, lanes, laneRunDirs, pipelineBranch, runRoot);
-                        // Clear blocked status
-                        blockedLanes.clear();
-                        logger.success('Dependencies resolved and synced across all active lanes. Resuming...');
                 }
- [3 removed lines (old 914-916) not shown in this view]
-                        for (const
-                            failedLanes.add(name);
+                else if (readyToStart.length === 0 && completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length) {
+                    const remaining = lanes.filter(l => !completedLanes.has(l.name) && !failedLanes.has(l.name) && !blockedLanes.has(l.name));
+                    logger.error(`Deadlock detected! Remaining lanes cannot start: ${remaining.map(l => l.name).join(', ')}`);
+                    for (const l of remaining) {
+                        failedLanes.add(l.name);
+                        exitCodes[l.name] = 1;
                     }
-                    blockedLanes.clear();
                 }
- [3 removed lines (old 922-924) not shown in this view]
-                    logger.error(`Deadlock detected! Remaining lanes cannot start: ${remaining.map(l => l.name).join(', ')}`);
-                    for (const l of remaining) {
-                        failedLanes.add(l.name);
-                        exitCodes[l.name] = 1;
+                else {
+                    // All finished
+                    break;
                 }
             }
-            else {
-                // All finished
-                break;
-            }
         }
     }
- [1 removed line (old 937) not shown in this view]
+    finally {
+        clearInterval(monitorInterval);
+        process.removeListener('SIGINT', sigIntHandler);
+        process.removeListener('SIGTERM', sigIntHandler);
+    }
     printLaneStatus(lanes, laneRunDirs);
     // Check for failures
     const failed = Object.entries(exitCodes).filter(([, code]) => code !== 0 && code !== 2);