@litmers/cursorflow-orchestrator 0.1.29 → 0.1.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +17 -0
- package/dist/cli/clean.js +122 -0
- package/dist/cli/clean.js.map +1 -1
- package/dist/core/auto-recovery.js +3 -1
- package/dist/core/auto-recovery.js.map +1 -1
- package/dist/core/failure-policy.js +7 -1
- package/dist/core/failure-policy.js.map +1 -1
- package/dist/core/orchestrator.js +459 -367
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/utils/config.js +3 -1
- package/dist/utils/config.js.map +1 -1
- package/dist/utils/enhanced-logger.d.ts +5 -1
- package/dist/utils/enhanced-logger.js +65 -20
- package/dist/utils/enhanced-logger.js.map +1 -1
- package/dist/utils/git.d.ts +25 -0
- package/dist/utils/git.js +97 -2
- package/dist/utils/git.js.map +1 -1
- package/package.json +11 -3
- package/scripts/local-security-gate.sh +37 -7
- package/scripts/release.sh +15 -0
- package/src/cli/clean.ts +146 -0
- package/src/core/auto-recovery.ts +3 -1
- package/src/core/failure-policy.ts +8 -1
- package/src/core/orchestrator.ts +183 -83
- package/src/utils/config.ts +3 -1
- package/src/utils/enhanced-logger.ts +61 -20
- package/src/utils/git.ts +115 -2
@@ -73,6 +73,27 @@ const DEFAULT_ORCHESTRATOR_STALL_CONFIG = {
 progressTimeoutMs: 10 * 60 * 1000, // 10 minutes (only triggers if no activity at all)
 maxRestarts: 2,
 };
+/**
+* Log the tail of a file
+*/
+function logFileTail(filePath, lines = 10) {
+if (!fs.existsSync(filePath))
+return;
+try {
+const content = fs.readFileSync(filePath, 'utf8');
+const allLines = content.split('\n');
+const tail = allLines.slice(-lines).filter(l => l.trim());
+if (tail.length > 0) {
+logger.error(`  Last ${tail.length} lines of log:`);
+for (const line of tail) {
+logger.error(`    ${line}`);
+}
+}
+}
+catch (e) {
+// Ignore log reading errors
+}
+}
 /**
 * Spawn a lane process
 */
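A quick way to see what the new logFileTail helper in this hunk produces is to point it at any text file. The following is a minimal standalone sketch, not part of the package; the console-backed logger stand-in and the sample file path are assumptions for illustration only.

    // Sketch: print the last lines of a log file, mirroring the helper added above.
    const fs = require('fs');
    const logger = { error: (msg) => console.error(msg) }; // stand-in for the package's logger

    function logFileTail(filePath, lines = 10) {
        if (!fs.existsSync(filePath))
            return;
        try {
            const tail = fs.readFileSync(filePath, 'utf8').split('\n').slice(-lines).filter(l => l.trim());
            if (tail.length > 0) {
                logger.error(`  Last ${tail.length} lines of log:`);
                for (const line of tail) {
                    logger.error(`    ${line}`);
                }
            }
        }
        catch (e) {
            // Ignore unreadable files, as the package version does
        }
    }

    logFileTail('./some-lane.log', 15); // hypothetical log path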
@@ -141,8 +162,11 @@ function spawnLane({ laneName, tasksFile, laneRunDir, executor, startIndex = 0,
 // or if it's NOT a noisy JSON line
 const hasTimestamp = /^\[\d{4}-\d{2}-\d{2}T|\^\[\d{2}:\d{2}:\d{2}\]/.test(trimmed);
 const isJson = trimmed.startsWith('{') || trimmed.includes('{"type"');
+// Filter out heartbeats - they should NOT reset the idle timer
+const isHeartbeat = trimmed.includes('Heartbeat') && trimmed.includes('bytes received');
 if (trimmed && !isJson) {
-
+// Only trigger activity for non-heartbeat lines
+if (onActivity && !isHeartbeat)
 onActivity();
 // If line alreedy has timestamp format, just add lane prefix
 if (hasTimestamp) {
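The effect of the new isHeartbeat check above is that periodic "Heartbeat ... bytes received" lines no longer count as agent activity, so they cannot keep resetting the idle timer that drives stall detection. A small standalone sketch of the same predicate; the sample lines are invented for illustration and are not package output.

    // Sketch: only non-empty, non-JSON, non-heartbeat lines should count as real activity.
    const isHeartbeat = (line) => line.includes('Heartbeat') && line.includes('bytes received');
    const isJson = (line) => line.startsWith('{') || line.includes('{"type"');

    const sample = [
        '[12:00:01] Heartbeat: 2048 bytes received',   // ignored: heartbeat
        '{"type":"tool_call","name":"read_file"}',      // ignored: noisy JSON
        'Running tests...',                             // counts as real activity
    ];
    const realActivity = sample.filter(l => l.trim() && !isJson(l) && !isHeartbeat(l));
    console.log(realActivity); // [ 'Running tests...' ]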
@@ -524,417 +548,485 @@ async function orchestrate(tasksDir, options = {}) {
 const monitorInterval = setInterval(() => {
 printLaneStatus(lanes, laneRunDirs);
 }, options.pollInterval || 60000);
-
-
-
-
-
-
+// Handle process interruption
+const sigIntHandler = () => {
+logger.warn('\n⚠️ Orchestration interrupted! Stopping all lanes...');
+for (const [name, info] of running.entries()) {
+logger.info(`Stopping lane: ${name}`);
+try {
+info.child.kill('SIGTERM');
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
+catch {
+// Ignore kill errors
+}
+}
+printLaneStatus(lanes, laneRunDirs);
+process.exit(130);
+};
+process.on('SIGINT', sigIntHandler);
+process.on('SIGTERM', sigIntHandler);
+let lastStallCheck = Date.now();
+try {
+while (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length || (blockedLanes.size > 0 && running.size === 0)) {
+// 1. Identify lanes ready to start
+const readyToStart = lanes.filter(lane => {
+// Not already running or completed or failed or blocked
+if (running.has(lane.name) || completedLanes.has(lane.name) || failedLanes.has(lane.name) || blockedLanes.has(lane.name)) {
 return false;
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-let lastOutput = '';
-const spawnResult = spawnLane({
-laneName: lane.name,
-tasksFile: lane.path,
-laneRunDir: laneRunDirs[lane.name],
-executor: options.executor || 'cursor-agent',
-startIndex: lane.startIndex,
-pipelineBranch: `${pipelineBranch}/${lane.name}`,
-worktreeDir: laneWorktreeDirs[lane.name],
-enhancedLogConfig: options.enhancedLogging,
-noGit: options.noGit,
-onActivity: () => {
-const info = running.get(lane.name);
-if (info) {
-const now = Date.now();
-info.lastActivity = now;
-// Also reset progress tracking when there's activity (THNK/TOOL events)
-// This prevents STALL_NO_PROGRESS from firing when agent is actively working
-info.lastStateUpdate = now;
-info.stallPhase = 0; // Reset stall phase since agent is responding
+// Check dependencies
+for (const dep of lane.dependsOn) {
+if (failedLanes.has(dep)) {
+logger.error(`Lane ${lane.name} will not start because dependency ${dep} failed`);
+failedLanes.add(lane.name);
+exitCodes[lane.name] = 1;
+return false;
+}
+if (blockedLanes.has(dep)) {
+// If a dependency is blocked, wait
+return false;
+}
+if (!completedLanes.has(dep)) {
+return false;
 }
 }
+return true;
 });
-//
-
-
-
-
-
-
-
-
-
+// 2. Spawn ready lanes up to maxConcurrent
+for (const lane of readyToStart) {
+if (running.size >= maxConcurrent)
+break;
+const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[lane.name], 'state.json');
+// Validate and repair state before starting
+const validation = (0, state_1.validateLaneState)(laneStatePath, { autoRepair: true });
+if (!validation.valid && !validation.repaired) {
+logger.warn(`[${lane.name}] State validation issues: ${validation.issues.join(', ')}`);
+}
+logger.info(`Lane started: ${lane.name}${lane.startIndex ? ` (resuming from ${lane.startIndex})` : ''}`);
+const now = Date.now();
+// Pre-register lane in running map so onActivity can find it immediately
+running.set(lane.name, {
+child: {}, // Placeholder, will be replaced below
+logManager: undefined,
+logPath: '',
+lastActivity: now,
+lastStateUpdate: now,
+stallPhase: 0,
+taskStartTime: now,
+lastOutput: '',
+statePath: laneStatePath,
+bytesReceived: 0,
+lastBytesCheck: 0,
+continueSignalsSent: 0,
 });
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-events_1.events.emit('lane.started', {
-laneName: lane.name,
-pid: spawnResult.child.pid,
-logPath: spawnResult.logPath,
-});
-}
-// 3. Wait for any running lane to finish OR check for stalls
-if (running.size > 0) {
-// Polling timeout for stall detection
-let pollTimeout;
-const pollPromise = new Promise(resolve => {
-pollTimeout = setTimeout(() => resolve({ name: '__poll__', code: 0 }), 10000);
-});
-const promises = Array.from(running.entries()).map(async ([name, { child }]) => {
-const code = await waitChild(child);
-return { name, code };
-});
-const result = await Promise.race([...promises, pollPromise]);
-if (pollTimeout)
-clearTimeout(pollTimeout);
-if (result.name === '__poll__') {
-// Periodic stall check with multi-layer detection and escalating recovery
-for (const [laneName, info] of running.entries()) {
-const now = Date.now();
-const idleTime = now - info.lastActivity;
-const lane = lanes.find(l => l.name === laneName);
-// Check state file for progress updates
-let progressTime = 0;
-try {
-const stateStat = fs.statSync(info.statePath);
-const stateUpdateTime = stateStat.mtimeMs;
-if (stateUpdateTime > info.lastStateUpdate) {
-info.lastStateUpdate = stateUpdateTime;
+let lastOutput = '';
+const spawnResult = spawnLane({
+laneName: lane.name,
+tasksFile: lane.path,
+laneRunDir: laneRunDirs[lane.name],
+executor: options.executor || 'cursor-agent',
+startIndex: lane.startIndex,
+pipelineBranch: `${pipelineBranch}/${lane.name}`,
+worktreeDir: laneWorktreeDirs[lane.name],
+enhancedLogConfig: options.enhancedLogging,
+noGit: options.noGit,
+onActivity: () => {
+const info = running.get(lane.name);
+if (info) {
+const actNow = Date.now();
+info.lastActivity = actNow;
+info.lastStateUpdate = actNow;
+info.stallPhase = 0;
 }
-progressTime = now - info.lastStateUpdate;
-}
-catch {
-// State file might not exist yet
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-info.lastActivity = now;
-info.continueSignalsSent++;
-logger.info(`[${laneName}] Sent continue signal (#${info.continueSignalsSent})`);
-events_1.events.emit('recovery.continue_signal', {
-laneName,
-idleSeconds: Math.round(idleTime / 1000),
-signalCount: info.continueSignalsSent,
-});
+});
+// Update with actual spawn result
+const existingInfo = running.get(lane.name);
+Object.assign(existingInfo, spawnResult);
+// Track last output and bytes received for long operation and stall detection
+if (spawnResult.child.stdout) {
+spawnResult.child.stdout.on('data', (data) => {
+const info = running.get(lane.name);
+if (info) {
+const output = data.toString();
+const lines = output.split('\n').filter(l => l.trim());
+// Filter out heartbeats from activity tracking to avoid resetting stall detection
+const realLines = lines.filter(line => !(line.includes('Heartbeat') && line.includes('bytes received')));
+if (realLines.length > 0) {
+// Real activity detected - update lastActivity to reset stall timer
+const actNow = Date.now();
+info.lastActivity = actNow;
+info.stallPhase = 0; // Reset stall phase on real activity
+const lastRealLine = realLines[realLines.length - 1];
+info.lastOutput = lastRealLine;
+info.bytesReceived += data.length;
+// Update auto-recovery manager with real activity
+autoRecoveryManager.recordActivity(lane.name, data.length, info.lastOutput);
 }
-
-
+else if (lines.length > 0) {
+// Only heartbeats received - do NOT update lastActivity (keep stall timer running)
+autoRecoveryManager.recordActivity(lane.name, 0, info.lastOutput);
 }
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
+});
+}
+// Register lane with auto-recovery manager
+autoRecoveryManager.registerLane(lane.name);
+// Update lane tracking
+lane.taskStartTime = now;
+events_1.events.emit('lane.started', {
+laneName: lane.name,
+pid: spawnResult.child.pid,
+logPath: spawnResult.logPath,
+});
+}
+// 3. Wait for any running lane to finish OR check for stalls
+if (running.size > 0) {
+// Polling timeout for stall detection
+let pollTimeout;
+const pollPromise = new Promise(resolve => {
+pollTimeout = setTimeout(() => resolve({ name: '__poll__', code: 0 }), 10000);
+});
+const promises = Array.from(running.entries()).map(async ([name, { child }]) => {
+const code = await waitChild(child);
+return { name, code };
+});
+const result = await Promise.race([...promises, pollPromise]);
+if (pollTimeout)
+clearTimeout(pollTimeout);
+const now = Date.now();
+if (result.name === '__poll__' || (now - lastStallCheck >= 10000)) {
+lastStallCheck = now;
+// Periodic stall check with multi-layer detection and escalating recovery
+for (const [laneName, info] of running.entries()) {
+const idleTime = now - info.lastActivity;
+const lane = lanes.find(l => l.name === laneName);
+if (process.env['DEBUG_STALL']) {
+logger.debug(`[${laneName}] Stall check: idle=${Math.round(idleTime / 1000)}s, bytesDelta=${info.bytesReceived - info.lastBytesCheck}, phase=${info.stallPhase}`);
 }
-
-
-
-
-
-
-
-if (checkpoint) {
-logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
-}
-// Kill the process
-try {
-info.child.kill('SIGKILL');
-}
-catch {
-// Process might already be dead
+// Check state file for progress updates
+let progressTime = 0;
+try {
+const stateStat = fs.statSync(info.statePath);
+const stateUpdateTime = stateStat.mtimeMs;
+if (stateUpdateTime > info.lastStateUpdate) {
+info.lastStateUpdate = stateUpdateTime;
 }
-
-events_1.events.emit('recovery.restart', {
-laneName,
-restartCount: lane.restartCount,
-maxRestarts: stallConfig.maxRestarts,
-});
+progressTime = now - info.lastStateUpdate;
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+catch {
+// State file might not exist yet
+}
+// Calculate bytes received since last check
+const bytesDelta = info.bytesReceived - info.lastBytesCheck;
+info.lastBytesCheck = info.bytesReceived;
+// Use multi-layer stall analysis with enhanced context
+const analysis = (0, failure_policy_1.analyzeStall)({
+stallPhase: info.stallPhase,
+idleTimeMs: idleTime,
+progressTimeMs: progressTime,
+lastOutput: info.lastOutput,
+restartCount: lane.restartCount || 0,
+taskStartTimeMs: info.taskStartTime,
+bytesReceived: bytesDelta, // Bytes since last check
+continueSignalsSent: info.continueSignalsSent,
+}, stallConfig);
+// Only act if action is not NONE
+if (analysis.action !== failure_policy_1.RecoveryAction.NONE) {
+(0, failure_policy_1.logFailure)(laneName, analysis);
+info.logManager?.log('error', analysis.message);
+if (analysis.action === failure_policy_1.RecoveryAction.CONTINUE_SIGNAL) {
+const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+try {
+fs.writeFileSync(interventionPath, 'continue');
+info.stallPhase = 1;
+info.lastActivity = now;
+info.continueSignalsSent++;
+logger.info(`[${laneName}] Sent continue signal (#${info.continueSignalsSent})`);
+events_1.events.emit('recovery.continue_signal', {
+laneName,
+idleSeconds: Math.round(idleTime / 1000),
+signalCount: info.continueSignalsSent,
+});
+}
+catch (e) {
+logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
+}
 }
-
-
+else if (analysis.action === failure_policy_1.RecoveryAction.STRONGER_PROMPT) {
+const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck. Please continue with your current task immediately. If you're waiting for something, explain what you need and proceed with what you can do now. If you've completed the task, summarize your work and finish.`;
+try {
+fs.writeFileSync(interventionPath, strongerPrompt);
+info.stallPhase = 2;
+info.lastActivity = now;
+logger.warn(`[${laneName}] Sent stronger prompt after continue signal failed`);
+events_1.events.emit('recovery.stronger_prompt', { laneName });
+}
+catch (e) {
+logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
+}
 }
-
-
-
-
+else if (analysis.action === failure_policy_1.RecoveryAction.KILL_AND_RESTART ||
+analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE ||
+analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE_FROM_CHECKPOINT) {
+lane.restartCount = (lane.restartCount || 0) + 1;
+info.stallPhase = 3;
+// Try to get checkpoint info
+const checkpoint = (0, checkpoint_1.getLatestCheckpoint)(laneRunDirs[laneName]);
+if (checkpoint) {
+logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
+}
+// Kill the process
 try {
-
-const laneState = (0, state_1.loadState)(laneStatePath);
-const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
-const diagnosticInfo = {
-timestamp: Date.now(),
-agentHealthy: agentHealth.ok,
-authHealthy: authHealth.ok,
-systemHealthy: true,
-suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
-details: issues.join('\n') || 'No obvious issues found',
-};
-const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, diagnosticInfo);
-(0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+info.child.kill('SIGKILL');
 }
-catch
-
+catch {
+// Process might already be dead
 }
+logger.warn(`[${laneName}] Killing and restarting lane (restart #${lane.restartCount})`);
+events_1.events.emit('recovery.restart', {
+laneName,
+restartCount: lane.restartCount,
+maxRestarts: stallConfig.maxRestarts,
+});
 }
-
-
-
-
-
-
-
-
-
+else if (analysis.action === failure_policy_1.RecoveryAction.RUN_DOCTOR) {
+info.stallPhase = 4;
+// Run diagnostics
+logger.error(`[${laneName}] Running diagnostics due to persistent failures...`);
+// Import health check dynamically to avoid circular dependency
+const { checkAgentHealth, checkAuthHealth } = await Promise.resolve().then(() => __importStar(require('../utils/health')));
+const [agentHealth, authHealth] = await Promise.all([
+checkAgentHealth(),
+checkAuthHealth(),
+]);
+const issues = [];
+if (!agentHealth.ok)
+issues.push(`Agent: ${agentHealth.message}`);
+if (!authHealth.ok)
+issues.push(`Auth: ${authHealth.message}`);
+if (issues.length > 0) {
+logger.error(`[${laneName}] Diagnostic issues found:\n  ${issues.join('\n  ')}`);
+}
+else {
+logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
+}
+// Save diagnostic to file
+const diagnosticPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'diagnostic.json');
+fs.writeFileSync(diagnosticPath, JSON.stringify({
+timestamp: Date.now(),
+agentHealthy: agentHealth.ok,
+authHealthy: authHealth.ok,
+issues,
+analysis,
+}, null, 2));
+// Kill the process
+try {
+info.child.kill('SIGKILL');
+}
+catch {
+// Process might already be dead
+}
+logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
+// Save POF for failed recovery
+const recoveryState = autoRecoveryManager.getState(laneName);
+if (recoveryState) {
+try {
+const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
+const laneState = (0, state_1.loadState)(laneStatePath);
+const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
+const diagnosticInfo = {
+timestamp: Date.now(),
+agentHealthy: agentHealth.ok,
+authHealthy: authHealth.ok,
+systemHealthy: true,
+suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
+details: issues.join('\n') || 'No obvious issues found',
+};
+const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, diagnosticInfo);
+(0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+}
+catch (pofError) {
+logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+}
+}
+events_1.events.emit('recovery.diagnosed', {
+laneName,
+diagnostic: { agentHealthy: agentHealth.ok, authHealthy: authHealth.ok, issues },
+});
 }
-
-
+else if (analysis.action === failure_policy_1.RecoveryAction.ABORT_LANE) {
+info.stallPhase = 5;
+try {
+info.child.kill('SIGKILL');
+}
+catch {
+// Process might already be dead
+}
+logger.error(`[${laneName}] Aborting lane due to repeated stalls`);
+// Save POF for failed recovery
+const recoveryState = autoRecoveryManager.getState(laneName);
+if (recoveryState) {
+try {
+const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
+const laneState = (0, state_1.loadState)(laneStatePath);
+const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
+const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, recoveryState.diagnosticInfo);
+(0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+}
+catch (pofError) {
+logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
+}
+}
 }
-
-
-
-
+else if (analysis.action === failure_policy_1.RecoveryAction.SEND_GIT_GUIDANCE) {
+// Send guidance message to agent for git issues
+const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
+// Determine which guidance to send based on the failure type
+let guidance;
+if (analysis.type === failure_policy_1.FailureType.GIT_PUSH_REJECTED) {
+guidance = (0, auto_recovery_1.getGitPushFailureGuidance)();
+}
+else if (analysis.type === failure_policy_1.FailureType.MERGE_CONFLICT) {
+guidance = (0, auto_recovery_1.getMergeConflictGuidance)();
+}
+else {
+guidance = (0, auto_recovery_1.getGitErrorGuidance)(analysis.message);
+}
 try {
-
-
-
-const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, recoveryState.diagnosticInfo);
-(0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
+fs.writeFileSync(interventionPath, guidance);
+info.lastActivity = now;
+logger.info(`[${laneName}] Sent git issue guidance to agent`);
 }
-catch (
-logger.
+catch (e) {
+logger.error(`[${laneName}] Failed to send guidance: ${e.message}`);
 }
 }
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
+}
+continue;
+}
+else {
+const finished = result;
+const info = running.get(finished.name);
+running.delete(finished.name);
+exitCodes[finished.name] = finished.code;
+// Unregister from auto-recovery manager
+autoRecoveryManager.unregisterLane(finished.name);
+if (finished.code === 0) {
+completedLanes.add(finished.name);
+events_1.events.emit('lane.completed', {
+laneName: finished.name,
+exitCode: finished.code,
+});
+}
+else if (finished.code === 2) {
+// Blocked by dependency
+const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
+const state = (0, state_1.loadState)(statePath);
+if (state && state.dependencyRequest) {
+blockedLanes.set(finished.name, state.dependencyRequest);
+const lane = lanes.find(l => l.name === finished.name);
+if (lane) {
+lane.startIndex = Math.max(0, state.currentTaskIndex - 1); // Task was blocked, retry it
 }
-
-
-
-
+events_1.events.emit('lane.blocked', {
+laneName: finished.name,
+dependencyRequest: state.dependencyRequest,
+});
+logger.warn(`Lane ${finished.name} is blocked on dependency change request`);
+}
+else {
+failedLanes.add(finished.name);
+logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
+}
+}
+else {
+// Check if it was a restart request
+if (info.stallPhase === 2) {
+logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
+// Update startIndex from current state to resume from the same task
+const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
+const state = (0, state_1.loadState)(statePath);
+if (state) {
+const lane = lanes.find(l => l.name === finished.name);
+if (lane) {
+lane.startIndex = state.currentTaskIndex;
+}
 }
-
-
+// Note: we don't add to failedLanes or completedLanes,
+// so it will be eligible to start again in the next iteration.
+continue;
+}
+failedLanes.add(finished.name);
+let errorMsg = 'Process exited with non-zero code';
+if (info.stallPhase === 3) {
+errorMsg = 'Stopped due to repeated stall';
+}
+else if (info.logManager) {
+const lastError = info.logManager.getLastError();
+if (lastError) {
+errorMsg = `Process failed: ${lastError}`;
 }
 }
+logger.error(`[${finished.name}] Lane failed with exit code ${finished.code}: ${errorMsg}`);
+// Log log tail for visibility
+if (info.logPath) {
+logFileTail(info.logPath, 15);
+}
+events_1.events.emit('lane.failed', {
+laneName: finished.name,
+exitCode: finished.code,
+error: errorMsg,
+});
 }
-
-continue;
-}
-const finished = result;
-const info = running.get(finished.name);
-running.delete(finished.name);
-exitCodes[finished.name] = finished.code;
-// Unregister from auto-recovery manager
-autoRecoveryManager.unregisterLane(finished.name);
-if (finished.code === 0) {
-completedLanes.add(finished.name);
-events_1.events.emit('lane.completed', {
-laneName: finished.name,
-exitCode: finished.code,
-});
-}
-else if (finished.code === 2) {
-// Blocked by dependency
-const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
-const state = (0, state_1.loadState)(statePath);
-if (state && state.dependencyRequest) {
-blockedLanes.set(finished.name, state.dependencyRequest);
-const lane = lanes.find(l => l.name === finished.name);
-if (lane) {
-lane.startIndex = Math.max(0, state.currentTaskIndex - 1); // Task was blocked, retry it
-}
-events_1.events.emit('lane.blocked', {
-laneName: finished.name,
-dependencyRequest: state.dependencyRequest,
-});
-logger.warn(`Lane ${finished.name} is blocked on dependency change request`);
-}
-else {
-failedLanes.add(finished.name);
-logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
+printLaneStatus(lanes, laneRunDirs);
 }
 }
 else {
-//
-
-
-
-
-
-
-
-
-
+// Nothing running. Are we blocked?
+// Wait a bit to avoid busy-spin while waiting for dependencies or new slots
+if (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length) {
+await new Promise(resolve => setTimeout(resolve, 2000));
+}
+if (blockedLanes.size > 0 && autoResolve) {
+logger.section('🛠 Auto-Resolving Dependencies');
+try {
+await resolveAllDependencies(blockedLanes, lanes, laneRunDirs, pipelineBranch, runRoot);
+// Clear blocked status
+blockedLanes.clear();
+logger.success('Dependencies resolved and synced across all active lanes. Resuming...');
+}
+catch (error) {
+logger.error(`Auto-resolution failed: ${error.message}`);
+// Move blocked to failed
+for (const name of blockedLanes.keys()) {
+failedLanes.add(name);
 }
+blockedLanes.clear();
 }
-// Note: we don't add to failedLanes or completedLanes,
-// so it will be eligible to start again in the next iteration.
-continue;
-}
-failedLanes.add(finished.name);
-events_1.events.emit('lane.failed', {
-laneName: finished.name,
-exitCode: finished.code,
-error: info.stallPhase === 3 ? 'Stopped due to repeated stall' : 'Process exited with non-zero code',
-});
-}
-printLaneStatus(lanes, laneRunDirs);
-}
-else {
-// Nothing running. Are we blocked?
-if (blockedLanes.size > 0 && autoResolve) {
-logger.section('🛠 Auto-Resolving Dependencies');
-try {
-await resolveAllDependencies(blockedLanes, lanes, laneRunDirs, pipelineBranch, runRoot);
-// Clear blocked status
-blockedLanes.clear();
-logger.success('Dependencies resolved and synced across all active lanes. Resuming...');
 }
-
-
-
-for (const
-failedLanes.add(name);
+else if (readyToStart.length === 0 && completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length) {
+const remaining = lanes.filter(l => !completedLanes.has(l.name) && !failedLanes.has(l.name) && !blockedLanes.has(l.name));
+logger.error(`Deadlock detected! Remaining lanes cannot start: ${remaining.map(l => l.name).join(', ')}`);
+for (const l of remaining) {
+failedLanes.add(l.name);
+exitCodes[l.name] = 1;
 }
-blockedLanes.clear();
 }
-
-
-
-logger.error(`Deadlock detected! Remaining lanes cannot start: ${remaining.map(l => l.name).join(', ')}`);
-for (const l of remaining) {
-failedLanes.add(l.name);
-exitCodes[l.name] = 1;
+else {
+// All finished
+break;
 }
 }
-else {
-// All finished
-break;
-}
 }
 }
-
+finally {
+clearInterval(monitorInterval);
+process.removeListener('SIGINT', sigIntHandler);
+process.removeListener('SIGTERM', sigIntHandler);
+}
 printLaneStatus(lanes, laneRunDirs);
 // Check for failures
 const failed = Object.entries(exitCodes).filter(([, code]) => code !== 0 && code !== 2);
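The orchestrator hunk above escalates through recovery actions and records each step in info.stallPhase before the next stall check runs. The following is a compact reading aid summarizing that mapping as it appears in the new code; it is a standalone sketch, not an export of the package.

    // Sketch: stallPhase values set by the recovery branches in the hunk above.
    const STALL_PHASE_BY_ACTION = {
        CONTINUE_SIGNAL: 1,   // write 'continue' to intervention.txt
        STRONGER_PROMPT: 2,   // write a [SYSTEM INTERVENTION] prompt to intervention.txt
        KILL_AND_RESTART: 3,  // also RESTART_LANE / RESTART_LANE_FROM_CHECKPOINT
        RUN_DOCTOR: 4,        // run agent/auth health checks, save diagnostic.json, abort lane
        ABORT_LANE: 5,        // kill the lane process and save a POF entry
    };
    console.log(STALL_PHASE_BY_ACTION.RUN_DOCTOR); // 4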