@litmers/cursorflow-orchestrator 0.1.31 → 0.1.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +144 -52
- package/commands/cursorflow-add.md +159 -0
- package/commands/cursorflow-monitor.md +23 -2
- package/commands/cursorflow-new.md +87 -0
- package/dist/cli/add.d.ts +7 -0
- package/dist/cli/add.js +377 -0
- package/dist/cli/add.js.map +1 -0
- package/dist/cli/clean.js +1 -0
- package/dist/cli/clean.js.map +1 -1
- package/dist/cli/config.d.ts +7 -0
- package/dist/cli/config.js +181 -0
- package/dist/cli/config.js.map +1 -0
- package/dist/cli/index.js +34 -30
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/logs.js +7 -33
- package/dist/cli/logs.js.map +1 -1
- package/dist/cli/monitor.js +51 -62
- package/dist/cli/monitor.js.map +1 -1
- package/dist/cli/new.d.ts +7 -0
- package/dist/cli/new.js +232 -0
- package/dist/cli/new.js.map +1 -0
- package/dist/cli/prepare.js +95 -193
- package/dist/cli/prepare.js.map +1 -1
- package/dist/cli/resume.js +11 -47
- package/dist/cli/resume.js.map +1 -1
- package/dist/cli/run.js +27 -22
- package/dist/cli/run.js.map +1 -1
- package/dist/cli/tasks.js +1 -2
- package/dist/cli/tasks.js.map +1 -1
- package/dist/core/failure-policy.d.ts +9 -0
- package/dist/core/failure-policy.js +9 -0
- package/dist/core/failure-policy.js.map +1 -1
- package/dist/core/orchestrator.d.ts +20 -6
- package/dist/core/orchestrator.js +213 -333
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/runner/agent.d.ts +27 -0
- package/dist/core/runner/agent.js +294 -0
- package/dist/core/runner/agent.js.map +1 -0
- package/dist/core/runner/index.d.ts +5 -0
- package/dist/core/runner/index.js +22 -0
- package/dist/core/runner/index.js.map +1 -0
- package/dist/core/runner/pipeline.d.ts +9 -0
- package/dist/core/runner/pipeline.js +539 -0
- package/dist/core/runner/pipeline.js.map +1 -0
- package/dist/core/runner/prompt.d.ts +25 -0
- package/dist/core/runner/prompt.js +175 -0
- package/dist/core/runner/prompt.js.map +1 -0
- package/dist/core/runner/task.d.ts +26 -0
- package/dist/core/runner/task.js +283 -0
- package/dist/core/runner/task.js.map +1 -0
- package/dist/core/runner/utils.d.ts +37 -0
- package/dist/core/runner/utils.js +161 -0
- package/dist/core/runner/utils.js.map +1 -0
- package/dist/core/runner.d.ts +2 -96
- package/dist/core/runner.js +11 -1136
- package/dist/core/runner.js.map +1 -1
- package/dist/core/stall-detection.d.ts +326 -0
- package/dist/core/stall-detection.js +781 -0
- package/dist/core/stall-detection.js.map +1 -0
- package/dist/types/config.d.ts +6 -6
- package/dist/types/flow.d.ts +84 -0
- package/dist/types/flow.js +10 -0
- package/dist/types/flow.js.map +1 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/index.js +3 -3
- package/dist/types/index.js.map +1 -1
- package/dist/types/lane.d.ts +0 -2
- package/dist/types/logging.d.ts +5 -1
- package/dist/types/task.d.ts +7 -11
- package/dist/utils/config.js +7 -15
- package/dist/utils/config.js.map +1 -1
- package/dist/utils/dependency.d.ts +36 -1
- package/dist/utils/dependency.js +256 -1
- package/dist/utils/dependency.js.map +1 -1
- package/dist/utils/enhanced-logger.d.ts +45 -82
- package/dist/utils/enhanced-logger.js +238 -844
- package/dist/utils/enhanced-logger.js.map +1 -1
- package/dist/utils/git.d.ts +29 -0
- package/dist/utils/git.js +115 -5
- package/dist/utils/git.js.map +1 -1
- package/dist/utils/state.js +0 -2
- package/dist/utils/state.js.map +1 -1
- package/dist/utils/task-service.d.ts +2 -2
- package/dist/utils/task-service.js +40 -31
- package/dist/utils/task-service.js.map +1 -1
- package/package.json +4 -3
- package/src/cli/add.ts +397 -0
- package/src/cli/clean.ts +1 -0
- package/src/cli/config.ts +177 -0
- package/src/cli/index.ts +36 -32
- package/src/cli/logs.ts +7 -31
- package/src/cli/monitor.ts +55 -71
- package/src/cli/new.ts +235 -0
- package/src/cli/prepare.ts +98 -205
- package/src/cli/resume.ts +13 -56
- package/src/cli/run.ts +311 -306
- package/src/cli/tasks.ts +1 -2
- package/src/core/failure-policy.ts +9 -0
- package/src/core/orchestrator.ts +277 -378
- package/src/core/runner/agent.ts +314 -0
- package/src/core/runner/index.ts +6 -0
- package/src/core/runner/pipeline.ts +567 -0
- package/src/core/runner/prompt.ts +174 -0
- package/src/core/runner/task.ts +320 -0
- package/src/core/runner/utils.ts +142 -0
- package/src/core/runner.ts +8 -1347
- package/src/core/stall-detection.ts +936 -0
- package/src/types/config.ts +6 -6
- package/src/types/flow.ts +91 -0
- package/src/types/index.ts +15 -3
- package/src/types/lane.ts +0 -2
- package/src/types/logging.ts +5 -1
- package/src/types/task.ts +7 -11
- package/src/utils/config.ts +8 -16
- package/src/utils/dependency.ts +311 -2
- package/src/utils/enhanced-logger.ts +263 -927
- package/src/utils/git.ts +145 -5
- package/src/utils/state.ts +0 -2
- package/src/utils/task-service.ts +48 -40
- package/commands/cursorflow-review.md +0 -56
- package/commands/cursorflow-runs.md +0 -59
- package/dist/cli/runs.d.ts +0 -5
- package/dist/cli/runs.js +0 -214
- package/dist/cli/runs.js.map +0 -1
- package/dist/core/reviewer.d.ts +0 -66
- package/dist/core/reviewer.js +0 -265
- package/dist/core/reviewer.js.map +0 -1
- package/src/cli/runs.ts +0 -212
- package/src/core/reviewer.ts +0 -285
package/src/core/orchestrator.ts
CHANGED
|
@@ -25,28 +25,35 @@ import {
|
|
|
25
25
|
EnhancedLogManager,
|
|
26
26
|
createLogManager,
|
|
27
27
|
DEFAULT_LOG_CONFIG,
|
|
28
|
-
ParsedMessage
|
|
28
|
+
ParsedMessage,
|
|
29
|
+
stripAnsi
|
|
29
30
|
} from '../utils/enhanced-logger';
|
|
30
31
|
import { formatMessageForConsole } from '../utils/log-formatter';
|
|
31
|
-
import {
|
|
32
|
+
import { FailureType, analyzeFailure as analyzeFailureFromPolicy } from './failure-policy';
|
|
32
33
|
import {
|
|
33
|
-
getAutoRecoveryManager,
|
|
34
|
-
DEFAULT_AUTO_RECOVERY_CONFIG,
|
|
35
|
-
AutoRecoveryConfig,
|
|
36
34
|
savePOF,
|
|
37
35
|
createPOFFromRecoveryState,
|
|
38
36
|
getGitPushFailureGuidance,
|
|
39
37
|
getMergeConflictGuidance,
|
|
40
38
|
getGitErrorGuidance,
|
|
39
|
+
LaneRecoveryState,
|
|
41
40
|
} from './auto-recovery';
|
|
41
|
+
import {
|
|
42
|
+
StallDetectionService,
|
|
43
|
+
getStallService,
|
|
44
|
+
StallDetectionConfig,
|
|
45
|
+
DEFAULT_STALL_CONFIG,
|
|
46
|
+
RecoveryAction,
|
|
47
|
+
StallPhase,
|
|
48
|
+
StallAnalysis,
|
|
49
|
+
} from './stall-detection';
|
|
42
50
|
import { detectCyclicDependencies, validateDependencies, printDependencyGraph, DependencyInfo } from '../utils/dependency';
|
|
43
51
|
import { preflightCheck, printPreflightReport, autoRepair } from '../utils/health';
|
|
44
52
|
import { getLatestCheckpoint } from '../utils/checkpoint';
|
|
45
53
|
import { cleanStaleLocks, getLockDir } from '../utils/lock';
|
|
46
54
|
|
|
47
55
|
/** Default stall detection configuration - 2 minute idle timeout for recovery */
|
|
48
|
-
const DEFAULT_ORCHESTRATOR_STALL_CONFIG: StallDetectionConfig = {
|
|
49
|
-
...DEFAULT_STALL_CONFIG,
|
|
56
|
+
const DEFAULT_ORCHESTRATOR_STALL_CONFIG: Partial<StallDetectionConfig> = {
|
|
50
57
|
idleTimeoutMs: 2 * 60 * 1000, // 2 minutes (idle detection for continue signal)
|
|
51
58
|
progressTimeoutMs: 10 * 60 * 1000, // 10 minutes (only triggers if no activity at all)
|
|
52
59
|
maxRestarts: 2,
|
|
@@ -55,7 +62,6 @@ const DEFAULT_ORCHESTRATOR_STALL_CONFIG: StallDetectionConfig = {
|
|
|
55
62
|
export interface LaneInfo {
|
|
56
63
|
name: string;
|
|
57
64
|
path: string;
|
|
58
|
-
dependsOn: string[];
|
|
59
65
|
startIndex?: number; // Current task index to resume from
|
|
60
66
|
restartCount?: number; // Number of times restarted due to stall
|
|
61
67
|
lastStateUpdate?: number; // Timestamp of last state file update
|
|
@@ -66,24 +72,22 @@ export interface SpawnLaneResult {
|
|
|
66
72
|
child: ChildProcess;
|
|
67
73
|
logPath: string;
|
|
68
74
|
logManager?: EnhancedLogManager;
|
|
75
|
+
info: RunningLaneInfo;
|
|
69
76
|
}
|
|
70
77
|
|
|
71
78
|
/**
|
|
72
79
|
* Lane execution tracking info
|
|
80
|
+
*
|
|
81
|
+
* NOTE: Stall-detection state (lastActivity, stallPhase, etc.) is managed by StallDetectionService.
|
|
82
|
+
* Only the minimum information needed for process management is kept here.
|
|
73
83
|
*/
|
|
74
84
|
interface RunningLaneInfo {
|
|
75
85
|
child: ChildProcess;
|
|
76
86
|
logPath: string;
|
|
77
87
|
logManager?: EnhancedLogManager;
|
|
78
|
-
lastActivity: number;
|
|
79
|
-
lastStateUpdate: number;
|
|
80
|
-
stallPhase: number; // 0: normal, 1: continued, 2: stronger_prompt, 3: restarted
|
|
81
|
-
taskStartTime: number;
|
|
82
|
-
lastOutput: string;
|
|
83
88
|
statePath: string;
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
continueSignalsSent: number; // Number of continue signals sent
|
|
89
|
+
laneIndex: number;
|
|
90
|
+
currentTaskIndex?: number;
|
|
87
91
|
}
|
|
88
92
|
|
|
89
93
|
/**
|
|
@@ -106,6 +110,109 @@ function logFileTail(filePath: string, lines: number = 10): void {
|
|
|
106
110
|
}
|
|
107
111
|
}
|
|
108
112
|
|
|
113
|
+
/**
 * Handle the RUN_DOCTOR recovery action for a stalled lane.
 *
 * Runs the agent and auth health checks in parallel, logs the outcome, writes a
 * `diagnostic.json` report into the lane run directory, force-kills the lane
 * process and, when the stall service still holds state for the lane, saves a
 * POF entry built from that state.
 *
 * Writing the report is best-effort: a failed write is logged as a warning and
 * never prevents the stalled process from being killed or the POF from being
 * saved.
 *
 * @param laneName - Lane being diagnosed; used as log prefix and stall-state key.
 * @param laneRunDir - Lane run directory; `state.json` is read from it and
 *   `diagnostic.json` is written into it.
 * @param runId - Run identifier forwarded to the POF helpers.
 * @param runRoot - Run root directory; the POF directory is resolved as
 *   `<runRoot>/../../pof`.
 * @param stallService - Stall detection service queried for the lane's state.
 * @param child - Lane child process; it is sent SIGKILL.
 * @returns A promise that resolves once diagnostics, kill and the POF attempt
 *   are done. It rejects if importing or running the health checks fails.
 */
async function handleDoctorDiagnostics(
  laneName: string,
  laneRunDir: string,
  runId: string,
  runRoot: string,
  stallService: StallDetectionService,
  child: ChildProcess
): Promise<void> {
  // Import health check dynamically to avoid circular dependency.
  // NOTE(review): '../utils/health' is also imported statically at the top of
  // this file, so the dynamic import may be unnecessary - confirm.
  const { checkAgentHealth, checkAuthHealth } = await import('../utils/health');

  const [agentHealth, authHealth] = await Promise.all([
    checkAgentHealth(),
    checkAuthHealth(),
  ]);

  const issues: string[] = [];
  if (!agentHealth.ok) issues.push(`Agent: ${agentHealth.message}`);
  if (!authHealth.ok) issues.push(`Auth: ${authHealth.message}`);

  if (issues.length > 0) {
    logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
  } else {
    logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
  }

  // Sample the clock once so diagnostic.json and the POF entry agree on when
  // the diagnosis happened.
  const diagnosedAt = Date.now();

  // Save diagnostic to file. A failure here (missing directory, full disk, ...)
  // must not stop us from killing the stalled process below.
  const diagnosticPath = safeJoin(laneRunDir, 'diagnostic.json');
  let diagnosticSaved = false;
  try {
    fs.writeFileSync(diagnosticPath, JSON.stringify({
      timestamp: diagnosedAt,
      agentHealthy: agentHealth.ok,
      authHealthy: authHealth.ok,
      issues,
    }, null, 2));
    diagnosticSaved = true;
  } catch (writeError: unknown) {
    const reason = writeError instanceof Error ? writeError.message : String(writeError);
    logger.warn(`[${laneName}] Failed to save diagnostic report: ${reason}`);
  }

  // Kill the process
  try {
    child.kill('SIGKILL');
  } catch {
    // Process might already be dead
  }

  if (diagnosticSaved) {
    logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
  } else {
    // Do not point the user at a report file that was never written.
    logger.error(`[${laneName}] Aborting lane after diagnostic.`);
  }

  // Save POF for failed recovery; skipped when the stall service has no state
  // for this lane.
  const stallState = stallService.getState(laneName);
  if (!stallState) {
    return;
  }

  try {
    const laneStatePath = safeJoin(laneRunDir, 'state.json');
    const laneState = loadState<LaneState>(laneStatePath);
    const pofDir = safeJoin(runRoot, '..', '..', 'pof');

    // Convert stall state to recovery state format for POF.
    // Per the original author's note, StallPhase and RecoveryStage have
    // compatible numeric values (0-5); neither enum is visible here, so the
    // casts are kept as written - TODO confirm against both definitions.
    const recoveryState: LaneRecoveryState = {
      laneName,
      stage: stallState.phase as unknown as number, // Both enums use 0-5
      lastActivityTime: stallState.lastRealActivityTime,
      lastBytesReceived: stallState.bytesSinceLastCheck,
      totalBytesReceived: stallState.totalBytesReceived,
      lastOutput: stallState.lastOutput,
      restartCount: stallState.restartCount,
      continueSignalsSent: stallState.continueSignalCount,
      lastStageChangeTime: stallState.lastPhaseChangeTime,
      isLongOperation: stallState.isLongOperation,
      failureHistory: stallState.failureHistory.map(f => ({
        timestamp: f.timestamp,
        stage: f.phase as unknown as number, // Both enums use 0-5
        action: f.action as string,
        message: f.message,
        idleTimeMs: f.idleTimeMs,
        bytesReceived: f.bytesReceived,
        lastOutput: f.lastOutput,
      })),
    };

    const diagnosticInfo = {
      timestamp: diagnosedAt,
      agentHealthy: agentHealth.ok,
      authHealthy: authHealth.ok,
      // No system-level check is run in this function; the flag is always true.
      systemHealthy: true,
      suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
      details: issues.join('\n') || 'No obvious issues found',
    };

    const pofEntry = createPOFFromRecoveryState(
      runId,
      runRoot,
      laneName,
      recoveryState,
      laneState,
      diagnosticInfo
    );
    savePOF(runId, pofDir, pofEntry);
  } catch (pofError: unknown) {
    // Anything can be thrown, so narrow before reading `.message`.
    const reason = pofError instanceof Error ? pofError.message : String(pofError);
    logger.warn(`[${laneName}] Failed to save POF: ${reason}`);
  }
}
|
|
215
|
+
|
|
109
216
|
/**
|
|
110
217
|
* Spawn a lane process
|
|
111
218
|
*/
|
|
@@ -120,6 +227,7 @@ export function spawnLane({
|
|
|
120
227
|
enhancedLogConfig,
|
|
121
228
|
noGit = false,
|
|
122
229
|
onActivity,
|
|
230
|
+
laneIndex = 0,
|
|
123
231
|
}: {
|
|
124
232
|
laneName: string;
|
|
125
233
|
tasksFile: string;
|
|
@@ -131,6 +239,7 @@ export function spawnLane({
|
|
|
131
239
|
enhancedLogConfig?: Partial<EnhancedLogConfig>;
|
|
132
240
|
noGit?: boolean;
|
|
133
241
|
onActivity?: () => void;
|
|
242
|
+
laneIndex?: number;
|
|
134
243
|
}): SpawnLaneResult {
|
|
135
244
|
fs.mkdirSync(laneRunDir, { recursive: true});
|
|
136
245
|
|
|
@@ -169,17 +278,24 @@ export function spawnLane({
|
|
|
169
278
|
};
|
|
170
279
|
|
|
171
280
|
if (logConfig.enabled) {
|
|
281
|
+
// Helper to get dynamic lane label like [L01-T01-laneName]
|
|
282
|
+
const getDynamicLabel = () => {
|
|
283
|
+
const laneNum = `L${(laneIndex + 1).toString().padStart(2, '0')}`;
|
|
284
|
+
const taskPart = info.currentTaskIndex ? `-T${info.currentTaskIndex.toString().padStart(2, '0')}` : '';
|
|
285
|
+
return `[${laneNum}${taskPart}-${laneName}]`;
|
|
286
|
+
};
|
|
287
|
+
|
|
172
288
|
// Create callback for clean console output
|
|
173
289
|
const onParsedMessage = (msg: ParsedMessage) => {
|
|
174
290
|
if (onActivity) onActivity();
|
|
175
291
|
const formatted = formatMessageForConsole(msg, {
|
|
176
|
-
laneLabel:
|
|
292
|
+
laneLabel: getDynamicLabel(),
|
|
177
293
|
includeTimestamp: true
|
|
178
294
|
});
|
|
179
295
|
process.stdout.write(formatted + '\n');
|
|
180
296
|
};
|
|
181
297
|
|
|
182
|
-
logManager = createLogManager(laneRunDir, laneName, logConfig, onParsedMessage);
|
|
298
|
+
logManager = createLogManager(laneRunDir, laneName, logConfig, onParsedMessage, laneIndex);
|
|
183
299
|
logPath = logManager.getLogPaths().clean;
|
|
184
300
|
|
|
185
301
|
// Spawn with pipe for enhanced logging
|
|
@@ -189,6 +305,16 @@ export function spawnLane({
|
|
|
189
305
|
detached: false,
|
|
190
306
|
});
|
|
191
307
|
|
|
308
|
+
// Initialize info object for stdout handler to use
|
|
309
|
+
const info: RunningLaneInfo = {
|
|
310
|
+
child,
|
|
311
|
+
logManager,
|
|
312
|
+
logPath,
|
|
313
|
+
statePath: safeJoin(laneRunDir, 'state.json'),
|
|
314
|
+
laneIndex,
|
|
315
|
+
currentTaskIndex: startIndex > 0 ? startIndex + 1 : 0
|
|
316
|
+
};
|
|
317
|
+
|
|
192
318
|
// Buffer for non-JSON lines
|
|
193
319
|
let lineBuffer = '';
|
|
194
320
|
|
|
@@ -205,24 +331,52 @@ export function spawnLane({
|
|
|
205
331
|
|
|
206
332
|
for (const line of lines) {
|
|
207
333
|
const trimmed = line.trim();
|
|
334
|
+
if (!trimmed) continue;
|
|
335
|
+
|
|
336
|
+
// Detect task start/progress to update label
|
|
337
|
+
// Example: [1/1] hello-task
|
|
338
|
+
const cleanLine = stripAnsi(trimmed);
|
|
339
|
+
const taskMatch = cleanLine.match(/^\s*\[(\d+)\/(\d+)\]\s+(.+)$/);
|
|
340
|
+
if (taskMatch) {
|
|
341
|
+
info.currentTaskIndex = parseInt(taskMatch[1]!);
|
|
342
|
+
// Update log manager's task index to keep it in sync for readable log
|
|
343
|
+
if (logManager) {
|
|
344
|
+
logManager.setTask(taskMatch[3]!.trim(), undefined, info.currentTaskIndex - 1);
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
|
|
208
348
|
// Show if it's a timestamped log line (starts with [YYYY-MM-DD... or [HH:MM:SS])
|
|
209
349
|
// or if it's NOT a noisy JSON line
|
|
210
|
-
const hasTimestamp = /^\[\d{4}-\d{2}-\d{2}T|\^\[\d{2}:\d{2}:\d{2}\]/.test(trimmed);
|
|
211
350
|
const isJson = trimmed.startsWith('{') || trimmed.includes('{"type"');
|
|
212
351
|
// Filter out heartbeats - they should NOT reset the idle timer
|
|
213
352
|
const isHeartbeat = trimmed.includes('Heartbeat') && trimmed.includes('bytes received');
|
|
214
353
|
|
|
215
|
-
if (
|
|
354
|
+
if (!isJson) {
|
|
216
355
|
// Only trigger activity for non-heartbeat lines
|
|
217
356
|
if (onActivity && !isHeartbeat) onActivity();
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
357
|
+
|
|
358
|
+
const currentLabel = getDynamicLabel();
|
|
359
|
+
const coloredLabel = `${logger.COLORS.magenta}${currentLabel}${logger.COLORS.reset}`;
|
|
360
|
+
|
|
361
|
+
// Regex that matches timestamp even if it has ANSI color codes
|
|
362
|
+
// Matches: [24:39:14] or \x1b[90m[24:39:14]\x1b[0m
|
|
363
|
+
const timestampRegex = /^((?:\x1b\[[0-9;]*m)*)\[(\d{4}-\d{2}-\d{2}T|\d{2}:\d{2}:\d{2})\]/;
|
|
364
|
+
const tsMatch = trimmed.match(timestampRegex);
|
|
365
|
+
|
|
366
|
+
if (tsMatch) {
|
|
367
|
+
// If line already has timestamp format, just add lane prefix
|
|
368
|
+
// Check if lane label is already present to avoid triple duplication
|
|
369
|
+
if (!trimmed.includes(currentLabel)) {
|
|
370
|
+
// Insert label after the timestamp part
|
|
371
|
+
const tsPart = tsMatch[0];
|
|
372
|
+
const formatted = trimmed.replace(tsPart, `${tsPart} ${coloredLabel}`);
|
|
373
|
+
process.stdout.write(formatted + '\n');
|
|
374
|
+
} else {
|
|
375
|
+
process.stdout.write(trimmed + '\n');
|
|
376
|
+
}
|
|
223
377
|
} else {
|
|
224
378
|
// Add full prefix: timestamp + lane
|
|
225
|
-
process.stdout.write(`${logger.COLORS.gray}[${new Date().toLocaleTimeString('en-US', { hour12: false })}]${logger.COLORS.reset} ${
|
|
379
|
+
process.stdout.write(`${logger.COLORS.gray}[${new Date().toLocaleTimeString('en-US', { hour12: false })}]${logger.COLORS.reset} ${coloredLabel} ${line}\n`);
|
|
226
380
|
}
|
|
227
381
|
}
|
|
228
382
|
}
|
|
@@ -244,11 +398,14 @@ export function spawnLane({
|
|
|
244
398
|
trimmed.includes('actual output');
|
|
245
399
|
|
|
246
400
|
const ts = new Date().toLocaleTimeString('en-US', { hour12: false });
|
|
401
|
+
const currentLabel = getDynamicLabel();
|
|
402
|
+
const coloredLabel = `${logger.COLORS.magenta}${currentLabel}${logger.COLORS.reset}`;
|
|
403
|
+
|
|
247
404
|
if (isStatus) {
|
|
248
|
-
process.stdout.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${
|
|
405
|
+
process.stdout.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${coloredLabel} ${trimmed}\n`);
|
|
249
406
|
} else {
|
|
250
407
|
if (onActivity) onActivity();
|
|
251
|
-
process.stderr.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${
|
|
408
|
+
process.stderr.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${coloredLabel} ${logger.COLORS.red}❌ ERR ${trimmed}${logger.COLORS.reset}\n`);
|
|
252
409
|
}
|
|
253
410
|
}
|
|
254
411
|
}
|
|
@@ -259,9 +416,11 @@ export function spawnLane({
|
|
|
259
416
|
child.on('exit', () => {
|
|
260
417
|
logManager?.close();
|
|
261
418
|
});
|
|
419
|
+
|
|
420
|
+
return { child, logPath, logManager, info };
|
|
262
421
|
} else {
|
|
263
422
|
// Fallback to simple file logging
|
|
264
|
-
logPath = safeJoin(laneRunDir, 'terminal.log');
|
|
423
|
+
logPath = safeJoin(laneRunDir, 'terminal-readable.log');
|
|
265
424
|
const logFd = fs.openSync(logPath, 'a');
|
|
266
425
|
|
|
267
426
|
child = spawn('node', args, {
|
|
@@ -275,9 +434,19 @@ export function spawnLane({
|
|
|
275
434
|
} catch {
|
|
276
435
|
// Ignore
|
|
277
436
|
}
|
|
437
|
+
|
|
438
|
+
return {
|
|
439
|
+
child,
|
|
440
|
+
logPath,
|
|
441
|
+
logManager,
|
|
442
|
+
info: {
|
|
443
|
+
child,
|
|
444
|
+
logPath,
|
|
445
|
+
statePath: safeJoin(laneRunDir, 'state.json'),
|
|
446
|
+
laneIndex
|
|
447
|
+
}
|
|
448
|
+
};
|
|
278
449
|
}
|
|
279
|
-
|
|
280
|
-
return { child, logPath, logManager };
|
|
281
450
|
}
|
|
282
451
|
|
|
283
452
|
/**
|
|
@@ -296,7 +465,7 @@ export function waitChild(proc: ChildProcess): Promise<number> {
|
|
|
296
465
|
}
|
|
297
466
|
|
|
298
467
|
/**
|
|
299
|
-
* List lane task files in directory
|
|
468
|
+
* List lane task files in directory
|
|
300
469
|
*/
|
|
301
470
|
export function listLaneFiles(tasksDir: string): LaneInfo[] {
|
|
302
471
|
if (!fs.existsSync(tasksDir)) {
|
|
@@ -310,19 +479,10 @@ export function listLaneFiles(tasksDir: string): LaneInfo[] {
|
|
|
310
479
|
.map(f => {
|
|
311
480
|
const filePath = safeJoin(tasksDir, f);
|
|
312
481
|
const name = path.basename(f, '.json');
|
|
313
|
-
let dependsOn: string[] = [];
|
|
314
|
-
|
|
315
|
-
try {
|
|
316
|
-
const config = JSON.parse(fs.readFileSync(filePath, 'utf8')) as RunnerConfig;
|
|
317
|
-
dependsOn = config.dependsOn || [];
|
|
318
|
-
} catch (e) {
|
|
319
|
-
logger.warn(`Failed to parse config for lane ${name}: ${e}`);
|
|
320
|
-
}
|
|
321
482
|
|
|
322
483
|
return {
|
|
323
484
|
name,
|
|
324
485
|
path: filePath,
|
|
325
|
-
dependsOn,
|
|
326
486
|
};
|
|
327
487
|
});
|
|
328
488
|
}
|
|
@@ -339,8 +499,7 @@ export function printLaneStatus(lanes: LaneInfo[], laneRunDirs: Record<string, s
|
|
|
339
499
|
const state = loadState<LaneState>(statePath);
|
|
340
500
|
|
|
341
501
|
if (!state) {
|
|
342
|
-
|
|
343
|
-
return { lane: lane.name, status: isWaiting ? 'waiting' : 'pending', task: '-' };
|
|
502
|
+
return { lane: lane.name, status: 'pending', task: '-' };
|
|
344
503
|
}
|
|
345
504
|
|
|
346
505
|
const idx = (state.currentTaskIndex || 0) + 1;
|
|
@@ -388,12 +547,12 @@ async function resolveAllDependencies(
|
|
|
388
547
|
const worktreeDir = state?.worktreeDir || safeJoin(runRoot, 'resolution-worktree');
|
|
389
548
|
|
|
390
549
|
if (!fs.existsSync(worktreeDir)) {
|
|
391
|
-
logger.info(
|
|
550
|
+
logger.info(`🏗️ Creating resolution worktree at ${worktreeDir}`);
|
|
392
551
|
git.createWorktree(worktreeDir, pipelineBranch, { baseBranch: git.getCurrentBranch() });
|
|
393
552
|
}
|
|
394
553
|
|
|
395
554
|
// 3. Resolve on pipeline branch
|
|
396
|
-
logger.info(
|
|
555
|
+
logger.info(`🔄 Resolving dependencies on branch ${pipelineBranch}`);
|
|
397
556
|
git.runGit(['checkout', pipelineBranch], { cwd: worktreeDir });
|
|
398
557
|
|
|
399
558
|
for (const cmd of uniqueCommands) {
|
|
@@ -474,7 +633,6 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
474
633
|
noGit?: boolean;
|
|
475
634
|
skipPreflight?: boolean;
|
|
476
635
|
stallConfig?: Partial<StallDetectionConfig>;
|
|
477
|
-
autoRecoveryConfig?: Partial<AutoRecoveryConfig>;
|
|
478
636
|
} = {}): Promise<{ lanes: LaneInfo[]; exitCodes: Record<string, number>; runRoot: string }> {
|
|
479
637
|
const lanes = listLaneFiles(tasksDir);
|
|
480
638
|
|
|
@@ -510,34 +668,11 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
510
668
|
logger.success('✓ Preflight checks passed');
|
|
511
669
|
}
|
|
512
670
|
|
|
513
|
-
// Validate dependencies and detect cycles
|
|
514
|
-
logger.section('📊 Dependency Analysis');
|
|
515
|
-
|
|
516
|
-
const depInfos: DependencyInfo[] = lanes.map(l => ({
|
|
517
|
-
name: l.name,
|
|
518
|
-
dependsOn: l.dependsOn,
|
|
519
|
-
}));
|
|
520
|
-
|
|
521
|
-
const depValidation = validateDependencies(depInfos);
|
|
522
|
-
|
|
523
|
-
if (!depValidation.valid) {
|
|
524
|
-
logger.error('❌ Dependency validation failed:');
|
|
525
|
-
for (const err of depValidation.errors) {
|
|
526
|
-
logger.error(` • ${err}`);
|
|
527
|
-
}
|
|
528
|
-
throw new Error('Invalid dependency configuration');
|
|
529
|
-
}
|
|
530
|
-
|
|
531
|
-
if (depValidation.warnings.length > 0) {
|
|
532
|
-
for (const warn of depValidation.warnings) {
|
|
533
|
-
logger.warn(`⚠️ ${warn}`);
|
|
534
|
-
}
|
|
535
|
-
}
|
|
536
|
-
|
|
537
|
-
// Print dependency graph
|
|
538
|
-
printDependencyGraph(depInfos);
|
|
539
|
-
|
|
540
671
|
const config = loadConfig();
|
|
672
|
+
|
|
673
|
+
// Set verbose git logging from config
|
|
674
|
+
git.setVerboseGit(config.verboseGit || false);
|
|
675
|
+
|
|
541
676
|
const logsDir = getLogsDir(config);
|
|
542
677
|
const runId = `run-${Date.now()}`;
|
|
543
678
|
// Use absolute path for runRoot to avoid issues with subfolders
|
|
@@ -561,17 +696,11 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
561
696
|
const randomSuffix = Math.random().toString(36).substring(2, 7);
|
|
562
697
|
const pipelineBranch = `cursorflow/run-${Date.now().toString(36)}-${randomSuffix}`;
|
|
563
698
|
|
|
564
|
-
//
|
|
565
|
-
const
|
|
699
|
+
// Initialize unified stall detection service (Single Source of Truth)
|
|
700
|
+
const stallService = getStallService({
|
|
566
701
|
...DEFAULT_ORCHESTRATOR_STALL_CONFIG,
|
|
567
702
|
...options.stallConfig,
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
// Initialize auto-recovery manager
|
|
571
|
-
const autoRecoveryManager = getAutoRecoveryManager({
|
|
572
|
-
...DEFAULT_AUTO_RECOVERY_CONFIG,
|
|
573
|
-
idleTimeoutMs: stallConfig.idleTimeoutMs, // Sync with stall config
|
|
574
|
-
...options.autoRecoveryConfig,
|
|
703
|
+
verbose: process.env['DEBUG_STALL'] === 'true',
|
|
575
704
|
});
|
|
576
705
|
|
|
577
706
|
// Initialize event system
|
|
@@ -632,6 +761,7 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
632
761
|
|
|
633
762
|
laneWorktreeDirs[lane.name] = laneWorktreeDir;
|
|
634
763
|
|
|
764
|
+
logger.info(`🏗️ Initializing lane ${lane.name}: branch=${lanePipelineBranch}`);
|
|
635
765
|
const initialState = createLaneState(lane.name, taskConfig, lane.path, {
|
|
636
766
|
pipelineBranch: lanePipelineBranch,
|
|
637
767
|
worktreeDir: laneWorktreeDir
|
|
@@ -647,21 +777,6 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
647
777
|
logger.info(`Run directory: ${runRoot}`);
|
|
648
778
|
logger.info(`Lanes: ${lanes.length}`);
|
|
649
779
|
|
|
650
|
-
// Display dependency graph
|
|
651
|
-
logger.info('\n📊 Dependency Graph:');
|
|
652
|
-
for (const lane of lanes) {
|
|
653
|
-
const deps = lane.dependsOn.length > 0 ? ` [depends on: ${lane.dependsOn.join(', ')}]` : '';
|
|
654
|
-
console.log(` ${logger.COLORS.cyan}${lane.name}${logger.COLORS.reset}${deps}`);
|
|
655
|
-
|
|
656
|
-
// Simple tree-like visualization for deep dependencies
|
|
657
|
-
if (lane.dependsOn.length > 0) {
|
|
658
|
-
for (const dep of lane.dependsOn) {
|
|
659
|
-
console.log(` └─ ${dep}`);
|
|
660
|
-
}
|
|
661
|
-
}
|
|
662
|
-
}
|
|
663
|
-
console.log('');
|
|
664
|
-
|
|
665
780
|
// Disable auto-resolve when noGit mode is enabled
|
|
666
781
|
const autoResolve = !options.noGit && options.autoResolveDependencies !== false;
|
|
667
782
|
|
|
@@ -696,29 +811,12 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
696
811
|
|
|
697
812
|
try {
|
|
698
813
|
while (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length || (blockedLanes.size > 0 && running.size === 0)) {
|
|
699
|
-
// 1. Identify lanes ready to start
|
|
814
|
+
// 1. Identify lanes ready to start (all lanes can start immediately - no lane-level dependencies)
|
|
700
815
|
const readyToStart = lanes.filter(lane => {
|
|
701
816
|
// Not already running or completed or failed or blocked
|
|
702
817
|
if (running.has(lane.name) || completedLanes.has(lane.name) || failedLanes.has(lane.name) || blockedLanes.has(lane.name)) {
|
|
703
818
|
return false;
|
|
704
819
|
}
|
|
705
|
-
|
|
706
|
-
// Check dependencies
|
|
707
|
-
for (const dep of lane.dependsOn) {
|
|
708
|
-
if (failedLanes.has(dep)) {
|
|
709
|
-
logger.error(`Lane ${lane.name} will not start because dependency ${dep} failed`);
|
|
710
|
-
failedLanes.add(lane.name);
|
|
711
|
-
exitCodes[lane.name] = 1;
|
|
712
|
-
return false;
|
|
713
|
-
}
|
|
714
|
-
if (blockedLanes.has(dep)) {
|
|
715
|
-
// If a dependency is blocked, wait
|
|
716
|
-
return false;
|
|
717
|
-
}
|
|
718
|
-
if (!completedLanes.has(dep)) {
|
|
719
|
-
return false;
|
|
720
|
-
}
|
|
721
|
-
}
|
|
722
820
|
return true;
|
|
723
821
|
});
|
|
724
822
|
|
|
@@ -737,23 +835,23 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
737
835
|
logger.info(`Lane started: ${lane.name}${lane.startIndex ? ` (resuming from ${lane.startIndex})` : ''}`);
|
|
738
836
|
|
|
739
837
|
const now = Date.now();
|
|
740
|
-
|
|
838
|
+
|
|
839
|
+
// Register lane with unified stall detection service FIRST
|
|
840
|
+
stallService.registerLane(lane.name, {
|
|
841
|
+
laneRunDir: laneRunDirs[lane.name]!,
|
|
842
|
+
});
|
|
843
|
+
|
|
844
|
+
const laneIdx = lanes.findIndex(l => l.name === lane.name);
|
|
845
|
+
|
|
846
|
+
// Pre-register lane in running map
|
|
741
847
|
running.set(lane.name, {
|
|
742
848
|
child: {} as any, // Placeholder, will be replaced below
|
|
743
849
|
logManager: undefined,
|
|
744
850
|
logPath: '',
|
|
745
|
-
lastActivity: now,
|
|
746
|
-
lastStateUpdate: now,
|
|
747
|
-
stallPhase: 0,
|
|
748
|
-
taskStartTime: now,
|
|
749
|
-
lastOutput: '',
|
|
750
851
|
statePath: laneStatePath,
|
|
751
|
-
|
|
752
|
-
lastBytesCheck: 0,
|
|
753
|
-
continueSignalsSent: 0,
|
|
852
|
+
laneIndex: laneIdx >= 0 ? laneIdx : 0,
|
|
754
853
|
});
|
|
755
854
|
|
|
756
|
-
let lastOutput = '';
|
|
757
855
|
const spawnResult = spawnLane({
|
|
758
856
|
laneName: lane.name,
|
|
759
857
|
tasksFile: lane.path,
|
|
@@ -764,55 +862,40 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
764
862
|
worktreeDir: laneWorktreeDirs[lane.name],
|
|
765
863
|
enhancedLogConfig: options.enhancedLogging,
|
|
766
864
|
noGit: options.noGit,
|
|
865
|
+
laneIndex: laneIdx >= 0 ? laneIdx : 0,
|
|
767
866
|
onActivity: () => {
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
const actNow = Date.now();
|
|
771
|
-
info.lastActivity = actNow;
|
|
772
|
-
info.lastStateUpdate = actNow;
|
|
773
|
-
info.stallPhase = 0;
|
|
774
|
-
}
|
|
867
|
+
// Record state file update activity
|
|
868
|
+
stallService.recordStateUpdate(lane.name);
|
|
775
869
|
}
|
|
776
870
|
});
|
|
777
871
|
|
|
778
872
|
// Update with actual spawn result
|
|
779
873
|
const existingInfo = running.get(lane.name)!;
|
|
780
|
-
Object.assign(existingInfo, spawnResult);
|
|
874
|
+
Object.assign(existingInfo, spawnResult.info);
|
|
875
|
+
|
|
876
|
+
// Update stall service with child process reference
|
|
877
|
+
stallService.setChildProcess(lane.name, spawnResult.child);
|
|
781
878
|
|
|
782
|
-
// Track
|
|
879
|
+
// Track stdout for activity detection - delegate to StallDetectionService
|
|
783
880
|
if (spawnResult.child.stdout) {
|
|
784
881
|
spawnResult.child.stdout.on('data', (data: Buffer) => {
|
|
785
|
-
const
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
const lastRealLine = realLines[realLines.length - 1]!;
|
|
800
|
-
info.lastOutput = lastRealLine;
|
|
801
|
-
info.bytesReceived += data.length;
|
|
802
|
-
|
|
803
|
-
// Update auto-recovery manager with real activity
|
|
804
|
-
autoRecoveryManager.recordActivity(lane.name, data.length, info.lastOutput);
|
|
805
|
-
} else if (lines.length > 0) {
|
|
806
|
-
// Only heartbeats received - do NOT update lastActivity (keep stall timer running)
|
|
807
|
-
autoRecoveryManager.recordActivity(lane.name, 0, info.lastOutput);
|
|
808
|
-
}
|
|
882
|
+
const output = data.toString();
|
|
883
|
+
const lines = output.split('\n').filter(l => l.trim());
|
|
884
|
+
|
|
885
|
+
// Filter out heartbeats from activity tracking
|
|
886
|
+
const realLines = lines.filter(line => !(line.includes('Heartbeat') && line.includes('bytes received')));
|
|
887
|
+
|
|
888
|
+
if (realLines.length > 0) {
|
|
889
|
+
// Real activity - record with bytes
|
|
890
|
+
const lastRealLine = realLines[realLines.length - 1]!;
|
|
891
|
+
stallService.recordActivity(lane.name, data.length, lastRealLine);
|
|
892
|
+
} else if (lines.length > 0) {
|
|
893
|
+
// Heartbeat only - record with 0 bytes (won't reset timer)
|
|
894
|
+
stallService.recordActivity(lane.name, 0);
|
|
809
895
|
}
|
|
810
896
|
});
|
|
811
897
|
}
|
|
812
898
|
|
|
813
|
-
// Register lane with auto-recovery manager
|
|
814
|
-
autoRecoveryManager.registerLane(lane.name);
|
|
815
|
-
|
|
816
899
|
// Update lane tracking
|
|
817
900
|
lane.taskStartTime = now;
|
|
818
901
|
|
|
@@ -843,234 +926,47 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
843
926
|
if (result.name === '__poll__' || (now - lastStallCheck >= 10000)) {
|
|
844
927
|
lastStallCheck = now;
|
|
845
928
|
|
|
846
|
-
// Periodic stall check
|
|
929
|
+
// Periodic stall check using unified StallDetectionService
|
|
847
930
|
for (const [laneName, info] of running.entries()) {
|
|
848
|
-
const idleTime = now - info.lastActivity;
|
|
849
931
|
const lane = lanes.find(l => l.name === laneName)!;
|
|
850
932
|
|
|
851
|
-
if (process.env['DEBUG_STALL']) {
|
|
852
|
-
logger.debug(`[${laneName}] Stall check: idle=${Math.round(idleTime/1000)}s, bytesDelta=${info.bytesReceived - info.lastBytesCheck}, phase=${info.stallPhase}`);
|
|
853
|
-
}
|
|
854
|
-
|
|
855
933
|
// Check state file for progress updates
|
|
856
|
-
let progressTime = 0;
|
|
857
934
|
try {
|
|
858
935
|
const stateStat = fs.statSync(info.statePath);
|
|
859
|
-
const
|
|
860
|
-
if (
|
|
861
|
-
|
|
936
|
+
const stallState = stallService.getState(laneName);
|
|
937
|
+
if (stallState && stateStat.mtimeMs > stallState.lastStateUpdateTime) {
|
|
938
|
+
stallService.recordStateUpdate(laneName);
|
|
862
939
|
}
|
|
863
|
-
progressTime = now - info.lastStateUpdate;
|
|
864
940
|
} catch {
|
|
865
941
|
// State file might not exist yet
|
|
866
942
|
}
|
|
867
943
|
|
|
868
|
-
//
|
|
869
|
-
|
|
870
|
-
|
|
944
|
+
// Debug logging
|
|
945
|
+
if (process.env['DEBUG_STALL']) {
|
|
946
|
+
logger.debug(`[${laneName}] ${stallService.dumpState(laneName)}`);
|
|
947
|
+
}
|
|
871
948
|
|
|
872
|
-
//
|
|
873
|
-
const analysis =
|
|
874
|
-
stallPhase: info.stallPhase,
|
|
875
|
-
idleTimeMs: idleTime,
|
|
876
|
-
progressTimeMs: progressTime,
|
|
877
|
-
lastOutput: info.lastOutput,
|
|
878
|
-
restartCount: lane.restartCount || 0,
|
|
879
|
-
taskStartTimeMs: info.taskStartTime,
|
|
880
|
-
bytesReceived: bytesDelta, // Bytes since last check
|
|
881
|
-
continueSignalsSent: info.continueSignalsSent,
|
|
882
|
-
}, stallConfig);
|
|
949
|
+
// Run stall analysis and recovery (all logic is in StallDetectionService)
|
|
950
|
+
const analysis = stallService.checkAndRecover(laneName);
|
|
883
951
|
|
|
884
|
-
//
|
|
952
|
+
// Log to lane log manager if there was an action
|
|
885
953
|
if (analysis.action !== RecoveryAction.NONE) {
|
|
886
|
-
logFailure(laneName, analysis);
|
|
887
954
|
info.logManager?.log('error', analysis.message);
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
laneName,
|
|
900
|
-
idleSeconds: Math.round(idleTime / 1000),
|
|
901
|
-
signalCount: info.continueSignalsSent,
|
|
902
|
-
});
|
|
903
|
-
} catch (e) {
|
|
904
|
-
logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
|
|
905
|
-
}
|
|
906
|
-
} else if (analysis.action === RecoveryAction.STRONGER_PROMPT) {
|
|
907
|
-
const interventionPath = safeJoin(laneRunDirs[laneName]!, 'intervention.txt');
|
|
908
|
-
const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck. Please continue with your current task immediately. If you're waiting for something, explain what you need and proceed with what you can do now. If you've completed the task, summarize your work and finish.`;
|
|
909
|
-
try {
|
|
910
|
-
fs.writeFileSync(interventionPath, strongerPrompt);
|
|
911
|
-
info.stallPhase = 2;
|
|
912
|
-
info.lastActivity = now;
|
|
913
|
-
logger.warn(`[${laneName}] Sent stronger prompt after continue signal failed`);
|
|
914
|
-
|
|
915
|
-
events.emit('recovery.stronger_prompt', { laneName });
|
|
916
|
-
} catch (e) {
|
|
917
|
-
logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
|
|
918
|
-
}
|
|
919
|
-
} else if (analysis.action === RecoveryAction.KILL_AND_RESTART ||
|
|
920
|
-
analysis.action === RecoveryAction.RESTART_LANE ||
|
|
921
|
-
analysis.action === RecoveryAction.RESTART_LANE_FROM_CHECKPOINT) {
|
|
922
|
-
lane.restartCount = (lane.restartCount || 0) + 1;
|
|
923
|
-
info.stallPhase = 3;
|
|
924
|
-
|
|
925
|
-
// Try to get checkpoint info
|
|
926
|
-
const checkpoint = getLatestCheckpoint(laneRunDirs[laneName]!);
|
|
927
|
-
if (checkpoint) {
|
|
928
|
-
logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
|
|
929
|
-
}
|
|
930
|
-
|
|
931
|
-
// Kill the process
|
|
932
|
-
try {
|
|
933
|
-
info.child.kill('SIGKILL');
|
|
934
|
-
} catch {
|
|
935
|
-
// Process might already be dead
|
|
936
|
-
}
|
|
937
|
-
|
|
938
|
-
logger.warn(`[${laneName}] Killing and restarting lane (restart #${lane.restartCount})`);
|
|
939
|
-
|
|
940
|
-
events.emit('recovery.restart', {
|
|
941
|
-
laneName,
|
|
942
|
-
restartCount: lane.restartCount,
|
|
943
|
-
maxRestarts: stallConfig.maxRestarts,
|
|
944
|
-
});
|
|
945
|
-
} else if (analysis.action === RecoveryAction.RUN_DOCTOR) {
|
|
946
|
-
info.stallPhase = 4;
|
|
947
|
-
|
|
948
|
-
// Run diagnostics
|
|
949
|
-
logger.error(`[${laneName}] Running diagnostics due to persistent failures...`);
|
|
950
|
-
|
|
951
|
-
// Import health check dynamically to avoid circular dependency
|
|
952
|
-
const { checkAgentHealth, checkAuthHealth } = await import('../utils/health');
|
|
953
|
-
|
|
954
|
-
const [agentHealth, authHealth] = await Promise.all([
|
|
955
|
-
checkAgentHealth(),
|
|
956
|
-
checkAuthHealth(),
|
|
957
|
-
]);
|
|
958
|
-
|
|
959
|
-
const issues: string[] = [];
|
|
960
|
-
if (!agentHealth.ok) issues.push(`Agent: ${agentHealth.message}`);
|
|
961
|
-
if (!authHealth.ok) issues.push(`Auth: ${authHealth.message}`);
|
|
962
|
-
|
|
963
|
-
if (issues.length > 0) {
|
|
964
|
-
logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
|
|
965
|
-
} else {
|
|
966
|
-
logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
|
|
967
|
-
}
|
|
968
|
-
|
|
969
|
-
// Save diagnostic to file
|
|
970
|
-
const diagnosticPath = safeJoin(laneRunDirs[laneName]!, 'diagnostic.json');
|
|
971
|
-
fs.writeFileSync(diagnosticPath, JSON.stringify({
|
|
972
|
-
timestamp: Date.now(),
|
|
973
|
-
agentHealthy: agentHealth.ok,
|
|
974
|
-
authHealthy: authHealth.ok,
|
|
975
|
-
issues,
|
|
976
|
-
analysis,
|
|
977
|
-
}, null, 2));
|
|
978
|
-
|
|
979
|
-
// Kill the process
|
|
980
|
-
try {
|
|
981
|
-
info.child.kill('SIGKILL');
|
|
982
|
-
} catch {
|
|
983
|
-
// Process might already be dead
|
|
984
|
-
}
|
|
985
|
-
|
|
986
|
-
logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
|
|
987
|
-
|
|
988
|
-
// Save POF for failed recovery
|
|
989
|
-
const recoveryState = autoRecoveryManager.getState(laneName);
|
|
990
|
-
if (recoveryState) {
|
|
991
|
-
try {
|
|
992
|
-
const laneStatePath = safeJoin(laneRunDirs[laneName]!, 'state.json');
|
|
993
|
-
const laneState = loadState<LaneState>(laneStatePath);
|
|
994
|
-
const pofDir = safeJoin(runRoot, '..', '..', 'pof');
|
|
995
|
-
const diagnosticInfo = {
|
|
996
|
-
timestamp: Date.now(),
|
|
997
|
-
agentHealthy: agentHealth.ok,
|
|
998
|
-
authHealthy: authHealth.ok,
|
|
999
|
-
systemHealthy: true,
|
|
1000
|
-
suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
|
|
1001
|
-
details: issues.join('\n') || 'No obvious issues found',
|
|
1002
|
-
};
|
|
1003
|
-
const pofEntry = createPOFFromRecoveryState(
|
|
1004
|
-
runId,
|
|
1005
|
-
runRoot,
|
|
1006
|
-
laneName,
|
|
1007
|
-
recoveryState,
|
|
1008
|
-
laneState,
|
|
1009
|
-
diagnosticInfo
|
|
1010
|
-
);
|
|
1011
|
-
savePOF(runId, pofDir, pofEntry);
|
|
1012
|
-
} catch (pofError: any) {
|
|
1013
|
-
logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
|
|
1014
|
-
}
|
|
1015
|
-
}
|
|
1016
|
-
|
|
1017
|
-
events.emit('recovery.diagnosed', {
|
|
1018
|
-
laneName,
|
|
1019
|
-
diagnostic: { agentHealthy: agentHealth.ok, authHealthy: authHealth.ok, issues },
|
|
1020
|
-
});
|
|
1021
|
-
} else if (analysis.action === RecoveryAction.ABORT_LANE) {
|
|
1022
|
-
info.stallPhase = 5;
|
|
1023
|
-
|
|
1024
|
-
try {
|
|
1025
|
-
info.child.kill('SIGKILL');
|
|
1026
|
-
} catch {
|
|
1027
|
-
// Process might already be dead
|
|
1028
|
-
}
|
|
1029
|
-
|
|
1030
|
-
logger.error(`[${laneName}] Aborting lane due to repeated stalls`);
|
|
1031
|
-
|
|
1032
|
-
// Save POF for failed recovery
|
|
1033
|
-
const recoveryState = autoRecoveryManager.getState(laneName);
|
|
1034
|
-
if (recoveryState) {
|
|
1035
|
-
try {
|
|
1036
|
-
const laneStatePath = safeJoin(laneRunDirs[laneName]!, 'state.json');
|
|
1037
|
-
const laneState = loadState<LaneState>(laneStatePath);
|
|
1038
|
-
const pofDir = safeJoin(runRoot, '..', '..', 'pof');
|
|
1039
|
-
const pofEntry = createPOFFromRecoveryState(
|
|
1040
|
-
runId,
|
|
1041
|
-
runRoot,
|
|
1042
|
-
laneName,
|
|
1043
|
-
recoveryState,
|
|
1044
|
-
laneState,
|
|
1045
|
-
recoveryState.diagnosticInfo
|
|
1046
|
-
);
|
|
1047
|
-
savePOF(runId, pofDir, pofEntry);
|
|
1048
|
-
} catch (pofError: any) {
|
|
1049
|
-
logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
|
|
1050
|
-
}
|
|
1051
|
-
}
|
|
1052
|
-
} else if (analysis.action === RecoveryAction.SEND_GIT_GUIDANCE) {
|
|
1053
|
-
// Send guidance message to agent for git issues
|
|
1054
|
-
const interventionPath = safeJoin(laneRunDirs[laneName]!, 'intervention.txt');
|
|
1055
|
-
|
|
1056
|
-
// Determine which guidance to send based on the failure type
|
|
1057
|
-
let guidance: string;
|
|
1058
|
-
if (analysis.type === FailureType.GIT_PUSH_REJECTED) {
|
|
1059
|
-
guidance = getGitPushFailureGuidance();
|
|
1060
|
-
} else if (analysis.type === FailureType.MERGE_CONFLICT) {
|
|
1061
|
-
guidance = getMergeConflictGuidance();
|
|
1062
|
-
} else {
|
|
1063
|
-
guidance = getGitErrorGuidance(analysis.message);
|
|
1064
|
-
}
|
|
1065
|
-
|
|
1066
|
-
try {
|
|
1067
|
-
fs.writeFileSync(interventionPath, guidance);
|
|
1068
|
-
info.lastActivity = now;
|
|
1069
|
-
logger.info(`[${laneName}] Sent git issue guidance to agent`);
|
|
1070
|
-
} catch (e: any) {
|
|
1071
|
-
logger.error(`[${laneName}] Failed to send guidance: ${e.message}`);
|
|
1072
|
-
}
|
|
955
|
+
|
|
956
|
+
// Handle special case: RUN_DOCTOR needs async operations
|
|
957
|
+
if (analysis.action === RecoveryAction.RUN_DOCTOR) {
|
|
958
|
+
await handleDoctorDiagnostics(
|
|
959
|
+
laneName,
|
|
960
|
+
laneRunDirs[laneName]!,
|
|
961
|
+
runId,
|
|
962
|
+
runRoot,
|
|
963
|
+
stallService,
|
|
964
|
+
info.child
|
|
965
|
+
);
|
|
1073
966
|
}
|
|
967
|
+
|
|
968
|
+
// Sync restartCount back to lane info (for restart logic in process exit handler)
|
|
969
|
+
lane.restartCount = stallService.getRestartCount(laneName);
|
|
1074
970
|
}
|
|
1075
971
|
}
|
|
1076
972
|
continue;
|
|
@@ -1080,8 +976,11 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
1080
976
|
running.delete(finished.name);
|
|
1081
977
|
exitCodes[finished.name] = finished.code;
|
|
1082
978
|
|
|
1083
|
-
//
|
|
1084
|
-
|
|
979
|
+
// Get stall state before unregistering
|
|
980
|
+
const stallPhase = stallService.getPhase(finished.name);
|
|
981
|
+
|
|
982
|
+
// Unregister from stall detection service
|
|
983
|
+
stallService.unregisterLane(finished.name);
|
|
1085
984
|
|
|
1086
985
|
if (finished.code === 0) {
|
|
1087
986
|
completedLanes.add(finished.name);
|
|
@@ -1111,8 +1010,8 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
1111
1010
|
logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
|
|
1112
1011
|
}
|
|
1113
1012
|
} else {
|
|
1114
|
-
// Check if it was a restart request
|
|
1115
|
-
if (
|
|
1013
|
+
// Check if it was a restart request (RESTART_REQUESTED phase)
|
|
1014
|
+
if (stallPhase === StallPhase.RESTART_REQUESTED) {
|
|
1116
1015
|
logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
|
|
1117
1016
|
|
|
1118
1017
|
// Update startIndex from current state to resume from the same task
|
|
@@ -1133,7 +1032,7 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
1133
1032
|
failedLanes.add(finished.name);
|
|
1134
1033
|
|
|
1135
1034
|
let errorMsg = 'Process exited with non-zero code';
|
|
1136
|
-
if (
|
|
1035
|
+
if (stallPhase >= StallPhase.DIAGNOSED) {
|
|
1137
1036
|
errorMsg = 'Stopped due to repeated stall';
|
|
1138
1037
|
} else if (info.logManager) {
|
|
1139
1038
|
const lastError = info.logManager.getLastError();
|