@litmers/cursorflow-orchestrator 0.1.30 → 0.1.34
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +144 -52
- package/commands/cursorflow-add.md +159 -0
- package/commands/cursorflow-monitor.md +23 -2
- package/commands/cursorflow-new.md +87 -0
- package/dist/cli/add.d.ts +7 -0
- package/dist/cli/add.js +377 -0
- package/dist/cli/add.js.map +1 -0
- package/dist/cli/clean.js +1 -0
- package/dist/cli/clean.js.map +1 -1
- package/dist/cli/config.d.ts +7 -0
- package/dist/cli/config.js +181 -0
- package/dist/cli/config.js.map +1 -0
- package/dist/cli/index.js +34 -30
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/logs.js +7 -33
- package/dist/cli/logs.js.map +1 -1
- package/dist/cli/monitor.js +51 -62
- package/dist/cli/monitor.js.map +1 -1
- package/dist/cli/new.d.ts +7 -0
- package/dist/cli/new.js +232 -0
- package/dist/cli/new.js.map +1 -0
- package/dist/cli/prepare.js +95 -193
- package/dist/cli/prepare.js.map +1 -1
- package/dist/cli/resume.js +11 -47
- package/dist/cli/resume.js.map +1 -1
- package/dist/cli/run.js +27 -22
- package/dist/cli/run.js.map +1 -1
- package/dist/cli/tasks.js +1 -2
- package/dist/cli/tasks.js.map +1 -1
- package/dist/core/failure-policy.d.ts +9 -0
- package/dist/core/failure-policy.js +9 -0
- package/dist/core/failure-policy.js.map +1 -1
- package/dist/core/orchestrator.d.ts +20 -6
- package/dist/core/orchestrator.js +217 -331
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/runner/agent.d.ts +27 -0
- package/dist/core/runner/agent.js +294 -0
- package/dist/core/runner/agent.js.map +1 -0
- package/dist/core/runner/index.d.ts +5 -0
- package/dist/core/runner/index.js +22 -0
- package/dist/core/runner/index.js.map +1 -0
- package/dist/core/runner/pipeline.d.ts +9 -0
- package/dist/core/runner/pipeline.js +539 -0
- package/dist/core/runner/pipeline.js.map +1 -0
- package/dist/core/runner/prompt.d.ts +25 -0
- package/dist/core/runner/prompt.js +175 -0
- package/dist/core/runner/prompt.js.map +1 -0
- package/dist/core/runner/task.d.ts +26 -0
- package/dist/core/runner/task.js +283 -0
- package/dist/core/runner/task.js.map +1 -0
- package/dist/core/runner/utils.d.ts +37 -0
- package/dist/core/runner/utils.js +161 -0
- package/dist/core/runner/utils.js.map +1 -0
- package/dist/core/runner.d.ts +2 -96
- package/dist/core/runner.js +11 -1136
- package/dist/core/runner.js.map +1 -1
- package/dist/core/stall-detection.d.ts +326 -0
- package/dist/core/stall-detection.js +781 -0
- package/dist/core/stall-detection.js.map +1 -0
- package/dist/types/config.d.ts +6 -6
- package/dist/types/flow.d.ts +84 -0
- package/dist/types/flow.js +10 -0
- package/dist/types/flow.js.map +1 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/index.js +3 -3
- package/dist/types/index.js.map +1 -1
- package/dist/types/lane.d.ts +0 -2
- package/dist/types/logging.d.ts +5 -1
- package/dist/types/task.d.ts +7 -11
- package/dist/utils/config.js +7 -15
- package/dist/utils/config.js.map +1 -1
- package/dist/utils/dependency.d.ts +36 -1
- package/dist/utils/dependency.js +256 -1
- package/dist/utils/dependency.js.map +1 -1
- package/dist/utils/enhanced-logger.d.ts +45 -82
- package/dist/utils/enhanced-logger.js +238 -844
- package/dist/utils/enhanced-logger.js.map +1 -1
- package/dist/utils/git.d.ts +29 -0
- package/dist/utils/git.js +115 -5
- package/dist/utils/git.js.map +1 -1
- package/dist/utils/state.js +0 -2
- package/dist/utils/state.js.map +1 -1
- package/dist/utils/task-service.d.ts +2 -2
- package/dist/utils/task-service.js +40 -31
- package/dist/utils/task-service.js.map +1 -1
- package/package.json +4 -3
- package/src/cli/add.ts +397 -0
- package/src/cli/clean.ts +1 -0
- package/src/cli/config.ts +177 -0
- package/src/cli/index.ts +36 -32
- package/src/cli/logs.ts +7 -31
- package/src/cli/monitor.ts +55 -71
- package/src/cli/new.ts +235 -0
- package/src/cli/prepare.ts +98 -205
- package/src/cli/resume.ts +13 -56
- package/src/cli/run.ts +311 -306
- package/src/cli/tasks.ts +1 -2
- package/src/core/failure-policy.ts +9 -0
- package/src/core/orchestrator.ts +281 -375
- package/src/core/runner/agent.ts +314 -0
- package/src/core/runner/index.ts +6 -0
- package/src/core/runner/pipeline.ts +567 -0
- package/src/core/runner/prompt.ts +174 -0
- package/src/core/runner/task.ts +320 -0
- package/src/core/runner/utils.ts +142 -0
- package/src/core/runner.ts +8 -1347
- package/src/core/stall-detection.ts +936 -0
- package/src/types/config.ts +6 -6
- package/src/types/flow.ts +91 -0
- package/src/types/index.ts +15 -3
- package/src/types/lane.ts +0 -2
- package/src/types/logging.ts +5 -1
- package/src/types/task.ts +7 -11
- package/src/utils/config.ts +8 -16
- package/src/utils/dependency.ts +311 -2
- package/src/utils/enhanced-logger.ts +263 -927
- package/src/utils/git.ts +145 -5
- package/src/utils/state.ts +0 -2
- package/src/utils/task-service.ts +48 -40
- package/commands/cursorflow-review.md +0 -56
- package/commands/cursorflow-runs.md +0 -59
- package/dist/cli/runs.d.ts +0 -5
- package/dist/cli/runs.js +0 -214
- package/dist/cli/runs.js.map +0 -1
- package/dist/core/reviewer.d.ts +0 -66
- package/dist/core/reviewer.js +0 -265
- package/dist/core/reviewer.js.map +0 -1
- package/src/cli/runs.ts +0 -212
- package/src/core/reviewer.ts +0 -285
package/src/core/orchestrator.ts
CHANGED
|
@@ -25,28 +25,35 @@ import {
|
|
|
25
25
|
EnhancedLogManager,
|
|
26
26
|
createLogManager,
|
|
27
27
|
DEFAULT_LOG_CONFIG,
|
|
28
|
-
ParsedMessage
|
|
28
|
+
ParsedMessage,
|
|
29
|
+
stripAnsi
|
|
29
30
|
} from '../utils/enhanced-logger';
|
|
30
31
|
import { formatMessageForConsole } from '../utils/log-formatter';
|
|
31
|
-
import {
|
|
32
|
+
import { FailureType, analyzeFailure as analyzeFailureFromPolicy } from './failure-policy';
|
|
32
33
|
import {
|
|
33
|
-
getAutoRecoveryManager,
|
|
34
|
-
DEFAULT_AUTO_RECOVERY_CONFIG,
|
|
35
|
-
AutoRecoveryConfig,
|
|
36
34
|
savePOF,
|
|
37
35
|
createPOFFromRecoveryState,
|
|
38
36
|
getGitPushFailureGuidance,
|
|
39
37
|
getMergeConflictGuidance,
|
|
40
38
|
getGitErrorGuidance,
|
|
39
|
+
LaneRecoveryState,
|
|
41
40
|
} from './auto-recovery';
|
|
41
|
+
import {
|
|
42
|
+
StallDetectionService,
|
|
43
|
+
getStallService,
|
|
44
|
+
StallDetectionConfig,
|
|
45
|
+
DEFAULT_STALL_CONFIG,
|
|
46
|
+
RecoveryAction,
|
|
47
|
+
StallPhase,
|
|
48
|
+
StallAnalysis,
|
|
49
|
+
} from './stall-detection';
|
|
42
50
|
import { detectCyclicDependencies, validateDependencies, printDependencyGraph, DependencyInfo } from '../utils/dependency';
|
|
43
51
|
import { preflightCheck, printPreflightReport, autoRepair } from '../utils/health';
|
|
44
52
|
import { getLatestCheckpoint } from '../utils/checkpoint';
|
|
45
53
|
import { cleanStaleLocks, getLockDir } from '../utils/lock';
|
|
46
54
|
|
|
47
55
|
/** Default stall detection configuration - 2 minute idle timeout for recovery */
|
|
48
|
-
const DEFAULT_ORCHESTRATOR_STALL_CONFIG: StallDetectionConfig = {
|
|
49
|
-
...DEFAULT_STALL_CONFIG,
|
|
56
|
+
const DEFAULT_ORCHESTRATOR_STALL_CONFIG: Partial<StallDetectionConfig> = {
|
|
50
57
|
idleTimeoutMs: 2 * 60 * 1000, // 2 minutes (idle detection for continue signal)
|
|
51
58
|
progressTimeoutMs: 10 * 60 * 1000, // 10 minutes (only triggers if no activity at all)
|
|
52
59
|
maxRestarts: 2,
|
|
@@ -55,7 +62,6 @@ const DEFAULT_ORCHESTRATOR_STALL_CONFIG: StallDetectionConfig = {
|
|
|
55
62
|
export interface LaneInfo {
|
|
56
63
|
name: string;
|
|
57
64
|
path: string;
|
|
58
|
-
dependsOn: string[];
|
|
59
65
|
startIndex?: number; // Current task index to resume from
|
|
60
66
|
restartCount?: number; // Number of times restarted due to stall
|
|
61
67
|
lastStateUpdate?: number; // Timestamp of last state file update
|
|
@@ -66,24 +72,22 @@ export interface SpawnLaneResult {
|
|
|
66
72
|
child: ChildProcess;
|
|
67
73
|
logPath: string;
|
|
68
74
|
logManager?: EnhancedLogManager;
|
|
75
|
+
info: RunningLaneInfo;
|
|
69
76
|
}
|
|
70
77
|
|
|
71
78
|
/**
|
|
72
79
|
* Lane execution tracking info
|
|
80
|
+
*
|
|
81
|
+
* NOTE: Stall 감지 관련 상태(lastActivity, stallPhase 등)는 StallDetectionService에서 관리
|
|
82
|
+
* 여기서는 프로세스 관리에 필요한 최소한의 정보만 유지
|
|
73
83
|
*/
|
|
74
84
|
interface RunningLaneInfo {
|
|
75
85
|
child: ChildProcess;
|
|
76
86
|
logPath: string;
|
|
77
87
|
logManager?: EnhancedLogManager;
|
|
78
|
-
lastActivity: number;
|
|
79
|
-
lastStateUpdate: number;
|
|
80
|
-
stallPhase: number; // 0: normal, 1: continued, 2: stronger_prompt, 3: restarted
|
|
81
|
-
taskStartTime: number;
|
|
82
|
-
lastOutput: string;
|
|
83
88
|
statePath: string;
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
continueSignalsSent: number; // Number of continue signals sent
|
|
89
|
+
laneIndex: number;
|
|
90
|
+
currentTaskIndex?: number;
|
|
87
91
|
}
|
|
88
92
|
|
|
89
93
|
/**
|
|
@@ -106,6 +110,109 @@ function logFileTail(filePath: string, lines: number = 10): void {
|
|
|
106
110
|
}
|
|
107
111
|
}
|
|
108
112
|
|
|
113
|
+
/**
|
|
114
|
+
* Handle RUN_DOCTOR action - runs async health diagnostics
|
|
115
|
+
*/
|
|
116
|
+
async function handleDoctorDiagnostics(
|
|
117
|
+
laneName: string,
|
|
118
|
+
laneRunDir: string,
|
|
119
|
+
runId: string,
|
|
120
|
+
runRoot: string,
|
|
121
|
+
stallService: StallDetectionService,
|
|
122
|
+
child: ChildProcess
|
|
123
|
+
): Promise<void> {
|
|
124
|
+
// Import health check dynamically to avoid circular dependency
|
|
125
|
+
const { checkAgentHealth, checkAuthHealth } = await import('../utils/health');
|
|
126
|
+
|
|
127
|
+
const [agentHealth, authHealth] = await Promise.all([
|
|
128
|
+
checkAgentHealth(),
|
|
129
|
+
checkAuthHealth(),
|
|
130
|
+
]);
|
|
131
|
+
|
|
132
|
+
const issues: string[] = [];
|
|
133
|
+
if (!agentHealth.ok) issues.push(`Agent: ${agentHealth.message}`);
|
|
134
|
+
if (!authHealth.ok) issues.push(`Auth: ${authHealth.message}`);
|
|
135
|
+
|
|
136
|
+
if (issues.length > 0) {
|
|
137
|
+
logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
|
|
138
|
+
} else {
|
|
139
|
+
logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// Save diagnostic to file
|
|
143
|
+
const diagnosticPath = safeJoin(laneRunDir, 'diagnostic.json');
|
|
144
|
+
fs.writeFileSync(diagnosticPath, JSON.stringify({
|
|
145
|
+
timestamp: Date.now(),
|
|
146
|
+
agentHealthy: agentHealth.ok,
|
|
147
|
+
authHealthy: authHealth.ok,
|
|
148
|
+
issues,
|
|
149
|
+
}, null, 2));
|
|
150
|
+
|
|
151
|
+
// Kill the process
|
|
152
|
+
try {
|
|
153
|
+
child.kill('SIGKILL');
|
|
154
|
+
} catch {
|
|
155
|
+
// Process might already be dead
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
|
|
159
|
+
|
|
160
|
+
// Save POF for failed recovery
|
|
161
|
+
const stallState = stallService.getState(laneName);
|
|
162
|
+
if (stallState) {
|
|
163
|
+
try {
|
|
164
|
+
const laneStatePath = safeJoin(laneRunDir, 'state.json');
|
|
165
|
+
const laneState = loadState<LaneState>(laneStatePath);
|
|
166
|
+
const pofDir = safeJoin(runRoot, '..', '..', 'pof');
|
|
167
|
+
|
|
168
|
+
// Convert stall state to recovery state format for POF
|
|
169
|
+
// Note: StallPhase and RecoveryStage have compatible numeric values (0-5)
|
|
170
|
+
const recoveryState: LaneRecoveryState = {
|
|
171
|
+
laneName,
|
|
172
|
+
stage: stallState.phase as unknown as number, // Both enums use 0-5
|
|
173
|
+
lastActivityTime: stallState.lastRealActivityTime,
|
|
174
|
+
lastBytesReceived: stallState.bytesSinceLastCheck,
|
|
175
|
+
totalBytesReceived: stallState.totalBytesReceived,
|
|
176
|
+
lastOutput: stallState.lastOutput,
|
|
177
|
+
restartCount: stallState.restartCount,
|
|
178
|
+
continueSignalsSent: stallState.continueSignalCount,
|
|
179
|
+
lastStageChangeTime: stallState.lastPhaseChangeTime,
|
|
180
|
+
isLongOperation: stallState.isLongOperation,
|
|
181
|
+
failureHistory: stallState.failureHistory.map(f => ({
|
|
182
|
+
timestamp: f.timestamp,
|
|
183
|
+
stage: f.phase as unknown as number, // Both enums use 0-5
|
|
184
|
+
action: f.action as string,
|
|
185
|
+
message: f.message,
|
|
186
|
+
idleTimeMs: f.idleTimeMs,
|
|
187
|
+
bytesReceived: f.bytesReceived,
|
|
188
|
+
lastOutput: f.lastOutput,
|
|
189
|
+
})),
|
|
190
|
+
};
|
|
191
|
+
|
|
192
|
+
const diagnosticInfo = {
|
|
193
|
+
timestamp: Date.now(),
|
|
194
|
+
agentHealthy: agentHealth.ok,
|
|
195
|
+
authHealthy: authHealth.ok,
|
|
196
|
+
systemHealthy: true,
|
|
197
|
+
suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
|
|
198
|
+
details: issues.join('\n') || 'No obvious issues found',
|
|
199
|
+
};
|
|
200
|
+
|
|
201
|
+
const pofEntry = createPOFFromRecoveryState(
|
|
202
|
+
runId,
|
|
203
|
+
runRoot,
|
|
204
|
+
laneName,
|
|
205
|
+
recoveryState,
|
|
206
|
+
laneState,
|
|
207
|
+
diagnosticInfo
|
|
208
|
+
);
|
|
209
|
+
savePOF(runId, pofDir, pofEntry);
|
|
210
|
+
} catch (pofError: any) {
|
|
211
|
+
logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
|
|
109
216
|
/**
|
|
110
217
|
* Spawn a lane process
|
|
111
218
|
*/
|
|
@@ -120,6 +227,7 @@ export function spawnLane({
|
|
|
120
227
|
enhancedLogConfig,
|
|
121
228
|
noGit = false,
|
|
122
229
|
onActivity,
|
|
230
|
+
laneIndex = 0,
|
|
123
231
|
}: {
|
|
124
232
|
laneName: string;
|
|
125
233
|
tasksFile: string;
|
|
@@ -131,6 +239,7 @@ export function spawnLane({
|
|
|
131
239
|
enhancedLogConfig?: Partial<EnhancedLogConfig>;
|
|
132
240
|
noGit?: boolean;
|
|
133
241
|
onActivity?: () => void;
|
|
242
|
+
laneIndex?: number;
|
|
134
243
|
}): SpawnLaneResult {
|
|
135
244
|
fs.mkdirSync(laneRunDir, { recursive: true});
|
|
136
245
|
|
|
@@ -169,17 +278,24 @@ export function spawnLane({
|
|
|
169
278
|
};
|
|
170
279
|
|
|
171
280
|
if (logConfig.enabled) {
|
|
281
|
+
// Helper to get dynamic lane label like [L01-T01-laneName]
|
|
282
|
+
const getDynamicLabel = () => {
|
|
283
|
+
const laneNum = `L${(laneIndex + 1).toString().padStart(2, '0')}`;
|
|
284
|
+
const taskPart = info.currentTaskIndex ? `-T${info.currentTaskIndex.toString().padStart(2, '0')}` : '';
|
|
285
|
+
return `[${laneNum}${taskPart}-${laneName}]`;
|
|
286
|
+
};
|
|
287
|
+
|
|
172
288
|
// Create callback for clean console output
|
|
173
289
|
const onParsedMessage = (msg: ParsedMessage) => {
|
|
174
290
|
if (onActivity) onActivity();
|
|
175
291
|
const formatted = formatMessageForConsole(msg, {
|
|
176
|
-
laneLabel:
|
|
292
|
+
laneLabel: getDynamicLabel(),
|
|
177
293
|
includeTimestamp: true
|
|
178
294
|
});
|
|
179
295
|
process.stdout.write(formatted + '\n');
|
|
180
296
|
};
|
|
181
297
|
|
|
182
|
-
logManager = createLogManager(laneRunDir, laneName, logConfig, onParsedMessage);
|
|
298
|
+
logManager = createLogManager(laneRunDir, laneName, logConfig, onParsedMessage, laneIndex);
|
|
183
299
|
logPath = logManager.getLogPaths().clean;
|
|
184
300
|
|
|
185
301
|
// Spawn with pipe for enhanced logging
|
|
@@ -189,6 +305,16 @@ export function spawnLane({
|
|
|
189
305
|
detached: false,
|
|
190
306
|
});
|
|
191
307
|
|
|
308
|
+
// Initialize info object for stdout handler to use
|
|
309
|
+
const info: RunningLaneInfo = {
|
|
310
|
+
child,
|
|
311
|
+
logManager,
|
|
312
|
+
logPath,
|
|
313
|
+
statePath: safeJoin(laneRunDir, 'state.json'),
|
|
314
|
+
laneIndex,
|
|
315
|
+
currentTaskIndex: startIndex > 0 ? startIndex + 1 : 0
|
|
316
|
+
};
|
|
317
|
+
|
|
192
318
|
// Buffer for non-JSON lines
|
|
193
319
|
let lineBuffer = '';
|
|
194
320
|
|
|
@@ -205,21 +331,52 @@ export function spawnLane({
|
|
|
205
331
|
|
|
206
332
|
for (const line of lines) {
|
|
207
333
|
const trimmed = line.trim();
|
|
334
|
+
if (!trimmed) continue;
|
|
335
|
+
|
|
336
|
+
// Detect task start/progress to update label
|
|
337
|
+
// Example: [1/1] hello-task
|
|
338
|
+
const cleanLine = stripAnsi(trimmed);
|
|
339
|
+
const taskMatch = cleanLine.match(/^\s*\[(\d+)\/(\d+)\]\s+(.+)$/);
|
|
340
|
+
if (taskMatch) {
|
|
341
|
+
info.currentTaskIndex = parseInt(taskMatch[1]!);
|
|
342
|
+
// Update log manager's task index to keep it in sync for readable log
|
|
343
|
+
if (logManager) {
|
|
344
|
+
logManager.setTask(taskMatch[3]!.trim(), undefined, info.currentTaskIndex - 1);
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
|
|
208
348
|
// Show if it's a timestamped log line (starts with [YYYY-MM-DD... or [HH:MM:SS])
|
|
209
349
|
// or if it's NOT a noisy JSON line
|
|
210
|
-
const hasTimestamp = /^\[\d{4}-\d{2}-\d{2}T|\^\[\d{2}:\d{2}:\d{2}\]/.test(trimmed);
|
|
211
350
|
const isJson = trimmed.startsWith('{') || trimmed.includes('{"type"');
|
|
351
|
+
// Filter out heartbeats - they should NOT reset the idle timer
|
|
352
|
+
const isHeartbeat = trimmed.includes('Heartbeat') && trimmed.includes('bytes received');
|
|
212
353
|
|
|
213
|
-
if (
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
354
|
+
if (!isJson) {
|
|
355
|
+
// Only trigger activity for non-heartbeat lines
|
|
356
|
+
if (onActivity && !isHeartbeat) onActivity();
|
|
357
|
+
|
|
358
|
+
const currentLabel = getDynamicLabel();
|
|
359
|
+
const coloredLabel = `${logger.COLORS.magenta}${currentLabel}${logger.COLORS.reset}`;
|
|
360
|
+
|
|
361
|
+
// Regex that matches timestamp even if it has ANSI color codes
|
|
362
|
+
// Matches: [24:39:14] or \x1b[90m[24:39:14]\x1b[0m
|
|
363
|
+
const timestampRegex = /^((?:\x1b\[[0-9;]*m)*)\[(\d{4}-\d{2}-\d{2}T|\d{2}:\d{2}:\d{2})\]/;
|
|
364
|
+
const tsMatch = trimmed.match(timestampRegex);
|
|
365
|
+
|
|
366
|
+
if (tsMatch) {
|
|
367
|
+
// If line already has timestamp format, just add lane prefix
|
|
368
|
+
// Check if lane label is already present to avoid triple duplication
|
|
369
|
+
if (!trimmed.includes(currentLabel)) {
|
|
370
|
+
// Insert label after the timestamp part
|
|
371
|
+
const tsPart = tsMatch[0];
|
|
372
|
+
const formatted = trimmed.replace(tsPart, `${tsPart} ${coloredLabel}`);
|
|
373
|
+
process.stdout.write(formatted + '\n');
|
|
374
|
+
} else {
|
|
375
|
+
process.stdout.write(trimmed + '\n');
|
|
376
|
+
}
|
|
220
377
|
} else {
|
|
221
378
|
// Add full prefix: timestamp + lane
|
|
222
|
-
process.stdout.write(`${logger.COLORS.gray}[${new Date().toLocaleTimeString('en-US', { hour12: false })}]${logger.COLORS.reset} ${
|
|
379
|
+
process.stdout.write(`${logger.COLORS.gray}[${new Date().toLocaleTimeString('en-US', { hour12: false })}]${logger.COLORS.reset} ${coloredLabel} ${line}\n`);
|
|
223
380
|
}
|
|
224
381
|
}
|
|
225
382
|
}
|
|
@@ -241,11 +398,14 @@ export function spawnLane({
|
|
|
241
398
|
trimmed.includes('actual output');
|
|
242
399
|
|
|
243
400
|
const ts = new Date().toLocaleTimeString('en-US', { hour12: false });
|
|
401
|
+
const currentLabel = getDynamicLabel();
|
|
402
|
+
const coloredLabel = `${logger.COLORS.magenta}${currentLabel}${logger.COLORS.reset}`;
|
|
403
|
+
|
|
244
404
|
if (isStatus) {
|
|
245
|
-
process.stdout.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${
|
|
405
|
+
process.stdout.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${coloredLabel} ${trimmed}\n`);
|
|
246
406
|
} else {
|
|
247
407
|
if (onActivity) onActivity();
|
|
248
|
-
process.stderr.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${
|
|
408
|
+
process.stderr.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${coloredLabel} ${logger.COLORS.red}❌ ERR ${trimmed}${logger.COLORS.reset}\n`);
|
|
249
409
|
}
|
|
250
410
|
}
|
|
251
411
|
}
|
|
@@ -256,9 +416,11 @@ export function spawnLane({
|
|
|
256
416
|
child.on('exit', () => {
|
|
257
417
|
logManager?.close();
|
|
258
418
|
});
|
|
419
|
+
|
|
420
|
+
return { child, logPath, logManager, info };
|
|
259
421
|
} else {
|
|
260
422
|
// Fallback to simple file logging
|
|
261
|
-
logPath = safeJoin(laneRunDir, 'terminal.log');
|
|
423
|
+
logPath = safeJoin(laneRunDir, 'terminal-readable.log');
|
|
262
424
|
const logFd = fs.openSync(logPath, 'a');
|
|
263
425
|
|
|
264
426
|
child = spawn('node', args, {
|
|
@@ -272,9 +434,19 @@ export function spawnLane({
|
|
|
272
434
|
} catch {
|
|
273
435
|
// Ignore
|
|
274
436
|
}
|
|
437
|
+
|
|
438
|
+
return {
|
|
439
|
+
child,
|
|
440
|
+
logPath,
|
|
441
|
+
logManager,
|
|
442
|
+
info: {
|
|
443
|
+
child,
|
|
444
|
+
logPath,
|
|
445
|
+
statePath: safeJoin(laneRunDir, 'state.json'),
|
|
446
|
+
laneIndex
|
|
447
|
+
}
|
|
448
|
+
};
|
|
275
449
|
}
|
|
276
|
-
|
|
277
|
-
return { child, logPath, logManager };
|
|
278
450
|
}
|
|
279
451
|
|
|
280
452
|
/**
|
|
@@ -293,7 +465,7 @@ export function waitChild(proc: ChildProcess): Promise<number> {
|
|
|
293
465
|
}
|
|
294
466
|
|
|
295
467
|
/**
|
|
296
|
-
* List lane task files in directory
|
|
468
|
+
* List lane task files in directory
|
|
297
469
|
*/
|
|
298
470
|
export function listLaneFiles(tasksDir: string): LaneInfo[] {
|
|
299
471
|
if (!fs.existsSync(tasksDir)) {
|
|
@@ -307,19 +479,10 @@ export function listLaneFiles(tasksDir: string): LaneInfo[] {
|
|
|
307
479
|
.map(f => {
|
|
308
480
|
const filePath = safeJoin(tasksDir, f);
|
|
309
481
|
const name = path.basename(f, '.json');
|
|
310
|
-
let dependsOn: string[] = [];
|
|
311
|
-
|
|
312
|
-
try {
|
|
313
|
-
const config = JSON.parse(fs.readFileSync(filePath, 'utf8')) as RunnerConfig;
|
|
314
|
-
dependsOn = config.dependsOn || [];
|
|
315
|
-
} catch (e) {
|
|
316
|
-
logger.warn(`Failed to parse config for lane ${name}: ${e}`);
|
|
317
|
-
}
|
|
318
482
|
|
|
319
483
|
return {
|
|
320
484
|
name,
|
|
321
485
|
path: filePath,
|
|
322
|
-
dependsOn,
|
|
323
486
|
};
|
|
324
487
|
});
|
|
325
488
|
}
|
|
@@ -336,8 +499,7 @@ export function printLaneStatus(lanes: LaneInfo[], laneRunDirs: Record<string, s
|
|
|
336
499
|
const state = loadState<LaneState>(statePath);
|
|
337
500
|
|
|
338
501
|
if (!state) {
|
|
339
|
-
|
|
340
|
-
return { lane: lane.name, status: isWaiting ? 'waiting' : 'pending', task: '-' };
|
|
502
|
+
return { lane: lane.name, status: 'pending', task: '-' };
|
|
341
503
|
}
|
|
342
504
|
|
|
343
505
|
const idx = (state.currentTaskIndex || 0) + 1;
|
|
@@ -385,12 +547,12 @@ async function resolveAllDependencies(
|
|
|
385
547
|
const worktreeDir = state?.worktreeDir || safeJoin(runRoot, 'resolution-worktree');
|
|
386
548
|
|
|
387
549
|
if (!fs.existsSync(worktreeDir)) {
|
|
388
|
-
logger.info(
|
|
550
|
+
logger.info(`🏗️ Creating resolution worktree at ${worktreeDir}`);
|
|
389
551
|
git.createWorktree(worktreeDir, pipelineBranch, { baseBranch: git.getCurrentBranch() });
|
|
390
552
|
}
|
|
391
553
|
|
|
392
554
|
// 3. Resolve on pipeline branch
|
|
393
|
-
logger.info(
|
|
555
|
+
logger.info(`🔄 Resolving dependencies on branch ${pipelineBranch}`);
|
|
394
556
|
git.runGit(['checkout', pipelineBranch], { cwd: worktreeDir });
|
|
395
557
|
|
|
396
558
|
for (const cmd of uniqueCommands) {
|
|
@@ -471,7 +633,6 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
471
633
|
noGit?: boolean;
|
|
472
634
|
skipPreflight?: boolean;
|
|
473
635
|
stallConfig?: Partial<StallDetectionConfig>;
|
|
474
|
-
autoRecoveryConfig?: Partial<AutoRecoveryConfig>;
|
|
475
636
|
} = {}): Promise<{ lanes: LaneInfo[]; exitCodes: Record<string, number>; runRoot: string }> {
|
|
476
637
|
const lanes = listLaneFiles(tasksDir);
|
|
477
638
|
|
|
@@ -507,34 +668,11 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
507
668
|
logger.success('✓ Preflight checks passed');
|
|
508
669
|
}
|
|
509
670
|
|
|
510
|
-
// Validate dependencies and detect cycles
|
|
511
|
-
logger.section('📊 Dependency Analysis');
|
|
512
|
-
|
|
513
|
-
const depInfos: DependencyInfo[] = lanes.map(l => ({
|
|
514
|
-
name: l.name,
|
|
515
|
-
dependsOn: l.dependsOn,
|
|
516
|
-
}));
|
|
517
|
-
|
|
518
|
-
const depValidation = validateDependencies(depInfos);
|
|
519
|
-
|
|
520
|
-
if (!depValidation.valid) {
|
|
521
|
-
logger.error('❌ Dependency validation failed:');
|
|
522
|
-
for (const err of depValidation.errors) {
|
|
523
|
-
logger.error(` • ${err}`);
|
|
524
|
-
}
|
|
525
|
-
throw new Error('Invalid dependency configuration');
|
|
526
|
-
}
|
|
527
|
-
|
|
528
|
-
if (depValidation.warnings.length > 0) {
|
|
529
|
-
for (const warn of depValidation.warnings) {
|
|
530
|
-
logger.warn(`⚠️ ${warn}`);
|
|
531
|
-
}
|
|
532
|
-
}
|
|
533
|
-
|
|
534
|
-
// Print dependency graph
|
|
535
|
-
printDependencyGraph(depInfos);
|
|
536
|
-
|
|
537
671
|
const config = loadConfig();
|
|
672
|
+
|
|
673
|
+
// Set verbose git logging from config
|
|
674
|
+
git.setVerboseGit(config.verboseGit || false);
|
|
675
|
+
|
|
538
676
|
const logsDir = getLogsDir(config);
|
|
539
677
|
const runId = `run-${Date.now()}`;
|
|
540
678
|
// Use absolute path for runRoot to avoid issues with subfolders
|
|
@@ -558,17 +696,11 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
558
696
|
const randomSuffix = Math.random().toString(36).substring(2, 7);
|
|
559
697
|
const pipelineBranch = `cursorflow/run-${Date.now().toString(36)}-${randomSuffix}`;
|
|
560
698
|
|
|
561
|
-
//
|
|
562
|
-
const
|
|
699
|
+
// Initialize unified stall detection service (Single Source of Truth)
|
|
700
|
+
const stallService = getStallService({
|
|
563
701
|
...DEFAULT_ORCHESTRATOR_STALL_CONFIG,
|
|
564
702
|
...options.stallConfig,
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
// Initialize auto-recovery manager
|
|
568
|
-
const autoRecoveryManager = getAutoRecoveryManager({
|
|
569
|
-
...DEFAULT_AUTO_RECOVERY_CONFIG,
|
|
570
|
-
idleTimeoutMs: stallConfig.idleTimeoutMs, // Sync with stall config
|
|
571
|
-
...options.autoRecoveryConfig,
|
|
703
|
+
verbose: process.env['DEBUG_STALL'] === 'true',
|
|
572
704
|
});
|
|
573
705
|
|
|
574
706
|
// Initialize event system
|
|
@@ -629,6 +761,7 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
629
761
|
|
|
630
762
|
laneWorktreeDirs[lane.name] = laneWorktreeDir;
|
|
631
763
|
|
|
764
|
+
logger.info(`🏗️ Initializing lane ${lane.name}: branch=${lanePipelineBranch}`);
|
|
632
765
|
const initialState = createLaneState(lane.name, taskConfig, lane.path, {
|
|
633
766
|
pipelineBranch: lanePipelineBranch,
|
|
634
767
|
worktreeDir: laneWorktreeDir
|
|
@@ -644,21 +777,6 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
644
777
|
logger.info(`Run directory: ${runRoot}`);
|
|
645
778
|
logger.info(`Lanes: ${lanes.length}`);
|
|
646
779
|
|
|
647
|
-
// Display dependency graph
|
|
648
|
-
logger.info('\n📊 Dependency Graph:');
|
|
649
|
-
for (const lane of lanes) {
|
|
650
|
-
const deps = lane.dependsOn.length > 0 ? ` [depends on: ${lane.dependsOn.join(', ')}]` : '';
|
|
651
|
-
console.log(` ${logger.COLORS.cyan}${lane.name}${logger.COLORS.reset}${deps}`);
|
|
652
|
-
|
|
653
|
-
// Simple tree-like visualization for deep dependencies
|
|
654
|
-
if (lane.dependsOn.length > 0) {
|
|
655
|
-
for (const dep of lane.dependsOn) {
|
|
656
|
-
console.log(` └─ ${dep}`);
|
|
657
|
-
}
|
|
658
|
-
}
|
|
659
|
-
}
|
|
660
|
-
console.log('');
|
|
661
|
-
|
|
662
780
|
// Disable auto-resolve when noGit mode is enabled
|
|
663
781
|
const autoResolve = !options.noGit && options.autoResolveDependencies !== false;
|
|
664
782
|
|
|
@@ -693,29 +811,12 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
693
811
|
|
|
694
812
|
try {
|
|
695
813
|
while (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length || (blockedLanes.size > 0 && running.size === 0)) {
|
|
696
|
-
// 1. Identify lanes ready to start
|
|
814
|
+
// 1. Identify lanes ready to start (all lanes can start immediately - no lane-level dependencies)
|
|
697
815
|
const readyToStart = lanes.filter(lane => {
|
|
698
816
|
// Not already running or completed or failed or blocked
|
|
699
817
|
if (running.has(lane.name) || completedLanes.has(lane.name) || failedLanes.has(lane.name) || blockedLanes.has(lane.name)) {
|
|
700
818
|
return false;
|
|
701
819
|
}
|
|
702
|
-
|
|
703
|
-
// Check dependencies
|
|
704
|
-
for (const dep of lane.dependsOn) {
|
|
705
|
-
if (failedLanes.has(dep)) {
|
|
706
|
-
logger.error(`Lane ${lane.name} will not start because dependency ${dep} failed`);
|
|
707
|
-
failedLanes.add(lane.name);
|
|
708
|
-
exitCodes[lane.name] = 1;
|
|
709
|
-
return false;
|
|
710
|
-
}
|
|
711
|
-
if (blockedLanes.has(dep)) {
|
|
712
|
-
// If a dependency is blocked, wait
|
|
713
|
-
return false;
|
|
714
|
-
}
|
|
715
|
-
if (!completedLanes.has(dep)) {
|
|
716
|
-
return false;
|
|
717
|
-
}
|
|
718
|
-
}
|
|
719
820
|
return true;
|
|
720
821
|
});
|
|
721
822
|
|
|
@@ -734,23 +835,23 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
734
835
|
logger.info(`Lane started: ${lane.name}${lane.startIndex ? ` (resuming from ${lane.startIndex})` : ''}`);
|
|
735
836
|
|
|
736
837
|
const now = Date.now();
|
|
737
|
-
|
|
838
|
+
|
|
839
|
+
// Register lane with unified stall detection service FIRST
|
|
840
|
+
stallService.registerLane(lane.name, {
|
|
841
|
+
laneRunDir: laneRunDirs[lane.name]!,
|
|
842
|
+
});
|
|
843
|
+
|
|
844
|
+
const laneIdx = lanes.findIndex(l => l.name === lane.name);
|
|
845
|
+
|
|
846
|
+
// Pre-register lane in running map
|
|
738
847
|
running.set(lane.name, {
|
|
739
848
|
child: {} as any, // Placeholder, will be replaced below
|
|
740
849
|
logManager: undefined,
|
|
741
850
|
logPath: '',
|
|
742
|
-
lastActivity: now,
|
|
743
|
-
lastStateUpdate: now,
|
|
744
|
-
stallPhase: 0,
|
|
745
|
-
taskStartTime: now,
|
|
746
|
-
lastOutput: '',
|
|
747
851
|
statePath: laneStatePath,
|
|
748
|
-
|
|
749
|
-
lastBytesCheck: 0,
|
|
750
|
-
continueSignalsSent: 0,
|
|
852
|
+
laneIndex: laneIdx >= 0 ? laneIdx : 0,
|
|
751
853
|
});
|
|
752
854
|
|
|
753
|
-
let lastOutput = '';
|
|
754
855
|
const spawnResult = spawnLane({
|
|
755
856
|
laneName: lane.name,
|
|
756
857
|
tasksFile: lane.path,
|
|
@@ -761,51 +862,40 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
761
862
|
worktreeDir: laneWorktreeDirs[lane.name],
|
|
762
863
|
enhancedLogConfig: options.enhancedLogging,
|
|
763
864
|
noGit: options.noGit,
|
|
865
|
+
laneIndex: laneIdx >= 0 ? laneIdx : 0,
|
|
764
866
|
onActivity: () => {
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
const actNow = Date.now();
|
|
768
|
-
info.lastActivity = actNow;
|
|
769
|
-
info.lastStateUpdate = actNow;
|
|
770
|
-
info.stallPhase = 0;
|
|
771
|
-
}
|
|
867
|
+
// Record state file update activity
|
|
868
|
+
stallService.recordStateUpdate(lane.name);
|
|
772
869
|
}
|
|
773
870
|
});
|
|
774
871
|
|
|
775
872
|
// Update with actual spawn result
|
|
776
873
|
const existingInfo = running.get(lane.name)!;
|
|
777
|
-
Object.assign(existingInfo, spawnResult);
|
|
874
|
+
Object.assign(existingInfo, spawnResult.info);
|
|
875
|
+
|
|
876
|
+
// Update stall service with child process reference
|
|
877
|
+
stallService.setChildProcess(lane.name, spawnResult.child);
|
|
778
878
|
|
|
779
|
-
// Track
|
|
879
|
+
// Track stdout for activity detection - delegate to StallDetectionService
|
|
780
880
|
if (spawnResult.child.stdout) {
|
|
781
881
|
spawnResult.child.stdout.on('data', (data: Buffer) => {
|
|
782
|
-
const
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
// Update auto-recovery manager with real activity
|
|
797
|
-
autoRecoveryManager.recordActivity(lane.name, data.length, info.lastOutput);
|
|
798
|
-
} else if (lines.length > 0) {
|
|
799
|
-
// Only heartbeats received - update auto-recovery manager with 0 bytes to avoid resetting idle timer
|
|
800
|
-
autoRecoveryManager.recordActivity(lane.name, 0, info.lastOutput);
|
|
801
|
-
}
|
|
882
|
+
const output = data.toString();
|
|
883
|
+
const lines = output.split('\n').filter(l => l.trim());
|
|
884
|
+
|
|
885
|
+
// Filter out heartbeats from activity tracking
|
|
886
|
+
const realLines = lines.filter(line => !(line.includes('Heartbeat') && line.includes('bytes received')));
|
|
887
|
+
|
|
888
|
+
if (realLines.length > 0) {
|
|
889
|
+
// Real activity - record with bytes
|
|
890
|
+
const lastRealLine = realLines[realLines.length - 1]!;
|
|
891
|
+
stallService.recordActivity(lane.name, data.length, lastRealLine);
|
|
892
|
+
} else if (lines.length > 0) {
|
|
893
|
+
// Heartbeat only - record with 0 bytes (won't reset timer)
|
|
894
|
+
stallService.recordActivity(lane.name, 0);
|
|
802
895
|
}
|
|
803
896
|
});
|
|
804
897
|
}
|
|
805
898
|
|
|
806
|
-
// Register lane with auto-recovery manager
|
|
807
|
-
autoRecoveryManager.registerLane(lane.name);
|
|
808
|
-
|
|
809
899
|
// Update lane tracking
|
|
810
900
|
lane.taskStartTime = now;
|
|
811
901
|
|
|
@@ -836,234 +926,47 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
836
926
|
if (result.name === '__poll__' || (now - lastStallCheck >= 10000)) {
|
|
837
927
|
lastStallCheck = now;
|
|
838
928
|
|
|
839
|
-
// Periodic stall check
|
|
929
|
+
// Periodic stall check using unified StallDetectionService
|
|
840
930
|
for (const [laneName, info] of running.entries()) {
|
|
841
|
-
const idleTime = now - info.lastActivity;
|
|
842
931
|
const lane = lanes.find(l => l.name === laneName)!;
|
|
843
932
|
|
|
844
|
-
if (process.env['DEBUG_STALL']) {
|
|
845
|
-
logger.debug(`[${laneName}] Stall check: idle=${Math.round(idleTime/1000)}s, bytesDelta=${info.bytesReceived - info.lastBytesCheck}, phase=${info.stallPhase}`);
|
|
846
|
-
}
|
|
847
|
-
|
|
848
933
|
// Check state file for progress updates
|
|
849
|
-
let progressTime = 0;
|
|
850
934
|
try {
|
|
851
935
|
const stateStat = fs.statSync(info.statePath);
|
|
852
|
-
const
|
|
853
|
-
if (
|
|
854
|
-
|
|
936
|
+
const stallState = stallService.getState(laneName);
|
|
937
|
+
if (stallState && stateStat.mtimeMs > stallState.lastStateUpdateTime) {
|
|
938
|
+
stallService.recordStateUpdate(laneName);
|
|
855
939
|
}
|
|
856
|
-
progressTime = now - info.lastStateUpdate;
|
|
857
940
|
} catch {
|
|
858
941
|
// State file might not exist yet
|
|
859
942
|
}
|
|
860
943
|
|
|
861
|
-
//
|
|
862
|
-
|
|
863
|
-
|
|
944
|
+
// Debug logging
|
|
945
|
+
if (process.env['DEBUG_STALL']) {
|
|
946
|
+
logger.debug(`[${laneName}] ${stallService.dumpState(laneName)}`);
|
|
947
|
+
}
|
|
864
948
|
|
|
865
|
-
//
|
|
866
|
-
const analysis =
|
|
867
|
-
stallPhase: info.stallPhase,
|
|
868
|
-
idleTimeMs: idleTime,
|
|
869
|
-
progressTimeMs: progressTime,
|
|
870
|
-
lastOutput: info.lastOutput,
|
|
871
|
-
restartCount: lane.restartCount || 0,
|
|
872
|
-
taskStartTimeMs: info.taskStartTime,
|
|
873
|
-
bytesReceived: bytesDelta, // Bytes since last check
|
|
874
|
-
continueSignalsSent: info.continueSignalsSent,
|
|
875
|
-
}, stallConfig);
|
|
949
|
+
// Run stall analysis and recovery (all logic is in StallDetectionService)
|
|
950
|
+
const analysis = stallService.checkAndRecover(laneName);
|
|
876
951
|
|
|
877
|
-
//
|
|
952
|
+
// Log to lane log manager if there was an action
|
|
878
953
|
if (analysis.action !== RecoveryAction.NONE) {
|
|
879
|
-
logFailure(laneName, analysis);
|
|
880
954
|
info.logManager?.log('error', analysis.message);
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
laneName,
|
|
893
|
-
idleSeconds: Math.round(idleTime / 1000),
|
|
894
|
-
signalCount: info.continueSignalsSent,
|
|
895
|
-
});
|
|
896
|
-
} catch (e) {
|
|
897
|
-
logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
|
|
898
|
-
}
|
|
899
|
-
} else if (analysis.action === RecoveryAction.STRONGER_PROMPT) {
|
|
900
|
-
const interventionPath = safeJoin(laneRunDirs[laneName]!, 'intervention.txt');
|
|
901
|
-
const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck. Please continue with your current task immediately. If you're waiting for something, explain what you need and proceed with what you can do now. If you've completed the task, summarize your work and finish.`;
|
|
902
|
-
try {
|
|
903
|
-
fs.writeFileSync(interventionPath, strongerPrompt);
|
|
904
|
-
info.stallPhase = 2;
|
|
905
|
-
info.lastActivity = now;
|
|
906
|
-
logger.warn(`[${laneName}] Sent stronger prompt after continue signal failed`);
|
|
907
|
-
|
|
908
|
-
events.emit('recovery.stronger_prompt', { laneName });
|
|
909
|
-
} catch (e) {
|
|
910
|
-
logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
|
|
911
|
-
}
|
|
912
|
-
} else if (analysis.action === RecoveryAction.KILL_AND_RESTART ||
|
|
913
|
-
analysis.action === RecoveryAction.RESTART_LANE ||
|
|
914
|
-
analysis.action === RecoveryAction.RESTART_LANE_FROM_CHECKPOINT) {
|
|
915
|
-
lane.restartCount = (lane.restartCount || 0) + 1;
|
|
916
|
-
info.stallPhase = 3;
|
|
917
|
-
|
|
918
|
-
// Try to get checkpoint info
|
|
919
|
-
const checkpoint = getLatestCheckpoint(laneRunDirs[laneName]!);
|
|
920
|
-
if (checkpoint) {
|
|
921
|
-
logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
|
|
922
|
-
}
|
|
923
|
-
|
|
924
|
-
// Kill the process
|
|
925
|
-
try {
|
|
926
|
-
info.child.kill('SIGKILL');
|
|
927
|
-
} catch {
|
|
928
|
-
// Process might already be dead
|
|
929
|
-
}
|
|
930
|
-
|
|
931
|
-
logger.warn(`[${laneName}] Killing and restarting lane (restart #${lane.restartCount})`);
|
|
932
|
-
|
|
933
|
-
events.emit('recovery.restart', {
|
|
934
|
-
laneName,
|
|
935
|
-
restartCount: lane.restartCount,
|
|
936
|
-
maxRestarts: stallConfig.maxRestarts,
|
|
937
|
-
});
|
|
938
|
-
} else if (analysis.action === RecoveryAction.RUN_DOCTOR) {
|
|
939
|
-
info.stallPhase = 4;
|
|
940
|
-
|
|
941
|
-
// Run diagnostics
|
|
942
|
-
logger.error(`[${laneName}] Running diagnostics due to persistent failures...`);
|
|
943
|
-
|
|
944
|
-
// Import health check dynamically to avoid circular dependency
|
|
945
|
-
const { checkAgentHealth, checkAuthHealth } = await import('../utils/health');
|
|
946
|
-
|
|
947
|
-
const [agentHealth, authHealth] = await Promise.all([
|
|
948
|
-
checkAgentHealth(),
|
|
949
|
-
checkAuthHealth(),
|
|
950
|
-
]);
|
|
951
|
-
|
|
952
|
-
const issues: string[] = [];
|
|
953
|
-
if (!agentHealth.ok) issues.push(`Agent: ${agentHealth.message}`);
|
|
954
|
-
if (!authHealth.ok) issues.push(`Auth: ${authHealth.message}`);
|
|
955
|
-
|
|
956
|
-
if (issues.length > 0) {
|
|
957
|
-
logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
|
|
958
|
-
} else {
|
|
959
|
-
logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
|
|
960
|
-
}
|
|
961
|
-
|
|
962
|
-
// Save diagnostic to file
|
|
963
|
-
const diagnosticPath = safeJoin(laneRunDirs[laneName]!, 'diagnostic.json');
|
|
964
|
-
fs.writeFileSync(diagnosticPath, JSON.stringify({
|
|
965
|
-
timestamp: Date.now(),
|
|
966
|
-
agentHealthy: agentHealth.ok,
|
|
967
|
-
authHealthy: authHealth.ok,
|
|
968
|
-
issues,
|
|
969
|
-
analysis,
|
|
970
|
-
}, null, 2));
|
|
971
|
-
|
|
972
|
-
// Kill the process
|
|
973
|
-
try {
|
|
974
|
-
info.child.kill('SIGKILL');
|
|
975
|
-
} catch {
|
|
976
|
-
// Process might already be dead
|
|
977
|
-
}
|
|
978
|
-
|
|
979
|
-
logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
|
|
980
|
-
|
|
981
|
-
// Save POF for failed recovery
|
|
982
|
-
const recoveryState = autoRecoveryManager.getState(laneName);
|
|
983
|
-
if (recoveryState) {
|
|
984
|
-
try {
|
|
985
|
-
const laneStatePath = safeJoin(laneRunDirs[laneName]!, 'state.json');
|
|
986
|
-
const laneState = loadState<LaneState>(laneStatePath);
|
|
987
|
-
const pofDir = safeJoin(runRoot, '..', '..', 'pof');
|
|
988
|
-
const diagnosticInfo = {
|
|
989
|
-
timestamp: Date.now(),
|
|
990
|
-
agentHealthy: agentHealth.ok,
|
|
991
|
-
authHealthy: authHealth.ok,
|
|
992
|
-
systemHealthy: true,
|
|
993
|
-
suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
|
|
994
|
-
details: issues.join('\n') || 'No obvious issues found',
|
|
995
|
-
};
|
|
996
|
-
const pofEntry = createPOFFromRecoveryState(
|
|
997
|
-
runId,
|
|
998
|
-
runRoot,
|
|
999
|
-
laneName,
|
|
1000
|
-
recoveryState,
|
|
1001
|
-
laneState,
|
|
1002
|
-
diagnosticInfo
|
|
1003
|
-
);
|
|
1004
|
-
savePOF(runId, pofDir, pofEntry);
|
|
1005
|
-
} catch (pofError: any) {
|
|
1006
|
-
logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
|
|
1007
|
-
}
|
|
1008
|
-
}
|
|
1009
|
-
|
|
1010
|
-
events.emit('recovery.diagnosed', {
|
|
1011
|
-
laneName,
|
|
1012
|
-
diagnostic: { agentHealthy: agentHealth.ok, authHealthy: authHealth.ok, issues },
|
|
1013
|
-
});
|
|
1014
|
-
} else if (analysis.action === RecoveryAction.ABORT_LANE) {
|
|
1015
|
-
info.stallPhase = 5;
|
|
1016
|
-
|
|
1017
|
-
try {
|
|
1018
|
-
info.child.kill('SIGKILL');
|
|
1019
|
-
} catch {
|
|
1020
|
-
// Process might already be dead
|
|
1021
|
-
}
|
|
1022
|
-
|
|
1023
|
-
logger.error(`[${laneName}] Aborting lane due to repeated stalls`);
|
|
1024
|
-
|
|
1025
|
-
// Save POF for failed recovery
|
|
1026
|
-
const recoveryState = autoRecoveryManager.getState(laneName);
|
|
1027
|
-
if (recoveryState) {
|
|
1028
|
-
try {
|
|
1029
|
-
const laneStatePath = safeJoin(laneRunDirs[laneName]!, 'state.json');
|
|
1030
|
-
const laneState = loadState<LaneState>(laneStatePath);
|
|
1031
|
-
const pofDir = safeJoin(runRoot, '..', '..', 'pof');
|
|
1032
|
-
const pofEntry = createPOFFromRecoveryState(
|
|
1033
|
-
runId,
|
|
1034
|
-
runRoot,
|
|
1035
|
-
laneName,
|
|
1036
|
-
recoveryState,
|
|
1037
|
-
laneState,
|
|
1038
|
-
recoveryState.diagnosticInfo
|
|
1039
|
-
);
|
|
1040
|
-
savePOF(runId, pofDir, pofEntry);
|
|
1041
|
-
} catch (pofError: any) {
|
|
1042
|
-
logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
|
|
1043
|
-
}
|
|
1044
|
-
}
|
|
1045
|
-
} else if (analysis.action === RecoveryAction.SEND_GIT_GUIDANCE) {
|
|
1046
|
-
// Send guidance message to agent for git issues
|
|
1047
|
-
const interventionPath = safeJoin(laneRunDirs[laneName]!, 'intervention.txt');
|
|
1048
|
-
|
|
1049
|
-
// Determine which guidance to send based on the failure type
|
|
1050
|
-
let guidance: string;
|
|
1051
|
-
if (analysis.type === FailureType.GIT_PUSH_REJECTED) {
|
|
1052
|
-
guidance = getGitPushFailureGuidance();
|
|
1053
|
-
} else if (analysis.type === FailureType.MERGE_CONFLICT) {
|
|
1054
|
-
guidance = getMergeConflictGuidance();
|
|
1055
|
-
} else {
|
|
1056
|
-
guidance = getGitErrorGuidance(analysis.message);
|
|
1057
|
-
}
|
|
1058
|
-
|
|
1059
|
-
try {
|
|
1060
|
-
fs.writeFileSync(interventionPath, guidance);
|
|
1061
|
-
info.lastActivity = now;
|
|
1062
|
-
logger.info(`[${laneName}] Sent git issue guidance to agent`);
|
|
1063
|
-
} catch (e: any) {
|
|
1064
|
-
logger.error(`[${laneName}] Failed to send guidance: ${e.message}`);
|
|
1065
|
-
}
|
|
955
|
+
|
|
956
|
+
// Handle special case: RUN_DOCTOR needs async operations
|
|
957
|
+
if (analysis.action === RecoveryAction.RUN_DOCTOR) {
|
|
958
|
+
await handleDoctorDiagnostics(
|
|
959
|
+
laneName,
|
|
960
|
+
laneRunDirs[laneName]!,
|
|
961
|
+
runId,
|
|
962
|
+
runRoot,
|
|
963
|
+
stallService,
|
|
964
|
+
info.child
|
|
965
|
+
);
|
|
1066
966
|
}
|
|
967
|
+
|
|
968
|
+
// Sync restartCount back to lane info (for restart logic in process exit handler)
|
|
969
|
+
lane.restartCount = stallService.getRestartCount(laneName);
|
|
1067
970
|
}
|
|
1068
971
|
}
|
|
1069
972
|
continue;
|
|
@@ -1073,8 +976,11 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
1073
976
|
running.delete(finished.name);
|
|
1074
977
|
exitCodes[finished.name] = finished.code;
|
|
1075
978
|
|
|
1076
|
-
//
|
|
1077
|
-
|
|
979
|
+
// Get stall state before unregistering
|
|
980
|
+
const stallPhase = stallService.getPhase(finished.name);
|
|
981
|
+
|
|
982
|
+
// Unregister from stall detection service
|
|
983
|
+
stallService.unregisterLane(finished.name);
|
|
1078
984
|
|
|
1079
985
|
if (finished.code === 0) {
|
|
1080
986
|
completedLanes.add(finished.name);
|
|
@@ -1104,8 +1010,8 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
1104
1010
|
logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
|
|
1105
1011
|
}
|
|
1106
1012
|
} else {
|
|
1107
|
-
// Check if it was a restart request
|
|
1108
|
-
if (
|
|
1013
|
+
// Check if it was a restart request (RESTART_REQUESTED phase)
|
|
1014
|
+
if (stallPhase === StallPhase.RESTART_REQUESTED) {
|
|
1109
1015
|
logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
|
|
1110
1016
|
|
|
1111
1017
|
// Update startIndex from current state to resume from the same task
|
|
@@ -1126,7 +1032,7 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
1126
1032
|
failedLanes.add(finished.name);
|
|
1127
1033
|
|
|
1128
1034
|
let errorMsg = 'Process exited with non-zero code';
|
|
1129
|
-
if (
|
|
1035
|
+
if (stallPhase >= StallPhase.DIAGNOSED) {
|
|
1130
1036
|
errorMsg = 'Stopped due to repeated stall';
|
|
1131
1037
|
} else if (info.logManager) {
|
|
1132
1038
|
const lastError = info.logManager.getLastError();
|