@litmers/cursorflow-orchestrator 0.1.31 → 0.1.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/README.md +182 -59
- package/commands/cursorflow-add.md +159 -0
- package/commands/cursorflow-doctor.md +45 -23
- package/commands/cursorflow-monitor.md +23 -2
- package/commands/cursorflow-new.md +87 -0
- package/commands/cursorflow-run.md +60 -111
- package/dist/cli/add.d.ts +7 -0
- package/dist/cli/add.js +377 -0
- package/dist/cli/add.js.map +1 -0
- package/dist/cli/clean.js +1 -0
- package/dist/cli/clean.js.map +1 -1
- package/dist/cli/config.d.ts +7 -0
- package/dist/cli/config.js +181 -0
- package/dist/cli/config.js.map +1 -0
- package/dist/cli/doctor.js +47 -4
- package/dist/cli/doctor.js.map +1 -1
- package/dist/cli/index.js +34 -30
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/logs.js +17 -34
- package/dist/cli/logs.js.map +1 -1
- package/dist/cli/monitor.js +62 -65
- package/dist/cli/monitor.js.map +1 -1
- package/dist/cli/new.d.ts +7 -0
- package/dist/cli/new.js +232 -0
- package/dist/cli/new.js.map +1 -0
- package/dist/cli/prepare.js +95 -193
- package/dist/cli/prepare.js.map +1 -1
- package/dist/cli/resume.js +57 -68
- package/dist/cli/resume.js.map +1 -1
- package/dist/cli/run.js +60 -30
- package/dist/cli/run.js.map +1 -1
- package/dist/cli/stop.js +6 -0
- package/dist/cli/stop.js.map +1 -1
- package/dist/cli/tasks.d.ts +5 -3
- package/dist/cli/tasks.js +181 -29
- package/dist/cli/tasks.js.map +1 -1
- package/dist/core/failure-policy.d.ts +9 -0
- package/dist/core/failure-policy.js +9 -0
- package/dist/core/failure-policy.js.map +1 -1
- package/dist/core/orchestrator.d.ts +20 -6
- package/dist/core/orchestrator.js +215 -334
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/runner/agent.d.ts +27 -0
- package/dist/core/runner/agent.js +294 -0
- package/dist/core/runner/agent.js.map +1 -0
- package/dist/core/runner/index.d.ts +5 -0
- package/dist/core/runner/index.js +22 -0
- package/dist/core/runner/index.js.map +1 -0
- package/dist/core/runner/pipeline.d.ts +9 -0
- package/dist/core/runner/pipeline.js +539 -0
- package/dist/core/runner/pipeline.js.map +1 -0
- package/dist/core/runner/prompt.d.ts +25 -0
- package/dist/core/runner/prompt.js +175 -0
- package/dist/core/runner/prompt.js.map +1 -0
- package/dist/core/runner/task.d.ts +26 -0
- package/dist/core/runner/task.js +283 -0
- package/dist/core/runner/task.js.map +1 -0
- package/dist/core/runner/utils.d.ts +37 -0
- package/dist/core/runner/utils.js +161 -0
- package/dist/core/runner/utils.js.map +1 -0
- package/dist/core/runner.d.ts +2 -96
- package/dist/core/runner.js +11 -1136
- package/dist/core/runner.js.map +1 -1
- package/dist/core/stall-detection.d.ts +326 -0
- package/dist/core/stall-detection.js +781 -0
- package/dist/core/stall-detection.js.map +1 -0
- package/dist/services/logging/console.js +2 -1
- package/dist/services/logging/console.js.map +1 -1
- package/dist/types/config.d.ts +6 -6
- package/dist/types/flow.d.ts +84 -0
- package/dist/types/flow.js +10 -0
- package/dist/types/flow.js.map +1 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/index.js +3 -3
- package/dist/types/index.js.map +1 -1
- package/dist/types/lane.d.ts +0 -2
- package/dist/types/logging.d.ts +5 -1
- package/dist/types/task.d.ts +7 -11
- package/dist/utils/config.d.ts +5 -1
- package/dist/utils/config.js +15 -16
- package/dist/utils/config.js.map +1 -1
- package/dist/utils/dependency.d.ts +36 -1
- package/dist/utils/dependency.js +256 -1
- package/dist/utils/dependency.js.map +1 -1
- package/dist/utils/doctor.js +40 -8
- package/dist/utils/doctor.js.map +1 -1
- package/dist/utils/enhanced-logger.d.ts +45 -82
- package/dist/utils/enhanced-logger.js +239 -844
- package/dist/utils/enhanced-logger.js.map +1 -1
- package/dist/utils/flow.d.ts +9 -0
- package/dist/utils/flow.js +73 -0
- package/dist/utils/flow.js.map +1 -0
- package/dist/utils/git.d.ts +29 -0
- package/dist/utils/git.js +115 -5
- package/dist/utils/git.js.map +1 -1
- package/dist/utils/state.js +0 -2
- package/dist/utils/state.js.map +1 -1
- package/dist/utils/task-service.d.ts +2 -2
- package/dist/utils/task-service.js +40 -31
- package/dist/utils/task-service.js.map +1 -1
- package/package.json +4 -3
- package/src/cli/add.ts +397 -0
- package/src/cli/clean.ts +1 -0
- package/src/cli/config.ts +177 -0
- package/src/cli/doctor.ts +48 -4
- package/src/cli/index.ts +36 -32
- package/src/cli/logs.ts +20 -33
- package/src/cli/monitor.ts +70 -75
- package/src/cli/new.ts +235 -0
- package/src/cli/prepare.ts +98 -205
- package/src/cli/resume.ts +61 -76
- package/src/cli/run.ts +333 -306
- package/src/cli/stop.ts +8 -0
- package/src/cli/tasks.ts +200 -21
- package/src/core/failure-policy.ts +9 -0
- package/src/core/orchestrator.ts +279 -379
- package/src/core/runner/agent.ts +314 -0
- package/src/core/runner/index.ts +6 -0
- package/src/core/runner/pipeline.ts +567 -0
- package/src/core/runner/prompt.ts +174 -0
- package/src/core/runner/task.ts +320 -0
- package/src/core/runner/utils.ts +142 -0
- package/src/core/runner.ts +8 -1347
- package/src/core/stall-detection.ts +936 -0
- package/src/services/logging/console.ts +2 -1
- package/src/types/config.ts +6 -6
- package/src/types/flow.ts +91 -0
- package/src/types/index.ts +15 -3
- package/src/types/lane.ts +0 -2
- package/src/types/logging.ts +5 -1
- package/src/types/task.ts +7 -11
- package/src/utils/config.ts +16 -17
- package/src/utils/dependency.ts +311 -2
- package/src/utils/doctor.ts +36 -8
- package/src/utils/enhanced-logger.ts +264 -927
- package/src/utils/flow.ts +42 -0
- package/src/utils/git.ts +145 -5
- package/src/utils/state.ts +0 -2
- package/src/utils/task-service.ts +48 -40
- package/commands/cursorflow-review.md +0 -56
- package/commands/cursorflow-runs.md +0 -59
- package/dist/cli/runs.d.ts +0 -5
- package/dist/cli/runs.js +0 -214
- package/dist/cli/runs.js.map +0 -1
- package/dist/core/reviewer.d.ts +0 -66
- package/dist/core/reviewer.js +0 -265
- package/dist/core/reviewer.js.map +0 -1
- package/src/cli/runs.ts +0 -212
- package/src/core/reviewer.ts +0 -285
package/src/core/orchestrator.ts
CHANGED
|
@@ -25,28 +25,35 @@ import {
|
|
|
25
25
|
EnhancedLogManager,
|
|
26
26
|
createLogManager,
|
|
27
27
|
DEFAULT_LOG_CONFIG,
|
|
28
|
-
ParsedMessage
|
|
28
|
+
ParsedMessage,
|
|
29
|
+
stripAnsi
|
|
29
30
|
} from '../utils/enhanced-logger';
|
|
30
31
|
import { formatMessageForConsole } from '../utils/log-formatter';
|
|
31
|
-
import {
|
|
32
|
+
import { FailureType, analyzeFailure as analyzeFailureFromPolicy } from './failure-policy';
|
|
32
33
|
import {
|
|
33
|
-
getAutoRecoveryManager,
|
|
34
|
-
DEFAULT_AUTO_RECOVERY_CONFIG,
|
|
35
|
-
AutoRecoveryConfig,
|
|
36
34
|
savePOF,
|
|
37
35
|
createPOFFromRecoveryState,
|
|
38
36
|
getGitPushFailureGuidance,
|
|
39
37
|
getMergeConflictGuidance,
|
|
40
38
|
getGitErrorGuidance,
|
|
39
|
+
LaneRecoveryState,
|
|
41
40
|
} from './auto-recovery';
|
|
41
|
+
import {
|
|
42
|
+
StallDetectionService,
|
|
43
|
+
getStallService,
|
|
44
|
+
StallDetectionConfig,
|
|
45
|
+
DEFAULT_STALL_CONFIG,
|
|
46
|
+
RecoveryAction,
|
|
47
|
+
StallPhase,
|
|
48
|
+
StallAnalysis,
|
|
49
|
+
} from './stall-detection';
|
|
42
50
|
import { detectCyclicDependencies, validateDependencies, printDependencyGraph, DependencyInfo } from '../utils/dependency';
|
|
43
51
|
import { preflightCheck, printPreflightReport, autoRepair } from '../utils/health';
|
|
44
52
|
import { getLatestCheckpoint } from '../utils/checkpoint';
|
|
45
53
|
import { cleanStaleLocks, getLockDir } from '../utils/lock';
|
|
46
54
|
|
|
47
55
|
/** Default stall detection configuration - 2 minute idle timeout for recovery */
|
|
48
|
-
const DEFAULT_ORCHESTRATOR_STALL_CONFIG: StallDetectionConfig = {
|
|
49
|
-
...DEFAULT_STALL_CONFIG,
|
|
56
|
+
const DEFAULT_ORCHESTRATOR_STALL_CONFIG: Partial<StallDetectionConfig> = {
|
|
50
57
|
idleTimeoutMs: 2 * 60 * 1000, // 2 minutes (idle detection for continue signal)
|
|
51
58
|
progressTimeoutMs: 10 * 60 * 1000, // 10 minutes (only triggers if no activity at all)
|
|
52
59
|
maxRestarts: 2,
|
|
@@ -55,7 +62,6 @@ const DEFAULT_ORCHESTRATOR_STALL_CONFIG: StallDetectionConfig = {
|
|
|
55
62
|
export interface LaneInfo {
|
|
56
63
|
name: string;
|
|
57
64
|
path: string;
|
|
58
|
-
dependsOn: string[];
|
|
59
65
|
startIndex?: number; // Current task index to resume from
|
|
60
66
|
restartCount?: number; // Number of times restarted due to stall
|
|
61
67
|
lastStateUpdate?: number; // Timestamp of last state file update
|
|
@@ -66,24 +72,22 @@ export interface SpawnLaneResult {
|
|
|
66
72
|
child: ChildProcess;
|
|
67
73
|
logPath: string;
|
|
68
74
|
logManager?: EnhancedLogManager;
|
|
75
|
+
info: RunningLaneInfo;
|
|
69
76
|
}
|
|
70
77
|
|
|
71
78
|
/**
|
|
72
79
|
* Lane execution tracking info
|
|
80
|
+
*
|
|
81
|
+
* NOTE: Stall 감지 관련 상태(lastActivity, stallPhase 등)는 StallDetectionService에서 관리
|
|
82
|
+
* 여기서는 프로세스 관리에 필요한 최소한의 정보만 유지
|
|
73
83
|
*/
|
|
74
84
|
interface RunningLaneInfo {
|
|
75
85
|
child: ChildProcess;
|
|
76
86
|
logPath: string;
|
|
77
87
|
logManager?: EnhancedLogManager;
|
|
78
|
-
lastActivity: number;
|
|
79
|
-
lastStateUpdate: number;
|
|
80
|
-
stallPhase: number; // 0: normal, 1: continued, 2: stronger_prompt, 3: restarted
|
|
81
|
-
taskStartTime: number;
|
|
82
|
-
lastOutput: string;
|
|
83
88
|
statePath: string;
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
continueSignalsSent: number; // Number of continue signals sent
|
|
89
|
+
laneIndex: number;
|
|
90
|
+
currentTaskIndex?: number;
|
|
87
91
|
}
|
|
88
92
|
|
|
89
93
|
/**
|
|
@@ -106,6 +110,109 @@ function logFileTail(filePath: string, lines: number = 10): void {
|
|
|
106
110
|
}
|
|
107
111
|
}
|
|
108
112
|
|
|
113
|
+
/**
|
|
114
|
+
* Handle RUN_DOCTOR action - runs async health diagnostics
|
|
115
|
+
*/
|
|
116
|
+
async function handleDoctorDiagnostics(
|
|
117
|
+
laneName: string,
|
|
118
|
+
laneRunDir: string,
|
|
119
|
+
runId: string,
|
|
120
|
+
runRoot: string,
|
|
121
|
+
stallService: StallDetectionService,
|
|
122
|
+
child: ChildProcess
|
|
123
|
+
): Promise<void> {
|
|
124
|
+
// Import health check dynamically to avoid circular dependency
|
|
125
|
+
const { checkAgentHealth, checkAuthHealth } = await import('../utils/health');
|
|
126
|
+
|
|
127
|
+
const [agentHealth, authHealth] = await Promise.all([
|
|
128
|
+
checkAgentHealth(),
|
|
129
|
+
checkAuthHealth(),
|
|
130
|
+
]);
|
|
131
|
+
|
|
132
|
+
const issues: string[] = [];
|
|
133
|
+
if (!agentHealth.ok) issues.push(`Agent: ${agentHealth.message}`);
|
|
134
|
+
if (!authHealth.ok) issues.push(`Auth: ${authHealth.message}`);
|
|
135
|
+
|
|
136
|
+
if (issues.length > 0) {
|
|
137
|
+
logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
|
|
138
|
+
} else {
|
|
139
|
+
logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// Save diagnostic to file
|
|
143
|
+
const diagnosticPath = safeJoin(laneRunDir, 'diagnostic.json');
|
|
144
|
+
fs.writeFileSync(diagnosticPath, JSON.stringify({
|
|
145
|
+
timestamp: Date.now(),
|
|
146
|
+
agentHealthy: agentHealth.ok,
|
|
147
|
+
authHealthy: authHealth.ok,
|
|
148
|
+
issues,
|
|
149
|
+
}, null, 2));
|
|
150
|
+
|
|
151
|
+
// Kill the process
|
|
152
|
+
try {
|
|
153
|
+
child.kill('SIGKILL');
|
|
154
|
+
} catch {
|
|
155
|
+
// Process might already be dead
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
|
|
159
|
+
|
|
160
|
+
// Save POF for failed recovery
|
|
161
|
+
const stallState = stallService.getState(laneName);
|
|
162
|
+
if (stallState) {
|
|
163
|
+
try {
|
|
164
|
+
const laneStatePath = safeJoin(laneRunDir, 'state.json');
|
|
165
|
+
const laneState = loadState<LaneState>(laneStatePath);
|
|
166
|
+
const pofDir = safeJoin(runRoot, '..', '..', 'pof');
|
|
167
|
+
|
|
168
|
+
// Convert stall state to recovery state format for POF
|
|
169
|
+
// Note: StallPhase and RecoveryStage have compatible numeric values (0-5)
|
|
170
|
+
const recoveryState: LaneRecoveryState = {
|
|
171
|
+
laneName,
|
|
172
|
+
stage: stallState.phase as unknown as number, // Both enums use 0-5
|
|
173
|
+
lastActivityTime: stallState.lastRealActivityTime,
|
|
174
|
+
lastBytesReceived: stallState.bytesSinceLastCheck,
|
|
175
|
+
totalBytesReceived: stallState.totalBytesReceived,
|
|
176
|
+
lastOutput: stallState.lastOutput,
|
|
177
|
+
restartCount: stallState.restartCount,
|
|
178
|
+
continueSignalsSent: stallState.continueSignalCount,
|
|
179
|
+
lastStageChangeTime: stallState.lastPhaseChangeTime,
|
|
180
|
+
isLongOperation: stallState.isLongOperation,
|
|
181
|
+
failureHistory: stallState.failureHistory.map(f => ({
|
|
182
|
+
timestamp: f.timestamp,
|
|
183
|
+
stage: f.phase as unknown as number, // Both enums use 0-5
|
|
184
|
+
action: f.action as string,
|
|
185
|
+
message: f.message,
|
|
186
|
+
idleTimeMs: f.idleTimeMs,
|
|
187
|
+
bytesReceived: f.bytesReceived,
|
|
188
|
+
lastOutput: f.lastOutput,
|
|
189
|
+
})),
|
|
190
|
+
};
|
|
191
|
+
|
|
192
|
+
const diagnosticInfo = {
|
|
193
|
+
timestamp: Date.now(),
|
|
194
|
+
agentHealthy: agentHealth.ok,
|
|
195
|
+
authHealthy: authHealth.ok,
|
|
196
|
+
systemHealthy: true,
|
|
197
|
+
suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
|
|
198
|
+
details: issues.join('\n') || 'No obvious issues found',
|
|
199
|
+
};
|
|
200
|
+
|
|
201
|
+
const pofEntry = createPOFFromRecoveryState(
|
|
202
|
+
runId,
|
|
203
|
+
runRoot,
|
|
204
|
+
laneName,
|
|
205
|
+
recoveryState,
|
|
206
|
+
laneState,
|
|
207
|
+
diagnosticInfo
|
|
208
|
+
);
|
|
209
|
+
savePOF(runId, pofDir, pofEntry);
|
|
210
|
+
} catch (pofError: any) {
|
|
211
|
+
logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
|
|
109
216
|
/**
|
|
110
217
|
* Spawn a lane process
|
|
111
218
|
*/
|
|
@@ -120,6 +227,7 @@ export function spawnLane({
|
|
|
120
227
|
enhancedLogConfig,
|
|
121
228
|
noGit = false,
|
|
122
229
|
onActivity,
|
|
230
|
+
laneIndex = 0,
|
|
123
231
|
}: {
|
|
124
232
|
laneName: string;
|
|
125
233
|
tasksFile: string;
|
|
@@ -131,6 +239,7 @@ export function spawnLane({
|
|
|
131
239
|
enhancedLogConfig?: Partial<EnhancedLogConfig>;
|
|
132
240
|
noGit?: boolean;
|
|
133
241
|
onActivity?: () => void;
|
|
242
|
+
laneIndex?: number;
|
|
134
243
|
}): SpawnLaneResult {
|
|
135
244
|
fs.mkdirSync(laneRunDir, { recursive: true});
|
|
136
245
|
|
|
@@ -169,17 +278,25 @@ export function spawnLane({
|
|
|
169
278
|
};
|
|
170
279
|
|
|
171
280
|
if (logConfig.enabled) {
|
|
281
|
+
// Helper to get dynamic lane label like [L1-T1-lanename10]
|
|
282
|
+
const getDynamicLabel = () => {
|
|
283
|
+
const laneNum = `L${laneIndex + 1}`;
|
|
284
|
+
const taskPart = info.currentTaskIndex ? `-T${info.currentTaskIndex}` : '';
|
|
285
|
+
const shortLaneName = laneName.substring(0, 10);
|
|
286
|
+
return `[${laneNum}${taskPart}-${shortLaneName}]`;
|
|
287
|
+
};
|
|
288
|
+
|
|
172
289
|
// Create callback for clean console output
|
|
173
290
|
const onParsedMessage = (msg: ParsedMessage) => {
|
|
174
291
|
if (onActivity) onActivity();
|
|
175
292
|
const formatted = formatMessageForConsole(msg, {
|
|
176
|
-
laneLabel:
|
|
293
|
+
laneLabel: getDynamicLabel(),
|
|
177
294
|
includeTimestamp: true
|
|
178
295
|
});
|
|
179
296
|
process.stdout.write(formatted + '\n');
|
|
180
297
|
};
|
|
181
298
|
|
|
182
|
-
logManager = createLogManager(laneRunDir, laneName, logConfig, onParsedMessage);
|
|
299
|
+
logManager = createLogManager(laneRunDir, laneName, logConfig, onParsedMessage, laneIndex);
|
|
183
300
|
logPath = logManager.getLogPaths().clean;
|
|
184
301
|
|
|
185
302
|
// Spawn with pipe for enhanced logging
|
|
@@ -189,6 +306,16 @@ export function spawnLane({
|
|
|
189
306
|
detached: false,
|
|
190
307
|
});
|
|
191
308
|
|
|
309
|
+
// Initialize info object for stdout handler to use
|
|
310
|
+
const info: RunningLaneInfo = {
|
|
311
|
+
child,
|
|
312
|
+
logManager,
|
|
313
|
+
logPath,
|
|
314
|
+
statePath: safeJoin(laneRunDir, 'state.json'),
|
|
315
|
+
laneIndex,
|
|
316
|
+
currentTaskIndex: startIndex > 0 ? startIndex + 1 : 0
|
|
317
|
+
};
|
|
318
|
+
|
|
192
319
|
// Buffer for non-JSON lines
|
|
193
320
|
let lineBuffer = '';
|
|
194
321
|
|
|
@@ -205,24 +332,52 @@ export function spawnLane({
|
|
|
205
332
|
|
|
206
333
|
for (const line of lines) {
|
|
207
334
|
const trimmed = line.trim();
|
|
335
|
+
if (!trimmed) continue;
|
|
336
|
+
|
|
337
|
+
// Detect task start/progress to update label
|
|
338
|
+
// Example: [1/1] hello-task
|
|
339
|
+
const cleanLine = stripAnsi(trimmed);
|
|
340
|
+
const taskMatch = cleanLine.match(/^\s*\[(\d+)\/(\d+)\]\s+(.+)$/);
|
|
341
|
+
if (taskMatch) {
|
|
342
|
+
info.currentTaskIndex = parseInt(taskMatch[1]!);
|
|
343
|
+
// Update log manager's task index to keep it in sync for readable log
|
|
344
|
+
if (logManager) {
|
|
345
|
+
logManager.setTask(taskMatch[3]!.trim(), undefined, info.currentTaskIndex - 1);
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
|
|
208
349
|
// Show if it's a timestamped log line (starts with [YYYY-MM-DD... or [HH:MM:SS])
|
|
209
350
|
// or if it's NOT a noisy JSON line
|
|
210
|
-
const hasTimestamp = /^\[\d{4}-\d{2}-\d{2}T|\^\[\d{2}:\d{2}:\d{2}\]/.test(trimmed);
|
|
211
351
|
const isJson = trimmed.startsWith('{') || trimmed.includes('{"type"');
|
|
212
352
|
// Filter out heartbeats - they should NOT reset the idle timer
|
|
213
353
|
const isHeartbeat = trimmed.includes('Heartbeat') && trimmed.includes('bytes received');
|
|
214
354
|
|
|
215
|
-
if (
|
|
355
|
+
if (!isJson) {
|
|
216
356
|
// Only trigger activity for non-heartbeat lines
|
|
217
357
|
if (onActivity && !isHeartbeat) onActivity();
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
358
|
+
|
|
359
|
+
const currentLabel = getDynamicLabel();
|
|
360
|
+
const coloredLabel = `${logger.COLORS.magenta}${currentLabel}${logger.COLORS.reset}`;
|
|
361
|
+
|
|
362
|
+
// Regex that matches timestamp even if it has ANSI color codes
|
|
363
|
+
// Matches: [24:39:14] or \x1b[90m[24:39:14]\x1b[0m
|
|
364
|
+
const timestampRegex = /^((?:\x1b\[[0-9;]*m)*)\[(\d{4}-\d{2}-\d{2}T|\d{2}:\d{2}:\d{2})\]/;
|
|
365
|
+
const tsMatch = trimmed.match(timestampRegex);
|
|
366
|
+
|
|
367
|
+
if (tsMatch) {
|
|
368
|
+
// If line already has timestamp format, just add lane prefix
|
|
369
|
+
// Check if lane label is already present to avoid triple duplication
|
|
370
|
+
if (!trimmed.includes(currentLabel)) {
|
|
371
|
+
// Insert label after the timestamp part
|
|
372
|
+
const tsPart = tsMatch[0];
|
|
373
|
+
const formatted = trimmed.replace(tsPart, `${tsPart} ${coloredLabel}`);
|
|
374
|
+
process.stdout.write(formatted + '\n');
|
|
375
|
+
} else {
|
|
376
|
+
process.stdout.write(trimmed + '\n');
|
|
377
|
+
}
|
|
223
378
|
} else {
|
|
224
379
|
// Add full prefix: timestamp + lane
|
|
225
|
-
process.stdout.write(`${logger.COLORS.gray}[${new Date().toLocaleTimeString('en-US', { hour12: false })}]${logger.COLORS.reset} ${
|
|
380
|
+
process.stdout.write(`${logger.COLORS.gray}[${new Date().toLocaleTimeString('en-US', { hour12: false })}]${logger.COLORS.reset} ${coloredLabel} ${line}\n`);
|
|
226
381
|
}
|
|
227
382
|
}
|
|
228
383
|
}
|
|
@@ -244,11 +399,14 @@ export function spawnLane({
|
|
|
244
399
|
trimmed.includes('actual output');
|
|
245
400
|
|
|
246
401
|
const ts = new Date().toLocaleTimeString('en-US', { hour12: false });
|
|
402
|
+
const currentLabel = getDynamicLabel();
|
|
403
|
+
const coloredLabel = `${logger.COLORS.magenta}${currentLabel}${logger.COLORS.reset}`;
|
|
404
|
+
|
|
247
405
|
if (isStatus) {
|
|
248
|
-
process.stdout.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${
|
|
406
|
+
process.stdout.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${coloredLabel} ${trimmed}\n`);
|
|
249
407
|
} else {
|
|
250
408
|
if (onActivity) onActivity();
|
|
251
|
-
process.stderr.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${
|
|
409
|
+
process.stderr.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${coloredLabel} ${logger.COLORS.red}❌ ERR ${trimmed}${logger.COLORS.reset}\n`);
|
|
252
410
|
}
|
|
253
411
|
}
|
|
254
412
|
}
|
|
@@ -259,9 +417,11 @@ export function spawnLane({
|
|
|
259
417
|
child.on('exit', () => {
|
|
260
418
|
logManager?.close();
|
|
261
419
|
});
|
|
420
|
+
|
|
421
|
+
return { child, logPath, logManager, info };
|
|
262
422
|
} else {
|
|
263
423
|
// Fallback to simple file logging
|
|
264
|
-
logPath = safeJoin(laneRunDir, 'terminal.log');
|
|
424
|
+
logPath = safeJoin(laneRunDir, 'terminal-readable.log');
|
|
265
425
|
const logFd = fs.openSync(logPath, 'a');
|
|
266
426
|
|
|
267
427
|
child = spawn('node', args, {
|
|
@@ -275,9 +435,19 @@ export function spawnLane({
|
|
|
275
435
|
} catch {
|
|
276
436
|
// Ignore
|
|
277
437
|
}
|
|
438
|
+
|
|
439
|
+
return {
|
|
440
|
+
child,
|
|
441
|
+
logPath,
|
|
442
|
+
logManager,
|
|
443
|
+
info: {
|
|
444
|
+
child,
|
|
445
|
+
logPath,
|
|
446
|
+
statePath: safeJoin(laneRunDir, 'state.json'),
|
|
447
|
+
laneIndex
|
|
448
|
+
}
|
|
449
|
+
};
|
|
278
450
|
}
|
|
279
|
-
|
|
280
|
-
return { child, logPath, logManager };
|
|
281
451
|
}
|
|
282
452
|
|
|
283
453
|
/**
|
|
@@ -296,7 +466,7 @@ export function waitChild(proc: ChildProcess): Promise<number> {
|
|
|
296
466
|
}
|
|
297
467
|
|
|
298
468
|
/**
|
|
299
|
-
* List lane task files in directory
|
|
469
|
+
* List lane task files in directory
|
|
300
470
|
*/
|
|
301
471
|
export function listLaneFiles(tasksDir: string): LaneInfo[] {
|
|
302
472
|
if (!fs.existsSync(tasksDir)) {
|
|
@@ -305,24 +475,15 @@ export function listLaneFiles(tasksDir: string): LaneInfo[] {
|
|
|
305
475
|
|
|
306
476
|
const files = fs.readdirSync(tasksDir);
|
|
307
477
|
return files
|
|
308
|
-
.filter(f => f.endsWith('.json'))
|
|
478
|
+
.filter(f => f.endsWith('.json') && f !== 'flow.meta.json')
|
|
309
479
|
.sort()
|
|
310
480
|
.map(f => {
|
|
311
481
|
const filePath = safeJoin(tasksDir, f);
|
|
312
482
|
const name = path.basename(f, '.json');
|
|
313
|
-
let dependsOn: string[] = [];
|
|
314
|
-
|
|
315
|
-
try {
|
|
316
|
-
const config = JSON.parse(fs.readFileSync(filePath, 'utf8')) as RunnerConfig;
|
|
317
|
-
dependsOn = config.dependsOn || [];
|
|
318
|
-
} catch (e) {
|
|
319
|
-
logger.warn(`Failed to parse config for lane ${name}: ${e}`);
|
|
320
|
-
}
|
|
321
483
|
|
|
322
484
|
return {
|
|
323
485
|
name,
|
|
324
486
|
path: filePath,
|
|
325
|
-
dependsOn,
|
|
326
487
|
};
|
|
327
488
|
});
|
|
328
489
|
}
|
|
@@ -339,8 +500,7 @@ export function printLaneStatus(lanes: LaneInfo[], laneRunDirs: Record<string, s
|
|
|
339
500
|
const state = loadState<LaneState>(statePath);
|
|
340
501
|
|
|
341
502
|
if (!state) {
|
|
342
|
-
|
|
343
|
-
return { lane: lane.name, status: isWaiting ? 'waiting' : 'pending', task: '-' };
|
|
503
|
+
return { lane: lane.name, status: 'pending', task: '-' };
|
|
344
504
|
}
|
|
345
505
|
|
|
346
506
|
const idx = (state.currentTaskIndex || 0) + 1;
|
|
@@ -388,12 +548,12 @@ async function resolveAllDependencies(
|
|
|
388
548
|
const worktreeDir = state?.worktreeDir || safeJoin(runRoot, 'resolution-worktree');
|
|
389
549
|
|
|
390
550
|
if (!fs.existsSync(worktreeDir)) {
|
|
391
|
-
logger.info(
|
|
551
|
+
logger.info(`🏗️ Creating resolution worktree at ${worktreeDir}`);
|
|
392
552
|
git.createWorktree(worktreeDir, pipelineBranch, { baseBranch: git.getCurrentBranch() });
|
|
393
553
|
}
|
|
394
554
|
|
|
395
555
|
// 3. Resolve on pipeline branch
|
|
396
|
-
logger.info(
|
|
556
|
+
logger.info(`🔄 Resolving dependencies on branch ${pipelineBranch}`);
|
|
397
557
|
git.runGit(['checkout', pipelineBranch], { cwd: worktreeDir });
|
|
398
558
|
|
|
399
559
|
for (const cmd of uniqueCommands) {
|
|
@@ -474,7 +634,6 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
474
634
|
noGit?: boolean;
|
|
475
635
|
skipPreflight?: boolean;
|
|
476
636
|
stallConfig?: Partial<StallDetectionConfig>;
|
|
477
|
-
autoRecoveryConfig?: Partial<AutoRecoveryConfig>;
|
|
478
637
|
} = {}): Promise<{ lanes: LaneInfo[]; exitCodes: Record<string, number>; runRoot: string }> {
|
|
479
638
|
const lanes = listLaneFiles(tasksDir);
|
|
480
639
|
|
|
@@ -510,34 +669,11 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
510
669
|
logger.success('✓ Preflight checks passed');
|
|
511
670
|
}
|
|
512
671
|
|
|
513
|
-
// Validate dependencies and detect cycles
|
|
514
|
-
logger.section('📊 Dependency Analysis');
|
|
515
|
-
|
|
516
|
-
const depInfos: DependencyInfo[] = lanes.map(l => ({
|
|
517
|
-
name: l.name,
|
|
518
|
-
dependsOn: l.dependsOn,
|
|
519
|
-
}));
|
|
520
|
-
|
|
521
|
-
const depValidation = validateDependencies(depInfos);
|
|
522
|
-
|
|
523
|
-
if (!depValidation.valid) {
|
|
524
|
-
logger.error('❌ Dependency validation failed:');
|
|
525
|
-
for (const err of depValidation.errors) {
|
|
526
|
-
logger.error(` • ${err}`);
|
|
527
|
-
}
|
|
528
|
-
throw new Error('Invalid dependency configuration');
|
|
529
|
-
}
|
|
530
|
-
|
|
531
|
-
if (depValidation.warnings.length > 0) {
|
|
532
|
-
for (const warn of depValidation.warnings) {
|
|
533
|
-
logger.warn(`⚠️ ${warn}`);
|
|
534
|
-
}
|
|
535
|
-
}
|
|
536
|
-
|
|
537
|
-
// Print dependency graph
|
|
538
|
-
printDependencyGraph(depInfos);
|
|
539
|
-
|
|
540
672
|
const config = loadConfig();
|
|
673
|
+
|
|
674
|
+
// Set verbose git logging from config
|
|
675
|
+
git.setVerboseGit(config.verboseGit || false);
|
|
676
|
+
|
|
541
677
|
const logsDir = getLogsDir(config);
|
|
542
678
|
const runId = `run-${Date.now()}`;
|
|
543
679
|
// Use absolute path for runRoot to avoid issues with subfolders
|
|
@@ -561,17 +697,11 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
561
697
|
const randomSuffix = Math.random().toString(36).substring(2, 7);
|
|
562
698
|
const pipelineBranch = `cursorflow/run-${Date.now().toString(36)}-${randomSuffix}`;
|
|
563
699
|
|
|
564
|
-
//
|
|
565
|
-
const
|
|
700
|
+
// Initialize unified stall detection service (Single Source of Truth)
|
|
701
|
+
const stallService = getStallService({
|
|
566
702
|
...DEFAULT_ORCHESTRATOR_STALL_CONFIG,
|
|
567
703
|
...options.stallConfig,
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
// Initialize auto-recovery manager
|
|
571
|
-
const autoRecoveryManager = getAutoRecoveryManager({
|
|
572
|
-
...DEFAULT_AUTO_RECOVERY_CONFIG,
|
|
573
|
-
idleTimeoutMs: stallConfig.idleTimeoutMs, // Sync with stall config
|
|
574
|
-
...options.autoRecoveryConfig,
|
|
704
|
+
verbose: process.env['DEBUG_STALL'] === 'true',
|
|
575
705
|
});
|
|
576
706
|
|
|
577
707
|
// Initialize event system
|
|
@@ -632,6 +762,7 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
632
762
|
|
|
633
763
|
laneWorktreeDirs[lane.name] = laneWorktreeDir;
|
|
634
764
|
|
|
765
|
+
logger.info(`🏗️ Initializing lane ${lane.name}: branch=${lanePipelineBranch}`);
|
|
635
766
|
const initialState = createLaneState(lane.name, taskConfig, lane.path, {
|
|
636
767
|
pipelineBranch: lanePipelineBranch,
|
|
637
768
|
worktreeDir: laneWorktreeDir
|
|
@@ -647,21 +778,6 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
647
778
|
logger.info(`Run directory: ${runRoot}`);
|
|
648
779
|
logger.info(`Lanes: ${lanes.length}`);
|
|
649
780
|
|
|
650
|
-
// Display dependency graph
|
|
651
|
-
logger.info('\n📊 Dependency Graph:');
|
|
652
|
-
for (const lane of lanes) {
|
|
653
|
-
const deps = lane.dependsOn.length > 0 ? ` [depends on: ${lane.dependsOn.join(', ')}]` : '';
|
|
654
|
-
console.log(` ${logger.COLORS.cyan}${lane.name}${logger.COLORS.reset}${deps}`);
|
|
655
|
-
|
|
656
|
-
// Simple tree-like visualization for deep dependencies
|
|
657
|
-
if (lane.dependsOn.length > 0) {
|
|
658
|
-
for (const dep of lane.dependsOn) {
|
|
659
|
-
console.log(` └─ ${dep}`);
|
|
660
|
-
}
|
|
661
|
-
}
|
|
662
|
-
}
|
|
663
|
-
console.log('');
|
|
664
|
-
|
|
665
781
|
// Disable auto-resolve when noGit mode is enabled
|
|
666
782
|
const autoResolve = !options.noGit && options.autoResolveDependencies !== false;
|
|
667
783
|
|
|
@@ -696,29 +812,12 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
696
812
|
|
|
697
813
|
try {
|
|
698
814
|
while (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length || (blockedLanes.size > 0 && running.size === 0)) {
|
|
699
|
-
// 1. Identify lanes ready to start
|
|
815
|
+
// 1. Identify lanes ready to start (all lanes can start immediately - no lane-level dependencies)
|
|
700
816
|
const readyToStart = lanes.filter(lane => {
|
|
701
817
|
// Not already running or completed or failed or blocked
|
|
702
818
|
if (running.has(lane.name) || completedLanes.has(lane.name) || failedLanes.has(lane.name) || blockedLanes.has(lane.name)) {
|
|
703
819
|
return false;
|
|
704
820
|
}
|
|
705
|
-
|
|
706
|
-
// Check dependencies
|
|
707
|
-
for (const dep of lane.dependsOn) {
|
|
708
|
-
if (failedLanes.has(dep)) {
|
|
709
|
-
logger.error(`Lane ${lane.name} will not start because dependency ${dep} failed`);
|
|
710
|
-
failedLanes.add(lane.name);
|
|
711
|
-
exitCodes[lane.name] = 1;
|
|
712
|
-
return false;
|
|
713
|
-
}
|
|
714
|
-
if (blockedLanes.has(dep)) {
|
|
715
|
-
// If a dependency is blocked, wait
|
|
716
|
-
return false;
|
|
717
|
-
}
|
|
718
|
-
if (!completedLanes.has(dep)) {
|
|
719
|
-
return false;
|
|
720
|
-
}
|
|
721
|
-
}
|
|
722
821
|
return true;
|
|
723
822
|
});
|
|
724
823
|
|
|
@@ -737,23 +836,23 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
737
836
|
logger.info(`Lane started: ${lane.name}${lane.startIndex ? ` (resuming from ${lane.startIndex})` : ''}`);
|
|
738
837
|
|
|
739
838
|
const now = Date.now();
|
|
740
|
-
|
|
839
|
+
|
|
840
|
+
// Register lane with unified stall detection service FIRST
|
|
841
|
+
stallService.registerLane(lane.name, {
|
|
842
|
+
laneRunDir: laneRunDirs[lane.name]!,
|
|
843
|
+
});
|
|
844
|
+
|
|
845
|
+
const laneIdx = lanes.findIndex(l => l.name === lane.name);
|
|
846
|
+
|
|
847
|
+
// Pre-register lane in running map
|
|
741
848
|
running.set(lane.name, {
|
|
742
849
|
child: {} as any, // Placeholder, will be replaced below
|
|
743
850
|
logManager: undefined,
|
|
744
851
|
logPath: '',
|
|
745
|
-
lastActivity: now,
|
|
746
|
-
lastStateUpdate: now,
|
|
747
|
-
stallPhase: 0,
|
|
748
|
-
taskStartTime: now,
|
|
749
|
-
lastOutput: '',
|
|
750
852
|
statePath: laneStatePath,
|
|
751
|
-
|
|
752
|
-
lastBytesCheck: 0,
|
|
753
|
-
continueSignalsSent: 0,
|
|
853
|
+
laneIndex: laneIdx >= 0 ? laneIdx : 0,
|
|
754
854
|
});
|
|
755
855
|
|
|
756
|
-
let lastOutput = '';
|
|
757
856
|
const spawnResult = spawnLane({
|
|
758
857
|
laneName: lane.name,
|
|
759
858
|
tasksFile: lane.path,
|
|
@@ -764,55 +863,40 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
764
863
|
worktreeDir: laneWorktreeDirs[lane.name],
|
|
765
864
|
enhancedLogConfig: options.enhancedLogging,
|
|
766
865
|
noGit: options.noGit,
|
|
866
|
+
laneIndex: laneIdx >= 0 ? laneIdx : 0,
|
|
767
867
|
onActivity: () => {
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
const actNow = Date.now();
|
|
771
|
-
info.lastActivity = actNow;
|
|
772
|
-
info.lastStateUpdate = actNow;
|
|
773
|
-
info.stallPhase = 0;
|
|
774
|
-
}
|
|
868
|
+
// Record state file update activity
|
|
869
|
+
stallService.recordStateUpdate(lane.name);
|
|
775
870
|
}
|
|
776
871
|
});
|
|
777
872
|
|
|
778
873
|
// Update with actual spawn result
|
|
779
874
|
const existingInfo = running.get(lane.name)!;
|
|
780
|
-
Object.assign(existingInfo, spawnResult);
|
|
875
|
+
Object.assign(existingInfo, spawnResult.info);
|
|
876
|
+
|
|
877
|
+
// Update stall service with child process reference
|
|
878
|
+
stallService.setChildProcess(lane.name, spawnResult.child);
|
|
781
879
|
|
|
782
|
-
// Track
|
|
880
|
+
// Track stdout for activity detection - delegate to StallDetectionService
|
|
783
881
|
if (spawnResult.child.stdout) {
|
|
784
882
|
spawnResult.child.stdout.on('data', (data: Buffer) => {
|
|
785
|
-
const
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
const lastRealLine = realLines[realLines.length - 1]!;
|
|
800
|
-
info.lastOutput = lastRealLine;
|
|
801
|
-
info.bytesReceived += data.length;
|
|
802
|
-
|
|
803
|
-
// Update auto-recovery manager with real activity
|
|
804
|
-
autoRecoveryManager.recordActivity(lane.name, data.length, info.lastOutput);
|
|
805
|
-
} else if (lines.length > 0) {
|
|
806
|
-
// Only heartbeats received - do NOT update lastActivity (keep stall timer running)
|
|
807
|
-
autoRecoveryManager.recordActivity(lane.name, 0, info.lastOutput);
|
|
808
|
-
}
|
|
883
|
+
const output = data.toString();
|
|
884
|
+
const lines = output.split('\n').filter(l => l.trim());
|
|
885
|
+
|
|
886
|
+
// Filter out heartbeats from activity tracking
|
|
887
|
+
const realLines = lines.filter(line => !(line.includes('Heartbeat') && line.includes('bytes received')));
|
|
888
|
+
|
|
889
|
+
if (realLines.length > 0) {
|
|
890
|
+
// Real activity - record with bytes
|
|
891
|
+
const lastRealLine = realLines[realLines.length - 1]!;
|
|
892
|
+
stallService.recordActivity(lane.name, data.length, lastRealLine);
|
|
893
|
+
} else if (lines.length > 0) {
|
|
894
|
+
// Heartbeat only - record with 0 bytes (won't reset timer)
|
|
895
|
+
stallService.recordActivity(lane.name, 0);
|
|
809
896
|
}
|
|
810
897
|
});
|
|
811
898
|
}
|
|
812
899
|
|
|
813
|
-
// Register lane with auto-recovery manager
|
|
814
|
-
autoRecoveryManager.registerLane(lane.name);
|
|
815
|
-
|
|
816
900
|
// Update lane tracking
|
|
817
901
|
lane.taskStartTime = now;
|
|
818
902
|
|
|
@@ -843,234 +927,47 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
843
927
|
if (result.name === '__poll__' || (now - lastStallCheck >= 10000)) {
|
|
844
928
|
lastStallCheck = now;
|
|
845
929
|
|
|
846
|
-
// Periodic stall check
|
|
930
|
+
// Periodic stall check using unified StallDetectionService
|
|
847
931
|
for (const [laneName, info] of running.entries()) {
|
|
848
|
-
const idleTime = now - info.lastActivity;
|
|
849
932
|
const lane = lanes.find(l => l.name === laneName)!;
|
|
850
933
|
|
|
851
|
-
if (process.env['DEBUG_STALL']) {
|
|
852
|
-
logger.debug(`[${laneName}] Stall check: idle=${Math.round(idleTime/1000)}s, bytesDelta=${info.bytesReceived - info.lastBytesCheck}, phase=${info.stallPhase}`);
|
|
853
|
-
}
|
|
854
|
-
|
|
855
934
|
// Check state file for progress updates
|
|
856
|
-
let progressTime = 0;
|
|
857
935
|
try {
|
|
858
936
|
const stateStat = fs.statSync(info.statePath);
|
|
859
|
-
const
|
|
860
|
-
if (
|
|
861
|
-
|
|
937
|
+
const stallState = stallService.getState(laneName);
|
|
938
|
+
if (stallState && stateStat.mtimeMs > stallState.lastStateUpdateTime) {
|
|
939
|
+
stallService.recordStateUpdate(laneName);
|
|
862
940
|
}
|
|
863
|
-
progressTime = now - info.lastStateUpdate;
|
|
864
941
|
} catch {
|
|
865
942
|
// State file might not exist yet
|
|
866
943
|
}
|
|
867
944
|
|
|
868
|
-
//
|
|
869
|
-
|
|
870
|
-
|
|
945
|
+
// Debug logging
|
|
946
|
+
if (process.env['DEBUG_STALL']) {
|
|
947
|
+
logger.debug(`[${laneName}] ${stallService.dumpState(laneName)}`);
|
|
948
|
+
}
|
|
871
949
|
|
|
872
|
-
//
|
|
873
|
-
const analysis =
|
|
874
|
-
stallPhase: info.stallPhase,
|
|
875
|
-
idleTimeMs: idleTime,
|
|
876
|
-
progressTimeMs: progressTime,
|
|
877
|
-
lastOutput: info.lastOutput,
|
|
878
|
-
restartCount: lane.restartCount || 0,
|
|
879
|
-
taskStartTimeMs: info.taskStartTime,
|
|
880
|
-
bytesReceived: bytesDelta, // Bytes since last check
|
|
881
|
-
continueSignalsSent: info.continueSignalsSent,
|
|
882
|
-
}, stallConfig);
|
|
950
|
+
// Run stall analysis and recovery (all logic is in StallDetectionService)
|
|
951
|
+
const analysis = stallService.checkAndRecover(laneName);
|
|
883
952
|
|
|
884
|
-
//
|
|
953
|
+
// Log to lane log manager if there was an action
|
|
885
954
|
if (analysis.action !== RecoveryAction.NONE) {
|
|
886
|
-
logFailure(laneName, analysis);
|
|
887
955
|
info.logManager?.log('error', analysis.message);
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
laneName,
|
|
900
|
-
idleSeconds: Math.round(idleTime / 1000),
|
|
901
|
-
signalCount: info.continueSignalsSent,
|
|
902
|
-
});
|
|
903
|
-
} catch (e) {
|
|
904
|
-
logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
|
|
905
|
-
}
|
|
906
|
-
} else if (analysis.action === RecoveryAction.STRONGER_PROMPT) {
|
|
907
|
-
const interventionPath = safeJoin(laneRunDirs[laneName]!, 'intervention.txt');
|
|
908
|
-
const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck. Please continue with your current task immediately. If you're waiting for something, explain what you need and proceed with what you can do now. If you've completed the task, summarize your work and finish.`;
|
|
909
|
-
try {
|
|
910
|
-
fs.writeFileSync(interventionPath, strongerPrompt);
|
|
911
|
-
info.stallPhase = 2;
|
|
912
|
-
info.lastActivity = now;
|
|
913
|
-
logger.warn(`[${laneName}] Sent stronger prompt after continue signal failed`);
|
|
914
|
-
|
|
915
|
-
events.emit('recovery.stronger_prompt', { laneName });
|
|
916
|
-
} catch (e) {
|
|
917
|
-
logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
|
|
918
|
-
}
|
|
919
|
-
} else if (analysis.action === RecoveryAction.KILL_AND_RESTART ||
|
|
920
|
-
analysis.action === RecoveryAction.RESTART_LANE ||
|
|
921
|
-
analysis.action === RecoveryAction.RESTART_LANE_FROM_CHECKPOINT) {
|
|
922
|
-
lane.restartCount = (lane.restartCount || 0) + 1;
|
|
923
|
-
info.stallPhase = 3;
|
|
924
|
-
|
|
925
|
-
// Try to get checkpoint info
|
|
926
|
-
const checkpoint = getLatestCheckpoint(laneRunDirs[laneName]!);
|
|
927
|
-
if (checkpoint) {
|
|
928
|
-
logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
|
|
929
|
-
}
|
|
930
|
-
|
|
931
|
-
// Kill the process
|
|
932
|
-
try {
|
|
933
|
-
info.child.kill('SIGKILL');
|
|
934
|
-
} catch {
|
|
935
|
-
// Process might already be dead
|
|
936
|
-
}
|
|
937
|
-
|
|
938
|
-
logger.warn(`[${laneName}] Killing and restarting lane (restart #${lane.restartCount})`);
|
|
939
|
-
|
|
940
|
-
events.emit('recovery.restart', {
|
|
941
|
-
laneName,
|
|
942
|
-
restartCount: lane.restartCount,
|
|
943
|
-
maxRestarts: stallConfig.maxRestarts,
|
|
944
|
-
});
|
|
945
|
-
} else if (analysis.action === RecoveryAction.RUN_DOCTOR) {
|
|
946
|
-
info.stallPhase = 4;
|
|
947
|
-
|
|
948
|
-
// Run diagnostics
|
|
949
|
-
logger.error(`[${laneName}] Running diagnostics due to persistent failures...`);
|
|
950
|
-
|
|
951
|
-
// Import health check dynamically to avoid circular dependency
|
|
952
|
-
const { checkAgentHealth, checkAuthHealth } = await import('../utils/health');
|
|
953
|
-
|
|
954
|
-
const [agentHealth, authHealth] = await Promise.all([
|
|
955
|
-
checkAgentHealth(),
|
|
956
|
-
checkAuthHealth(),
|
|
957
|
-
]);
|
|
958
|
-
|
|
959
|
-
const issues: string[] = [];
|
|
960
|
-
if (!agentHealth.ok) issues.push(`Agent: ${agentHealth.message}`);
|
|
961
|
-
if (!authHealth.ok) issues.push(`Auth: ${authHealth.message}`);
|
|
962
|
-
|
|
963
|
-
if (issues.length > 0) {
|
|
964
|
-
logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
|
|
965
|
-
} else {
|
|
966
|
-
logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
|
|
967
|
-
}
|
|
968
|
-
|
|
969
|
-
// Save diagnostic to file
|
|
970
|
-
const diagnosticPath = safeJoin(laneRunDirs[laneName]!, 'diagnostic.json');
|
|
971
|
-
fs.writeFileSync(diagnosticPath, JSON.stringify({
|
|
972
|
-
timestamp: Date.now(),
|
|
973
|
-
agentHealthy: agentHealth.ok,
|
|
974
|
-
authHealthy: authHealth.ok,
|
|
975
|
-
issues,
|
|
976
|
-
analysis,
|
|
977
|
-
}, null, 2));
|
|
978
|
-
|
|
979
|
-
// Kill the process
|
|
980
|
-
try {
|
|
981
|
-
info.child.kill('SIGKILL');
|
|
982
|
-
} catch {
|
|
983
|
-
// Process might already be dead
|
|
984
|
-
}
|
|
985
|
-
|
|
986
|
-
logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
|
|
987
|
-
|
|
988
|
-
// Save POF for failed recovery
|
|
989
|
-
const recoveryState = autoRecoveryManager.getState(laneName);
|
|
990
|
-
if (recoveryState) {
|
|
991
|
-
try {
|
|
992
|
-
const laneStatePath = safeJoin(laneRunDirs[laneName]!, 'state.json');
|
|
993
|
-
const laneState = loadState<LaneState>(laneStatePath);
|
|
994
|
-
const pofDir = safeJoin(runRoot, '..', '..', 'pof');
|
|
995
|
-
const diagnosticInfo = {
|
|
996
|
-
timestamp: Date.now(),
|
|
997
|
-
agentHealthy: agentHealth.ok,
|
|
998
|
-
authHealthy: authHealth.ok,
|
|
999
|
-
systemHealthy: true,
|
|
1000
|
-
suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
|
|
1001
|
-
details: issues.join('\n') || 'No obvious issues found',
|
|
1002
|
-
};
|
|
1003
|
-
const pofEntry = createPOFFromRecoveryState(
|
|
1004
|
-
runId,
|
|
1005
|
-
runRoot,
|
|
1006
|
-
laneName,
|
|
1007
|
-
recoveryState,
|
|
1008
|
-
laneState,
|
|
1009
|
-
diagnosticInfo
|
|
1010
|
-
);
|
|
1011
|
-
savePOF(runId, pofDir, pofEntry);
|
|
1012
|
-
} catch (pofError: any) {
|
|
1013
|
-
logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
|
|
1014
|
-
}
|
|
1015
|
-
}
|
|
1016
|
-
|
|
1017
|
-
events.emit('recovery.diagnosed', {
|
|
1018
|
-
laneName,
|
|
1019
|
-
diagnostic: { agentHealthy: agentHealth.ok, authHealthy: authHealth.ok, issues },
|
|
1020
|
-
});
|
|
1021
|
-
} else if (analysis.action === RecoveryAction.ABORT_LANE) {
|
|
1022
|
-
info.stallPhase = 5;
|
|
1023
|
-
|
|
1024
|
-
try {
|
|
1025
|
-
info.child.kill('SIGKILL');
|
|
1026
|
-
} catch {
|
|
1027
|
-
// Process might already be dead
|
|
1028
|
-
}
|
|
1029
|
-
|
|
1030
|
-
logger.error(`[${laneName}] Aborting lane due to repeated stalls`);
|
|
1031
|
-
|
|
1032
|
-
// Save POF for failed recovery
|
|
1033
|
-
const recoveryState = autoRecoveryManager.getState(laneName);
|
|
1034
|
-
if (recoveryState) {
|
|
1035
|
-
try {
|
|
1036
|
-
const laneStatePath = safeJoin(laneRunDirs[laneName]!, 'state.json');
|
|
1037
|
-
const laneState = loadState<LaneState>(laneStatePath);
|
|
1038
|
-
const pofDir = safeJoin(runRoot, '..', '..', 'pof');
|
|
1039
|
-
const pofEntry = createPOFFromRecoveryState(
|
|
1040
|
-
runId,
|
|
1041
|
-
runRoot,
|
|
1042
|
-
laneName,
|
|
1043
|
-
recoveryState,
|
|
1044
|
-
laneState,
|
|
1045
|
-
recoveryState.diagnosticInfo
|
|
1046
|
-
);
|
|
1047
|
-
savePOF(runId, pofDir, pofEntry);
|
|
1048
|
-
} catch (pofError: any) {
|
|
1049
|
-
logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
|
|
1050
|
-
}
|
|
1051
|
-
}
|
|
1052
|
-
} else if (analysis.action === RecoveryAction.SEND_GIT_GUIDANCE) {
|
|
1053
|
-
// Send guidance message to agent for git issues
|
|
1054
|
-
const interventionPath = safeJoin(laneRunDirs[laneName]!, 'intervention.txt');
|
|
1055
|
-
|
|
1056
|
-
// Determine which guidance to send based on the failure type
|
|
1057
|
-
let guidance: string;
|
|
1058
|
-
if (analysis.type === FailureType.GIT_PUSH_REJECTED) {
|
|
1059
|
-
guidance = getGitPushFailureGuidance();
|
|
1060
|
-
} else if (analysis.type === FailureType.MERGE_CONFLICT) {
|
|
1061
|
-
guidance = getMergeConflictGuidance();
|
|
1062
|
-
} else {
|
|
1063
|
-
guidance = getGitErrorGuidance(analysis.message);
|
|
1064
|
-
}
|
|
1065
|
-
|
|
1066
|
-
try {
|
|
1067
|
-
fs.writeFileSync(interventionPath, guidance);
|
|
1068
|
-
info.lastActivity = now;
|
|
1069
|
-
logger.info(`[${laneName}] Sent git issue guidance to agent`);
|
|
1070
|
-
} catch (e: any) {
|
|
1071
|
-
logger.error(`[${laneName}] Failed to send guidance: ${e.message}`);
|
|
1072
|
-
}
|
|
956
|
+
|
|
957
|
+
// Handle special case: RUN_DOCTOR needs async operations
|
|
958
|
+
if (analysis.action === RecoveryAction.RUN_DOCTOR) {
|
|
959
|
+
await handleDoctorDiagnostics(
|
|
960
|
+
laneName,
|
|
961
|
+
laneRunDirs[laneName]!,
|
|
962
|
+
runId,
|
|
963
|
+
runRoot,
|
|
964
|
+
stallService,
|
|
965
|
+
info.child
|
|
966
|
+
);
|
|
1073
967
|
}
|
|
968
|
+
|
|
969
|
+
// Sync restartCount back to lane info (for restart logic in process exit handler)
|
|
970
|
+
lane.restartCount = stallService.getRestartCount(laneName);
|
|
1074
971
|
}
|
|
1075
972
|
}
|
|
1076
973
|
continue;
|
|
@@ -1080,8 +977,11 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
1080
977
|
running.delete(finished.name);
|
|
1081
978
|
exitCodes[finished.name] = finished.code;
|
|
1082
979
|
|
|
1083
|
-
//
|
|
1084
|
-
|
|
980
|
+
// Get stall state before unregistering
|
|
981
|
+
const stallPhase = stallService.getPhase(finished.name);
|
|
982
|
+
|
|
983
|
+
// Unregister from stall detection service
|
|
984
|
+
stallService.unregisterLane(finished.name);
|
|
1085
985
|
|
|
1086
986
|
if (finished.code === 0) {
|
|
1087
987
|
completedLanes.add(finished.name);
|
|
@@ -1111,8 +1011,8 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
1111
1011
|
logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
|
|
1112
1012
|
}
|
|
1113
1013
|
} else {
|
|
1114
|
-
// Check if it was a restart request
|
|
1115
|
-
if (
|
|
1014
|
+
// Check if it was a restart request (RESTART_REQUESTED phase)
|
|
1015
|
+
if (stallPhase === StallPhase.RESTART_REQUESTED) {
|
|
1116
1016
|
logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
|
|
1117
1017
|
|
|
1118
1018
|
// Update startIndex from current state to resume from the same task
|
|
@@ -1133,7 +1033,7 @@ export async function orchestrate(tasksDir: string, options: {
|
|
|
1133
1033
|
failedLanes.add(finished.name);
|
|
1134
1034
|
|
|
1135
1035
|
let errorMsg = 'Process exited with non-zero code';
|
|
1136
|
-
if (
|
|
1036
|
+
if (stallPhase >= StallPhase.DIAGNOSED) {
|
|
1137
1037
|
errorMsg = 'Stopped due to repeated stall';
|
|
1138
1038
|
} else if (info.logManager) {
|
|
1139
1039
|
const lastError = info.logManager.getLastError();
|