@litmers/cursorflow-orchestrator 0.1.31 → 0.1.36
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/README.md +182 -59
- package/commands/cursorflow-add.md +159 -0
- package/commands/cursorflow-doctor.md +45 -23
- package/commands/cursorflow-monitor.md +23 -2
- package/commands/cursorflow-new.md +87 -0
- package/commands/cursorflow-run.md +60 -111
- package/dist/cli/add.d.ts +7 -0
- package/dist/cli/add.js +377 -0
- package/dist/cli/add.js.map +1 -0
- package/dist/cli/clean.js +1 -0
- package/dist/cli/clean.js.map +1 -1
- package/dist/cli/config.d.ts +7 -0
- package/dist/cli/config.js +181 -0
- package/dist/cli/config.js.map +1 -0
- package/dist/cli/doctor.js +47 -4
- package/dist/cli/doctor.js.map +1 -1
- package/dist/cli/index.js +34 -30
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/logs.js +17 -34
- package/dist/cli/logs.js.map +1 -1
- package/dist/cli/monitor.js +62 -65
- package/dist/cli/monitor.js.map +1 -1
- package/dist/cli/new.d.ts +7 -0
- package/dist/cli/new.js +232 -0
- package/dist/cli/new.js.map +1 -0
- package/dist/cli/prepare.js +95 -193
- package/dist/cli/prepare.js.map +1 -1
- package/dist/cli/resume.js +57 -68
- package/dist/cli/resume.js.map +1 -1
- package/dist/cli/run.js +60 -30
- package/dist/cli/run.js.map +1 -1
- package/dist/cli/stop.js +6 -0
- package/dist/cli/stop.js.map +1 -1
- package/dist/cli/tasks.d.ts +5 -3
- package/dist/cli/tasks.js +181 -29
- package/dist/cli/tasks.js.map +1 -1
- package/dist/core/failure-policy.d.ts +9 -0
- package/dist/core/failure-policy.js +9 -0
- package/dist/core/failure-policy.js.map +1 -1
- package/dist/core/orchestrator.d.ts +20 -6
- package/dist/core/orchestrator.js +215 -334
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/runner/agent.d.ts +27 -0
- package/dist/core/runner/agent.js +294 -0
- package/dist/core/runner/agent.js.map +1 -0
- package/dist/core/runner/index.d.ts +5 -0
- package/dist/core/runner/index.js +22 -0
- package/dist/core/runner/index.js.map +1 -0
- package/dist/core/runner/pipeline.d.ts +9 -0
- package/dist/core/runner/pipeline.js +539 -0
- package/dist/core/runner/pipeline.js.map +1 -0
- package/dist/core/runner/prompt.d.ts +25 -0
- package/dist/core/runner/prompt.js +175 -0
- package/dist/core/runner/prompt.js.map +1 -0
- package/dist/core/runner/task.d.ts +26 -0
- package/dist/core/runner/task.js +283 -0
- package/dist/core/runner/task.js.map +1 -0
- package/dist/core/runner/utils.d.ts +37 -0
- package/dist/core/runner/utils.js +161 -0
- package/dist/core/runner/utils.js.map +1 -0
- package/dist/core/runner.d.ts +2 -96
- package/dist/core/runner.js +11 -1136
- package/dist/core/runner.js.map +1 -1
- package/dist/core/stall-detection.d.ts +326 -0
- package/dist/core/stall-detection.js +781 -0
- package/dist/core/stall-detection.js.map +1 -0
- package/dist/services/logging/console.js +2 -1
- package/dist/services/logging/console.js.map +1 -1
- package/dist/types/config.d.ts +6 -6
- package/dist/types/flow.d.ts +84 -0
- package/dist/types/flow.js +10 -0
- package/dist/types/flow.js.map +1 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/index.js +3 -3
- package/dist/types/index.js.map +1 -1
- package/dist/types/lane.d.ts +0 -2
- package/dist/types/logging.d.ts +5 -1
- package/dist/types/task.d.ts +7 -11
- package/dist/utils/config.d.ts +5 -1
- package/dist/utils/config.js +15 -16
- package/dist/utils/config.js.map +1 -1
- package/dist/utils/dependency.d.ts +36 -1
- package/dist/utils/dependency.js +256 -1
- package/dist/utils/dependency.js.map +1 -1
- package/dist/utils/doctor.js +40 -8
- package/dist/utils/doctor.js.map +1 -1
- package/dist/utils/enhanced-logger.d.ts +45 -82
- package/dist/utils/enhanced-logger.js +239 -844
- package/dist/utils/enhanced-logger.js.map +1 -1
- package/dist/utils/flow.d.ts +9 -0
- package/dist/utils/flow.js +73 -0
- package/dist/utils/flow.js.map +1 -0
- package/dist/utils/git.d.ts +29 -0
- package/dist/utils/git.js +115 -5
- package/dist/utils/git.js.map +1 -1
- package/dist/utils/state.js +0 -2
- package/dist/utils/state.js.map +1 -1
- package/dist/utils/task-service.d.ts +2 -2
- package/dist/utils/task-service.js +40 -31
- package/dist/utils/task-service.js.map +1 -1
- package/package.json +4 -3
- package/src/cli/add.ts +397 -0
- package/src/cli/clean.ts +1 -0
- package/src/cli/config.ts +177 -0
- package/src/cli/doctor.ts +48 -4
- package/src/cli/index.ts +36 -32
- package/src/cli/logs.ts +20 -33
- package/src/cli/monitor.ts +70 -75
- package/src/cli/new.ts +235 -0
- package/src/cli/prepare.ts +98 -205
- package/src/cli/resume.ts +61 -76
- package/src/cli/run.ts +333 -306
- package/src/cli/stop.ts +8 -0
- package/src/cli/tasks.ts +200 -21
- package/src/core/failure-policy.ts +9 -0
- package/src/core/orchestrator.ts +279 -379
- package/src/core/runner/agent.ts +314 -0
- package/src/core/runner/index.ts +6 -0
- package/src/core/runner/pipeline.ts +567 -0
- package/src/core/runner/prompt.ts +174 -0
- package/src/core/runner/task.ts +320 -0
- package/src/core/runner/utils.ts +142 -0
- package/src/core/runner.ts +8 -1347
- package/src/core/stall-detection.ts +936 -0
- package/src/services/logging/console.ts +2 -1
- package/src/types/config.ts +6 -6
- package/src/types/flow.ts +91 -0
- package/src/types/index.ts +15 -3
- package/src/types/lane.ts +0 -2
- package/src/types/logging.ts +5 -1
- package/src/types/task.ts +7 -11
- package/src/utils/config.ts +16 -17
- package/src/utils/dependency.ts +311 -2
- package/src/utils/doctor.ts +36 -8
- package/src/utils/enhanced-logger.ts +264 -927
- package/src/utils/flow.ts +42 -0
- package/src/utils/git.ts +145 -5
- package/src/utils/state.ts +0 -2
- package/src/utils/task-service.ts +48 -40
- package/commands/cursorflow-review.md +0 -56
- package/commands/cursorflow-runs.md +0 -59
- package/dist/cli/runs.d.ts +0 -5
- package/dist/cli/runs.js +0 -214
- package/dist/cli/runs.js.map +0 -1
- package/dist/core/reviewer.d.ts +0 -66
- package/dist/core/reviewer.js +0 -265
- package/dist/core/reviewer.js.map +0 -1
- package/src/cli/runs.ts +0 -212
- package/src/core/reviewer.ts +0 -285
|
@@ -60,15 +60,12 @@ const child_process_2 = require("child_process");
|
|
|
60
60
|
const path_1 = require("../utils/path");
|
|
61
61
|
const enhanced_logger_1 = require("../utils/enhanced-logger");
|
|
62
62
|
const log_formatter_1 = require("../utils/log-formatter");
|
|
63
|
-
const failure_policy_1 = require("./failure-policy");
|
|
64
63
|
const auto_recovery_1 = require("./auto-recovery");
|
|
65
|
-
const
|
|
64
|
+
const stall_detection_1 = require("./stall-detection");
|
|
66
65
|
const health_1 = require("../utils/health");
|
|
67
|
-
const checkpoint_1 = require("../utils/checkpoint");
|
|
68
66
|
const lock_1 = require("../utils/lock");
|
|
69
67
|
/** Default stall detection configuration - 2 minute idle timeout for recovery */
|
|
70
68
|
const DEFAULT_ORCHESTRATOR_STALL_CONFIG = {
|
|
71
|
-
...failure_policy_1.DEFAULT_STALL_CONFIG,
|
|
72
69
|
idleTimeoutMs: 2 * 60 * 1000, // 2 minutes (idle detection for continue signal)
|
|
73
70
|
progressTimeoutMs: 10 * 60 * 1000, // 10 minutes (only triggers if no activity at all)
|
|
74
71
|
maxRestarts: 2,
|
|
@@ -94,10 +91,93 @@ function logFileTail(filePath, lines = 10) {
|
|
|
94
91
|
// Ignore log reading errors
|
|
95
92
|
}
|
|
96
93
|
}
|
|
94
|
+
/**
|
|
95
|
+
* Handle RUN_DOCTOR action - runs async health diagnostics
|
|
96
|
+
*/
|
|
97
|
+
async function handleDoctorDiagnostics(laneName, laneRunDir, runId, runRoot, stallService, child) {
|
|
98
|
+
// Import health check dynamically to avoid circular dependency
|
|
99
|
+
const { checkAgentHealth, checkAuthHealth } = await Promise.resolve().then(() => __importStar(require('../utils/health')));
|
|
100
|
+
const [agentHealth, authHealth] = await Promise.all([
|
|
101
|
+
checkAgentHealth(),
|
|
102
|
+
checkAuthHealth(),
|
|
103
|
+
]);
|
|
104
|
+
const issues = [];
|
|
105
|
+
if (!agentHealth.ok)
|
|
106
|
+
issues.push(`Agent: ${agentHealth.message}`);
|
|
107
|
+
if (!authHealth.ok)
|
|
108
|
+
issues.push(`Auth: ${authHealth.message}`);
|
|
109
|
+
if (issues.length > 0) {
|
|
110
|
+
logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
|
|
111
|
+
}
|
|
112
|
+
else {
|
|
113
|
+
logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
|
|
114
|
+
}
|
|
115
|
+
// Save diagnostic to file
|
|
116
|
+
const diagnosticPath = (0, path_1.safeJoin)(laneRunDir, 'diagnostic.json');
|
|
117
|
+
fs.writeFileSync(diagnosticPath, JSON.stringify({
|
|
118
|
+
timestamp: Date.now(),
|
|
119
|
+
agentHealthy: agentHealth.ok,
|
|
120
|
+
authHealthy: authHealth.ok,
|
|
121
|
+
issues,
|
|
122
|
+
}, null, 2));
|
|
123
|
+
// Kill the process
|
|
124
|
+
try {
|
|
125
|
+
child.kill('SIGKILL');
|
|
126
|
+
}
|
|
127
|
+
catch {
|
|
128
|
+
// Process might already be dead
|
|
129
|
+
}
|
|
130
|
+
logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
|
|
131
|
+
// Save POF for failed recovery
|
|
132
|
+
const stallState = stallService.getState(laneName);
|
|
133
|
+
if (stallState) {
|
|
134
|
+
try {
|
|
135
|
+
const laneStatePath = (0, path_1.safeJoin)(laneRunDir, 'state.json');
|
|
136
|
+
const laneState = (0, state_1.loadState)(laneStatePath);
|
|
137
|
+
const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
|
|
138
|
+
// Convert stall state to recovery state format for POF
|
|
139
|
+
// Note: StallPhase and RecoveryStage have compatible numeric values (0-5)
|
|
140
|
+
const recoveryState = {
|
|
141
|
+
laneName,
|
|
142
|
+
stage: stallState.phase, // Both enums use 0-5
|
|
143
|
+
lastActivityTime: stallState.lastRealActivityTime,
|
|
144
|
+
lastBytesReceived: stallState.bytesSinceLastCheck,
|
|
145
|
+
totalBytesReceived: stallState.totalBytesReceived,
|
|
146
|
+
lastOutput: stallState.lastOutput,
|
|
147
|
+
restartCount: stallState.restartCount,
|
|
148
|
+
continueSignalsSent: stallState.continueSignalCount,
|
|
149
|
+
lastStageChangeTime: stallState.lastPhaseChangeTime,
|
|
150
|
+
isLongOperation: stallState.isLongOperation,
|
|
151
|
+
failureHistory: stallState.failureHistory.map(f => ({
|
|
152
|
+
timestamp: f.timestamp,
|
|
153
|
+
stage: f.phase, // Both enums use 0-5
|
|
154
|
+
action: f.action,
|
|
155
|
+
message: f.message,
|
|
156
|
+
idleTimeMs: f.idleTimeMs,
|
|
157
|
+
bytesReceived: f.bytesReceived,
|
|
158
|
+
lastOutput: f.lastOutput,
|
|
159
|
+
})),
|
|
160
|
+
};
|
|
161
|
+
const diagnosticInfo = {
|
|
162
|
+
timestamp: Date.now(),
|
|
163
|
+
agentHealthy: agentHealth.ok,
|
|
164
|
+
authHealthy: authHealth.ok,
|
|
165
|
+
systemHealthy: true,
|
|
166
|
+
suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
|
|
167
|
+
details: issues.join('\n') || 'No obvious issues found',
|
|
168
|
+
};
|
|
169
|
+
const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, diagnosticInfo);
|
|
170
|
+
(0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
|
|
171
|
+
}
|
|
172
|
+
catch (pofError) {
|
|
173
|
+
logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
}
|
|
97
177
|
/**
|
|
98
178
|
* Spawn a lane process
|
|
99
179
|
*/
|
|
100
|
-
function spawnLane({ laneName, tasksFile, laneRunDir, executor, startIndex = 0, pipelineBranch, worktreeDir, enhancedLogConfig, noGit = false, onActivity, }) {
|
|
180
|
+
function spawnLane({ laneName, tasksFile, laneRunDir, executor, startIndex = 0, pipelineBranch, worktreeDir, enhancedLogConfig, noGit = false, onActivity, laneIndex = 0, }) {
|
|
101
181
|
fs.mkdirSync(laneRunDir, { recursive: true });
|
|
102
182
|
// Use extension-less resolve to handle both .ts (dev) and .js (dist)
|
|
103
183
|
const runnerPath = require.resolve('./runner');
|
|
@@ -127,17 +207,24 @@ function spawnLane({ laneName, tasksFile, laneRunDir, executor, startIndex = 0,
|
|
|
127
207
|
...process.env,
|
|
128
208
|
};
|
|
129
209
|
if (logConfig.enabled) {
|
|
210
|
+
// Helper to get dynamic lane label like [L1-T1-lanename10]
|
|
211
|
+
const getDynamicLabel = () => {
|
|
212
|
+
const laneNum = `L${laneIndex + 1}`;
|
|
213
|
+
const taskPart = info.currentTaskIndex ? `-T${info.currentTaskIndex}` : '';
|
|
214
|
+
const shortLaneName = laneName.substring(0, 10);
|
|
215
|
+
return `[${laneNum}${taskPart}-${shortLaneName}]`;
|
|
216
|
+
};
|
|
130
217
|
// Create callback for clean console output
|
|
131
218
|
const onParsedMessage = (msg) => {
|
|
132
219
|
if (onActivity)
|
|
133
220
|
onActivity();
|
|
134
221
|
const formatted = (0, log_formatter_1.formatMessageForConsole)(msg, {
|
|
135
|
-
laneLabel:
|
|
222
|
+
laneLabel: getDynamicLabel(),
|
|
136
223
|
includeTimestamp: true
|
|
137
224
|
});
|
|
138
225
|
process.stdout.write(formatted + '\n');
|
|
139
226
|
};
|
|
140
|
-
logManager = (0, enhanced_logger_1.createLogManager)(laneRunDir, laneName, logConfig, onParsedMessage);
|
|
227
|
+
logManager = (0, enhanced_logger_1.createLogManager)(laneRunDir, laneName, logConfig, onParsedMessage, laneIndex);
|
|
141
228
|
logPath = logManager.getLogPaths().clean;
|
|
142
229
|
// Spawn with pipe for enhanced logging
|
|
143
230
|
child = (0, child_process_1.spawn)('node', args, {
|
|
@@ -145,6 +232,15 @@ function spawnLane({ laneName, tasksFile, laneRunDir, executor, startIndex = 0,
|
|
|
145
232
|
env: childEnv,
|
|
146
233
|
detached: false,
|
|
147
234
|
});
|
|
235
|
+
// Initialize info object for stdout handler to use
|
|
236
|
+
const info = {
|
|
237
|
+
child,
|
|
238
|
+
logManager,
|
|
239
|
+
logPath,
|
|
240
|
+
statePath: (0, path_1.safeJoin)(laneRunDir, 'state.json'),
|
|
241
|
+
laneIndex,
|
|
242
|
+
currentTaskIndex: startIndex > 0 ? startIndex + 1 : 0
|
|
243
|
+
};
|
|
148
244
|
// Buffer for non-JSON lines
|
|
149
245
|
let lineBuffer = '';
|
|
150
246
|
// Pipe stdout and stderr through enhanced logger
|
|
@@ -158,25 +254,50 @@ function spawnLane({ laneName, tasksFile, laneRunDir, executor, startIndex = 0,
|
|
|
158
254
|
lineBuffer = lines.pop() || '';
|
|
159
255
|
for (const line of lines) {
|
|
160
256
|
const trimmed = line.trim();
|
|
257
|
+
if (!trimmed)
|
|
258
|
+
continue;
|
|
259
|
+
// Detect task start/progress to update label
|
|
260
|
+
// Example: [1/1] hello-task
|
|
261
|
+
const cleanLine = (0, enhanced_logger_1.stripAnsi)(trimmed);
|
|
262
|
+
const taskMatch = cleanLine.match(/^\s*\[(\d+)\/(\d+)\]\s+(.+)$/);
|
|
263
|
+
if (taskMatch) {
|
|
264
|
+
info.currentTaskIndex = parseInt(taskMatch[1]);
|
|
265
|
+
// Update log manager's task index to keep it in sync for readable log
|
|
266
|
+
if (logManager) {
|
|
267
|
+
logManager.setTask(taskMatch[3].trim(), undefined, info.currentTaskIndex - 1);
|
|
268
|
+
}
|
|
269
|
+
}
|
|
161
270
|
// Show if it's a timestamped log line (starts with [YYYY-MM-DD... or [HH:MM:SS])
|
|
162
271
|
// or if it's NOT a noisy JSON line
|
|
163
|
-
const hasTimestamp = /^\[\d{4}-\d{2}-\d{2}T|\^\[\d{2}:\d{2}:\d{2}\]/.test(trimmed);
|
|
164
272
|
const isJson = trimmed.startsWith('{') || trimmed.includes('{"type"');
|
|
165
273
|
// Filter out heartbeats - they should NOT reset the idle timer
|
|
166
274
|
const isHeartbeat = trimmed.includes('Heartbeat') && trimmed.includes('bytes received');
|
|
167
|
-
if (
|
|
275
|
+
if (!isJson) {
|
|
168
276
|
// Only trigger activity for non-heartbeat lines
|
|
169
277
|
if (onActivity && !isHeartbeat)
|
|
170
278
|
onActivity();
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
279
|
+
const currentLabel = getDynamicLabel();
|
|
280
|
+
const coloredLabel = `${logger.COLORS.magenta}${currentLabel}${logger.COLORS.reset}`;
|
|
281
|
+
// Regex that matches timestamp even if it has ANSI color codes
|
|
282
|
+
// Matches: [24:39:14] or \x1b[90m[24:39:14]\x1b[0m
|
|
283
|
+
const timestampRegex = /^((?:\x1b\[[0-9;]*m)*)\[(\d{4}-\d{2}-\d{2}T|\d{2}:\d{2}:\d{2})\]/;
|
|
284
|
+
const tsMatch = trimmed.match(timestampRegex);
|
|
285
|
+
if (tsMatch) {
|
|
286
|
+
// If line already has timestamp format, just add lane prefix
|
|
287
|
+
// Check if lane label is already present to avoid triple duplication
|
|
288
|
+
if (!trimmed.includes(currentLabel)) {
|
|
289
|
+
// Insert label after the timestamp part
|
|
290
|
+
const tsPart = tsMatch[0];
|
|
291
|
+
const formatted = trimmed.replace(tsPart, `${tsPart} ${coloredLabel}`);
|
|
292
|
+
process.stdout.write(formatted + '\n');
|
|
293
|
+
}
|
|
294
|
+
else {
|
|
295
|
+
process.stdout.write(trimmed + '\n');
|
|
296
|
+
}
|
|
176
297
|
}
|
|
177
298
|
else {
|
|
178
299
|
// Add full prefix: timestamp + lane
|
|
179
|
-
process.stdout.write(`${logger.COLORS.gray}[${new Date().toLocaleTimeString('en-US', { hour12: false })}]${logger.COLORS.reset} ${
|
|
300
|
+
process.stdout.write(`${logger.COLORS.gray}[${new Date().toLocaleTimeString('en-US', { hour12: false })}]${logger.COLORS.reset} ${coloredLabel} ${line}\n`);
|
|
180
301
|
}
|
|
181
302
|
}
|
|
182
303
|
}
|
|
@@ -196,13 +317,15 @@ function spawnLane({ laneName, tasksFile, laneRunDir, executor, startIndex = 0,
|
|
|
196
317
|
trimmed.startsWith('HEAD is now at') ||
|
|
197
318
|
trimmed.includes('actual output');
|
|
198
319
|
const ts = new Date().toLocaleTimeString('en-US', { hour12: false });
|
|
320
|
+
const currentLabel = getDynamicLabel();
|
|
321
|
+
const coloredLabel = `${logger.COLORS.magenta}${currentLabel}${logger.COLORS.reset}`;
|
|
199
322
|
if (isStatus) {
|
|
200
|
-
process.stdout.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${
|
|
323
|
+
process.stdout.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${coloredLabel} ${trimmed}\n`);
|
|
201
324
|
}
|
|
202
325
|
else {
|
|
203
326
|
if (onActivity)
|
|
204
327
|
onActivity();
|
|
205
|
-
process.stderr.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${
|
|
328
|
+
process.stderr.write(`${logger.COLORS.gray}[${ts}]${logger.COLORS.reset} ${coloredLabel} ${logger.COLORS.red}❌ ERR ${trimmed}${logger.COLORS.reset}\n`);
|
|
206
329
|
}
|
|
207
330
|
}
|
|
208
331
|
}
|
|
@@ -212,10 +335,11 @@ function spawnLane({ laneName, tasksFile, laneRunDir, executor, startIndex = 0,
|
|
|
212
335
|
child.on('exit', () => {
|
|
213
336
|
logManager?.close();
|
|
214
337
|
});
|
|
338
|
+
return { child, logPath, logManager, info };
|
|
215
339
|
}
|
|
216
340
|
else {
|
|
217
341
|
// Fallback to simple file logging
|
|
218
|
-
logPath = (0, path_1.safeJoin)(laneRunDir, 'terminal.log');
|
|
342
|
+
logPath = (0, path_1.safeJoin)(laneRunDir, 'terminal-readable.log');
|
|
219
343
|
const logFd = fs.openSync(logPath, 'a');
|
|
220
344
|
child = (0, child_process_1.spawn)('node', args, {
|
|
221
345
|
stdio: ['ignore', logFd, logFd],
|
|
@@ -228,8 +352,18 @@ function spawnLane({ laneName, tasksFile, laneRunDir, executor, startIndex = 0,
|
|
|
228
352
|
catch {
|
|
229
353
|
// Ignore
|
|
230
354
|
}
|
|
355
|
+
return {
|
|
356
|
+
child,
|
|
357
|
+
logPath,
|
|
358
|
+
logManager,
|
|
359
|
+
info: {
|
|
360
|
+
child,
|
|
361
|
+
logPath,
|
|
362
|
+
statePath: (0, path_1.safeJoin)(laneRunDir, 'state.json'),
|
|
363
|
+
laneIndex
|
|
364
|
+
}
|
|
365
|
+
};
|
|
231
366
|
}
|
|
232
|
-
return { child, logPath, logManager };
|
|
233
367
|
}
|
|
234
368
|
/**
|
|
235
369
|
* Wait for child process to exit
|
|
@@ -245,7 +379,7 @@ function waitChild(proc) {
|
|
|
245
379
|
});
|
|
246
380
|
}
|
|
247
381
|
/**
|
|
248
|
-
* List lane task files in directory
|
|
382
|
+
* List lane task files in directory
|
|
249
383
|
*/
|
|
250
384
|
function listLaneFiles(tasksDir) {
|
|
251
385
|
if (!fs.existsSync(tasksDir)) {
|
|
@@ -253,23 +387,14 @@ function listLaneFiles(tasksDir) {
|
|
|
253
387
|
}
|
|
254
388
|
const files = fs.readdirSync(tasksDir);
|
|
255
389
|
return files
|
|
256
|
-
.filter(f => f.endsWith('.json'))
|
|
390
|
+
.filter(f => f.endsWith('.json') && f !== 'flow.meta.json')
|
|
257
391
|
.sort()
|
|
258
392
|
.map(f => {
|
|
259
393
|
const filePath = (0, path_1.safeJoin)(tasksDir, f);
|
|
260
394
|
const name = path.basename(f, '.json');
|
|
261
|
-
let dependsOn = [];
|
|
262
|
-
try {
|
|
263
|
-
const config = JSON.parse(fs.readFileSync(filePath, 'utf8'));
|
|
264
|
-
dependsOn = config.dependsOn || [];
|
|
265
|
-
}
|
|
266
|
-
catch (e) {
|
|
267
|
-
logger.warn(`Failed to parse config for lane ${name}: ${e}`);
|
|
268
|
-
}
|
|
269
395
|
return {
|
|
270
396
|
name,
|
|
271
397
|
path: filePath,
|
|
272
|
-
dependsOn,
|
|
273
398
|
};
|
|
274
399
|
});
|
|
275
400
|
}
|
|
@@ -284,8 +409,7 @@ function printLaneStatus(lanes, laneRunDirs) {
|
|
|
284
409
|
const statePath = (0, path_1.safeJoin)(dir, 'state.json');
|
|
285
410
|
const state = (0, state_1.loadState)(statePath);
|
|
286
411
|
if (!state) {
|
|
287
|
-
|
|
288
|
-
return { lane: lane.name, status: isWaiting ? 'waiting' : 'pending', task: '-' };
|
|
412
|
+
return { lane: lane.name, status: 'pending', task: '-' };
|
|
289
413
|
}
|
|
290
414
|
const idx = (state.currentTaskIndex || 0) + 1;
|
|
291
415
|
return {
|
|
@@ -322,11 +446,11 @@ async function resolveAllDependencies(blockedLanes, allLanes, laneRunDirs, pipel
|
|
|
322
446
|
const state = (0, state_1.loadState)(statePath);
|
|
323
447
|
const worktreeDir = state?.worktreeDir || (0, path_1.safeJoin)(runRoot, 'resolution-worktree');
|
|
324
448
|
if (!fs.existsSync(worktreeDir)) {
|
|
325
|
-
logger.info(
|
|
449
|
+
logger.info(`🏗️ Creating resolution worktree at ${worktreeDir}`);
|
|
326
450
|
git.createWorktree(worktreeDir, pipelineBranch, { baseBranch: git.getCurrentBranch() });
|
|
327
451
|
}
|
|
328
452
|
// 3. Resolve on pipeline branch
|
|
329
|
-
logger.info(
|
|
453
|
+
logger.info(`🔄 Resolving dependencies on branch ${pipelineBranch}`);
|
|
330
454
|
git.runGit(['checkout', pipelineBranch], { cwd: worktreeDir });
|
|
331
455
|
for (const cmd of uniqueCommands) {
|
|
332
456
|
logger.info(`Running: ${cmd}`);
|
|
@@ -416,28 +540,9 @@ async function orchestrate(tasksDir, options = {}) {
|
|
|
416
540
|
}
|
|
417
541
|
logger.success('✓ Preflight checks passed');
|
|
418
542
|
}
|
|
419
|
-
// Validate dependencies and detect cycles
|
|
420
|
-
logger.section('📊 Dependency Analysis');
|
|
421
|
-
const depInfos = lanes.map(l => ({
|
|
422
|
-
name: l.name,
|
|
423
|
-
dependsOn: l.dependsOn,
|
|
424
|
-
}));
|
|
425
|
-
const depValidation = (0, dependency_1.validateDependencies)(depInfos);
|
|
426
|
-
if (!depValidation.valid) {
|
|
427
|
-
logger.error('❌ Dependency validation failed:');
|
|
428
|
-
for (const err of depValidation.errors) {
|
|
429
|
-
logger.error(` • ${err}`);
|
|
430
|
-
}
|
|
431
|
-
throw new Error('Invalid dependency configuration');
|
|
432
|
-
}
|
|
433
|
-
if (depValidation.warnings.length > 0) {
|
|
434
|
-
for (const warn of depValidation.warnings) {
|
|
435
|
-
logger.warn(`⚠️ ${warn}`);
|
|
436
|
-
}
|
|
437
|
-
}
|
|
438
|
-
// Print dependency graph
|
|
439
|
-
(0, dependency_1.printDependencyGraph)(depInfos);
|
|
440
543
|
const config = (0, config_1.loadConfig)();
|
|
544
|
+
// Set verbose git logging from config
|
|
545
|
+
git.setVerboseGit(config.verboseGit || false);
|
|
441
546
|
const logsDir = (0, config_1.getLogsDir)(config);
|
|
442
547
|
const runId = `run-${Date.now()}`;
|
|
443
548
|
// Use absolute path for runRoot to avoid issues with subfolders
|
|
@@ -458,16 +563,11 @@ async function orchestrate(tasksDir, options = {}) {
|
|
|
458
563
|
}
|
|
459
564
|
const randomSuffix = Math.random().toString(36).substring(2, 7);
|
|
460
565
|
const pipelineBranch = `cursorflow/run-${Date.now().toString(36)}-${randomSuffix}`;
|
|
461
|
-
//
|
|
462
|
-
const
|
|
566
|
+
// Initialize unified stall detection service (Single Source of Truth)
|
|
567
|
+
const stallService = (0, stall_detection_1.getStallService)({
|
|
463
568
|
...DEFAULT_ORCHESTRATOR_STALL_CONFIG,
|
|
464
569
|
...options.stallConfig,
|
|
465
|
-
|
|
466
|
-
// Initialize auto-recovery manager
|
|
467
|
-
const autoRecoveryManager = (0, auto_recovery_1.getAutoRecoveryManager)({
|
|
468
|
-
...auto_recovery_1.DEFAULT_AUTO_RECOVERY_CONFIG,
|
|
469
|
-
idleTimeoutMs: stallConfig.idleTimeoutMs, // Sync with stall config
|
|
470
|
-
...options.autoRecoveryConfig,
|
|
570
|
+
verbose: process.env['DEBUG_STALL'] === 'true',
|
|
471
571
|
});
|
|
472
572
|
// Initialize event system
|
|
473
573
|
events_1.events.setRunId(runId);
|
|
@@ -512,6 +612,7 @@ async function orchestrate(tasksDir, options = {}) {
|
|
|
512
612
|
fs.mkdirSync(worktreeParent, { recursive: true });
|
|
513
613
|
}
|
|
514
614
|
laneWorktreeDirs[lane.name] = laneWorktreeDir;
|
|
615
|
+
logger.info(`🏗️ Initializing lane ${lane.name}: branch=${lanePipelineBranch}`);
|
|
515
616
|
const initialState = (0, state_1.createLaneState)(lane.name, taskConfig, lane.path, {
|
|
516
617
|
pipelineBranch: lanePipelineBranch,
|
|
517
618
|
worktreeDir: laneWorktreeDir
|
|
@@ -526,19 +627,6 @@ async function orchestrate(tasksDir, options = {}) {
|
|
|
526
627
|
logger.info(`Tasks directory: ${tasksDir}`);
|
|
527
628
|
logger.info(`Run directory: ${runRoot}`);
|
|
528
629
|
logger.info(`Lanes: ${lanes.length}`);
|
|
529
|
-
// Display dependency graph
|
|
530
|
-
logger.info('\n📊 Dependency Graph:');
|
|
531
|
-
for (const lane of lanes) {
|
|
532
|
-
const deps = lane.dependsOn.length > 0 ? ` [depends on: ${lane.dependsOn.join(', ')}]` : '';
|
|
533
|
-
console.log(` ${logger.COLORS.cyan}${lane.name}${logger.COLORS.reset}${deps}`);
|
|
534
|
-
// Simple tree-like visualization for deep dependencies
|
|
535
|
-
if (lane.dependsOn.length > 0) {
|
|
536
|
-
for (const dep of lane.dependsOn) {
|
|
537
|
-
console.log(` └─ ${dep}`);
|
|
538
|
-
}
|
|
539
|
-
}
|
|
540
|
-
}
|
|
541
|
-
console.log('');
|
|
542
630
|
// Disable auto-resolve when noGit mode is enabled
|
|
543
631
|
const autoResolve = !options.noGit && options.autoResolveDependencies !== false;
|
|
544
632
|
if (options.noGit) {
|
|
@@ -568,28 +656,12 @@ async function orchestrate(tasksDir, options = {}) {
|
|
|
568
656
|
let lastStallCheck = Date.now();
|
|
569
657
|
try {
|
|
570
658
|
while (completedLanes.size + failedLanes.size + blockedLanes.size < lanes.length || (blockedLanes.size > 0 && running.size === 0)) {
|
|
571
|
-
// 1. Identify lanes ready to start
|
|
659
|
+
// 1. Identify lanes ready to start (all lanes can start immediately - no lane-level dependencies)
|
|
572
660
|
const readyToStart = lanes.filter(lane => {
|
|
573
661
|
// Not already running or completed or failed or blocked
|
|
574
662
|
if (running.has(lane.name) || completedLanes.has(lane.name) || failedLanes.has(lane.name) || blockedLanes.has(lane.name)) {
|
|
575
663
|
return false;
|
|
576
664
|
}
|
|
577
|
-
// Check dependencies
|
|
578
|
-
for (const dep of lane.dependsOn) {
|
|
579
|
-
if (failedLanes.has(dep)) {
|
|
580
|
-
logger.error(`Lane ${lane.name} will not start because dependency ${dep} failed`);
|
|
581
|
-
failedLanes.add(lane.name);
|
|
582
|
-
exitCodes[lane.name] = 1;
|
|
583
|
-
return false;
|
|
584
|
-
}
|
|
585
|
-
if (blockedLanes.has(dep)) {
|
|
586
|
-
// If a dependency is blocked, wait
|
|
587
|
-
return false;
|
|
588
|
-
}
|
|
589
|
-
if (!completedLanes.has(dep)) {
|
|
590
|
-
return false;
|
|
591
|
-
}
|
|
592
|
-
}
|
|
593
665
|
return true;
|
|
594
666
|
});
|
|
595
667
|
// 2. Spawn ready lanes up to maxConcurrent
|
|
@@ -604,22 +676,19 @@ async function orchestrate(tasksDir, options = {}) {
|
|
|
604
676
|
}
|
|
605
677
|
logger.info(`Lane started: ${lane.name}${lane.startIndex ? ` (resuming from ${lane.startIndex})` : ''}`);
|
|
606
678
|
const now = Date.now();
|
|
607
|
-
//
|
|
679
|
+
// Register lane with unified stall detection service FIRST
|
|
680
|
+
stallService.registerLane(lane.name, {
|
|
681
|
+
laneRunDir: laneRunDirs[lane.name],
|
|
682
|
+
});
|
|
683
|
+
const laneIdx = lanes.findIndex(l => l.name === lane.name);
|
|
684
|
+
// Pre-register lane in running map
|
|
608
685
|
running.set(lane.name, {
|
|
609
686
|
child: {}, // Placeholder, will be replaced below
|
|
610
687
|
logManager: undefined,
|
|
611
688
|
logPath: '',
|
|
612
|
-
lastActivity: now,
|
|
613
|
-
lastStateUpdate: now,
|
|
614
|
-
stallPhase: 0,
|
|
615
|
-
taskStartTime: now,
|
|
616
|
-
lastOutput: '',
|
|
617
689
|
statePath: laneStatePath,
|
|
618
|
-
|
|
619
|
-
lastBytesCheck: 0,
|
|
620
|
-
continueSignalsSent: 0,
|
|
690
|
+
laneIndex: laneIdx >= 0 ? laneIdx : 0,
|
|
621
691
|
});
|
|
622
|
-
let lastOutput = '';
|
|
623
692
|
const spawnResult = spawnLane({
|
|
624
693
|
laneName: lane.name,
|
|
625
694
|
tasksFile: lane.path,
|
|
@@ -630,48 +699,35 @@ async function orchestrate(tasksDir, options = {}) {
|
|
|
630
699
|
worktreeDir: laneWorktreeDirs[lane.name],
|
|
631
700
|
enhancedLogConfig: options.enhancedLogging,
|
|
632
701
|
noGit: options.noGit,
|
|
702
|
+
laneIndex: laneIdx >= 0 ? laneIdx : 0,
|
|
633
703
|
onActivity: () => {
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
const actNow = Date.now();
|
|
637
|
-
info.lastActivity = actNow;
|
|
638
|
-
info.lastStateUpdate = actNow;
|
|
639
|
-
info.stallPhase = 0;
|
|
640
|
-
}
|
|
704
|
+
// Record state file update activity
|
|
705
|
+
stallService.recordStateUpdate(lane.name);
|
|
641
706
|
}
|
|
642
707
|
});
|
|
643
708
|
// Update with actual spawn result
|
|
644
709
|
const existingInfo = running.get(lane.name);
|
|
645
|
-
Object.assign(existingInfo, spawnResult);
|
|
646
|
-
//
|
|
710
|
+
Object.assign(existingInfo, spawnResult.info);
|
|
711
|
+
// Update stall service with child process reference
|
|
712
|
+
stallService.setChildProcess(lane.name, spawnResult.child);
|
|
713
|
+
// Track stdout for activity detection - delegate to StallDetectionService
|
|
647
714
|
if (spawnResult.child.stdout) {
|
|
648
715
|
spawnResult.child.stdout.on('data', (data) => {
|
|
649
|
-
const
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
info.lastOutput = lastRealLine;
|
|
662
|
-
info.bytesReceived += data.length;
|
|
663
|
-
// Update auto-recovery manager with real activity
|
|
664
|
-
autoRecoveryManager.recordActivity(lane.name, data.length, info.lastOutput);
|
|
665
|
-
}
|
|
666
|
-
else if (lines.length > 0) {
|
|
667
|
-
// Only heartbeats received - do NOT update lastActivity (keep stall timer running)
|
|
668
|
-
autoRecoveryManager.recordActivity(lane.name, 0, info.lastOutput);
|
|
669
|
-
}
|
|
716
|
+
const output = data.toString();
|
|
717
|
+
const lines = output.split('\n').filter(l => l.trim());
|
|
718
|
+
// Filter out heartbeats from activity tracking
|
|
719
|
+
const realLines = lines.filter(line => !(line.includes('Heartbeat') && line.includes('bytes received')));
|
|
720
|
+
if (realLines.length > 0) {
|
|
721
|
+
// Real activity - record with bytes
|
|
722
|
+
const lastRealLine = realLines[realLines.length - 1];
|
|
723
|
+
stallService.recordActivity(lane.name, data.length, lastRealLine);
|
|
724
|
+
}
|
|
725
|
+
else if (lines.length > 0) {
|
|
726
|
+
// Heartbeat only - record with 0 bytes (won't reset timer)
|
|
727
|
+
stallService.recordActivity(lane.name, 0);
|
|
670
728
|
}
|
|
671
729
|
});
|
|
672
730
|
}
|
|
673
|
-
// Register lane with auto-recovery manager
|
|
674
|
-
autoRecoveryManager.registerLane(lane.name);
|
|
675
731
|
// Update lane tracking
|
|
676
732
|
lane.taskStartTime = now;
|
|
677
733
|
events_1.events.emit('lane.started', {
|
|
@@ -697,212 +753,35 @@ async function orchestrate(tasksDir, options = {}) {
|
|
|
697
753
|
const now = Date.now();
|
|
698
754
|
if (result.name === '__poll__' || (now - lastStallCheck >= 10000)) {
|
|
699
755
|
lastStallCheck = now;
|
|
700
|
-
// Periodic stall check
|
|
756
|
+
// Periodic stall check using unified StallDetectionService
|
|
701
757
|
for (const [laneName, info] of running.entries()) {
|
|
702
|
-
const idleTime = now - info.lastActivity;
|
|
703
758
|
const lane = lanes.find(l => l.name === laneName);
|
|
704
|
-
if (process.env['DEBUG_STALL']) {
|
|
705
|
-
logger.debug(`[${laneName}] Stall check: idle=${Math.round(idleTime / 1000)}s, bytesDelta=${info.bytesReceived - info.lastBytesCheck}, phase=${info.stallPhase}`);
|
|
706
|
-
}
|
|
707
759
|
// Check state file for progress updates
|
|
708
|
-
let progressTime = 0;
|
|
709
760
|
try {
|
|
710
761
|
const stateStat = fs.statSync(info.statePath);
|
|
711
|
-
const
|
|
712
|
-
if (
|
|
713
|
-
|
|
762
|
+
const stallState = stallService.getState(laneName);
|
|
763
|
+
if (stallState && stateStat.mtimeMs > stallState.lastStateUpdateTime) {
|
|
764
|
+
stallService.recordStateUpdate(laneName);
|
|
714
765
|
}
|
|
715
|
-
progressTime = now - info.lastStateUpdate;
|
|
716
766
|
}
|
|
717
767
|
catch {
|
|
718
768
|
// State file might not exist yet
|
|
719
769
|
}
|
|
720
|
-
//
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
lastOutput: info.lastOutput,
|
|
729
|
-
restartCount: lane.restartCount || 0,
|
|
730
|
-
taskStartTimeMs: info.taskStartTime,
|
|
731
|
-
bytesReceived: bytesDelta, // Bytes since last check
|
|
732
|
-
continueSignalsSent: info.continueSignalsSent,
|
|
733
|
-
}, stallConfig);
|
|
734
|
-
// Only act if action is not NONE
|
|
735
|
-
if (analysis.action !== failure_policy_1.RecoveryAction.NONE) {
|
|
736
|
-
(0, failure_policy_1.logFailure)(laneName, analysis);
|
|
770
|
+
// Debug logging
|
|
771
|
+
if (process.env['DEBUG_STALL']) {
|
|
772
|
+
logger.debug(`[${laneName}] ${stallService.dumpState(laneName)}`);
|
|
773
|
+
}
|
|
774
|
+
// Run stall analysis and recovery (all logic is in StallDetectionService)
|
|
775
|
+
const analysis = stallService.checkAndRecover(laneName);
|
|
776
|
+
// Log to lane log manager if there was an action
|
|
777
|
+
if (analysis.action !== stall_detection_1.RecoveryAction.NONE) {
|
|
737
778
|
info.logManager?.log('error', analysis.message);
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
fs.writeFileSync(interventionPath, 'continue');
|
|
742
|
-
info.stallPhase = 1;
|
|
743
|
-
info.lastActivity = now;
|
|
744
|
-
info.continueSignalsSent++;
|
|
745
|
-
logger.info(`[${laneName}] Sent continue signal (#${info.continueSignalsSent})`);
|
|
746
|
-
events_1.events.emit('recovery.continue_signal', {
|
|
747
|
-
laneName,
|
|
748
|
-
idleSeconds: Math.round(idleTime / 1000),
|
|
749
|
-
signalCount: info.continueSignalsSent,
|
|
750
|
-
});
|
|
751
|
-
}
|
|
752
|
-
catch (e) {
|
|
753
|
-
logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
|
|
754
|
-
}
|
|
755
|
-
}
|
|
756
|
-
else if (analysis.action === failure_policy_1.RecoveryAction.STRONGER_PROMPT) {
|
|
757
|
-
const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
|
|
758
|
-
const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck. Please continue with your current task immediately. If you're waiting for something, explain what you need and proceed with what you can do now. If you've completed the task, summarize your work and finish.`;
|
|
759
|
-
try {
|
|
760
|
-
fs.writeFileSync(interventionPath, strongerPrompt);
|
|
761
|
-
info.stallPhase = 2;
|
|
762
|
-
info.lastActivity = now;
|
|
763
|
-
logger.warn(`[${laneName}] Sent stronger prompt after continue signal failed`);
|
|
764
|
-
events_1.events.emit('recovery.stronger_prompt', { laneName });
|
|
765
|
-
}
|
|
766
|
-
catch (e) {
|
|
767
|
-
logger.error(`Failed to write intervention file for ${laneName}: ${e}`);
|
|
768
|
-
}
|
|
769
|
-
}
|
|
770
|
-
else if (analysis.action === failure_policy_1.RecoveryAction.KILL_AND_RESTART ||
|
|
771
|
-
analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE ||
|
|
772
|
-
analysis.action === failure_policy_1.RecoveryAction.RESTART_LANE_FROM_CHECKPOINT) {
|
|
773
|
-
lane.restartCount = (lane.restartCount || 0) + 1;
|
|
774
|
-
info.stallPhase = 3;
|
|
775
|
-
// Try to get checkpoint info
|
|
776
|
-
const checkpoint = (0, checkpoint_1.getLatestCheckpoint)(laneRunDirs[laneName]);
|
|
777
|
-
if (checkpoint) {
|
|
778
|
-
logger.info(`[${laneName}] Checkpoint available: ${checkpoint.id} (task ${checkpoint.taskIndex})`);
|
|
779
|
-
}
|
|
780
|
-
// Kill the process
|
|
781
|
-
try {
|
|
782
|
-
info.child.kill('SIGKILL');
|
|
783
|
-
}
|
|
784
|
-
catch {
|
|
785
|
-
// Process might already be dead
|
|
786
|
-
}
|
|
787
|
-
logger.warn(`[${laneName}] Killing and restarting lane (restart #${lane.restartCount})`);
|
|
788
|
-
events_1.events.emit('recovery.restart', {
|
|
789
|
-
laneName,
|
|
790
|
-
restartCount: lane.restartCount,
|
|
791
|
-
maxRestarts: stallConfig.maxRestarts,
|
|
792
|
-
});
|
|
793
|
-
}
|
|
794
|
-
else if (analysis.action === failure_policy_1.RecoveryAction.RUN_DOCTOR) {
|
|
795
|
-
info.stallPhase = 4;
|
|
796
|
-
// Run diagnostics
|
|
797
|
-
logger.error(`[${laneName}] Running diagnostics due to persistent failures...`);
|
|
798
|
-
// Import health check dynamically to avoid circular dependency
|
|
799
|
-
const { checkAgentHealth, checkAuthHealth } = await Promise.resolve().then(() => __importStar(require('../utils/health')));
|
|
800
|
-
const [agentHealth, authHealth] = await Promise.all([
|
|
801
|
-
checkAgentHealth(),
|
|
802
|
-
checkAuthHealth(),
|
|
803
|
-
]);
|
|
804
|
-
const issues = [];
|
|
805
|
-
if (!agentHealth.ok)
|
|
806
|
-
issues.push(`Agent: ${agentHealth.message}`);
|
|
807
|
-
if (!authHealth.ok)
|
|
808
|
-
issues.push(`Auth: ${authHealth.message}`);
|
|
809
|
-
if (issues.length > 0) {
|
|
810
|
-
logger.error(`[${laneName}] Diagnostic issues found:\n ${issues.join('\n ')}`);
|
|
811
|
-
}
|
|
812
|
-
else {
|
|
813
|
-
logger.warn(`[${laneName}] No obvious issues found. The problem may be with the AI model or network.`);
|
|
814
|
-
}
|
|
815
|
-
// Save diagnostic to file
|
|
816
|
-
const diagnosticPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'diagnostic.json');
|
|
817
|
-
fs.writeFileSync(diagnosticPath, JSON.stringify({
|
|
818
|
-
timestamp: Date.now(),
|
|
819
|
-
agentHealthy: agentHealth.ok,
|
|
820
|
-
authHealthy: authHealth.ok,
|
|
821
|
-
issues,
|
|
822
|
-
analysis,
|
|
823
|
-
}, null, 2));
|
|
824
|
-
// Kill the process
|
|
825
|
-
try {
|
|
826
|
-
info.child.kill('SIGKILL');
|
|
827
|
-
}
|
|
828
|
-
catch {
|
|
829
|
-
// Process might already be dead
|
|
830
|
-
}
|
|
831
|
-
logger.error(`[${laneName}] Aborting lane after diagnostic. Check ${diagnosticPath} for details.`);
|
|
832
|
-
// Save POF for failed recovery
|
|
833
|
-
const recoveryState = autoRecoveryManager.getState(laneName);
|
|
834
|
-
if (recoveryState) {
|
|
835
|
-
try {
|
|
836
|
-
const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
|
|
837
|
-
const laneState = (0, state_1.loadState)(laneStatePath);
|
|
838
|
-
const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
|
|
839
|
-
const diagnosticInfo = {
|
|
840
|
-
timestamp: Date.now(),
|
|
841
|
-
agentHealthy: agentHealth.ok,
|
|
842
|
-
authHealthy: authHealth.ok,
|
|
843
|
-
systemHealthy: true,
|
|
844
|
-
suggestedAction: issues.length > 0 ? 'Fix the issues above and retry' : 'Try with a different model',
|
|
845
|
-
details: issues.join('\n') || 'No obvious issues found',
|
|
846
|
-
};
|
|
847
|
-
const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, diagnosticInfo);
|
|
848
|
-
(0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
|
|
849
|
-
}
|
|
850
|
-
catch (pofError) {
|
|
851
|
-
logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
|
|
852
|
-
}
|
|
853
|
-
}
|
|
854
|
-
events_1.events.emit('recovery.diagnosed', {
|
|
855
|
-
laneName,
|
|
856
|
-
diagnostic: { agentHealthy: agentHealth.ok, authHealthy: authHealth.ok, issues },
|
|
857
|
-
});
|
|
858
|
-
}
|
|
859
|
-
else if (analysis.action === failure_policy_1.RecoveryAction.ABORT_LANE) {
|
|
860
|
-
info.stallPhase = 5;
|
|
861
|
-
try {
|
|
862
|
-
info.child.kill('SIGKILL');
|
|
863
|
-
}
|
|
864
|
-
catch {
|
|
865
|
-
// Process might already be dead
|
|
866
|
-
}
|
|
867
|
-
logger.error(`[${laneName}] Aborting lane due to repeated stalls`);
|
|
868
|
-
// Save POF for failed recovery
|
|
869
|
-
const recoveryState = autoRecoveryManager.getState(laneName);
|
|
870
|
-
if (recoveryState) {
|
|
871
|
-
try {
|
|
872
|
-
const laneStatePath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'state.json');
|
|
873
|
-
const laneState = (0, state_1.loadState)(laneStatePath);
|
|
874
|
-
const pofDir = (0, path_1.safeJoin)(runRoot, '..', '..', 'pof');
|
|
875
|
-
const pofEntry = (0, auto_recovery_1.createPOFFromRecoveryState)(runId, runRoot, laneName, recoveryState, laneState, recoveryState.diagnosticInfo);
|
|
876
|
-
(0, auto_recovery_1.savePOF)(runId, pofDir, pofEntry);
|
|
877
|
-
}
|
|
878
|
-
catch (pofError) {
|
|
879
|
-
logger.warn(`[${laneName}] Failed to save POF: ${pofError.message}`);
|
|
880
|
-
}
|
|
881
|
-
}
|
|
882
|
-
}
|
|
883
|
-
else if (analysis.action === failure_policy_1.RecoveryAction.SEND_GIT_GUIDANCE) {
|
|
884
|
-
// Send guidance message to agent for git issues
|
|
885
|
-
const interventionPath = (0, path_1.safeJoin)(laneRunDirs[laneName], 'intervention.txt');
|
|
886
|
-
// Determine which guidance to send based on the failure type
|
|
887
|
-
let guidance;
|
|
888
|
-
if (analysis.type === failure_policy_1.FailureType.GIT_PUSH_REJECTED) {
|
|
889
|
-
guidance = (0, auto_recovery_1.getGitPushFailureGuidance)();
|
|
890
|
-
}
|
|
891
|
-
else if (analysis.type === failure_policy_1.FailureType.MERGE_CONFLICT) {
|
|
892
|
-
guidance = (0, auto_recovery_1.getMergeConflictGuidance)();
|
|
893
|
-
}
|
|
894
|
-
else {
|
|
895
|
-
guidance = (0, auto_recovery_1.getGitErrorGuidance)(analysis.message);
|
|
896
|
-
}
|
|
897
|
-
try {
|
|
898
|
-
fs.writeFileSync(interventionPath, guidance);
|
|
899
|
-
info.lastActivity = now;
|
|
900
|
-
logger.info(`[${laneName}] Sent git issue guidance to agent`);
|
|
901
|
-
}
|
|
902
|
-
catch (e) {
|
|
903
|
-
logger.error(`[${laneName}] Failed to send guidance: ${e.message}`);
|
|
904
|
-
}
|
|
779
|
+
// Handle special case: RUN_DOCTOR needs async operations
|
|
780
|
+
if (analysis.action === stall_detection_1.RecoveryAction.RUN_DOCTOR) {
|
|
781
|
+
await handleDoctorDiagnostics(laneName, laneRunDirs[laneName], runId, runRoot, stallService, info.child);
|
|
905
782
|
}
|
|
783
|
+
// Sync restartCount back to lane info (for restart logic in process exit handler)
|
|
784
|
+
lane.restartCount = stallService.getRestartCount(laneName);
|
|
906
785
|
}
|
|
907
786
|
}
|
|
908
787
|
continue;
|
|
@@ -912,8 +791,10 @@ async function orchestrate(tasksDir, options = {}) {
|
|
|
912
791
|
const info = running.get(finished.name);
|
|
913
792
|
running.delete(finished.name);
|
|
914
793
|
exitCodes[finished.name] = finished.code;
|
|
915
|
-
//
|
|
916
|
-
|
|
794
|
+
// Get stall state before unregistering
|
|
795
|
+
const stallPhase = stallService.getPhase(finished.name);
|
|
796
|
+
// Unregister from stall detection service
|
|
797
|
+
stallService.unregisterLane(finished.name);
|
|
917
798
|
if (finished.code === 0) {
|
|
918
799
|
completedLanes.add(finished.name);
|
|
919
800
|
events_1.events.emit('lane.completed', {
|
|
@@ -943,8 +824,8 @@ async function orchestrate(tasksDir, options = {}) {
|
|
|
943
824
|
}
|
|
944
825
|
}
|
|
945
826
|
else {
|
|
946
|
-
// Check if it was a restart request
|
|
947
|
-
if (
|
|
827
|
+
// Check if it was a restart request (RESTART_REQUESTED phase)
|
|
828
|
+
if (stallPhase === stall_detection_1.StallPhase.RESTART_REQUESTED) {
|
|
948
829
|
logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
|
|
949
830
|
// Update startIndex from current state to resume from the same task
|
|
950
831
|
const statePath = (0, path_1.safeJoin)(laneRunDirs[finished.name], 'state.json');
|
|
@@ -961,7 +842,7 @@ async function orchestrate(tasksDir, options = {}) {
|
|
|
961
842
|
}
|
|
962
843
|
failedLanes.add(finished.name);
|
|
963
844
|
let errorMsg = 'Process exited with non-zero code';
|
|
964
|
-
if (
|
|
845
|
+
if (stallPhase >= stall_detection_1.StallPhase.DIAGNOSED) {
|
|
965
846
|
errorMsg = 'Stopped due to repeated stall';
|
|
966
847
|
}
|
|
967
848
|
else if (info.logManager) {
|