@litmers/cursorflow-orchestrator 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -0
- package/dist/cli/index.js +0 -6
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/monitor.js +18 -2
- package/dist/cli/monitor.js.map +1 -1
- package/dist/cli/signal.js +33 -29
- package/dist/cli/signal.js.map +1 -1
- package/dist/core/auto-recovery.d.ts +2 -117
- package/dist/core/auto-recovery.js +4 -487
- package/dist/core/auto-recovery.js.map +1 -1
- package/dist/core/failure-policy.d.ts +0 -52
- package/dist/core/failure-policy.js +7 -174
- package/dist/core/failure-policy.js.map +1 -1
- package/dist/core/intervention.d.ts +0 -6
- package/dist/core/intervention.js +1 -17
- package/dist/core/intervention.js.map +1 -1
- package/dist/core/orchestrator.js +10 -3
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/runner/agent.js +18 -15
- package/dist/core/runner/agent.js.map +1 -1
- package/dist/core/stall-detection.js +9 -7
- package/dist/core/stall-detection.js.map +1 -1
- package/package.json +2 -13
- package/src/cli/index.ts +0 -6
- package/src/cli/monitor.ts +18 -2
- package/src/cli/signal.ts +38 -34
- package/src/core/auto-recovery.ts +13 -595
- package/src/core/failure-policy.ts +7 -228
- package/src/core/intervention.ts +0 -18
- package/src/core/orchestrator.ts +13 -3
- package/src/core/runner/agent.ts +21 -16
- package/src/core/stall-detection.ts +11 -9
- package/dist/cli/prepare.d.ts +0 -7
- package/dist/cli/prepare.js +0 -690
- package/dist/cli/prepare.js.map +0 -1
- package/src/cli/prepare.ts +0 -777
|
@@ -8,49 +8,12 @@
|
|
|
8
8
|
* - Doctor integration for persistent failures
|
|
9
9
|
* - POF (Post-mortem of Failure) saving for failed recoveries
|
|
10
10
|
*/
|
|
11
|
-
import { ChildProcess } from 'child_process';
|
|
12
11
|
import { LaneState } from '../utils/types';
|
|
13
|
-
/** Recovery stages for escalating interventions */
|
|
14
|
-
export declare enum RecoveryStage {
|
|
15
|
-
/** Normal operation - monitoring */
|
|
16
|
-
NORMAL = 0,
|
|
17
|
-
/** First intervention - send continue signal */
|
|
18
|
-
CONTINUE_SIGNAL = 1,
|
|
19
|
-
/** Second intervention - send stronger prompt */
|
|
20
|
-
STRONGER_PROMPT = 2,
|
|
21
|
-
/** Third intervention - kill and restart process */
|
|
22
|
-
RESTART_PROCESS = 3,
|
|
23
|
-
/** Final stage - run doctor and report */
|
|
24
|
-
DIAGNOSE = 4,
|
|
25
|
-
/** No more recovery possible */
|
|
26
|
-
ABORT = 5
|
|
27
|
-
}
|
|
28
|
-
/** Configuration for auto-recovery behavior */
|
|
29
|
-
export interface AutoRecoveryConfig {
|
|
30
|
-
/** Time without activity before sending continue signal (default: 2 minutes) */
|
|
31
|
-
idleTimeoutMs: number;
|
|
32
|
-
/** Time to wait after continue signal before escalating (default: 2 minutes) */
|
|
33
|
-
continueGraceMs: number;
|
|
34
|
-
/** Time to wait after stronger prompt before escalating (default: 2 minutes) */
|
|
35
|
-
strongerPromptGraceMs: number;
|
|
36
|
-
/** Maximum number of restarts before aborting (default: 2) */
|
|
37
|
-
maxRestarts: number;
|
|
38
|
-
/** Whether to run doctor on persistent failures (default: true) */
|
|
39
|
-
runDoctorOnFailure: boolean;
|
|
40
|
-
/** Patterns indicating long-running operations (won't trigger idle) */
|
|
41
|
-
longOperationPatterns: RegExp[];
|
|
42
|
-
/** Grace period for long operations (default: 10 minutes) */
|
|
43
|
-
longOperationGraceMs: number;
|
|
44
|
-
/** Enable verbose logging */
|
|
45
|
-
verbose: boolean;
|
|
46
|
-
}
|
|
47
|
-
/** Default auto-recovery configuration */
|
|
48
|
-
export declare const DEFAULT_AUTO_RECOVERY_CONFIG: AutoRecoveryConfig;
|
|
49
12
|
/** State tracking for a single lane's recovery */
|
|
50
13
|
export interface LaneRecoveryState {
|
|
51
14
|
laneName: string;
|
|
52
15
|
runId: string;
|
|
53
|
-
stage: RecoveryStage;
|
|
16
|
+
stage: number;
|
|
54
17
|
lastActivityTime: number;
|
|
55
18
|
lastBytesReceived: number;
|
|
56
19
|
totalBytesReceived: number;
|
|
@@ -71,19 +34,10 @@ export interface DiagnosticInfo {
|
|
|
71
34
|
suggestedAction: string;
|
|
72
35
|
details: string;
|
|
73
36
|
}
|
|
74
|
-
/** Recovery action result */
|
|
75
|
-
export interface RecoveryActionResult {
|
|
76
|
-
success: boolean;
|
|
77
|
-
action: string;
|
|
78
|
-
message: string;
|
|
79
|
-
shouldContinue: boolean;
|
|
80
|
-
nextStage?: RecoveryStage;
|
|
81
|
-
diagnostic?: DiagnosticInfo;
|
|
82
|
-
}
|
|
83
37
|
/** Record of a failure for POF */
|
|
84
38
|
export interface FailureRecord {
|
|
85
39
|
timestamp: number;
|
|
86
|
-
stage: RecoveryStage;
|
|
40
|
+
stage: number;
|
|
87
41
|
action: string;
|
|
88
42
|
message: string;
|
|
89
43
|
idleTimeMs: number;
|
|
@@ -126,67 +80,6 @@ export declare function getGitPushFailureGuidance(): string;
|
|
|
126
80
|
export declare function getMergeConflictGuidance(): string;
|
|
127
81
|
/** Generate guidance message for general git error */
|
|
128
82
|
export declare function getGitErrorGuidance(errorMessage: string): string;
|
|
129
|
-
/**
|
|
130
|
-
* Manages recovery state for all lanes
|
|
131
|
-
*/
|
|
132
|
-
export declare class AutoRecoveryManager {
|
|
133
|
-
private config;
|
|
134
|
-
private laneStates;
|
|
135
|
-
private eventHandlers;
|
|
136
|
-
constructor(config?: Partial<AutoRecoveryConfig>);
|
|
137
|
-
/**
|
|
138
|
-
* Register a lane for recovery monitoring
|
|
139
|
-
*/
|
|
140
|
-
registerLane(laneName: string, runId: string): void;
|
|
141
|
-
/**
|
|
142
|
-
* Unregister a lane from recovery monitoring
|
|
143
|
-
*/
|
|
144
|
-
unregisterLane(laneName: string): void;
|
|
145
|
-
/**
|
|
146
|
-
* Record activity for a lane
|
|
147
|
-
*/
|
|
148
|
-
recordActivity(laneName: string, bytesReceived?: number, output?: string): void;
|
|
149
|
-
/**
|
|
150
|
-
* Get current recovery state for a lane
|
|
151
|
-
*/
|
|
152
|
-
getState(laneName: string): LaneRecoveryState | undefined;
|
|
153
|
-
/**
|
|
154
|
-
* Check if a lane needs recovery intervention
|
|
155
|
-
*/
|
|
156
|
-
needsIntervention(laneName: string): boolean;
|
|
157
|
-
/**
|
|
158
|
-
* Get the next recovery action for a lane
|
|
159
|
-
*/
|
|
160
|
-
getRecoveryAction(laneName: string, laneRunDir: string, child?: ChildProcess): Promise<RecoveryActionResult>;
|
|
161
|
-
/**
|
|
162
|
-
* Send a continue signal to the lane
|
|
163
|
-
*/
|
|
164
|
-
private sendContinueSignal;
|
|
165
|
-
/**
|
|
166
|
-
* Send a stronger prompt to nudge the agent
|
|
167
|
-
*/
|
|
168
|
-
private sendStrongerPrompt;
|
|
169
|
-
/**
|
|
170
|
-
* Request process restart
|
|
171
|
-
*/
|
|
172
|
-
private requestRestart;
|
|
173
|
-
/**
|
|
174
|
-
* Run diagnostic checks
|
|
175
|
-
*/
|
|
176
|
-
private runDiagnosis;
|
|
177
|
-
/**
|
|
178
|
-
* Get failure history for a lane
|
|
179
|
-
*/
|
|
180
|
-
getFailureHistory(laneName: string): FailureRecord[];
|
|
181
|
-
/**
|
|
182
|
-
* Get configuration
|
|
183
|
-
*/
|
|
184
|
-
getConfig(): AutoRecoveryConfig;
|
|
185
|
-
/**
|
|
186
|
-
* Update configuration
|
|
187
|
-
*/
|
|
188
|
-
updateConfig(config: Partial<AutoRecoveryConfig>): void;
|
|
189
|
-
}
|
|
190
83
|
/**
|
|
191
84
|
* Save a POF entry to the pof directory
|
|
192
85
|
*/
|
|
@@ -203,11 +96,3 @@ export declare function loadPOF(pofDir: string, runId: string): POFEntry | null;
|
|
|
203
96
|
* List all POF files in a directory
|
|
204
97
|
*/
|
|
205
98
|
export declare function listPOFs(pofDir: string): string[];
|
|
206
|
-
/**
|
|
207
|
-
* Get or create the default auto-recovery manager
|
|
208
|
-
*/
|
|
209
|
-
export declare function getAutoRecoveryManager(config?: Partial<AutoRecoveryConfig>): AutoRecoveryManager;
|
|
210
|
-
/**
|
|
211
|
-
* Reset the default manager (for testing)
|
|
212
|
-
*/
|
|
213
|
-
export declare function resetAutoRecoveryManager(): void;
|
|
@@ -43,7 +43,6 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
43
43
|
};
|
|
44
44
|
})();
|
|
45
45
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
46
|
-
exports.AutoRecoveryManager = exports.DEFAULT_AUTO_RECOVERY_CONFIG = exports.RecoveryStage = void 0;
|
|
47
46
|
exports.getGitPushFailureGuidance = getGitPushFailureGuidance;
|
|
48
47
|
exports.getMergeConflictGuidance = getMergeConflictGuidance;
|
|
49
48
|
exports.getGitErrorGuidance = getGitErrorGuidance;
|
|
@@ -51,54 +50,9 @@ exports.savePOF = savePOF;
|
|
|
51
50
|
exports.createPOFFromRecoveryState = createPOFFromRecoveryState;
|
|
52
51
|
exports.loadPOF = loadPOF;
|
|
53
52
|
exports.listPOFs = listPOFs;
|
|
54
|
-
exports.getAutoRecoveryManager = getAutoRecoveryManager;
|
|
55
|
-
exports.resetAutoRecoveryManager = resetAutoRecoveryManager;
|
|
56
53
|
const fs = __importStar(require("fs"));
|
|
57
54
|
const logger = __importStar(require("../utils/logger"));
|
|
58
|
-
const events_1 = require("../utils/events");
|
|
59
55
|
const path_1 = require("../utils/path");
|
|
60
|
-
const health_1 = require("../utils/health");
|
|
61
|
-
// ============================================================================
|
|
62
|
-
// Types & Constants
|
|
63
|
-
// ============================================================================
|
|
64
|
-
/** Recovery stages for escalating interventions */
|
|
65
|
-
var RecoveryStage;
|
|
66
|
-
(function (RecoveryStage) {
|
|
67
|
-
/** Normal operation - monitoring */
|
|
68
|
-
RecoveryStage[RecoveryStage["NORMAL"] = 0] = "NORMAL";
|
|
69
|
-
/** First intervention - send continue signal */
|
|
70
|
-
RecoveryStage[RecoveryStage["CONTINUE_SIGNAL"] = 1] = "CONTINUE_SIGNAL";
|
|
71
|
-
/** Second intervention - send stronger prompt */
|
|
72
|
-
RecoveryStage[RecoveryStage["STRONGER_PROMPT"] = 2] = "STRONGER_PROMPT";
|
|
73
|
-
/** Third intervention - kill and restart process */
|
|
74
|
-
RecoveryStage[RecoveryStage["RESTART_PROCESS"] = 3] = "RESTART_PROCESS";
|
|
75
|
-
/** Final stage - run doctor and report */
|
|
76
|
-
RecoveryStage[RecoveryStage["DIAGNOSE"] = 4] = "DIAGNOSE";
|
|
77
|
-
/** No more recovery possible */
|
|
78
|
-
RecoveryStage[RecoveryStage["ABORT"] = 5] = "ABORT";
|
|
79
|
-
})(RecoveryStage || (exports.RecoveryStage = RecoveryStage = {}));
|
|
80
|
-
/** Default auto-recovery configuration */
|
|
81
|
-
exports.DEFAULT_AUTO_RECOVERY_CONFIG = {
|
|
82
|
-
idleTimeoutMs: 2 * 60 * 1000, // 2 minutes - idle detection
|
|
83
|
-
continueGraceMs: 2 * 60 * 1000, // 2 minutes after continue
|
|
84
|
-
strongerPromptGraceMs: 2 * 60 * 1000, // 2 minutes after stronger prompt
|
|
85
|
-
maxRestarts: 2,
|
|
86
|
-
runDoctorOnFailure: true,
|
|
87
|
-
longOperationPatterns: [
|
|
88
|
-
/installing\s+dependencies/i,
|
|
89
|
-
/npm\s+(i|install|ci)/i,
|
|
90
|
-
/pnpm\s+(i|install)/i,
|
|
91
|
-
/yarn\s+(install)?/i,
|
|
92
|
-
/building/i,
|
|
93
|
-
/compiling/i,
|
|
94
|
-
/bundling/i,
|
|
95
|
-
/downloading/i,
|
|
96
|
-
/fetching/i,
|
|
97
|
-
/cloning/i,
|
|
98
|
-
],
|
|
99
|
-
longOperationGraceMs: 10 * 60 * 1000, // 10 minutes for long ops
|
|
100
|
-
verbose: false,
|
|
101
|
-
};
|
|
102
56
|
// ============================================================================
|
|
103
57
|
// Guidance Messages for Git Issues
|
|
104
58
|
// ============================================================================
|
|
@@ -160,427 +114,7 @@ ${errorMessage}
|
|
|
160
114
|
작업을 계속 진행해주세요.`;
|
|
161
115
|
}
|
|
162
116
|
// ============================================================================
|
|
163
|
-
//
|
|
164
|
-
// ============================================================================
|
|
165
|
-
/**
|
|
166
|
-
* Manages recovery state for all lanes
|
|
167
|
-
*/
|
|
168
|
-
class AutoRecoveryManager {
|
|
169
|
-
config;
|
|
170
|
-
laneStates = new Map();
|
|
171
|
-
eventHandlers = new Map();
|
|
172
|
-
constructor(config = {}) {
|
|
173
|
-
this.config = { ...exports.DEFAULT_AUTO_RECOVERY_CONFIG, ...config };
|
|
174
|
-
}
|
|
175
|
-
/**
|
|
176
|
-
* Register a lane for recovery monitoring
|
|
177
|
-
*/
|
|
178
|
-
registerLane(laneName, runId) {
|
|
179
|
-
const now = Date.now();
|
|
180
|
-
this.laneStates.set(laneName, {
|
|
181
|
-
laneName,
|
|
182
|
-
runId,
|
|
183
|
-
stage: RecoveryStage.NORMAL,
|
|
184
|
-
lastActivityTime: now,
|
|
185
|
-
lastBytesReceived: 0,
|
|
186
|
-
totalBytesReceived: 0,
|
|
187
|
-
lastOutput: '',
|
|
188
|
-
restartCount: 0,
|
|
189
|
-
continueSignalsSent: 0,
|
|
190
|
-
lastStageChangeTime: now,
|
|
191
|
-
isLongOperation: false,
|
|
192
|
-
failureHistory: [],
|
|
193
|
-
});
|
|
194
|
-
if (this.config.verbose) {
|
|
195
|
-
logger.info(`[AutoRecovery] Registered lane: ${laneName}`);
|
|
196
|
-
}
|
|
197
|
-
}
|
|
198
|
-
/**
|
|
199
|
-
* Unregister a lane from recovery monitoring
|
|
200
|
-
*/
|
|
201
|
-
unregisterLane(laneName) {
|
|
202
|
-
this.laneStates.delete(laneName);
|
|
203
|
-
const handler = this.eventHandlers.get(laneName);
|
|
204
|
-
if (handler) {
|
|
205
|
-
this.eventHandlers.delete(laneName);
|
|
206
|
-
}
|
|
207
|
-
}
|
|
208
|
-
/**
|
|
209
|
-
* Record activity for a lane
|
|
210
|
-
*/
|
|
211
|
-
recordActivity(laneName, bytesReceived = 0, output) {
|
|
212
|
-
const state = this.laneStates.get(laneName);
|
|
213
|
-
if (!state)
|
|
214
|
-
return;
|
|
215
|
-
const now = Date.now();
|
|
216
|
-
// Only update activity time if we actually received bytes
|
|
217
|
-
// This allows heartbeats to be recorded (for logs/bytes) without resetting the idle timer
|
|
218
|
-
if (bytesReceived > 0) {
|
|
219
|
-
state.lastActivityTime = now;
|
|
220
|
-
state.lastBytesReceived = bytesReceived;
|
|
221
|
-
state.totalBytesReceived += bytesReceived;
|
|
222
|
-
}
|
|
223
|
-
if (output) {
|
|
224
|
-
state.lastOutput = output;
|
|
225
|
-
// Check if this is a long operation
|
|
226
|
-
state.isLongOperation = this.config.longOperationPatterns.some(p => p.test(output));
|
|
227
|
-
}
|
|
228
|
-
// Reset stage if we got meaningful activity
|
|
229
|
-
if (bytesReceived > 0 && state.stage !== RecoveryStage.NORMAL) {
|
|
230
|
-
if (this.config.verbose) {
|
|
231
|
-
logger.info(`[AutoRecovery] [${laneName}] Activity detected, resetting to NORMAL stage`);
|
|
232
|
-
}
|
|
233
|
-
state.stage = RecoveryStage.NORMAL;
|
|
234
|
-
state.lastStageChangeTime = now;
|
|
235
|
-
}
|
|
236
|
-
}
|
|
237
|
-
/**
|
|
238
|
-
* Get current recovery state for a lane
|
|
239
|
-
*/
|
|
240
|
-
getState(laneName) {
|
|
241
|
-
return this.laneStates.get(laneName);
|
|
242
|
-
}
|
|
243
|
-
/**
|
|
244
|
-
* Check if a lane needs recovery intervention
|
|
245
|
-
*/
|
|
246
|
-
needsIntervention(laneName) {
|
|
247
|
-
const state = this.laneStates.get(laneName);
|
|
248
|
-
if (!state)
|
|
249
|
-
return false;
|
|
250
|
-
const now = Date.now();
|
|
251
|
-
const idleTime = now - state.lastActivityTime;
|
|
252
|
-
// Use longer timeout for long operations
|
|
253
|
-
const effectiveTimeout = state.isLongOperation
|
|
254
|
-
? this.config.longOperationGraceMs
|
|
255
|
-
: this.config.idleTimeoutMs;
|
|
256
|
-
// Check based on current stage
|
|
257
|
-
switch (state.stage) {
|
|
258
|
-
case RecoveryStage.NORMAL:
|
|
259
|
-
return idleTime > effectiveTimeout;
|
|
260
|
-
case RecoveryStage.CONTINUE_SIGNAL:
|
|
261
|
-
return (now - state.lastStageChangeTime) > this.config.continueGraceMs;
|
|
262
|
-
case RecoveryStage.STRONGER_PROMPT:
|
|
263
|
-
return (now - state.lastStageChangeTime) > this.config.strongerPromptGraceMs;
|
|
264
|
-
case RecoveryStage.RESTART_PROCESS:
|
|
265
|
-
// After restart, use normal timeout to detect if it's working
|
|
266
|
-
return idleTime > effectiveTimeout;
|
|
267
|
-
case RecoveryStage.DIAGNOSE:
|
|
268
|
-
case RecoveryStage.ABORT:
|
|
269
|
-
return false; // No more interventions
|
|
270
|
-
default:
|
|
271
|
-
return false;
|
|
272
|
-
}
|
|
273
|
-
}
|
|
274
|
-
/**
|
|
275
|
-
* Get the next recovery action for a lane
|
|
276
|
-
*/
|
|
277
|
-
async getRecoveryAction(laneName, laneRunDir, child) {
|
|
278
|
-
const state = this.laneStates.get(laneName);
|
|
279
|
-
if (!state) {
|
|
280
|
-
return {
|
|
281
|
-
success: false,
|
|
282
|
-
action: 'none',
|
|
283
|
-
message: 'Lane not registered',
|
|
284
|
-
shouldContinue: false,
|
|
285
|
-
};
|
|
286
|
-
}
|
|
287
|
-
const now = Date.now();
|
|
288
|
-
const idleTime = now - state.lastActivityTime;
|
|
289
|
-
const idleSeconds = Math.round(idleTime / 1000);
|
|
290
|
-
switch (state.stage) {
|
|
291
|
-
case RecoveryStage.NORMAL:
|
|
292
|
-
// Escalate to CONTINUE_SIGNAL
|
|
293
|
-
return await this.sendContinueSignal(laneName, laneRunDir, state, idleSeconds);
|
|
294
|
-
case RecoveryStage.CONTINUE_SIGNAL:
|
|
295
|
-
// Try a stronger prompt
|
|
296
|
-
return await this.sendStrongerPrompt(laneName, laneRunDir, state);
|
|
297
|
-
case RecoveryStage.STRONGER_PROMPT:
|
|
298
|
-
// Try restarting the process
|
|
299
|
-
if (state.restartCount < this.config.maxRestarts) {
|
|
300
|
-
return await this.requestRestart(laneName, state, child);
|
|
301
|
-
}
|
|
302
|
-
// Fall through to diagnose
|
|
303
|
-
state.stage = RecoveryStage.DIAGNOSE;
|
|
304
|
-
state.lastStageChangeTime = now;
|
|
305
|
-
return await this.runDiagnosis(laneName, laneRunDir, state);
|
|
306
|
-
case RecoveryStage.RESTART_PROCESS:
|
|
307
|
-
// After restart, if still no response, diagnose
|
|
308
|
-
if (state.restartCount >= this.config.maxRestarts) {
|
|
309
|
-
state.stage = RecoveryStage.DIAGNOSE;
|
|
310
|
-
state.lastStageChangeTime = now;
|
|
311
|
-
return await this.runDiagnosis(laneName, laneRunDir, state);
|
|
312
|
-
}
|
|
313
|
-
// Try continue signal again after restart
|
|
314
|
-
return await this.sendContinueSignal(laneName, laneRunDir, state, idleSeconds);
|
|
315
|
-
case RecoveryStage.DIAGNOSE:
|
|
316
|
-
// Final stage - abort
|
|
317
|
-
state.stage = RecoveryStage.ABORT;
|
|
318
|
-
state.lastStageChangeTime = now;
|
|
319
|
-
return {
|
|
320
|
-
success: false,
|
|
321
|
-
action: 'abort',
|
|
322
|
-
message: `Lane ${laneName} failed after all recovery attempts`,
|
|
323
|
-
shouldContinue: false,
|
|
324
|
-
nextStage: RecoveryStage.ABORT,
|
|
325
|
-
diagnostic: state.diagnosticInfo,
|
|
326
|
-
};
|
|
327
|
-
default:
|
|
328
|
-
return {
|
|
329
|
-
success: false,
|
|
330
|
-
action: 'abort',
|
|
331
|
-
message: 'Recovery exhausted',
|
|
332
|
-
shouldContinue: false,
|
|
333
|
-
};
|
|
334
|
-
}
|
|
335
|
-
}
|
|
336
|
-
/**
|
|
337
|
-
* Send a continue signal to the lane
|
|
338
|
-
*/
|
|
339
|
-
async sendContinueSignal(laneName, laneRunDir, state, idleSeconds) {
|
|
340
|
-
const interventionPath = (0, path_1.safeJoin)(laneRunDir, 'intervention.txt');
|
|
341
|
-
try {
|
|
342
|
-
fs.writeFileSync(interventionPath, 'continue');
|
|
343
|
-
state.stage = RecoveryStage.CONTINUE_SIGNAL;
|
|
344
|
-
state.lastStageChangeTime = Date.now();
|
|
345
|
-
state.continueSignalsSent++;
|
|
346
|
-
// Record failure history
|
|
347
|
-
state.failureHistory.push({
|
|
348
|
-
timestamp: Date.now(),
|
|
349
|
-
stage: RecoveryStage.CONTINUE_SIGNAL,
|
|
350
|
-
action: 'continue_signal',
|
|
351
|
-
message: `Idle for ${idleSeconds}s`,
|
|
352
|
-
idleTimeMs: idleSeconds * 1000,
|
|
353
|
-
bytesReceived: state.totalBytesReceived,
|
|
354
|
-
lastOutput: state.lastOutput,
|
|
355
|
-
});
|
|
356
|
-
const message = `[${laneName}] Idle for ${idleSeconds}s - sent continue signal (#${state.continueSignalsSent})`;
|
|
357
|
-
logger.warn(message);
|
|
358
|
-
events_1.events.emit('recovery.continue_signal', {
|
|
359
|
-
runId: state.runId,
|
|
360
|
-
laneName,
|
|
361
|
-
idleSeconds,
|
|
362
|
-
signalCount: state.continueSignalsSent,
|
|
363
|
-
});
|
|
364
|
-
return {
|
|
365
|
-
success: true,
|
|
366
|
-
action: 'continue_signal',
|
|
367
|
-
message,
|
|
368
|
-
shouldContinue: true,
|
|
369
|
-
nextStage: RecoveryStage.CONTINUE_SIGNAL,
|
|
370
|
-
};
|
|
371
|
-
}
|
|
372
|
-
catch (error) {
|
|
373
|
-
logger.error(`[AutoRecovery] Failed to send continue signal to ${laneName}: ${error.message}`);
|
|
374
|
-
return {
|
|
375
|
-
success: false,
|
|
376
|
-
action: 'continue_signal',
|
|
377
|
-
message: `Failed to send continue signal: ${error.message}`,
|
|
378
|
-
shouldContinue: true,
|
|
379
|
-
};
|
|
380
|
-
}
|
|
381
|
-
}
|
|
382
|
-
/**
|
|
383
|
-
* Send a stronger prompt to nudge the agent
|
|
384
|
-
*/
|
|
385
|
-
async sendStrongerPrompt(laneName, laneRunDir, state) {
|
|
386
|
-
const interventionPath = (0, path_1.safeJoin)(laneRunDir, 'intervention.txt');
|
|
387
|
-
const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck or waiting.
|
|
388
|
-
Please continue with your current task immediately.
|
|
389
|
-
If you're waiting for something, explain what you need and proceed with what you can do now.
|
|
390
|
-
If you've completed the task, please summarize your work and finish.
|
|
391
|
-
If you encountered a git error, resolve it and continue.`;
|
|
392
|
-
try {
|
|
393
|
-
fs.writeFileSync(interventionPath, strongerPrompt);
|
|
394
|
-
state.stage = RecoveryStage.STRONGER_PROMPT;
|
|
395
|
-
state.lastStageChangeTime = Date.now();
|
|
396
|
-
// Record failure history
|
|
397
|
-
state.failureHistory.push({
|
|
398
|
-
timestamp: Date.now(),
|
|
399
|
-
stage: RecoveryStage.STRONGER_PROMPT,
|
|
400
|
-
action: 'stronger_prompt',
|
|
401
|
-
message: 'Still idle after continue signal',
|
|
402
|
-
idleTimeMs: Date.now() - state.lastActivityTime,
|
|
403
|
-
bytesReceived: state.totalBytesReceived,
|
|
404
|
-
lastOutput: state.lastOutput,
|
|
405
|
-
});
|
|
406
|
-
const message = `[${laneName}] Still idle after continue signal - sent stronger prompt`;
|
|
407
|
-
logger.warn(message);
|
|
408
|
-
events_1.events.emit('recovery.stronger_prompt', {
|
|
409
|
-
runId: state.runId,
|
|
410
|
-
laneName,
|
|
411
|
-
prompt: strongerPrompt,
|
|
412
|
-
});
|
|
413
|
-
return {
|
|
414
|
-
success: true,
|
|
415
|
-
action: 'stronger_prompt',
|
|
416
|
-
message,
|
|
417
|
-
shouldContinue: true,
|
|
418
|
-
nextStage: RecoveryStage.STRONGER_PROMPT,
|
|
419
|
-
};
|
|
420
|
-
}
|
|
421
|
-
catch (error) {
|
|
422
|
-
logger.error(`[AutoRecovery] Failed to send stronger prompt to ${laneName}: ${error.message}`);
|
|
423
|
-
return {
|
|
424
|
-
success: false,
|
|
425
|
-
action: 'stronger_prompt',
|
|
426
|
-
message: `Failed to send stronger prompt: ${error.message}`,
|
|
427
|
-
shouldContinue: true,
|
|
428
|
-
};
|
|
429
|
-
}
|
|
430
|
-
}
|
|
431
|
-
/**
|
|
432
|
-
* Request process restart
|
|
433
|
-
*/
|
|
434
|
-
async requestRestart(laneName, state, child) {
|
|
435
|
-
state.restartCount++;
|
|
436
|
-
state.stage = RecoveryStage.RESTART_PROCESS;
|
|
437
|
-
state.lastStageChangeTime = Date.now();
|
|
438
|
-
// Record failure history
|
|
439
|
-
state.failureHistory.push({
|
|
440
|
-
timestamp: Date.now(),
|
|
441
|
-
stage: RecoveryStage.RESTART_PROCESS,
|
|
442
|
-
action: 'restart',
|
|
443
|
-
message: `Restart attempt ${state.restartCount}/${this.config.maxRestarts}`,
|
|
444
|
-
idleTimeMs: Date.now() - state.lastActivityTime,
|
|
445
|
-
bytesReceived: state.totalBytesReceived,
|
|
446
|
-
lastOutput: state.lastOutput,
|
|
447
|
-
});
|
|
448
|
-
// Kill the current process if provided
|
|
449
|
-
if (child && child.pid && !child.killed) {
|
|
450
|
-
try {
|
|
451
|
-
child.kill('SIGKILL');
|
|
452
|
-
logger.info(`[AutoRecovery] [${laneName}] Killed process ${child.pid}`);
|
|
453
|
-
}
|
|
454
|
-
catch (error) {
|
|
455
|
-
logger.warn(`[AutoRecovery] [${laneName}] Failed to kill process: ${error.message}`);
|
|
456
|
-
}
|
|
457
|
-
}
|
|
458
|
-
const message = `[${laneName}] Restarting lane (attempt ${state.restartCount}/${this.config.maxRestarts})`;
|
|
459
|
-
logger.warn(message);
|
|
460
|
-
events_1.events.emit('recovery.restart', {
|
|
461
|
-
runId: state.runId,
|
|
462
|
-
laneName,
|
|
463
|
-
restartCount: state.restartCount,
|
|
464
|
-
maxRestarts: this.config.maxRestarts,
|
|
465
|
-
});
|
|
466
|
-
return {
|
|
467
|
-
success: true,
|
|
468
|
-
action: 'restart',
|
|
469
|
-
message,
|
|
470
|
-
shouldContinue: true,
|
|
471
|
-
nextStage: RecoveryStage.RESTART_PROCESS,
|
|
472
|
-
};
|
|
473
|
-
}
|
|
474
|
-
/**
|
|
475
|
-
* Run diagnostic checks
|
|
476
|
-
*/
|
|
477
|
-
async runDiagnosis(laneName, laneRunDir, state) {
|
|
478
|
-
if (!this.config.runDoctorOnFailure) {
|
|
479
|
-
return {
|
|
480
|
-
success: false,
|
|
481
|
-
action: 'diagnose',
|
|
482
|
-
message: 'Diagnosis skipped (disabled in config)',
|
|
483
|
-
shouldContinue: false,
|
|
484
|
-
};
|
|
485
|
-
}
|
|
486
|
-
logger.info(`[AutoRecovery] [${laneName}] Running diagnostic checks...`);
|
|
487
|
-
try {
|
|
488
|
-
// Run health checks
|
|
489
|
-
const [agentHealth, authHealth] = await Promise.all([
|
|
490
|
-
(0, health_1.checkAgentHealth)(),
|
|
491
|
-
(0, health_1.checkAuthHealth)(),
|
|
492
|
-
]);
|
|
493
|
-
const systemHealth = await (0, health_1.runHealthCheck)({ skipRemote: true, skipAuth: true });
|
|
494
|
-
const diagnostic = {
|
|
495
|
-
timestamp: Date.now(),
|
|
496
|
-
agentHealthy: agentHealth.ok,
|
|
497
|
-
authHealthy: authHealth.ok,
|
|
498
|
-
systemHealthy: systemHealth.healthy,
|
|
499
|
-
suggestedAction: '',
|
|
500
|
-
details: '',
|
|
501
|
-
};
|
|
502
|
-
// Analyze and suggest action
|
|
503
|
-
const issues = [];
|
|
504
|
-
if (!agentHealth.ok) {
|
|
505
|
-
issues.push(`Agent: ${agentHealth.message}`);
|
|
506
|
-
}
|
|
507
|
-
if (!authHealth.ok) {
|
|
508
|
-
issues.push(`Auth: ${authHealth.message}`);
|
|
509
|
-
diagnostic.suggestedAction = 'Please sign in to Cursor IDE and verify authentication';
|
|
510
|
-
}
|
|
511
|
-
if (!systemHealth.healthy) {
|
|
512
|
-
const failedChecks = systemHealth.checks.filter(c => !c.ok);
|
|
513
|
-
issues.push(`System: ${failedChecks.map(c => c.message).join(', ')}`);
|
|
514
|
-
}
|
|
515
|
-
if (issues.length === 0) {
|
|
516
|
-
diagnostic.details = 'All health checks passed. The issue may be with the AI model or network.';
|
|
517
|
-
diagnostic.suggestedAction = 'Try resuming with a different model or wait and retry.';
|
|
518
|
-
}
|
|
519
|
-
else {
|
|
520
|
-
diagnostic.details = issues.join('\n');
|
|
521
|
-
}
|
|
522
|
-
state.diagnosticInfo = diagnostic;
|
|
523
|
-
// Record failure history
|
|
524
|
-
state.failureHistory.push({
|
|
525
|
-
timestamp: Date.now(),
|
|
526
|
-
stage: RecoveryStage.DIAGNOSE,
|
|
527
|
-
action: 'diagnose',
|
|
528
|
-
message: diagnostic.details,
|
|
529
|
-
idleTimeMs: Date.now() - state.lastActivityTime,
|
|
530
|
-
bytesReceived: state.totalBytesReceived,
|
|
531
|
-
lastOutput: state.lastOutput,
|
|
532
|
-
});
|
|
533
|
-
// Save diagnostic to file
|
|
534
|
-
const diagnosticPath = (0, path_1.safeJoin)(laneRunDir, 'diagnostic.json');
|
|
535
|
-
fs.writeFileSync(diagnosticPath, JSON.stringify(diagnostic, null, 2));
|
|
536
|
-
const message = `[${laneName}] Diagnostic complete:\n${diagnostic.details}\nSuggested action: ${diagnostic.suggestedAction}`;
|
|
537
|
-
logger.error(message);
|
|
538
|
-
events_1.events.emit('recovery.diagnosed', {
|
|
539
|
-
runId: state.runId,
|
|
540
|
-
laneName,
|
|
541
|
-
diagnostic,
|
|
542
|
-
});
|
|
543
|
-
return {
|
|
544
|
-
success: true,
|
|
545
|
-
action: 'diagnose',
|
|
546
|
-
message,
|
|
547
|
-
shouldContinue: false,
|
|
548
|
-
diagnostic,
|
|
549
|
-
};
|
|
550
|
-
}
|
|
551
|
-
catch (error) {
|
|
552
|
-
logger.error(`[AutoRecovery] Diagnostic failed: ${error.message}`);
|
|
553
|
-
return {
|
|
554
|
-
success: false,
|
|
555
|
-
action: 'diagnose',
|
|
556
|
-
message: `Diagnostic failed: ${error.message}`,
|
|
557
|
-
shouldContinue: false,
|
|
558
|
-
};
|
|
559
|
-
}
|
|
560
|
-
}
|
|
561
|
-
/**
|
|
562
|
-
* Get failure history for a lane
|
|
563
|
-
*/
|
|
564
|
-
getFailureHistory(laneName) {
|
|
565
|
-
const state = this.laneStates.get(laneName);
|
|
566
|
-
return state?.failureHistory || [];
|
|
567
|
-
}
|
|
568
|
-
/**
|
|
569
|
-
* Get configuration
|
|
570
|
-
*/
|
|
571
|
-
getConfig() {
|
|
572
|
-
return { ...this.config };
|
|
573
|
-
}
|
|
574
|
-
/**
|
|
575
|
-
* Update configuration
|
|
576
|
-
*/
|
|
577
|
-
updateConfig(config) {
|
|
578
|
-
this.config = { ...this.config, ...config };
|
|
579
|
-
}
|
|
580
|
-
}
|
|
581
|
-
exports.AutoRecoveryManager = AutoRecoveryManager;
|
|
582
|
-
// ============================================================================
|
|
583
|
-
// POF (Post-mortem of Failure) Management
|
|
117
|
+
// Post-Mortem of Failure (POF) Management
|
|
584
118
|
// ============================================================================
|
|
585
119
|
/**
|
|
586
120
|
* Save a POF entry to the pof directory
|
|
@@ -721,24 +255,7 @@ function listPOFs(pofDir) {
|
|
|
721
255
|
// ============================================================================
|
|
722
256
|
// Exports
|
|
723
257
|
// ============================================================================
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
* Get or create the default auto-recovery manager
|
|
728
|
-
*/
|
|
729
|
-
function getAutoRecoveryManager(config) {
|
|
730
|
-
if (!defaultManager) {
|
|
731
|
-
defaultManager = new AutoRecoveryManager(config);
|
|
732
|
-
}
|
|
733
|
-
else if (config) {
|
|
734
|
-
defaultManager.updateConfig(config);
|
|
735
|
-
}
|
|
736
|
-
return defaultManager;
|
|
737
|
-
}
|
|
738
|
-
/**
|
|
739
|
-
* Reset the default manager (for testing)
|
|
740
|
-
*/
|
|
741
|
-
function resetAutoRecoveryManager() {
|
|
742
|
-
defaultManager = null;
|
|
743
|
-
}
|
|
258
|
+
// AutoRecoveryManager class removed. All stall detection and recovery logic
|
|
259
|
+
// has been moved to StallDetectionService in ./stall-detection.ts.
|
|
260
|
+
// Utility functions for POF and git guidance are kept below.
|
|
744
261
|
//# sourceMappingURL=auto-recovery.js.map
|