@litmers/cursorflow-orchestrator 0.1.18 → 0.1.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/README.md +25 -7
- package/commands/cursorflow-clean.md +19 -0
- package/commands/cursorflow-runs.md +59 -0
- package/commands/cursorflow-stop.md +55 -0
- package/dist/cli/clean.js +178 -6
- package/dist/cli/clean.js.map +1 -1
- package/dist/cli/index.js +12 -1
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/init.js +8 -7
- package/dist/cli/init.js.map +1 -1
- package/dist/cli/logs.js +126 -77
- package/dist/cli/logs.js.map +1 -1
- package/dist/cli/monitor.d.ts +7 -0
- package/dist/cli/monitor.js +1021 -202
- package/dist/cli/monitor.js.map +1 -1
- package/dist/cli/prepare.js +39 -21
- package/dist/cli/prepare.js.map +1 -1
- package/dist/cli/resume.js +268 -163
- package/dist/cli/resume.js.map +1 -1
- package/dist/cli/run.js +11 -5
- package/dist/cli/run.js.map +1 -1
- package/dist/cli/runs.d.ts +5 -0
- package/dist/cli/runs.js +214 -0
- package/dist/cli/runs.js.map +1 -0
- package/dist/cli/setup-commands.js +0 -0
- package/dist/cli/signal.js +8 -8
- package/dist/cli/signal.js.map +1 -1
- package/dist/cli/stop.d.ts +5 -0
- package/dist/cli/stop.js +215 -0
- package/dist/cli/stop.js.map +1 -0
- package/dist/cli/tasks.d.ts +10 -0
- package/dist/cli/tasks.js +165 -0
- package/dist/cli/tasks.js.map +1 -0
- package/dist/core/auto-recovery.d.ts +212 -0
- package/dist/core/auto-recovery.js +737 -0
- package/dist/core/auto-recovery.js.map +1 -0
- package/dist/core/failure-policy.d.ts +156 -0
- package/dist/core/failure-policy.js +488 -0
- package/dist/core/failure-policy.js.map +1 -0
- package/dist/core/orchestrator.d.ts +16 -2
- package/dist/core/orchestrator.js +439 -105
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/reviewer.d.ts +2 -0
- package/dist/core/reviewer.js +2 -0
- package/dist/core/reviewer.js.map +1 -1
- package/dist/core/runner.d.ts +33 -10
- package/dist/core/runner.js +374 -164
- package/dist/core/runner.js.map +1 -1
- package/dist/services/logging/buffer.d.ts +67 -0
- package/dist/services/logging/buffer.js +309 -0
- package/dist/services/logging/buffer.js.map +1 -0
- package/dist/services/logging/console.d.ts +89 -0
- package/dist/services/logging/console.js +169 -0
- package/dist/services/logging/console.js.map +1 -0
- package/dist/services/logging/file-writer.d.ts +71 -0
- package/dist/services/logging/file-writer.js +516 -0
- package/dist/services/logging/file-writer.js.map +1 -0
- package/dist/services/logging/formatter.d.ts +39 -0
- package/dist/services/logging/formatter.js +227 -0
- package/dist/services/logging/formatter.js.map +1 -0
- package/dist/services/logging/index.d.ts +11 -0
- package/dist/services/logging/index.js +30 -0
- package/dist/services/logging/index.js.map +1 -0
- package/dist/services/logging/parser.d.ts +31 -0
- package/dist/services/logging/parser.js +222 -0
- package/dist/services/logging/parser.js.map +1 -0
- package/dist/services/process/index.d.ts +59 -0
- package/dist/services/process/index.js +257 -0
- package/dist/services/process/index.js.map +1 -0
- package/dist/types/agent.d.ts +20 -0
- package/dist/types/agent.js +6 -0
- package/dist/types/agent.js.map +1 -0
- package/dist/types/config.d.ts +65 -0
- package/dist/types/config.js +6 -0
- package/dist/types/config.js.map +1 -0
- package/dist/types/events.d.ts +125 -0
- package/dist/types/events.js +6 -0
- package/dist/types/events.js.map +1 -0
- package/dist/types/index.d.ts +12 -0
- package/dist/types/index.js +37 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/lane.d.ts +43 -0
- package/dist/types/lane.js +6 -0
- package/dist/types/lane.js.map +1 -0
- package/dist/types/logging.d.ts +71 -0
- package/dist/types/logging.js +16 -0
- package/dist/types/logging.js.map +1 -0
- package/dist/types/review.d.ts +17 -0
- package/dist/types/review.js +6 -0
- package/dist/types/review.js.map +1 -0
- package/dist/types/run.d.ts +32 -0
- package/dist/types/run.js +6 -0
- package/dist/types/run.js.map +1 -0
- package/dist/types/task.d.ts +71 -0
- package/dist/types/task.js +6 -0
- package/dist/types/task.js.map +1 -0
- package/dist/ui/components.d.ts +134 -0
- package/dist/ui/components.js +389 -0
- package/dist/ui/components.js.map +1 -0
- package/dist/ui/log-viewer.d.ts +49 -0
- package/dist/ui/log-viewer.js +449 -0
- package/dist/ui/log-viewer.js.map +1 -0
- package/dist/utils/checkpoint.d.ts +87 -0
- package/dist/utils/checkpoint.js +317 -0
- package/dist/utils/checkpoint.js.map +1 -0
- package/dist/utils/config.d.ts +4 -0
- package/dist/utils/config.js +18 -8
- package/dist/utils/config.js.map +1 -1
- package/dist/utils/cursor-agent.js.map +1 -1
- package/dist/utils/dependency.d.ts +74 -0
- package/dist/utils/dependency.js +420 -0
- package/dist/utils/dependency.js.map +1 -0
- package/dist/utils/doctor.js +17 -11
- package/dist/utils/doctor.js.map +1 -1
- package/dist/utils/enhanced-logger.d.ts +10 -33
- package/dist/utils/enhanced-logger.js +108 -20
- package/dist/utils/enhanced-logger.js.map +1 -1
- package/dist/utils/git.d.ts +121 -0
- package/dist/utils/git.js +484 -11
- package/dist/utils/git.js.map +1 -1
- package/dist/utils/health.d.ts +91 -0
- package/dist/utils/health.js +556 -0
- package/dist/utils/health.js.map +1 -0
- package/dist/utils/lock.d.ts +95 -0
- package/dist/utils/lock.js +332 -0
- package/dist/utils/lock.js.map +1 -0
- package/dist/utils/log-buffer.d.ts +17 -0
- package/dist/utils/log-buffer.js +14 -0
- package/dist/utils/log-buffer.js.map +1 -0
- package/dist/utils/log-constants.d.ts +23 -0
- package/dist/utils/log-constants.js +28 -0
- package/dist/utils/log-constants.js.map +1 -0
- package/dist/utils/log-formatter.d.ts +25 -0
- package/dist/utils/log-formatter.js +237 -0
- package/dist/utils/log-formatter.js.map +1 -0
- package/dist/utils/log-service.d.ts +19 -0
- package/dist/utils/log-service.js +47 -0
- package/dist/utils/log-service.js.map +1 -0
- package/dist/utils/logger.d.ts +46 -27
- package/dist/utils/logger.js +82 -60
- package/dist/utils/logger.js.map +1 -1
- package/dist/utils/path.d.ts +19 -0
- package/dist/utils/path.js +77 -0
- package/dist/utils/path.js.map +1 -0
- package/dist/utils/process-manager.d.ts +21 -0
- package/dist/utils/process-manager.js +138 -0
- package/dist/utils/process-manager.js.map +1 -0
- package/dist/utils/retry.d.ts +121 -0
- package/dist/utils/retry.js +374 -0
- package/dist/utils/retry.js.map +1 -0
- package/dist/utils/run-service.d.ts +88 -0
- package/dist/utils/run-service.js +412 -0
- package/dist/utils/run-service.js.map +1 -0
- package/dist/utils/state.d.ts +62 -3
- package/dist/utils/state.js +317 -11
- package/dist/utils/state.js.map +1 -1
- package/dist/utils/task-service.d.ts +82 -0
- package/dist/utils/task-service.js +348 -0
- package/dist/utils/task-service.js.map +1 -0
- package/dist/utils/template.d.ts +14 -0
- package/dist/utils/template.js +122 -0
- package/dist/utils/template.js.map +1 -0
- package/dist/utils/types.d.ts +2 -271
- package/dist/utils/types.js +16 -0
- package/dist/utils/types.js.map +1 -1
- package/package.json +38 -23
- package/scripts/ai-security-check.js +0 -1
- package/scripts/local-security-gate.sh +0 -0
- package/scripts/monitor-lanes.sh +94 -0
- package/scripts/patches/test-cursor-agent.js +0 -1
- package/scripts/release.sh +0 -0
- package/scripts/setup-security.sh +0 -0
- package/scripts/stream-logs.sh +72 -0
- package/scripts/verify-and-fix.sh +0 -0
- package/src/cli/clean.ts +187 -6
- package/src/cli/index.ts +12 -1
- package/src/cli/init.ts +8 -7
- package/src/cli/logs.ts +124 -77
- package/src/cli/monitor.ts +1815 -898
- package/src/cli/prepare.ts +41 -21
- package/src/cli/resume.ts +753 -626
- package/src/cli/run.ts +12 -5
- package/src/cli/runs.ts +212 -0
- package/src/cli/setup-commands.ts +0 -0
- package/src/cli/signal.ts +8 -7
- package/src/cli/stop.ts +209 -0
- package/src/cli/tasks.ts +154 -0
- package/src/core/auto-recovery.ts +909 -0
- package/src/core/failure-policy.ts +592 -0
- package/src/core/orchestrator.ts +1131 -704
- package/src/core/reviewer.ts +4 -0
- package/src/core/runner.ts +444 -180
- package/src/services/logging/buffer.ts +326 -0
- package/src/services/logging/console.ts +193 -0
- package/src/services/logging/file-writer.ts +526 -0
- package/src/services/logging/formatter.ts +268 -0
- package/src/services/logging/index.ts +16 -0
- package/src/services/logging/parser.ts +232 -0
- package/src/services/process/index.ts +261 -0
- package/src/types/agent.ts +24 -0
- package/src/types/config.ts +79 -0
- package/src/types/events.ts +156 -0
- package/src/types/index.ts +29 -0
- package/src/types/lane.ts +56 -0
- package/src/types/logging.ts +96 -0
- package/src/types/review.ts +20 -0
- package/src/types/run.ts +37 -0
- package/src/types/task.ts +79 -0
- package/src/ui/components.ts +430 -0
- package/src/ui/log-viewer.ts +485 -0
- package/src/utils/checkpoint.ts +374 -0
- package/src/utils/config.ts +18 -8
- package/src/utils/cursor-agent.ts +1 -1
- package/src/utils/dependency.ts +482 -0
- package/src/utils/doctor.ts +18 -11
- package/src/utils/enhanced-logger.ts +122 -60
- package/src/utils/git.ts +517 -11
- package/src/utils/health.ts +596 -0
- package/src/utils/lock.ts +346 -0
- package/src/utils/log-buffer.ts +28 -0
- package/src/utils/log-constants.ts +26 -0
- package/src/utils/log-formatter.ts +245 -0
- package/src/utils/log-service.ts +49 -0
- package/src/utils/logger.ts +100 -51
- package/src/utils/path.ts +45 -0
- package/src/utils/process-manager.ts +100 -0
- package/src/utils/retry.ts +413 -0
- package/src/utils/run-service.ts +433 -0
- package/src/utils/state.ts +385 -11
- package/src/utils/task-service.ts +370 -0
- package/src/utils/template.ts +92 -0
- package/src/utils/types.ts +2 -314
- package/templates/basic.json +21 -0
|
@@ -0,0 +1,737 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Auto-Recovery Module
|
|
4
|
+
*
|
|
5
|
+
* Automatic recovery strategies for common orchestration failures:
|
|
6
|
+
* - Agent idle/no response detection with escalating interventions
|
|
7
|
+
* - Guidance messages for git conflicts and push failures
|
|
8
|
+
* - Process health monitoring with restart capabilities
|
|
9
|
+
* - Doctor integration for persistent failures
|
|
10
|
+
* - POF (Post-mortem of Failure) saving for failed recoveries
|
|
11
|
+
*/
|
|
12
|
+
/**
 * TypeScript-emitted interop helper.
 *
 * Exposes property `k` of module object `m` on target `o` under the name `k2`
 * (which defaults to `k`). When property descriptors are available, the
 * property is defined through a getter so that later changes to `m[k]` stay
 * visible on `o`; otherwise the value is copied once.
 *
 * An already-installed `this.__createBinding` is reused if present.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    // Replace the descriptor with a forwarding getter unless the source already
    // exposes a suitable accessor / read-only, non-configurable data property.
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
|
|
23
|
+
/**
 * TypeScript-emitted interop helper.
 *
 * Sets `o.default` to `v`. When `Object.create` exists the property is defined
 * as an enumerable data property via `Object.defineProperty`; otherwise it is
 * assigned directly.
 *
 * An already-installed `this.__setModuleDefault` is reused if present.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
|
|
28
|
+
/**
 * TypeScript-emitted interop helper for `import * as ns from "mod"`.
 *
 * - If `mod` is already flagged `__esModule`, it is returned unchanged.
 * - Otherwise a fresh namespace object is built: every own property of `mod`
 *   except "default" is bound onto it via `__createBinding`, and `mod` itself
 *   is installed as the `default` export via `__setModuleDefault`.
 *
 * An already-installed `this.__importStar` is reused if present.
 */
var __importStar = (this && this.__importStar) || (function () {
    // Lazily resolves to Object.getOwnPropertyNames (or a for-in fallback) on
    // first use, then replaces itself with the resolved implementation.
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
|
|
45
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
46
|
+
exports.AutoRecoveryManager = exports.DEFAULT_AUTO_RECOVERY_CONFIG = exports.RecoveryStage = void 0;
|
|
47
|
+
exports.getGitPushFailureGuidance = getGitPushFailureGuidance;
|
|
48
|
+
exports.getMergeConflictGuidance = getMergeConflictGuidance;
|
|
49
|
+
exports.getGitErrorGuidance = getGitErrorGuidance;
|
|
50
|
+
exports.savePOF = savePOF;
|
|
51
|
+
exports.createPOFFromRecoveryState = createPOFFromRecoveryState;
|
|
52
|
+
exports.loadPOF = loadPOF;
|
|
53
|
+
exports.listPOFs = listPOFs;
|
|
54
|
+
exports.getAutoRecoveryManager = getAutoRecoveryManager;
|
|
55
|
+
exports.resetAutoRecoveryManager = resetAutoRecoveryManager;
|
|
56
|
+
const fs = __importStar(require("fs"));
|
|
57
|
+
const logger = __importStar(require("../utils/logger"));
|
|
58
|
+
const events_1 = require("../utils/events");
|
|
59
|
+
const path_1 = require("../utils/path");
|
|
60
|
+
const health_1 = require("../utils/health");
|
|
61
|
+
// ============================================================================
|
|
62
|
+
// Types & Constants
|
|
63
|
+
// ============================================================================
|
|
64
|
+
/**
 * Recovery stages for escalating interventions.
 *
 * Compiled TypeScript numeric enum: the object maps each name to its number
 * and each number back to its name (e.g. `RecoveryStage.NORMAL === 0` and
 * `RecoveryStage[0] === "NORMAL"`). The same object is published as
 * `exports.RecoveryStage`.
 */
var RecoveryStage;
(function (RecoveryStage) {
    /** Normal operation - monitoring */
    RecoveryStage[RecoveryStage["NORMAL"] = 0] = "NORMAL";
    /** First intervention - send continue signal */
    RecoveryStage[RecoveryStage["CONTINUE_SIGNAL"] = 1] = "CONTINUE_SIGNAL";
    /** Second intervention - send stronger prompt */
    RecoveryStage[RecoveryStage["STRONGER_PROMPT"] = 2] = "STRONGER_PROMPT";
    /** Third intervention - kill and restart process */
    RecoveryStage[RecoveryStage["RESTART_PROCESS"] = 3] = "RESTART_PROCESS";
    /** Final stage - run doctor and report */
    RecoveryStage[RecoveryStage["DIAGNOSE"] = 4] = "DIAGNOSE";
    /** No more recovery possible */
    RecoveryStage[RecoveryStage["ABORT"] = 5] = "ABORT";
})(RecoveryStage || (exports.RecoveryStage = RecoveryStage = {}));
|
|
80
|
+
/** Default auto-recovery configuration */
|
|
81
|
+
exports.DEFAULT_AUTO_RECOVERY_CONFIG = {
|
|
82
|
+
idleTimeoutMs: 60 * 1000, // 1 minute - quick detection
|
|
83
|
+
continueGraceMs: 60 * 1000, // 1 minute after continue
|
|
84
|
+
strongerPromptGraceMs: 60 * 1000, // 1 minute after stronger prompt
|
|
85
|
+
maxRestarts: 2,
|
|
86
|
+
runDoctorOnFailure: true,
|
|
87
|
+
longOperationPatterns: [
|
|
88
|
+
/installing\s+dependencies/i,
|
|
89
|
+
/npm\s+(i|install|ci)/i,
|
|
90
|
+
/pnpm\s+(i|install)/i,
|
|
91
|
+
/yarn\s+(install)?/i,
|
|
92
|
+
/building/i,
|
|
93
|
+
/compiling/i,
|
|
94
|
+
/bundling/i,
|
|
95
|
+
/downloading/i,
|
|
96
|
+
/fetching/i,
|
|
97
|
+
/cloning/i,
|
|
98
|
+
],
|
|
99
|
+
longOperationGraceMs: 10 * 60 * 1000, // 10 minutes for long ops
|
|
100
|
+
verbose: false,
|
|
101
|
+
};
|
|
102
|
+
// ============================================================================
|
|
103
|
+
// Guidance Messages for Git Issues
|
|
104
|
+
// ============================================================================
|
|
105
|
+
/**
 * Generate the guidance message sent when a git push fails.
 *
 * The message (Korean, prefixed with "[SYSTEM INTERVENTION]") walks through
 * fetching and rebasing onto the remote, resolving conflicts, and pushing again.
 *
 * NOTE(review): any leading whitespace that the original message lines may
 * have carried could not be recovered; lines are reproduced flush-left.
 *
 * @returns {string} Multi-line guidance text.
 */
function getGitPushFailureGuidance() {
    return `[SYSTEM INTERVENTION] Git push가 실패했습니다. 다음 단계를 수행해주세요:

1. 먼저 원격 변경사항을 가져오세요:
\`\`\`bash
git fetch origin
git pull --rebase origin HEAD
\`\`\`

2. 충돌이 발생하면 해결하세요:
- 충돌 파일을 확인하고 수정
- git add로 스테이징
- git rebase --continue 실행

3. 다시 푸시하세요:
\`\`\`bash
git push origin HEAD
\`\`\`

작업을 계속 진행해주세요.`;
}
|
|
127
|
+
/**
 * Generate the guidance message sent when a merge conflict occurs.
 *
 * The message (Korean, prefixed with "[SYSTEM INTERVENTION]") tells the agent
 * to inspect `git status`, resolve each conflicted file by hand, then stage,
 * commit and push.
 *
 * NOTE(review): any leading whitespace that the original message lines may
 * have carried could not be recovered; lines are reproduced flush-left.
 *
 * @returns {string} Multi-line guidance text.
 */
function getMergeConflictGuidance() {
    return `[SYSTEM INTERVENTION] Merge conflict가 발생했습니다. 다음 단계를 수행해주세요:

1. 충돌 파일 확인:
\`\`\`bash
git status
\`\`\`

2. 각 충돌 파일을 열어서 수동으로 해결:
- <<<<<<< 와 >>>>>>> 사이의 내용을 확인
- 적절한 코드를 선택하거나 병합
- 충돌 마커 제거

3. 해결 후 스테이징 및 커밋:
\`\`\`bash
git add -A
git commit -m "chore: resolve merge conflict"
git push origin HEAD
\`\`\`

작업을 계속 진행해주세요.`;
}
|
|
150
|
+
/**
 * Generate the guidance message sent for a general git error.
 *
 * The message (Korean, prefixed with "[SYSTEM INTERVENTION]") embeds
 * `errorMessage` verbatim on its second line, followed by generic remediation
 * steps.
 *
 * NOTE(review): one suggested step is `git reset --hard HEAD`, which discards
 * uncommitted work in the lane's worktree — confirm that is acceptable.
 *
 * @param {string} errorMessage - Error text to show to the agent.
 * @returns {string} Multi-line guidance text.
 */
function getGitErrorGuidance(errorMessage) {
    return `[SYSTEM INTERVENTION] Git 작업 중 오류가 발생했습니다:
${errorMessage}

다음을 시도해주세요:
1. git status로 현재 상태 확인
2. 필요시 git reset --hard HEAD로 초기화
3. 원격 저장소와 동기화: git fetch origin && git pull --rebase

작업을 계속 진행해주세요.`;
}
|
|
162
|
+
// ============================================================================
|
|
163
|
+
// Recovery State Manager
|
|
164
|
+
// ============================================================================
|
|
165
|
+
/**
 * Manages recovery state for all lanes.
 *
 * For each registered lane the manager tracks the time of the last recorded
 * activity and the current {@link RecoveryStage}. Callers poll
 * `needsIntervention()` and, when it returns true, call `getRecoveryAction()`
 * which escalates through: continue signal -> stronger prompt -> process
 * restart (up to `maxRestarts`) -> diagnosis -> abort.
 */
class AutoRecoveryManager {
    /** Effective configuration (defaults merged with constructor overrides). */
    config;
    /** Lane name -> mutable recovery state record. */
    laneStates = new Map();
    /** Lane name -> handler; only ever cleared by the code in this class. */
    eventHandlers = new Map();

    /**
     * @param {object} [config] - Partial overrides for DEFAULT_AUTO_RECOVERY_CONFIG.
     */
    constructor(config = {}) {
        this.config = { ...exports.DEFAULT_AUTO_RECOVERY_CONFIG, ...config };
    }

    /**
     * Register a lane for recovery monitoring.
     * Re-registering an existing lane replaces its state with a fresh record.
     *
     * @param {string} laneName
     */
    registerLane(laneName) {
        const now = Date.now();
        this.laneStates.set(laneName, {
            laneName,
            stage: RecoveryStage.NORMAL,
            lastActivityTime: now,
            lastBytesReceived: 0,
            totalBytesReceived: 0,
            lastOutput: '',
            restartCount: 0,
            continueSignalsSent: 0,
            lastStageChangeTime: now,
            isLongOperation: false,
            failureHistory: [],
        });
        if (this.config.verbose) {
            logger.info(`[AutoRecovery] Registered lane: ${laneName}`);
        }
    }

    /**
     * Unregister a lane from recovery monitoring.
     *
     * @param {string} laneName
     */
    unregisterLane(laneName) {
        // Map#delete is a no-op for missing keys, so no existence check is needed.
        this.laneStates.delete(laneName);
        this.eventHandlers.delete(laneName);
    }

    /**
     * Record activity for a lane.
     *
     * Always refreshes the last-activity timestamp. A positive `bytesReceived`
     * additionally updates the byte counters and resets a non-NORMAL stage back
     * to NORMAL. A non-empty `output` is stored and classified against the
     * configured long-operation patterns.
     *
     * @param {string} laneName
     * @param {number} [bytesReceived=0]
     * @param {string} [output]
     */
    recordActivity(laneName, bytesReceived = 0, output) {
        const state = this.laneStates.get(laneName);
        if (!state)
            return;
        const now = Date.now();
        state.lastActivityTime = now;
        if (bytesReceived > 0) {
            state.lastBytesReceived = bytesReceived;
            state.totalBytesReceived += bytesReceived;
        }
        if (output) {
            state.lastOutput = output;
            // Check if this is a long operation. Patterns are configurable, so a
            // caller may supply a /g or /y regex; those keep a stateful lastIndex
            // between test() calls and must be rewound to give stable results.
            state.isLongOperation = this.config.longOperationPatterns.some((pattern) => {
                pattern.lastIndex = 0;
                return pattern.test(output);
            });
        }
        // Reset stage if we got meaningful activity
        if (bytesReceived > 0 && state.stage !== RecoveryStage.NORMAL) {
            if (this.config.verbose) {
                logger.info(`[AutoRecovery] [${laneName}] Activity detected, resetting to NORMAL stage`);
            }
            state.stage = RecoveryStage.NORMAL;
            state.lastStageChangeTime = now;
        }
    }

    /**
     * Get current recovery state for a lane.
     *
     * @param {string} laneName
     * @returns {object|undefined} The live state record, or undefined if the
     *   lane is not registered.
     */
    getState(laneName) {
        return this.laneStates.get(laneName);
    }

    /**
     * Check if a lane needs recovery intervention.
     *
     * @param {string} laneName
     * @returns {boolean} True when the wait allowed for the lane's current
     *   stage has elapsed; false for unregistered lanes and for the DIAGNOSE
     *   and ABORT stages.
     */
    needsIntervention(laneName) {
        const state = this.laneStates.get(laneName);
        if (!state)
            return false;
        const now = Date.now();
        const idleTime = now - state.lastActivityTime;
        // Use longer timeout for long operations
        const effectiveTimeout = state.isLongOperation
            ? this.config.longOperationGraceMs
            : this.config.idleTimeoutMs;
        // Check based on current stage
        switch (state.stage) {
            case RecoveryStage.NORMAL:
                return idleTime > effectiveTimeout;
            case RecoveryStage.CONTINUE_SIGNAL:
                return (now - state.lastStageChangeTime) > this.config.continueGraceMs;
            case RecoveryStage.STRONGER_PROMPT:
                return (now - state.lastStageChangeTime) > this.config.strongerPromptGraceMs;
            case RecoveryStage.RESTART_PROCESS:
                // After restart, use normal timeout to detect if it's working
                return idleTime > effectiveTimeout;
            case RecoveryStage.DIAGNOSE:
            case RecoveryStage.ABORT:
                return false; // No more interventions
            default:
                return false;
        }
    }

    /**
     * Get (and perform) the next recovery action for a lane, advancing its stage.
     *
     * @param {string} laneName
     * @param {string} laneRunDir - Directory that receives intervention.txt / diagnostic.json.
     * @param {object} [child] - Child process to kill when a restart is requested.
     * @returns {Promise<object>} Result with `success`, `action`, `message`,
     *   `shouldContinue` and, depending on the action, `nextStage` / `diagnostic`.
     */
    async getRecoveryAction(laneName, laneRunDir, child) {
        const state = this.laneStates.get(laneName);
        if (!state) {
            return {
                success: false,
                action: 'none',
                message: 'Lane not registered',
                shouldContinue: false,
            };
        }
        const now = Date.now();
        const idleTime = now - state.lastActivityTime;
        const idleSeconds = Math.round(idleTime / 1000);
        switch (state.stage) {
            case RecoveryStage.NORMAL:
                // Escalate to CONTINUE_SIGNAL
                return await this.sendContinueSignal(laneName, laneRunDir, state, idleSeconds);
            case RecoveryStage.CONTINUE_SIGNAL:
                // Try a stronger prompt
                return await this.sendStrongerPrompt(laneName, laneRunDir, state);
            case RecoveryStage.STRONGER_PROMPT:
                // Try restarting the process
                if (state.restartCount < this.config.maxRestarts) {
                    return await this.requestRestart(laneName, state, child);
                }
                // Restart budget exhausted: go straight to diagnosis
                state.stage = RecoveryStage.DIAGNOSE;
                state.lastStageChangeTime = now;
                return await this.runDiagnosis(laneName, laneRunDir, state);
            case RecoveryStage.RESTART_PROCESS:
                // After restart, if still no response, diagnose
                if (state.restartCount >= this.config.maxRestarts) {
                    state.stage = RecoveryStage.DIAGNOSE;
                    state.lastStageChangeTime = now;
                    return await this.runDiagnosis(laneName, laneRunDir, state);
                }
                // Try continue signal again after restart
                return await this.sendContinueSignal(laneName, laneRunDir, state, idleSeconds);
            case RecoveryStage.DIAGNOSE:
                // Final stage - abort
                state.stage = RecoveryStage.ABORT;
                state.lastStageChangeTime = now;
                return {
                    success: false,
                    action: 'abort',
                    message: `Lane ${laneName} failed after all recovery attempts`,
                    shouldContinue: false,
                    nextStage: RecoveryStage.ABORT,
                    diagnostic: state.diagnosticInfo,
                };
            default:
                return {
                    success: false,
                    action: 'abort',
                    message: 'Recovery exhausted',
                    shouldContinue: false,
                };
        }
    }

    /**
     * Send a continue signal to the lane by writing the text "continue" to
     * `<laneRunDir>/intervention.txt`, then move the lane to CONTINUE_SIGNAL.
     *
     * @param {string} laneName
     * @param {string} laneRunDir
     * @param {object} state - The lane's state record (mutated).
     * @param {number} idleSeconds - Idle time used in messages and history.
     * @returns {Promise<object>} Recovery action result; `success` is false if
     *   the file could not be written (the stage is then left unchanged).
     */
    async sendContinueSignal(laneName, laneRunDir, state, idleSeconds) {
        const interventionPath = (0, path_1.safeJoin)(laneRunDir, 'intervention.txt');
        try {
            fs.writeFileSync(interventionPath, 'continue');
            state.stage = RecoveryStage.CONTINUE_SIGNAL;
            state.lastStageChangeTime = Date.now();
            state.continueSignalsSent++;
            // Record failure history
            state.failureHistory.push({
                timestamp: Date.now(),
                stage: RecoveryStage.CONTINUE_SIGNAL,
                action: 'continue_signal',
                message: `Idle for ${idleSeconds}s`,
                idleTimeMs: idleSeconds * 1000,
                bytesReceived: state.totalBytesReceived,
                lastOutput: state.lastOutput,
            });
            const message = `[${laneName}] Idle for ${idleSeconds}s - sent continue signal (#${state.continueSignalsSent})`;
            logger.warn(message);
            events_1.events.emit('recovery.continue_signal', {
                laneName,
                idleSeconds,
                signalCount: state.continueSignalsSent,
            });
            return {
                success: true,
                action: 'continue_signal',
                message,
                shouldContinue: true,
                nextStage: RecoveryStage.CONTINUE_SIGNAL,
            };
        }
        catch (error) {
            logger.error(`[AutoRecovery] Failed to send continue signal to ${laneName}: ${error.message}`);
            return {
                success: false,
                action: 'continue_signal',
                message: `Failed to send continue signal: ${error.message}`,
                shouldContinue: true,
            };
        }
    }

    /**
     * Send a stronger prompt to nudge the agent by writing it to
     * `<laneRunDir>/intervention.txt`, then move the lane to STRONGER_PROMPT.
     *
     * @param {string} laneName
     * @param {string} laneRunDir
     * @param {object} state - The lane's state record (mutated).
     * @returns {Promise<object>} Recovery action result; `success` is false if
     *   the file could not be written (the stage is then left unchanged).
     */
    async sendStrongerPrompt(laneName, laneRunDir, state) {
        const interventionPath = (0, path_1.safeJoin)(laneRunDir, 'intervention.txt');
        const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck or waiting.
Please continue with your current task immediately.
If you're waiting for something, explain what you need and proceed with what you can do now.
If you've completed the task, please summarize your work and finish.
If you encountered a git error, resolve it and continue.`;
        try {
            fs.writeFileSync(interventionPath, strongerPrompt);
            state.stage = RecoveryStage.STRONGER_PROMPT;
            state.lastStageChangeTime = Date.now();
            // Record failure history
            state.failureHistory.push({
                timestamp: Date.now(),
                stage: RecoveryStage.STRONGER_PROMPT,
                action: 'stronger_prompt',
                message: 'Still idle after continue signal',
                idleTimeMs: Date.now() - state.lastActivityTime,
                bytesReceived: state.totalBytesReceived,
                lastOutput: state.lastOutput,
            });
            const message = `[${laneName}] Still idle after continue signal - sent stronger prompt`;
            logger.warn(message);
            events_1.events.emit('recovery.stronger_prompt', {
                laneName,
                prompt: strongerPrompt,
            });
            return {
                success: true,
                action: 'stronger_prompt',
                message,
                shouldContinue: true,
                nextStage: RecoveryStage.STRONGER_PROMPT,
            };
        }
        catch (error) {
            logger.error(`[AutoRecovery] Failed to send stronger prompt to ${laneName}: ${error.message}`);
            return {
                success: false,
                action: 'stronger_prompt',
                message: `Failed to send stronger prompt: ${error.message}`,
                shouldContinue: true,
            };
        }
    }

    /**
     * Request process restart: bump the restart counter, move the lane to
     * RESTART_PROCESS and SIGKILL `child` if one is supplied and not yet
     * signalled. Starting the replacement process is left to the caller.
     *
     * @param {string} laneName
     * @param {object} state - The lane's state record (mutated).
     * @param {object} [child] - Child process handle (`pid`, `killed`, `kill()`).
     * @returns {Promise<object>} Recovery action result (always `success: true`).
     */
    async requestRestart(laneName, state, child) {
        state.restartCount++;
        state.stage = RecoveryStage.RESTART_PROCESS;
        state.lastStageChangeTime = Date.now();
        // Record failure history
        state.failureHistory.push({
            timestamp: Date.now(),
            stage: RecoveryStage.RESTART_PROCESS,
            action: 'restart',
            message: `Restart attempt ${state.restartCount}/${this.config.maxRestarts}`,
            idleTimeMs: Date.now() - state.lastActivityTime,
            bytesReceived: state.totalBytesReceived,
            lastOutput: state.lastOutput,
        });
        // Kill the current process if provided
        if (child && child.pid && !child.killed) {
            try {
                // kill() reports whether the signal was actually delivered; only
                // claim success when it was.
                if (child.kill('SIGKILL')) {
                    logger.info(`[AutoRecovery] [${laneName}] Killed process ${child.pid}`);
                }
                else {
                    logger.warn(`[AutoRecovery] [${laneName}] Process ${child.pid} was not signalled (it may have already exited)`);
                }
            }
            catch (error) {
                logger.warn(`[AutoRecovery] [${laneName}] Failed to kill process: ${error.message}`);
            }
        }
        const message = `[${laneName}] Restarting lane (attempt ${state.restartCount}/${this.config.maxRestarts})`;
        logger.warn(message);
        events_1.events.emit('recovery.restart', {
            laneName,
            restartCount: state.restartCount,
            maxRestarts: this.config.maxRestarts,
        });
        return {
            success: true,
            action: 'restart',
            message,
            shouldContinue: true,
            nextStage: RecoveryStage.RESTART_PROCESS,
        };
    }

    /**
     * Run diagnostic checks (agent, auth and system health), store the result
     * on the lane state, and try to persist it to `<laneRunDir>/diagnostic.json`.
     *
     * @param {string} laneName
     * @param {string} laneRunDir
     * @param {object} state - The lane's state record (mutated).
     * @returns {Promise<object>} Result with `shouldContinue: false`; carries
     *   `diagnostic` when the health checks themselves completed.
     */
    async runDiagnosis(laneName, laneRunDir, state) {
        if (!this.config.runDoctorOnFailure) {
            return {
                success: false,
                action: 'diagnose',
                message: 'Diagnosis skipped (disabled in config)',
                shouldContinue: false,
            };
        }
        logger.info(`[AutoRecovery] [${laneName}] Running diagnostic checks...`);
        try {
            // Run health checks
            const [agentHealth, authHealth] = await Promise.all([
                (0, health_1.checkAgentHealth)(),
                (0, health_1.checkAuthHealth)(),
            ]);
            const systemHealth = await (0, health_1.runHealthCheck)({ skipRemote: true, skipAuth: true });
            const diagnostic = {
                timestamp: Date.now(),
                agentHealthy: agentHealth.ok,
                authHealthy: authHealth.ok,
                systemHealthy: systemHealth.healthy,
                suggestedAction: '',
                details: '',
            };
            // Analyze and suggest action
            const issues = [];
            if (!agentHealth.ok) {
                issues.push(`Agent: ${agentHealth.message}`);
            }
            if (!authHealth.ok) {
                issues.push(`Auth: ${authHealth.message}`);
                diagnostic.suggestedAction = 'Please sign in to Cursor IDE and verify authentication';
            }
            if (!systemHealth.healthy) {
                const failedChecks = systemHealth.checks.filter(c => !c.ok);
                issues.push(`System: ${failedChecks.map(c => c.message).join(', ')}`);
            }
            if (issues.length === 0) {
                diagnostic.details = 'All health checks passed. The issue may be with the AI model or network.';
                diagnostic.suggestedAction = 'Try resuming with a different model or wait and retry.';
            }
            else {
                diagnostic.details = issues.join('\n');
                // Agent-only or system-only failures previously produced an empty
                // suggestion; always give the operator something actionable.
                if (!diagnostic.suggestedAction) {
                    diagnostic.suggestedAction = 'Resolve the issues listed above, then resume the lane.';
                }
            }
            state.diagnosticInfo = diagnostic;
            // Record failure history
            state.failureHistory.push({
                timestamp: Date.now(),
                stage: RecoveryStage.DIAGNOSE,
                action: 'diagnose',
                message: diagnostic.details,
                idleTimeMs: Date.now() - state.lastActivityTime,
                bytesReceived: state.totalBytesReceived,
                lastOutput: state.lastOutput,
            });
            // Save diagnostic to file. Persisting is best-effort: a write failure
            // must not discard a diagnosis that was computed successfully.
            try {
                const diagnosticPath = (0, path_1.safeJoin)(laneRunDir, 'diagnostic.json');
                fs.writeFileSync(diagnosticPath, JSON.stringify(diagnostic, null, 2));
            }
            catch (error) {
                logger.warn(`[AutoRecovery] [${laneName}] Failed to save diagnostic.json: ${error.message}`);
            }
            const message = `[${laneName}] Diagnostic complete:\n${diagnostic.details}\nSuggested action: ${diagnostic.suggestedAction}`;
            logger.error(message);
            events_1.events.emit('recovery.diagnosed', {
                laneName,
                diagnostic,
            });
            return {
                success: true,
                action: 'diagnose',
                message,
                shouldContinue: false,
                diagnostic,
            };
        }
        catch (error) {
            logger.error(`[AutoRecovery] Diagnostic failed: ${error.message}`);
            return {
                success: false,
                action: 'diagnose',
                message: `Diagnostic failed: ${error.message}`,
                shouldContinue: false,
            };
        }
    }

    /**
     * Get failure history for a lane.
     *
     * @param {string} laneName
     * @returns {object[]} The lane's history array, or an empty array for an
     *   unregistered lane.
     */
    getFailureHistory(laneName) {
        const state = this.laneStates.get(laneName);
        return state?.failureHistory || [];
    }

    /**
     * Get configuration.
     *
     * @returns {object} A shallow copy of the effective configuration.
     */
    getConfig() {
        return { ...this.config };
    }

    /**
     * Update configuration by shallow-merging `config` over the current values.
     *
     * @param {object} config - Partial configuration overrides.
     */
    updateConfig(config) {
        this.config = { ...this.config, ...config };
    }
}
|
|
574
|
+
exports.AutoRecoveryManager = AutoRecoveryManager;
|
|
575
|
+
// ============================================================================
|
|
576
|
+
// POF (Post-mortem of Failure) Management
|
|
577
|
+
// ============================================================================
|
|
578
|
+
/**
 * Save a POF entry to the pof directory.
 *
 * The entry is written to `<pofDir>/pof-<runId>.json`. If a readable POF
 * already exists at that path it is placed at the front of the written
 * record's `previousFailures` list. The caller's `entry` object is never
 * modified.
 *
 * @param {string} runId - Run identifier used in the file name.
 * @param {string} pofDir - Directory that holds POF files (created if missing).
 * @param {object} entry - POF entry to persist.
 * @returns {string} Path of the written POF file.
 * @throws Re-throws any error raised while writing or renaming the file.
 */
function savePOF(runId, pofDir, entry) {
    // `recursive: true` makes this a no-op when the directory already exists,
    // so no separate existence check (and its race window) is needed.
    fs.mkdirSync(pofDir, { recursive: true });
    const pofPath = (0, path_1.safeJoin)(pofDir, `pof-${runId}.json`);
    let existingPOF = null;
    try {
        existingPOF = JSON.parse(fs.readFileSync(pofPath, 'utf8'));
    }
    catch (err) {
        // A missing file is the normal first-save case. Anything else (corrupt
        // JSON, permission problem) means history is about to be dropped, so
        // say so instead of failing silently.
        if (!err || err.code !== 'ENOENT') {
            const reason = err instanceof Error ? err.message : String(err);
            logger.warn(`[POF] Ignoring unreadable previous POF at ${pofPath}: ${reason}`);
        }
    }
    // Build the record to write without touching the caller's object; mutating
    // `entry` made repeated saves of the same object accumulate duplicates.
    const record = existingPOF
        ? { ...entry, previousFailures: [existingPOF, ...(entry.previousFailures || [])] }
        : entry;
    // Use atomic write: write to temp file then rename. The pid keeps temp
    // names from different processes apart.
    const tempPath = `${pofPath}.${process.pid}.${Math.random().toString(36).substring(2, 7)}.tmp`;
    try {
        fs.writeFileSync(tempPath, JSON.stringify(record, null, 2), 'utf8');
        fs.renameSync(tempPath, pofPath);
    }
    catch (err) {
        // If temp file was created, try to clean it up
        try {
            if (fs.existsSync(tempPath)) {
                fs.unlinkSync(tempPath);
            }
        }
        catch { /* cleanup is best-effort; the original error is rethrown below */ }
        throw err;
    }
    logger.info(`[POF] Saved post-mortem to ${pofPath}`);
    return pofPath;
}
|
|
618
|
+
/**
 * Create a POF entry from recovery state.
 *
 * Pure function: it only reads its arguments and returns a new object.
 *
 * @param {string} runId - Run identifier stored on the entry.
 * @param {string} runDir - Run directory, embedded in the suggested resume commands.
 * @param {string} laneName - Name of the failed lane.
 * @param {object} state - Recovery state; reads `totalBytesReceived`,
 *   `restartCount`, `continueSignalsSent`, `lastOutput` and `failureHistory`.
 * @param {object} [laneState] - Lane state; reads `currentTaskIndex`,
 *   `totalTasks` and `pid` when provided.
 * @param {object} [diagnostic] - Diagnostic info; reads `agentHealthy` and
 *   `authHealthy` when provided.
 * @returns {object} The POF entry.
 */
function createPOFFromRecoveryState(runId, runDir, laneName, state, laneState, diagnostic) {
    const timestamp = new Date().toISOString();
    // Determine root cause type
    let rootCauseType = 'AGENT_NO_RESPONSE';
    let rootCauseDescription = 'Agent stopped responding and did not recover after multiple interventions';
    const symptoms = [];
    if (state.totalBytesReceived === 0) {
        rootCauseDescription = 'Agent produced 0 bytes of output - possible API or network issue';
        symptoms.push('No bytes received from agent');
    }
    else if (state.restartCount >= 2) {
        rootCauseType = 'ZOMBIE_PROCESS';
        rootCauseDescription = 'Lane processes repeatedly failed to make progress after restarts';
        symptoms.push(`Restarted ${state.restartCount} times without success`);
    }
    // An agent that never produced output may have no `lastOutput` at all;
    // coerce so building the post-mortem cannot itself throw.
    const lastOutput = typeof state.lastOutput === 'string'
        ? state.lastOutput
        : String(state.lastOutput ?? '');
    symptoms.push(`Total bytes received: ${state.totalBytesReceived}`);
    symptoms.push(`Continue signals sent: ${state.continueSignalsSent}`);
    symptoms.push(`Last output: ${lastOutput.substring(0, 100)}...`);
    // Possible causes based on diagnostic
    const possibleCauses = [
        'Model API rate limiting or quota exceeded',
        'Cursor authentication token expired',
        'Network connectivity issues',
        'Agent process hung waiting for stdin/stdout',
    ];
    if (diagnostic) {
        // Confirmed problems go to the front; auth is unshifted last so it
        // ends up ahead of the agent cause when both apply.
        if (!diagnostic.agentHealthy) {
            possibleCauses.unshift('cursor-agent CLI is not responding properly');
        }
        if (!diagnostic.authHealthy) {
            possibleCauses.unshift('Cursor authentication failed or expired');
        }
    }
    return {
        title: 'Run Failure Post-mortem',
        runId,
        failureTime: timestamp,
        detectedAt: timestamp,
        summary: `Lane ${laneName} failed after ${state.restartCount} restart(s) and ${state.continueSignalsSent} continue signal(s)`,
        rootCause: {
            type: rootCauseType,
            description: rootCauseDescription,
            symptoms,
        },
        affectedLanes: [
            {
                name: laneName,
                status: 'failed',
                task: laneState ? `[${(laneState.currentTaskIndex || 0) + 1}/${laneState.totalTasks}]` : 'unknown',
                taskIndex: laneState?.currentTaskIndex || 0,
                pid: laneState?.pid,
                reason: rootCauseDescription,
                recoveryAttempts: state.failureHistory,
            },
        ],
        possibleCauses,
        recovery: {
            command: `cursorflow resume --all --run-dir ${runDir}`,
            description: 'Resume all failed lanes from their last checkpoint',
            alternativeCommand: `cursorflow resume --all --restart --run-dir ${runDir}`,
            alternativeDescription: 'Restart all failed lanes from the beginning',
        },
    };
}
|
|
687
|
+
/**
 * Load existing POF entries for a run.
 *
 * @param {string} pofDir - Directory that holds POF files.
 * @param {string} runId - Run identifier used in the file name.
 * @returns {object|null} The parsed contents of `pof-<runId>.json`, or `null`
 *   when the file is absent or cannot be read/parsed (the latter is logged).
 */
function loadPOF(pofDir, runId) {
    const target = (0, path_1.safeJoin)(pofDir, `pof-${runId}.json`);
    if (fs.existsSync(target)) {
        try {
            const raw = fs.readFileSync(target, 'utf8');
            return JSON.parse(raw);
        }
        catch (error) {
            logger.warn(`[POF] Failed to load POF from ${target}: ${error.message}`);
        }
    }
    return null;
}
|
|
703
|
+
/**
 * List all POF files in a directory.
 *
 * @param {string} pofDir - Directory to scan.
 * @returns {string[]} Joined paths of every entry named `pof-*.json`, in the
 *   order the directory listing returns them; empty when the directory does
 *   not exist.
 */
function listPOFs(pofDir) {
    const matches = [];
    if (fs.existsSync(pofDir)) {
        for (const fileName of fs.readdirSync(pofDir)) {
            const looksLikePOF = fileName.startsWith('pof-') && fileName.endsWith('.json');
            if (looksLikePOF) {
                matches.push((0, path_1.safeJoin)(pofDir, fileName));
            }
        }
    }
    return matches;
}
|
|
714
|
+
// ============================================================================
|
|
715
|
+
// Exports
|
|
716
|
+
// ============================================================================
|
|
717
|
+
/** Module-wide manager instance, created on first request. */
let defaultManager = null;
/**
 * Get or create the default auto-recovery manager.
 *
 * The first call constructs the manager with `config`. Later calls return the
 * same instance and, when `config` is supplied, merge it in via `updateConfig`.
 *
 * @param {object} [config] - Configuration to construct with or merge in.
 * @returns {AutoRecoveryManager} The shared manager instance.
 */
function getAutoRecoveryManager(config) {
    if (defaultManager) {
        if (config) {
            defaultManager.updateConfig(config);
        }
        return defaultManager;
    }
    defaultManager = new AutoRecoveryManager(config);
    return defaultManager;
}
|
|
731
|
+
/**
 * Reset the default manager (for testing).
 *
 * Drops the shared instance so the next `getAutoRecoveryManager` call
 * constructs a new one.
 */
function resetAutoRecoveryManager() {
    // Only the reference is cleared; the previous instance is not touched.
    defaultManager = null;
}
|
|
737
|
+
//# sourceMappingURL=auto-recovery.js.map
|