@litmers/cursorflow-orchestrator 0.1.18 → 0.1.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. package/CHANGELOG.md +25 -0
  2. package/README.md +25 -7
  3. package/commands/cursorflow-clean.md +19 -0
  4. package/commands/cursorflow-runs.md +59 -0
  5. package/commands/cursorflow-stop.md +55 -0
  6. package/dist/cli/clean.js +178 -6
  7. package/dist/cli/clean.js.map +1 -1
  8. package/dist/cli/index.js +12 -1
  9. package/dist/cli/index.js.map +1 -1
  10. package/dist/cli/init.js +8 -7
  11. package/dist/cli/init.js.map +1 -1
  12. package/dist/cli/logs.js +126 -77
  13. package/dist/cli/logs.js.map +1 -1
  14. package/dist/cli/monitor.d.ts +7 -0
  15. package/dist/cli/monitor.js +1021 -202
  16. package/dist/cli/monitor.js.map +1 -1
  17. package/dist/cli/prepare.js +39 -21
  18. package/dist/cli/prepare.js.map +1 -1
  19. package/dist/cli/resume.js +268 -163
  20. package/dist/cli/resume.js.map +1 -1
  21. package/dist/cli/run.js +11 -5
  22. package/dist/cli/run.js.map +1 -1
  23. package/dist/cli/runs.d.ts +5 -0
  24. package/dist/cli/runs.js +214 -0
  25. package/dist/cli/runs.js.map +1 -0
  26. package/dist/cli/setup-commands.js +0 -0
  27. package/dist/cli/signal.js +8 -8
  28. package/dist/cli/signal.js.map +1 -1
  29. package/dist/cli/stop.d.ts +5 -0
  30. package/dist/cli/stop.js +215 -0
  31. package/dist/cli/stop.js.map +1 -0
  32. package/dist/cli/tasks.d.ts +10 -0
  33. package/dist/cli/tasks.js +165 -0
  34. package/dist/cli/tasks.js.map +1 -0
  35. package/dist/core/auto-recovery.d.ts +212 -0
  36. package/dist/core/auto-recovery.js +737 -0
  37. package/dist/core/auto-recovery.js.map +1 -0
  38. package/dist/core/failure-policy.d.ts +156 -0
  39. package/dist/core/failure-policy.js +488 -0
  40. package/dist/core/failure-policy.js.map +1 -0
  41. package/dist/core/orchestrator.d.ts +16 -2
  42. package/dist/core/orchestrator.js +439 -105
  43. package/dist/core/orchestrator.js.map +1 -1
  44. package/dist/core/reviewer.d.ts +2 -0
  45. package/dist/core/reviewer.js +2 -0
  46. package/dist/core/reviewer.js.map +1 -1
  47. package/dist/core/runner.d.ts +33 -10
  48. package/dist/core/runner.js +374 -164
  49. package/dist/core/runner.js.map +1 -1
  50. package/dist/services/logging/buffer.d.ts +67 -0
  51. package/dist/services/logging/buffer.js +309 -0
  52. package/dist/services/logging/buffer.js.map +1 -0
  53. package/dist/services/logging/console.d.ts +89 -0
  54. package/dist/services/logging/console.js +169 -0
  55. package/dist/services/logging/console.js.map +1 -0
  56. package/dist/services/logging/file-writer.d.ts +71 -0
  57. package/dist/services/logging/file-writer.js +516 -0
  58. package/dist/services/logging/file-writer.js.map +1 -0
  59. package/dist/services/logging/formatter.d.ts +39 -0
  60. package/dist/services/logging/formatter.js +227 -0
  61. package/dist/services/logging/formatter.js.map +1 -0
  62. package/dist/services/logging/index.d.ts +11 -0
  63. package/dist/services/logging/index.js +30 -0
  64. package/dist/services/logging/index.js.map +1 -0
  65. package/dist/services/logging/parser.d.ts +31 -0
  66. package/dist/services/logging/parser.js +222 -0
  67. package/dist/services/logging/parser.js.map +1 -0
  68. package/dist/services/process/index.d.ts +59 -0
  69. package/dist/services/process/index.js +257 -0
  70. package/dist/services/process/index.js.map +1 -0
  71. package/dist/types/agent.d.ts +20 -0
  72. package/dist/types/agent.js +6 -0
  73. package/dist/types/agent.js.map +1 -0
  74. package/dist/types/config.d.ts +65 -0
  75. package/dist/types/config.js +6 -0
  76. package/dist/types/config.js.map +1 -0
  77. package/dist/types/events.d.ts +125 -0
  78. package/dist/types/events.js +6 -0
  79. package/dist/types/events.js.map +1 -0
  80. package/dist/types/index.d.ts +12 -0
  81. package/dist/types/index.js +37 -0
  82. package/dist/types/index.js.map +1 -0
  83. package/dist/types/lane.d.ts +43 -0
  84. package/dist/types/lane.js +6 -0
  85. package/dist/types/lane.js.map +1 -0
  86. package/dist/types/logging.d.ts +71 -0
  87. package/dist/types/logging.js +16 -0
  88. package/dist/types/logging.js.map +1 -0
  89. package/dist/types/review.d.ts +17 -0
  90. package/dist/types/review.js +6 -0
  91. package/dist/types/review.js.map +1 -0
  92. package/dist/types/run.d.ts +32 -0
  93. package/dist/types/run.js +6 -0
  94. package/dist/types/run.js.map +1 -0
  95. package/dist/types/task.d.ts +71 -0
  96. package/dist/types/task.js +6 -0
  97. package/dist/types/task.js.map +1 -0
  98. package/dist/ui/components.d.ts +134 -0
  99. package/dist/ui/components.js +389 -0
  100. package/dist/ui/components.js.map +1 -0
  101. package/dist/ui/log-viewer.d.ts +49 -0
  102. package/dist/ui/log-viewer.js +449 -0
  103. package/dist/ui/log-viewer.js.map +1 -0
  104. package/dist/utils/checkpoint.d.ts +87 -0
  105. package/dist/utils/checkpoint.js +317 -0
  106. package/dist/utils/checkpoint.js.map +1 -0
  107. package/dist/utils/config.d.ts +4 -0
  108. package/dist/utils/config.js +18 -8
  109. package/dist/utils/config.js.map +1 -1
  110. package/dist/utils/cursor-agent.js.map +1 -1
  111. package/dist/utils/dependency.d.ts +74 -0
  112. package/dist/utils/dependency.js +420 -0
  113. package/dist/utils/dependency.js.map +1 -0
  114. package/dist/utils/doctor.js +17 -11
  115. package/dist/utils/doctor.js.map +1 -1
  116. package/dist/utils/enhanced-logger.d.ts +10 -33
  117. package/dist/utils/enhanced-logger.js +108 -20
  118. package/dist/utils/enhanced-logger.js.map +1 -1
  119. package/dist/utils/git.d.ts +121 -0
  120. package/dist/utils/git.js +484 -11
  121. package/dist/utils/git.js.map +1 -1
  122. package/dist/utils/health.d.ts +91 -0
  123. package/dist/utils/health.js +556 -0
  124. package/dist/utils/health.js.map +1 -0
  125. package/dist/utils/lock.d.ts +95 -0
  126. package/dist/utils/lock.js +332 -0
  127. package/dist/utils/lock.js.map +1 -0
  128. package/dist/utils/log-buffer.d.ts +17 -0
  129. package/dist/utils/log-buffer.js +14 -0
  130. package/dist/utils/log-buffer.js.map +1 -0
  131. package/dist/utils/log-constants.d.ts +23 -0
  132. package/dist/utils/log-constants.js +28 -0
  133. package/dist/utils/log-constants.js.map +1 -0
  134. package/dist/utils/log-formatter.d.ts +25 -0
  135. package/dist/utils/log-formatter.js +237 -0
  136. package/dist/utils/log-formatter.js.map +1 -0
  137. package/dist/utils/log-service.d.ts +19 -0
  138. package/dist/utils/log-service.js +47 -0
  139. package/dist/utils/log-service.js.map +1 -0
  140. package/dist/utils/logger.d.ts +46 -27
  141. package/dist/utils/logger.js +82 -60
  142. package/dist/utils/logger.js.map +1 -1
  143. package/dist/utils/path.d.ts +19 -0
  144. package/dist/utils/path.js +77 -0
  145. package/dist/utils/path.js.map +1 -0
  146. package/dist/utils/process-manager.d.ts +21 -0
  147. package/dist/utils/process-manager.js +138 -0
  148. package/dist/utils/process-manager.js.map +1 -0
  149. package/dist/utils/retry.d.ts +121 -0
  150. package/dist/utils/retry.js +374 -0
  151. package/dist/utils/retry.js.map +1 -0
  152. package/dist/utils/run-service.d.ts +88 -0
  153. package/dist/utils/run-service.js +412 -0
  154. package/dist/utils/run-service.js.map +1 -0
  155. package/dist/utils/state.d.ts +62 -3
  156. package/dist/utils/state.js +317 -11
  157. package/dist/utils/state.js.map +1 -1
  158. package/dist/utils/task-service.d.ts +82 -0
  159. package/dist/utils/task-service.js +348 -0
  160. package/dist/utils/task-service.js.map +1 -0
  161. package/dist/utils/template.d.ts +14 -0
  162. package/dist/utils/template.js +122 -0
  163. package/dist/utils/template.js.map +1 -0
  164. package/dist/utils/types.d.ts +2 -271
  165. package/dist/utils/types.js +16 -0
  166. package/dist/utils/types.js.map +1 -1
  167. package/package.json +38 -23
  168. package/scripts/ai-security-check.js +0 -1
  169. package/scripts/local-security-gate.sh +0 -0
  170. package/scripts/monitor-lanes.sh +94 -0
  171. package/scripts/patches/test-cursor-agent.js +0 -1
  172. package/scripts/release.sh +0 -0
  173. package/scripts/setup-security.sh +0 -0
  174. package/scripts/stream-logs.sh +72 -0
  175. package/scripts/verify-and-fix.sh +0 -0
  176. package/src/cli/clean.ts +187 -6
  177. package/src/cli/index.ts +12 -1
  178. package/src/cli/init.ts +8 -7
  179. package/src/cli/logs.ts +124 -77
  180. package/src/cli/monitor.ts +1815 -898
  181. package/src/cli/prepare.ts +41 -21
  182. package/src/cli/resume.ts +753 -626
  183. package/src/cli/run.ts +12 -5
  184. package/src/cli/runs.ts +212 -0
  185. package/src/cli/setup-commands.ts +0 -0
  186. package/src/cli/signal.ts +8 -7
  187. package/src/cli/stop.ts +209 -0
  188. package/src/cli/tasks.ts +154 -0
  189. package/src/core/auto-recovery.ts +909 -0
  190. package/src/core/failure-policy.ts +592 -0
  191. package/src/core/orchestrator.ts +1131 -704
  192. package/src/core/reviewer.ts +4 -0
  193. package/src/core/runner.ts +444 -180
  194. package/src/services/logging/buffer.ts +326 -0
  195. package/src/services/logging/console.ts +193 -0
  196. package/src/services/logging/file-writer.ts +526 -0
  197. package/src/services/logging/formatter.ts +268 -0
  198. package/src/services/logging/index.ts +16 -0
  199. package/src/services/logging/parser.ts +232 -0
  200. package/src/services/process/index.ts +261 -0
  201. package/src/types/agent.ts +24 -0
  202. package/src/types/config.ts +79 -0
  203. package/src/types/events.ts +156 -0
  204. package/src/types/index.ts +29 -0
  205. package/src/types/lane.ts +56 -0
  206. package/src/types/logging.ts +96 -0
  207. package/src/types/review.ts +20 -0
  208. package/src/types/run.ts +37 -0
  209. package/src/types/task.ts +79 -0
  210. package/src/ui/components.ts +430 -0
  211. package/src/ui/log-viewer.ts +485 -0
  212. package/src/utils/checkpoint.ts +374 -0
  213. package/src/utils/config.ts +18 -8
  214. package/src/utils/cursor-agent.ts +1 -1
  215. package/src/utils/dependency.ts +482 -0
  216. package/src/utils/doctor.ts +18 -11
  217. package/src/utils/enhanced-logger.ts +122 -60
  218. package/src/utils/git.ts +517 -11
  219. package/src/utils/health.ts +596 -0
  220. package/src/utils/lock.ts +346 -0
  221. package/src/utils/log-buffer.ts +28 -0
  222. package/src/utils/log-constants.ts +26 -0
  223. package/src/utils/log-formatter.ts +245 -0
  224. package/src/utils/log-service.ts +49 -0
  225. package/src/utils/logger.ts +100 -51
  226. package/src/utils/path.ts +45 -0
  227. package/src/utils/process-manager.ts +100 -0
  228. package/src/utils/retry.ts +413 -0
  229. package/src/utils/run-service.ts +433 -0
  230. package/src/utils/state.ts +385 -11
  231. package/src/utils/task-service.ts +370 -0
  232. package/src/utils/template.ts +92 -0
  233. package/src/utils/types.ts +2 -314
  234. package/templates/basic.json +21 -0
@@ -0,0 +1,909 @@
1
+ /**
2
+ * Auto-Recovery Module
3
+ *
4
+ * Automatic recovery strategies for common orchestration failures:
5
+ * - Agent idle/no response detection with escalating interventions
6
+ * - Guidance messages for git conflicts and push failures
7
+ * - Process health monitoring with restart capabilities
8
+ * - Doctor integration for persistent failures
9
+ * - POF (Post-mortem of Failure) saving for failed recoveries
10
+ */
11
+
12
+ import * as fs from 'fs';
13
+ import { ChildProcess } from 'child_process';
14
+
15
+ import * as logger from '../utils/logger';
16
+ import { LaneState } from '../utils/types';
17
+ import { events } from '../utils/events';
18
+ import { safeJoin } from '../utils/path';
19
+ import { runHealthCheck, checkAgentHealth, checkAuthHealth } from '../utils/health';
20
+
21
+ // ============================================================================
22
+ // Types & Constants
23
+ // ============================================================================
24
+
25
/**
 * Escalation ladder for auto-recovery.
 *
 * Members are numbered 0..5 in declaration order; a higher value means a
 * more drastic intervention has already been attempted.
 */
export enum RecoveryStage {
  /** Lane is only being monitored; nothing has been attempted yet. */
  NORMAL = 0,
  /** First intervention: a continue signal has been sent. */
  CONTINUE_SIGNAL,
  /** Second intervention: a stronger prompt has been sent. */
  STRONGER_PROMPT,
  /** Third intervention: the process is killed and restarted. */
  RESTART_PROCESS,
  /** Final stage: run doctor-style diagnostics and report. */
  DIAGNOSE,
  /** Recovery is exhausted; nothing further will be tried. */
  ABORT,
}
40
+
41
/**
 * Tunables that control how quickly and how far auto-recovery escalates.
 * All durations are in milliseconds.
 */
export interface AutoRecoveryConfig {
  /**
   * How long a lane may be silent before the first continue signal is sent.
   * Default: 1 minute.
   */
  idleTimeoutMs: number;

  /**
   * How long to wait after a continue signal before escalating.
   * Default: 1 minute.
   */
  continueGraceMs: number;

  /**
   * How long to wait after the stronger prompt before escalating.
   * Default: 1 minute.
   */
  strongerPromptGraceMs: number;

  /** Upper bound on restarts before recovery gives up. Default: 2. */
  maxRestarts: number;

  /** Run doctor/health checks when failures persist. Default: true. */
  runDoctorOnFailure: boolean;

  /**
   * Output patterns that mark a long-running operation; while one matches,
   * the normal idle timeout is not applied.
   */
  longOperationPatterns: RegExp[];

  /** Idle allowance used during long operations. Default: 10 minutes. */
  longOperationGraceMs: number;

  /** Emit extra informational log lines. */
  verbose: boolean;
}
60
+
61
/**
 * Default auto-recovery configuration.
 *
 * The package-manager patterns are anchored with word boundaries so that
 * only real install commands trigger the long-operation grace period:
 * `npm init` / `pnpm info` no longer match by prefix, and the yarn pattern
 * matches `yarn install`, `yarn add` and a bare `yarn` at the end of a line
 * instead of every `yarn <anything>` invocation.
 */
export const DEFAULT_AUTO_RECOVERY_CONFIG: AutoRecoveryConfig = {
  idleTimeoutMs: 60 * 1000, // 1 minute - quick detection
  continueGraceMs: 60 * 1000, // 1 minute after continue
  strongerPromptGraceMs: 60 * 1000, // 1 minute after stronger prompt
  maxRestarts: 2,
  runDoctorOnFailure: true,
  longOperationPatterns: [
    /installing\s+dependencies/i,
    /\bnpm\s+(?:i|install|ci)\b/i,
    /\bpnpm\s+(?:i|install)\b/i,
    // Bare `yarn` (end of line) is an install; `m` lets `$` match per line.
    /\byarn(?:\s+(?:install|add)\b|\s*$)/im,
    /building/i,
    /compiling/i,
    /bundling/i,
    /downloading/i,
    /fetching/i,
    /cloning/i,
  ],
  longOperationGraceMs: 10 * 60 * 1000, // 10 minutes for long ops
  verbose: false,
};
83
+
84
/** Mutable bookkeeping that auto-recovery keeps for one lane. */
export interface LaneRecoveryState {
  /** Lane this record belongs to. */
  laneName: string;
  /** Current position on the escalation ladder. */
  stage: RecoveryStage;
  /** Epoch ms when `stage` was last changed. */
  lastStageChangeTime: number;

  /** Epoch ms of the most recently recorded activity. */
  lastActivityTime: number;
  /** Byte count of the most recent non-empty activity report. */
  lastBytesReceived: number;
  /** Running sum of all reported byte counts. */
  totalBytesReceived: number;
  /** Most recent non-empty output text reported for the lane. */
  lastOutput: string;
  /** Whether the latest output matched a long-operation pattern. */
  isLongOperation: boolean;

  /** Number of restarts requested so far. */
  restartCount: number;
  /** Number of continue signals written so far. */
  continueSignalsSent: number;

  /** Result of the last diagnosis, if one has been run. */
  diagnosticInfo?: DiagnosticInfo;
  /** Chronological log of every intervention attempted. */
  failureHistory: FailureRecord[];
}
99
+
100
/** Outcome of the doctor/health checks run for a failing lane. */
export interface DiagnosticInfo {
  /** Epoch ms at which the diagnosis was produced. */
  timestamp: number;
  /** Result of the agent health check. */
  agentHealthy: boolean;
  /** Result of the authentication health check. */
  authHealthy: boolean;
  /** Result of the overall system health check. */
  systemHealthy: boolean;
  /** Human-readable description of what was found. */
  details: string;
  /** Human-readable next step for the operator. */
  suggestedAction: string;
}
109
+
110
/** What a single recovery step did and what the caller should do next. */
export interface RecoveryActionResult {
  /** Identifier of the step taken (e.g. `'continue_signal'`, `'restart'`). */
  action: string;
  /** Whether the step itself was carried out without error. */
  success: boolean;
  /** Human-readable summary, suitable for logging. */
  message: string;
  /** `false` once diagnosis or abort has been reached; `true` otherwise. */
  shouldContinue: boolean;
  /** Stage the lane was moved to, when the step changed it. */
  nextStage?: RecoveryStage;
  /** Diagnostic payload, when one is available. */
  diagnostic?: DiagnosticInfo;
}
119
+
120
/** One intervention attempt, kept so a POF can replay what was tried. */
export interface FailureRecord {
  /** Epoch ms when the intervention was recorded. */
  timestamp: number;
  /** Escalation stage the intervention belongs to. */
  stage: RecoveryStage;
  /** Identifier of the intervention (e.g. `'stronger_prompt'`). */
  action: string;
  /** Short human-readable explanation. */
  message: string;
  /** How long the lane had been idle, in milliseconds. */
  idleTimeMs: number;
  /** Total bytes received from the lane at that moment. */
  bytesReceived: number;
  /** Last output seen from the lane at that moment. */
  lastOutput: string;
}
130
+
131
/**
 * POF (Post-mortem of Failure) document describing one failed run.
 *
 * NOTE(review): the string formats of `failureTime` / `detectedAt` are not
 * fixed by this declaration — confirm against the code that builds entries.
 */
export interface POFEntry {
  /** Headline for the post-mortem. */
  title: string;
  /** Identifier of the run the post-mortem belongs to. */
  runId: string;
  /** When the failure happened. */
  failureTime: string;
  /** When the failure was detected. */
  detectedAt: string;
  /** Short narrative of the failure. */
  summary: string;

  /** Best-known explanation of the failure. */
  rootCause: {
    type: string;
    description: string;
    symptoms: string[];
  };

  /** Lanes involved, each with the recovery steps that were attempted. */
  affectedLanes: {
    name: string;
    status: string;
    task: string;
    taskIndex: number;
    pid?: number;
    reason: string;
    recoveryAttempts: FailureRecord[];
  }[];

  /** Candidate explanations worth investigating. */
  possibleCauses: string[];

  /** How to resume, plus an optional alternative. */
  recovery: {
    command: string;
    description: string;
    alternativeCommand?: string;
    alternativeDescription?: string;
  };

  /** Earlier post-mortems carried along with this one, if any. */
  previousFailures?: POFEntry[];
}
161
+
162
+ // ============================================================================
163
+ // Guidance Messages for Git Issues
164
+ // ============================================================================
165
+
166
/**
 * Build the intervention text shown to an agent after a failed `git push`.
 *
 * @returns A multi-line message walking through fetch/rebase, conflict
 *          resolution and a retried push.
 */
export function getGitPushFailureGuidance(): string {
  const fence = '```';
  const guidanceLines = [
    '[SYSTEM INTERVENTION] Git push가 실패했습니다. 다음 단계를 수행해주세요:',
    '',
    '1. 먼저 원격 변경사항을 가져오세요:',
    `${fence}bash`,
    'git fetch origin',
    'git pull --rebase origin HEAD',
    fence,
    '',
    '2. 충돌이 발생하면 해결하세요:',
    '- 충돌 파일을 확인하고 수정',
    '- git add로 스테이징',
    '- git rebase --continue 실행',
    '',
    '3. 다시 푸시하세요:',
    `${fence}bash`,
    'git push origin HEAD',
    fence,
    '',
    '작업을 계속 진행해주세요.',
  ];
  return guidanceLines.join('\n');
}
188
+
189
/**
 * Build the intervention text shown to an agent that hit a merge conflict.
 *
 * @returns A multi-line message covering conflict inspection, manual
 *          resolution and the final stage/commit/push.
 */
export function getMergeConflictGuidance(): string {
  const fence = '```';
  const guidanceLines = [
    '[SYSTEM INTERVENTION] Merge conflict가 발생했습니다. 다음 단계를 수행해주세요:',
    '',
    '1. 충돌 파일 확인:',
    `${fence}bash`,
    'git status',
    fence,
    '',
    '2. 각 충돌 파일을 열어서 수동으로 해결:',
    '- <<<<<<< 와 >>>>>>> 사이의 내용을 확인',
    '- 적절한 코드를 선택하거나 병합',
    '- 충돌 마커 제거',
    '',
    '3. 해결 후 스테이징 및 커밋:',
    `${fence}bash`,
    'git add -A',
    'git commit -m "chore: resolve merge conflict"',
    'git push origin HEAD',
    fence,
    '',
    '작업을 계속 진행해주세요.',
  ];
  return guidanceLines.join('\n');
}
212
+
213
/**
 * Build the intervention text shown to an agent after a generic git error.
 *
 * @param errorMessage Raw error text; embedded verbatim on its own line.
 * @returns A multi-line message with the error followed by recovery hints.
 */
export function getGitErrorGuidance(errorMessage: string): string {
  const header = '[SYSTEM INTERVENTION] Git 작업 중 오류가 발생했습니다:';
  const steps = [
    '다음을 시도해주세요:',
    '1. git status로 현재 상태 확인',
    '2. 필요시 git reset --hard HEAD로 초기화',
    '3. 원격 저장소와 동기화: git fetch origin && git pull --rebase',
  ];
  const footer = '작업을 계속 진행해주세요.';

  return [header, errorMessage, '', ...steps, '', footer].join('\n');
}
225
+
226
+ // ============================================================================
227
+ // Recovery State Manager
228
+ // ============================================================================
229
+
230
+ /**
231
+ * Manages recovery state for all lanes
232
+ */
233
+ export class AutoRecoveryManager {
234
+ private config: AutoRecoveryConfig;
235
+ private laneStates: Map<string, LaneRecoveryState> = new Map();
236
+ private eventHandlers: Map<string, () => void> = new Map();
237
+
238
+ constructor(config: Partial<AutoRecoveryConfig> = {}) {
239
+ this.config = { ...DEFAULT_AUTO_RECOVERY_CONFIG, ...config };
240
+ }
241
+
242
+ /**
243
+ * Register a lane for recovery monitoring
244
+ */
245
+ registerLane(laneName: string): void {
246
+ const now = Date.now();
247
+ this.laneStates.set(laneName, {
248
+ laneName,
249
+ stage: RecoveryStage.NORMAL,
250
+ lastActivityTime: now,
251
+ lastBytesReceived: 0,
252
+ totalBytesReceived: 0,
253
+ lastOutput: '',
254
+ restartCount: 0,
255
+ continueSignalsSent: 0,
256
+ lastStageChangeTime: now,
257
+ isLongOperation: false,
258
+ failureHistory: [],
259
+ });
260
+
261
+ if (this.config.verbose) {
262
+ logger.info(`[AutoRecovery] Registered lane: ${laneName}`);
263
+ }
264
+ }
265
+
266
+ /**
267
+ * Unregister a lane from recovery monitoring
268
+ */
269
+ unregisterLane(laneName: string): void {
270
+ this.laneStates.delete(laneName);
271
+
272
+ const handler = this.eventHandlers.get(laneName);
273
+ if (handler) {
274
+ this.eventHandlers.delete(laneName);
275
+ }
276
+ }
277
+
278
+ /**
279
+ * Record activity for a lane
280
+ */
281
+ recordActivity(laneName: string, bytesReceived: number = 0, output?: string): void {
282
+ const state = this.laneStates.get(laneName);
283
+ if (!state) return;
284
+
285
+ const now = Date.now();
286
+ state.lastActivityTime = now;
287
+
288
+ if (bytesReceived > 0) {
289
+ state.lastBytesReceived = bytesReceived;
290
+ state.totalBytesReceived += bytesReceived;
291
+ }
292
+
293
+ if (output) {
294
+ state.lastOutput = output;
295
+ // Check if this is a long operation
296
+ state.isLongOperation = this.config.longOperationPatterns.some(p => p.test(output));
297
+ }
298
+
299
+ // Reset stage if we got meaningful activity
300
+ if (bytesReceived > 0 && state.stage !== RecoveryStage.NORMAL) {
301
+ if (this.config.verbose) {
302
+ logger.info(`[AutoRecovery] [${laneName}] Activity detected, resetting to NORMAL stage`);
303
+ }
304
+ state.stage = RecoveryStage.NORMAL;
305
+ state.lastStageChangeTime = now;
306
+ }
307
+ }
308
+
309
+ /**
310
+ * Get current recovery state for a lane
311
+ */
312
+ getState(laneName: string): LaneRecoveryState | undefined {
313
+ return this.laneStates.get(laneName);
314
+ }
315
+
316
+ /**
317
+ * Check if a lane needs recovery intervention
318
+ */
319
+ needsIntervention(laneName: string): boolean {
320
+ const state = this.laneStates.get(laneName);
321
+ if (!state) return false;
322
+
323
+ const now = Date.now();
324
+ const idleTime = now - state.lastActivityTime;
325
+
326
+ // Use longer timeout for long operations
327
+ const effectiveTimeout = state.isLongOperation
328
+ ? this.config.longOperationGraceMs
329
+ : this.config.idleTimeoutMs;
330
+
331
+ // Check based on current stage
332
+ switch (state.stage) {
333
+ case RecoveryStage.NORMAL:
334
+ return idleTime > effectiveTimeout;
335
+
336
+ case RecoveryStage.CONTINUE_SIGNAL:
337
+ return (now - state.lastStageChangeTime) > this.config.continueGraceMs;
338
+
339
+ case RecoveryStage.STRONGER_PROMPT:
340
+ return (now - state.lastStageChangeTime) > this.config.strongerPromptGraceMs;
341
+
342
+ case RecoveryStage.RESTART_PROCESS:
343
+ // After restart, use normal timeout to detect if it's working
344
+ return idleTime > effectiveTimeout;
345
+
346
+ case RecoveryStage.DIAGNOSE:
347
+ case RecoveryStage.ABORT:
348
+ return false; // No more interventions
349
+
350
+ default:
351
+ return false;
352
+ }
353
+ }
354
+
355
+ /**
356
+ * Get the next recovery action for a lane
357
+ */
358
+ async getRecoveryAction(
359
+ laneName: string,
360
+ laneRunDir: string,
361
+ child?: ChildProcess
362
+ ): Promise<RecoveryActionResult> {
363
+ const state = this.laneStates.get(laneName);
364
+ if (!state) {
365
+ return {
366
+ success: false,
367
+ action: 'none',
368
+ message: 'Lane not registered',
369
+ shouldContinue: false,
370
+ };
371
+ }
372
+
373
+ const now = Date.now();
374
+ const idleTime = now - state.lastActivityTime;
375
+ const idleSeconds = Math.round(idleTime / 1000);
376
+
377
+ switch (state.stage) {
378
+ case RecoveryStage.NORMAL:
379
+ // Escalate to CONTINUE_SIGNAL
380
+ return await this.sendContinueSignal(laneName, laneRunDir, state, idleSeconds);
381
+
382
+ case RecoveryStage.CONTINUE_SIGNAL:
383
+ // Try a stronger prompt
384
+ return await this.sendStrongerPrompt(laneName, laneRunDir, state);
385
+
386
+ case RecoveryStage.STRONGER_PROMPT:
387
+ // Try restarting the process
388
+ if (state.restartCount < this.config.maxRestarts) {
389
+ return await this.requestRestart(laneName, state, child);
390
+ }
391
+ // Fall through to diagnose
392
+ state.stage = RecoveryStage.DIAGNOSE;
393
+ state.lastStageChangeTime = now;
394
+ return await this.runDiagnosis(laneName, laneRunDir, state);
395
+
396
+ case RecoveryStage.RESTART_PROCESS:
397
+ // After restart, if still no response, diagnose
398
+ if (state.restartCount >= this.config.maxRestarts) {
399
+ state.stage = RecoveryStage.DIAGNOSE;
400
+ state.lastStageChangeTime = now;
401
+ return await this.runDiagnosis(laneName, laneRunDir, state);
402
+ }
403
+ // Try continue signal again after restart
404
+ return await this.sendContinueSignal(laneName, laneRunDir, state, idleSeconds);
405
+
406
+ case RecoveryStage.DIAGNOSE:
407
+ // Final stage - abort
408
+ state.stage = RecoveryStage.ABORT;
409
+ state.lastStageChangeTime = now;
410
+ return {
411
+ success: false,
412
+ action: 'abort',
413
+ message: `Lane ${laneName} failed after all recovery attempts`,
414
+ shouldContinue: false,
415
+ nextStage: RecoveryStage.ABORT,
416
+ diagnostic: state.diagnosticInfo,
417
+ };
418
+
419
+ default:
420
+ return {
421
+ success: false,
422
+ action: 'abort',
423
+ message: 'Recovery exhausted',
424
+ shouldContinue: false,
425
+ };
426
+ }
427
+ }
428
+
429
+ /**
430
+ * Send a continue signal to the lane
431
+ */
432
+ private async sendContinueSignal(
433
+ laneName: string,
434
+ laneRunDir: string,
435
+ state: LaneRecoveryState,
436
+ idleSeconds: number
437
+ ): Promise<RecoveryActionResult> {
438
+ const interventionPath = safeJoin(laneRunDir, 'intervention.txt');
439
+
440
+ try {
441
+ fs.writeFileSync(interventionPath, 'continue');
442
+
443
+ state.stage = RecoveryStage.CONTINUE_SIGNAL;
444
+ state.lastStageChangeTime = Date.now();
445
+ state.continueSignalsSent++;
446
+
447
+ // Record failure history
448
+ state.failureHistory.push({
449
+ timestamp: Date.now(),
450
+ stage: RecoveryStage.CONTINUE_SIGNAL,
451
+ action: 'continue_signal',
452
+ message: `Idle for ${idleSeconds}s`,
453
+ idleTimeMs: idleSeconds * 1000,
454
+ bytesReceived: state.totalBytesReceived,
455
+ lastOutput: state.lastOutput,
456
+ });
457
+
458
+ const message = `[${laneName}] Idle for ${idleSeconds}s - sent continue signal (#${state.continueSignalsSent})`;
459
+ logger.warn(message);
460
+
461
+ events.emit('recovery.continue_signal', {
462
+ laneName,
463
+ idleSeconds,
464
+ signalCount: state.continueSignalsSent,
465
+ });
466
+
467
+ return {
468
+ success: true,
469
+ action: 'continue_signal',
470
+ message,
471
+ shouldContinue: true,
472
+ nextStage: RecoveryStage.CONTINUE_SIGNAL,
473
+ };
474
+ } catch (error: any) {
475
+ logger.error(`[AutoRecovery] Failed to send continue signal to ${laneName}: ${error.message}`);
476
+ return {
477
+ success: false,
478
+ action: 'continue_signal',
479
+ message: `Failed to send continue signal: ${error.message}`,
480
+ shouldContinue: true,
481
+ };
482
+ }
483
+ }
484
+
485
+ /**
486
+ * Send a stronger prompt to nudge the agent
487
+ */
488
+ private async sendStrongerPrompt(
489
+ laneName: string,
490
+ laneRunDir: string,
491
+ state: LaneRecoveryState
492
+ ): Promise<RecoveryActionResult> {
493
+ const interventionPath = safeJoin(laneRunDir, 'intervention.txt');
494
+
495
+ const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck or waiting.
496
+ Please continue with your current task immediately.
497
+ If you're waiting for something, explain what you need and proceed with what you can do now.
498
+ If you've completed the task, please summarize your work and finish.
499
+ If you encountered a git error, resolve it and continue.`;
500
+
501
+ try {
502
+ fs.writeFileSync(interventionPath, strongerPrompt);
503
+
504
+ state.stage = RecoveryStage.STRONGER_PROMPT;
505
+ state.lastStageChangeTime = Date.now();
506
+
507
+ // Record failure history
508
+ state.failureHistory.push({
509
+ timestamp: Date.now(),
510
+ stage: RecoveryStage.STRONGER_PROMPT,
511
+ action: 'stronger_prompt',
512
+ message: 'Still idle after continue signal',
513
+ idleTimeMs: Date.now() - state.lastActivityTime,
514
+ bytesReceived: state.totalBytesReceived,
515
+ lastOutput: state.lastOutput,
516
+ });
517
+
518
+ const message = `[${laneName}] Still idle after continue signal - sent stronger prompt`;
519
+ logger.warn(message);
520
+
521
+ events.emit('recovery.stronger_prompt', {
522
+ laneName,
523
+ prompt: strongerPrompt,
524
+ });
525
+
526
+ return {
527
+ success: true,
528
+ action: 'stronger_prompt',
529
+ message,
530
+ shouldContinue: true,
531
+ nextStage: RecoveryStage.STRONGER_PROMPT,
532
+ };
533
+ } catch (error: any) {
534
+ logger.error(`[AutoRecovery] Failed to send stronger prompt to ${laneName}: ${error.message}`);
535
+ return {
536
+ success: false,
537
+ action: 'stronger_prompt',
538
+ message: `Failed to send stronger prompt: ${error.message}`,
539
+ shouldContinue: true,
540
+ };
541
+ }
542
+ }
543
+
544
+ /**
545
+ * Request process restart
546
+ */
547
+ private async requestRestart(
548
+ laneName: string,
549
+ state: LaneRecoveryState,
550
+ child?: ChildProcess
551
+ ): Promise<RecoveryActionResult> {
552
+ state.restartCount++;
553
+ state.stage = RecoveryStage.RESTART_PROCESS;
554
+ state.lastStageChangeTime = Date.now();
555
+
556
+ // Record failure history
557
+ state.failureHistory.push({
558
+ timestamp: Date.now(),
559
+ stage: RecoveryStage.RESTART_PROCESS,
560
+ action: 'restart',
561
+ message: `Restart attempt ${state.restartCount}/${this.config.maxRestarts}`,
562
+ idleTimeMs: Date.now() - state.lastActivityTime,
563
+ bytesReceived: state.totalBytesReceived,
564
+ lastOutput: state.lastOutput,
565
+ });
566
+
567
+ // Kill the current process if provided
568
+ if (child && child.pid && !child.killed) {
569
+ try {
570
+ child.kill('SIGKILL');
571
+ logger.info(`[AutoRecovery] [${laneName}] Killed process ${child.pid}`);
572
+ } catch (error: any) {
573
+ logger.warn(`[AutoRecovery] [${laneName}] Failed to kill process: ${error.message}`);
574
+ }
575
+ }
576
+
577
+ const message = `[${laneName}] Restarting lane (attempt ${state.restartCount}/${this.config.maxRestarts})`;
578
+ logger.warn(message);
579
+
580
+ events.emit('recovery.restart', {
581
+ laneName,
582
+ restartCount: state.restartCount,
583
+ maxRestarts: this.config.maxRestarts,
584
+ });
585
+
586
+ return {
587
+ success: true,
588
+ action: 'restart',
589
+ message,
590
+ shouldContinue: true,
591
+ nextStage: RecoveryStage.RESTART_PROCESS,
592
+ };
593
+ }
594
+
595
+ /**
596
+ * Run diagnostic checks
597
+ */
598
+ private async runDiagnosis(
599
+ laneName: string,
600
+ laneRunDir: string,
601
+ state: LaneRecoveryState
602
+ ): Promise<RecoveryActionResult> {
603
+ if (!this.config.runDoctorOnFailure) {
604
+ return {
605
+ success: false,
606
+ action: 'diagnose',
607
+ message: 'Diagnosis skipped (disabled in config)',
608
+ shouldContinue: false,
609
+ };
610
+ }
611
+
612
+ logger.info(`[AutoRecovery] [${laneName}] Running diagnostic checks...`);
613
+
614
+ try {
615
+ // Run health checks
616
+ const [agentHealth, authHealth] = await Promise.all([
617
+ checkAgentHealth(),
618
+ checkAuthHealth(),
619
+ ]);
620
+
621
+ const systemHealth = await runHealthCheck({ skipRemote: true, skipAuth: true });
622
+
623
+ const diagnostic: DiagnosticInfo = {
624
+ timestamp: Date.now(),
625
+ agentHealthy: agentHealth.ok,
626
+ authHealthy: authHealth.ok,
627
+ systemHealthy: systemHealth.healthy,
628
+ suggestedAction: '',
629
+ details: '',
630
+ };
631
+
632
+ // Analyze and suggest action
633
+ const issues: string[] = [];
634
+
635
+ if (!agentHealth.ok) {
636
+ issues.push(`Agent: ${agentHealth.message}`);
637
+ }
638
+
639
+ if (!authHealth.ok) {
640
+ issues.push(`Auth: ${authHealth.message}`);
641
+ diagnostic.suggestedAction = 'Please sign in to Cursor IDE and verify authentication';
642
+ }
643
+
644
+ if (!systemHealth.healthy) {
645
+ const failedChecks = systemHealth.checks.filter(c => !c.ok);
646
+ issues.push(`System: ${failedChecks.map(c => c.message).join(', ')}`);
647
+ }
648
+
649
+ if (issues.length === 0) {
650
+ diagnostic.details = 'All health checks passed. The issue may be with the AI model or network.';
651
+ diagnostic.suggestedAction = 'Try resuming with a different model or wait and retry.';
652
+ } else {
653
+ diagnostic.details = issues.join('\n');
654
+ }
655
+
656
+ state.diagnosticInfo = diagnostic;
657
+
658
+ // Record failure history
659
+ state.failureHistory.push({
660
+ timestamp: Date.now(),
661
+ stage: RecoveryStage.DIAGNOSE,
662
+ action: 'diagnose',
663
+ message: diagnostic.details,
664
+ idleTimeMs: Date.now() - state.lastActivityTime,
665
+ bytesReceived: state.totalBytesReceived,
666
+ lastOutput: state.lastOutput,
667
+ });
668
+
669
+ // Save diagnostic to file
670
+ const diagnosticPath = safeJoin(laneRunDir, 'diagnostic.json');
671
+ fs.writeFileSync(diagnosticPath, JSON.stringify(diagnostic, null, 2));
672
+
673
+ const message = `[${laneName}] Diagnostic complete:\n${diagnostic.details}\nSuggested action: ${diagnostic.suggestedAction}`;
674
+ logger.error(message);
675
+
676
+ events.emit('recovery.diagnosed', {
677
+ laneName,
678
+ diagnostic,
679
+ });
680
+
681
+ return {
682
+ success: true,
683
+ action: 'diagnose',
684
+ message,
685
+ shouldContinue: false,
686
+ diagnostic,
687
+ };
688
+ } catch (error: any) {
689
+ logger.error(`[AutoRecovery] Diagnostic failed: ${error.message}`);
690
+ return {
691
+ success: false,
692
+ action: 'diagnose',
693
+ message: `Diagnostic failed: ${error.message}`,
694
+ shouldContinue: false,
695
+ };
696
+ }
697
+ }
698
+
699
/**
 * Look up the failure records collected for one lane.
 *
 * @param laneName - Key of the lane in `this.laneStates`.
 * @returns The lane's `failureHistory` array (returned by reference, not
 *          copied), or a new empty array when the lane has no tracked state
 *          or no history.
 */
getFailureHistory(laneName: string): FailureRecord[] {
  const laneRecovery = this.laneStates.get(laneName);
  if (!laneRecovery) {
    return [];
  }
  return laneRecovery.failureHistory || [];
}
706
+
707
/**
 * Return the current configuration.
 *
 * @returns A shallow copy of `this.config`, so reassigning top-level keys on
 *          the result does not touch the manager's own config object.
 */
getConfig(): AutoRecoveryConfig {
  const snapshot: AutoRecoveryConfig = Object.assign({}, this.config);
  return snapshot;
}
713
+
714
/**
 * Merge a partial configuration into the current one.
 *
 * Keys present in `config` override the existing values; all other keys are
 * kept. The merged result is stored as a new object in `this.config`.
 *
 * @param config - Subset of configuration keys to override.
 */
updateConfig(config: Partial<AutoRecoveryConfig>): void {
  const merged = Object.assign({}, this.config, config);
  this.config = merged;
}
720
+ }
721
+
722
+ // ============================================================================
723
+ // POF (Post-mortem of Failure) Management
724
+ // ============================================================================
725
+
726
/**
 * Save a POF (post-mortem) entry as `pof-<runId>.json` inside `pofDir`.
 *
 * If a readable POF object already exists at that path, it is placed at the
 * front of the written entry's `previousFailures` list. The caller's `entry`
 * object is not modified; a merged copy is serialized instead.
 *
 * The file is written to a temporary sibling path and then renamed over the
 * target, so readers never observe a partially written file.
 *
 * @param runId - Run identifier used in the file name.
 * @param pofDir - Directory that holds POF files; created if missing.
 * @param entry - The post-mortem entry to persist.
 * @returns The path of the written POF file.
 * @throws Re-throws any error raised while writing or renaming the file
 *         (after a best-effort removal of the temporary file).
 */
export function savePOF(
  runId: string,
  pofDir: string,
  entry: POFEntry
): string {
  // Ensure pof directory exists
  if (!fs.existsSync(pofDir)) {
    fs.mkdirSync(pofDir, { recursive: true });
  }

  const pofPath = safeJoin(pofDir, `pof-${runId}.json`);

  let existingPOF: POFEntry | null = null;
  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(pofPath, 'utf8'));
    // Only a JSON object can be a previous POF; ignore arrays/primitives
    // so stray content is never recorded as a "previous failure".
    if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
      existingPOF = parsed as POFEntry;
    }
  } catch {
    // File doesn't exist or is invalid JSON - start without history
  }

  // Build the object to write without mutating the caller's entry.
  const entryToWrite: POFEntry = existingPOF
    ? { ...entry, previousFailures: [existingPOF, ...(entry.previousFailures ?? [])] }
    : entry;

  // Atomic write: write to a temp file, then rename over the target
  const tempPath = `${pofPath}.${Math.random().toString(36).substring(2, 7)}.tmp`;
  try {
    fs.writeFileSync(tempPath, JSON.stringify(entryToWrite, null, 2), 'utf8');
    fs.renameSync(tempPath, pofPath);
  } catch (err) {
    // Best-effort cleanup of the temp file; the original error is what matters
    try { if (fs.existsSync(tempPath)) fs.unlinkSync(tempPath); } catch { /* ignore */ }
    throw err;
  }

  logger.info(`[POF] Saved post-mortem to ${pofPath}`);

  return pofPath;
}
770
+
771
/**
 * Build a POF (post-mortem) entry for a failed lane from its recovery state.
 *
 * Root cause classification:
 * - `AGENT_NO_RESPONSE` when no bytes were ever received, and also as the
 *   default when neither rule below matches;
 * - `ZOMBIE_PROCESS` when bytes were received but the lane was restarted at
 *   least twice.
 *
 * @param runId - Run identifier stored on the entry.
 * @param runDir - Run directory inserted into the suggested resume commands;
 *                 wrapped in double quotes when it contains whitespace.
 * @param laneName - Name of the failed lane.
 * @param state - Recovery bookkeeping for the lane (byte count, restart count,
 *                continue signals, last output, failure history).
 * @param laneState - Lane progress info, or `null` when unavailable (the task
 *                    label is then `'unknown'` and the task index `0`).
 * @param diagnostic - Optional health-check result; unhealthy agent/auth
 *                     findings are put at the front of `possibleCauses`.
 * @returns A new POF entry; `recoveryAttempts` is a copy of the failure
 *          history taken at call time.
 */
export function createPOFFromRecoveryState(
  runId: string,
  runDir: string,
  laneName: string,
  state: LaneRecoveryState,
  laneState: LaneState | null,
  diagnostic?: DiagnosticInfo
): POFEntry {
  const now = new Date();

  // Determine root cause type
  let rootCauseType = 'AGENT_NO_RESPONSE';
  let rootCauseDescription = 'Agent stopped responding and did not recover after multiple interventions';
  const symptoms: string[] = [];

  if (state.totalBytesReceived === 0) {
    rootCauseDescription = 'Agent produced 0 bytes of output - possible API or network issue';
    symptoms.push('No bytes received from agent');
  } else if (state.restartCount >= 2) {
    rootCauseType = 'ZOMBIE_PROCESS';
    rootCauseDescription = 'Lane processes repeatedly failed to make progress after restarts';
    symptoms.push(`Restarted ${state.restartCount} times without success`);
  }

  // Tolerate a missing last output, and only add an ellipsis when the text
  // was actually cut, so the symptom does not suggest truncation that never happened.
  const maxPreviewLength = 100;
  const lastOutput: string = state.lastOutput ?? '';
  const lastOutputPreview = lastOutput.length > maxPreviewLength
    ? `${lastOutput.substring(0, maxPreviewLength)}...`
    : lastOutput;

  symptoms.push(`Total bytes received: ${state.totalBytesReceived}`);
  symptoms.push(`Continue signals sent: ${state.continueSignalsSent}`);
  symptoms.push(`Last output: ${lastOutputPreview}`);

  // Generic causes first; diagnostic findings are moved to the front below
  const possibleCauses: string[] = [
    'Model API rate limiting or quota exceeded',
    'Cursor authentication token expired',
    'Network connectivity issues',
    'Agent process hung waiting for stdin/stdout',
  ];

  if (diagnostic) {
    if (!diagnostic.agentHealthy) {
      possibleCauses.unshift('cursor-agent CLI is not responding properly');
    }
    if (!diagnostic.authHealthy) {
      possibleCauses.unshift('Cursor authentication failed or expired');
    }
  }

  // The commands are meant to be pasted into a shell: quote the path when it
  // contains whitespace so it stays a single argument.
  const runDirArg = /\s/.test(runDir) ? `"${runDir}"` : runDir;

  const entry: POFEntry = {
    title: 'Run Failure Post-mortem',
    runId,
    failureTime: now.toISOString(),
    detectedAt: now.toISOString(),
    summary: `Lane ${laneName} failed after ${state.restartCount} restart(s) and ${state.continueSignalsSent} continue signal(s)`,
    rootCause: {
      type: rootCauseType,
      description: rootCauseDescription,
      symptoms,
    },
    affectedLanes: [
      {
        name: laneName,
        status: 'failed',
        task: laneState ? `[${(laneState.currentTaskIndex || 0) + 1}/${laneState.totalTasks}]` : 'unknown',
        taskIndex: laneState?.currentTaskIndex || 0,
        pid: laneState?.pid,
        reason: rootCauseDescription,
        // Copy so later pushes to the live history do not alter this entry
        recoveryAttempts: [...state.failureHistory],
      },
    ],
    possibleCauses,
    recovery: {
      command: `cursorflow resume --all --run-dir ${runDirArg}`,
      description: 'Resume all failed lanes from their last checkpoint',
      alternativeCommand: `cursorflow resume --all --restart --run-dir ${runDirArg}`,
      alternativeDescription: 'Restart all failed lanes from the beginning',
    },
  };

  return entry;
}
853
+
854
/**
 * Load the POF (post-mortem) entry stored for a run.
 *
 * Reads `pof-<runId>.json` from `pofDir`. Read or parse failures, and content
 * that is not a JSON object, are logged as warnings and reported as `null`
 * rather than thrown.
 *
 * @param pofDir - Directory that holds POF files.
 * @param runId - Run identifier used in the file name.
 * @returns The parsed entry, or `null` when the file is missing, unreadable,
 *          not valid JSON, or not a JSON object.
 */
export function loadPOF(pofDir: string, runId: string): POFEntry | null {
  const pofPath = safeJoin(pofDir, `pof-${runId}.json`);

  if (!fs.existsSync(pofPath)) {
    return null;
  }

  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(pofPath, 'utf8'));
    // A POF entry is an object; never hand callers an array or primitive
    // typed as POFEntry.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      logger.warn(`[POF] Failed to load POF from ${pofPath}: content is not a JSON object`);
      return null;
    }
    return parsed as POFEntry;
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances
    const reason = error instanceof Error ? error.message : String(error);
    logger.warn(`[POF] Failed to load POF from ${pofPath}: ${reason}`);
    return null;
  }
}
871
+
872
/**
 * Collect the paths of all POF files found directly inside a directory.
 *
 * A directory entry counts as a POF file when its name starts with `pof-`
 * and ends with `.json`.
 *
 * @param pofDir - Directory to scan.
 * @returns Joined paths of the matching entries, in the order the directory
 *          listing returned them; an empty array when `pofDir` does not exist.
 */
export function listPOFs(pofDir: string): string[] {
  const pofPaths: string[] = [];

  if (fs.existsSync(pofDir)) {
    for (const entryName of fs.readdirSync(pofDir)) {
      const looksLikePOF = entryName.startsWith('pof-') && entryName.endsWith('.json');
      if (looksLikePOF) {
        pofPaths.push(safeJoin(pofDir, entryName));
      }
    }
  }

  return pofPaths;
}
884
+
885
+ // ============================================================================
886
+ // Exports
887
+ // ============================================================================
888
+
889
/** Shared manager instance; `null` until one has been created. */
let defaultManager: AutoRecoveryManager | null = null;

/**
 * Return the shared auto-recovery manager, creating it on first use.
 *
 * @param config - Optional configuration overrides. On first use they are
 *                 passed to the `AutoRecoveryManager` constructor; when a
 *                 manager already exists they are applied to it through
 *                 `updateConfig`.
 * @returns The shared manager instance.
 */
export function getAutoRecoveryManager(config?: Partial<AutoRecoveryConfig>): AutoRecoveryManager {
  if (defaultManager === null) {
    defaultManager = new AutoRecoveryManager(config);
    return defaultManager;
  }

  if (config) {
    defaultManager.updateConfig(config);
  }
  return defaultManager;
}
903
+
904
/**
 * Drop the shared manager instance (intended for tests).
 *
 * Clears `defaultManager`; nothing else is done to the previous instance.
 */
export function resetAutoRecoveryManager(): void {
  defaultManager = null;
}