@litmers/cursorflow-orchestrator 0.1.20 → 0.1.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (224) hide show
  1. package/CHANGELOG.md +20 -0
  2. package/commands/cursorflow-clean.md +19 -0
  3. package/commands/cursorflow-runs.md +59 -0
  4. package/commands/cursorflow-stop.md +55 -0
  5. package/dist/cli/clean.js +171 -0
  6. package/dist/cli/clean.js.map +1 -1
  7. package/dist/cli/index.js +7 -0
  8. package/dist/cli/index.js.map +1 -1
  9. package/dist/cli/init.js +1 -1
  10. package/dist/cli/init.js.map +1 -1
  11. package/dist/cli/logs.js +83 -42
  12. package/dist/cli/logs.js.map +1 -1
  13. package/dist/cli/monitor.d.ts +7 -0
  14. package/dist/cli/monitor.js +1007 -189
  15. package/dist/cli/monitor.js.map +1 -1
  16. package/dist/cli/prepare.js +87 -3
  17. package/dist/cli/prepare.js.map +1 -1
  18. package/dist/cli/resume.js +188 -236
  19. package/dist/cli/resume.js.map +1 -1
  20. package/dist/cli/run.js +125 -3
  21. package/dist/cli/run.js.map +1 -1
  22. package/dist/cli/runs.d.ts +5 -0
  23. package/dist/cli/runs.js +214 -0
  24. package/dist/cli/runs.js.map +1 -0
  25. package/dist/cli/setup-commands.js +0 -0
  26. package/dist/cli/signal.js +1 -1
  27. package/dist/cli/signal.js.map +1 -1
  28. package/dist/cli/stop.d.ts +5 -0
  29. package/dist/cli/stop.js +215 -0
  30. package/dist/cli/stop.js.map +1 -0
  31. package/dist/cli/tasks.d.ts +10 -0
  32. package/dist/cli/tasks.js +165 -0
  33. package/dist/cli/tasks.js.map +1 -0
  34. package/dist/core/auto-recovery.d.ts +212 -0
  35. package/dist/core/auto-recovery.js +737 -0
  36. package/dist/core/auto-recovery.js.map +1 -0
  37. package/dist/core/failure-policy.d.ts +156 -0
  38. package/dist/core/failure-policy.js +488 -0
  39. package/dist/core/failure-policy.js.map +1 -0
  40. package/dist/core/orchestrator.d.ts +15 -2
  41. package/dist/core/orchestrator.js +397 -15
  42. package/dist/core/orchestrator.js.map +1 -1
  43. package/dist/core/reviewer.d.ts +2 -0
  44. package/dist/core/reviewer.js +2 -0
  45. package/dist/core/reviewer.js.map +1 -1
  46. package/dist/core/runner.d.ts +33 -10
  47. package/dist/core/runner.js +321 -146
  48. package/dist/core/runner.js.map +1 -1
  49. package/dist/services/logging/buffer.d.ts +67 -0
  50. package/dist/services/logging/buffer.js +309 -0
  51. package/dist/services/logging/buffer.js.map +1 -0
  52. package/dist/services/logging/console.d.ts +89 -0
  53. package/dist/services/logging/console.js +169 -0
  54. package/dist/services/logging/console.js.map +1 -0
  55. package/dist/services/logging/file-writer.d.ts +71 -0
  56. package/dist/services/logging/file-writer.js +516 -0
  57. package/dist/services/logging/file-writer.js.map +1 -0
  58. package/dist/services/logging/formatter.d.ts +39 -0
  59. package/dist/services/logging/formatter.js +227 -0
  60. package/dist/services/logging/formatter.js.map +1 -0
  61. package/dist/services/logging/index.d.ts +11 -0
  62. package/dist/services/logging/index.js +30 -0
  63. package/dist/services/logging/index.js.map +1 -0
  64. package/dist/services/logging/parser.d.ts +31 -0
  65. package/dist/services/logging/parser.js +222 -0
  66. package/dist/services/logging/parser.js.map +1 -0
  67. package/dist/services/process/index.d.ts +59 -0
  68. package/dist/services/process/index.js +257 -0
  69. package/dist/services/process/index.js.map +1 -0
  70. package/dist/types/agent.d.ts +20 -0
  71. package/dist/types/agent.js +6 -0
  72. package/dist/types/agent.js.map +1 -0
  73. package/dist/types/config.d.ts +65 -0
  74. package/dist/types/config.js +6 -0
  75. package/dist/types/config.js.map +1 -0
  76. package/dist/types/events.d.ts +125 -0
  77. package/dist/types/events.js +6 -0
  78. package/dist/types/events.js.map +1 -0
  79. package/dist/types/index.d.ts +12 -0
  80. package/dist/types/index.js +37 -0
  81. package/dist/types/index.js.map +1 -0
  82. package/dist/types/lane.d.ts +43 -0
  83. package/dist/types/lane.js +6 -0
  84. package/dist/types/lane.js.map +1 -0
  85. package/dist/types/logging.d.ts +71 -0
  86. package/dist/types/logging.js +16 -0
  87. package/dist/types/logging.js.map +1 -0
  88. package/dist/types/review.d.ts +17 -0
  89. package/dist/types/review.js +6 -0
  90. package/dist/types/review.js.map +1 -0
  91. package/dist/types/run.d.ts +32 -0
  92. package/dist/types/run.js +6 -0
  93. package/dist/types/run.js.map +1 -0
  94. package/dist/types/task.d.ts +71 -0
  95. package/dist/types/task.js +6 -0
  96. package/dist/types/task.js.map +1 -0
  97. package/dist/ui/components.d.ts +134 -0
  98. package/dist/ui/components.js +389 -0
  99. package/dist/ui/components.js.map +1 -0
  100. package/dist/ui/log-viewer.d.ts +49 -0
  101. package/dist/ui/log-viewer.js +449 -0
  102. package/dist/ui/log-viewer.js.map +1 -0
  103. package/dist/utils/checkpoint.d.ts +87 -0
  104. package/dist/utils/checkpoint.js +317 -0
  105. package/dist/utils/checkpoint.js.map +1 -0
  106. package/dist/utils/config.d.ts +4 -0
  107. package/dist/utils/config.js +11 -2
  108. package/dist/utils/config.js.map +1 -1
  109. package/dist/utils/cursor-agent.js.map +1 -1
  110. package/dist/utils/dependency.d.ts +74 -0
  111. package/dist/utils/dependency.js +420 -0
  112. package/dist/utils/dependency.js.map +1 -0
  113. package/dist/utils/doctor.js +10 -5
  114. package/dist/utils/doctor.js.map +1 -1
  115. package/dist/utils/enhanced-logger.d.ts +10 -33
  116. package/dist/utils/enhanced-logger.js +94 -9
  117. package/dist/utils/enhanced-logger.js.map +1 -1
  118. package/dist/utils/git.d.ts +121 -0
  119. package/dist/utils/git.js +322 -2
  120. package/dist/utils/git.js.map +1 -1
  121. package/dist/utils/health.d.ts +91 -0
  122. package/dist/utils/health.js +556 -0
  123. package/dist/utils/health.js.map +1 -0
  124. package/dist/utils/lock.d.ts +95 -0
  125. package/dist/utils/lock.js +332 -0
  126. package/dist/utils/lock.js.map +1 -0
  127. package/dist/utils/log-buffer.d.ts +17 -0
  128. package/dist/utils/log-buffer.js +14 -0
  129. package/dist/utils/log-buffer.js.map +1 -0
  130. package/dist/utils/log-constants.d.ts +23 -0
  131. package/dist/utils/log-constants.js +28 -0
  132. package/dist/utils/log-constants.js.map +1 -0
  133. package/dist/utils/log-formatter.d.ts +9 -0
  134. package/dist/utils/log-formatter.js +113 -70
  135. package/dist/utils/log-formatter.js.map +1 -1
  136. package/dist/utils/log-service.d.ts +19 -0
  137. package/dist/utils/log-service.js +47 -0
  138. package/dist/utils/log-service.js.map +1 -0
  139. package/dist/utils/logger.d.ts +46 -27
  140. package/dist/utils/logger.js +82 -60
  141. package/dist/utils/logger.js.map +1 -1
  142. package/dist/utils/process-manager.d.ts +21 -0
  143. package/dist/utils/process-manager.js +138 -0
  144. package/dist/utils/process-manager.js.map +1 -0
  145. package/dist/utils/retry.d.ts +121 -0
  146. package/dist/utils/retry.js +374 -0
  147. package/dist/utils/retry.js.map +1 -0
  148. package/dist/utils/run-service.d.ts +88 -0
  149. package/dist/utils/run-service.js +412 -0
  150. package/dist/utils/run-service.js.map +1 -0
  151. package/dist/utils/state.d.ts +58 -2
  152. package/dist/utils/state.js +306 -3
  153. package/dist/utils/state.js.map +1 -1
  154. package/dist/utils/task-service.d.ts +82 -0
  155. package/dist/utils/task-service.js +348 -0
  156. package/dist/utils/task-service.js.map +1 -0
  157. package/dist/utils/types.d.ts +2 -272
  158. package/dist/utils/types.js +16 -0
  159. package/dist/utils/types.js.map +1 -1
  160. package/package.json +38 -23
  161. package/scripts/ai-security-check.js +0 -1
  162. package/scripts/local-security-gate.sh +0 -0
  163. package/scripts/monitor-lanes.sh +94 -0
  164. package/scripts/patches/test-cursor-agent.js +0 -1
  165. package/scripts/release.sh +0 -0
  166. package/scripts/setup-security.sh +0 -0
  167. package/scripts/stream-logs.sh +72 -0
  168. package/scripts/verify-and-fix.sh +0 -0
  169. package/src/cli/clean.ts +180 -0
  170. package/src/cli/index.ts +7 -0
  171. package/src/cli/init.ts +1 -1
  172. package/src/cli/logs.ts +79 -42
  173. package/src/cli/monitor.ts +1815 -899
  174. package/src/cli/prepare.ts +97 -3
  175. package/src/cli/resume.ts +220 -277
  176. package/src/cli/run.ts +154 -3
  177. package/src/cli/runs.ts +212 -0
  178. package/src/cli/setup-commands.ts +0 -0
  179. package/src/cli/signal.ts +1 -1
  180. package/src/cli/stop.ts +209 -0
  181. package/src/cli/tasks.ts +154 -0
  182. package/src/core/auto-recovery.ts +909 -0
  183. package/src/core/failure-policy.ts +592 -0
  184. package/src/core/orchestrator.ts +1136 -675
  185. package/src/core/reviewer.ts +4 -0
  186. package/src/core/runner.ts +1443 -1217
  187. package/src/services/logging/buffer.ts +326 -0
  188. package/src/services/logging/console.ts +193 -0
  189. package/src/services/logging/file-writer.ts +526 -0
  190. package/src/services/logging/formatter.ts +268 -0
  191. package/src/services/logging/index.ts +16 -0
  192. package/src/services/logging/parser.ts +232 -0
  193. package/src/services/process/index.ts +261 -0
  194. package/src/types/agent.ts +24 -0
  195. package/src/types/config.ts +79 -0
  196. package/src/types/events.ts +156 -0
  197. package/src/types/index.ts +29 -0
  198. package/src/types/lane.ts +56 -0
  199. package/src/types/logging.ts +96 -0
  200. package/src/types/review.ts +20 -0
  201. package/src/types/run.ts +37 -0
  202. package/src/types/task.ts +79 -0
  203. package/src/ui/components.ts +430 -0
  204. package/src/ui/log-viewer.ts +485 -0
  205. package/src/utils/checkpoint.ts +374 -0
  206. package/src/utils/config.ts +11 -2
  207. package/src/utils/cursor-agent.ts +1 -1
  208. package/src/utils/dependency.ts +482 -0
  209. package/src/utils/doctor.ts +11 -5
  210. package/src/utils/enhanced-logger.ts +108 -49
  211. package/src/utils/git.ts +871 -499
  212. package/src/utils/health.ts +596 -0
  213. package/src/utils/lock.ts +346 -0
  214. package/src/utils/log-buffer.ts +28 -0
  215. package/src/utils/log-constants.ts +26 -0
  216. package/src/utils/log-formatter.ts +120 -37
  217. package/src/utils/log-service.ts +49 -0
  218. package/src/utils/logger.ts +100 -51
  219. package/src/utils/process-manager.ts +100 -0
  220. package/src/utils/retry.ts +413 -0
  221. package/src/utils/run-service.ts +433 -0
  222. package/src/utils/state.ts +369 -3
  223. package/src/utils/task-service.ts +370 -0
  224. package/src/utils/types.ts +2 -315
@@ -0,0 +1,592 @@
1
+ /**
2
+ * Failure Policy - Centralized management of failure cases and recovery actions
3
+ *
4
+ * Features:
5
+ * - Multi-layer stall detection
6
+ * - Circuit breaker integration
7
+ * - Configurable recovery strategies
8
+ */
9
+
10
+ import * as logger from '../utils/logger';
11
+ import { getCircuitBreaker, CircuitState } from '../utils/retry';
12
+
13
/**
 * Categories of failure that the failure policy can report.
 *
 * Each member is a string enum whose runtime value equals its own name, so
 * values can be logged or serialised directly.
 */
export enum FailureType {
  // --- Stalls: the lane is alive but not making headway ---
  STALL_IDLE = 'STALL_IDLE',
  STALL_NO_PROGRESS = 'STALL_NO_PROGRESS',
  STALL_ZERO_BYTES = 'STALL_ZERO_BYTES',

  // --- Agent / API level problems ---
  AGENT_UNAVAILABLE = 'AGENT_UNAVAILABLE',
  AGENT_AUTH_ERROR = 'AGENT_AUTH_ERROR',
  AGENT_RATE_LIMIT = 'AGENT_RATE_LIMIT',
  AGENT_TIMEOUT = 'AGENT_TIMEOUT',
  AGENT_NO_RESPONSE = 'AGENT_NO_RESPONSE',

  // --- Process level problems ---
  ZOMBIE_PROCESS = 'ZOMBIE_PROCESS',

  // --- Cross-lane dependency problems ---
  DEPENDENCY_BLOCK = 'DEPENDENCY_BLOCK',
  DEPENDENCY_FAILED = 'DEPENDENCY_FAILED',
  DEPENDENCY_TIMEOUT = 'DEPENDENCY_TIMEOUT',

  // --- Review ---
  REVIEW_FAIL = 'REVIEW_FAIL',

  // --- Git ---
  GIT_ERROR = 'GIT_ERROR',
  GIT_PUSH_REJECTED = 'GIT_PUSH_REJECTED',
  MERGE_CONFLICT = 'MERGE_CONFLICT',

  // --- Infrastructure / everything else ---
  NETWORK_ERROR = 'NETWORK_ERROR',
  STATE_CORRUPTION = 'STATE_CORRUPTION',
  UNKNOWN_CRASH = 'UNKNOWN_CRASH',
}
34
+
35
/**
 * Recovery steps the failure policy can recommend for a failure.
 *
 * Each member is a string enum whose runtime value equals its own name.
 * `NONE` means "no action recommended"; `logFailure` omits the action label
 * for it.
 */
export enum RecoveryAction {
  // --- Nudge the running agent ---
  CONTINUE_SIGNAL = 'CONTINUE_SIGNAL',
  STRONGER_PROMPT = 'STRONGER_PROMPT',

  // --- Retry / restart ---
  RETRY_TASK = 'RETRY_TASK',
  RESTART_LANE = 'RESTART_LANE',
  RESTART_LANE_FROM_CHECKPOINT = 'RESTART_LANE_FROM_CHECKPOINT',
  KILL_AND_RESTART = 'KILL_AND_RESTART',

  // --- Give up or wait ---
  ABORT_LANE = 'ABORT_LANE',
  WAIT_FOR_USER = 'WAIT_FOR_USER',
  WAIT_AND_RETRY = 'WAIT_AND_RETRY',

  // --- Git specific ---
  RESET_GIT = 'RESET_GIT',
  SEND_GIT_GUIDANCE = 'SEND_GIT_GUIDANCE',

  // --- Diagnostics ---
  RUN_DOCTOR = 'RUN_DOCTOR',

  // --- Nothing to do ---
  NONE = 'NONE',
}
50
+
51
/**
 * Result of classifying a failure: what went wrong and what to do about it.
 */
export interface FailureAnalysis {
  /** Category the failure was classified into. */
  type: FailureType;
  /** Recommended recovery step (`RecoveryAction.NONE` when nothing should be done). */
  action: RecoveryAction;
  /** Human-readable description, used verbatim in log output. */
  message: string;
  /** Whether the failure is considered temporary; `logFailure` warns for transient failures and errors otherwise. */
  isTransient: boolean;
  /** Optional wait, in milliseconds, before the recovery action is attempted. */
  suggestedDelayMs?: number;
  /** Optional free-form diagnostic data; `logFailure` prints it only when the DEBUG env var is set. */
  details?: Record<string, any>;
}
59
+
60
/**
 * Tunable thresholds for multi-layer stall detection.
 *
 * All durations are in milliseconds.
 */
export interface StallDetectionConfig {
  /** How long a lane may go without stdout activity before a continue signal is sent. */
  idleTimeoutMs: number;
  /** How long the state file may go without an update before the lane counts as stalled. */
  progressTimeoutMs: number;
  /** Upper bound on the duration of a single task. */
  taskTimeoutMs: number;
  /** Idle allowance used instead of `idleTimeoutMs` while a known long operation (e.g. npm install) is running. */
  longOperationGraceMs: number;
  /** Patterns tested against the last output line to recognise a long operation. */
  longOperationPatterns: RegExp[];
  /** Number of restarts allowed before the lane is no longer restarted. */
  maxRestarts: number;
}
77
+
78
/**
 * Stall-detection thresholds used when a caller does not supply its own
 * configuration (it is the default value of `analyzeStall`'s `config`).
 */
export const DEFAULT_STALL_CONFIG: StallDetectionConfig = {
  // Idle detection: 2 minutes without output.
  idleTimeoutMs: 2 * 60 * 1000,
  // Progress detection: 10 minutes without progress.
  progressTimeoutMs: 10 * 60 * 1000,
  // Hard ceiling: 30 minutes per task.
  taskTimeoutMs: 30 * 60 * 1000,
  // Long operations get a 10 minute idle allowance.
  longOperationGraceMs: 10 * 60 * 1000,
  // Output lines matching any of these mark a long operation.
  longOperationPatterns: [
    /Installing dependencies/i,
    /npm install/i,
    /pnpm install/i,
    /yarn install/i,
    /Building/i,
    /Compiling/i,
    /Downloading/i,
    /Fetching/i,
    /Cloning/i,
    /Bundling/i,
  ],
  // Restart at most twice.
  maxRestarts: 2,
};
97
+
98
/**
 * Snapshot of a lane's activity, handed to `analyzeStall`.
 */
export interface StallContext {
  /** Escalation stage reached so far (0: normal, 1: continued, 2: stronger_prompt, 3: restarted). */
  stallPhase: number;
  /** Milliseconds elapsed since the last activity. */
  idleTimeMs: number;
  /** Milliseconds elapsed since the last state update. */
  progressTimeMs?: number;
  /** Most recent output line; used to recognise long-running operations. */
  lastOutput?: string;
  /** How many times the lane has been restarted. */
  restartCount?: number;
  /** Timestamp (ms) at which the current task started. */
  taskStartTimeMs?: number;
  /** Bytes received since the previous check (0 = no response at all). */
  bytesReceived?: number;
  /** How many continue signals have already been sent. */
  continueSignalsSent?: number;
}
116
+
117
/**
 * Optional facts about the failing lane that `analyzeFailure` uses to refine
 * its classification. Every field may be omitted.
 */
export interface FailureContext {
  /** Exit code of the process; `analyzeFailure` treats 2 as a dependency block. */
  exitCode?: number;
  /** Current stall escalation stage; when set (>= 0), unmatched errors are delegated to `analyzeStall`. */
  stallPhase?: number;
  /** Milliseconds since the last activity. */
  idleTimeMs?: number;
  /** Number of retries already attempted; drives retry-vs-escalate decisions. */
  retryCount?: number;
  /** Milliseconds since the last state update. */
  progressTimeMs?: number;
  /** Most recent output line. */
  lastOutput?: string;
  /** Number of restarts already performed. */
  restartCount?: number;
  /** Timestamp (ms) at which the current task started. */
  taskStartTimeMs?: number;
  /** Name of the circuit breaker to consult before classifying the error. */
  circuitBreakerName?: string;
}
128
+
129
/**
 * Analyze a stall condition with multi-layer detection and escalating recovery.
 *
 * Checks run in the order below; the first one that matches decides the result:
 *  1. Task timeout - the task has run longer than `config.taskTimeoutMs`.
 *  2. Zero bytes   - `bytesReceived === 0` while idle past the effective idle timeout.
 *  3. No progress  - `progressTimeMs` exceeds `config.progressTimeoutMs`.
 *  4. Idle escalation driven by `stallPhase`:
 *     - Phase 0: idle past the effective timeout -> send continue signal.
 *     - Phase 1: still idle 2 min after the continue signal -> send stronger prompt.
 *     - Phase 2: still idle 2 min after the stronger prompt -> kill and restart,
 *       or run diagnostics once `config.maxRestarts` is exhausted.
 *     - Phase 3+: idle past 75% of `config.idleTimeoutMs` -> continue signal,
 *       or run diagnostics once `config.maxRestarts` is exhausted.
 *
 * The "effective idle timeout" is `config.longOperationGraceMs` when
 * `lastOutput` matches one of `config.longOperationPatterns`, otherwise
 * `config.idleTimeoutMs`.
 *
 * Every branch that would kill and restart the process honours
 * `config.maxRestarts`; once the budget is spent the result is a
 * non-transient `RUN_DOCTOR` instead, so a lane cannot be restarted forever.
 *
 * @param context Current activity snapshot of the lane.
 * @param config  Thresholds to apply; defaults to `DEFAULT_STALL_CONFIG`.
 * @returns The classification; `action` is `RecoveryAction.NONE` when no
 *          threshold has been crossed yet.
 */
export function analyzeStall(context: StallContext, config: StallDetectionConfig = DEFAULT_STALL_CONFIG): FailureAnalysis {
  const {
    stallPhase,
    idleTimeMs,
    progressTimeMs,
    lastOutput,
    restartCount = 0,
    taskStartTimeMs,
    bytesReceived = -1, // -1 means not tracked
    continueSignalsSent = 0,
  } = context;

  const canRestart = restartCount < config.maxRestarts;

  // Always a real boolean so `details.isLongOperation` never carries '' or undefined.
  const isLongOperation = lastOutput
    ? config.longOperationPatterns.some(p => p.test(lastOutput))
    : false;
  const effectiveIdleTimeout = isLongOperation ? config.longOperationGraceMs : config.idleTimeoutMs;

  // 1. Task timeout. Sample the clock once so the comparison and the reported
  //    duration agree.
  if (taskStartTimeMs) {
    const taskDurationMs = Date.now() - taskStartTimeMs;
    if (taskDurationMs > config.taskTimeoutMs) {
      return {
        type: FailureType.AGENT_TIMEOUT,
        action: canRestart ? RecoveryAction.KILL_AND_RESTART : RecoveryAction.RUN_DOCTOR,
        message: `Task exceeded maximum timeout of ${Math.round(config.taskTimeoutMs / 60000)} minutes`,
        isTransient: canRestart,
        details: { taskDurationMs, restartCount },
      };
    }
  }

  // 2. Zero bytes received (agent completely unresponsive).
  if (bytesReceived === 0 && idleTimeMs > effectiveIdleTimeout) {
    const idleSeconds = Math.round(idleTimeMs / 1000);

    if (stallPhase >= 2 && !canRestart) {
      // Restart budget exhausted: stop restarting and diagnose instead.
      return {
        type: FailureType.AGENT_NO_RESPONSE,
        action: RecoveryAction.RUN_DOCTOR,
        message: `Agent produced 0 bytes for ${idleSeconds}s after ${restartCount} restarts. Running diagnostics...`,
        isTransient: false,
        details: { idleTimeMs, bytesReceived, stallPhase, restartCount },
      };
    }

    return {
      type: FailureType.AGENT_NO_RESPONSE,
      action: stallPhase < 2 ? RecoveryAction.CONTINUE_SIGNAL : RecoveryAction.KILL_AND_RESTART,
      message: `Agent produced 0 bytes for ${idleSeconds}s - possible API issue`,
      isTransient: true,
      details: { idleTimeMs, bytesReceived, stallPhase },
    };
  }

  // 3. No progress (state file not updating).
  if (progressTimeMs && progressTimeMs > config.progressTimeoutMs) {
    const stalledMinutes = Math.round(progressTimeMs / 60000);

    if (stallPhase >= 2 && !canRestart) {
      // Restart budget exhausted: stop restarting and diagnose instead.
      return {
        type: FailureType.STALL_NO_PROGRESS,
        action: RecoveryAction.RUN_DOCTOR,
        message: `No progress for ${stalledMinutes} minutes after ${restartCount} restarts. Running diagnostics...`,
        isTransient: false,
        details: { progressTimeMs, stallPhase, restartCount },
      };
    }

    let action: RecoveryAction;
    if (stallPhase === 0) {
      action = RecoveryAction.CONTINUE_SIGNAL;
    } else if (stallPhase === 1) {
      action = RecoveryAction.STRONGER_PROMPT;
    } else {
      action = RecoveryAction.KILL_AND_RESTART;
    }

    return {
      type: FailureType.STALL_NO_PROGRESS,
      action,
      message: `No progress for ${stalledMinutes} minutes`,
      isTransient: true,
      details: { progressTimeMs, stallPhase },
    };
  }

  // 4a. Phase 0: normal operation, check for initial idle.
  if (stallPhase === 0 && idleTimeMs > effectiveIdleTimeout) {
    return {
      type: FailureType.STALL_IDLE,
      action: RecoveryAction.CONTINUE_SIGNAL,
      message: `Lane idle for ${Math.round(idleTimeMs / 1000)}s. Sending continue signal...`,
      isTransient: true,
      details: { idleTimeMs, isLongOperation, phase: 0 },
    };
  }

  // 4b. Phase 1: continue signal sent, wait for a response.
  if (stallPhase === 1) {
    const graceTimeout = 2 * 60 * 1000; // 2 minutes grace after continue

    if (idleTimeMs > graceTimeout) {
      return {
        type: FailureType.STALL_IDLE,
        action: RecoveryAction.STRONGER_PROMPT,
        message: `Still idle after continue signal. Sending stronger prompt...`,
        isTransient: true,
        details: { idleTimeMs, continueSignalsSent, phase: 1 },
      };
    }
  }

  // 4c. Phase 2: stronger prompt sent, wait or escalate.
  if (stallPhase === 2) {
    const strongerGraceTimeout = 2 * 60 * 1000; // 2 minutes grace after stronger prompt

    if (idleTimeMs > strongerGraceTimeout) {
      if (canRestart) {
        return {
          type: FailureType.STALL_IDLE,
          action: RecoveryAction.KILL_AND_RESTART,
          message: `No response after stronger prompt. Killing and restarting process...`,
          isTransient: true,
          details: { idleTimeMs, restartCount, maxRestarts: config.maxRestarts, phase: 2 },
        };
      }
      return {
        type: FailureType.STALL_IDLE,
        action: RecoveryAction.RUN_DOCTOR,
        message: `Lane failed after ${restartCount} restarts. Running diagnostics...`,
        isTransient: false,
        details: { restartCount, phase: 2 },
      };
    }
  }

  // 4d. Phase 3+: after a restart, monitor with a shorter timeout.
  if (stallPhase >= 3) {
    const postRestartTimeout = config.idleTimeoutMs * 0.75;

    if (idleTimeMs > postRestartTimeout) {
      if (canRestart) {
        return {
          type: FailureType.STALL_IDLE,
          action: RecoveryAction.CONTINUE_SIGNAL,
          message: `Lane idle after restart. Retrying continue signal...`,
          isTransient: true,
          details: { idleTimeMs, restartCount, phase: stallPhase },
        };
      }
      return {
        type: FailureType.STALL_IDLE,
        action: RecoveryAction.RUN_DOCTOR,
        message: `Lane repeatedly stalled. Running diagnostics for root cause...`,
        isTransient: false,
        details: { stallPhase, restartCount },
      };
    }
  }

  // No threshold crossed yet.
  return {
    type: FailureType.STALL_IDLE,
    action: RecoveryAction.NONE,
    message: 'Monitoring for stall',
    isTransient: true,
  };
}
273
+
274
/**
 * Analyze an error message or state to determine the failure type and
 * recovery action.
 *
 * The error text is lower-cased and matched against known substrings. Checks
 * run in a fixed order and the first match wins:
 *   circuit breaker open -> network -> agent unavailable -> authentication ->
 *   rate limit -> timeout -> merge conflict -> push rejected -> generic git ->
 *   dependency block (exit code 2) -> dependency failed/timeout ->
 *   state corruption -> stall analysis (when `context.stallPhase` is set) ->
 *   unknown crash.
 *
 * Notes on matching:
 *  - Messages containing "dependency timeout" are deliberately excluded from
 *    the generic timeout check so they can be classified as
 *    `DEPENDENCY_TIMEOUT` instead of `AGENT_TIMEOUT`.
 *  - HTTP status codes 401 and 429 are matched as whole numbers, so values
 *    such as "4012ms" do not trigger auth or rate-limit handling.
 *
 * @param error   Error text to classify; `null`/`undefined` is treated as empty.
 * @param context Optional lane facts (retry count, exit code, stall data,
 *                circuit breaker name) that refine the decision.
 * @returns The classification with a recommended recovery action.
 */
export function analyzeFailure(error: string | null | undefined, context?: FailureContext): FailureAnalysis {
  const msg = (error || '').toLowerCase();
  const retryCount = context?.retryCount ?? 0;
  const has = (...needles: string[]): boolean => needles.some(needle => msg.includes(needle));

  // Check circuit breaker status first: an open breaker overrides any message match.
  if (context?.circuitBreakerName) {
    const breaker = getCircuitBreaker(context.circuitBreakerName);
    if (breaker.getState() === CircuitState.OPEN) {
      const waitTime = breaker.getTimeUntilRetry();
      return {
        type: FailureType.AGENT_UNAVAILABLE,
        action: RecoveryAction.WAIT_AND_RETRY,
        message: `Circuit breaker open. Retry in ${Math.round(waitTime / 1000)}s`,
        isTransient: true,
        suggestedDelayMs: waitTime,
      };
    }
  }

  // 1. Network errors
  if (has('econnreset', 'econnrefused', 'etimedout', 'enotfound', 'socket hang up', 'network')) {
    return {
      type: FailureType.NETWORK_ERROR,
      action: retryCount < 3 ? RecoveryAction.RETRY_TASK : RecoveryAction.RESTART_LANE,
      message: 'Network error. Retrying...',
      isTransient: true,
      suggestedDelayMs: 5000 * Math.pow(2, retryCount),
    };
  }

  // 2. Agent service unavailable
  if (msg.includes('connecterror') && msg.includes('[unavailable]')) {
    return {
      type: FailureType.AGENT_UNAVAILABLE,
      action: retryCount < 3 ? RecoveryAction.RETRY_TASK : RecoveryAction.RESTART_LANE,
      message: 'Agent service is temporarily unavailable. Retrying with a new agent session.',
      isTransient: true,
      suggestedDelayMs: 10000,
    };
  }

  // 3. Authentication errors (401 must be a standalone number)
  if (has('not authenticated', 'unauthorized', 'auth failed') || /\b401\b/.test(msg)) {
    return {
      type: FailureType.AGENT_AUTH_ERROR,
      action: RecoveryAction.WAIT_FOR_USER,
      message: 'Cursor authentication failed. Please sign in to Cursor IDE.',
      isTransient: false,
    };
  }

  // 4. Rate limits (429 must be a standalone number)
  if (has('rate limit', 'quota', 'too many requests') || /\b429\b/.test(msg)) {
    return {
      type: FailureType.AGENT_RATE_LIMIT,
      action: RecoveryAction.WAIT_AND_RETRY,
      message: 'API rate limit reached. Waiting before retry...',
      isTransient: true,
      suggestedDelayMs: 60000, // 1 minute
    };
  }

  // 5. Timeout. "dependency timeout" is left for step 8; otherwise that
  //    branch could never be reached.
  const isDependencyTimeout = msg.includes('dependency timeout');
  if (has('timeout', 'timed out') && !isDependencyTimeout) {
    return {
      type: FailureType.AGENT_TIMEOUT,
      action: retryCount < 2 ? RecoveryAction.RETRY_TASK : RecoveryAction.RESTART_LANE,
      message: 'Operation timed out.',
      isTransient: true,
    };
  }

  // 6. Git/merge errors - send guidance to agent
  if (has('conflict', 'merge failed', 'automatic merge failed')) {
    return {
      type: FailureType.MERGE_CONFLICT,
      action: RecoveryAction.SEND_GIT_GUIDANCE,
      message: 'Merge conflict detected. Sending guidance to agent...',
      isTransient: true,
    };
  }

  // Git push rejected (common in parallel lanes)
  if (has('rejected', 'non-fast-forward', 'failed to push', 'fetch first')) {
    return {
      type: FailureType.GIT_PUSH_REJECTED,
      action: RecoveryAction.SEND_GIT_GUIDANCE,
      message: 'Git push rejected. Sending guidance to agent...',
      isTransient: true,
    };
  }

  if (msg.includes('git') && has('error', 'failed')) {
    return {
      type: FailureType.GIT_ERROR,
      action: retryCount < 2 ? RecoveryAction.RETRY_TASK : RecoveryAction.RESET_GIT,
      message: 'Git operation failed.',
      isTransient: true,
    };
  }

  // 7. Dependency blocks (Exit Code 2)
  if (context?.exitCode === 2 || msg.includes('dependency_change_required')) {
    return {
      type: FailureType.DEPENDENCY_BLOCK,
      action: RecoveryAction.NONE, // Handled by orchestrator resolve logic
      message: 'Lane is blocked on dependency change request.',
      isTransient: false,
    };
  }

  // 8. Dependency failures
  if (msg.includes('dependency failed') || isDependencyTimeout) {
    return {
      type: isDependencyTimeout ? FailureType.DEPENDENCY_TIMEOUT : FailureType.DEPENDENCY_FAILED,
      action: RecoveryAction.ABORT_LANE,
      message: isDependencyTimeout ? 'Dependency wait timed out.' : 'A dependency lane has failed.',
      isTransient: false,
    };
  }

  // 9. State corruption
  if (msg.includes('state') && has('corrupt', 'invalid', 'parse')) {
    return {
      type: FailureType.STATE_CORRUPTION,
      action: RecoveryAction.RESTART_LANE_FROM_CHECKPOINT,
      message: 'State file corruption detected.',
      isTransient: false,
    };
  }

  // 10. Stalls (handled by phase). `lastOutput` is forwarded so the
  //     long-operation grace period applies on this path too.
  if (context?.stallPhase !== undefined && context.stallPhase >= 0) {
    return analyzeStall({
      stallPhase: context.stallPhase,
      idleTimeMs: context.idleTimeMs || 0,
      progressTimeMs: context.progressTimeMs,
      lastOutput: context.lastOutput,
      restartCount: context.restartCount,
      taskStartTimeMs: context.taskStartTimeMs,
    });
  }

  // 11. Default fallback. Avoid printing "code undefined" when no exit code is known.
  const fallbackMessage = context?.exitCode !== undefined
    ? `Process exited with code ${context.exitCode}`
    : 'Process exited with an unknown error';
  return {
    type: FailureType.UNKNOWN_CRASH,
    action: RecoveryAction.ABORT_LANE,
    message: error || fallbackMessage,
    isTransient: false,
  };
}
432
+
433
/**
 * Write a one-line summary of a failure analysis to the logger.
 *
 * Transient failures are logged with `warn`, all others with `error`. The
 * recovery action is appended unless it is `RecoveryAction.NONE`, and the
 * suggested delay is appended when it is set and non-zero. When the `DEBUG`
 * environment variable is set, `analysis.details` is additionally logged via
 * `info` as JSON.
 *
 * @param laneName       Lane the failure belongs to; used as the log prefix.
 * @param analysis       The analysis to report.
 * @param loggerInstance Object exposing `warn`, `error` and `info`; defaults
 *                       to the module-level logger.
 */
export function logFailure(laneName: string, analysis: FailureAnalysis, loggerInstance: any = logger): void {
  let line = `[${laneName}] ${analysis.type}: ${analysis.message}`;

  if (analysis.action !== RecoveryAction.NONE) {
    line += ` -> Action: ${analysis.action}`;
  }
  if (analysis.suggestedDelayMs) {
    line += ` (delay: ${Math.round(analysis.suggestedDelayMs / 1000)}s)`;
  }

  // Call through the instance so the logger methods keep their `this`.
  if (analysis.isTransient) {
    loggerInstance.warn(line);
  } else {
    loggerInstance.error(line);
  }

  // Details are noisy, so they are only shown in debug mode.
  if (analysis.details && process.env['DEBUG']) {
    loggerInstance.info(` Details: ${JSON.stringify(analysis.details)}`);
  }
}
454
+
455
/**
 * Work out how long to wait before the next retry.
 *
 * A non-zero `analysis.suggestedDelayMs` always wins. Otherwise the delay is
 * an exponential backoff of 5 s * 2^retryCount, capped at 60 s.
 *
 * @param analysis   Failure analysis that may carry its own delay.
 * @param retryCount Number of retries so far; used as the backoff exponent.
 * @returns Delay in milliseconds.
 */
export function getSuggestedDelay(analysis: FailureAnalysis, retryCount: number): number {
  const explicitDelayMs = analysis.suggestedDelayMs;
  if (explicitDelayMs) {
    return explicitDelayMs;
  }

  const backoffBaseMs = 5000;
  const backoffCapMs = 60000;
  const backoffMs = backoffBaseMs * 2 ** retryCount;

  return Math.min(backoffMs, backoffCapMs);
}
469
+
470
/**
 * Executes a function with retry logic based on failure analysis.
 *
 * Each attempt calls `fn` and passes its result to `isError`. A result with
 * `ok: true` is returned immediately. Otherwise the error text is classified
 * with `analyzeFailure`, and the call is retried only when the recommended
 * action is `RETRY_TASK` or `WAIT_AND_RETRY` and the retry budget is not yet
 * spent. In every other case the last (failed) result is returned to the caller.
 *
 * Delay between attempts: the analysis' own non-zero `suggestedDelayMs` if
 * present, otherwise exponential backoff of `delayMs * 2^attempt`, capped at
 * 60 s (or at `delayMs` itself when that is larger).
 *
 * When `circuitBreakerName` is given, the breaker is consulted before every
 * attempt (waiting while it refuses calls; this wait does not consume
 * retries) and is told about each success or failure.
 *
 * Exceptions thrown by `fn` are not caught here; they propagate to the caller.
 *
 * @param laneName Lane name used as a prefix in log output.
 * @param fn       The operation to run.
 * @param isError  Maps a result to `{ ok, error? }`.
 * @param options  `maxRetries` (default 3; 0 disables retries), `delayMs`
 *                 (backoff base, default 5000) and `circuitBreakerName`.
 * @returns The first successful result, or the last failed one.
 */
export async function withRetry<T>(
  laneName: string,
  fn: () => Promise<T>,
  isError: (res: T) => { ok: boolean; error?: string },
  options: {
    maxRetries?: number;
    delayMs?: number;
    circuitBreakerName?: string;
  } = {}
): Promise<T> {
  // `??` rather than `||` so an explicit 0 (no retries) is respected.
  const maxRetries = options.maxRetries ?? 3;
  const baseDelayMs = options.delayMs ?? 5000;
  const maxDelayMs = Math.max(60000, baseDelayMs);
  let attempt = 0;

  // Get circuit breaker if specified
  const breaker = options.circuitBreakerName
    ? getCircuitBreaker(options.circuitBreakerName)
    : null;

  while (true) {
    // Wait while the circuit breaker refuses calls.
    if (breaker && !breaker.canCall()) {
      const waitTime = breaker.getTimeUntilRetry();
      logger.warn(`[${laneName}] Circuit breaker open. Waiting ${Math.round(waitTime / 1000)}s...`);
      await new Promise(resolve => setTimeout(resolve, waitTime));
      continue;
    }

    const result = await fn();
    const status = isError(result);

    if (status.ok) {
      if (breaker) breaker.recordSuccess();
      return result;
    }

    if (breaker) breaker.recordFailure();

    const analysis = analyzeFailure(status.error, {
      retryCount: attempt,
      circuitBreakerName: options.circuitBreakerName,
    });

    const isRetryable =
      analysis.action === RecoveryAction.RETRY_TASK ||
      analysis.action === RecoveryAction.WAIT_AND_RETRY;

    if (!isRetryable || attempt >= maxRetries) {
      return result;
    }

    attempt++;
    logFailure(laneName, analysis);

    // Prefer the delay recommended by the analysis; otherwise back off
    // exponentially from the caller-supplied base delay.
    const delay = analysis.suggestedDelayMs
      || Math.min(baseDelayMs * Math.pow(2, attempt), maxDelayMs);
    logger.info(`Attempt ${attempt}/${maxRetries} failed. Retrying in ${Math.round(delay / 1000)}s...`);

    await new Promise(resolve => setTimeout(resolve, delay));
  }
}
532
+
533
/**
 * A recorded failure, suitable for logging or monitoring.
 */
export interface FailureReport {
  /** When the report was created, as an ISO-8601 string. */
  timestamp: string;
  /** Lane the failure occurred in. */
  laneName: string;
  /** Classification of the failure. */
  analysis: FailureAnalysis;
  /** Lane facts that were available when the failure was analysed. */
  context: FailureContext;
  /** Whether the failure has been resolved; `createFailureReport` initialises this to `false`. */
  resolved: boolean;
  /** Optional recovery action recorded as the resolution. */
  resolutionAction?: RecoveryAction;
}
544
+
545
/**
 * Build a fresh, unresolved failure report stamped with the current time.
 *
 * @param laneName Lane the failure occurred in.
 * @param analysis Classification of the failure.
 * @param context  Lane facts captured alongside the failure.
 * @returns A report with `resolved: false` and no `resolutionAction`.
 */
export function createFailureReport(
  laneName: string,
  analysis: FailureAnalysis,
  context: FailureContext
): FailureReport {
  const createdAt = new Date();
  const report: FailureReport = {
    timestamp: createdAt.toISOString(),
    laneName,
    analysis,
    context,
    resolved: false,
  };
  return report;
}
558
+
559
/**
 * Running failure counters for monitoring.
 */
export interface FailureStats {
  /** Number of failures recorded in total. */
  totalFailures: number;
  /** Failure count per failure type; a type that was never seen has no entry. */
  byType: Record<FailureType, number>;
  /** Failure count per recommended action; an action that was never seen has no entry. */
  byAction: Record<RecoveryAction, number>;
  /** Number of recorded failures flagged as transient. */
  transientCount: number;
  /** Number of recorded failures not flagged as transient. */
  permanentCount: number;
}
569
+
570
/**
 * Create a statistics object with every counter at zero and empty
 * per-type / per-action tables.
 *
 * @returns A new, independent `FailureStats` instance.
 */
export function createEmptyStats(): FailureStats {
  const byType = {} as Record<FailureType, number>;
  const byAction = {} as Record<RecoveryAction, number>;

  return {
    totalFailures: 0,
    byType,
    byAction,
    transientCount: 0,
    permanentCount: 0,
  };
}
579
+
580
/**
 * Record one failure in the given statistics object.
 *
 * The object is mutated in place: the total, the per-type and per-action
 * counters, and either the transient or the permanent counter are each
 * incremented by one.
 *
 * @param stats    Statistics to update (mutated).
 * @param analysis The failure to record.
 * @returns The same `stats` object, for chaining.
 */
export function updateStats(stats: FailureStats, analysis: FailureAnalysis): FailureStats {
  const { type, action, isTransient } = analysis;

  stats.totalFailures += 1;

  const seenForType = stats.byType[type] || 0;
  stats.byType[type] = seenForType + 1;

  const seenForAction = stats.byAction[action] || 0;
  stats.byAction[action] = seenForAction + 1;

  if (isTransient) {
    stats.transientCount += 1;
  } else {
    stats.permanentCount += 1;
  }

  return stats;
}