@litmers/cursorflow-orchestrator 0.1.20 → 0.1.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/commands/cursorflow-clean.md +19 -0
- package/commands/cursorflow-runs.md +59 -0
- package/commands/cursorflow-stop.md +55 -0
- package/dist/cli/clean.js +171 -0
- package/dist/cli/clean.js.map +1 -1
- package/dist/cli/index.js +7 -0
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/init.js +1 -1
- package/dist/cli/init.js.map +1 -1
- package/dist/cli/logs.js +83 -42
- package/dist/cli/logs.js.map +1 -1
- package/dist/cli/monitor.d.ts +7 -0
- package/dist/cli/monitor.js +1007 -189
- package/dist/cli/monitor.js.map +1 -1
- package/dist/cli/prepare.js +4 -3
- package/dist/cli/prepare.js.map +1 -1
- package/dist/cli/resume.js +188 -236
- package/dist/cli/resume.js.map +1 -1
- package/dist/cli/run.js +8 -3
- package/dist/cli/run.js.map +1 -1
- package/dist/cli/runs.d.ts +5 -0
- package/dist/cli/runs.js +214 -0
- package/dist/cli/runs.js.map +1 -0
- package/dist/cli/setup-commands.js +0 -0
- package/dist/cli/signal.js +1 -1
- package/dist/cli/signal.js.map +1 -1
- package/dist/cli/stop.d.ts +5 -0
- package/dist/cli/stop.js +215 -0
- package/dist/cli/stop.js.map +1 -0
- package/dist/cli/tasks.d.ts +10 -0
- package/dist/cli/tasks.js +165 -0
- package/dist/cli/tasks.js.map +1 -0
- package/dist/core/auto-recovery.d.ts +212 -0
- package/dist/core/auto-recovery.js +737 -0
- package/dist/core/auto-recovery.js.map +1 -0
- package/dist/core/failure-policy.d.ts +156 -0
- package/dist/core/failure-policy.js +488 -0
- package/dist/core/failure-policy.js.map +1 -0
- package/dist/core/orchestrator.d.ts +15 -2
- package/dist/core/orchestrator.js +392 -15
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/reviewer.d.ts +2 -0
- package/dist/core/reviewer.js +2 -0
- package/dist/core/reviewer.js.map +1 -1
- package/dist/core/runner.d.ts +33 -10
- package/dist/core/runner.js +321 -146
- package/dist/core/runner.js.map +1 -1
- package/dist/services/logging/buffer.d.ts +67 -0
- package/dist/services/logging/buffer.js +309 -0
- package/dist/services/logging/buffer.js.map +1 -0
- package/dist/services/logging/console.d.ts +89 -0
- package/dist/services/logging/console.js +169 -0
- package/dist/services/logging/console.js.map +1 -0
- package/dist/services/logging/file-writer.d.ts +71 -0
- package/dist/services/logging/file-writer.js +516 -0
- package/dist/services/logging/file-writer.js.map +1 -0
- package/dist/services/logging/formatter.d.ts +39 -0
- package/dist/services/logging/formatter.js +227 -0
- package/dist/services/logging/formatter.js.map +1 -0
- package/dist/services/logging/index.d.ts +11 -0
- package/dist/services/logging/index.js +30 -0
- package/dist/services/logging/index.js.map +1 -0
- package/dist/services/logging/parser.d.ts +31 -0
- package/dist/services/logging/parser.js +222 -0
- package/dist/services/logging/parser.js.map +1 -0
- package/dist/services/process/index.d.ts +59 -0
- package/dist/services/process/index.js +257 -0
- package/dist/services/process/index.js.map +1 -0
- package/dist/types/agent.d.ts +20 -0
- package/dist/types/agent.js +6 -0
- package/dist/types/agent.js.map +1 -0
- package/dist/types/config.d.ts +65 -0
- package/dist/types/config.js +6 -0
- package/dist/types/config.js.map +1 -0
- package/dist/types/events.d.ts +125 -0
- package/dist/types/events.js +6 -0
- package/dist/types/events.js.map +1 -0
- package/dist/types/index.d.ts +12 -0
- package/dist/types/index.js +37 -0
- package/dist/types/index.js.map +1 -0
- package/dist/types/lane.d.ts +43 -0
- package/dist/types/lane.js +6 -0
- package/dist/types/lane.js.map +1 -0
- package/dist/types/logging.d.ts +71 -0
- package/dist/types/logging.js +16 -0
- package/dist/types/logging.js.map +1 -0
- package/dist/types/review.d.ts +17 -0
- package/dist/types/review.js +6 -0
- package/dist/types/review.js.map +1 -0
- package/dist/types/run.d.ts +32 -0
- package/dist/types/run.js +6 -0
- package/dist/types/run.js.map +1 -0
- package/dist/types/task.d.ts +71 -0
- package/dist/types/task.js +6 -0
- package/dist/types/task.js.map +1 -0
- package/dist/ui/components.d.ts +134 -0
- package/dist/ui/components.js +389 -0
- package/dist/ui/components.js.map +1 -0
- package/dist/ui/log-viewer.d.ts +49 -0
- package/dist/ui/log-viewer.js +449 -0
- package/dist/ui/log-viewer.js.map +1 -0
- package/dist/utils/checkpoint.d.ts +87 -0
- package/dist/utils/checkpoint.js +317 -0
- package/dist/utils/checkpoint.js.map +1 -0
- package/dist/utils/config.d.ts +4 -0
- package/dist/utils/config.js +11 -2
- package/dist/utils/config.js.map +1 -1
- package/dist/utils/cursor-agent.js.map +1 -1
- package/dist/utils/dependency.d.ts +74 -0
- package/dist/utils/dependency.js +420 -0
- package/dist/utils/dependency.js.map +1 -0
- package/dist/utils/doctor.js +10 -5
- package/dist/utils/doctor.js.map +1 -1
- package/dist/utils/enhanced-logger.d.ts +10 -33
- package/dist/utils/enhanced-logger.js +94 -9
- package/dist/utils/enhanced-logger.js.map +1 -1
- package/dist/utils/git.d.ts +121 -0
- package/dist/utils/git.js +322 -2
- package/dist/utils/git.js.map +1 -1
- package/dist/utils/health.d.ts +91 -0
- package/dist/utils/health.js +556 -0
- package/dist/utils/health.js.map +1 -0
- package/dist/utils/lock.d.ts +95 -0
- package/dist/utils/lock.js +332 -0
- package/dist/utils/lock.js.map +1 -0
- package/dist/utils/log-buffer.d.ts +17 -0
- package/dist/utils/log-buffer.js +14 -0
- package/dist/utils/log-buffer.js.map +1 -0
- package/dist/utils/log-constants.d.ts +23 -0
- package/dist/utils/log-constants.js +28 -0
- package/dist/utils/log-constants.js.map +1 -0
- package/dist/utils/log-formatter.d.ts +9 -0
- package/dist/utils/log-formatter.js +113 -70
- package/dist/utils/log-formatter.js.map +1 -1
- package/dist/utils/log-service.d.ts +19 -0
- package/dist/utils/log-service.js +47 -0
- package/dist/utils/log-service.js.map +1 -0
- package/dist/utils/logger.d.ts +46 -27
- package/dist/utils/logger.js +82 -60
- package/dist/utils/logger.js.map +1 -1
- package/dist/utils/process-manager.d.ts +21 -0
- package/dist/utils/process-manager.js +138 -0
- package/dist/utils/process-manager.js.map +1 -0
- package/dist/utils/retry.d.ts +121 -0
- package/dist/utils/retry.js +374 -0
- package/dist/utils/retry.js.map +1 -0
- package/dist/utils/run-service.d.ts +88 -0
- package/dist/utils/run-service.js +412 -0
- package/dist/utils/run-service.js.map +1 -0
- package/dist/utils/state.d.ts +58 -2
- package/dist/utils/state.js +306 -3
- package/dist/utils/state.js.map +1 -1
- package/dist/utils/task-service.d.ts +82 -0
- package/dist/utils/task-service.js +348 -0
- package/dist/utils/task-service.js.map +1 -0
- package/dist/utils/types.d.ts +2 -272
- package/dist/utils/types.js +16 -0
- package/dist/utils/types.js.map +1 -1
- package/package.json +38 -23
- package/scripts/ai-security-check.js +0 -1
- package/scripts/local-security-gate.sh +0 -0
- package/scripts/monitor-lanes.sh +94 -0
- package/scripts/patches/test-cursor-agent.js +0 -1
- package/scripts/release.sh +0 -0
- package/scripts/setup-security.sh +0 -0
- package/scripts/stream-logs.sh +72 -0
- package/scripts/verify-and-fix.sh +0 -0
- package/src/cli/clean.ts +180 -0
- package/src/cli/index.ts +7 -0
- package/src/cli/init.ts +1 -1
- package/src/cli/logs.ts +79 -42
- package/src/cli/monitor.ts +1815 -899
- package/src/cli/prepare.ts +4 -3
- package/src/cli/resume.ts +220 -277
- package/src/cli/run.ts +9 -3
- package/src/cli/runs.ts +212 -0
- package/src/cli/setup-commands.ts +0 -0
- package/src/cli/signal.ts +1 -1
- package/src/cli/stop.ts +209 -0
- package/src/cli/tasks.ts +154 -0
- package/src/core/auto-recovery.ts +909 -0
- package/src/core/failure-policy.ts +592 -0
- package/src/core/orchestrator.ts +1131 -675
- package/src/core/reviewer.ts +4 -0
- package/src/core/runner.ts +388 -162
- package/src/services/logging/buffer.ts +326 -0
- package/src/services/logging/console.ts +193 -0
- package/src/services/logging/file-writer.ts +526 -0
- package/src/services/logging/formatter.ts +268 -0
- package/src/services/logging/index.ts +16 -0
- package/src/services/logging/parser.ts +232 -0
- package/src/services/process/index.ts +261 -0
- package/src/types/agent.ts +24 -0
- package/src/types/config.ts +79 -0
- package/src/types/events.ts +156 -0
- package/src/types/index.ts +29 -0
- package/src/types/lane.ts +56 -0
- package/src/types/logging.ts +96 -0
- package/src/types/review.ts +20 -0
- package/src/types/run.ts +37 -0
- package/src/types/task.ts +79 -0
- package/src/ui/components.ts +430 -0
- package/src/ui/log-viewer.ts +485 -0
- package/src/utils/checkpoint.ts +374 -0
- package/src/utils/config.ts +11 -2
- package/src/utils/cursor-agent.ts +1 -1
- package/src/utils/dependency.ts +482 -0
- package/src/utils/doctor.ts +11 -5
- package/src/utils/enhanced-logger.ts +108 -49
- package/src/utils/git.ts +374 -2
- package/src/utils/health.ts +596 -0
- package/src/utils/lock.ts +346 -0
- package/src/utils/log-buffer.ts +28 -0
- package/src/utils/log-constants.ts +26 -0
- package/src/utils/log-formatter.ts +120 -37
- package/src/utils/log-service.ts +49 -0
- package/src/utils/logger.ts +100 -51
- package/src/utils/process-manager.ts +100 -0
- package/src/utils/retry.ts +413 -0
- package/src/utils/run-service.ts +433 -0
- package/src/utils/state.ts +369 -3
- package/src/utils/task-service.ts +370 -0
- package/src/utils/types.ts +2 -315
|
@@ -0,0 +1,592 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Failure Policy - Centralized management of failure cases and recovery actions
|
|
3
|
+
*
|
|
4
|
+
* Features:
|
|
5
|
+
* - Multi-layer stall detection
|
|
6
|
+
* - Circuit breaker integration
|
|
7
|
+
* - Configurable recovery strategies
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import * as logger from '../utils/logger';
|
|
11
|
+
import { getCircuitBreaker, CircuitState } from '../utils/retry';
|
|
12
|
+
|
|
13
|
+
/**
 * Classification of a lane failure.
 *
 * Every member is a string enum whose value equals its own name, so the
 * values are safe to log and serialize as-is.
 */
export enum FailureType {
  // --- Stall conditions ---
  STALL_IDLE = 'STALL_IDLE',
  STALL_NO_PROGRESS = 'STALL_NO_PROGRESS',
  STALL_ZERO_BYTES = 'STALL_ZERO_BYTES',

  // --- Agent / service conditions ---
  AGENT_UNAVAILABLE = 'AGENT_UNAVAILABLE',
  AGENT_AUTH_ERROR = 'AGENT_AUTH_ERROR',
  AGENT_RATE_LIMIT = 'AGENT_RATE_LIMIT',
  AGENT_TIMEOUT = 'AGENT_TIMEOUT',
  AGENT_NO_RESPONSE = 'AGENT_NO_RESPONSE',

  // --- Process conditions ---
  ZOMBIE_PROCESS = 'ZOMBIE_PROCESS',

  // --- Dependency conditions ---
  DEPENDENCY_BLOCK = 'DEPENDENCY_BLOCK',
  DEPENDENCY_FAILED = 'DEPENDENCY_FAILED',
  DEPENDENCY_TIMEOUT = 'DEPENDENCY_TIMEOUT',

  // --- Review conditions ---
  REVIEW_FAIL = 'REVIEW_FAIL',

  // --- Git conditions ---
  GIT_ERROR = 'GIT_ERROR',
  GIT_PUSH_REJECTED = 'GIT_PUSH_REJECTED',
  MERGE_CONFLICT = 'MERGE_CONFLICT',

  // --- Infrastructure / catch-all ---
  NETWORK_ERROR = 'NETWORK_ERROR',
  STATE_CORRUPTION = 'STATE_CORRUPTION',
  UNKNOWN_CRASH = 'UNKNOWN_CRASH',
}
|
|
34
|
+
|
|
35
|
+
/**
 * Recovery step recommended for a failure.
 *
 * Every member is a string enum whose value equals its own name. `NONE`
 * means the analysis recommends no action.
 */
export enum RecoveryAction {
  // --- Nudge the running agent ---
  CONTINUE_SIGNAL = 'CONTINUE_SIGNAL',
  STRONGER_PROMPT = 'STRONGER_PROMPT',

  // --- Retry / restart ---
  RETRY_TASK = 'RETRY_TASK',
  RESTART_LANE = 'RESTART_LANE',
  RESTART_LANE_FROM_CHECKPOINT = 'RESTART_LANE_FROM_CHECKPOINT',
  KILL_AND_RESTART = 'KILL_AND_RESTART',

  // --- Stop or wait ---
  ABORT_LANE = 'ABORT_LANE',
  WAIT_FOR_USER = 'WAIT_FOR_USER',
  WAIT_AND_RETRY = 'WAIT_AND_RETRY',

  // --- Git remediation ---
  RESET_GIT = 'RESET_GIT',
  SEND_GIT_GUIDANCE = 'SEND_GIT_GUIDANCE',

  // --- Diagnostics / no-op ---
  RUN_DOCTOR = 'RUN_DOCTOR',
  NONE = 'NONE',
}
|
|
50
|
+
|
|
51
|
+
/**
 * Result of classifying a failure: what happened and what to do about it.
 */
export interface FailureAnalysis {
  /** Failure classification. */
  type: FailureType;
  /** Recommended recovery step. */
  action: RecoveryAction;
  /** Human-readable description of the failure. */
  message: string;
  /** Whether the failure is considered transient; `logFailure` warns for transient failures and errors otherwise. */
  isTransient: boolean;
  /** Optional delay, in milliseconds, suggested before the next attempt. */
  suggestedDelayMs?: number;
  /** Optional free-form diagnostic data. */
  details?: Record<string, any>;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Tunables for multi-layer stall detection.
 *
 * All durations are expressed in milliseconds.
 */
export interface StallDetectionConfig {
  /** How long stdout may stay silent before a continue signal is sent. */
  idleTimeoutMs: number;

  /** How long the state file may go without an update before the lane counts as stalled. */
  progressTimeoutMs: number;

  /** Upper bound on the duration of a single task. */
  taskTimeoutMs: number;

  /** Idle allowance used instead of `idleTimeoutMs` for known long operations (e.g., npm install). */
  longOperationGraceMs: number;

  /** Output patterns that mark a long operation. */
  longOperationPatterns: RegExp[];

  /** Number of restarts allowed before aborting. */
  maxRestarts: number;
}
|
|
77
|
+
|
|
78
|
+
/**
 * Stall-detection settings used when the caller supplies no configuration.
 */
export const DEFAULT_STALL_CONFIG: StallDetectionConfig = {
  // 1 minute of silent output triggers the first nudge (quick detection).
  idleTimeoutMs: 1 * 60 * 1000,
  // 10 minutes without progress.
  progressTimeoutMs: 10 * 60 * 1000,
  // 30 minutes maximum per task.
  taskTimeoutMs: 30 * 60 * 1000,
  // 10 minutes of grace while a long operation is running.
  longOperationGraceMs: 10 * 60 * 1000,
  // Output lines matching any of these are treated as long operations.
  longOperationPatterns: [
    /Installing dependencies/i,
    /npm install/i,
    /pnpm install/i,
    /yarn install/i,
    /Building/i,
    /Compiling/i,
    /Downloading/i,
    /Fetching/i,
    /Cloning/i,
    /Bundling/i,
  ],
  maxRestarts: 2,
};
|
|
97
|
+
|
|
98
|
+
/**
 * Snapshot of a lane's activity, used as the input to stall analysis.
 */
export interface StallContext {
  /** Current stall phase (0: normal, 1: continued, 2: stronger_prompt, 3: restarted). */
  stallPhase: number;

  /** Milliseconds elapsed since the last activity. */
  idleTimeMs: number;

  /** Milliseconds elapsed since the last state update. */
  progressTimeMs?: number;

  /** Most recent output line; matched against the long-operation patterns. */
  lastOutput?: string;

  /** How many times the lane has been restarted. */
  restartCount?: number;

  /** Start time of the task; compared against `Date.now()`. */
  taskStartTimeMs?: number;

  /** Bytes received since the last check (0 = no response at all). */
  bytesReceived?: number;

  /** How many continue signals have already been sent. */
  continueSignalsSent?: number;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Optional context passed alongside an error message to `analyzeFailure`.
 * Every field is optional.
 */
export interface FailureContext {
  /** Exit code of the process; exit code 2 is classified as a dependency block. */
  exitCode?: number;
  /** Current stall phase; when set, failure analysis can fall through to stall analysis. */
  stallPhase?: number;
  /** Milliseconds elapsed since the last activity. */
  idleTimeMs?: number;
  /** Number of retries already attempted; drives retry-vs-escalate decisions. */
  retryCount?: number;
  /** Milliseconds elapsed since the last state update. */
  progressTimeMs?: number;
  /** Most recent output line. */
  lastOutput?: string;
  /** How many times the lane has been restarted. */
  restartCount?: number;
  /** Start time of the task. */
  taskStartTimeMs?: number;
  /** Name of the circuit breaker to consult before classifying the error. */
  circuitBreakerName?: string;
}
|
|
128
|
+
|
|
129
|
+
/**
 * Analyze a possible stall and choose the next recovery step.
 *
 * Checks run in this order and the first match wins:
 * 1. Task timeout: the task has run longer than `config.taskTimeoutMs`.
 * 2. Zero bytes: `bytesReceived === 0` and the lane has been idle longer than
 *    the effective idle timeout.
 * 3. No progress: `progressTimeMs` exceeds `config.progressTimeoutMs`.
 * 4. Phase escalation based on `idleTimeMs`:
 *    - Phase 0: idle beyond the effective idle timeout -> continue signal.
 *    - Phase 1: idle beyond 90s -> stronger prompt.
 *    - Phase 2: idle beyond 60s -> kill and restart.
 *    - Phase 3+: idle beyond 75% of `config.idleTimeoutMs` -> continue signal.
 *
 * The effective idle timeout is `config.longOperationGraceMs` when
 * `lastOutput` matches one of `config.longOperationPatterns`, otherwise
 * `config.idleTimeoutMs`.
 *
 * Every branch that would restart (or, in phase 3+, re-nudge) the lane first
 * checks `restartCount < config.maxRestarts`. Once the restart budget is
 * spent the action becomes `RUN_DOCTOR` and the failure is reported as
 * non-transient. This now also applies to the zero-bytes and no-progress
 * branches, which previously could request restarts without limit.
 *
 * @param context - Current activity snapshot of the lane.
 * @param config - Detection thresholds; defaults to `DEFAULT_STALL_CONFIG`.
 * @returns The analysis; `action` is `RecoveryAction.NONE` when nothing needs to be done yet.
 */
export function analyzeStall(context: StallContext, config: StallDetectionConfig = DEFAULT_STALL_CONFIG): FailureAnalysis {
  const {
    stallPhase,
    idleTimeMs,
    progressTimeMs,
    lastOutput,
    restartCount = 0,
    taskStartTimeMs,
    bytesReceived = -1, // -1 means not tracked
    continueSignalsSent = 0,
  } = context;

  // Strict boolean, so `details.isLongOperation` is never '' or undefined.
  const isLongOperation = lastOutput
    ? config.longOperationPatterns.some(p => p.test(lastOutput))
    : false;
  const effectiveIdleTimeout = isLongOperation ? config.longOperationGraceMs : config.idleTimeoutMs;

  // Shared restart budget check used by every restart-capable branch.
  const canRestart = restartCount < config.maxRestarts;

  // Check for task timeout
  if (taskStartTimeMs && (Date.now() - taskStartTimeMs) > config.taskTimeoutMs) {
    return {
      type: FailureType.AGENT_TIMEOUT,
      action: canRestart ? RecoveryAction.KILL_AND_RESTART : RecoveryAction.RUN_DOCTOR,
      message: `Task exceeded maximum timeout of ${Math.round(config.taskTimeoutMs / 60000)} minutes`,
      isTransient: canRestart,
      details: { taskDurationMs: Date.now() - taskStartTimeMs, restartCount },
    };
  }

  // Check for zero bytes received (agent completely unresponsive)
  if (bytesReceived === 0 && idleTimeMs > effectiveIdleTimeout) {
    // Nudge first; once nudging is exhausted, restart only while budget remains.
    const action = stallPhase < 2
      ? RecoveryAction.CONTINUE_SIGNAL
      : canRestart
        ? RecoveryAction.KILL_AND_RESTART
        : RecoveryAction.RUN_DOCTOR;
    return {
      type: FailureType.AGENT_NO_RESPONSE,
      action,
      message: `Agent produced 0 bytes for ${Math.round(idleTimeMs / 1000)}s - possible API issue`,
      isTransient: action !== RecoveryAction.RUN_DOCTOR,
      details: { idleTimeMs, bytesReceived, stallPhase, restartCount },
    };
  }

  // Check for no progress (state file not updating)
  if (progressTimeMs && progressTimeMs > config.progressTimeoutMs) {
    let action: RecoveryAction;
    if (stallPhase === 0) {
      action = RecoveryAction.CONTINUE_SIGNAL;
    } else if (stallPhase === 1) {
      action = RecoveryAction.STRONGER_PROMPT;
    } else {
      action = canRestart ? RecoveryAction.KILL_AND_RESTART : RecoveryAction.RUN_DOCTOR;
    }
    return {
      type: FailureType.STALL_NO_PROGRESS,
      action,
      message: `No progress for ${Math.round(progressTimeMs / 60000)} minutes`,
      isTransient: action !== RecoveryAction.RUN_DOCTOR,
      details: { progressTimeMs, stallPhase, restartCount },
    };
  }

  // Phase 0: Normal operation, check for initial idle
  if (stallPhase === 0 && idleTimeMs > effectiveIdleTimeout) {
    return {
      type: FailureType.STALL_IDLE,
      action: RecoveryAction.CONTINUE_SIGNAL,
      message: `Lane idle for ${Math.round(idleTimeMs / 1000)}s. Sending continue signal...`,
      isTransient: true,
      details: { idleTimeMs, isLongOperation, phase: 0 },
    };
  }

  // Phase 1: Continue signal sent, wait for response
  if (stallPhase === 1) {
    const graceTimeout = 90 * 1000; // 1.5 minutes grace after continue

    if (idleTimeMs > graceTimeout) {
      return {
        type: FailureType.STALL_IDLE,
        action: RecoveryAction.STRONGER_PROMPT,
        message: `Still idle after continue signal. Sending stronger prompt...`,
        isTransient: true,
        details: { idleTimeMs, continueSignalsSent, phase: 1 },
      };
    }
  }

  // Phase 2: Stronger prompt sent, wait or escalate
  if (stallPhase === 2) {
    const strongerGraceTimeout = 60 * 1000; // 1 minute grace after stronger prompt

    if (idleTimeMs > strongerGraceTimeout) {
      if (canRestart) {
        return {
          type: FailureType.STALL_IDLE,
          action: RecoveryAction.KILL_AND_RESTART,
          message: `No response after stronger prompt. Killing and restarting process...`,
          isTransient: true,
          details: { idleTimeMs, restartCount, maxRestarts: config.maxRestarts, phase: 2 },
        };
      }
      return {
        type: FailureType.STALL_IDLE,
        action: RecoveryAction.RUN_DOCTOR,
        message: `Lane failed after ${restartCount} restarts. Running diagnostics...`,
        isTransient: false,
        details: { restartCount, phase: 2 },
      };
    }
  }

  // Phase 3+: After restart, monitor with shorter timeout
  if (stallPhase >= 3) {
    const postRestartTimeout = config.idleTimeoutMs * 0.75; // Shorter timeout after restart

    if (idleTimeMs > postRestartTimeout) {
      if (canRestart) {
        return {
          type: FailureType.STALL_IDLE,
          action: RecoveryAction.CONTINUE_SIGNAL,
          message: `Lane idle after restart. Retrying continue signal...`,
          isTransient: true,
          details: { idleTimeMs, restartCount, phase: stallPhase },
        };
      }
      return {
        type: FailureType.STALL_IDLE,
        action: RecoveryAction.RUN_DOCTOR,
        message: `Lane repeatedly stalled. Running diagnostics for root cause...`,
        isTransient: false,
        details: { stallPhase, restartCount },
      };
    }
  }

  // No action needed yet
  return {
    type: FailureType.STALL_IDLE,
    action: RecoveryAction.NONE,
    message: 'Monitoring for stall',
    isTransient: true,
  };
}
|
|
273
|
+
|
|
274
|
+
/**
 * Classify an error message (plus optional context) into a failure type and
 * a recommended recovery action.
 *
 * Matching is case-insensitive substring matching on `error`. Checks run in
 * priority order and the first match wins:
 * open circuit breaker, network, agent unavailable, authentication,
 * rate limit, timeout, merge conflict, push rejected, generic git error,
 * dependency block (exit code 2), dependency failure/timeout, state
 * corruption, stall analysis (only when `context.stallPhase` is set), and
 * finally an unknown-crash fallback that aborts the lane.
 *
 * Messages containing "dependency failed" or "dependency timeout" are kept
 * out of the generic timeout check. Without that exclusion the word
 * "timeout" matched first and `DEPENDENCY_TIMEOUT` could never be returned.
 *
 * @param error - Raw error text; `null`/`undefined` is treated as an empty string.
 * @param context - Optional exit code, retry count, stall data and circuit breaker name.
 * @returns The failure classification and recommended action.
 */
export function analyzeFailure(error: string | null | undefined, context?: FailureContext): FailureAnalysis {
  const msg = (error || '').toLowerCase();
  const retryCount = context?.retryCount || 0;

  /** True when the lower-cased message contains any of the given fragments. */
  const has = (...needles: string[]): boolean => needles.some(needle => msg.includes(needle));

  // Check circuit breaker status first
  if (context?.circuitBreakerName) {
    const breaker = getCircuitBreaker(context.circuitBreakerName);
    if (breaker.getState() === CircuitState.OPEN) {
      const waitTime = breaker.getTimeUntilRetry();
      return {
        type: FailureType.AGENT_UNAVAILABLE,
        action: RecoveryAction.WAIT_AND_RETRY,
        message: `Circuit breaker open. Retry in ${Math.round(waitTime / 1000)}s`,
        isTransient: true,
        suggestedDelayMs: waitTime,
      };
    }
  }

  // 1. Network errors
  if (has('econnreset', 'econnrefused', 'etimedout', 'enotfound', 'socket hang up', 'network')) {
    return {
      type: FailureType.NETWORK_ERROR,
      action: retryCount < 3 ? RecoveryAction.RETRY_TASK : RecoveryAction.RESTART_LANE,
      message: 'Network error. Retrying...',
      isTransient: true,
      suggestedDelayMs: 5000 * Math.pow(2, retryCount),
    };
  }

  // 2. Agent service unavailable
  if (msg.includes('connecterror') && msg.includes('[unavailable]')) {
    return {
      type: FailureType.AGENT_UNAVAILABLE,
      action: retryCount < 3 ? RecoveryAction.RETRY_TASK : RecoveryAction.RESTART_LANE,
      message: 'Agent service is temporarily unavailable. Retrying with a new agent session.',
      isTransient: true,
      suggestedDelayMs: 10000,
    };
  }

  // 3. Authentication errors
  if (has('not authenticated', 'unauthorized', '401', 'auth failed')) {
    return {
      type: FailureType.AGENT_AUTH_ERROR,
      action: RecoveryAction.WAIT_FOR_USER,
      message: 'Cursor authentication failed. Please sign in to Cursor IDE.',
      isTransient: false,
    };
  }

  // 4. Rate limits
  if (has('rate limit', 'quota', '429', 'too many requests')) {
    return {
      type: FailureType.AGENT_RATE_LIMIT,
      action: RecoveryAction.WAIT_AND_RETRY,
      message: 'API rate limit reached. Waiting before retry...',
      isTransient: true,
      suggestedDelayMs: 60000, // 1 minute
    };
  }

  // Dependency failures are classified in step 8; flag them here so the
  // generic timeout check below does not swallow "dependency timeout".
  const isDependencyFailure = has('dependency failed', 'dependency timeout');

  // 5. Timeout
  if (!isDependencyFailure && has('timeout', 'timed out')) {
    return {
      type: FailureType.AGENT_TIMEOUT,
      action: retryCount < 2 ? RecoveryAction.RETRY_TASK : RecoveryAction.RESTART_LANE,
      message: 'Operation timed out.',
      isTransient: true,
    };
  }

  // 6. Git/merge errors - send guidance to agent
  if (has('conflict', 'merge failed', 'automatic merge failed')) {
    return {
      type: FailureType.MERGE_CONFLICT,
      action: RecoveryAction.SEND_GIT_GUIDANCE,
      message: 'Merge conflict detected. Sending guidance to agent...',
      isTransient: true,
    };
  }

  // Git push rejected (common in parallel lanes)
  if (has('rejected', 'non-fast-forward', 'failed to push', 'fetch first')) {
    return {
      type: FailureType.GIT_PUSH_REJECTED,
      action: RecoveryAction.SEND_GIT_GUIDANCE,
      message: 'Git push rejected. Sending guidance to agent...',
      isTransient: true,
    };
  }

  if (msg.includes('git') && has('error', 'failed')) {
    return {
      type: FailureType.GIT_ERROR,
      action: retryCount < 2 ? RecoveryAction.RETRY_TASK : RecoveryAction.RESET_GIT,
      message: 'Git operation failed.',
      isTransient: true,
    };
  }

  // 7. Dependency blocks (Exit Code 2)
  if (context?.exitCode === 2 || msg.includes('dependency_change_required')) {
    return {
      type: FailureType.DEPENDENCY_BLOCK,
      action: RecoveryAction.NONE, // Handled by orchestrator resolve logic
      message: 'Lane is blocked on dependency change request.',
      isTransient: false,
    };
  }

  // 8. Dependency failures
  if (isDependencyFailure) {
    const isDependencyTimeout = msg.includes('timeout');
    return {
      type: isDependencyTimeout ? FailureType.DEPENDENCY_TIMEOUT : FailureType.DEPENDENCY_FAILED,
      action: RecoveryAction.ABORT_LANE,
      message: isDependencyTimeout ? 'Dependency wait timed out.' : 'A dependency lane has failed.',
      isTransient: false,
    };
  }

  // 9. State corruption
  if (msg.includes('state') && has('corrupt', 'invalid', 'parse')) {
    return {
      type: FailureType.STATE_CORRUPTION,
      action: RecoveryAction.RESTART_LANE_FROM_CHECKPOINT,
      message: 'State file corruption detected.',
      isTransient: false,
    };
  }

  // 10. Stalls (handled by phase)
  if (context?.stallPhase !== undefined && context.stallPhase >= 0) {
    return analyzeStall({
      stallPhase: context.stallPhase,
      idleTimeMs: context.idleTimeMs || 0,
      progressTimeMs: context.progressTimeMs,
      // Forwarded so the long-operation grace period applies on this path too.
      lastOutput: context.lastOutput,
      restartCount: context.restartCount,
      taskStartTimeMs: context.taskStartTimeMs,
    });
  }

  // 11. Default fallback
  return {
    type: FailureType.UNKNOWN_CRASH,
    action: RecoveryAction.ABORT_LANE,
    message: error || `Process exited with code ${context?.exitCode}`,
    isTransient: false,
  };
}
|
|
432
|
+
|
|
433
|
+
/**
 * Write a one-line summary of a failure analysis to a logger.
 *
 * Transient failures go to `warn`, non-transient ones to `error`. When the
 * analysis carries `details` and the `DEBUG` environment variable is set,
 * the details are additionally written to `info` as JSON.
 *
 * @param laneName - Lane the failure belongs to; used as the bracketed prefix.
 * @param analysis - Analysis to report.
 * @param loggerInstance - Object exposing `warn`, `error` and `info`; defaults to the module logger.
 */
export function logFailure(laneName: string, analysis: FailureAnalysis, loggerInstance: any = logger): void {
  // Assemble the line piece by piece; the optional suffixes are added only when relevant.
  const pieces: string[] = [`[${laneName}]`, ' ', `${analysis.type}: ${analysis.message}`];

  if (analysis.action !== RecoveryAction.NONE) {
    pieces.push(` -> Action: ${analysis.action}`);
  }

  if (analysis.suggestedDelayMs) {
    pieces.push(` (delay: ${Math.round(analysis.suggestedDelayMs / 1000)}s)`);
  }

  const line = pieces.join('');

  if (!analysis.isTransient) {
    loggerInstance.error(line);
  } else {
    loggerInstance.warn(line);
  }

  // Details are verbose, so they are emitted only in debug runs.
  const debugEnabled = process.env['DEBUG'];
  if (analysis.details && debugEnabled) {
    loggerInstance.info(` Details: ${JSON.stringify(analysis.details)}`);
  }
}
|
|
454
|
+
|
|
455
|
+
/**
 * Decide how long to wait before the next attempt.
 *
 * A truthy `analysis.suggestedDelayMs` wins. Otherwise the delay is an
 * exponential backoff of 5000ms doubled per retry, capped at 60000ms.
 *
 * @param analysis - Failure analysis that may carry an explicit delay.
 * @param retryCount - Exponent used for the backoff.
 * @returns Delay in milliseconds.
 */
export function getSuggestedDelay(analysis: FailureAnalysis, retryCount: number): number {
  const explicitDelay = analysis.suggestedDelayMs;
  if (!explicitDelay) {
    // Exponential backoff: 5s base, 60s ceiling.
    const backoff = 5000 * Math.pow(2, retryCount);
    return Math.min(backoff, 60000);
  }
  return explicitDelay;
}
|
|
469
|
+
|
|
470
|
+
/**
 * Run an async operation, retrying it when failure analysis recommends it.
 *
 * After each call `isError` inspects the result. A failed result is passed
 * to `analyzeFailure`; the call is repeated only when the recommended action
 * is `RETRY_TASK` or `WAIT_AND_RETRY` and fewer than `maxRetries` retries
 * have been made. Otherwise the last result is returned as-is. This function
 * does not catch errors thrown by `fn`.
 *
 * The wait before a retry is the analysis' `suggestedDelayMs` when it is
 * truthy, otherwise an exponential backoff starting from `delayMs` and
 * capped at 60 seconds.
 *
 * When `circuitBreakerName` is given, the named breaker is checked before
 * every call (waiting while it refuses calls) and is told about every
 * success and failure.
 *
 * @param laneName - Lane name used as the log prefix.
 * @param fn - Operation to run.
 * @param isError - Maps a result to `{ ok, error? }`.
 * @param options - `maxRetries` (default 3; 0 disables retries), `delayMs`
 *   (backoff base, default 5000) and `circuitBreakerName`.
 * @returns The first successful result, or the last failed one.
 */
export async function withRetry<T>(
  laneName: string,
  fn: () => Promise<T>,
  isError: (res: T) => { ok: boolean; error?: string },
  options: {
    maxRetries?: number;
    delayMs?: number;
    circuitBreakerName?: string;
  } = {}
): Promise<T> {
  // `??` rather than `||` so an explicit 0 is honoured instead of replaced by the default.
  const maxRetries = options.maxRetries ?? 3;
  const baseDelayMs = options.delayMs ?? 5000;
  const maxBackoffMs = 60000;
  let attempt = 0;

  // Get circuit breaker if specified
  const breaker = options.circuitBreakerName
    ? getCircuitBreaker(options.circuitBreakerName)
    : null;

  while (true) {
    // Check circuit breaker
    if (breaker && !breaker.canCall()) {
      const waitTime = breaker.getTimeUntilRetry();
      logger.warn(`[${laneName}] Circuit breaker open. Waiting ${Math.round(waitTime / 1000)}s...`);
      await new Promise(resolve => setTimeout(resolve, waitTime));
      continue;
    }

    const result = await fn();
    const status = isError(result);

    if (status.ok) {
      if (breaker) breaker.recordSuccess();
      return result;
    }

    if (breaker) breaker.recordFailure();

    const analysis = analyzeFailure(status.error, {
      retryCount: attempt,
      circuitBreakerName: options.circuitBreakerName,
    });

    const isRetryable =
      analysis.action === RecoveryAction.RETRY_TASK ||
      analysis.action === RecoveryAction.WAIT_AND_RETRY;

    if (!isRetryable || attempt >= maxRetries) {
      return result;
    }

    attempt++;
    logFailure(laneName, analysis);

    // Prefer the delay chosen by the analysis; otherwise back off from the
    // caller-supplied base. With the default base this equals the previous
    // 5s * 2^attempt schedule, and a custom `delayMs` now takes effect.
    const delay = analysis.suggestedDelayMs
      || Math.min(baseDelayMs * Math.pow(2, attempt), maxBackoffMs);
    logger.info(`Attempt ${attempt}/${maxRetries} failed. Retrying in ${Math.round(delay / 1000)}s...`);

    await new Promise(resolve => setTimeout(resolve, delay));
  }
}
|
|
532
|
+
|
|
533
|
+
/**
 * Record of a single failure, for logging/monitoring.
 */
export interface FailureReport {
  /** Creation time of the report as an ISO-8601 string. */
  timestamp: string;
  /** Lane the failure belongs to. */
  laneName: string;
  /** Classification of the failure. */
  analysis: FailureAnalysis;
  /** Context the failure was analyzed with. */
  context: FailureContext;
  /** Whether the failure has been resolved. */
  resolved: boolean;
  /** Optional recovery action associated with the resolution. */
  resolutionAction?: RecoveryAction;
}
|
|
544
|
+
|
|
545
|
+
/**
 * Build an unresolved failure report stamped with the current time.
 *
 * @param laneName - Lane the failure belongs to.
 * @param analysis - Classification of the failure; stored by reference.
 * @param context - Context the failure was analyzed with; stored by reference.
 * @returns A report with `resolved: false` and no `resolutionAction`.
 */
export function createFailureReport(
  laneName: string,
  analysis: FailureAnalysis,
  context: FailureContext
): FailureReport {
  const createdAt = new Date().toISOString();
  const report: FailureReport = {
    timestamp: createdAt,
    laneName: laneName,
    analysis: analysis,
    context: context,
    resolved: false,
  };
  return report;
}
|
|
558
|
+
|
|
559
|
+
/**
 * Aggregated failure counters for monitoring.
 */
export interface FailureStats {
  /** Total number of failures recorded. */
  totalFailures: number;
  /** Failure count per failure type. */
  byType: Record<FailureType, number>;
  /** Failure count per recommended recovery action. */
  byAction: Record<RecoveryAction, number>;
  /** Number of recorded failures that were transient. */
  transientCount: number;
  /** Number of recorded failures that were not transient. */
  permanentCount: number;
}
|
|
569
|
+
|
|
570
|
+
/**
 * Create a fresh statistics object with every counter at zero.
 *
 * `byType` and `byAction` start as empty objects; keys appear only once a
 * matching failure has been recorded.
 *
 * @returns A new, independent `FailureStats` instance.
 */
export function createEmptyStats(): FailureStats {
  const byType = {} as Record<FailureType, number>;
  const byAction = {} as Record<RecoveryAction, number>;
  const emptyStats: FailureStats = {
    totalFailures: 0,
    byType,
    byAction,
    transientCount: 0,
    permanentCount: 0,
  };
  return emptyStats;
}
|
|
579
|
+
|
|
580
|
+
/**
 * Record one failure analysis in the given statistics.
 *
 * The `stats` object is updated in place and the same object is returned.
 *
 * @param stats - Statistics to update.
 * @param analysis - Failure to record.
 * @returns The `stats` object that was passed in.
 */
export function updateStats(stats: FailureStats, analysis: FailureAnalysis): FailureStats {
  const { type, action, isTransient } = analysis;

  stats.totalFailures += 1;

  // A missing key counts as zero.
  const seenForType = stats.byType[type] || 0;
  stats.byType[type] = seenForType + 1;

  const seenForAction = stats.byAction[action] || 0;
  stats.byAction[action] = seenForAction + 1;

  if (!isTransient) {
    stats.permanentCount += 1;
  } else {
    stats.transientCount += 1;
  }

  return stats;
}
|