@litmers/cursorflow-orchestrator 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -59,60 +59,6 @@ export interface FailureAnalysis {
 /**
  * Multi-layer stall detection configuration
  */
-export interface StallDetectionConfig {
-  /** Time without stdout activity before sending continue signal */
-  idleTimeoutMs: number;
-  /** Time without state file update before considering stalled */
-  progressTimeoutMs: number;
-  /** Maximum time for a single task */
-  taskTimeoutMs: number;
-  /** Grace period for known long operations (e.g., npm install) */
-  longOperationGraceMs: number;
-  /** Patterns that indicate long operations */
-  longOperationPatterns: RegExp[];
-  /** Maximum restarts before aborting */
-  maxRestarts: number;
-}
-
-export const DEFAULT_STALL_CONFIG: StallDetectionConfig = {
-  idleTimeoutMs: 2 * 60 * 1000, // 2 minutes without output (idle detection)
-  progressTimeoutMs: 10 * 60 * 1000, // 10 minutes without progress
-  taskTimeoutMs: 30 * 60 * 1000, // 30 minutes max per task
-  longOperationGraceMs: 10 * 60 * 1000, // 10 minute grace for long ops
-  longOperationPatterns: [
-    /Installing dependencies/i,
-    /npm install/i,
-    /pnpm install/i,
-    /yarn install/i,
-    /Building/i,
-    /Compiling/i,
-    /Downloading/i,
-    /Fetching/i,
-    /Cloning/i,
-    /Bundling/i,
-  ],
-  maxRestarts: 2,
-};
-
-export interface StallContext {
-  /** Current stall phase (0: normal, 1: continued, 2: stronger_prompt, 3: restarted) */
-  stallPhase: number;
-  /** Time since last activity */
-  idleTimeMs: number;
-  /** Time since last state update */
-  progressTimeMs?: number;
-  /** Last output line (for long operation detection) */
-  lastOutput?: string;
-  /** Number of restarts */
-  restartCount?: number;
-  /** Task start time */
-  taskStartTimeMs?: number;
-  /** Bytes received since last check (0 = no response at all) */
-  bytesReceived?: number;
-  /** Number of continue signals already sent */
-  continueSignalsSent?: number;
-}
-
 export interface FailureContext {
   exitCode?: number;
   stallPhase?: number;
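This hunk drops the whole 0.2.2 stall-tuning surface (StallDetectionConfig, DEFAULT_STALL_CONFIG, StallContext). For context, a 0.2.2 caller could tune thresholds by spreading the defaults; a minimal sketch against the removed types, assuming they were importable from this module (they no longer exist in 0.2.3):

// Sketch, 0.2.2 only: override two thresholds, keep the rest of the defaults.
const patientConfig: StallDetectionConfig = {
  ...DEFAULT_STALL_CONFIG,
  idleTimeoutMs: 5 * 60 * 1000, // first continue signal after 5 minutes of silence instead of 2
  maxRestarts: 3,               // allow one more restart before giving up
};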
@@ -125,167 +71,6 @@ export interface FailureContext {
   circuitBreakerName?: string;
 }
 
-/**
- * Analyze stall condition with multi-layer detection and escalating recovery
- *
- * @deprecated Use StallDetectionService from './stall-detection' instead.
- * This function is kept for backward compatibility but will be removed in a future version.
- *
- * The new unified StallDetectionService provides:
- * - Single source of truth for stall state
- * - Automatic recovery action execution
- * - Better heartbeat filtering
- * - Consistent state management
- *
- * Recovery escalation stages:
- * 1. Phase 0 → Phase 1: Send continue signal (after 2 min idle)
- * 2. Phase 1 → Phase 2: Send stronger prompt (after 2 min grace)
- * 3. Phase 2 → Phase 3: Kill and restart process (after 2 min grace)
- * 4. Phase 3+: Abort after max restarts exceeded
- */
-export function analyzeStall(context: StallContext, config: StallDetectionConfig = DEFAULT_STALL_CONFIG): FailureAnalysis {
-  const {
-    stallPhase,
-    idleTimeMs,
-    progressTimeMs,
-    lastOutput,
-    restartCount = 0,
-    taskStartTimeMs,
-    bytesReceived = -1, // -1 means not tracked
-    continueSignalsSent = 0,
-  } = context;
-
-  // Check if this might be a long operation
-  const isLongOperation = lastOutput && config.longOperationPatterns.some(p => p.test(lastOutput));
-
-  // If it's a long operation but we've received 0 real bytes for a while,
-  // reduce the grace period to avoid waiting forever for a hung process.
-  // We use 2x the normal idle timeout as a "sanity check" for silent long operations.
-  const silentLongOpCappedTimeout = config.idleTimeoutMs * 2;
-  const effectiveIdleTimeout = isLongOperation
-    ? (bytesReceived === 0 ? Math.min(config.longOperationGraceMs, silentLongOpCappedTimeout) : config.longOperationGraceMs)
-    : config.idleTimeoutMs;
-
-  // Check for task timeout
-  if (taskStartTimeMs && (Date.now() - taskStartTimeMs) > config.taskTimeoutMs) {
-    return {
-      type: FailureType.AGENT_TIMEOUT,
-      action: restartCount < config.maxRestarts ? RecoveryAction.KILL_AND_RESTART : RecoveryAction.RUN_DOCTOR,
-      message: `Task exceeded maximum timeout of ${Math.round(config.taskTimeoutMs / 60000)} minutes`,
-      isTransient: restartCount < config.maxRestarts,
-      details: { taskDurationMs: Date.now() - taskStartTimeMs, restartCount },
-    };
-  }
-
-  // Check for zero bytes received (agent completely unresponsive)
-  if (bytesReceived === 0 && idleTimeMs > effectiveIdleTimeout) {
-    return {
-      type: FailureType.AGENT_NO_RESPONSE,
-      action: stallPhase < 2 ? RecoveryAction.CONTINUE_SIGNAL : RecoveryAction.KILL_AND_RESTART,
-      message: `Agent produced 0 bytes for ${Math.round(idleTimeMs / 1000)}s - possible API issue`,
-      isTransient: true,
-      details: { idleTimeMs, bytesReceived, stallPhase },
-    };
-  }
-
-  // Check for no progress (state file not updating)
-  if (progressTimeMs && progressTimeMs > config.progressTimeoutMs) {
-    return {
-      type: FailureType.STALL_NO_PROGRESS,
-      action: stallPhase === 0 ? RecoveryAction.CONTINUE_SIGNAL :
-              stallPhase === 1 ? RecoveryAction.STRONGER_PROMPT :
-              RecoveryAction.KILL_AND_RESTART,
-      message: `No progress for ${Math.round(progressTimeMs / 60000)} minutes`,
-      isTransient: true,
-      details: { progressTimeMs, stallPhase },
-    };
-  }
-
-  // Phase 0: Normal operation, check for initial idle
-  if (stallPhase === 0 && idleTimeMs > effectiveIdleTimeout) {
-    return {
-      type: FailureType.STALL_IDLE,
-      action: RecoveryAction.CONTINUE_SIGNAL,
-      message: `Lane idle for ${Math.round(idleTimeMs / 1000)}s. Sending continue signal...`,
-      isTransient: true,
-      details: { idleTimeMs, isLongOperation, phase: 0 },
-    };
-  }
-
-  // Phase 1: Continue signal sent, wait for response
-  if (stallPhase === 1) {
-    const graceTimeout = 2 * 60 * 1000; // 2 minutes grace after continue
-
-    if (idleTimeMs > graceTimeout) {
-      return {
-        type: FailureType.STALL_IDLE,
-        action: RecoveryAction.STRONGER_PROMPT,
-        message: `Still idle after continue signal. Sending stronger prompt...`,
-        isTransient: true,
-        details: { idleTimeMs, continueSignalsSent, phase: 1 },
-      };
-    }
-  }
-
-  // Phase 2: Stronger prompt sent, wait or escalate
-  if (stallPhase === 2) {
-    const strongerGraceTimeout = 2 * 60 * 1000; // 2 minutes grace after stronger prompt
-
-    if (idleTimeMs > strongerGraceTimeout) {
-      if (restartCount < config.maxRestarts) {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.KILL_AND_RESTART,
-          message: `No response after stronger prompt. Killing and restarting process...`,
-          isTransient: true,
-          details: { idleTimeMs, restartCount, maxRestarts: config.maxRestarts, phase: 2 },
-        };
-      } else {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.RUN_DOCTOR,
-          message: `Lane failed after ${restartCount} restarts. Running diagnostics...`,
-          isTransient: false,
-          details: { restartCount, phase: 2 },
-        };
-      }
-    }
-  }
-
-  // Phase 3+: After restart, monitor with shorter timeout
-  if (stallPhase >= 3) {
-    const postRestartTimeout = config.idleTimeoutMs * 0.75; // Shorter timeout after restart
-
-    if (idleTimeMs > postRestartTimeout) {
-      if (restartCount < config.maxRestarts) {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.CONTINUE_SIGNAL,
-          message: `Lane idle after restart. Retrying continue signal...`,
-          isTransient: true,
-          details: { idleTimeMs, restartCount, phase: stallPhase },
-        };
-      } else {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.RUN_DOCTOR,
-          message: `Lane repeatedly stalled. Running diagnostics for root cause...`,
-          isTransient: false,
-          details: { stallPhase, restartCount },
-        };
-      }
-    }
-  }
-
-  // No action needed yet
-  return {
-    type: FailureType.STALL_IDLE,
-    action: RecoveryAction.NONE,
-    message: 'Monitoring for stall',
-    isTransient: true,
-  };
-}
-
 /**
  * Analyze an error message or state to determine the failure type and recovery action
  */
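To make the removed escalation concrete, here is a worked example of how analyzeStall responded on 0.2.2 (sketch; FailureType and RecoveryAction are the enums referenced throughout this module):

// 0.2.2 only - the function removed above no longer exists in 0.2.3.
const analysis = analyzeStall(
  { stallPhase: 0, idleTimeMs: 3 * 60 * 1000, bytesReceived: 0 },
  DEFAULT_STALL_CONFIG,
);
// Three minutes of silence exceeds the 2-minute idleTimeoutMs and bytesReceived is 0,
// so this hit the AGENT_NO_RESPONSE branch with action CONTINUE_SIGNAL (isTransient: true).
// The same context at stallPhase 2 would have escalated to KILL_AND_RESTART.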
@@ -310,11 +95,12 @@ export function analyzeFailure(error: string | null | undefined, context?: Failu
   // 1. Network errors
   if (msg.includes('econnreset') || msg.includes('econnrefused') ||
       msg.includes('etimedout') || msg.includes('enotfound') ||
-      msg.includes('socket hang up') || msg.includes('network')) {
+      msg.includes('socket hang up') || msg.includes('network') ||
+      msg.includes('canceled') || msg.includes('http/2') || msg.includes('stream closed')) {
     return {
       type: FailureType.NETWORK_ERROR,
       action: (context?.retryCount || 0) < 3 ? RecoveryAction.RETRY_TASK : RecoveryAction.RESTART_LANE,
-      message: 'Network error. Retrying...',
+      message: 'Network error or connection lost. Retrying...',
       isTransient: true,
       suggestedDelayMs: 5000 * Math.pow(2, context?.retryCount || 0),
     };
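The broadened pattern list means cancellations and HTTP/2 stream failures are now classified as transient network errors and retried with the same exponential backoff. A sketch of the classification, assuming no earlier numbered check matches the message:

const result = analyzeFailure('request canceled: http/2 stream closed', { retryCount: 1 });
// result.type   === FailureType.NETWORK_ERROR
// result.action === RecoveryAction.RETRY_TASK          (retryCount 1 < 3)
// result.suggestedDelayMs === 5000 * Math.pow(2, 1)    // 10 seconds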
@@ -425,17 +211,10 @@ export function analyzeFailure(error: string | null | undefined, context?: Failu
     };
   }
 
-  // 10. Stalls (handled by phase)
-  if (context?.stallPhase !== undefined && context.stallPhase >= 0) {
-    return analyzeStall({
-      stallPhase: context.stallPhase,
-      idleTimeMs: context.idleTimeMs || 0,
-      progressTimeMs: context.progressTimeMs,
-      restartCount: context.restartCount,
-      taskStartTimeMs: context.taskStartTimeMs,
-    });
-  }
-
+  // 10. Stalls
+  // Deprecated: analyzeStall call removed. Orchestrator now uses StallDetectionService
+  // for all stall-related monitoring and recovery.
+
   // 11. Default fallback
   return {
     type: FailureType.UNKNOWN_CRASH,
@@ -82,9 +82,6 @@ export interface InterventionResult {
 /** Intervention request file name */
 export const PENDING_INTERVENTION_FILE = 'pending-intervention.json';
 
-/** Legacy intervention.txt file name (kept for compatibility) */
-export const LEGACY_INTERVENTION_FILE = 'intervention.txt';
-
 /** Wait time for process termination (ms) */
 const KILL_TIMEOUT_MS = 5000;
 
@@ -137,13 +134,6 @@ export function getPendingInterventionPath(laneRunDir: string): string {
   return safeJoin(laneRunDir, PENDING_INTERVENTION_FILE);
 }
 
-/**
- * Get the legacy intervention.txt path (compatibility)
- */
-export function getLegacyInterventionPath(laneRunDir: string): string {
-  return safeJoin(laneRunDir, LEGACY_INTERVENTION_FILE);
-}
-
 /**
  * Create and save an intervention request
  *
@@ -173,10 +163,6 @@ export function createInterventionRequest(
   fs.writeFileSync(filePath, JSON.stringify(fullRequest, null, 2), 'utf8');
   logger.debug(`[Intervention] Created request: ${filePath}`);
 
-  // Also write to the legacy intervention.txt (for compatibility and logging)
-  const legacyPath = getLegacyInterventionPath(laneRunDir);
-  fs.writeFileSync(legacyPath, fullRequest.message, 'utf8');
-
   return filePath;
 }
 
@@ -204,15 +190,11 @@ export function readPendingIntervention(laneRunDir: string): InterventionRequest
  */
 export function clearPendingIntervention(laneRunDir: string): void {
   const filePath = getPendingInterventionPath(laneRunDir);
-  const legacyPath = getLegacyInterventionPath(laneRunDir);
 
   try {
     if (fs.existsSync(filePath)) {
       fs.unlinkSync(filePath);
     }
-    if (fs.existsSync(legacyPath)) {
-      fs.unlinkSync(legacyPath);
-    }
   } catch (error) {
     // Ignore cleanup errors
   }
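With the legacy file gone, pending-intervention.json is the only hand-off between the UI and the runner. A rough sketch of the consumer flow using the helpers visible in this file (a falsy return from readPendingIntervention when nothing is pending is an assumption):

const pending = readPendingIntervention(laneRunDir);
if (pending) {
  // Inject pending.message into the next agent turn, then clean up;
  // clearPendingIntervention() now removes only pending-intervention.json.
  clearPendingIntervention(laneRunDir);
}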
@@ -40,6 +40,9 @@ import {
   getGitErrorGuidance,
   LaneRecoveryState,
 } from './auto-recovery';
+import {
+  isInterventionRestart,
+} from './intervention';
 import {
   StallDetectionService,
   getStallService,
@@ -994,9 +997,16 @@ export async function orchestrate(tasksDir: string, options: {
           logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
         }
       } else {
-        // Check if it was a restart request (RESTART_REQUESTED phase)
-        if (stallPhase === StallPhase.RESTART_REQUESTED) {
-          logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
+        // Check if it was a restart request or intervention (killed to be resumed)
+        if (stallPhase === StallPhase.RESTART_REQUESTED ||
+            stallPhase === StallPhase.CONTINUE_SENT ||
+            stallPhase === StallPhase.STRONGER_PROMPT_SENT ||
+            isInterventionRestart(laneRunDirs[finished.name]!)) {
+          const isManual = isInterventionRestart(laneRunDirs[finished.name]!);
+          const phaseName = isManual ? 'manual intervention' :
+            (stallPhase === StallPhase.RESTART_REQUESTED ? 'restart' : 'automatic intervention');
+
+          logger.info(`🔄 Lane ${finished.name} is being resumed/restarted due to ${phaseName}...`);
 
           // Update startIndex from current state to resume from the same task
           const statePath = safeJoin(laneRunDirs[finished.name]!, 'state.json');
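For reference, the phaseName chosen by the block above resolves as follows (read directly from the new condition and ternary):

// isInterventionRestart(laneRunDir) === true                        -> 'manual intervention'
// stallPhase === StallPhase.RESTART_REQUESTED                       -> 'restart'
// stallPhase === CONTINUE_SENT or STRONGER_PROMPT_SENT (not manual) -> 'automatic intervention'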
@@ -218,27 +218,32 @@ async function cursorAgentSendRaw({ workspaceDir, chatId, prompt, model, signalD
     fs.mkdirSync(signalDir, { recursive: true });
   }
 
-  const interventionPath = path.join(signalDir, 'intervention.txt');
+  const interventionPath = path.join(signalDir, 'pending-intervention.json');
   const timeoutPath = path.join(signalDir, 'timeout.txt');
 
-  // Watch for timeout signals from UI (intervention via stdin no longer works)
+  // Watch for signals from UI (intervention via stdin no longer works)
   signalWatcher = fs.watch(signalDir, (event, filename) => {
-    if (filename === 'intervention.txt' && fs.existsSync(interventionPath)) {
+    if (filename === 'pending-intervention.json' && fs.existsSync(interventionPath)) {
       try {
-        const message = fs.readFileSync(interventionPath, 'utf8').trim();
-        if (message) {
-          // Log intervention but cannot send via stdin (already closed)
-          logger.warn(`👋 Intervention received but stdin is closed (cursor-agent CLI limitation): ${message.substring(0, 50)}...`);
-
-          if (signalDir) {
-            const convoPath = path.join(signalDir, 'conversation.jsonl');
-            appendLog(convoPath, createConversationEntry('intervention', `[INTERVENTION IGNORED - stdin closed]: ${message}`, {
-              task: taskName || 'AGENT_TURN',
-              model: 'manual'
-            }));
-          }
-          fs.unlinkSync(interventionPath);
+        // Log intervention
+        logger.warn(`👋 Intervention received. Interrupting agent for resume...`);
+
+        if (signalDir) {
+          const convoPath = path.join(signalDir, 'conversation.jsonl');
+          appendLog(convoPath, createConversationEntry('intervention', `[INTERVENTION RECEIVED]`, {
+            task: taskName || 'AGENT_TURN',
+            model: 'manual'
+          }));
         }
+
+        // Kill the agent child process.
+        // This will cause the runner to finish this task (with error)
+        // and since we've already written pending-intervention.json,
+        // it will be picked up on next resume.
+        child.kill('SIGTERM');
+
+        // Note: we don't unlink pending-intervention.json here,
+        // the runner will do it when resuming to read the message.
       } catch {}
     }
 
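The watcher now treats pending-intervention.json as an "interrupt and resume" signal instead of trying to pipe text over a closed stdin. The other side of the protocol is a plain file write into the lane's signal directory; a hypothetical trigger from a UI process might look like this (file name taken from the diff, payload shape assumed; the real request is produced by createInterventionRequest):

import * as fs from 'node:fs';
import * as path from 'node:path';

function requestIntervention(signalDir: string, message: string): void {
  const file = path.join(signalDir, 'pending-intervention.json');
  // Writing this file fires the fs.watch() callback above, which SIGTERMs the agent;
  // the runner picks the request up when the lane resumes.
  fs.writeFileSync(file, JSON.stringify({ message }, null, 2), 'utf8');
}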
@@ -827,16 +827,17 @@ export class StallDetectionService {
       priority: 5,
     });
 
-    // 2. Kill the process (if any)
+    // 2. Update state first (to prevent a race condition)
+    state.phase = StallPhase.CONTINUE_SENT;
+    state.lastPhaseChangeTime = Date.now();
+    state.continueSignalCount++;
+
+    // 3. Kill the process (if any)
     if (state.childProcess?.pid && !state.childProcess.killed) {
       logger.info(`[${state.laneName}] Interrupting process ${state.childProcess.pid} for continue signal`);
       await killAndWait(state.childProcess.pid);
     }
 
-    state.phase = StallPhase.CONTINUE_SENT;
-    state.lastPhaseChangeTime = Date.now();
-    state.continueSignalCount++;
-
     logger.info(`[${state.laneName}] Continue signal queued (#${state.continueSignalCount}) - agent will resume with intervention`);
 
     events.emit('recovery.continue_signal', {
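The reorder closes a race: killAndWait() lets the child's exit reach the orchestrator, and with the old order that exit could be observed while state.phase still held its previous value, so the lane was not recognised as a pending resume. Sketch of the two orderings:

// old: killAndWait(pid) -> exit handled -> phase still pre-CONTINUE_SENT -> not resumed as intended
// new: phase = CONTINUE_SENT -> killAndWait(pid) -> exit handled -> orchestrate() sees CONTINUE_SENT
//      (see the orchestrate() hunk above) and resumes the lane with the queued intervention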
@@ -875,15 +876,16 @@
       priority: 7,
     });
 
-    // 2. Kill the process (if any)
+    // 2. Update state first (to prevent a race condition)
+    state.phase = StallPhase.STRONGER_PROMPT_SENT;
+    state.lastPhaseChangeTime = Date.now();
+
+    // 3. Kill the process (if any)
     if (state.childProcess?.pid && !state.childProcess.killed) {
       logger.warn(`[${state.laneName}] Interrupting process ${state.childProcess.pid} for stronger prompt`);
       await killAndWait(state.childProcess.pid);
     }
 
-    state.phase = StallPhase.STRONGER_PROMPT_SENT;
-    state.lastPhaseChangeTime = Date.now();
-
     logger.warn(`[${state.laneName}] Stronger prompt queued - agent will resume with intervention`);
 
     events.emit('recovery.stronger_prompt', {
@@ -1,7 +0,0 @@
-/**
- * CursorFlow prepare command
- *
- * Prepare task files for a new feature - Terminal-first approach
- */
-declare function prepare(args: string[]): Promise<void>;
-export = prepare;