@litmers/cursorflow-orchestrator 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -59,60 +59,6 @@ export interface FailureAnalysis {
 /**
  * Multi-layer stall detection configuration
  */
-export interface StallDetectionConfig {
-  /** Time without stdout activity before sending continue signal */
-  idleTimeoutMs: number;
-  /** Time without state file update before considering stalled */
-  progressTimeoutMs: number;
-  /** Maximum time for a single task */
-  taskTimeoutMs: number;
-  /** Grace period for known long operations (e.g., npm install) */
-  longOperationGraceMs: number;
-  /** Patterns that indicate long operations */
-  longOperationPatterns: RegExp[];
-  /** Maximum restarts before aborting */
-  maxRestarts: number;
-}
-
-export const DEFAULT_STALL_CONFIG: StallDetectionConfig = {
-  idleTimeoutMs: 2 * 60 * 1000, // 2 minutes without output (idle detection)
-  progressTimeoutMs: 10 * 60 * 1000, // 10 minutes without progress
-  taskTimeoutMs: 30 * 60 * 1000, // 30 minutes max per task
-  longOperationGraceMs: 10 * 60 * 1000, // 10 minute grace for long ops
-  longOperationPatterns: [
-    /Installing dependencies/i,
-    /npm install/i,
-    /pnpm install/i,
-    /yarn install/i,
-    /Building/i,
-    /Compiling/i,
-    /Downloading/i,
-    /Fetching/i,
-    /Cloning/i,
-    /Bundling/i,
-  ],
-  maxRestarts: 2,
-};
-
-export interface StallContext {
-  /** Current stall phase (0: normal, 1: continued, 2: stronger_prompt, 3: restarted) */
-  stallPhase: number;
-  /** Time since last activity */
-  idleTimeMs: number;
-  /** Time since last state update */
-  progressTimeMs?: number;
-  /** Last output line (for long operation detection) */
-  lastOutput?: string;
-  /** Number of restarts */
-  restartCount?: number;
-  /** Task start time */
-  taskStartTimeMs?: number;
-  /** Bytes received since last check (0 = no response at all) */
-  bytesReceived?: number;
-  /** Number of continue signals already sent */
-  continueSignalsSent?: number;
-}
-
 export interface FailureContext {
   exitCode?: number;
   stallPhase?: number;
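This hunk drops the whole 0.2.2 stall-tuning surface (StallDetectionConfig, DEFAULT_STALL_CONFIG, StallContext). For context, a 0.2.2 caller could tune thresholds by spreading the defaults; a minimal sketch against the removed types, assuming they were importable from this module (they no longer exist in 0.2.3):

// Sketch, 0.2.2 only: override two thresholds, keep the rest of the defaults.
const patientConfig: StallDetectionConfig = {
  ...DEFAULT_STALL_CONFIG,
  idleTimeoutMs: 5 * 60 * 1000, // first continue signal after 5 minutes of silence instead of 2
  maxRestarts: 3,               // allow one more restart before giving up
};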
@@ -125,167 +71,6 @@ export interface FailureContext {
   circuitBreakerName?: string;
 }
 
-/**
- * Analyze stall condition with multi-layer detection and escalating recovery
- *
- * @deprecated Use StallDetectionService from './stall-detection' instead.
- * This function is kept for backward compatibility but will be removed in a future version.
- *
- * The new unified StallDetectionService provides:
- * - Single source of truth for stall state
- * - Automatic recovery action execution
- * - Better heartbeat filtering
- * - Consistent state management
- *
- * Recovery escalation stages:
- * 1. Phase 0 → Phase 1: Send continue signal (after 2 min idle)
- * 2. Phase 1 → Phase 2: Send stronger prompt (after 2 min grace)
- * 3. Phase 2 → Phase 3: Kill and restart process (after 2 min grace)
- * 4. Phase 3+: Abort after max restarts exceeded
- */
-export function analyzeStall(context: StallContext, config: StallDetectionConfig = DEFAULT_STALL_CONFIG): FailureAnalysis {
-  const {
-    stallPhase,
-    idleTimeMs,
-    progressTimeMs,
-    lastOutput,
-    restartCount = 0,
-    taskStartTimeMs,
-    bytesReceived = -1, // -1 means not tracked
-    continueSignalsSent = 0,
-  } = context;
-
-  // Check if this might be a long operation
-  const isLongOperation = lastOutput && config.longOperationPatterns.some(p => p.test(lastOutput));
-
-  // If it's a long operation but we've received 0 real bytes for a while,
-  // reduce the grace period to avoid waiting forever for a hung process.
-  // We use 2x the normal idle timeout as a "sanity check" for silent long operations.
-  const silentLongOpCappedTimeout = config.idleTimeoutMs * 2;
-  const effectiveIdleTimeout = isLongOperation
-    ? (bytesReceived === 0 ? Math.min(config.longOperationGraceMs, silentLongOpCappedTimeout) : config.longOperationGraceMs)
-    : config.idleTimeoutMs;
-
-  // Check for task timeout
-  if (taskStartTimeMs && (Date.now() - taskStartTimeMs) > config.taskTimeoutMs) {
-    return {
-      type: FailureType.AGENT_TIMEOUT,
-      action: restartCount < config.maxRestarts ? RecoveryAction.KILL_AND_RESTART : RecoveryAction.RUN_DOCTOR,
-      message: `Task exceeded maximum timeout of ${Math.round(config.taskTimeoutMs / 60000)} minutes`,
-      isTransient: restartCount < config.maxRestarts,
-      details: { taskDurationMs: Date.now() - taskStartTimeMs, restartCount },
-    };
-  }
-
-  // Check for zero bytes received (agent completely unresponsive)
-  if (bytesReceived === 0 && idleTimeMs > effectiveIdleTimeout) {
-    return {
-      type: FailureType.AGENT_NO_RESPONSE,
-      action: stallPhase < 2 ? RecoveryAction.CONTINUE_SIGNAL : RecoveryAction.KILL_AND_RESTART,
-      message: `Agent produced 0 bytes for ${Math.round(idleTimeMs / 1000)}s - possible API issue`,
-      isTransient: true,
-      details: { idleTimeMs, bytesReceived, stallPhase },
-    };
-  }
-
-  // Check for no progress (state file not updating)
-  if (progressTimeMs && progressTimeMs > config.progressTimeoutMs) {
-    return {
-      type: FailureType.STALL_NO_PROGRESS,
-      action: stallPhase === 0 ? RecoveryAction.CONTINUE_SIGNAL :
-              stallPhase === 1 ? RecoveryAction.STRONGER_PROMPT :
-              RecoveryAction.KILL_AND_RESTART,
-      message: `No progress for ${Math.round(progressTimeMs / 60000)} minutes`,
-      isTransient: true,
-      details: { progressTimeMs, stallPhase },
-    };
-  }
-
-  // Phase 0: Normal operation, check for initial idle
-  if (stallPhase === 0 && idleTimeMs > effectiveIdleTimeout) {
-    return {
-      type: FailureType.STALL_IDLE,
-      action: RecoveryAction.CONTINUE_SIGNAL,
-      message: `Lane idle for ${Math.round(idleTimeMs / 1000)}s. Sending continue signal...`,
-      isTransient: true,
-      details: { idleTimeMs, isLongOperation, phase: 0 },
-    };
-  }
-
-  // Phase 1: Continue signal sent, wait for response
-  if (stallPhase === 1) {
-    const graceTimeout = 2 * 60 * 1000; // 2 minutes grace after continue
-
-    if (idleTimeMs > graceTimeout) {
-      return {
-        type: FailureType.STALL_IDLE,
-        action: RecoveryAction.STRONGER_PROMPT,
-        message: `Still idle after continue signal. Sending stronger prompt...`,
-        isTransient: true,
-        details: { idleTimeMs, continueSignalsSent, phase: 1 },
-      };
-    }
-  }
-
-  // Phase 2: Stronger prompt sent, wait or escalate
-  if (stallPhase === 2) {
-    const strongerGraceTimeout = 2 * 60 * 1000; // 2 minutes grace after stronger prompt
-
-    if (idleTimeMs > strongerGraceTimeout) {
-      if (restartCount < config.maxRestarts) {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.KILL_AND_RESTART,
-          message: `No response after stronger prompt. Killing and restarting process...`,
-          isTransient: true,
-          details: { idleTimeMs, restartCount, maxRestarts: config.maxRestarts, phase: 2 },
-        };
-      } else {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.RUN_DOCTOR,
-          message: `Lane failed after ${restartCount} restarts. Running diagnostics...`,
-          isTransient: false,
-          details: { restartCount, phase: 2 },
-        };
-      }
-    }
-  }
-
-  // Phase 3+: After restart, monitor with shorter timeout
-  if (stallPhase >= 3) {
-    const postRestartTimeout = config.idleTimeoutMs * 0.75; // Shorter timeout after restart
-
-    if (idleTimeMs > postRestartTimeout) {
-      if (restartCount < config.maxRestarts) {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.CONTINUE_SIGNAL,
-          message: `Lane idle after restart. Retrying continue signal...`,
-          isTransient: true,
-          details: { idleTimeMs, restartCount, phase: stallPhase },
-        };
-      } else {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.RUN_DOCTOR,
-          message: `Lane repeatedly stalled. Running diagnostics for root cause...`,
-          isTransient: false,
-          details: { stallPhase, restartCount },
-        };
-      }
-    }
-  }
-
-  // No action needed yet
-  return {
-    type: FailureType.STALL_IDLE,
-    action: RecoveryAction.NONE,
-    message: 'Monitoring for stall',
-    isTransient: true,
-  };
-}
-
 /**
  * Analyze an error message or state to determine the failure type and recovery action
  */
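To make the removed escalation concrete, here is a worked example of how analyzeStall responded on 0.2.2 (sketch; FailureType and RecoveryAction are the enums referenced throughout this module):

// 0.2.2 only - the function removed above no longer exists in 0.2.3.
const analysis = analyzeStall(
  { stallPhase: 0, idleTimeMs: 3 * 60 * 1000, bytesReceived: 0 },
  DEFAULT_STALL_CONFIG,
);
// Three minutes of silence exceeds the 2-minute idleTimeoutMs and bytesReceived is 0,
// so this hit the AGENT_NO_RESPONSE branch with action CONTINUE_SIGNAL (isTransient: true).
// The same context at stallPhase 2 would have escalated to KILL_AND_RESTART.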
@@ -310,11 +95,12 @@ export function analyzeFailure(error: string | null | undefined, context?: Failu
   // 1. Network errors
   if (msg.includes('econnreset') || msg.includes('econnrefused') ||
       msg.includes('etimedout') || msg.includes('enotfound') ||
-      msg.includes('socket hang up') || msg.includes('network')) {
+      msg.includes('socket hang up') || msg.includes('network') ||
+      msg.includes('canceled') || msg.includes('http/2') || msg.includes('stream closed')) {
     return {
       type: FailureType.NETWORK_ERROR,
       action: (context?.retryCount || 0) < 3 ? RecoveryAction.RETRY_TASK : RecoveryAction.RESTART_LANE,
-      message: 'Network error. Retrying...',
+      message: 'Network error or connection lost. Retrying...',
       isTransient: true,
       suggestedDelayMs: 5000 * Math.pow(2, context?.retryCount || 0),
     };
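The broadened pattern list means cancellations and HTTP/2 stream failures are now classified as transient network errors and retried with the same exponential backoff. A sketch of the classification, assuming no earlier numbered check matches the message:

const result = analyzeFailure('request canceled: http/2 stream closed', { retryCount: 1 });
// result.type   === FailureType.NETWORK_ERROR
// result.action === RecoveryAction.RETRY_TASK          (retryCount 1 < 3)
// result.suggestedDelayMs === 5000 * Math.pow(2, 1)    // 10 seconds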
@@ -425,17 +211,10 @@ export function analyzeFailure(error: string | null | undefined, context?: Failu
     };
   }
 
-  // 10. Stalls (handled by phase)
-  if (context?.stallPhase !== undefined && context.stallPhase >= 0) {
-    return analyzeStall({
-      stallPhase: context.stallPhase,
-      idleTimeMs: context.idleTimeMs || 0,
-      progressTimeMs: context.progressTimeMs,
-      restartCount: context.restartCount,
-      taskStartTimeMs: context.taskStartTimeMs,
-    });
-  }
-
+  // 10. Stalls
+  // Deprecated: analyzeStall call removed. Orchestrator now uses StallDetectionService
+  // for all stall-related monitoring and recovery.
+
   // 11. Default fallback
   return {
     type: FailureType.UNKNOWN_CRASH,
@@ -82,9 +82,6 @@ export interface InterventionResult {
 /** Intervention request file name */
 export const PENDING_INTERVENTION_FILE = 'pending-intervention.json';
 
-/** Legacy intervention.txt file name (kept for compatibility) */
-export const LEGACY_INTERVENTION_FILE = 'intervention.txt';
-
 /** Wait time for process termination (ms) */
 const KILL_TIMEOUT_MS = 5000;
 
@@ -137,13 +134,6 @@ export function getPendingInterventionPath(laneRunDir: string): string {
   return safeJoin(laneRunDir, PENDING_INTERVENTION_FILE);
 }
 
-/**
- * Get the legacy intervention.txt path (compatibility)
- */
-export function getLegacyInterventionPath(laneRunDir: string): string {
-  return safeJoin(laneRunDir, LEGACY_INTERVENTION_FILE);
-}
-
 /**
  * Create and save an intervention request
  *
@@ -173,10 +163,6 @@ export function createInterventionRequest(
   fs.writeFileSync(filePath, JSON.stringify(fullRequest, null, 2), 'utf8');
   logger.debug(`[Intervention] Created request: ${filePath}`);
 
-  // Also write to the legacy intervention.txt (for compatibility and logging)
-  const legacyPath = getLegacyInterventionPath(laneRunDir);
-  fs.writeFileSync(legacyPath, fullRequest.message, 'utf8');
-
   return filePath;
 }
 
@@ -204,15 +190,11 @@ export function readPendingIntervention(laneRunDir: string): InterventionRequest
  */
 export function clearPendingIntervention(laneRunDir: string): void {
   const filePath = getPendingInterventionPath(laneRunDir);
-  const legacyPath = getLegacyInterventionPath(laneRunDir);
 
   try {
     if (fs.existsSync(filePath)) {
       fs.unlinkSync(filePath);
     }
-    if (fs.existsSync(legacyPath)) {
-      fs.unlinkSync(legacyPath);
-    }
   } catch (error) {
     // Ignore cleanup errors
   }
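With the legacy file gone, pending-intervention.json is the only hand-off between the UI and the runner. A rough sketch of the consumer flow using the helpers visible in this file (a falsy return from readPendingIntervention when nothing is pending is an assumption):

const pending = readPendingIntervention(laneRunDir);
if (pending) {
  // Inject pending.message into the next agent turn, then clean up;
  // clearPendingIntervention() now removes only pending-intervention.json.
  clearPendingIntervention(laneRunDir);
}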
@@ -40,6 +40,9 @@ import {
   getGitErrorGuidance,
   LaneRecoveryState,
 } from './auto-recovery';
+import {
+  isInterventionRestart,
+} from './intervention';
 import {
   StallDetectionService,
   getStallService,
@@ -994,9 +997,16 @@ export async function orchestrate(tasksDir: string, options: {
           logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
         }
       } else {
-        // Check if it was a restart request (RESTART_REQUESTED phase)
-        if (stallPhase === StallPhase.RESTART_REQUESTED) {
-          logger.info(`🔄 Lane ${finished.name} is being restarted due to stall...`);
+        // Check if it was a restart request or intervention (killed to be resumed)
+        if (stallPhase === StallPhase.RESTART_REQUESTED ||
+            stallPhase === StallPhase.CONTINUE_SENT ||
+            stallPhase === StallPhase.STRONGER_PROMPT_SENT ||
+            isInterventionRestart(laneRunDirs[finished.name]!)) {
+          const isManual = isInterventionRestart(laneRunDirs[finished.name]!);
+          const phaseName = isManual ? 'manual intervention' :
+            (stallPhase === StallPhase.RESTART_REQUESTED ? 'restart' : 'automatic intervention');
+
+          logger.info(`🔄 Lane ${finished.name} is being resumed/restarted due to ${phaseName}...`);
 
           // Update startIndex from current state to resume from the same task
           const statePath = safeJoin(laneRunDirs[finished.name]!, 'state.json');
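For reference, the phaseName chosen by the block above resolves as follows (read directly from the new condition and ternary):

// isInterventionRestart(laneRunDir) === true                        -> 'manual intervention'
// stallPhase === StallPhase.RESTART_REQUESTED                       -> 'restart'
// stallPhase === CONTINUE_SENT or STRONGER_PROMPT_SENT (not manual) -> 'automatic intervention'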
@@ -218,27 +218,32 @@ async function cursorAgentSendRaw({ workspaceDir, chatId, prompt, model, signalD
     fs.mkdirSync(signalDir, { recursive: true });
   }
 
-  const interventionPath = path.join(signalDir, 'intervention.txt');
+  const interventionPath = path.join(signalDir, 'pending-intervention.json');
   const timeoutPath = path.join(signalDir, 'timeout.txt');
 
-  // Watch for timeout signals from UI (intervention via stdin no longer works)
+  // Watch for signals from UI (intervention via stdin no longer works)
   signalWatcher = fs.watch(signalDir, (event, filename) => {
-    if (filename === 'intervention.txt' && fs.existsSync(interventionPath)) {
+    if (filename === 'pending-intervention.json' && fs.existsSync(interventionPath)) {
       try {
-        const message = fs.readFileSync(interventionPath, 'utf8').trim();
-        if (message) {
-          // Log intervention but cannot send via stdin (already closed)
-          logger.warn(`👋 Intervention received but stdin is closed (cursor-agent CLI limitation): ${message.substring(0, 50)}...`);
-
-          if (signalDir) {
-            const convoPath = path.join(signalDir, 'conversation.jsonl');
-            appendLog(convoPath, createConversationEntry('intervention', `[INTERVENTION IGNORED - stdin closed]: ${message}`, {
-              task: taskName || 'AGENT_TURN',
-              model: 'manual'
-            }));
-          }
-          fs.unlinkSync(interventionPath);
+        // Log intervention
+        logger.warn(`👋 Intervention received. Interrupting agent for resume...`);
+
+        if (signalDir) {
+          const convoPath = path.join(signalDir, 'conversation.jsonl');
+          appendLog(convoPath, createConversationEntry('intervention', `[INTERVENTION RECEIVED]`, {
+            task: taskName || 'AGENT_TURN',
+            model: 'manual'
+          }));
         }
+
+        // Kill the agent child process.
+        // This will cause the runner to finish this task (with error)
+        // and since we've already written pending-intervention.json,
+        // it will be picked up on next resume.
+        child.kill('SIGTERM');
+
+        // Note: we don't unlink pending-intervention.json here,
+        // the runner will do it when resuming to read the message.
       } catch {}
     }
 
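The watcher now treats pending-intervention.json as an "interrupt and resume" signal instead of trying to pipe text over a closed stdin. The other side of the protocol is a plain file write into the lane's signal directory; a hypothetical trigger from a UI process might look like this (file name taken from the diff, payload shape assumed; the real request is produced by createInterventionRequest):

import * as fs from 'node:fs';
import * as path from 'node:path';

function requestIntervention(signalDir: string, message: string): void {
  const file = path.join(signalDir, 'pending-intervention.json');
  // Writing this file fires the fs.watch() callback above, which SIGTERMs the agent;
  // the runner picks the request up when the lane resumes.
  fs.writeFileSync(file, JSON.stringify({ message }, null, 2), 'utf8');
}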
@@ -827,16 +827,17 @@ export class StallDetectionService {
       priority: 5,
     });
 
-    // 2. Kill the process (if any)
+    // 2. Update state first (to prevent a race condition)
+    state.phase = StallPhase.CONTINUE_SENT;
+    state.lastPhaseChangeTime = Date.now();
+    state.continueSignalCount++;
+
+    // 3. Kill the process (if any)
     if (state.childProcess?.pid && !state.childProcess.killed) {
       logger.info(`[${state.laneName}] Interrupting process ${state.childProcess.pid} for continue signal`);
       await killAndWait(state.childProcess.pid);
     }
 
-    state.phase = StallPhase.CONTINUE_SENT;
-    state.lastPhaseChangeTime = Date.now();
-    state.continueSignalCount++;
-
     logger.info(`[${state.laneName}] Continue signal queued (#${state.continueSignalCount}) - agent will resume with intervention`);
 
     events.emit('recovery.continue_signal', {
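The reorder closes a race: killAndWait() lets the child's exit reach the orchestrator, and with the old order that exit could be observed while state.phase still held its previous value, so the lane was not recognised as a pending resume. Sketch of the two orderings:

// old: killAndWait(pid) -> exit handled -> phase still pre-CONTINUE_SENT -> not resumed as intended
// new: phase = CONTINUE_SENT -> killAndWait(pid) -> exit handled -> orchestrate() sees CONTINUE_SENT
//      (see the orchestrate() hunk above) and resumes the lane with the queued intervention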
@@ -875,15 +876,16 @@
       priority: 7,
     });
 
-    // 2. Kill the process (if any)
+    // 2. Update state first (to prevent a race condition)
+    state.phase = StallPhase.STRONGER_PROMPT_SENT;
+    state.lastPhaseChangeTime = Date.now();
+
+    // 3. Kill the process (if any)
     if (state.childProcess?.pid && !state.childProcess.killed) {
       logger.warn(`[${state.laneName}] Interrupting process ${state.childProcess.pid} for stronger prompt`);
       await killAndWait(state.childProcess.pid);
     }
 
-    state.phase = StallPhase.STRONGER_PROMPT_SENT;
-    state.lastPhaseChangeTime = Date.now();
-
     logger.warn(`[${state.laneName}] Stronger prompt queued - agent will resume with intervention`);
 
     events.emit('recovery.stronger_prompt', {
@@ -1,7 +0,0 @@
-/**
- * CursorFlow prepare command
- *
- * Prepare task files for a new feature - Terminal-first approach
- */
-declare function prepare(args: string[]): Promise<void>;
-export = prepare;