@litmers/cursorflow-orchestrator 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,75 +17,23 @@ import { LaneState } from '../utils/types';
17
17
  import { events } from '../utils/events';
18
18
  import { safeJoin } from '../utils/path';
19
19
  import { runHealthCheck, checkAgentHealth, checkAuthHealth } from '../utils/health';
20
+ import {
21
+ createInterventionRequest,
22
+ InterventionType,
23
+ createContinueMessage,
24
+ createStrongerPromptMessage,
25
+ createRestartMessage,
26
+ } from './intervention';
20
27
 
21
28
  // ============================================================================
22
29
  // Types & Constants
23
30
  // ============================================================================
24
31
 
25
- /** Recovery stages for escalating interventions */
26
- export enum RecoveryStage {
27
- /** Normal operation - monitoring */
28
- NORMAL = 0,
29
- /** First intervention - send continue signal */
30
- CONTINUE_SIGNAL = 1,
31
- /** Second intervention - send stronger prompt */
32
- STRONGER_PROMPT = 2,
33
- /** Third intervention - kill and restart process */
34
- RESTART_PROCESS = 3,
35
- /** Final stage - run doctor and report */
36
- DIAGNOSE = 4,
37
- /** No more recovery possible */
38
- ABORT = 5,
39
- }
40
-
41
- /** Configuration for auto-recovery behavior */
42
- export interface AutoRecoveryConfig {
43
- /** Time without activity before sending continue signal (default: 2 minutes) */
44
- idleTimeoutMs: number;
45
- /** Time to wait after continue signal before escalating (default: 2 minutes) */
46
- continueGraceMs: number;
47
- /** Time to wait after stronger prompt before escalating (default: 2 minutes) */
48
- strongerPromptGraceMs: number;
49
- /** Maximum number of restarts before aborting (default: 2) */
50
- maxRestarts: number;
51
- /** Whether to run doctor on persistent failures (default: true) */
52
- runDoctorOnFailure: boolean;
53
- /** Patterns indicating long-running operations (won't trigger idle) */
54
- longOperationPatterns: RegExp[];
55
- /** Grace period for long operations (default: 10 minutes) */
56
- longOperationGraceMs: number;
57
- /** Enable verbose logging */
58
- verbose: boolean;
59
- }
60
-
61
- /** Default auto-recovery configuration */
62
- export const DEFAULT_AUTO_RECOVERY_CONFIG: AutoRecoveryConfig = {
63
- idleTimeoutMs: 2 * 60 * 1000, // 2 minutes - idle detection
64
- continueGraceMs: 2 * 60 * 1000, // 2 minutes after continue
65
- strongerPromptGraceMs: 2 * 60 * 1000, // 2 minutes after stronger prompt
66
- maxRestarts: 2,
67
- runDoctorOnFailure: true,
68
- longOperationPatterns: [
69
- /installing\s+dependencies/i,
70
- /npm\s+(i|install|ci)/i,
71
- /pnpm\s+(i|install)/i,
72
- /yarn\s+(install)?/i,
73
- /building/i,
74
- /compiling/i,
75
- /bundling/i,
76
- /downloading/i,
77
- /fetching/i,
78
- /cloning/i,
79
- ],
80
- longOperationGraceMs: 10 * 60 * 1000, // 10 minutes for long ops
81
- verbose: false,
82
- };
83
-
84
32
  /** State tracking for a single lane's recovery */
85
33
  export interface LaneRecoveryState {
86
34
  laneName: string;
87
35
  runId: string;
88
- stage: RecoveryStage;
36
+ stage: number;
89
37
  lastActivityTime: number;
90
38
  lastBytesReceived: number;
91
39
  totalBytesReceived: number;
@@ -108,20 +56,10 @@ export interface DiagnosticInfo {
108
56
  details: string;
109
57
  }
110
58
 
111
- /** Recovery action result */
112
- export interface RecoveryActionResult {
113
- success: boolean;
114
- action: string;
115
- message: string;
116
- shouldContinue: boolean;
117
- nextStage?: RecoveryStage;
118
- diagnostic?: DiagnosticInfo;
119
- }
120
-
121
59
  /** Record of a failure for POF */
122
60
  export interface FailureRecord {
123
61
  timestamp: number;
124
- stage: RecoveryStage;
62
+ stage: number;
125
63
  action: string;
126
64
  message: string;
127
65
  idleTimeMs: number;
@@ -225,510 +163,7 @@ ${errorMessage}
225
163
  }
226
164
 
227
165
  // ============================================================================
228
- // Recovery State Manager
229
- // ============================================================================
230
-
231
- /**
232
- * Manages recovery state for all lanes
233
- */
234
- export class AutoRecoveryManager {
235
- private config: AutoRecoveryConfig;
236
- private laneStates: Map<string, LaneRecoveryState> = new Map();
237
- private eventHandlers: Map<string, () => void> = new Map();
238
-
239
- constructor(config: Partial<AutoRecoveryConfig> = {}) {
240
- this.config = { ...DEFAULT_AUTO_RECOVERY_CONFIG, ...config };
241
- }
242
-
243
- /**
244
- * Register a lane for recovery monitoring
245
- */
246
- registerLane(laneName: string, runId: string): void {
247
- const now = Date.now();
248
- this.laneStates.set(laneName, {
249
- laneName,
250
- runId,
251
- stage: RecoveryStage.NORMAL,
252
- lastActivityTime: now,
253
- lastBytesReceived: 0,
254
- totalBytesReceived: 0,
255
- lastOutput: '',
256
- restartCount: 0,
257
- continueSignalsSent: 0,
258
- lastStageChangeTime: now,
259
- isLongOperation: false,
260
- failureHistory: [],
261
- });
262
-
263
- if (this.config.verbose) {
264
- logger.info(`[AutoRecovery] Registered lane: ${laneName}`);
265
- }
266
- }
267
-
268
- /**
269
- * Unregister a lane from recovery monitoring
270
- */
271
- unregisterLane(laneName: string): void {
272
- this.laneStates.delete(laneName);
273
-
274
- const handler = this.eventHandlers.get(laneName);
275
- if (handler) {
276
- this.eventHandlers.delete(laneName);
277
- }
278
- }
279
-
280
- /**
281
- * Record activity for a lane
282
- */
283
- recordActivity(laneName: string, bytesReceived: number = 0, output?: string): void {
284
- const state = this.laneStates.get(laneName);
285
- if (!state) return;
286
-
287
- const now = Date.now();
288
-
289
- // Only update activity time if we actually received bytes
290
- // This allows heartbeats to be recorded (for logs/bytes) without resetting the idle timer
291
- if (bytesReceived > 0) {
292
- state.lastActivityTime = now;
293
- state.lastBytesReceived = bytesReceived;
294
- state.totalBytesReceived += bytesReceived;
295
- }
296
-
297
- if (output) {
298
- state.lastOutput = output;
299
- // Check if this is a long operation
300
- state.isLongOperation = this.config.longOperationPatterns.some(p => p.test(output));
301
- }
302
-
303
- // Reset stage if we got meaningful activity
304
- if (bytesReceived > 0 && state.stage !== RecoveryStage.NORMAL) {
305
- if (this.config.verbose) {
306
- logger.info(`[AutoRecovery] [${laneName}] Activity detected, resetting to NORMAL stage`);
307
- }
308
- state.stage = RecoveryStage.NORMAL;
309
- state.lastStageChangeTime = now;
310
- }
311
- }
312
-
313
- /**
314
- * Get current recovery state for a lane
315
- */
316
- getState(laneName: string): LaneRecoveryState | undefined {
317
- return this.laneStates.get(laneName);
318
- }
319
-
320
- /**
321
- * Check if a lane needs recovery intervention
322
- */
323
- needsIntervention(laneName: string): boolean {
324
- const state = this.laneStates.get(laneName);
325
- if (!state) return false;
326
-
327
- const now = Date.now();
328
- const idleTime = now - state.lastActivityTime;
329
-
330
- // Use longer timeout for long operations
331
- const effectiveTimeout = state.isLongOperation
332
- ? this.config.longOperationGraceMs
333
- : this.config.idleTimeoutMs;
334
-
335
- // Check based on current stage
336
- switch (state.stage) {
337
- case RecoveryStage.NORMAL:
338
- return idleTime > effectiveTimeout;
339
-
340
- case RecoveryStage.CONTINUE_SIGNAL:
341
- return (now - state.lastStageChangeTime) > this.config.continueGraceMs;
342
-
343
- case RecoveryStage.STRONGER_PROMPT:
344
- return (now - state.lastStageChangeTime) > this.config.strongerPromptGraceMs;
345
-
346
- case RecoveryStage.RESTART_PROCESS:
347
- // After restart, use normal timeout to detect if it's working
348
- return idleTime > effectiveTimeout;
349
-
350
- case RecoveryStage.DIAGNOSE:
351
- case RecoveryStage.ABORT:
352
- return false; // No more interventions
353
-
354
- default:
355
- return false;
356
- }
357
- }
358
-
359
- /**
360
- * Get the next recovery action for a lane
361
- */
362
- async getRecoveryAction(
363
- laneName: string,
364
- laneRunDir: string,
365
- child?: ChildProcess
366
- ): Promise<RecoveryActionResult> {
367
- const state = this.laneStates.get(laneName);
368
- if (!state) {
369
- return {
370
- success: false,
371
- action: 'none',
372
- message: 'Lane not registered',
373
- shouldContinue: false,
374
- };
375
- }
376
-
377
- const now = Date.now();
378
- const idleTime = now - state.lastActivityTime;
379
- const idleSeconds = Math.round(idleTime / 1000);
380
-
381
- switch (state.stage) {
382
- case RecoveryStage.NORMAL:
383
- // Escalate to CONTINUE_SIGNAL
384
- return await this.sendContinueSignal(laneName, laneRunDir, state, idleSeconds);
385
-
386
- case RecoveryStage.CONTINUE_SIGNAL:
387
- // Try a stronger prompt
388
- return await this.sendStrongerPrompt(laneName, laneRunDir, state);
389
-
390
- case RecoveryStage.STRONGER_PROMPT:
391
- // Try restarting the process
392
- if (state.restartCount < this.config.maxRestarts) {
393
- return await this.requestRestart(laneName, state, child);
394
- }
395
- // Fall through to diagnose
396
- state.stage = RecoveryStage.DIAGNOSE;
397
- state.lastStageChangeTime = now;
398
- return await this.runDiagnosis(laneName, laneRunDir, state);
399
-
400
- case RecoveryStage.RESTART_PROCESS:
401
- // After restart, if still no response, diagnose
402
- if (state.restartCount >= this.config.maxRestarts) {
403
- state.stage = RecoveryStage.DIAGNOSE;
404
- state.lastStageChangeTime = now;
405
- return await this.runDiagnosis(laneName, laneRunDir, state);
406
- }
407
- // Try continue signal again after restart
408
- return await this.sendContinueSignal(laneName, laneRunDir, state, idleSeconds);
409
-
410
- case RecoveryStage.DIAGNOSE:
411
- // Final stage - abort
412
- state.stage = RecoveryStage.ABORT;
413
- state.lastStageChangeTime = now;
414
- return {
415
- success: false,
416
- action: 'abort',
417
- message: `Lane ${laneName} failed after all recovery attempts`,
418
- shouldContinue: false,
419
- nextStage: RecoveryStage.ABORT,
420
- diagnostic: state.diagnosticInfo,
421
- };
422
-
423
- default:
424
- return {
425
- success: false,
426
- action: 'abort',
427
- message: 'Recovery exhausted',
428
- shouldContinue: false,
429
- };
430
- }
431
- }
432
-
433
- /**
434
- * Send a continue signal to the lane
435
- */
436
- private async sendContinueSignal(
437
- laneName: string,
438
- laneRunDir: string,
439
- state: LaneRecoveryState,
440
- idleSeconds: number
441
- ): Promise<RecoveryActionResult> {
442
- const interventionPath = safeJoin(laneRunDir, 'intervention.txt');
443
-
444
- try {
445
- fs.writeFileSync(interventionPath, 'continue');
446
-
447
- state.stage = RecoveryStage.CONTINUE_SIGNAL;
448
- state.lastStageChangeTime = Date.now();
449
- state.continueSignalsSent++;
450
-
451
- // Record failure history
452
- state.failureHistory.push({
453
- timestamp: Date.now(),
454
- stage: RecoveryStage.CONTINUE_SIGNAL,
455
- action: 'continue_signal',
456
- message: `Idle for ${idleSeconds}s`,
457
- idleTimeMs: idleSeconds * 1000,
458
- bytesReceived: state.totalBytesReceived,
459
- lastOutput: state.lastOutput,
460
- });
461
-
462
- const message = `[${laneName}] Idle for ${idleSeconds}s - sent continue signal (#${state.continueSignalsSent})`;
463
- logger.warn(message);
464
-
465
- events.emit('recovery.continue_signal', {
466
- runId: state.runId,
467
- laneName,
468
- idleSeconds,
469
- signalCount: state.continueSignalsSent,
470
- });
471
-
472
- return {
473
- success: true,
474
- action: 'continue_signal',
475
- message,
476
- shouldContinue: true,
477
- nextStage: RecoveryStage.CONTINUE_SIGNAL,
478
- };
479
- } catch (error: any) {
480
- logger.error(`[AutoRecovery] Failed to send continue signal to ${laneName}: ${error.message}`);
481
- return {
482
- success: false,
483
- action: 'continue_signal',
484
- message: `Failed to send continue signal: ${error.message}`,
485
- shouldContinue: true,
486
- };
487
- }
488
- }
489
-
490
- /**
491
- * Send a stronger prompt to nudge the agent
492
- */
493
- private async sendStrongerPrompt(
494
- laneName: string,
495
- laneRunDir: string,
496
- state: LaneRecoveryState
497
- ): Promise<RecoveryActionResult> {
498
- const interventionPath = safeJoin(laneRunDir, 'intervention.txt');
499
-
500
- const strongerPrompt = `[SYSTEM INTERVENTION] You seem to be stuck or waiting.
501
- Please continue with your current task immediately.
502
- If you're waiting for something, explain what you need and proceed with what you can do now.
503
- If you've completed the task, please summarize your work and finish.
504
- If you encountered a git error, resolve it and continue.`;
505
-
506
- try {
507
- fs.writeFileSync(interventionPath, strongerPrompt);
508
-
509
- state.stage = RecoveryStage.STRONGER_PROMPT;
510
- state.lastStageChangeTime = Date.now();
511
-
512
- // Record failure history
513
- state.failureHistory.push({
514
- timestamp: Date.now(),
515
- stage: RecoveryStage.STRONGER_PROMPT,
516
- action: 'stronger_prompt',
517
- message: 'Still idle after continue signal',
518
- idleTimeMs: Date.now() - state.lastActivityTime,
519
- bytesReceived: state.totalBytesReceived,
520
- lastOutput: state.lastOutput,
521
- });
522
-
523
- const message = `[${laneName}] Still idle after continue signal - sent stronger prompt`;
524
- logger.warn(message);
525
-
526
- events.emit('recovery.stronger_prompt', {
527
- runId: state.runId,
528
- laneName,
529
- prompt: strongerPrompt,
530
- });
531
-
532
- return {
533
- success: true,
534
- action: 'stronger_prompt',
535
- message,
536
- shouldContinue: true,
537
- nextStage: RecoveryStage.STRONGER_PROMPT,
538
- };
539
- } catch (error: any) {
540
- logger.error(`[AutoRecovery] Failed to send stronger prompt to ${laneName}: ${error.message}`);
541
- return {
542
- success: false,
543
- action: 'stronger_prompt',
544
- message: `Failed to send stronger prompt: ${error.message}`,
545
- shouldContinue: true,
546
- };
547
- }
548
- }
549
-
550
- /**
551
- * Request process restart
552
- */
553
- private async requestRestart(
554
- laneName: string,
555
- state: LaneRecoveryState,
556
- child?: ChildProcess
557
- ): Promise<RecoveryActionResult> {
558
- state.restartCount++;
559
- state.stage = RecoveryStage.RESTART_PROCESS;
560
- state.lastStageChangeTime = Date.now();
561
-
562
- // Record failure history
563
- state.failureHistory.push({
564
- timestamp: Date.now(),
565
- stage: RecoveryStage.RESTART_PROCESS,
566
- action: 'restart',
567
- message: `Restart attempt ${state.restartCount}/${this.config.maxRestarts}`,
568
- idleTimeMs: Date.now() - state.lastActivityTime,
569
- bytesReceived: state.totalBytesReceived,
570
- lastOutput: state.lastOutput,
571
- });
572
-
573
- // Kill the current process if provided
574
- if (child && child.pid && !child.killed) {
575
- try {
576
- child.kill('SIGKILL');
577
- logger.info(`[AutoRecovery] [${laneName}] Killed process ${child.pid}`);
578
- } catch (error: any) {
579
- logger.warn(`[AutoRecovery] [${laneName}] Failed to kill process: ${error.message}`);
580
- }
581
- }
582
-
583
- const message = `[${laneName}] Restarting lane (attempt ${state.restartCount}/${this.config.maxRestarts})`;
584
- logger.warn(message);
585
-
586
- events.emit('recovery.restart', {
587
- runId: state.runId,
588
- laneName,
589
- restartCount: state.restartCount,
590
- maxRestarts: this.config.maxRestarts,
591
- });
592
-
593
- return {
594
- success: true,
595
- action: 'restart',
596
- message,
597
- shouldContinue: true,
598
- nextStage: RecoveryStage.RESTART_PROCESS,
599
- };
600
- }
601
-
602
- /**
603
- * Run diagnostic checks
604
- */
605
- private async runDiagnosis(
606
- laneName: string,
607
- laneRunDir: string,
608
- state: LaneRecoveryState
609
- ): Promise<RecoveryActionResult> {
610
- if (!this.config.runDoctorOnFailure) {
611
- return {
612
- success: false,
613
- action: 'diagnose',
614
- message: 'Diagnosis skipped (disabled in config)',
615
- shouldContinue: false,
616
- };
617
- }
618
-
619
- logger.info(`[AutoRecovery] [${laneName}] Running diagnostic checks...`);
620
-
621
- try {
622
- // Run health checks
623
- const [agentHealth, authHealth] = await Promise.all([
624
- checkAgentHealth(),
625
- checkAuthHealth(),
626
- ]);
627
-
628
- const systemHealth = await runHealthCheck({ skipRemote: true, skipAuth: true });
629
-
630
- const diagnostic: DiagnosticInfo = {
631
- timestamp: Date.now(),
632
- agentHealthy: agentHealth.ok,
633
- authHealthy: authHealth.ok,
634
- systemHealthy: systemHealth.healthy,
635
- suggestedAction: '',
636
- details: '',
637
- };
638
-
639
- // Analyze and suggest action
640
- const issues: string[] = [];
641
-
642
- if (!agentHealth.ok) {
643
- issues.push(`Agent: ${agentHealth.message}`);
644
- }
645
-
646
- if (!authHealth.ok) {
647
- issues.push(`Auth: ${authHealth.message}`);
648
- diagnostic.suggestedAction = 'Please sign in to Cursor IDE and verify authentication';
649
- }
650
-
651
- if (!systemHealth.healthy) {
652
- const failedChecks = systemHealth.checks.filter(c => !c.ok);
653
- issues.push(`System: ${failedChecks.map(c => c.message).join(', ')}`);
654
- }
655
-
656
- if (issues.length === 0) {
657
- diagnostic.details = 'All health checks passed. The issue may be with the AI model or network.';
658
- diagnostic.suggestedAction = 'Try resuming with a different model or wait and retry.';
659
- } else {
660
- diagnostic.details = issues.join('\n');
661
- }
662
-
663
- state.diagnosticInfo = diagnostic;
664
-
665
- // Record failure history
666
- state.failureHistory.push({
667
- timestamp: Date.now(),
668
- stage: RecoveryStage.DIAGNOSE,
669
- action: 'diagnose',
670
- message: diagnostic.details,
671
- idleTimeMs: Date.now() - state.lastActivityTime,
672
- bytesReceived: state.totalBytesReceived,
673
- lastOutput: state.lastOutput,
674
- });
675
-
676
- // Save diagnostic to file
677
- const diagnosticPath = safeJoin(laneRunDir, 'diagnostic.json');
678
- fs.writeFileSync(diagnosticPath, JSON.stringify(diagnostic, null, 2));
679
-
680
- const message = `[${laneName}] Diagnostic complete:\n${diagnostic.details}\nSuggested action: ${diagnostic.suggestedAction}`;
681
- logger.error(message);
682
-
683
- events.emit('recovery.diagnosed', {
684
- runId: state.runId,
685
- laneName,
686
- diagnostic,
687
- });
688
-
689
- return {
690
- success: true,
691
- action: 'diagnose',
692
- message,
693
- shouldContinue: false,
694
- diagnostic,
695
- };
696
- } catch (error: any) {
697
- logger.error(`[AutoRecovery] Diagnostic failed: ${error.message}`);
698
- return {
699
- success: false,
700
- action: 'diagnose',
701
- message: `Diagnostic failed: ${error.message}`,
702
- shouldContinue: false,
703
- };
704
- }
705
- }
706
-
707
- /**
708
- * Get failure history for a lane
709
- */
710
- getFailureHistory(laneName: string): FailureRecord[] {
711
- const state = this.laneStates.get(laneName);
712
- return state?.failureHistory || [];
713
- }
714
-
715
- /**
716
- * Get configuration
717
- */
718
- getConfig(): AutoRecoveryConfig {
719
- return { ...this.config };
720
- }
721
-
722
- /**
723
- * Update configuration
724
- */
725
- updateConfig(config: Partial<AutoRecoveryConfig>): void {
726
- this.config = { ...this.config, ...config };
727
- }
728
- }
729
-
730
- // ============================================================================
731
- // POF (Post-mortem of Failure) Management
166
+ // Post-Mortem of Failure (POF) Management
732
167
  // ============================================================================
733
168
 
734
169
  /**
@@ -894,24 +329,7 @@ export function listPOFs(pofDir: string): string[] {
894
329
  // Exports
895
330
  // ============================================================================
896
331
 
897
- /** Singleton instance for easy access */
898
- let defaultManager: AutoRecoveryManager | null = null;
332
+ // AutoRecoveryManager class removed. All stall detection and recovery logic
333
+ // has been moved to StallDetectionService in ./stall-detection.ts.
334
+ // Utility functions for POF and git guidance are kept below.
899
335
 
900
- /**
901
- * Get or create the default auto-recovery manager
902
- */
903
- export function getAutoRecoveryManager(config?: Partial<AutoRecoveryConfig>): AutoRecoveryManager {
904
- if (!defaultManager) {
905
- defaultManager = new AutoRecoveryManager(config);
906
- } else if (config) {
907
- defaultManager.updateConfig(config);
908
- }
909
- return defaultManager;
910
- }
911
-
912
- /**
913
- * Reset the default manager (for testing)
914
- */
915
- export function resetAutoRecoveryManager(): void {
916
- defaultManager = null;
917
- }