@litmers/cursorflow-orchestrator 0.1.30 → 0.1.34

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129) hide show
  1. package/README.md +144 -52
  2. package/commands/cursorflow-add.md +159 -0
  3. package/commands/cursorflow-monitor.md +23 -2
  4. package/commands/cursorflow-new.md +87 -0
  5. package/dist/cli/add.d.ts +7 -0
  6. package/dist/cli/add.js +377 -0
  7. package/dist/cli/add.js.map +1 -0
  8. package/dist/cli/clean.js +1 -0
  9. package/dist/cli/clean.js.map +1 -1
  10. package/dist/cli/config.d.ts +7 -0
  11. package/dist/cli/config.js +181 -0
  12. package/dist/cli/config.js.map +1 -0
  13. package/dist/cli/index.js +34 -30
  14. package/dist/cli/index.js.map +1 -1
  15. package/dist/cli/logs.js +7 -33
  16. package/dist/cli/logs.js.map +1 -1
  17. package/dist/cli/monitor.js +51 -62
  18. package/dist/cli/monitor.js.map +1 -1
  19. package/dist/cli/new.d.ts +7 -0
  20. package/dist/cli/new.js +232 -0
  21. package/dist/cli/new.js.map +1 -0
  22. package/dist/cli/prepare.js +95 -193
  23. package/dist/cli/prepare.js.map +1 -1
  24. package/dist/cli/resume.js +11 -47
  25. package/dist/cli/resume.js.map +1 -1
  26. package/dist/cli/run.js +27 -22
  27. package/dist/cli/run.js.map +1 -1
  28. package/dist/cli/tasks.js +1 -2
  29. package/dist/cli/tasks.js.map +1 -1
  30. package/dist/core/failure-policy.d.ts +9 -0
  31. package/dist/core/failure-policy.js +9 -0
  32. package/dist/core/failure-policy.js.map +1 -1
  33. package/dist/core/orchestrator.d.ts +20 -6
  34. package/dist/core/orchestrator.js +217 -331
  35. package/dist/core/orchestrator.js.map +1 -1
  36. package/dist/core/runner/agent.d.ts +27 -0
  37. package/dist/core/runner/agent.js +294 -0
  38. package/dist/core/runner/agent.js.map +1 -0
  39. package/dist/core/runner/index.d.ts +5 -0
  40. package/dist/core/runner/index.js +22 -0
  41. package/dist/core/runner/index.js.map +1 -0
  42. package/dist/core/runner/pipeline.d.ts +9 -0
  43. package/dist/core/runner/pipeline.js +539 -0
  44. package/dist/core/runner/pipeline.js.map +1 -0
  45. package/dist/core/runner/prompt.d.ts +25 -0
  46. package/dist/core/runner/prompt.js +175 -0
  47. package/dist/core/runner/prompt.js.map +1 -0
  48. package/dist/core/runner/task.d.ts +26 -0
  49. package/dist/core/runner/task.js +283 -0
  50. package/dist/core/runner/task.js.map +1 -0
  51. package/dist/core/runner/utils.d.ts +37 -0
  52. package/dist/core/runner/utils.js +161 -0
  53. package/dist/core/runner/utils.js.map +1 -0
  54. package/dist/core/runner.d.ts +2 -96
  55. package/dist/core/runner.js +11 -1136
  56. package/dist/core/runner.js.map +1 -1
  57. package/dist/core/stall-detection.d.ts +326 -0
  58. package/dist/core/stall-detection.js +781 -0
  59. package/dist/core/stall-detection.js.map +1 -0
  60. package/dist/types/config.d.ts +6 -6
  61. package/dist/types/flow.d.ts +84 -0
  62. package/dist/types/flow.js +10 -0
  63. package/dist/types/flow.js.map +1 -0
  64. package/dist/types/index.d.ts +1 -0
  65. package/dist/types/index.js +3 -3
  66. package/dist/types/index.js.map +1 -1
  67. package/dist/types/lane.d.ts +0 -2
  68. package/dist/types/logging.d.ts +5 -1
  69. package/dist/types/task.d.ts +7 -11
  70. package/dist/utils/config.js +7 -15
  71. package/dist/utils/config.js.map +1 -1
  72. package/dist/utils/dependency.d.ts +36 -1
  73. package/dist/utils/dependency.js +256 -1
  74. package/dist/utils/dependency.js.map +1 -1
  75. package/dist/utils/enhanced-logger.d.ts +45 -82
  76. package/dist/utils/enhanced-logger.js +238 -844
  77. package/dist/utils/enhanced-logger.js.map +1 -1
  78. package/dist/utils/git.d.ts +29 -0
  79. package/dist/utils/git.js +115 -5
  80. package/dist/utils/git.js.map +1 -1
  81. package/dist/utils/state.js +0 -2
  82. package/dist/utils/state.js.map +1 -1
  83. package/dist/utils/task-service.d.ts +2 -2
  84. package/dist/utils/task-service.js +40 -31
  85. package/dist/utils/task-service.js.map +1 -1
  86. package/package.json +4 -3
  87. package/src/cli/add.ts +397 -0
  88. package/src/cli/clean.ts +1 -0
  89. package/src/cli/config.ts +177 -0
  90. package/src/cli/index.ts +36 -32
  91. package/src/cli/logs.ts +7 -31
  92. package/src/cli/monitor.ts +55 -71
  93. package/src/cli/new.ts +235 -0
  94. package/src/cli/prepare.ts +98 -205
  95. package/src/cli/resume.ts +13 -56
  96. package/src/cli/run.ts +311 -306
  97. package/src/cli/tasks.ts +1 -2
  98. package/src/core/failure-policy.ts +9 -0
  99. package/src/core/orchestrator.ts +281 -375
  100. package/src/core/runner/agent.ts +314 -0
  101. package/src/core/runner/index.ts +6 -0
  102. package/src/core/runner/pipeline.ts +567 -0
  103. package/src/core/runner/prompt.ts +174 -0
  104. package/src/core/runner/task.ts +320 -0
  105. package/src/core/runner/utils.ts +142 -0
  106. package/src/core/runner.ts +8 -1347
  107. package/src/core/stall-detection.ts +936 -0
  108. package/src/types/config.ts +6 -6
  109. package/src/types/flow.ts +91 -0
  110. package/src/types/index.ts +15 -3
  111. package/src/types/lane.ts +0 -2
  112. package/src/types/logging.ts +5 -1
  113. package/src/types/task.ts +7 -11
  114. package/src/utils/config.ts +8 -16
  115. package/src/utils/dependency.ts +311 -2
  116. package/src/utils/enhanced-logger.ts +263 -927
  117. package/src/utils/git.ts +145 -5
  118. package/src/utils/state.ts +0 -2
  119. package/src/utils/task-service.ts +48 -40
  120. package/commands/cursorflow-review.md +0 -56
  121. package/commands/cursorflow-runs.md +0 -59
  122. package/dist/cli/runs.d.ts +0 -5
  123. package/dist/cli/runs.js +0 -214
  124. package/dist/cli/runs.js.map +0 -1
  125. package/dist/core/reviewer.d.ts +0 -66
  126. package/dist/core/reviewer.js +0 -265
  127. package/dist/core/reviewer.js.map +0 -1
  128. package/src/cli/runs.ts +0 -212
  129. package/src/core/reviewer.ts +0 -285
@@ -0,0 +1,936 @@
1
+ /**
2
+ * Stall Detection Service - 통합된 교착 상태 감지 및 복구 시스템
3
+ *
4
+ * 기존 분산된 로직을 단일 모듈로 통합:
5
+ * - orchestrator.ts의 RunningLaneInfo 상태 관리
6
+ * - failure-policy.ts의 analyzeStall() 분석 로직
7
+ * - auto-recovery.ts의 AutoRecoveryManager 복구 로직
8
+ *
9
+ * 핵심 원칙:
10
+ * 1. 단일 상태 저장소 (Single Source of Truth)
11
+ * 2. 명확한 상태 전이 (State Machine)
12
+ * 3. 실제 활동만 타이머 리셋 (Heartbeat 제외)
13
+ */
14
+
15
+ import * as fs from 'fs';
16
+ import { ChildProcess } from 'child_process';
17
+
18
+ import * as logger from '../utils/logger';
19
+ import { events } from '../utils/events';
20
+ import { safeJoin } from '../utils/path';
21
+
22
+ // ============================================================================
23
+ // 설정 (Configuration)
24
+ // ============================================================================
25
+
26
/**
 * Stall detection settings.
 *
 * Every timeout and output pattern used by the stall detector is defined here.
 */
export interface StallDetectionConfig {
  /** How long a lane may produce no real output before it counts as idle (default: 2 minutes). */
  idleTimeoutMs: number;
  /** How long state.json may go without an update (default: 10 minutes). */
  progressTimeoutMs: number;
  /** Maximum run time of a single task (default: 30 minutes). */
  taskTimeoutMs: number;
  /** Output patterns that mark a long-running operation. */
  longOperationPatterns: RegExp[];
  /** Idle allowance used while a long-running operation is in progress (default: 10 minutes). */
  longOperationGraceMs: number;
  /** Grace period after a continue signal (default: 2 minutes). */
  continueGraceMs: number;
  /** Grace period after a stronger prompt (default: 2 minutes). */
  strongerPromptGraceMs: number;
  /** Maximum number of restarts (default: 2). */
  maxRestarts: number;
  /** Whether to run doctor on failure (default: true). */
  runDoctorOnFailure: boolean;
  /** Enables debug logging. */
  verbose: boolean;
}

/** Default values for every {@link StallDetectionConfig} field. */
export const DEFAULT_STALL_CONFIG: StallDetectionConfig = {
  idleTimeoutMs: 2 * 60 * 1000, // 2 minutes - idle detection
  progressTimeoutMs: 10 * 60 * 1000, // 10 minutes - progress detection
  taskTimeoutMs: 30 * 60 * 1000, // 30 minutes - task timeout
  longOperationPatterns: [
    /installing\s+dependencies/i,
    // Word boundaries keep "npm init" / "npm info" from being read as "npm i".
    /\bnpm\s+(i|install|ci)\b/i,
    /\bpnpm\s+(i|install)\b/i,
    // Matches "yarn install" or a bare trailing "yarn" (which also installs),
    // but not unrelated commands such as "yarn test".
    /\byarn(\s+install\b|\s*$)/i,
    /building/i,
    /compiling/i,
    /bundling/i,
    /downloading/i,
    /fetching/i,
    /cloning/i,
  ],
  longOperationGraceMs: 10 * 60 * 1000, // 10 minutes - long-operation allowance
  continueGraceMs: 2 * 60 * 1000, // 2 minutes - grace after continue
  strongerPromptGraceMs: 2 * 60 * 1000, // 2 minutes - grace after stronger prompt
  maxRestarts: 2,
  runDoctorOnFailure: true,
  verbose: false,
};
77
+
78
+ // ============================================================================
79
+ // 상태 정의 (State Definitions)
80
+ // ============================================================================
81
+
82
+ /**
83
+ * Stall recovery phases (state machine).
84
+ *
85
+ * Transitions:
86
+ * NORMAL → CONTINUE_SENT → STRONGER_PROMPT_SENT → RESTART_REQUESTED → DIAGNOSED → ABORTED
87
+ * ↑__________________________________________________________|
88
+ * (reset when real activity is detected)
89
+ */
90
+ export enum StallPhase {
91
+ /** Operating normally - being monitored */
92
+ NORMAL = 0,
93
+ /** Continue signal has been sent - waiting for a response */
94
+ CONTINUE_SENT = 1,
95
+ /** Stronger prompt has been sent - waiting for a response */
96
+ STRONGER_PROMPT_SENT = 2,
97
+ /** Restart requested - process is being killed/restarted */
98
+ RESTART_REQUESTED = 3,
99
+ /** Diagnosis done - no further recovery possible */
100
+ DIAGNOSED = 4,
101
+ /** Final failure - aborted */
102
+ ABORTED = 5,
103
+ }
104
+
105
+ /**
106
+ * Kinds of recovery action
107
+ */
108
+ export enum RecoveryAction {
109
+ /** No action needed - healthy */
110
+ NONE = 'NONE',
111
+ /** Send a continue signal */
112
+ SEND_CONTINUE = 'SEND_CONTINUE',
113
+ /** Send a stronger prompt */
114
+ SEND_STRONGER_PROMPT = 'SEND_STRONGER_PROMPT',
115
+ /** Request a process restart */
116
+ REQUEST_RESTART = 'REQUEST_RESTART',
117
+ /** Run doctor and diagnose */
118
+ RUN_DOCTOR = 'RUN_DOCTOR',
119
+ /** Abort the lane */
120
+ ABORT_LANE = 'ABORT_LANE',
121
+ }
122
+
123
+ /**
124
+ * Kinds of stall
125
+ */
126
+ export enum StallType {
127
+ /** stdout is idle */
128
+ IDLE = 'IDLE',
129
+ /** state.json is not being updated */
130
+ NO_PROGRESS = 'NO_PROGRESS',
131
+ /** Zero bytes received (agent not responding) */
132
+ ZERO_BYTES = 'ZERO_BYTES',
133
+ /** Task timed out */
134
+ TASK_TIMEOUT = 'TASK_TIMEOUT',
135
+ }
136
+
137
+ /**
138
+ * Per-lane stall state (single source of truth)
139
+ */
140
+ export interface LaneStallState {
141
+ /** Lane name */
142
+ laneName: string;
143
+ /** Current recovery phase */
144
+ phase: StallPhase;
145
+ /** Time of the last real activity (bytes > 0) */
146
+ lastRealActivityTime: number;
147
+ /** Time of the last phase change */
148
+ lastPhaseChangeTime: number;
149
+ /** Time of the last state.json update */
150
+ lastStateUpdateTime: number;
151
+ /** Task start time */
152
+ taskStartTime: number;
153
+ /** Last output line (used to detect long-running operations) */
154
+ lastOutput: string;
155
+ /** Bytes received since the last check */
156
+ bytesSinceLastCheck: number;
157
+ /** Total bytes received */
158
+ totalBytesReceived: number;
159
+ /** Total bytes at the time of the last check (for delta calculation) */
160
+ bytesAtLastCheck: number;
161
+ /** Number of restarts */
162
+ restartCount: number;
163
+ /** Number of continue signals sent */
164
+ continueSignalCount: number;
165
+ /** Whether a long-running operation is in progress */
166
+ isLongOperation: boolean;
167
+ /** Associated ChildProcess (for restarts) */
168
+ childProcess?: ChildProcess;
169
+ /** Lane run directory */
170
+ laneRunDir?: string;
171
+ /** Failure history (for POF) */
172
+ failureHistory: FailureRecord[];
173
+ }
174
+
175
+ /**
176
+ * Result of a stall analysis
177
+ */
178
+ export interface StallAnalysis {
179
+ /** Kind of stall */
180
+ type: StallType;
181
+ /** Recommended recovery action */
182
+ action: RecoveryAction;
183
+ /** Message intended for display to the user */
184
+ message: string;
185
+ /** Whether the problem is transient (recoverable) */
186
+ isTransient: boolean;
187
+ /** Additional information */
188
+ details?: Record<string, any>;
189
+ }
190
+
191
+ /**
192
+ * Failure record (for POF)
193
+ */
194
+ export interface FailureRecord {
195
+ timestamp: number; // Date.now() at the moment the record was created
196
+ phase: StallPhase; // lane phase when the record was created
197
+ action: RecoveryAction; // action chosen by the analysis
198
+ message: string; // message of the analysis
199
+ idleTimeMs: number; // time since the lane's last real activity
200
+ bytesReceived: number; // lane's total received bytes at that moment
201
+ lastOutput: string; // last output line, truncated by the recorder
202
+ }
203
+
204
+ // ============================================================================
205
+ // Stall Detection Service
206
+ // ============================================================================
207
+
208
+ /**
209
+ * 통합 Stall 감지 서비스
210
+ *
211
+ * 사용법:
212
+ * ```typescript
213
+ * const stallService = StallDetectionService.getInstance();
214
+ * stallService.registerLane('lane-1', { laneRunDir: '/path/to/lane' });
215
+ *
216
+ * // 활동 기록
217
+ * stallService.recordActivity('lane-1', bytesReceived, outputLine);
218
+ *
219
+ * // 주기적 체크 (10초마다)
220
+ * const result = stallService.checkAndRecover('lane-1');
221
+ * if (result.action !== RecoveryAction.NONE) {
222
+ * // 복구 액션 처리
223
+ * }
224
+ * ```
225
+ */
226
+ export class StallDetectionService {
227
+ private static instance: StallDetectionService | null = null;
228
+
229
+ private config: StallDetectionConfig;
230
+ private laneStates: Map<string, LaneStallState> = new Map();
231
+
232
/**
 * Builds the service with DEFAULT_STALL_CONFIG overlaid by the given overrides.
 * Private: instances are obtained through getInstance().
 */
private constructor(config: Partial<StallDetectionConfig> = {}) {
  this.config = Object.assign({}, DEFAULT_STALL_CONFIG, config);
}
235
+
236
/**
 * Returns the singleton instance, creating it on first use.
 *
 * @param config Optional overrides. Passed to the constructor when the
 *               instance is created; merged into the existing configuration
 *               via updateConfig() when the instance already exists.
 */
static getInstance(config?: Partial<StallDetectionConfig>): StallDetectionService {
  const existing = StallDetectionService.instance;
  if (existing) {
    if (config) {
      existing.updateConfig(config);
    }
    return existing;
  }

  const created = new StallDetectionService(config);
  StallDetectionService.instance = created;
  return created;
}
247
+
248
/**
 * Drops the singleton so the next getInstance() call builds a fresh one
 * (intended for tests).
 */
static resetInstance(): void {
  StallDetectionService.instance = null;
}
254
+
255
/**
 * Merges the given overrides into the current configuration.
 * A new config object is created; the previous one is not mutated.
 */
updateConfig(config: Partial<StallDetectionConfig>): void {
  this.config = Object.assign({}, this.config, config);
}
261
+
262
/**
 * Returns a shallow copy of the current configuration.
 */
getConfig(): StallDetectionConfig {
  const snapshot: StallDetectionConfig = { ...this.config };
  return snapshot;
}
268
+
269
+ // --------------------------------------------------------------------------
270
+ // Lane 등록/해제
271
+ // --------------------------------------------------------------------------
272
+
273
/**
 * Registers a lane for monitoring.
 *
 * Stores a fresh state for the lane (replacing any existing entry with the
 * same name): phase NORMAL, all timers set to the current time, all counters
 * zero and an empty failure history.
 *
 * @param laneName Name used as the key for the lane.
 * @param options  Optional run directory and child process to attach.
 *                 `startIndex` is accepted but not used by this method.
 */
registerLane(
  laneName: string,
  options: {
    laneRunDir?: string;
    childProcess?: ChildProcess;
    startIndex?: number;
  } = {}
): void {
  const registeredAt = Date.now();

  const initialState: LaneStallState = {
    laneName,
    phase: StallPhase.NORMAL,
    lastRealActivityTime: registeredAt,
    lastPhaseChangeTime: registeredAt,
    lastStateUpdateTime: registeredAt,
    taskStartTime: registeredAt,
    lastOutput: '',
    bytesSinceLastCheck: 0,
    totalBytesReceived: 0,
    bytesAtLastCheck: 0,
    restartCount: 0,
    continueSignalCount: 0,
    isLongOperation: false,
    childProcess: options.childProcess,
    laneRunDir: options.laneRunDir,
    failureHistory: [],
  };
  this.laneStates.set(laneName, initialState);

  if (!this.config.verbose) {
    return;
  }
  logger.debug(`[StallService] Lane registered: ${laneName}`);
}
309
+
310
/**
 * Removes a lane and its state from monitoring.
 * Unknown lane names are ignored (the debug line is still logged when verbose).
 */
unregisterLane(laneName: string): void {
  this.laneStates.delete(laneName);

  if (!this.config.verbose) {
    return;
  }
  logger.debug(`[StallService] Lane unregistered: ${laneName}`);
}
320
+
321
/**
 * Looks up the state of a lane.
 *
 * @returns The stored state object itself (not a copy), or undefined when the
 *          lane is not registered.
 */
getState(laneName: string): LaneStallState | undefined {
  const lane = this.laneStates.get(laneName);
  return lane;
}
327
+
328
/**
 * Attaches a child process to a lane (set after spawning).
 * Does nothing when the lane is not registered.
 */
setChildProcess(laneName: string, child: ChildProcess): void {
  const lane = this.laneStates.get(laneName);
  if (!lane) {
    return;
  }
  lane.childProcess = child;
}
337
+
338
/**
 * Sets the run directory of a lane.
 * Does nothing when the lane is not registered.
 */
setLaneRunDir(laneName: string, dir: string): void {
  const lane = this.laneStates.get(laneName);
  if (!lane) {
    return;
  }
  lane.laneRunDir = dir;
}
347
+
348
+ // --------------------------------------------------------------------------
349
+ // 활동 기록 (Activity Recording)
350
+ // --------------------------------------------------------------------------
351
+
352
/**
 * Records activity for a lane; call it when data arrives on stdout/stderr.
 *
 * Rules:
 * - A non-empty `output` always replaces lastOutput and re-evaluates
 *   isLongOperation against the configured patterns.
 * - bytesReceived > 0 is real activity: lastRealActivityTime and the byte
 *   counters are updated, and a lane in CONTINUE_SENT or
 *   STRONGER_PROMPT_SENT goes back to NORMAL.
 * - Otherwise (heartbeat) no timer is touched.
 *
 * @param laneName      Lane name; unknown lanes are ignored.
 * @param bytesReceived Number of bytes received (0 = heartbeat).
 * @param output        Output line, used to detect long-running operations.
 */
recordActivity(laneName: string, bytesReceived: number, output?: string): void {
  const lane = this.laneStates.get(laneName);
  if (!lane) {
    return;
  }

  const timestamp = Date.now();

  if (output) {
    lane.lastOutput = output;

    let looksLong = false;
    for (const pattern of this.config.longOperationPatterns) {
      if (pattern.test(output)) {
        looksLong = true;
        break;
      }
    }
    lane.isLongOperation = looksLong;
  }

  // Heartbeats (no bytes) must not reset any timer.
  if (!(bytesReceived > 0)) {
    return;
  }

  lane.lastRealActivityTime = timestamp;
  lane.totalBytesReceived += bytesReceived;
  lane.bytesSinceLastCheck += bytesReceived;

  // Only the "waiting for a response" phases are reset by real activity.
  const waitingForResponse =
    lane.phase < StallPhase.RESTART_REQUESTED && lane.phase !== StallPhase.NORMAL;
  if (!waitingForResponse) {
    return;
  }

  if (this.config.verbose) {
    logger.debug(`[StallService] [${laneName}] Real activity detected (${bytesReceived} bytes), resetting to NORMAL`);
  }
  lane.phase = StallPhase.NORMAL;
  lane.lastPhaseChangeTime = timestamp;
}
392
+
393
/**
 * Records that state.json was updated for the lane (resets the progress timer).
 * Does nothing when the lane is not registered.
 */
recordStateUpdate(laneName: string): void {
  const lane = this.laneStates.get(laneName);
  if (!lane) {
    return;
  }
  lane.lastStateUpdateTime = Date.now();
}
402
+
403
/**
 * Records the start of a new task for the lane.
 *
 * The task, activity, state-update and phase-change timestamps are all set
 * to the current time and the phase returns to NORMAL. Does nothing when the
 * lane is not registered.
 */
recordTaskStart(laneName: string): void {
  const lane = this.laneStates.get(laneName);
  if (!lane) {
    return;
  }

  const startedAt = Date.now();
  lane.taskStartTime = startedAt;
  lane.lastRealActivityTime = startedAt;
  lane.lastStateUpdateTime = startedAt;
  // A new task always starts from a clean recovery phase.
  lane.phase = StallPhase.NORMAL;
  lane.lastPhaseChangeTime = startedAt;
}
418
+
419
+ // --------------------------------------------------------------------------
420
+ // Stall 분석 (Analysis)
421
+ // --------------------------------------------------------------------------
422
+
423
/**
 * Analyses a lane and decides which recovery action, if any, is due.
 *
 * This method only reads lane state; applying the action (and advancing the
 * phase) is done by checkAndRecover().
 *
 * Evaluation order:
 *  0. Terminal phase (DIAGNOSED / ABORTED) → ABORT_LANE, nothing else is tried
 *  1. Task timeout → REQUEST_RESTART while restarts remain, else RUN_DOCTOR
 *  2. NORMAL phase + zero bytes since the last check + idle → SEND_CONTINUE
 *  3. Progress timeout → phase-based escalation, but not while the grace
 *     period of a just-sent intervention is still running
 *  4. Phase-specific idle / grace-period checks
 *
 * @param laneName Name of the lane to analyse.
 * @returns The stall classification and recommended action. The action is
 *          NONE when the lane is unknown or nothing is due yet.
 */
analyzeStall(laneName: string): StallAnalysis {
  const state = this.laneStates.get(laneName);
  if (!state) {
    return {
      type: StallType.IDLE,
      action: RecoveryAction.NONE,
      message: 'Lane not found',
      isTransient: false,
    };
  }

  // 0. Terminal phases come first: once a lane is diagnosed or aborted no
  //    timeout may put it back into a restart cycle.
  if (state.phase >= StallPhase.DIAGNOSED) {
    return {
      type: StallType.IDLE,
      action: RecoveryAction.ABORT_LANE,
      message: 'Lane recovery exhausted',
      isTransient: false,
    };
  }

  const now = Date.now();
  const idleTime = now - state.lastRealActivityTime;
  const progressTime = now - state.lastStateUpdateTime;
  const taskTime = now - state.taskStartTime;
  const timeSincePhaseChange = now - state.lastPhaseChangeTime;

  // Bytes received since the previous check
  const bytesDelta = state.totalBytesReceived - state.bytesAtLastCheck;

  // Long-running operations get a longer idle allowance
  const effectiveIdleTimeout = state.isLongOperation
    ? this.config.longOperationGraceMs
    : this.config.idleTimeoutMs;

  const restartsRemain = state.restartCount < this.config.maxRestarts;

  // True while an intervention was sent and its grace period has not elapsed.
  let pendingGraceMs = 0;
  if (state.phase === StallPhase.CONTINUE_SENT) {
    pendingGraceMs = this.config.continueGraceMs;
  } else if (state.phase === StallPhase.STRONGER_PROMPT_SENT) {
    pendingGraceMs = this.config.strongerPromptGraceMs;
  }
  const awaitingIntervention = pendingGraceMs > 0 && timeSincePhaseChange <= pendingGraceMs;

  // 1. Task timeout (hard limit, highest priority)
  if (taskTime > this.config.taskTimeoutMs) {
    return {
      type: StallType.TASK_TIMEOUT,
      action: restartsRemain ? RecoveryAction.REQUEST_RESTART : RecoveryAction.RUN_DOCTOR,
      message: `Task exceeded maximum timeout of ${Math.round(this.config.taskTimeoutMs / 60000)} minutes`,
      isTransient: restartsRemain,
      details: { taskTimeMs: taskTime, restartCount: state.restartCount },
    };
  }

  // 2. Zero bytes + idle (agent not responding). Only the first step of the
  //    escalation is decided here; later phases are handled by the phase
  //    switch below so grace periods and maxRestarts are honoured.
  if (state.phase === StallPhase.NORMAL && bytesDelta === 0 && idleTime > effectiveIdleTimeout) {
    return {
      type: StallType.ZERO_BYTES,
      action: RecoveryAction.SEND_CONTINUE,
      message: `Agent produced 0 bytes for ${Math.round(idleTime / 1000)}s - possible API issue`,
      isTransient: true,
      details: { idleTimeMs: idleTime, bytesDelta, phase: state.phase },
    };
  }

  // 3. Progress timeout
  if (!awaitingIntervention && progressTime > this.config.progressTimeoutMs) {
    return this.getEscalatedAction(state, StallType.NO_PROGRESS, progressTime);
  }

  // 4. Phase-specific checks
  switch (state.phase) {
    case StallPhase.NORMAL:
      if (idleTime > effectiveIdleTimeout) {
        return {
          type: StallType.IDLE,
          action: RecoveryAction.SEND_CONTINUE,
          message: `Lane idle for ${Math.round(idleTime / 1000)}s. Sending continue signal...`,
          isTransient: true,
          details: { idleTimeMs: idleTime, isLongOperation: state.isLongOperation },
        };
      }
      break;

    case StallPhase.CONTINUE_SENT:
      // Grace period after the continue signal has elapsed
      if (timeSincePhaseChange > this.config.continueGraceMs) {
        return {
          type: StallType.IDLE,
          action: RecoveryAction.SEND_STRONGER_PROMPT,
          message: `Still idle after continue signal. Sending stronger prompt...`,
          isTransient: true,
          details: { timeSincePhaseChange, continueSignalCount: state.continueSignalCount },
        };
      }
      break;

    case StallPhase.STRONGER_PROMPT_SENT:
      // Grace period after the stronger prompt has elapsed
      if (timeSincePhaseChange > this.config.strongerPromptGraceMs) {
        if (restartsRemain) {
          return {
            type: StallType.IDLE,
            action: RecoveryAction.REQUEST_RESTART,
            message: `No response after stronger prompt. Killing and restarting process...`,
            isTransient: true,
            details: { restartCount: state.restartCount, maxRestarts: this.config.maxRestarts },
          };
        }
        return {
          type: StallType.IDLE,
          action: RecoveryAction.RUN_DOCTOR,
          message: `Lane failed after ${state.restartCount} restarts. Running diagnostics...`,
          isTransient: false,
          details: { restartCount: state.restartCount },
        };
      }
      break;

    case StallPhase.RESTART_REQUESTED: {
      // After a restart, idleness is detected sooner: 75% of the idle timeout
      const postRestartTimeout = effectiveIdleTimeout * 0.75;
      if (idleTime > postRestartTimeout) {
        if (restartsRemain) {
          return {
            type: StallType.IDLE,
            action: RecoveryAction.SEND_CONTINUE,
            message: `Lane idle after restart. Retrying continue signal...`,
            isTransient: true,
            details: { idleTimeMs: idleTime, restartCount: state.restartCount },
          };
        }
        return {
          type: StallType.IDLE,
          action: RecoveryAction.RUN_DOCTOR,
          message: `Lane repeatedly stalled. Running diagnostics...`,
          isTransient: false,
          details: { restartCount: state.restartCount },
        };
      }
      break;
    }
  }

  // Nothing to do yet
  return {
    type: StallType.IDLE,
    action: RecoveryAction.NONE,
    message: 'Monitoring',
    isTransient: true,
  };
}
582
+
583
/**
 * Picks the escalation step for a progress timeout from the lane's phase.
 *
 * NORMAL → SEND_CONTINUE, CONTINUE_SENT → SEND_STRONGER_PROMPT, any other
 * phase → REQUEST_RESTART while restarts remain, otherwise RUN_DOCTOR.
 *
 * @param state        Lane state (read only).
 * @param type         Stall type to report in the result.
 * @param progressTime Milliseconds since the last state update.
 */
private getEscalatedAction(state: LaneStallState, type: StallType, progressTime: number): StallAnalysis {
  if (state.phase === StallPhase.NORMAL) {
    return {
      type,
      action: RecoveryAction.SEND_CONTINUE,
      message: `No progress for ${Math.round(progressTime / 60000)} minutes. Sending continue signal...`,
      isTransient: true,
      details: { progressTimeMs: progressTime },
    };
  }

  if (state.phase === StallPhase.CONTINUE_SENT) {
    return {
      type,
      action: RecoveryAction.SEND_STRONGER_PROMPT,
      message: `Still no progress. Sending stronger prompt...`,
      isTransient: true,
      details: { progressTimeMs: progressTime },
    };
  }

  const canRestart = state.restartCount < this.config.maxRestarts;
  if (!canRestart) {
    return {
      type,
      action: RecoveryAction.RUN_DOCTOR,
      message: `Persistent no-progress state. Running diagnostics...`,
      isTransient: false,
      details: { progressTimeMs: progressTime, restartCount: state.restartCount },
    };
  }

  return {
    type,
    action: RecoveryAction.REQUEST_RESTART,
    message: `No progress after interventions. Restarting...`,
    isTransient: true,
    details: { progressTimeMs: progressTime, restartCount: state.restartCount },
  };
}
626
+
627
+ // --------------------------------------------------------------------------
628
+ // 복구 액션 실행 (Recovery Actions)
629
+ // --------------------------------------------------------------------------
630
+
631
/**
 * Runs a stall check for the lane and carries out the recommended action.
 *
 * Steps: analyse the lane, advance the byte baseline for the next check,
 * and, unless the action is NONE, log the analysis, append it to the
 * lane's failure history and execute the action.
 *
 * @param laneName Name of the lane to check.
 * @returns The analysis that was acted on, so the caller can do any
 *          additional handling. For an unknown lane the action is NONE.
 */
checkAndRecover(laneName: string): StallAnalysis {
  const state = this.laneStates.get(laneName);
  if (!state) {
    return {
      type: StallType.IDLE,
      action: RecoveryAction.NONE,
      message: 'Lane not found',
      isTransient: false,
    };
  }

  // Analyse BEFORE moving the byte baseline: analyzeStall() derives its
  // "bytes since the last check" delta from bytesAtLastCheck, so resetting
  // the baseline first would make that delta always zero.
  const analysis = this.analyzeStall(laneName);

  // Advance the baseline for the next check
  state.bytesAtLastCheck = state.totalBytesReceived;
  state.bytesSinceLastCheck = 0;

  if (analysis.action === RecoveryAction.NONE) {
    return analysis;
  }

  // Log the decision
  this.logAnalysis(laneName, analysis);

  // Keep a record of it
  this.recordFailure(state, analysis);

  // Execute the action
  switch (analysis.action) {
    case RecoveryAction.SEND_CONTINUE:
      this.sendContinueSignal(state);
      break;

    case RecoveryAction.SEND_STRONGER_PROMPT:
      this.sendStrongerPrompt(state);
      break;

    case RecoveryAction.REQUEST_RESTART:
      this.requestRestart(state);
      break;

    case RecoveryAction.RUN_DOCTOR:
      this.markForDiagnosis(state);
      break;

    case RecoveryAction.ABORT_LANE:
      this.markAsAborted(state);
      break;
  }

  return analysis;
}
688
+
689
/**
 * Sends a continue signal by writing 'continue' to intervention.txt in the
 * lane's run directory.
 *
 * On success the lane moves to CONTINUE_SENT, the signal counter is bumped
 * and a 'recovery.continue_signal' event is emitted. When the run directory
 * is unset or the write fails, an error is logged and the lane is unchanged.
 */
private sendContinueSignal(state: LaneStallState): void {
  const runDir = state.laneRunDir;
  if (!runDir) {
    logger.error(`[StallService] [${state.laneName}] Cannot send continue signal: laneRunDir not set`);
    return;
  }

  const signalFile = safeJoin(runDir, 'intervention.txt');

  try {
    fs.writeFileSync(signalFile, 'continue');

    state.phase = StallPhase.CONTINUE_SENT;
    state.lastPhaseChangeTime = Date.now();
    state.continueSignalCount += 1;

    logger.info(`[${state.laneName}] Sent continue signal (#${state.continueSignalCount})`);

    const idleMs = Date.now() - state.lastRealActivityTime;
    events.emit('recovery.continue_signal', {
      laneName: state.laneName,
      idleSeconds: Math.round(idleMs / 1000),
      signalCount: state.continueSignalCount,
    });
  } catch (error: any) {
    logger.error(`[StallService] [${state.laneName}] Failed to send continue signal: ${error.message}`);
  }
}
718
+
719
/**
 * Sends a stronger prompt by writing an intervention message to
 * intervention.txt in the lane's run directory.
 *
 * On success the lane moves to STRONGER_PROMPT_SENT and a
 * 'recovery.stronger_prompt' event is emitted. When the run directory is
 * unset or the write fails, an error is logged and the lane is unchanged.
 */
private sendStrongerPrompt(state: LaneStallState): void {
  const runDir = state.laneRunDir;
  if (!runDir) {
    logger.error(`[StallService] [${state.laneName}] Cannot send stronger prompt: laneRunDir not set`);
    return;
  }

  const targetFile = safeJoin(runDir, 'intervention.txt');
  const interventionText = [
    '[SYSTEM INTERVENTION] You seem to be stuck or waiting.',
    'Please continue with your current task immediately.',
    "If you're waiting for something, explain what you need and proceed with what you can do now.",
    "If you've completed the task, please summarize your work and finish.",
    'If you encountered a git error, resolve it and continue.',
  ].join('\n');

  try {
    fs.writeFileSync(targetFile, interventionText);

    state.phase = StallPhase.STRONGER_PROMPT_SENT;
    state.lastPhaseChangeTime = Date.now();

    logger.warn(`[${state.laneName}] Sent stronger prompt after continue signal failed`);

    events.emit('recovery.stronger_prompt', { laneName: state.laneName });
  } catch (error: any) {
    logger.error(`[StallService] [${state.laneName}] Failed to send stronger prompt: ${error.message}`);
  }
}
750
+
751
/**
 * Requests a restart: bumps the restart counter, moves the lane to
 * RESTART_REQUESTED, sends SIGKILL to the attached child process (when there
 * is one that has not already been signalled) and emits 'recovery.restart'.
 * A failure to kill the process is logged as a warning and otherwise ignored.
 */
private requestRestart(state: LaneStallState): void {
  state.restartCount += 1;
  state.phase = StallPhase.RESTART_REQUESTED;
  state.lastPhaseChangeTime = Date.now();

  // Terminate the current process, if any
  const proc = state.childProcess;
  if (proc && !proc.killed) {
    try {
      proc.kill('SIGKILL');
      logger.info(`[StallService] [${state.laneName}] Killed process ${proc.pid}`);
    } catch (error: any) {
      logger.warn(`[StallService] [${state.laneName}] Failed to kill process: ${error.message}`);
    }
  }

  logger.warn(`[${state.laneName}] Killing and restarting lane (restart #${state.restartCount})`);

  const payload = {
    laneName: state.laneName,
    restartCount: state.restartCount,
    maxRestarts: this.config.maxRestarts,
  };
  events.emit('recovery.restart', payload);
}
777
+
778
/**
 * Marks the lane as DIAGNOSED, logs an error and emits 'recovery.diagnosed'
 * with the lane name and its restart count.
 */
private markForDiagnosis(state: LaneStallState): void {
  const { laneName } = state;

  state.phase = StallPhase.DIAGNOSED;
  state.lastPhaseChangeTime = Date.now();

  logger.error(`[${laneName}] Running diagnostics due to persistent failures...`);

  events.emit('recovery.diagnosed', { laneName, restartCount: state.restartCount });
}
792
+
793
/**
 * Marks the lane as ABORTED, logs an error and emits 'recovery.aborted'.
 */
private markAsAborted(state: LaneStallState): void {
  const { laneName } = state;

  state.phase = StallPhase.ABORTED;
  state.lastPhaseChangeTime = Date.now();

  logger.error(`[${laneName}] Lane aborted after recovery exhausted`);

  events.emit('recovery.aborted', { laneName });
}
806
+
807
/**
 * Appends an entry describing the analysis to the lane's failure history.
 * The phase stored is the lane's phase at the time of the call, and the last
 * output is truncated to 200 characters.
 */
private recordFailure(state: LaneStallState, analysis: StallAnalysis): void {
  const { action, message } = analysis;

  const entry: FailureRecord = {
    timestamp: Date.now(),
    phase: state.phase,
    action,
    message,
    idleTimeMs: Date.now() - state.lastRealActivityTime,
    bytesReceived: state.totalBytesReceived,
    lastOutput: state.lastOutput.substring(0, 200),
  };
  state.failureHistory.push(entry);
}
821
+
822
/**
 * Logs an analysis result: as a warning when it is transient, as an error
 * otherwise. In verbose mode the details object, if present, is logged too.
 */
private logAnalysis(laneName: string, analysis: StallAnalysis): void {
  let suffix = '';
  if (analysis.action !== RecoveryAction.NONE) {
    suffix = ` -> Action: ${analysis.action}`;
  }
  const line = `[${laneName}] ${analysis.type}: ${analysis.message}${suffix}`;

  if (!analysis.isTransient) {
    logger.error(line);
  } else {
    logger.warn(line);
  }

  if (!this.config.verbose || !analysis.details) {
    return;
  }
  logger.debug(`[StallService] Details: ${JSON.stringify(analysis.details)}`);
}
839
+
840
+ // --------------------------------------------------------------------------
841
+ // 유틸리티
842
+ // --------------------------------------------------------------------------
843
+
844
/**
 * Returns the failure history of a lane (the stored array itself), or an
 * empty array when the lane is not registered.
 */
getFailureHistory(laneName: string): FailureRecord[] {
  const lane = this.laneStates.get(laneName);
  return lane ? lane.failureHistory : [];
}
850
+
851
/**
 * Returns how many times the lane has been restarted; 0 for an unknown lane.
 */
getRestartCount(laneName: string): number {
  const lane = this.laneStates.get(laneName);
  return lane ? lane.restartCount || 0 : 0;
}
857
+
858
/**
 * Returns the lane's current phase; NORMAL for an unknown lane.
 */
getPhase(laneName: string): StallPhase {
  const lane = this.laneStates.get(laneName);
  if (lane === undefined) {
    return StallPhase.NORMAL;
  }
  return lane.phase ?? StallPhase.NORMAL;
}
864
+
865
/**
 * Resets the lane's timers after a restart (to be called when the new
 * process is started).
 *
 * The phase is intentionally left as it is (RESTART_REQUESTED) so the
 * restarted process keeps being monitored with the post-restart rules.
 * Every timer that can trigger a recovery action is restarted: leaving the
 * progress or task timestamps stale would make the very next check see the
 * same progress/task timeout that caused the restart and request another
 * one immediately.
 *
 * Does nothing when the lane is not registered.
 */
resetAfterRestart(laneName: string): void {
  const state = this.laneStates.get(laneName);
  if (!state) {
    return;
  }

  const now = Date.now();
  state.lastRealActivityTime = now;
  state.lastPhaseChangeTime = now;
  // The new process gets a fresh progress window and a fresh task budget.
  state.lastStateUpdateTime = now;
  state.taskStartTime = now;
  state.bytesSinceLastCheck = 0;
  state.bytesAtLastCheck = state.totalBytesReceived;
}
879
+
880
/**
 * Tells whether the lane is in the RESTART_REQUESTED phase, i.e. whether its
 * startIndex needs updating after a restart. False for an unknown lane.
 */
needsStartIndexUpdate(laneName: string): boolean {
  const lane = this.laneStates.get(laneName);
  if (!lane) {
    return false;
  }
  return lane.phase === StallPhase.RESTART_REQUESTED;
}
887
+
888
/**
 * Tells whether the lane's phase is DIAGNOSED or later (no further recovery).
 * An unknown lane counts as NORMAL and is therefore recoverable.
 */
isUnrecoverable(laneName: string): boolean {
  return this.getPhase(laneName) >= StallPhase.DIAGNOSED;
}
895
+
896
/**
 * Produces a pretty-printed JSON summary of a lane for debugging: phase name,
 * elapsed idle/progress/task times, counters and the first 100 characters of
 * the last output. Returns a "not found" message for an unknown lane.
 */
dumpState(laneName: string): string {
  const lane = this.laneStates.get(laneName);
  if (!lane) {
    return `Lane ${laneName} not found`;
  }

  const now = Date.now();
  const summary = {
    laneName: lane.laneName,
    phase: StallPhase[lane.phase],
    idleTimeMs: now - lane.lastRealActivityTime,
    progressTimeMs: now - lane.lastStateUpdateTime,
    taskTimeMs: now - lane.taskStartTime,
    totalBytesReceived: lane.totalBytesReceived,
    restartCount: lane.restartCount,
    continueSignalCount: lane.continueSignalCount,
    isLongOperation: lane.isLongOperation,
    lastOutput: lane.lastOutput.substring(0, 100),
  };
  return JSON.stringify(summary, null, 2);
}
917
+ }
918
+
919
+ // ============================================================================
920
+ // 편의 함수 (Convenience Functions)
921
+ // ============================================================================
922
+
923
/**
 * Shortcut for StallDetectionService.getInstance().
 *
 * @param config Optional configuration overrides, forwarded unchanged.
 */
export function getStallService(config?: Partial<StallDetectionConfig>): StallDetectionService {
  const service = StallDetectionService.getInstance(config);
  return service;
}
929
+
930
/**
 * Shortcut for StallDetectionService.resetInstance() (intended for tests).
 */
export function resetStallService(): void {
  return StallDetectionService.resetInstance();
}
936
+