@litmers/cursorflow-orchestrator 0.2.2 → 0.2.3
- package/README.md +1 -0
- package/dist/cli/index.js +0 -6
- package/dist/cli/index.js.map +1 -1
- package/dist/cli/monitor.js +18 -2
- package/dist/cli/monitor.js.map +1 -1
- package/dist/cli/signal.js +33 -29
- package/dist/cli/signal.js.map +1 -1
- package/dist/core/auto-recovery.d.ts +2 -117
- package/dist/core/auto-recovery.js +4 -487
- package/dist/core/auto-recovery.js.map +1 -1
- package/dist/core/failure-policy.d.ts +0 -52
- package/dist/core/failure-policy.js +7 -174
- package/dist/core/failure-policy.js.map +1 -1
- package/dist/core/intervention.d.ts +0 -6
- package/dist/core/intervention.js +1 -17
- package/dist/core/intervention.js.map +1 -1
- package/dist/core/orchestrator.js +10 -3
- package/dist/core/orchestrator.js.map +1 -1
- package/dist/core/runner/agent.js +18 -15
- package/dist/core/runner/agent.js.map +1 -1
- package/dist/core/stall-detection.js +9 -7
- package/dist/core/stall-detection.js.map +1 -1
- package/package.json +2 -13
- package/src/cli/index.ts +0 -6
- package/src/cli/monitor.ts +18 -2
- package/src/cli/signal.ts +38 -34
- package/src/core/auto-recovery.ts +13 -595
- package/src/core/failure-policy.ts +7 -228
- package/src/core/intervention.ts +0 -18
- package/src/core/orchestrator.ts +13 -3
- package/src/core/runner/agent.ts +21 -16
- package/src/core/stall-detection.ts +11 -9
- package/dist/cli/prepare.d.ts +0 -7
- package/dist/cli/prepare.js +0 -690
- package/dist/cli/prepare.js.map +0 -1
- package/src/cli/prepare.ts +0 -777
package/src/core/failure-policy.ts
CHANGED

```diff
@@ -59,60 +59,6 @@ export interface FailureAnalysis {
 /**
  * Multi-layer stall detection configuration
  */
-export interface StallDetectionConfig {
-  /** Time without stdout activity before sending continue signal */
-  idleTimeoutMs: number;
-  /** Time without state file update before considering stalled */
-  progressTimeoutMs: number;
-  /** Maximum time for a single task */
-  taskTimeoutMs: number;
-  /** Grace period for known long operations (e.g., npm install) */
-  longOperationGraceMs: number;
-  /** Patterns that indicate long operations */
-  longOperationPatterns: RegExp[];
-  /** Maximum restarts before aborting */
-  maxRestarts: number;
-}
-
-export const DEFAULT_STALL_CONFIG: StallDetectionConfig = {
-  idleTimeoutMs: 2 * 60 * 1000, // 2 minutes without output (idle detection)
-  progressTimeoutMs: 10 * 60 * 1000, // 10 minutes without progress
-  taskTimeoutMs: 30 * 60 * 1000, // 30 minutes max per task
-  longOperationGraceMs: 10 * 60 * 1000, // 10 minute grace for long ops
-  longOperationPatterns: [
-    /Installing dependencies/i,
-    /npm install/i,
-    /pnpm install/i,
-    /yarn install/i,
-    /Building/i,
-    /Compiling/i,
-    /Downloading/i,
-    /Fetching/i,
-    /Cloning/i,
-    /Bundling/i,
-  ],
-  maxRestarts: 2,
-};
-
-export interface StallContext {
-  /** Current stall phase (0: normal, 1: continued, 2: stronger_prompt, 3: restarted) */
-  stallPhase: number;
-  /** Time since last activity */
-  idleTimeMs: number;
-  /** Time since last state update */
-  progressTimeMs?: number;
-  /** Last output line (for long operation detection) */
-  lastOutput?: string;
-  /** Number of restarts */
-  restartCount?: number;
-  /** Task start time */
-  taskStartTimeMs?: number;
-  /** Bytes received since last check (0 = no response at all) */
-  bytesReceived?: number;
-  /** Number of continue signals already sent */
-  continueSignalsSent?: number;
-}
-
 export interface FailureContext {
   exitCode?: number;
   stallPhase?: number;
```
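This hunk deletes the public stall-tuning surface from failure-policy.ts: `StallDetectionConfig`, `DEFAULT_STALL_CONFIG`, and `StallContext` are no longer exported. For reference, a self-contained sketch of the knobs that disappear; the interface body mirrors the deleted declaration, while the `ciConfig` override is purely illustrative and not package API:

```ts
// Sketch only: the 0.2.2 tuning surface removed in 0.2.3.
interface StallDetectionConfig {
  idleTimeoutMs: number;          // no stdout -> send continue signal
  progressTimeoutMs: number;      // no state-file update -> stalled
  taskTimeoutMs: number;          // hard cap per task
  longOperationGraceMs: number;   // extra grace for npm install etc.
  longOperationPatterns: RegExp[];
  maxRestarts: number;            // restarts before aborting
}

// Values copied from the deleted DEFAULT_STALL_CONFIG (pattern list abridged).
const DEFAULTS: StallDetectionConfig = {
  idleTimeoutMs: 2 * 60 * 1000,
  progressTimeoutMs: 10 * 60 * 1000,
  taskTimeoutMs: 30 * 60 * 1000,
  longOperationGraceMs: 10 * 60 * 1000,
  longOperationPatterns: [/npm install/i, /Building/i, /Downloading/i],
  maxRestarts: 2,
};

// Hypothetical override for slow CI runners: bigger task budget, one extra restart.
const ciConfig: StallDetectionConfig = {
  ...DEFAULTS,
  taskTimeoutMs: 60 * 60 * 1000,
  maxRestarts: 3,
};
```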
```diff
@@ -125,167 +71,6 @@ export interface FailureContext {
   circuitBreakerName?: string;
 }
 
-/**
- * Analyze stall condition with multi-layer detection and escalating recovery
- *
- * @deprecated Use StallDetectionService from './stall-detection' instead.
- * This function is kept for backward compatibility but will be removed in a future version.
- *
- * The new unified StallDetectionService provides:
- * - Single source of truth for stall state
- * - Automatic recovery action execution
- * - Better heartbeat filtering
- * - Consistent state management
- *
- * Recovery escalation stages:
- * 1. Phase 0 → Phase 1: Send continue signal (after 2 min idle)
- * 2. Phase 1 → Phase 2: Send stronger prompt (after 2 min grace)
- * 3. Phase 2 → Phase 3: Kill and restart process (after 2 min grace)
- * 4. Phase 3+: Abort after max restarts exceeded
- */
-export function analyzeStall(context: StallContext, config: StallDetectionConfig = DEFAULT_STALL_CONFIG): FailureAnalysis {
-  const {
-    stallPhase,
-    idleTimeMs,
-    progressTimeMs,
-    lastOutput,
-    restartCount = 0,
-    taskStartTimeMs,
-    bytesReceived = -1, // -1 means not tracked
-    continueSignalsSent = 0,
-  } = context;
-
-  // Check if this might be a long operation
-  const isLongOperation = lastOutput && config.longOperationPatterns.some(p => p.test(lastOutput));
-
-  // If it's a long operation but we've received 0 real bytes for a while,
-  // reduce the grace period to avoid waiting forever for a hung process.
-  // We use 2x the normal idle timeout as a "sanity check" for silent long operations.
-  const silentLongOpCappedTimeout = config.idleTimeoutMs * 2;
-  const effectiveIdleTimeout = isLongOperation
-    ? (bytesReceived === 0 ? Math.min(config.longOperationGraceMs, silentLongOpCappedTimeout) : config.longOperationGraceMs)
-    : config.idleTimeoutMs;
-
-  // Check for task timeout
-  if (taskStartTimeMs && (Date.now() - taskStartTimeMs) > config.taskTimeoutMs) {
-    return {
-      type: FailureType.AGENT_TIMEOUT,
-      action: restartCount < config.maxRestarts ? RecoveryAction.KILL_AND_RESTART : RecoveryAction.RUN_DOCTOR,
-      message: `Task exceeded maximum timeout of ${Math.round(config.taskTimeoutMs / 60000)} minutes`,
-      isTransient: restartCount < config.maxRestarts,
-      details: { taskDurationMs: Date.now() - taskStartTimeMs, restartCount },
-    };
-  }
-
-  // Check for zero bytes received (agent completely unresponsive)
-  if (bytesReceived === 0 && idleTimeMs > effectiveIdleTimeout) {
-    return {
-      type: FailureType.AGENT_NO_RESPONSE,
-      action: stallPhase < 2 ? RecoveryAction.CONTINUE_SIGNAL : RecoveryAction.KILL_AND_RESTART,
-      message: `Agent produced 0 bytes for ${Math.round(idleTimeMs / 1000)}s - possible API issue`,
-      isTransient: true,
-      details: { idleTimeMs, bytesReceived, stallPhase },
-    };
-  }
-
-  // Check for no progress (state file not updating)
-  if (progressTimeMs && progressTimeMs > config.progressTimeoutMs) {
-    return {
-      type: FailureType.STALL_NO_PROGRESS,
-      action: stallPhase === 0 ? RecoveryAction.CONTINUE_SIGNAL :
-              stallPhase === 1 ? RecoveryAction.STRONGER_PROMPT :
-              RecoveryAction.KILL_AND_RESTART,
-      message: `No progress for ${Math.round(progressTimeMs / 60000)} minutes`,
-      isTransient: true,
-      details: { progressTimeMs, stallPhase },
-    };
-  }
-
-  // Phase 0: Normal operation, check for initial idle
-  if (stallPhase === 0 && idleTimeMs > effectiveIdleTimeout) {
-    return {
-      type: FailureType.STALL_IDLE,
-      action: RecoveryAction.CONTINUE_SIGNAL,
-      message: `Lane idle for ${Math.round(idleTimeMs / 1000)}s. Sending continue signal...`,
-      isTransient: true,
-      details: { idleTimeMs, isLongOperation, phase: 0 },
-    };
-  }
-
-  // Phase 1: Continue signal sent, wait for response
-  if (stallPhase === 1) {
-    const graceTimeout = 2 * 60 * 1000; // 2 minutes grace after continue
-
-    if (idleTimeMs > graceTimeout) {
-      return {
-        type: FailureType.STALL_IDLE,
-        action: RecoveryAction.STRONGER_PROMPT,
-        message: `Still idle after continue signal. Sending stronger prompt...`,
-        isTransient: true,
-        details: { idleTimeMs, continueSignalsSent, phase: 1 },
-      };
-    }
-  }
-
-  // Phase 2: Stronger prompt sent, wait or escalate
-  if (stallPhase === 2) {
-    const strongerGraceTimeout = 2 * 60 * 1000; // 2 minutes grace after stronger prompt
-
-    if (idleTimeMs > strongerGraceTimeout) {
-      if (restartCount < config.maxRestarts) {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.KILL_AND_RESTART,
-          message: `No response after stronger prompt. Killing and restarting process...`,
-          isTransient: true,
-          details: { idleTimeMs, restartCount, maxRestarts: config.maxRestarts, phase: 2 },
-        };
-      } else {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.RUN_DOCTOR,
-          message: `Lane failed after ${restartCount} restarts. Running diagnostics...`,
-          isTransient: false,
-          details: { restartCount, phase: 2 },
-        };
-      }
-    }
-  }
-
-  // Phase 3+: After restart, monitor with shorter timeout
-  if (stallPhase >= 3) {
-    const postRestartTimeout = config.idleTimeoutMs * 0.75; // Shorter timeout after restart
-
-    if (idleTimeMs > postRestartTimeout) {
-      if (restartCount < config.maxRestarts) {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.CONTINUE_SIGNAL,
-          message: `Lane idle after restart. Retrying continue signal...`,
-          isTransient: true,
-          details: { idleTimeMs, restartCount, phase: stallPhase },
-        };
-      } else {
-        return {
-          type: FailureType.STALL_IDLE,
-          action: RecoveryAction.RUN_DOCTOR,
-          message: `Lane repeatedly stalled. Running diagnostics for root cause...`,
-          isTransient: false,
-          details: { stallPhase, restartCount },
-        };
-      }
-    }
-  }
-
-  // No action needed yet
-  return {
-    type: FailureType.STALL_IDLE,
-    action: RecoveryAction.NONE,
-    message: 'Monitoring for stall',
-    isTransient: true,
-  };
-}
-
 /**
  * Analyze an error message or state to determine the failure type and recovery action
  */
```
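The deleted `analyzeStall` and its JSDoc spell out the escalation ladder that now lives inside `StallDetectionService`. A compact sketch of that ladder, reduced from the deleted function body; the `Action` type and `nextAction` helper are illustrative names, not exports of this package:

```ts
// The ladder from the deleted analyzeStall(), reduced to its core:
// phase 0 -> continue signal, 1 -> stronger prompt, 2 -> kill & restart,
// 3+ -> retry continue, or run diagnostics once restarts are exhausted.
type Action = 'none' | 'continue_signal' | 'stronger_prompt' | 'kill_and_restart' | 'run_doctor';

function nextAction(phase: number, idleMs: number, restarts: number, maxRestarts = 2): Action {
  if (idleMs <= 2 * 60 * 1000) return 'none'; // 2 min idle/grace at every stage
  if (phase === 0) return 'continue_signal';
  if (phase === 1) return 'stronger_prompt';
  if (phase === 2) return restarts < maxRestarts ? 'kill_and_restart' : 'run_doctor';
  return restarts < maxRestarts ? 'continue_signal' : 'run_doctor';
}
```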
```diff
@@ -310,11 +95,12 @@ export function analyzeFailure(error: string | null | undefined, context?: Failu
   // 1. Network errors
   if (msg.includes('econnreset') || msg.includes('econnrefused') ||
       msg.includes('etimedout') || msg.includes('enotfound') ||
-      msg.includes('socket hang up') || msg.includes('network')) {
+      msg.includes('socket hang up') || msg.includes('network') ||
+      msg.includes('canceled') || msg.includes('http/2') || msg.includes('stream closed')) {
     return {
       type: FailureType.NETWORK_ERROR,
       action: (context?.retryCount || 0) < 3 ? RecoveryAction.RETRY_TASK : RecoveryAction.RESTART_LANE,
-      message: 'Network error. Retrying...',
+      message: 'Network error or connection lost. Retrying...',
       isTransient: true,
       suggestedDelayMs: 5000 * Math.pow(2, context?.retryCount || 0),
     };
```
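The unchanged `suggestedDelayMs` line is plain exponential backoff, doubling a 5 s base per retry. A quick worked check of the schedule it yields:

```ts
// suggestedDelayMs = 5000 * 2^retryCount: 5 s, 10 s, 20 s.
// Once retryCount reaches 3, the action escalates from RETRY_TASK to RESTART_LANE.
for (let retryCount = 0; retryCount < 3; retryCount++) {
  console.log(`retry ${retryCount}: wait ${5000 * Math.pow(2, retryCount)} ms`);
}
// retry 0: wait 5000 ms
// retry 1: wait 10000 ms
// retry 2: wait 20000 ms
```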
```diff
@@ -425,17 +211,10 @@ export function analyzeFailure(error: string | null | undefined, context?: Failu
     };
   }
 
-  // 10. Stalls
-  if (context) {
-    return analyzeStall({
-      stallPhase: context.stallPhase || 0,
-      idleTimeMs: context.idleTimeMs || 0,
-      progressTimeMs: context.progressTimeMs,
-      restartCount: context.restartCount,
-      taskStartTimeMs: context.taskStartTimeMs,
-    });
-  }
-
+  // 10. Stalls
+  // Deprecated: analyzeStall call removed. Orchestrator now uses StallDetectionService
+  // for all stall-related monitoring and recovery.
+
   // 11. Default fallback
   return {
     type: FailureType.UNKNOWN_CRASH,
```
package/src/core/intervention.ts
CHANGED

```diff
@@ -82,9 +82,6 @@ export interface InterventionResult {
 /** Intervention request filename */
 export const PENDING_INTERVENTION_FILE = 'pending-intervention.json';
 
-/** Legacy intervention.txt filename (kept for compatibility) */
-export const LEGACY_INTERVENTION_FILE = 'intervention.txt';
-
 /** Wait time for process termination (ms) */
 const KILL_TIMEOUT_MS = 5000;
 
@@ -137,13 +134,6 @@ export function getPendingInterventionPath(laneRunDir: string): string {
   return safeJoin(laneRunDir, PENDING_INTERVENTION_FILE);
 }
 
-/**
- * Get the legacy intervention.txt path (compatibility)
- */
-export function getLegacyInterventionPath(laneRunDir: string): string {
-  return safeJoin(laneRunDir, LEGACY_INTERVENTION_FILE);
-}
-
 /**
  * Create and persist an intervention request
  *
@@ -173,10 +163,6 @@ export function createInterventionRequest(
   fs.writeFileSync(filePath, JSON.stringify(fullRequest, null, 2), 'utf8');
   logger.debug(`[Intervention] Created request: ${filePath}`);
 
-  // Also write to the legacy intervention.txt (for compatibility and logging)
-  const legacyPath = getLegacyInterventionPath(laneRunDir);
-  fs.writeFileSync(legacyPath, fullRequest.message, 'utf8');
-
   return filePath;
 }
 
@@ -204,15 +190,11 @@ export function readPendingIntervention(laneRunDir: string): InterventionRequest
  */
 export function clearPendingIntervention(laneRunDir: string): void {
   const filePath = getPendingInterventionPath(laneRunDir);
-  const legacyPath = getLegacyInterventionPath(laneRunDir);
 
   try {
     if (fs.existsSync(filePath)) {
       fs.unlinkSync(filePath);
     }
-    if (fs.existsSync(legacyPath)) {
-      fs.unlinkSync(legacyPath);
-    }
   } catch (error) {
     // Ignore cleanup errors
   }
```
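Net effect of these four hunks: `pending-intervention.json` becomes the only intervention artifact, and the mirrored `intervention.txt` write plus its cleanup path are gone. A minimal sketch of the remaining round trip, assuming only the `message` field (which the removed legacy write confirms); everything else about the request shape is an assumption:

```ts
import * as fs from 'node:fs';
import * as path from 'node:path';

const PENDING = 'pending-intervention.json'; // single source of truth after 0.2.3

// Write a request into the lane run dir; the agent's signal watcher reacts to it.
function createRequest(laneRunDir: string, message: string): string {
  const filePath = path.join(laneRunDir, PENDING);
  fs.writeFileSync(filePath, JSON.stringify({ message }, null, 2), 'utf8');
  return filePath;
}

// Cleanup now touches exactly one file instead of two.
function clearRequest(laneRunDir: string): void {
  const filePath = path.join(laneRunDir, PENDING);
  if (fs.existsSync(filePath)) fs.unlinkSync(filePath);
}
```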
package/src/core/orchestrator.ts
CHANGED

```diff
@@ -40,6 +40,9 @@ import {
   getGitErrorGuidance,
   LaneRecoveryState,
 } from './auto-recovery';
+import {
+  isInterventionRestart,
+} from './intervention';
 import {
   StallDetectionService,
   getStallService,
@@ -994,9 +997,16 @@ export async function orchestrate(tasksDir: string, options: {
         logger.error(`Lane ${finished.name} exited with code 2 but no dependency request found`);
       }
     } else {
-      // Check if it was a restart request (killed to be restarted)
-      if (stallPhase === StallPhase.RESTART_REQUESTED) {
-
+      // Check if it was a restart request or intervention (killed to be resumed)
+      if (stallPhase === StallPhase.RESTART_REQUESTED ||
+          stallPhase === StallPhase.CONTINUE_SENT ||
+          stallPhase === StallPhase.STRONGER_PROMPT_SENT ||
+          isInterventionRestart(laneRunDirs[finished.name]!)) {
+        const isManual = isInterventionRestart(laneRunDirs[finished.name]!);
+        const phaseName = isManual ? 'manual intervention' :
+          (stallPhase === StallPhase.RESTART_REQUESTED ? 'restart' : 'automatic intervention');
+
+        logger.info(`🔄 Lane ${finished.name} is being resumed/restarted due to ${phaseName}...`);
 
         // Update startIndex from current state to resume from the same task
         const statePath = safeJoin(laneRunDirs[finished.name]!, 'state.json');
```
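The rewritten block widens the "resume instead of fail" condition: a lane killed while a continue signal, stronger prompt, restart, or manual intervention is pending is resumed rather than reported as a crash. The same decision, factored as a standalone predicate; the `StallPhase` member names and `isInterventionRestart` come from the diff, but the numeric ordering and this helper are illustrative:

```ts
enum StallPhase { NORMAL, CONTINUE_SENT, STRONGER_PROMPT_SENT, RESTART_REQUESTED }

function shouldResume(stallPhase: StallPhase, manualIntervention: boolean): boolean {
  return (
    stallPhase === StallPhase.RESTART_REQUESTED ||    // stall service asked for a restart
    stallPhase === StallPhase.CONTINUE_SENT ||        // killed to deliver a continue signal
    stallPhase === StallPhase.STRONGER_PROMPT_SENT || // killed to deliver a stronger prompt
    manualIntervention                                // pending-intervention.json marker present
  );
}
```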
package/src/core/runner/agent.ts
CHANGED

```diff
@@ -218,27 +218,32 @@ async function cursorAgentSendRaw({ workspaceDir, chatId, prompt, model, signalD
     fs.mkdirSync(signalDir, { recursive: true });
   }
 
-  const interventionPath = path.join(signalDir, 'intervention.txt');
+  const interventionPath = path.join(signalDir, 'pending-intervention.json');
   const timeoutPath = path.join(signalDir, 'timeout.txt');
 
-  // Watch for intervention signals
+  // Watch for signals from UI (intervention via stdin no longer works)
   signalWatcher = fs.watch(signalDir, (event, filename) => {
-    if (filename === 'intervention.txt' && fs.existsSync(interventionPath)) {
+    if (filename === 'pending-intervention.json' && fs.existsSync(interventionPath)) {
       try {
-
-
-
-
-
-
-
-
-
-          model: 'manual'
-        }));
-        }
-        fs.unlinkSync(interventionPath);
+        // Log intervention
+        logger.warn(`👋 Intervention received. Interrupting agent for resume...`);
+
+        if (signalDir) {
+          const convoPath = path.join(signalDir, 'conversation.jsonl');
+          appendLog(convoPath, createConversationEntry('intervention', `[INTERVENTION RECEIVED]`, {
+            task: taskName || 'AGENT_TURN',
+            model: 'manual'
+          }));
         }
+
+        // Kill the agent child process.
+        // This will cause the runner to finish this task (with error)
+        // and since we've already written pending-intervention.json,
+        // it will be picked up on next resume.
+        child.kill('SIGTERM');
+
+        // Note: we don't unlink pending-intervention.json here,
+        // the runner will do it when resuming to read the message.
       } catch {}
     }
 
```
package/src/core/stall-detection.ts
CHANGED

```diff
@@ -827,16 +827,17 @@ export class StallDetectionService {
       priority: 5,
     });
 
-    // 2. Kill the process (if any)
+    // 2. Update state first (prevents a race condition)
+    state.phase = StallPhase.CONTINUE_SENT;
+    state.lastPhaseChangeTime = Date.now();
+    state.continueSignalCount++;
+
+    // 3. Kill the process (if any)
     if (state.childProcess?.pid && !state.childProcess.killed) {
       logger.info(`[${state.laneName}] Interrupting process ${state.childProcess.pid} for continue signal`);
       await killAndWait(state.childProcess.pid);
     }
 
-    state.phase = StallPhase.CONTINUE_SENT;
-    state.lastPhaseChangeTime = Date.now();
-    state.continueSignalCount++;
-
     logger.info(`[${state.laneName}] Continue signal queued (#${state.continueSignalCount}) - agent will resume with intervention`);
 
     events.emit('recovery.continue_signal', {
@@ -875,15 +876,16 @@ export class StallDetectionService {
       priority: 7,
     });
 
-    // 2. Kill the process (if any)
+    // 2. Update state first (prevents a race condition)
+    state.phase = StallPhase.STRONGER_PROMPT_SENT;
+    state.lastPhaseChangeTime = Date.now();
+
+    // 3. Kill the process (if any)
     if (state.childProcess?.pid && !state.childProcess.killed) {
       logger.warn(`[${state.laneName}] Interrupting process ${state.childProcess.pid} for stronger prompt`);
       await killAndWait(state.childProcess.pid);
     }
 
-    state.phase = StallPhase.STRONGER_PROMPT_SENT;
-    state.lastPhaseChangeTime = Date.now();
-
     logger.warn(`[${state.laneName}] Stronger prompt queued - agent will resume with intervention`);
 
     events.emit('recovery.stronger_prompt', {
```