karajan-code 1.9.1 → 1.9.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -1
- package/docs/README.es.md +2 -1
- package/package.json +1 -1
- package/src/agents/aider-agent.js +10 -2
- package/src/agents/claude-agent.js +10 -2
- package/src/agents/codex-agent.js +14 -4
- package/src/agents/gemini-agent.js +10 -2
- package/src/commands/plan.js +7 -1
- package/src/config.js +2 -0
- package/src/mcp/server-handlers.js +66 -5
- package/src/roles/planner-role.js +16 -0
- package/src/utils/process.js +89 -8
- package/src/utils/stall-detector.js +51 -18
package/README.md
CHANGED
|
@@ -43,6 +43,7 @@ Instead of running one AI agent and manually reviewing its output, `kj` chains a
|
|
|
43
43
|
- **Task decomposition** — triage detects when tasks should be split and recommends subtasks; with Planning Game integration, creates linked cards with sequential blocking
|
|
44
44
|
- **Retry with backoff** — automatic recovery from transient API errors (429, 5xx) with exponential backoff and jitter
|
|
45
45
|
- **Pipeline stage tracker** — cumulative progress view during `kj_run` showing which stages are done, running, or pending — both in CLI and via MCP events for real-time host rendering
|
|
46
|
+
- **Planner observability guardrails** — continuous heartbeat/stall telemetry, configurable max-silence protection (`session.max_agent_silence_minutes`), and hard runtime cap (`session.max_planner_minutes`) to avoid long stuck planner runs
|
|
46
47
|
- **Planning Game integration** — optionally pair with [Planning Game](https://github.com/AgenteIA-Geniova/planning-game) for agile project management (tasks, sprints, estimation) — like Jira, but open-source and XP-native
|
|
47
48
|
|
|
48
49
|
> **Best with MCP** — Karajan Code is designed to be used as an MCP server inside your AI agent (Claude, Codex, etc.). The agent sends tasks to `kj_run`, gets real-time progress notifications, and receives structured results — no copy-pasting needed.
|
|
@@ -417,7 +418,19 @@ After `npm install -g karajan-code`, the MCP server is auto-registered in Claude
|
|
|
417
418
|
| `kj_roles` | List roles or show role templates |
|
|
418
419
|
| `kj_code` | Run coder-only mode |
|
|
419
420
|
| `kj_review` | Run reviewer-only mode |
|
|
420
|
-
| `kj_plan` | Generate implementation plan |
|
|
421
|
+
| `kj_plan` | Generate implementation plan with heartbeat/stall telemetry and clearer diagnostics |
|
|
422
|
+
|
|
423
|
+
### MCP restart after version updates
|
|
424
|
+
|
|
425
|
+
If you update Karajan Code (for example `npm install -g karajan-code` to a new version) while your MCP host session is still open, the current `karajan-mcp` process may exit and the host can show `Transport closed`.
|
|
426
|
+
|
|
427
|
+
This is expected behavior: the MCP server detects a version mismatch and exits so the host can spawn a fresh process with the new code.
|
|
428
|
+
|
|
429
|
+
Quick recovery:
|
|
430
|
+
|
|
431
|
+
1. Restart your MCP host session (Claude/Codex/new terminal session).
|
|
432
|
+
2. Verify the server is listed (`codex mcp list` or your host equivalent).
|
|
433
|
+
3. Run a lightweight check (`kj_config`) before continuing with larger runs.
|
|
421
434
|
|
|
422
435
|
### Recommended Companion MCPs
|
|
423
436
|
|
package/docs/README.es.md
CHANGED
|
@@ -42,6 +42,7 @@ En lugar de ejecutar un agente de IA y revisar manualmente su output, `kj` encad
|
|
|
42
42
|
- **Descomposicion de tareas** — triage detecta cuando una tarea debe dividirse y recomienda subtareas; con integracion Planning Game, crea cards vinculadas con bloqueo secuencial
|
|
43
43
|
- **Retry con backoff** — recuperacion automatica ante errores transitorios de API (429, 5xx) con backoff exponencial y jitter
|
|
44
44
|
- **Pipeline stage tracker** — vista de progreso acumulativo durante `kj_run` mostrando que stages estan completadas, en ejecucion o pendientes — tanto en CLI como via eventos MCP para renderizado en tiempo real en el host
|
|
45
|
+
- **Guardarrailes de observabilidad del planner** — telemetria continua de heartbeat/stall, proteccion configurable por silencio maximo (`session.max_agent_silence_minutes`) y limite duro de ejecucion (`session.max_planner_minutes`) para evitar bloqueos prolongados en `kj_plan`/planner
|
|
45
46
|
- **Integracion con Planning Game** — combina opcionalmente con [Planning Game](https://github.com/AgenteIA-Geniova/planning-game) para gestion agil de proyectos (tareas, sprints, estimacion) — como Jira, pero open-source y nativo XP
|
|
46
47
|
|
|
47
48
|
> **Mejor con MCP** — Karajan Code esta disenado para usarse como servidor MCP dentro de tu agente de IA (Claude, Codex, etc.). El agente envia tareas a `kj_run`, recibe notificaciones de progreso en tiempo real, y obtiene resultados estructurados — sin copiar y pegar.
|
|
@@ -201,7 +202,7 @@ Tras `npm install -g karajan-code`, el servidor MCP se auto-registra en las conf
|
|
|
201
202
|
| `kj_roles` | Listar roles o mostrar templates |
|
|
202
203
|
| `kj_code` | Modo solo coder |
|
|
203
204
|
| `kj_review` | Modo solo reviewer |
|
|
204
|
-
| `kj_plan` | Generar plan de implementacion |
|
|
205
|
+
| `kj_plan` | Generar plan de implementacion con telemetria heartbeat/stall y diagnostico mas claro |
|
|
205
206
|
|
|
206
207
|
### MCPs complementarios recomendados
|
|
207
208
|
|
package/package.json
CHANGED
|
package/src/agents/aider-agent.js
CHANGED
|
@@ -8,7 +8,11 @@ export class AiderAgent extends BaseAgent {
|
|
|
8
8
|
const args = ["--yes", "--message", task.prompt];
|
|
9
9
|
const model = this.getRoleModel(role);
|
|
10
10
|
if (model) args.push("--model", model);
|
|
11
|
-
const res = await runCommand(resolveBin("aider"), args, {
|
|
11
|
+
const res = await runCommand(resolveBin("aider"), args, {
|
|
12
|
+
onOutput: task.onOutput,
|
|
13
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
14
|
+
timeout: task.timeoutMs
|
|
15
|
+
});
|
|
12
16
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
13
17
|
}
|
|
14
18
|
|
|
@@ -17,7 +21,11 @@ export class AiderAgent extends BaseAgent {
|
|
|
17
21
|
const args = ["--yes", "--message", task.prompt];
|
|
18
22
|
const model = this.getRoleModel(role);
|
|
19
23
|
if (model) args.push("--model", model);
|
|
20
|
-
const res = await runCommand(resolveBin("aider"), args, {
|
|
24
|
+
const res = await runCommand(resolveBin("aider"), args, {
|
|
25
|
+
onOutput: task.onOutput,
|
|
26
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
27
|
+
timeout: task.timeoutMs
|
|
28
|
+
});
|
|
21
29
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
22
30
|
}
|
|
23
31
|
}
|
package/src/agents/claude-agent.js
CHANGED
|
@@ -81,7 +81,11 @@ export class ClaudeAgent extends BaseAgent {
|
|
|
81
81
|
if (task.onOutput) {
|
|
82
82
|
args.push("--output-format", "stream-json");
|
|
83
83
|
const streamFilter = createStreamJsonFilter(task.onOutput);
|
|
84
|
-
const res = await runCommand(resolveBin("claude"), args, {
|
|
84
|
+
const res = await runCommand(resolveBin("claude"), args, {
|
|
85
|
+
onOutput: streamFilter,
|
|
86
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
87
|
+
timeout: task.timeoutMs
|
|
88
|
+
});
|
|
85
89
|
const output = extractTextFromStreamJson(res.stdout);
|
|
86
90
|
return { ok: res.exitCode === 0, output, error: res.stderr, exitCode: res.exitCode };
|
|
87
91
|
}
|
|
@@ -94,7 +98,11 @@ export class ClaudeAgent extends BaseAgent {
|
|
|
94
98
|
const args = ["-p", task.prompt, "--output-format", "json"];
|
|
95
99
|
const model = this.getRoleModel(task.role || "reviewer");
|
|
96
100
|
if (model) args.push("--model", model);
|
|
97
|
-
const res = await runCommand(resolveBin("claude"), args, {
|
|
101
|
+
const res = await runCommand(resolveBin("claude"), args, {
|
|
102
|
+
onOutput: task.onOutput,
|
|
103
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
104
|
+
timeout: task.timeoutMs
|
|
105
|
+
});
|
|
98
106
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
99
107
|
}
|
|
100
108
|
}
|
package/src/agents/codex-agent.js
CHANGED
|
@@ -9,8 +9,13 @@ export class CodexAgent extends BaseAgent {
|
|
|
9
9
|
const model = this.getRoleModel(role);
|
|
10
10
|
if (model) args.push("--model", model);
|
|
11
11
|
if (this.isAutoApproveEnabled(role)) args.push("--full-auto");
|
|
12
|
-
args.push(
|
|
13
|
-
const res = await runCommand(resolveBin("codex"), args, {
|
|
12
|
+
args.push("-");
|
|
13
|
+
const res = await runCommand(resolveBin("codex"), args, {
|
|
14
|
+
onOutput: task.onOutput,
|
|
15
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
16
|
+
timeout: task.timeoutMs,
|
|
17
|
+
input: task.prompt
|
|
18
|
+
});
|
|
14
19
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
15
20
|
}
|
|
16
21
|
|
|
@@ -18,8 +23,13 @@ export class CodexAgent extends BaseAgent {
|
|
|
18
23
|
const args = ["exec"];
|
|
19
24
|
const model = this.getRoleModel(task.role || "reviewer");
|
|
20
25
|
if (model) args.push("--model", model);
|
|
21
|
-
args.push(
|
|
22
|
-
const res = await runCommand(resolveBin("codex"), args, {
|
|
26
|
+
args.push("-");
|
|
27
|
+
const res = await runCommand(resolveBin("codex"), args, {
|
|
28
|
+
onOutput: task.onOutput,
|
|
29
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
30
|
+
timeout: task.timeoutMs,
|
|
31
|
+
input: task.prompt
|
|
32
|
+
});
|
|
23
33
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
24
34
|
}
|
|
25
35
|
}
|
package/src/agents/gemini-agent.js
CHANGED
|
@@ -8,7 +8,11 @@ export class GeminiAgent extends BaseAgent {
|
|
|
8
8
|
const args = ["-p", task.prompt];
|
|
9
9
|
const model = this.getRoleModel(role);
|
|
10
10
|
if (model) args.push("--model", model);
|
|
11
|
-
const res = await runCommand(resolveBin("gemini"), args, {
|
|
11
|
+
const res = await runCommand(resolveBin("gemini"), args, {
|
|
12
|
+
onOutput: task.onOutput,
|
|
13
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
14
|
+
timeout: task.timeoutMs
|
|
15
|
+
});
|
|
12
16
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
13
17
|
}
|
|
14
18
|
|
|
@@ -17,7 +21,11 @@ export class GeminiAgent extends BaseAgent {
|
|
|
17
21
|
const args = ["-p", task.prompt, "--output-format", "json"];
|
|
18
22
|
const model = this.getRoleModel(role);
|
|
19
23
|
if (model) args.push("--model", model);
|
|
20
|
-
const res = await runCommand(resolveBin("gemini"), args, {
|
|
24
|
+
const res = await runCommand(resolveBin("gemini"), args, {
|
|
25
|
+
onOutput: task.onOutput,
|
|
26
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
27
|
+
timeout: task.timeoutMs
|
|
28
|
+
});
|
|
21
29
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
22
30
|
}
|
|
23
31
|
}
|
package/src/commands/plan.js
CHANGED
|
@@ -46,7 +46,13 @@ export async function planCommand({ task, config, logger, json, context }) {
|
|
|
46
46
|
|
|
47
47
|
const planner = createAgent(plannerRole.provider, config, logger);
|
|
48
48
|
const prompt = buildPlannerPrompt({ task, context });
|
|
49
|
-
const
|
|
49
|
+
const silenceTimeoutMs = Number(config?.session?.max_agent_silence_minutes) > 0
|
|
50
|
+
? Math.round(Number(config.session.max_agent_silence_minutes) * 60 * 1000)
|
|
51
|
+
: undefined;
|
|
52
|
+
const timeoutMs = Number(config?.session?.max_planner_minutes) > 0
|
|
53
|
+
? Math.round(Number(config.session.max_planner_minutes) * 60 * 1000)
|
|
54
|
+
: undefined;
|
|
55
|
+
const result = await planner.runTask({ prompt, role: "planner", silenceTimeoutMs, timeoutMs });
|
|
50
56
|
|
|
51
57
|
if (!result.ok) {
|
|
52
58
|
throw new Error(result.error || result.output || "Planner failed");
|
package/src/config.js
CHANGED
|
@@ -114,7 +114,9 @@ const DEFAULTS = {
|
|
|
114
114
|
session: {
|
|
115
115
|
max_iteration_minutes: 30,
|
|
116
116
|
max_total_minutes: 120,
|
|
117
|
+
max_planner_minutes: 60,
|
|
117
118
|
checkpoint_interval_minutes: 5,
|
|
119
|
+
max_agent_silence_minutes: 20,
|
|
118
120
|
fail_fast_repeats: 2,
|
|
119
121
|
repeat_detection_threshold: 2,
|
|
120
122
|
max_sonar_retries: 3,
|
package/src/mcp/server-handlers.js
CHANGED
|
@@ -21,6 +21,7 @@ import { parseMaybeJsonString } from "../review/parser.js";
|
|
|
21
21
|
import { computeBaseRef, generateDiff } from "../review/diff-generator.js";
|
|
22
22
|
import { resolveReviewProfile } from "../review/profiles.js";
|
|
23
23
|
import { createRunLog, readRunLog } from "../utils/run-log.js";
|
|
24
|
+
import { currentBranch } from "../utils/git.js";
|
|
24
25
|
|
|
25
26
|
/**
|
|
26
27
|
* Resolve the user's project directory via MCP roots.
|
|
@@ -62,6 +63,18 @@ export function classifyError(error) {
|
|
|
62
63
|
const msg = error?.message || String(error);
|
|
63
64
|
const lower = msg.toLowerCase();
|
|
64
65
|
|
|
66
|
+
if (
|
|
67
|
+
lower.includes("without output")
|
|
68
|
+
|| lower.includes("silent for")
|
|
69
|
+
|| lower.includes("unresponsive")
|
|
70
|
+
|| lower.includes("exceeded max silence")
|
|
71
|
+
) {
|
|
72
|
+
return {
|
|
73
|
+
category: "agent_stall",
|
|
74
|
+
suggestion: "Agent output stalled. Check live details with kj_status, then retry with a smaller prompt or increase session.max_agent_silence_minutes if needed."
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
|
|
65
78
|
if (lower.includes("sonar") && (lower.includes("connect") || lower.includes("econnrefused") || lower.includes("not available") || lower.includes("not running"))) {
|
|
66
79
|
return {
|
|
67
80
|
category: "sonar_unavailable",
|
|
@@ -97,6 +110,13 @@ export function classifyError(error) {
|
|
|
97
110
|
};
|
|
98
111
|
}
|
|
99
112
|
|
|
113
|
+
if (lower.includes("you are on the base branch")) {
|
|
114
|
+
return {
|
|
115
|
+
category: "branch_error",
|
|
116
|
+
suggestion: "Create a feature branch before running Karajan. Use 'git checkout -b feat/<task-description>' and then retry. Do NOT run kj_code directly on the base branch."
|
|
117
|
+
};
|
|
118
|
+
}
|
|
119
|
+
|
|
100
120
|
if (lower.includes("not a git repository")) {
|
|
101
121
|
return {
|
|
102
122
|
category: "git_error",
|
|
@@ -107,6 +127,23 @@ export function classifyError(error) {
|
|
|
107
127
|
return { category: "unknown", suggestion: null };
|
|
108
128
|
}
|
|
109
129
|
|
|
130
|
+
export async function assertNotOnBaseBranch(config) {
|
|
131
|
+
const baseBranch = config?.base_branch || "main";
|
|
132
|
+
let branch;
|
|
133
|
+
try {
|
|
134
|
+
branch = await currentBranch();
|
|
135
|
+
} catch {
|
|
136
|
+
return; // not a git repo or detached HEAD — let downstream handle it
|
|
137
|
+
}
|
|
138
|
+
if (branch === baseBranch) {
|
|
139
|
+
throw new Error(
|
|
140
|
+
`You are on the base branch '${baseBranch}'. Karajan needs a feature branch to compute the diff for review. ` +
|
|
141
|
+
`Create a new branch first (e.g. 'git checkout -b feat/<task-description>') and then run this command again. ` +
|
|
142
|
+
`Do NOT run kj_code directly — create the branch first so the full pipeline (code + review) works correctly.`
|
|
143
|
+
);
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
110
147
|
export function enrichedFailPayload(error, toolName) {
|
|
111
148
|
const msg = error?.message || String(error);
|
|
112
149
|
const { category, suggestion } = classifyError(error);
|
|
@@ -149,6 +186,7 @@ export function buildAskQuestion(server) {
|
|
|
149
186
|
|
|
150
187
|
export async function handleRunDirect(a, server, extra) {
|
|
151
188
|
const config = await buildConfig(a);
|
|
189
|
+
await assertNotOnBaseBranch(config);
|
|
152
190
|
const logger = createLogger(config.output.log_level, "mcp");
|
|
153
191
|
|
|
154
192
|
const requiredProviders = [
|
|
@@ -235,7 +273,15 @@ export async function handlePlanDirect(a, server, extra) {
|
|
|
235
273
|
|
|
236
274
|
const projectDir = await resolveProjectDir(server);
|
|
237
275
|
const runLog = createRunLog(projectDir);
|
|
238
|
-
|
|
276
|
+
const silenceTimeoutMs = Number(config?.session?.max_agent_silence_minutes) > 0
|
|
277
|
+
? Math.round(Number(config.session.max_agent_silence_minutes) * 60 * 1000)
|
|
278
|
+
: undefined;
|
|
279
|
+
const plannerTimeoutMs = Number(config?.session?.max_planner_minutes) > 0
|
|
280
|
+
? Math.round(Number(config.session.max_planner_minutes) * 60 * 1000)
|
|
281
|
+
: undefined;
|
|
282
|
+
runLog.logText(
|
|
283
|
+
`[kj_plan] started — provider=${plannerRole.provider}, max_silence=${silenceTimeoutMs ? `${Math.round(silenceTimeoutMs / 1000)}s` : "disabled"}, max_runtime=${plannerTimeoutMs ? `${Math.round(plannerTimeoutMs / 1000)}s` : "disabled"}`
|
|
284
|
+
);
|
|
239
285
|
const emitter = buildDirectEmitter(server, runLog);
|
|
240
286
|
const eventBase = { sessionId: null, iteration: 0, startedAt: Date.now() };
|
|
241
287
|
const onOutput = ({ stream, line }) => {
|
|
@@ -250,18 +296,31 @@ export async function handlePlanDirect(a, server, extra) {
|
|
|
250
296
|
sendTrackerLog(server, "planner", "running", plannerRole.provider);
|
|
251
297
|
runLog.logText(`[planner] agent launched, waiting for response...`);
|
|
252
298
|
let result;
|
|
299
|
+
let plannerStats = null;
|
|
253
300
|
try {
|
|
254
|
-
result = await planner.runTask({
|
|
301
|
+
result = await planner.runTask({
|
|
302
|
+
prompt,
|
|
303
|
+
role: "planner",
|
|
304
|
+
onOutput: stallDetector.onOutput,
|
|
305
|
+
silenceTimeoutMs,
|
|
306
|
+
timeoutMs: plannerTimeoutMs
|
|
307
|
+
});
|
|
255
308
|
} finally {
|
|
256
309
|
stallDetector.stop();
|
|
257
|
-
|
|
258
|
-
runLog.logText(
|
|
310
|
+
plannerStats = stallDetector.stats();
|
|
311
|
+
runLog.logText(
|
|
312
|
+
`[planner] finished — lines=${plannerStats.lineCount}, bytes=${plannerStats.bytesReceived}, elapsed=${Math.round(plannerStats.elapsedMs / 1000)}s`
|
|
313
|
+
);
|
|
259
314
|
runLog.close();
|
|
260
315
|
}
|
|
261
316
|
|
|
262
317
|
if (!result.ok) {
|
|
263
318
|
sendTrackerLog(server, "planner", "failed");
|
|
264
|
-
|
|
319
|
+
const baseError = result.error || result.output || "Planner failed";
|
|
320
|
+
const statsSuffix = plannerStats
|
|
321
|
+
? ` [lines=${plannerStats.lineCount}, bytes=${plannerStats.bytesReceived}, elapsed=${Math.round(plannerStats.elapsedMs / 1000)}s]`
|
|
322
|
+
: "";
|
|
323
|
+
throw new Error(`${baseError}${statsSuffix}`);
|
|
265
324
|
}
|
|
266
325
|
|
|
267
326
|
sendTrackerLog(server, "planner", "done");
|
|
@@ -271,6 +330,7 @@ export async function handlePlanDirect(a, server, extra) {
|
|
|
271
330
|
|
|
272
331
|
export async function handleCodeDirect(a, server, extra) {
|
|
273
332
|
const config = await buildConfig(a, "code");
|
|
333
|
+
await assertNotOnBaseBranch(config);
|
|
274
334
|
const logger = createLogger(config.output.log_level, "mcp");
|
|
275
335
|
|
|
276
336
|
const coderRole = resolveRole(config, "coder");
|
|
@@ -319,6 +379,7 @@ export async function handleCodeDirect(a, server, extra) {
|
|
|
319
379
|
|
|
320
380
|
export async function handleReviewDirect(a, server, extra) {
|
|
321
381
|
const config = await buildConfig(a, "review");
|
|
382
|
+
await assertNotOnBaseBranch(config);
|
|
322
383
|
const logger = createLogger(config.output.log_level, "mcp");
|
|
323
384
|
|
|
324
385
|
const reviewerRole = resolveRole(config, "reviewer");
|
package/src/roles/planner-role.js
CHANGED
|
@@ -9,6 +9,18 @@ function resolveProvider(config) {
|
|
|
9
9
|
);
|
|
10
10
|
}
|
|
11
11
|
|
|
12
|
+
function resolvePlannerSilenceTimeoutMs(config) {
|
|
13
|
+
const minutes = Number(config?.session?.max_agent_silence_minutes);
|
|
14
|
+
if (!Number.isFinite(minutes) || minutes <= 0) return null;
|
|
15
|
+
return Math.round(minutes * 60 * 1000);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
function resolvePlannerRuntimeTimeoutMs(config) {
|
|
19
|
+
const minutes = Number(config?.session?.max_planner_minutes);
|
|
20
|
+
if (!Number.isFinite(minutes) || minutes <= 0) return null;
|
|
21
|
+
return Math.round(minutes * 60 * 1000);
|
|
22
|
+
}
|
|
23
|
+
|
|
12
24
|
function buildPrompt({ task, instructions, research, triageDecomposition }) {
|
|
13
25
|
const sections = [];
|
|
14
26
|
|
|
@@ -78,6 +90,10 @@ export class PlannerRole extends BaseRole {
|
|
|
78
90
|
|
|
79
91
|
const runArgs = { prompt, role: "planner" };
|
|
80
92
|
if (onOutput) runArgs.onOutput = onOutput;
|
|
93
|
+
const silenceTimeoutMs = resolvePlannerSilenceTimeoutMs(this.config);
|
|
94
|
+
if (silenceTimeoutMs) runArgs.silenceTimeoutMs = silenceTimeoutMs;
|
|
95
|
+
const timeoutMs = resolvePlannerRuntimeTimeoutMs(this.config);
|
|
96
|
+
if (timeoutMs) runArgs.timeoutMs = timeoutMs;
|
|
81
97
|
const result = await agent.runTask(runArgs);
|
|
82
98
|
|
|
83
99
|
if (!result.ok) {
|
package/src/utils/process.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { execa } from "execa";
|
|
2
2
|
|
|
3
3
|
export async function runCommand(command, args = [], options = {}) {
|
|
4
|
-
const { timeout, onOutput, ...rest } = options;
|
|
4
|
+
const { timeout, onOutput, silenceTimeoutMs, partialOutputFlushMs, ...rest } = options;
|
|
5
5
|
const subprocess = execa(command, args, {
|
|
6
6
|
reject: false,
|
|
7
7
|
...rest
|
|
@@ -9,37 +9,98 @@ export async function runCommand(command, args = [], options = {}) {
|
|
|
9
9
|
|
|
10
10
|
let stdoutAccum = "";
|
|
11
11
|
let stderrAccum = "";
|
|
12
|
+
let outputSilenceTimer = null;
|
|
13
|
+
let silenceTimedOut = false;
|
|
14
|
+
|
|
15
|
+
function clearSilenceTimer() {
|
|
16
|
+
if (outputSilenceTimer) {
|
|
17
|
+
clearTimeout(outputSilenceTimer);
|
|
18
|
+
outputSilenceTimer = null;
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
function armSilenceTimer() {
|
|
23
|
+
const ms = Number(silenceTimeoutMs);
|
|
24
|
+
if (!Number.isFinite(ms) || ms <= 0 || silenceTimedOut) return;
|
|
25
|
+
clearSilenceTimer();
|
|
26
|
+
outputSilenceTimer = setTimeout(() => {
|
|
27
|
+
silenceTimedOut = true;
|
|
28
|
+
try {
|
|
29
|
+
subprocess.kill("SIGKILL", { forceKillAfterDelay: 1000 });
|
|
30
|
+
} catch {
|
|
31
|
+
// no-op
|
|
32
|
+
}
|
|
33
|
+
}, ms);
|
|
34
|
+
}
|
|
12
35
|
|
|
13
36
|
if (subprocess.stdout) {
|
|
14
37
|
subprocess.stdout.on("data", (chunk) => {
|
|
15
38
|
stdoutAccum += chunk.toString();
|
|
39
|
+
armSilenceTimer();
|
|
16
40
|
});
|
|
17
41
|
}
|
|
18
42
|
if (subprocess.stderr) {
|
|
19
43
|
subprocess.stderr.on("data", (chunk) => {
|
|
20
44
|
stderrAccum += chunk.toString();
|
|
45
|
+
armSilenceTimer();
|
|
21
46
|
});
|
|
22
47
|
}
|
|
23
48
|
|
|
49
|
+
let flushInterval = null;
|
|
24
50
|
if (onOutput) {
|
|
25
|
-
const
|
|
26
|
-
|
|
51
|
+
const flushMs = Number(partialOutputFlushMs) > 0 ? Number(partialOutputFlushMs) : 2000;
|
|
52
|
+
const streams = {};
|
|
53
|
+
const makeHandler = (stream) => {
|
|
54
|
+
const state = { partial: "", dirty: false };
|
|
55
|
+
streams[stream] = state;
|
|
27
56
|
return (chunk) => {
|
|
28
|
-
partial += chunk.toString();
|
|
29
|
-
const lines = partial.split(
|
|
30
|
-
partial = lines.pop();
|
|
57
|
+
state.partial += chunk.toString();
|
|
58
|
+
const lines = state.partial.split(/\r\n|\n|\r/);
|
|
59
|
+
state.partial = lines.pop() ?? "";
|
|
60
|
+
state.dirty = state.partial.length > 0;
|
|
31
61
|
for (const line of lines) {
|
|
32
62
|
if (line) onOutput({ stream, line });
|
|
33
63
|
}
|
|
34
64
|
};
|
|
35
65
|
};
|
|
36
|
-
|
|
37
|
-
|
|
66
|
+
|
|
67
|
+
const flushPartials = () => {
|
|
68
|
+
for (const [stream, state] of Object.entries(streams)) {
|
|
69
|
+
if (!state.dirty || !state.partial) continue;
|
|
70
|
+
onOutput({ stream, line: state.partial });
|
|
71
|
+
state.partial = "";
|
|
72
|
+
state.dirty = false;
|
|
73
|
+
}
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
if (subprocess.stdout) subprocess.stdout.on("data", makeHandler("stdout"));
|
|
77
|
+
if (subprocess.stderr) subprocess.stderr.on("data", makeHandler("stderr"));
|
|
78
|
+
flushInterval = setInterval(flushPartials, flushMs);
|
|
79
|
+
flushInterval.unref?.();
|
|
80
|
+
|
|
81
|
+
subprocess.finally(() => {
|
|
82
|
+
flushPartials();
|
|
83
|
+
if (flushInterval) {
|
|
84
|
+
clearInterval(flushInterval);
|
|
85
|
+
flushInterval = null;
|
|
86
|
+
}
|
|
87
|
+
});
|
|
38
88
|
}
|
|
89
|
+
armSilenceTimer();
|
|
39
90
|
|
|
40
91
|
try {
|
|
41
92
|
if (!timeout) {
|
|
42
93
|
const result = await subprocess;
|
|
94
|
+
clearSilenceTimer();
|
|
95
|
+
if (silenceTimedOut) {
|
|
96
|
+
return {
|
|
97
|
+
exitCode: 143,
|
|
98
|
+
stdout: stdoutAccum,
|
|
99
|
+
stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
|
|
100
|
+
timedOut: true,
|
|
101
|
+
signal: "SIGKILL"
|
|
102
|
+
};
|
|
103
|
+
}
|
|
43
104
|
return enrichResult(result, stdoutAccum, stderrAccum);
|
|
44
105
|
}
|
|
45
106
|
|
|
@@ -63,8 +124,28 @@ export async function runCommand(command, args = [], options = {}) {
|
|
|
63
124
|
|
|
64
125
|
const result = await Promise.race([subprocess, timeoutResult]);
|
|
65
126
|
if (timer) clearTimeout(timer);
|
|
127
|
+
clearSilenceTimer();
|
|
128
|
+
if (silenceTimedOut) {
|
|
129
|
+
return {
|
|
130
|
+
exitCode: 143,
|
|
131
|
+
stdout: stdoutAccum,
|
|
132
|
+
stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
|
|
133
|
+
timedOut: true,
|
|
134
|
+
signal: "SIGKILL"
|
|
135
|
+
};
|
|
136
|
+
}
|
|
66
137
|
return enrichResult(result, stdoutAccum, stderrAccum);
|
|
67
138
|
} catch (error) {
|
|
139
|
+
clearSilenceTimer();
|
|
140
|
+
if (silenceTimedOut) {
|
|
141
|
+
return {
|
|
142
|
+
exitCode: 143,
|
|
143
|
+
stdout: error?.stdout || stdoutAccum,
|
|
144
|
+
stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
|
|
145
|
+
timedOut: true,
|
|
146
|
+
signal: error?.signal || "SIGKILL"
|
|
147
|
+
};
|
|
148
|
+
}
|
|
68
149
|
const details = [
|
|
69
150
|
error?.shortMessage,
|
|
70
151
|
error?.originalMessage,
|
package/src/utils/stall-detector.js
CHANGED
|
@@ -15,6 +15,7 @@ import { emitProgress, makeEvent } from "./events.js";
|
|
|
15
15
|
const DEFAULT_HEARTBEAT_INTERVAL_MS = 30_000; // heartbeat every 30s
|
|
16
16
|
const DEFAULT_STALL_TIMEOUT_MS = 120_000; // warn after 2min silence
|
|
17
17
|
const DEFAULT_CRITICAL_TIMEOUT_MS = 300_000; // critical after 5min silence
|
|
18
|
+
const DEFAULT_STALL_REPEAT_MS = 60_000; // repeat stall notices every 60s
|
|
18
19
|
|
|
19
20
|
export function createStallDetector({
|
|
20
21
|
onOutput,
|
|
@@ -24,23 +25,30 @@ export function createStallDetector({
|
|
|
24
25
|
provider,
|
|
25
26
|
heartbeatIntervalMs = DEFAULT_HEARTBEAT_INTERVAL_MS,
|
|
26
27
|
stallTimeoutMs = DEFAULT_STALL_TIMEOUT_MS,
|
|
27
|
-
criticalTimeoutMs = DEFAULT_CRITICAL_TIMEOUT_MS
|
|
28
|
+
criticalTimeoutMs = DEFAULT_CRITICAL_TIMEOUT_MS,
|
|
29
|
+
stallRepeatMs = DEFAULT_STALL_REPEAT_MS,
|
|
30
|
+
maxSilenceMs = null,
|
|
31
|
+
onMaxSilence = null
|
|
28
32
|
}) {
|
|
29
33
|
let lastActivityAt = Date.now();
|
|
30
34
|
let lineCount = 0;
|
|
31
35
|
let bytesReceived = 0;
|
|
32
|
-
let stallWarned = false;
|
|
33
|
-
let criticalWarned = false;
|
|
34
36
|
let heartbeatTimer = null;
|
|
35
37
|
const startedAt = Date.now();
|
|
38
|
+
let lastStallWarnAt = 0;
|
|
39
|
+
let lastCriticalWarnAt = 0;
|
|
40
|
+
let maxSilenceTriggered = false;
|
|
36
41
|
|
|
37
42
|
function emitHeartbeat() {
|
|
38
43
|
const now = Date.now();
|
|
39
44
|
const silenceMs = now - lastActivityAt;
|
|
40
45
|
const elapsedMs = now - startedAt;
|
|
46
|
+
const shouldWarn = silenceMs >= stallTimeoutMs;
|
|
47
|
+
const shouldCritical = silenceMs >= criticalTimeoutMs;
|
|
48
|
+
const repeatWindow = Math.max(1000, Number(stallRepeatMs) || DEFAULT_STALL_REPEAT_MS);
|
|
41
49
|
|
|
42
|
-
if (
|
|
43
|
-
|
|
50
|
+
if (shouldCritical && (now - lastCriticalWarnAt >= repeatWindow)) {
|
|
51
|
+
lastCriticalWarnAt = now;
|
|
44
52
|
emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
|
|
45
53
|
status: "critical",
|
|
46
54
|
message: `Agent ${provider} unresponsive for ${Math.round(silenceMs / 1000)}s — may be hung`,
|
|
@@ -53,8 +61,8 @@ export function createStallDetector({
|
|
|
53
61
|
severity: "critical"
|
|
54
62
|
}
|
|
55
63
|
}));
|
|
56
|
-
} else if (
|
|
57
|
-
|
|
64
|
+
} else if (shouldWarn && (now - lastStallWarnAt >= repeatWindow)) {
|
|
65
|
+
lastStallWarnAt = now;
|
|
58
66
|
emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
|
|
59
67
|
status: "warning",
|
|
60
68
|
message: `Agent ${provider} silent for ${Math.round(silenceMs / 1000)}s — still waiting`,
|
|
@@ -67,20 +75,49 @@ export function createStallDetector({
|
|
|
67
75
|
severity: "warning"
|
|
68
76
|
}
|
|
69
77
|
}));
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
emitProgress(emitter, makeEvent("agent:heartbeat", { ...eventBase, stage }, {
|
|
81
|
+
message: silenceMs < stallTimeoutMs
|
|
82
|
+
? `Agent ${provider} active — ${lineCount} lines, ${Math.round(elapsedMs / 1000)}s elapsed`
|
|
83
|
+
: `Agent ${provider} waiting — silent ${Math.round(silenceMs / 1000)}s, ${Math.round(elapsedMs / 1000)}s elapsed`,
|
|
84
|
+
detail: {
|
|
85
|
+
provider,
|
|
86
|
+
elapsedMs,
|
|
87
|
+
silenceMs,
|
|
88
|
+
lineCount,
|
|
89
|
+
bytesReceived,
|
|
90
|
+
status: silenceMs < stallTimeoutMs ? "active" : "waiting"
|
|
91
|
+
}
|
|
92
|
+
}));
|
|
74
93
|
|
|
75
|
-
|
|
76
|
-
|
|
94
|
+
const hardLimit = Number(maxSilenceMs);
|
|
95
|
+
if (!maxSilenceTriggered && Number.isFinite(hardLimit) && hardLimit > 0 && silenceMs >= hardLimit) {
|
|
96
|
+
maxSilenceTriggered = true;
|
|
97
|
+
emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
|
|
98
|
+
status: "fail",
|
|
99
|
+
message: `Agent ${provider} exceeded max silence (${Math.round(hardLimit / 1000)}s)`,
|
|
77
100
|
detail: {
|
|
78
101
|
provider,
|
|
102
|
+
silenceMs,
|
|
79
103
|
elapsedMs,
|
|
80
104
|
lineCount,
|
|
81
|
-
bytesReceived
|
|
105
|
+
bytesReceived,
|
|
106
|
+
severity: "fatal",
|
|
107
|
+
maxSilenceMs: hardLimit
|
|
82
108
|
}
|
|
83
109
|
}));
|
|
110
|
+
if (typeof onMaxSilence === "function") {
|
|
111
|
+
onMaxSilence({
|
|
112
|
+
provider,
|
|
113
|
+
stage,
|
|
114
|
+
silenceMs,
|
|
115
|
+
elapsedMs,
|
|
116
|
+
lineCount,
|
|
117
|
+
bytesReceived,
|
|
118
|
+
maxSilenceMs: hardLimit
|
|
119
|
+
});
|
|
120
|
+
}
|
|
84
121
|
}
|
|
85
122
|
}
|
|
86
123
|
|
|
@@ -92,10 +129,6 @@ export function createStallDetector({
|
|
|
92
129
|
lineCount++;
|
|
93
130
|
bytesReceived += data.line?.length || 0;
|
|
94
131
|
|
|
95
|
-
// Reset stall flags on new activity
|
|
96
|
-
stallWarned = false;
|
|
97
|
-
criticalWarned = false;
|
|
98
|
-
|
|
99
132
|
// Forward to the original callback
|
|
100
133
|
if (onOutput) {
|
|
101
134
|
onOutput(data);
|