karajan-code 1.9.1 → 1.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -43,6 +43,7 @@ Instead of running one AI agent and manually reviewing its output, `kj` chains a
43
43
  - **Task decomposition** — triage detects when tasks should be split and recommends subtasks; with Planning Game integration, creates linked cards with sequential blocking
44
44
  - **Retry with backoff** — automatic recovery from transient API errors (429, 5xx) with exponential backoff and jitter
45
45
  - **Pipeline stage tracker** — cumulative progress view during `kj_run` showing which stages are done, running, or pending — both in CLI and via MCP events for real-time host rendering
46
+ - **Planner observability guardrails** — continuous heartbeat/stall telemetry, configurable max-silence protection (`session.max_agent_silence_minutes`), and a hard runtime cap (`session.max_planner_minutes`) to keep a stuck planner run from hanging indefinitely
46
47
  - **Planning Game integration** — optionally pair with [Planning Game](https://github.com/AgenteIA-Geniova/planning-game) for agile project management (tasks, sprints, estimation) — like Jira, but open-source and XP-native
47
48
 
48
49
  > **Best with MCP** — Karajan Code is designed to be used as an MCP server inside your AI agent (Claude, Codex, etc.). The agent sends tasks to `kj_run`, gets real-time progress notifications, and receives structured results — no copy-pasting needed.
@@ -417,7 +418,19 @@ After `npm install -g karajan-code`, the MCP server is auto-registered in Claude
417
418
  | `kj_roles` | List roles or show role templates |
418
419
  | `kj_code` | Run coder-only mode |
419
420
  | `kj_review` | Run reviewer-only mode |
420
- | `kj_plan` | Generate implementation plan |
421
+ | `kj_plan` | Generate implementation plan with heartbeat/stall telemetry and clearer diagnostics |
422
+
423
+ ### MCP restart after version updates
424
+
425
+ If you update Karajan Code (for example, by running `npm install -g karajan-code` to install a new version) while your MCP host session is still open, the current `karajan-mcp` process may exit and the host may show `Transport closed`.
426
+
427
+ This is expected behavior: the MCP server detects a version mismatch and exits so the host can spawn a fresh process with the new code.
428
+
429
+ Quick recovery:
430
+
431
+ 1. Restart your MCP host session (Claude, Codex, or a new terminal session).
432
+ 2. Verify the server is listed (`codex mcp list` or your host's equivalent).
433
+ 3. Run a lightweight check (`kj_config`) before continuing with larger runs.
421
434
 
422
435
  ### Recommended Companion MCPs
423
436
 
package/docs/README.es.md CHANGED
@@ -42,6 +42,7 @@ En lugar de ejecutar un agente de IA y revisar manualmente su output, `kj` encad
42
42
  - **Descomposicion de tareas** — triage detecta cuando una tarea debe dividirse y recomienda subtareas; con integracion Planning Game, crea cards vinculadas con bloqueo secuencial
43
43
  - **Retry con backoff** — recuperacion automatica ante errores transitorios de API (429, 5xx) con backoff exponencial y jitter
44
44
  - **Pipeline stage tracker** — vista de progreso acumulativo durante `kj_run` mostrando que stages estan completadas, en ejecucion o pendientes — tanto en CLI como via eventos MCP para renderizado en tiempo real en el host
45
+ - **Guardarrailes de observabilidad del planner** — telemetria continua de heartbeat/stall, proteccion configurable por silencio maximo (`session.max_agent_silence_minutes`) y limite duro de ejecucion (`session.max_planner_minutes`) para evitar bloqueos prolongados en `kj_plan`/planner
45
46
  - **Integracion con Planning Game** — combina opcionalmente con [Planning Game](https://github.com/AgenteIA-Geniova/planning-game) para gestion agil de proyectos (tareas, sprints, estimacion) — como Jira, pero open-source y nativo XP
46
47
 
47
48
  > **Mejor con MCP** — Karajan Code esta disenado para usarse como servidor MCP dentro de tu agente de IA (Claude, Codex, etc.). El agente envia tareas a `kj_run`, recibe notificaciones de progreso en tiempo real, y obtiene resultados estructurados — sin copiar y pegar.
@@ -201,7 +202,7 @@ Tras `npm install -g karajan-code`, el servidor MCP se auto-registra en las conf
201
202
  | `kj_roles` | Listar roles o mostrar templates |
202
203
  | `kj_code` | Modo solo coder |
203
204
  | `kj_review` | Modo solo reviewer |
204
- | `kj_plan` | Generar plan de implementacion |
205
+ | `kj_plan` | Generar plan de implementacion con telemetria heartbeat/stall y diagnostico mas claro |
205
206
 
206
207
  ### MCPs complementarios recomendados
207
208
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "karajan-code",
3
- "version": "1.9.1",
3
+ "version": "1.9.4",
4
4
  "description": "Local multi-agent coding orchestrator with TDD, SonarQube, and code review pipeline",
5
5
  "type": "module",
6
6
  "license": "AGPL-3.0",
@@ -8,7 +8,11 @@ export class AiderAgent extends BaseAgent {
8
8
  const args = ["--yes", "--message", task.prompt];
9
9
  const model = this.getRoleModel(role);
10
10
  if (model) args.push("--model", model);
11
- const res = await runCommand(resolveBin("aider"), args, { onOutput: task.onOutput });
11
+ const res = await runCommand(resolveBin("aider"), args, {
12
+ onOutput: task.onOutput,
13
+ silenceTimeoutMs: task.silenceTimeoutMs,
14
+ timeout: task.timeoutMs
15
+ });
12
16
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
13
17
  }
14
18
 
@@ -17,7 +21,11 @@ export class AiderAgent extends BaseAgent {
17
21
  const args = ["--yes", "--message", task.prompt];
18
22
  const model = this.getRoleModel(role);
19
23
  if (model) args.push("--model", model);
20
- const res = await runCommand(resolveBin("aider"), args, { onOutput: task.onOutput });
24
+ const res = await runCommand(resolveBin("aider"), args, {
25
+ onOutput: task.onOutput,
26
+ silenceTimeoutMs: task.silenceTimeoutMs,
27
+ timeout: task.timeoutMs
28
+ });
21
29
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
22
30
  }
23
31
  }
@@ -81,7 +81,11 @@ export class ClaudeAgent extends BaseAgent {
81
81
  if (task.onOutput) {
82
82
  args.push("--output-format", "stream-json");
83
83
  const streamFilter = createStreamJsonFilter(task.onOutput);
84
- const res = await runCommand(resolveBin("claude"), args, { onOutput: streamFilter });
84
+ const res = await runCommand(resolveBin("claude"), args, {
85
+ onOutput: streamFilter,
86
+ silenceTimeoutMs: task.silenceTimeoutMs,
87
+ timeout: task.timeoutMs
88
+ });
85
89
  const output = extractTextFromStreamJson(res.stdout);
86
90
  return { ok: res.exitCode === 0, output, error: res.stderr, exitCode: res.exitCode };
87
91
  }
@@ -94,7 +98,11 @@ export class ClaudeAgent extends BaseAgent {
94
98
  const args = ["-p", task.prompt, "--output-format", "json"];
95
99
  const model = this.getRoleModel(task.role || "reviewer");
96
100
  if (model) args.push("--model", model);
97
- const res = await runCommand(resolveBin("claude"), args, { onOutput: task.onOutput });
101
+ const res = await runCommand(resolveBin("claude"), args, {
102
+ onOutput: task.onOutput,
103
+ silenceTimeoutMs: task.silenceTimeoutMs,
104
+ timeout: task.timeoutMs
105
+ });
98
106
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
99
107
  }
100
108
  }
@@ -9,8 +9,13 @@ export class CodexAgent extends BaseAgent {
9
9
  const model = this.getRoleModel(role);
10
10
  if (model) args.push("--model", model);
11
11
  if (this.isAutoApproveEnabled(role)) args.push("--full-auto");
12
- args.push(task.prompt);
13
- const res = await runCommand(resolveBin("codex"), args, { onOutput: task.onOutput });
12
+ args.push("-");
13
+ const res = await runCommand(resolveBin("codex"), args, {
14
+ onOutput: task.onOutput,
15
+ silenceTimeoutMs: task.silenceTimeoutMs,
16
+ timeout: task.timeoutMs,
17
+ input: task.prompt
18
+ });
14
19
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
15
20
  }
16
21
 
@@ -18,8 +23,13 @@ export class CodexAgent extends BaseAgent {
18
23
  const args = ["exec"];
19
24
  const model = this.getRoleModel(task.role || "reviewer");
20
25
  if (model) args.push("--model", model);
21
- args.push(task.prompt);
22
- const res = await runCommand(resolveBin("codex"), args, { onOutput: task.onOutput });
26
+ args.push("-");
27
+ const res = await runCommand(resolveBin("codex"), args, {
28
+ onOutput: task.onOutput,
29
+ silenceTimeoutMs: task.silenceTimeoutMs,
30
+ timeout: task.timeoutMs,
31
+ input: task.prompt
32
+ });
23
33
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
24
34
  }
25
35
  }
@@ -8,7 +8,11 @@ export class GeminiAgent extends BaseAgent {
8
8
  const args = ["-p", task.prompt];
9
9
  const model = this.getRoleModel(role);
10
10
  if (model) args.push("--model", model);
11
- const res = await runCommand(resolveBin("gemini"), args, { onOutput: task.onOutput });
11
+ const res = await runCommand(resolveBin("gemini"), args, {
12
+ onOutput: task.onOutput,
13
+ silenceTimeoutMs: task.silenceTimeoutMs,
14
+ timeout: task.timeoutMs
15
+ });
12
16
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
13
17
  }
14
18
 
@@ -17,7 +21,11 @@ export class GeminiAgent extends BaseAgent {
17
21
  const args = ["-p", task.prompt, "--output-format", "json"];
18
22
  const model = this.getRoleModel(role);
19
23
  if (model) args.push("--model", model);
20
- const res = await runCommand(resolveBin("gemini"), args, { onOutput: task.onOutput });
24
+ const res = await runCommand(resolveBin("gemini"), args, {
25
+ onOutput: task.onOutput,
26
+ silenceTimeoutMs: task.silenceTimeoutMs,
27
+ timeout: task.timeoutMs
28
+ });
21
29
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
22
30
  }
23
31
  }
@@ -46,7 +46,13 @@ export async function planCommand({ task, config, logger, json, context }) {
46
46
 
47
47
  const planner = createAgent(plannerRole.provider, config, logger);
48
48
  const prompt = buildPlannerPrompt({ task, context });
49
- const result = await planner.runTask({ prompt, role: "planner" });
49
+ const silenceTimeoutMs = Number(config?.session?.max_agent_silence_minutes) > 0
50
+ ? Math.round(Number(config.session.max_agent_silence_minutes) * 60 * 1000)
51
+ : undefined;
52
+ const timeoutMs = Number(config?.session?.max_planner_minutes) > 0
53
+ ? Math.round(Number(config.session.max_planner_minutes) * 60 * 1000)
54
+ : undefined;
55
+ const result = await planner.runTask({ prompt, role: "planner", silenceTimeoutMs, timeoutMs });
50
56
 
51
57
  if (!result.ok) {
52
58
  throw new Error(result.error || result.output || "Planner failed");
package/src/config.js CHANGED
@@ -114,7 +114,9 @@ const DEFAULTS = {
114
114
  session: {
115
115
  max_iteration_minutes: 30,
116
116
  max_total_minutes: 120,
117
+ max_planner_minutes: 60,
117
118
  checkpoint_interval_minutes: 5,
119
+ max_agent_silence_minutes: 20,
118
120
  fail_fast_repeats: 2,
119
121
  repeat_detection_threshold: 2,
120
122
  max_sonar_retries: 3,
@@ -21,6 +21,7 @@ import { parseMaybeJsonString } from "../review/parser.js";
21
21
  import { computeBaseRef, generateDiff } from "../review/diff-generator.js";
22
22
  import { resolveReviewProfile } from "../review/profiles.js";
23
23
  import { createRunLog, readRunLog } from "../utils/run-log.js";
24
+ import { currentBranch } from "../utils/git.js";
24
25
 
25
26
  /**
26
27
  * Resolve the user's project directory via MCP roots.
@@ -62,6 +63,18 @@ export function classifyError(error) {
62
63
  const msg = error?.message || String(error);
63
64
  const lower = msg.toLowerCase();
64
65
 
66
+ if (
67
+ lower.includes("without output")
68
+ || lower.includes("silent for")
69
+ || lower.includes("unresponsive")
70
+ || lower.includes("exceeded max silence")
71
+ ) {
72
+ return {
73
+ category: "agent_stall",
74
+ suggestion: "Agent output stalled. Check live details with kj_status, then retry with a smaller prompt or increase session.max_agent_silence_minutes if needed."
75
+ };
76
+ }
77
+
65
78
  if (lower.includes("sonar") && (lower.includes("connect") || lower.includes("econnrefused") || lower.includes("not available") || lower.includes("not running"))) {
66
79
  return {
67
80
  category: "sonar_unavailable",
@@ -97,6 +110,13 @@ export function classifyError(error) {
97
110
  };
98
111
  }
99
112
 
113
+ if (lower.includes("you are on the base branch")) {
114
+ return {
115
+ category: "branch_error",
116
+ suggestion: "Create a feature branch before running Karajan. Use 'git checkout -b feat/<task-description>' and then retry. Do NOT run kj_code directly on the base branch."
117
+ };
118
+ }
119
+
100
120
  if (lower.includes("not a git repository")) {
101
121
  return {
102
122
  category: "git_error",
@@ -107,6 +127,23 @@ export function classifyError(error) {
107
127
  return { category: "unknown", suggestion: null };
108
128
  }
109
129
 
130
+ export async function assertNotOnBaseBranch(config) {
131
+ const baseBranch = config?.base_branch || "main";
132
+ let branch;
133
+ try {
134
+ branch = await currentBranch();
135
+ } catch {
136
+ return; // not a git repo or detached HEAD — let downstream handle it
137
+ }
138
+ if (branch === baseBranch) {
139
+ throw new Error(
140
+ `You are on the base branch '${baseBranch}'. Karajan needs a feature branch to compute the diff for review. ` +
141
+ `Create a new branch first (e.g. 'git checkout -b feat/<task-description>') and then run this command again. ` +
142
+ `Do NOT run kj_code directly — create the branch first so the full pipeline (code + review) works correctly.`
143
+ );
144
+ }
145
+ }
146
+
110
147
  export function enrichedFailPayload(error, toolName) {
111
148
  const msg = error?.message || String(error);
112
149
  const { category, suggestion } = classifyError(error);
@@ -149,6 +186,7 @@ export function buildAskQuestion(server) {
149
186
 
150
187
  export async function handleRunDirect(a, server, extra) {
151
188
  const config = await buildConfig(a);
189
+ await assertNotOnBaseBranch(config);
152
190
  const logger = createLogger(config.output.log_level, "mcp");
153
191
 
154
192
  const requiredProviders = [
@@ -235,7 +273,15 @@ export async function handlePlanDirect(a, server, extra) {
235
273
 
236
274
  const projectDir = await resolveProjectDir(server);
237
275
  const runLog = createRunLog(projectDir);
238
- runLog.logText(`[kj_plan] started provider=${plannerRole.provider}`);
276
+ const silenceTimeoutMs = Number(config?.session?.max_agent_silence_minutes) > 0
277
+ ? Math.round(Number(config.session.max_agent_silence_minutes) * 60 * 1000)
278
+ : undefined;
279
+ const plannerTimeoutMs = Number(config?.session?.max_planner_minutes) > 0
280
+ ? Math.round(Number(config.session.max_planner_minutes) * 60 * 1000)
281
+ : undefined;
282
+ runLog.logText(
283
+ `[kj_plan] started — provider=${plannerRole.provider}, max_silence=${silenceTimeoutMs ? `${Math.round(silenceTimeoutMs / 1000)}s` : "disabled"}, max_runtime=${plannerTimeoutMs ? `${Math.round(plannerTimeoutMs / 1000)}s` : "disabled"}`
284
+ );
239
285
  const emitter = buildDirectEmitter(server, runLog);
240
286
  const eventBase = { sessionId: null, iteration: 0, startedAt: Date.now() };
241
287
  const onOutput = ({ stream, line }) => {
@@ -250,18 +296,31 @@ export async function handlePlanDirect(a, server, extra) {
250
296
  sendTrackerLog(server, "planner", "running", plannerRole.provider);
251
297
  runLog.logText(`[planner] agent launched, waiting for response...`);
252
298
  let result;
299
+ let plannerStats = null;
253
300
  try {
254
- result = await planner.runTask({ prompt, role: "planner", onOutput: stallDetector.onOutput });
301
+ result = await planner.runTask({
302
+ prompt,
303
+ role: "planner",
304
+ onOutput: stallDetector.onOutput,
305
+ silenceTimeoutMs,
306
+ timeoutMs: plannerTimeoutMs
307
+ });
255
308
  } finally {
256
309
  stallDetector.stop();
257
- const stats = stallDetector.stats();
258
- runLog.logText(`[planner] finished — lines=${stats.lineCount}, bytes=${stats.bytesReceived}, elapsed=${Math.round(stats.elapsedMs / 1000)}s`);
310
+ plannerStats = stallDetector.stats();
311
+ runLog.logText(
312
+ `[planner] finished — lines=${plannerStats.lineCount}, bytes=${plannerStats.bytesReceived}, elapsed=${Math.round(plannerStats.elapsedMs / 1000)}s`
313
+ );
259
314
  runLog.close();
260
315
  }
261
316
 
262
317
  if (!result.ok) {
263
318
  sendTrackerLog(server, "planner", "failed");
264
- throw new Error(result.error || result.output || "Planner failed");
319
+ const baseError = result.error || result.output || "Planner failed";
320
+ const statsSuffix = plannerStats
321
+ ? ` [lines=${plannerStats.lineCount}, bytes=${plannerStats.bytesReceived}, elapsed=${Math.round(plannerStats.elapsedMs / 1000)}s]`
322
+ : "";
323
+ throw new Error(`${baseError}${statsSuffix}`);
265
324
  }
266
325
 
267
326
  sendTrackerLog(server, "planner", "done");
@@ -271,6 +330,7 @@ export async function handlePlanDirect(a, server, extra) {
271
330
 
272
331
  export async function handleCodeDirect(a, server, extra) {
273
332
  const config = await buildConfig(a, "code");
333
+ await assertNotOnBaseBranch(config);
274
334
  const logger = createLogger(config.output.log_level, "mcp");
275
335
 
276
336
  const coderRole = resolveRole(config, "coder");
@@ -319,6 +379,7 @@ export async function handleCodeDirect(a, server, extra) {
319
379
 
320
380
  export async function handleReviewDirect(a, server, extra) {
321
381
  const config = await buildConfig(a, "review");
382
+ await assertNotOnBaseBranch(config);
322
383
  const logger = createLogger(config.output.log_level, "mcp");
323
384
 
324
385
  const reviewerRole = resolveRole(config, "reviewer");
@@ -9,6 +9,18 @@ function resolveProvider(config) {
9
9
  );
10
10
  }
11
11
 
12
+ function resolvePlannerSilenceTimeoutMs(config) {
13
+ const minutes = Number(config?.session?.max_agent_silence_minutes);
14
+ if (!Number.isFinite(minutes) || minutes <= 0) return null;
15
+ return Math.round(minutes * 60 * 1000);
16
+ }
17
+
18
+ function resolvePlannerRuntimeTimeoutMs(config) {
19
+ const minutes = Number(config?.session?.max_planner_minutes);
20
+ if (!Number.isFinite(minutes) || minutes <= 0) return null;
21
+ return Math.round(minutes * 60 * 1000);
22
+ }
23
+
12
24
  function buildPrompt({ task, instructions, research, triageDecomposition }) {
13
25
  const sections = [];
14
26
 
@@ -78,6 +90,10 @@ export class PlannerRole extends BaseRole {
78
90
 
79
91
  const runArgs = { prompt, role: "planner" };
80
92
  if (onOutput) runArgs.onOutput = onOutput;
93
+ const silenceTimeoutMs = resolvePlannerSilenceTimeoutMs(this.config);
94
+ if (silenceTimeoutMs) runArgs.silenceTimeoutMs = silenceTimeoutMs;
95
+ const timeoutMs = resolvePlannerRuntimeTimeoutMs(this.config);
96
+ if (timeoutMs) runArgs.timeoutMs = timeoutMs;
81
97
  const result = await agent.runTask(runArgs);
82
98
 
83
99
  if (!result.ok) {
@@ -1,7 +1,7 @@
1
1
  import { execa } from "execa";
2
2
 
3
3
  export async function runCommand(command, args = [], options = {}) {
4
- const { timeout, onOutput, ...rest } = options;
4
+ const { timeout, onOutput, silenceTimeoutMs, partialOutputFlushMs, ...rest } = options;
5
5
  const subprocess = execa(command, args, {
6
6
  reject: false,
7
7
  ...rest
@@ -9,37 +9,98 @@ export async function runCommand(command, args = [], options = {}) {
9
9
 
10
10
  let stdoutAccum = "";
11
11
  let stderrAccum = "";
12
+ let outputSilenceTimer = null;
13
+ let silenceTimedOut = false;
14
+
15
+ function clearSilenceTimer() {
16
+ if (outputSilenceTimer) {
17
+ clearTimeout(outputSilenceTimer);
18
+ outputSilenceTimer = null;
19
+ }
20
+ }
21
+
22
+ function armSilenceTimer() {
23
+ const ms = Number(silenceTimeoutMs);
24
+ if (!Number.isFinite(ms) || ms <= 0 || silenceTimedOut) return;
25
+ clearSilenceTimer();
26
+ outputSilenceTimer = setTimeout(() => {
27
+ silenceTimedOut = true;
28
+ try {
29
+ subprocess.kill("SIGKILL", { forceKillAfterDelay: 1000 });
30
+ } catch {
31
+ // no-op
32
+ }
33
+ }, ms);
34
+ }
12
35
 
13
36
  if (subprocess.stdout) {
14
37
  subprocess.stdout.on("data", (chunk) => {
15
38
  stdoutAccum += chunk.toString();
39
+ armSilenceTimer();
16
40
  });
17
41
  }
18
42
  if (subprocess.stderr) {
19
43
  subprocess.stderr.on("data", (chunk) => {
20
44
  stderrAccum += chunk.toString();
45
+ armSilenceTimer();
21
46
  });
22
47
  }
23
48
 
49
+ let flushInterval = null;
24
50
  if (onOutput) {
25
- const handler = (stream) => {
26
- let partial = "";
51
+ const flushMs = Number(partialOutputFlushMs) > 0 ? Number(partialOutputFlushMs) : 2000;
52
+ const streams = {};
53
+ const makeHandler = (stream) => {
54
+ const state = { partial: "", dirty: false };
55
+ streams[stream] = state;
27
56
  return (chunk) => {
28
- partial += chunk.toString();
29
- const lines = partial.split("\n");
30
- partial = lines.pop();
57
+ state.partial += chunk.toString();
58
+ const lines = state.partial.split(/\r\n|\n|\r/);
59
+ state.partial = lines.pop() ?? "";
60
+ state.dirty = state.partial.length > 0;
31
61
  for (const line of lines) {
32
62
  if (line) onOutput({ stream, line });
33
63
  }
34
64
  };
35
65
  };
36
- if (subprocess.stdout) subprocess.stdout.on("data", handler("stdout"));
37
- if (subprocess.stderr) subprocess.stderr.on("data", handler("stderr"));
66
+
67
+ const flushPartials = () => {
68
+ for (const [stream, state] of Object.entries(streams)) {
69
+ if (!state.dirty || !state.partial) continue;
70
+ onOutput({ stream, line: state.partial });
71
+ state.partial = "";
72
+ state.dirty = false;
73
+ }
74
+ };
75
+
76
+ if (subprocess.stdout) subprocess.stdout.on("data", makeHandler("stdout"));
77
+ if (subprocess.stderr) subprocess.stderr.on("data", makeHandler("stderr"));
78
+ flushInterval = setInterval(flushPartials, flushMs);
79
+ flushInterval.unref?.();
80
+
81
+ subprocess.finally(() => {
82
+ flushPartials();
83
+ if (flushInterval) {
84
+ clearInterval(flushInterval);
85
+ flushInterval = null;
86
+ }
87
+ });
38
88
  }
89
+ armSilenceTimer();
39
90
 
40
91
  try {
41
92
  if (!timeout) {
42
93
  const result = await subprocess;
94
+ clearSilenceTimer();
95
+ if (silenceTimedOut) {
96
+ return {
97
+ exitCode: 143,
98
+ stdout: stdoutAccum,
99
+ stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
100
+ timedOut: true,
101
+ signal: "SIGKILL"
102
+ };
103
+ }
43
104
  return enrichResult(result, stdoutAccum, stderrAccum);
44
105
  }
45
106
 
@@ -63,8 +124,28 @@ export async function runCommand(command, args = [], options = {}) {
63
124
 
64
125
  const result = await Promise.race([subprocess, timeoutResult]);
65
126
  if (timer) clearTimeout(timer);
127
+ clearSilenceTimer();
128
+ if (silenceTimedOut) {
129
+ return {
130
+ exitCode: 143,
131
+ stdout: stdoutAccum,
132
+ stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
133
+ timedOut: true,
134
+ signal: "SIGKILL"
135
+ };
136
+ }
66
137
  return enrichResult(result, stdoutAccum, stderrAccum);
67
138
  } catch (error) {
139
+ clearSilenceTimer();
140
+ if (silenceTimedOut) {
141
+ return {
142
+ exitCode: 143,
143
+ stdout: error?.stdout || stdoutAccum,
144
+ stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
145
+ timedOut: true,
146
+ signal: error?.signal || "SIGKILL"
147
+ };
148
+ }
68
149
  const details = [
69
150
  error?.shortMessage,
70
151
  error?.originalMessage,
@@ -15,6 +15,7 @@ import { emitProgress, makeEvent } from "./events.js";
15
15
  const DEFAULT_HEARTBEAT_INTERVAL_MS = 30_000; // heartbeat every 30s
16
16
  const DEFAULT_STALL_TIMEOUT_MS = 120_000; // warn after 2min silence
17
17
  const DEFAULT_CRITICAL_TIMEOUT_MS = 300_000; // critical after 5min silence
18
+ const DEFAULT_STALL_REPEAT_MS = 60_000; // repeat stall notices every 60s
18
19
 
19
20
  export function createStallDetector({
20
21
  onOutput,
@@ -24,23 +25,30 @@ export function createStallDetector({
24
25
  provider,
25
26
  heartbeatIntervalMs = DEFAULT_HEARTBEAT_INTERVAL_MS,
26
27
  stallTimeoutMs = DEFAULT_STALL_TIMEOUT_MS,
27
- criticalTimeoutMs = DEFAULT_CRITICAL_TIMEOUT_MS
28
+ criticalTimeoutMs = DEFAULT_CRITICAL_TIMEOUT_MS,
29
+ stallRepeatMs = DEFAULT_STALL_REPEAT_MS,
30
+ maxSilenceMs = null,
31
+ onMaxSilence = null
28
32
  }) {
29
33
  let lastActivityAt = Date.now();
30
34
  let lineCount = 0;
31
35
  let bytesReceived = 0;
32
- let stallWarned = false;
33
- let criticalWarned = false;
34
36
  let heartbeatTimer = null;
35
37
  const startedAt = Date.now();
38
+ let lastStallWarnAt = 0;
39
+ let lastCriticalWarnAt = 0;
40
+ let maxSilenceTriggered = false;
36
41
 
37
42
  function emitHeartbeat() {
38
43
  const now = Date.now();
39
44
  const silenceMs = now - lastActivityAt;
40
45
  const elapsedMs = now - startedAt;
46
+ const shouldWarn = silenceMs >= stallTimeoutMs;
47
+ const shouldCritical = silenceMs >= criticalTimeoutMs;
48
+ const repeatWindow = Math.max(1000, Number(stallRepeatMs) || DEFAULT_STALL_REPEAT_MS);
41
49
 
42
- if (silenceMs >= criticalTimeoutMs && !criticalWarned) {
43
- criticalWarned = true;
50
+ if (shouldCritical && (now - lastCriticalWarnAt >= repeatWindow)) {
51
+ lastCriticalWarnAt = now;
44
52
  emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
45
53
  status: "critical",
46
54
  message: `Agent ${provider} unresponsive for ${Math.round(silenceMs / 1000)}s — may be hung`,
@@ -53,8 +61,8 @@ export function createStallDetector({
53
61
  severity: "critical"
54
62
  }
55
63
  }));
56
- } else if (silenceMs >= stallTimeoutMs && !stallWarned) {
57
- stallWarned = true;
64
+ } else if (shouldWarn && (now - lastStallWarnAt >= repeatWindow)) {
65
+ lastStallWarnAt = now;
58
66
  emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
59
67
  status: "warning",
60
68
  message: `Agent ${provider} silent for ${Math.round(silenceMs / 1000)}s — still waiting`,
@@ -67,20 +75,49 @@ export function createStallDetector({
67
75
  severity: "warning"
68
76
  }
69
77
  }));
70
- } else if (silenceMs < stallTimeoutMs) {
71
- // Reset warning flags when activity resumes
72
- stallWarned = false;
73
- criticalWarned = false;
78
+ }
79
+
80
+ emitProgress(emitter, makeEvent("agent:heartbeat", { ...eventBase, stage }, {
81
+ message: silenceMs < stallTimeoutMs
82
+ ? `Agent ${provider} active — ${lineCount} lines, ${Math.round(elapsedMs / 1000)}s elapsed`
83
+ : `Agent ${provider} waiting — silent ${Math.round(silenceMs / 1000)}s, ${Math.round(elapsedMs / 1000)}s elapsed`,
84
+ detail: {
85
+ provider,
86
+ elapsedMs,
87
+ silenceMs,
88
+ lineCount,
89
+ bytesReceived,
90
+ status: silenceMs < stallTimeoutMs ? "active" : "waiting"
91
+ }
92
+ }));
74
93
 
75
- emitProgress(emitter, makeEvent("agent:heartbeat", { ...eventBase, stage }, {
76
- message: `Agent ${provider} active ${lineCount} lines, ${Math.round(elapsedMs / 1000)}s elapsed`,
94
+ const hardLimit = Number(maxSilenceMs);
95
+ if (!maxSilenceTriggered && Number.isFinite(hardLimit) && hardLimit > 0 && silenceMs >= hardLimit) {
96
+ maxSilenceTriggered = true;
97
+ emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
98
+ status: "fail",
99
+ message: `Agent ${provider} exceeded max silence (${Math.round(hardLimit / 1000)}s)`,
77
100
  detail: {
78
101
  provider,
102
+ silenceMs,
79
103
  elapsedMs,
80
104
  lineCount,
81
- bytesReceived
105
+ bytesReceived,
106
+ severity: "fatal",
107
+ maxSilenceMs: hardLimit
82
108
  }
83
109
  }));
110
+ if (typeof onMaxSilence === "function") {
111
+ onMaxSilence({
112
+ provider,
113
+ stage,
114
+ silenceMs,
115
+ elapsedMs,
116
+ lineCount,
117
+ bytesReceived,
118
+ maxSilenceMs: hardLimit
119
+ });
120
+ }
84
121
  }
85
122
  }
86
123
 
@@ -92,10 +129,6 @@ export function createStallDetector({
92
129
  lineCount++;
93
130
  bytesReceived += data.line?.length || 0;
94
131
 
95
- // Reset stall flags on new activity
96
- stallWarned = false;
97
- criticalWarned = false;
98
-
99
132
  // Forward to the original callback
100
133
  if (onOutput) {
101
134
  onOutput(data);