karajan-code 1.9.0 → 1.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -43,6 +43,7 @@ Instead of running one AI agent and manually reviewing its output, `kj` chains a
43
43
  - **Task decomposition** — triage detects when tasks should be split and recommends subtasks; with Planning Game integration, creates linked cards with sequential blocking
44
44
  - **Retry with backoff** — automatic recovery from transient API errors (429, 5xx) with exponential backoff and jitter
45
45
  - **Pipeline stage tracker** — cumulative progress view during `kj_run` showing which stages are done, running, or pending — both in CLI and via MCP events for real-time host rendering
46
+ - **Planner observability guardrails** — continuous heartbeat/stall telemetry, configurable max-silence protection (`session.max_agent_silence_minutes`), and hard runtime cap (`session.max_planner_minutes`) to avoid long stuck planner runs
46
47
  - **Planning Game integration** — optionally pair with [Planning Game](https://github.com/AgenteIA-Geniova/planning-game) for agile project management (tasks, sprints, estimation) — like Jira, but open-source and XP-native
47
48
 
48
49
  > **Best with MCP** — Karajan Code is designed to be used as an MCP server inside your AI agent (Claude, Codex, etc.). The agent sends tasks to `kj_run`, gets real-time progress notifications, and receives structured results — no copy-pasting needed.
@@ -417,7 +418,7 @@ After `npm install -g karajan-code`, the MCP server is auto-registered in Claude
417
418
  | `kj_roles` | List roles or show role templates |
418
419
  | `kj_code` | Run coder-only mode |
419
420
  | `kj_review` | Run reviewer-only mode |
420
- | `kj_plan` | Generate implementation plan |
421
+ | `kj_plan` | Generate implementation plan with heartbeat/stall telemetry and clearer diagnostics |
421
422
 
422
423
  ### Recommended Companion MCPs
423
424
 
package/docs/README.es.md CHANGED
@@ -42,6 +42,7 @@ En lugar de ejecutar un agente de IA y revisar manualmente su output, `kj` encad
42
42
  - **Descomposicion de tareas** — triage detecta cuando una tarea debe dividirse y recomienda subtareas; con integracion Planning Game, crea cards vinculadas con bloqueo secuencial
43
43
  - **Retry con backoff** — recuperacion automatica ante errores transitorios de API (429, 5xx) con backoff exponencial y jitter
44
44
  - **Pipeline stage tracker** — vista de progreso acumulativo durante `kj_run` mostrando que stages estan completadas, en ejecucion o pendientes — tanto en CLI como via eventos MCP para renderizado en tiempo real en el host
45
+ - **Guardarrailes de observabilidad del planner** — telemetria continua de heartbeat/stall, proteccion configurable por silencio maximo (`session.max_agent_silence_minutes`) y limite duro de ejecucion (`session.max_planner_minutes`) para evitar bloqueos prolongados en `kj_plan`/planner
45
46
  - **Integracion con Planning Game** — combina opcionalmente con [Planning Game](https://github.com/AgenteIA-Geniova/planning-game) para gestion agil de proyectos (tareas, sprints, estimacion) — como Jira, pero open-source y nativo XP
46
47
 
47
48
  > **Mejor con MCP** — Karajan Code esta disenado para usarse como servidor MCP dentro de tu agente de IA (Claude, Codex, etc.). El agente envia tareas a `kj_run`, recibe notificaciones de progreso en tiempo real, y obtiene resultados estructurados — sin copiar y pegar.
@@ -201,7 +202,7 @@ Tras `npm install -g karajan-code`, el servidor MCP se auto-registra en las conf
201
202
  | `kj_roles` | Listar roles o mostrar templates |
202
203
  | `kj_code` | Modo solo coder |
203
204
  | `kj_review` | Modo solo reviewer |
204
- | `kj_plan` | Generar plan de implementacion |
205
+ | `kj_plan` | Generar plan de implementacion con telemetria heartbeat/stall y diagnostico mas claro |
205
206
 
206
207
  ### MCPs complementarios recomendados
207
208
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "karajan-code",
3
- "version": "1.9.0",
3
+ "version": "1.9.3",
4
4
  "description": "Local multi-agent coding orchestrator with TDD, SonarQube, and code review pipeline",
5
5
  "type": "module",
6
6
  "license": "AGPL-3.0",
@@ -8,7 +8,11 @@ export class AiderAgent extends BaseAgent {
8
8
  const args = ["--yes", "--message", task.prompt];
9
9
  const model = this.getRoleModel(role);
10
10
  if (model) args.push("--model", model);
11
- const res = await runCommand(resolveBin("aider"), args, { onOutput: task.onOutput });
11
+ const res = await runCommand(resolveBin("aider"), args, {
12
+ onOutput: task.onOutput,
13
+ silenceTimeoutMs: task.silenceTimeoutMs,
14
+ timeout: task.timeoutMs
15
+ });
12
16
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
13
17
  }
14
18
 
@@ -17,7 +21,11 @@ export class AiderAgent extends BaseAgent {
17
21
  const args = ["--yes", "--message", task.prompt];
18
22
  const model = this.getRoleModel(role);
19
23
  if (model) args.push("--model", model);
20
- const res = await runCommand(resolveBin("aider"), args, { onOutput: task.onOutput });
24
+ const res = await runCommand(resolveBin("aider"), args, {
25
+ onOutput: task.onOutput,
26
+ silenceTimeoutMs: task.silenceTimeoutMs,
27
+ timeout: task.timeoutMs
28
+ });
21
29
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
22
30
  }
23
31
  }
@@ -81,7 +81,11 @@ export class ClaudeAgent extends BaseAgent {
81
81
  if (task.onOutput) {
82
82
  args.push("--output-format", "stream-json");
83
83
  const streamFilter = createStreamJsonFilter(task.onOutput);
84
- const res = await runCommand(resolveBin("claude"), args, { onOutput: streamFilter });
84
+ const res = await runCommand(resolveBin("claude"), args, {
85
+ onOutput: streamFilter,
86
+ silenceTimeoutMs: task.silenceTimeoutMs,
87
+ timeout: task.timeoutMs
88
+ });
85
89
  const output = extractTextFromStreamJson(res.stdout);
86
90
  return { ok: res.exitCode === 0, output, error: res.stderr, exitCode: res.exitCode };
87
91
  }
@@ -94,7 +98,11 @@ export class ClaudeAgent extends BaseAgent {
94
98
  const args = ["-p", task.prompt, "--output-format", "json"];
95
99
  const model = this.getRoleModel(task.role || "reviewer");
96
100
  if (model) args.push("--model", model);
97
- const res = await runCommand(resolveBin("claude"), args, { onOutput: task.onOutput });
101
+ const res = await runCommand(resolveBin("claude"), args, {
102
+ onOutput: task.onOutput,
103
+ silenceTimeoutMs: task.silenceTimeoutMs,
104
+ timeout: task.timeoutMs
105
+ });
98
106
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
99
107
  }
100
108
  }
@@ -9,8 +9,13 @@ export class CodexAgent extends BaseAgent {
9
9
  const model = this.getRoleModel(role);
10
10
  if (model) args.push("--model", model);
11
11
  if (this.isAutoApproveEnabled(role)) args.push("--full-auto");
12
- args.push(task.prompt);
13
- const res = await runCommand(resolveBin("codex"), args, { onOutput: task.onOutput });
12
+ args.push("-");
13
+ const res = await runCommand(resolveBin("codex"), args, {
14
+ onOutput: task.onOutput,
15
+ silenceTimeoutMs: task.silenceTimeoutMs,
16
+ timeout: task.timeoutMs,
17
+ input: task.prompt
18
+ });
14
19
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
15
20
  }
16
21
 
@@ -18,8 +23,13 @@ export class CodexAgent extends BaseAgent {
18
23
  const args = ["exec"];
19
24
  const model = this.getRoleModel(task.role || "reviewer");
20
25
  if (model) args.push("--model", model);
21
- args.push(task.prompt);
22
- const res = await runCommand(resolveBin("codex"), args, { onOutput: task.onOutput });
26
+ args.push("-");
27
+ const res = await runCommand(resolveBin("codex"), args, {
28
+ onOutput: task.onOutput,
29
+ silenceTimeoutMs: task.silenceTimeoutMs,
30
+ timeout: task.timeoutMs,
31
+ input: task.prompt
32
+ });
23
33
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
24
34
  }
25
35
  }
@@ -8,7 +8,11 @@ export class GeminiAgent extends BaseAgent {
8
8
  const args = ["-p", task.prompt];
9
9
  const model = this.getRoleModel(role);
10
10
  if (model) args.push("--model", model);
11
- const res = await runCommand(resolveBin("gemini"), args, { onOutput: task.onOutput });
11
+ const res = await runCommand(resolveBin("gemini"), args, {
12
+ onOutput: task.onOutput,
13
+ silenceTimeoutMs: task.silenceTimeoutMs,
14
+ timeout: task.timeoutMs
15
+ });
12
16
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
13
17
  }
14
18
 
@@ -17,7 +21,11 @@ export class GeminiAgent extends BaseAgent {
17
21
  const args = ["-p", task.prompt, "--output-format", "json"];
18
22
  const model = this.getRoleModel(role);
19
23
  if (model) args.push("--model", model);
20
- const res = await runCommand(resolveBin("gemini"), args, { onOutput: task.onOutput });
24
+ const res = await runCommand(resolveBin("gemini"), args, {
25
+ onOutput: task.onOutput,
26
+ silenceTimeoutMs: task.silenceTimeoutMs,
27
+ timeout: task.timeoutMs
28
+ });
21
29
  return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
22
30
  }
23
31
  }
package/src/cli.js CHANGED
@@ -187,6 +187,29 @@ program
187
187
  });
188
188
  });
189
189
 
190
+ program
191
+ .command("update")
192
+ .description("Update karajan-code to the latest version from npm")
193
+ .action(async () => {
194
+ const { execaCommand } = await import("execa");
195
+ console.log(`Current version: ${PKG_VERSION}`);
196
+ console.log("Checking for updates...");
197
+ try {
198
+ const { stdout } = await execaCommand("npm view karajan-code version");
199
+ const latest = stdout.trim();
200
+ if (latest === PKG_VERSION) {
201
+ console.log(`Already on the latest version (${PKG_VERSION}).`);
202
+ return;
203
+ }
204
+ console.log(`Updating ${PKG_VERSION} → ${latest}...`);
205
+ await execaCommand("npm install -g karajan-code@latest", { stdio: "inherit" });
206
+ console.log(`Updated to ${latest}. Restart Claude to pick up the new MCP server.`);
207
+ } catch (err) {
208
+ console.error(`Update failed: ${err.message}`);
209
+ process.exit(1);
210
+ }
211
+ });
212
+
190
213
  const sonar = program.command("sonar").description("Manage SonarQube container");
191
214
  sonar.command("status").action(async () => sonarCommand({ action: "status" }));
192
215
  sonar.command("start").action(async () => sonarCommand({ action: "start" }));
@@ -46,7 +46,13 @@ export async function planCommand({ task, config, logger, json, context }) {
46
46
 
47
47
  const planner = createAgent(plannerRole.provider, config, logger);
48
48
  const prompt = buildPlannerPrompt({ task, context });
49
- const result = await planner.runTask({ prompt, role: "planner" });
49
+ const silenceTimeoutMs = Number(config?.session?.max_agent_silence_minutes) > 0
50
+ ? Math.round(Number(config.session.max_agent_silence_minutes) * 60 * 1000)
51
+ : undefined;
52
+ const timeoutMs = Number(config?.session?.max_planner_minutes) > 0
53
+ ? Math.round(Number(config.session.max_planner_minutes) * 60 * 1000)
54
+ : undefined;
55
+ const result = await planner.runTask({ prompt, role: "planner", silenceTimeoutMs, timeoutMs });
50
56
 
51
57
  if (!result.ok) {
52
58
  throw new Error(result.error || result.output || "Planner failed");
package/src/config.js CHANGED
@@ -114,7 +114,9 @@ const DEFAULTS = {
114
114
  session: {
115
115
  max_iteration_minutes: 30,
116
116
  max_total_minutes: 120,
117
+ max_planner_minutes: 60,
117
118
  checkpoint_interval_minutes: 5,
119
+ max_agent_silence_minutes: 20,
118
120
  fail_fast_repeats: 2,
119
121
  repeat_detection_threshold: 2,
120
122
  max_sonar_retries: 3,
@@ -62,6 +62,18 @@ export function classifyError(error) {
62
62
  const msg = error?.message || String(error);
63
63
  const lower = msg.toLowerCase();
64
64
 
65
+ if (
66
+ lower.includes("without output")
67
+ || lower.includes("silent for")
68
+ || lower.includes("unresponsive")
69
+ || lower.includes("exceeded max silence")
70
+ ) {
71
+ return {
72
+ category: "agent_stall",
73
+ suggestion: "Agent output stalled. Check live details with kj_status, then retry with a smaller prompt or increase session.max_agent_silence_minutes if needed."
74
+ };
75
+ }
76
+
65
77
  if (lower.includes("sonar") && (lower.includes("connect") || lower.includes("econnrefused") || lower.includes("not available") || lower.includes("not running"))) {
66
78
  return {
67
79
  category: "sonar_unavailable",
@@ -235,7 +247,15 @@ export async function handlePlanDirect(a, server, extra) {
235
247
 
236
248
  const projectDir = await resolveProjectDir(server);
237
249
  const runLog = createRunLog(projectDir);
238
- runLog.logText(`[kj_plan] started provider=${plannerRole.provider}`);
250
+ const silenceTimeoutMs = Number(config?.session?.max_agent_silence_minutes) > 0
251
+ ? Math.round(Number(config.session.max_agent_silence_minutes) * 60 * 1000)
252
+ : undefined;
253
+ const plannerTimeoutMs = Number(config?.session?.max_planner_minutes) > 0
254
+ ? Math.round(Number(config.session.max_planner_minutes) * 60 * 1000)
255
+ : undefined;
256
+ runLog.logText(
257
+ `[kj_plan] started — provider=${plannerRole.provider}, max_silence=${silenceTimeoutMs ? `${Math.round(silenceTimeoutMs / 1000)}s` : "disabled"}, max_runtime=${plannerTimeoutMs ? `${Math.round(plannerTimeoutMs / 1000)}s` : "disabled"}`
258
+ );
239
259
  const emitter = buildDirectEmitter(server, runLog);
240
260
  const eventBase = { sessionId: null, iteration: 0, startedAt: Date.now() };
241
261
  const onOutput = ({ stream, line }) => {
@@ -250,18 +270,31 @@ export async function handlePlanDirect(a, server, extra) {
250
270
  sendTrackerLog(server, "planner", "running", plannerRole.provider);
251
271
  runLog.logText(`[planner] agent launched, waiting for response...`);
252
272
  let result;
273
+ let plannerStats = null;
253
274
  try {
254
- result = await planner.runTask({ prompt, role: "planner", onOutput: stallDetector.onOutput });
275
+ result = await planner.runTask({
276
+ prompt,
277
+ role: "planner",
278
+ onOutput: stallDetector.onOutput,
279
+ silenceTimeoutMs,
280
+ timeoutMs: plannerTimeoutMs
281
+ });
255
282
  } finally {
256
283
  stallDetector.stop();
257
- const stats = stallDetector.stats();
258
- runLog.logText(`[planner] finished — lines=${stats.lineCount}, bytes=${stats.bytesReceived}, elapsed=${Math.round(stats.elapsedMs / 1000)}s`);
284
+ plannerStats = stallDetector.stats();
285
+ runLog.logText(
286
+ `[planner] finished — lines=${plannerStats.lineCount}, bytes=${plannerStats.bytesReceived}, elapsed=${Math.round(plannerStats.elapsedMs / 1000)}s`
287
+ );
259
288
  runLog.close();
260
289
  }
261
290
 
262
291
  if (!result.ok) {
263
292
  sendTrackerLog(server, "planner", "failed");
264
- throw new Error(result.error || result.output || "Planner failed");
293
+ const baseError = result.error || result.output || "Planner failed";
294
+ const statsSuffix = plannerStats
295
+ ? ` [lines=${plannerStats.lineCount}, bytes=${plannerStats.bytesReceived}, elapsed=${Math.round(plannerStats.elapsedMs / 1000)}s]`
296
+ : "";
297
+ throw new Error(`${baseError}${statsSuffix}`);
265
298
  }
266
299
 
267
300
  sendTrackerLog(server, "planner", "done");
@@ -9,6 +9,18 @@ function resolveProvider(config) {
9
9
  );
10
10
  }
11
11
 
12
+ function resolvePlannerSilenceTimeoutMs(config) {
13
+ const minutes = Number(config?.session?.max_agent_silence_minutes);
14
+ if (!Number.isFinite(minutes) || minutes <= 0) return null;
15
+ return Math.round(minutes * 60 * 1000);
16
+ }
17
+
18
+ function resolvePlannerRuntimeTimeoutMs(config) {
19
+ const minutes = Number(config?.session?.max_planner_minutes);
20
+ if (!Number.isFinite(minutes) || minutes <= 0) return null;
21
+ return Math.round(minutes * 60 * 1000);
22
+ }
23
+
12
24
  function buildPrompt({ task, instructions, research, triageDecomposition }) {
13
25
  const sections = [];
14
26
 
@@ -78,6 +90,10 @@ export class PlannerRole extends BaseRole {
78
90
 
79
91
  const runArgs = { prompt, role: "planner" };
80
92
  if (onOutput) runArgs.onOutput = onOutput;
93
+ const silenceTimeoutMs = resolvePlannerSilenceTimeoutMs(this.config);
94
+ if (silenceTimeoutMs) runArgs.silenceTimeoutMs = silenceTimeoutMs;
95
+ const timeoutMs = resolvePlannerRuntimeTimeoutMs(this.config);
96
+ if (timeoutMs) runArgs.timeoutMs = timeoutMs;
81
97
  const result = await agent.runTask(runArgs);
82
98
 
83
99
  if (!result.ok) {
@@ -1,7 +1,7 @@
1
1
  import { execa } from "execa";
2
2
 
3
3
  export async function runCommand(command, args = [], options = {}) {
4
- const { timeout, onOutput, ...rest } = options;
4
+ const { timeout, onOutput, silenceTimeoutMs, partialOutputFlushMs, ...rest } = options;
5
5
  const subprocess = execa(command, args, {
6
6
  reject: false,
7
7
  ...rest
@@ -9,37 +9,98 @@ export async function runCommand(command, args = [], options = {}) {
9
9
 
10
10
  let stdoutAccum = "";
11
11
  let stderrAccum = "";
12
+ let outputSilenceTimer = null;
13
+ let silenceTimedOut = false;
14
+
15
+ function clearSilenceTimer() {
16
+ if (outputSilenceTimer) {
17
+ clearTimeout(outputSilenceTimer);
18
+ outputSilenceTimer = null;
19
+ }
20
+ }
21
+
22
+ function armSilenceTimer() {
23
+ const ms = Number(silenceTimeoutMs);
24
+ if (!Number.isFinite(ms) || ms <= 0 || silenceTimedOut) return;
25
+ clearSilenceTimer();
26
+ outputSilenceTimer = setTimeout(() => {
27
+ silenceTimedOut = true;
28
+ try {
29
+ subprocess.kill("SIGKILL", { forceKillAfterDelay: 1000 });
30
+ } catch {
31
+ // no-op
32
+ }
33
+ }, ms);
34
+ }
12
35
 
13
36
  if (subprocess.stdout) {
14
37
  subprocess.stdout.on("data", (chunk) => {
15
38
  stdoutAccum += chunk.toString();
39
+ armSilenceTimer();
16
40
  });
17
41
  }
18
42
  if (subprocess.stderr) {
19
43
  subprocess.stderr.on("data", (chunk) => {
20
44
  stderrAccum += chunk.toString();
45
+ armSilenceTimer();
21
46
  });
22
47
  }
23
48
 
49
+ let flushInterval = null;
24
50
  if (onOutput) {
25
- const handler = (stream) => {
26
- let partial = "";
51
+ const flushMs = Number(partialOutputFlushMs) > 0 ? Number(partialOutputFlushMs) : 2000;
52
+ const streams = {};
53
+ const makeHandler = (stream) => {
54
+ const state = { partial: "", dirty: false };
55
+ streams[stream] = state;
27
56
  return (chunk) => {
28
- partial += chunk.toString();
29
- const lines = partial.split("\n");
30
- partial = lines.pop();
57
+ state.partial += chunk.toString();
58
+ const lines = state.partial.split(/\r\n|\n|\r/);
59
+ state.partial = lines.pop() ?? "";
60
+ state.dirty = state.partial.length > 0;
31
61
  for (const line of lines) {
32
62
  if (line) onOutput({ stream, line });
33
63
  }
34
64
  };
35
65
  };
36
- if (subprocess.stdout) subprocess.stdout.on("data", handler("stdout"));
37
- if (subprocess.stderr) subprocess.stderr.on("data", handler("stderr"));
66
+
67
+ const flushPartials = () => {
68
+ for (const [stream, state] of Object.entries(streams)) {
69
+ if (!state.dirty || !state.partial) continue;
70
+ onOutput({ stream, line: state.partial });
71
+ state.partial = "";
72
+ state.dirty = false;
73
+ }
74
+ };
75
+
76
+ if (subprocess.stdout) subprocess.stdout.on("data", makeHandler("stdout"));
77
+ if (subprocess.stderr) subprocess.stderr.on("data", makeHandler("stderr"));
78
+ flushInterval = setInterval(flushPartials, flushMs);
79
+ flushInterval.unref?.();
80
+
81
+ subprocess.finally(() => {
82
+ flushPartials();
83
+ if (flushInterval) {
84
+ clearInterval(flushInterval);
85
+ flushInterval = null;
86
+ }
87
+ });
38
88
  }
89
+ armSilenceTimer();
39
90
 
40
91
  try {
41
92
  if (!timeout) {
42
93
  const result = await subprocess;
94
+ clearSilenceTimer();
95
+ if (silenceTimedOut) {
96
+ return {
97
+ exitCode: 143,
98
+ stdout: stdoutAccum,
99
+ stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
100
+ timedOut: true,
101
+ signal: "SIGKILL"
102
+ };
103
+ }
43
104
  return enrichResult(result, stdoutAccum, stderrAccum);
44
105
  }
45
106
 
@@ -63,8 +124,28 @@ export async function runCommand(command, args = [], options = {}) {
63
124
 
64
125
  const result = await Promise.race([subprocess, timeoutResult]);
65
126
  if (timer) clearTimeout(timer);
127
+ clearSilenceTimer();
128
+ if (silenceTimedOut) {
129
+ return {
130
+ exitCode: 143,
131
+ stdout: stdoutAccum,
132
+ stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
133
+ timedOut: true,
134
+ signal: "SIGKILL"
135
+ };
136
+ }
66
137
  return enrichResult(result, stdoutAccum, stderrAccum);
67
138
  } catch (error) {
139
+ clearSilenceTimer();
140
+ if (silenceTimedOut) {
141
+ return {
142
+ exitCode: 143,
143
+ stdout: error?.stdout || stdoutAccum,
144
+ stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
145
+ timedOut: true,
146
+ signal: error?.signal || "SIGKILL"
147
+ };
148
+ }
68
149
  const details = [
69
150
  error?.shortMessage,
70
151
  error?.originalMessage,
@@ -15,6 +15,7 @@ import { emitProgress, makeEvent } from "./events.js";
15
15
  const DEFAULT_HEARTBEAT_INTERVAL_MS = 30_000; // heartbeat every 30s
16
16
  const DEFAULT_STALL_TIMEOUT_MS = 120_000; // warn after 2min silence
17
17
  const DEFAULT_CRITICAL_TIMEOUT_MS = 300_000; // critical after 5min silence
18
+ const DEFAULT_STALL_REPEAT_MS = 60_000; // repeat stall notices every 60s
18
19
 
19
20
  export function createStallDetector({
20
21
  onOutput,
@@ -24,23 +25,30 @@ export function createStallDetector({
24
25
  provider,
25
26
  heartbeatIntervalMs = DEFAULT_HEARTBEAT_INTERVAL_MS,
26
27
  stallTimeoutMs = DEFAULT_STALL_TIMEOUT_MS,
27
- criticalTimeoutMs = DEFAULT_CRITICAL_TIMEOUT_MS
28
+ criticalTimeoutMs = DEFAULT_CRITICAL_TIMEOUT_MS,
29
+ stallRepeatMs = DEFAULT_STALL_REPEAT_MS,
30
+ maxSilenceMs = null,
31
+ onMaxSilence = null
28
32
  }) {
29
33
  let lastActivityAt = Date.now();
30
34
  let lineCount = 0;
31
35
  let bytesReceived = 0;
32
- let stallWarned = false;
33
- let criticalWarned = false;
34
36
  let heartbeatTimer = null;
35
37
  const startedAt = Date.now();
38
+ let lastStallWarnAt = 0;
39
+ let lastCriticalWarnAt = 0;
40
+ let maxSilenceTriggered = false;
36
41
 
37
42
  function emitHeartbeat() {
38
43
  const now = Date.now();
39
44
  const silenceMs = now - lastActivityAt;
40
45
  const elapsedMs = now - startedAt;
46
+ const shouldWarn = silenceMs >= stallTimeoutMs;
47
+ const shouldCritical = silenceMs >= criticalTimeoutMs;
48
+ const repeatWindow = Math.max(1000, Number(stallRepeatMs) || DEFAULT_STALL_REPEAT_MS);
41
49
 
42
- if (silenceMs >= criticalTimeoutMs && !criticalWarned) {
43
- criticalWarned = true;
50
+ if (shouldCritical && (now - lastCriticalWarnAt >= repeatWindow)) {
51
+ lastCriticalWarnAt = now;
44
52
  emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
45
53
  status: "critical",
46
54
  message: `Agent ${provider} unresponsive for ${Math.round(silenceMs / 1000)}s — may be hung`,
@@ -53,8 +61,8 @@ export function createStallDetector({
53
61
  severity: "critical"
54
62
  }
55
63
  }));
56
- } else if (silenceMs >= stallTimeoutMs && !stallWarned) {
57
- stallWarned = true;
64
+ } else if (shouldWarn && (now - lastStallWarnAt >= repeatWindow)) {
65
+ lastStallWarnAt = now;
58
66
  emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
59
67
  status: "warning",
60
68
  message: `Agent ${provider} silent for ${Math.round(silenceMs / 1000)}s — still waiting`,
@@ -67,20 +75,49 @@ export function createStallDetector({
67
75
  severity: "warning"
68
76
  }
69
77
  }));
70
- } else if (silenceMs < stallTimeoutMs) {
71
- // Reset warning flags when activity resumes
72
- stallWarned = false;
73
- criticalWarned = false;
78
+ }
79
+
80
+ emitProgress(emitter, makeEvent("agent:heartbeat", { ...eventBase, stage }, {
81
+ message: silenceMs < stallTimeoutMs
82
+ ? `Agent ${provider} active — ${lineCount} lines, ${Math.round(elapsedMs / 1000)}s elapsed`
83
+ : `Agent ${provider} waiting — silent ${Math.round(silenceMs / 1000)}s, ${Math.round(elapsedMs / 1000)}s elapsed`,
84
+ detail: {
85
+ provider,
86
+ elapsedMs,
87
+ silenceMs,
88
+ lineCount,
89
+ bytesReceived,
90
+ status: silenceMs < stallTimeoutMs ? "active" : "waiting"
91
+ }
92
+ }));
74
93
 
75
- emitProgress(emitter, makeEvent("agent:heartbeat", { ...eventBase, stage }, {
76
- message: `Agent ${provider} active ${lineCount} lines, ${Math.round(elapsedMs / 1000)}s elapsed`,
94
+ const hardLimit = Number(maxSilenceMs);
95
+ if (!maxSilenceTriggered && Number.isFinite(hardLimit) && hardLimit > 0 && silenceMs >= hardLimit) {
96
+ maxSilenceTriggered = true;
97
+ emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
98
+ status: "fail",
99
+ message: `Agent ${provider} exceeded max silence (${Math.round(hardLimit / 1000)}s)`,
77
100
  detail: {
78
101
  provider,
102
+ silenceMs,
79
103
  elapsedMs,
80
104
  lineCount,
81
- bytesReceived
105
+ bytesReceived,
106
+ severity: "fatal",
107
+ maxSilenceMs: hardLimit
82
108
  }
83
109
  }));
110
+ if (typeof onMaxSilence === "function") {
111
+ onMaxSilence({
112
+ provider,
113
+ stage,
114
+ silenceMs,
115
+ elapsedMs,
116
+ lineCount,
117
+ bytesReceived,
118
+ maxSilenceMs: hardLimit
119
+ });
120
+ }
84
121
  }
85
122
  }
86
123
 
@@ -92,10 +129,6 @@ export function createStallDetector({
92
129
  lineCount++;
93
130
  bytesReceived += data.line?.length || 0;
94
131
 
95
- // Reset stall flags on new activity
96
- stallWarned = false;
97
- criticalWarned = false;
98
-
99
132
  // Forward to the original callback
100
133
  if (onOutput) {
101
134
  onOutput(data);