npm - karajan-code - Versions diffs - 1.9.1 → 1.9.4 - Mend

karajan-code 1.9.1 → 1.9.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/README.md +14 -1
package/docs/README.es.md +2 -1
package/package.json +1 -1
package/src/agents/aider-agent.js +10 -2
package/src/agents/claude-agent.js +10 -2
package/src/agents/codex-agent.js +14 -4
package/src/agents/gemini-agent.js +10 -2
package/src/commands/plan.js +7 -1
package/src/config.js +2 -0
package/src/mcp/server-handlers.js +66 -5
package/src/roles/planner-role.js +16 -0
package/src/utils/process.js +89 -8
package/src/utils/stall-detector.js +51 -18

package/README.md CHANGED Viewed

@@ -43,6 +43,7 @@ Instead of running one AI agent and manually reviewing its output, `kj` chains a
 - **Task decomposition** — triage detects when tasks should be split and recommends subtasks; with Planning Game integration, creates linked cards with sequential blocking
 - **Retry with backoff** — automatic recovery from transient API errors (429, 5xx) with exponential backoff and jitter
 - **Pipeline stage tracker** — cumulative progress view during `kj_run` showing which stages are done, running, or pending — both in CLI and via MCP events for real-time host rendering
+- **Planner observability guardrails** — continuous heartbeat/stall telemetry, configurable max-silence protection (`session.max_agent_silence_minutes`), and hard runtime cap (`session.max_planner_minutes`) to avoid long stuck planner runs
 - **Planning Game integration** — optionally pair with [Planning Game](https://github.com/AgenteIA-Geniova/planning-game) for agile project management (tasks, sprints, estimation) — like Jira, but open-source and XP-native
 > **Best with MCP** — Karajan Code is designed to be used as an MCP server inside your AI agent (Claude, Codex, etc.). The agent sends tasks to `kj_run`, gets real-time progress notifications, and receives structured results — no copy-pasting needed.
@@ -417,7 +418,19 @@ After `npm install -g karajan-code`, the MCP server is auto-registered in Claude
 | `kj_roles` | List roles or show role templates |
 | `kj_code` | Run coder-only mode |
 | `kj_review` | Run reviewer-only mode |
-| `kj_plan` | Generate implementation plan |
+| `kj_plan` | Generate implementation plan with heartbeat/stall telemetry and clearer diagnostics |
+### MCP restart after version updates
+If you update Karajan Code (for example `npm install -g karajan-code` to a new version) while your MCP host session is still open, the current `karajan-mcp` process may exit and the host can show `Transport closed`.
+This is expected behavior: the MCP server detects a version mismatch and exits so the host can spawn a fresh process with the new code.
+Quick recovery:
+1. Restart your MCP host session (Claude/Codex/new terminal session).
+2. Verify the server is listed (`codex mcp list` or your host equivalent).
+3. Run a lightweight check (`kj_config`) before continuing with larger runs.
 ### Recommended Companion MCPs

package/docs/README.es.md CHANGED Viewed

@@ -42,6 +42,7 @@ En lugar de ejecutar un agente de IA y revisar manualmente su output, `kj` encad
 - **Descomposicion de tareas** — triage detecta cuando una tarea debe dividirse y recomienda subtareas; con integracion Planning Game, crea cards vinculadas con bloqueo secuencial
 - **Retry con backoff** — recuperacion automatica ante errores transitorios de API (429, 5xx) con backoff exponencial y jitter
 - **Pipeline stage tracker** — vista de progreso acumulativo durante `kj_run` mostrando que stages estan completadas, en ejecucion o pendientes — tanto en CLI como via eventos MCP para renderizado en tiempo real en el host
+- **Guardarrailes de observabilidad del planner** — telemetria continua de heartbeat/stall, proteccion configurable por silencio maximo (`session.max_agent_silence_minutes`) y limite duro de ejecucion (`session.max_planner_minutes`) para evitar bloqueos prolongados en `kj_plan`/planner
 - **Integracion con Planning Game** — combina opcionalmente con [Planning Game](https://github.com/AgenteIA-Geniova/planning-game) para gestion agil de proyectos (tareas, sprints, estimacion) — como Jira, pero open-source y nativo XP
 > **Mejor con MCP** — Karajan Code esta disenado para usarse como servidor MCP dentro de tu agente de IA (Claude, Codex, etc.). El agente envia tareas a `kj_run`, recibe notificaciones de progreso en tiempo real, y obtiene resultados estructurados — sin copiar y pegar.
@@ -201,7 +202,7 @@ Tras `npm install -g karajan-code`, el servidor MCP se auto-registra en las conf
 | `kj_roles` | Listar roles o mostrar templates |
 | `kj_code` | Modo solo coder |
 | `kj_review` | Modo solo reviewer |
-| `kj_plan` | Generar plan de implementacion |
+| `kj_plan` | Generar plan de implementacion con telemetria heartbeat/stall y diagnostico mas claro |
 ### MCPs complementarios recomendados

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "karajan-code",
-  "version": "1.9.1",
+  "version": "1.9.4",
   "description": "Local multi-agent coding orchestrator with TDD, SonarQube, and code review pipeline",
   "type": "module",
   "license": "AGPL-3.0",

package/src/agents/aider-agent.js CHANGED Viewed

@@ -8,7 +8,11 @@ export class AiderAgent extends BaseAgent {
     const args = ["--yes", "--message", task.prompt];
     const model = this.getRoleModel(role);
     if (model) args.push("--model", model);
-    const res = await runCommand(resolveBin("aider"), args, { onOutput: task.onOutput });
+    const res = await runCommand(resolveBin("aider"), args, {
+      onOutput: task.onOutput,
+      silenceTimeoutMs: task.silenceTimeoutMs,
+      timeout: task.timeoutMs
+    });
     return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
   }
@@ -17,7 +21,11 @@ export class AiderAgent extends BaseAgent {
     const args = ["--yes", "--message", task.prompt];
     const model = this.getRoleModel(role);
     if (model) args.push("--model", model);
-    const res = await runCommand(resolveBin("aider"), args, { onOutput: task.onOutput });
+    const res = await runCommand(resolveBin("aider"), args, {
+      onOutput: task.onOutput,
+      silenceTimeoutMs: task.silenceTimeoutMs,
+      timeout: task.timeoutMs
+    });
     return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
   }
 }

package/src/agents/claude-agent.js CHANGED Viewed

@@ -81,7 +81,11 @@ export class ClaudeAgent extends BaseAgent {
     if (task.onOutput) {
       args.push("--output-format", "stream-json");
       const streamFilter = createStreamJsonFilter(task.onOutput);
-      const res = await runCommand(resolveBin("claude"), args, { onOutput: streamFilter });
+      const res = await runCommand(resolveBin("claude"), args, {
+        onOutput: streamFilter,
+        silenceTimeoutMs: task.silenceTimeoutMs,
+        timeout: task.timeoutMs
+      });
       const output = extractTextFromStreamJson(res.stdout);
       return { ok: res.exitCode === 0, output, error: res.stderr, exitCode: res.exitCode };
     }
@@ -94,7 +98,11 @@ export class ClaudeAgent extends BaseAgent {
     const args = ["-p", task.prompt, "--output-format", "json"];
     const model = this.getRoleModel(task.role || "reviewer");
     if (model) args.push("--model", model);
-    const res = await runCommand(resolveBin("claude"), args, { onOutput: task.onOutput });
+    const res = await runCommand(resolveBin("claude"), args, {
+      onOutput: task.onOutput,
+      silenceTimeoutMs: task.silenceTimeoutMs,
+      timeout: task.timeoutMs
+    });
     return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
   }
 }

package/src/agents/codex-agent.js CHANGED Viewed

@@ -9,8 +9,13 @@ export class CodexAgent extends BaseAgent {
     const model = this.getRoleModel(role);
     if (model) args.push("--model", model);
     if (this.isAutoApproveEnabled(role)) args.push("--full-auto");
-    args.push(task.prompt);
-    const res = await runCommand(resolveBin("codex"), args, { onOutput: task.onOutput });
+    args.push("-");
+    const res = await runCommand(resolveBin("codex"), args, {
+      onOutput: task.onOutput,
+      silenceTimeoutMs: task.silenceTimeoutMs,
+      timeout: task.timeoutMs,
+      input: task.prompt
+    });
     return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
   }
@@ -18,8 +23,13 @@ export class CodexAgent extends BaseAgent {
     const args = ["exec"];
     const model = this.getRoleModel(task.role || "reviewer");
     if (model) args.push("--model", model);
-    args.push(task.prompt);
-    const res = await runCommand(resolveBin("codex"), args, { onOutput: task.onOutput });
+    args.push("-");
+    const res = await runCommand(resolveBin("codex"), args, {
+      onOutput: task.onOutput,
+      silenceTimeoutMs: task.silenceTimeoutMs,
+      timeout: task.timeoutMs,
+      input: task.prompt
+    });
     return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
   }
 }

package/src/agents/gemini-agent.js CHANGED Viewed

@@ -8,7 +8,11 @@ export class GeminiAgent extends BaseAgent {
     const args = ["-p", task.prompt];
     const model = this.getRoleModel(role);
     if (model) args.push("--model", model);
-    const res = await runCommand(resolveBin("gemini"), args, { onOutput: task.onOutput });
+    const res = await runCommand(resolveBin("gemini"), args, {
+      onOutput: task.onOutput,
+      silenceTimeoutMs: task.silenceTimeoutMs,
+      timeout: task.timeoutMs
+    });
     return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
   }
@@ -17,7 +21,11 @@ export class GeminiAgent extends BaseAgent {
     const args = ["-p", task.prompt, "--output-format", "json"];
     const model = this.getRoleModel(role);
     if (model) args.push("--model", model);
-    const res = await runCommand(resolveBin("gemini"), args, { onOutput: task.onOutput });
+    const res = await runCommand(resolveBin("gemini"), args, {
+      onOutput: task.onOutput,
+      silenceTimeoutMs: task.silenceTimeoutMs,
+      timeout: task.timeoutMs
+    });
     return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
   }
 }

package/src/commands/plan.js CHANGED Viewed

@@ -46,7 +46,13 @@ export async function planCommand({ task, config, logger, json, context }) {
   const planner = createAgent(plannerRole.provider, config, logger);
   const prompt = buildPlannerPrompt({ task, context });
-  const result = await planner.runTask({ prompt, role: "planner" });
+  const silenceTimeoutMs = Number(config?.session?.max_agent_silence_minutes) > 0
+    ? Math.round(Number(config.session.max_agent_silence_minutes) * 60 * 1000)
+    : undefined;
+  const timeoutMs = Number(config?.session?.max_planner_minutes) > 0
+    ? Math.round(Number(config.session.max_planner_minutes) * 60 * 1000)
+    : undefined;
+  const result = await planner.runTask({ prompt, role: "planner", silenceTimeoutMs, timeoutMs });
   if (!result.ok) {
     throw new Error(result.error || result.output || "Planner failed");

package/src/config.js CHANGED Viewed

@@ -114,7 +114,9 @@ const DEFAULTS = {
   session: {
     max_iteration_minutes: 30,
     max_total_minutes: 120,
+    max_planner_minutes: 60,
     checkpoint_interval_minutes: 5,
+    max_agent_silence_minutes: 20,
     fail_fast_repeats: 2,
     repeat_detection_threshold: 2,
     max_sonar_retries: 3,

package/src/mcp/server-handlers.js CHANGED Viewed

@@ -21,6 +21,7 @@ import { parseMaybeJsonString } from "../review/parser.js";
 import { computeBaseRef, generateDiff } from "../review/diff-generator.js";
 import { resolveReviewProfile } from "../review/profiles.js";
 import { createRunLog, readRunLog } from "../utils/run-log.js";
+import { currentBranch } from "../utils/git.js";
 /**
  * Resolve the user's project directory via MCP roots.
@@ -62,6 +63,18 @@ export function classifyError(error) {
   const msg = error?.message || String(error);
   const lower = msg.toLowerCase();
+  if (
+    lower.includes("without output")
+    || lower.includes("silent for")
+    || lower.includes("unresponsive")
+    || lower.includes("exceeded max silence")
+  ) {
+    return {
+      category: "agent_stall",
+      suggestion: "Agent output stalled. Check live details with kj_status, then retry with a smaller prompt or increase session.max_agent_silence_minutes if needed."
+    };
+  }
   if (lower.includes("sonar") && (lower.includes("connect") || lower.includes("econnrefused") || lower.includes("not available") || lower.includes("not running"))) {
     return {
       category: "sonar_unavailable",
@@ -97,6 +110,13 @@ export function classifyError(error) {
     };
   }
+  if (lower.includes("you are on the base branch")) {
+    return {
+      category: "branch_error",
+      suggestion: "Create a feature branch before running Karajan. Use 'git checkout -b feat/<task-description>' and then retry. Do NOT run kj_code directly on the base branch."
+    };
+  }
   if (lower.includes("not a git repository")) {
     return {
       category: "git_error",
@@ -107,6 +127,23 @@ export function classifyError(error) {
   return { category: "unknown", suggestion: null };
 }
+export async function assertNotOnBaseBranch(config) { // Guard: throws when the current git branch equals the configured base branch; resolves with undefined otherwise.
+  const baseBranch = config?.base_branch || "main"; // falls back to "main" when config.base_branch is missing or otherwise falsy
+  let branch;
+  try {
+    branch = await currentBranch(); // helper imported from ../utils/git.js; called with no arguments
+  } catch {
+    return; // not a git repo or detached HEAD — let downstream handle it
+  }
+  if (branch === baseBranch) { // strict string comparison against the resolved base branch name
+    throw new Error( // classifyError() matches the lower-cased text "you are on the base branch" to report category "branch_error"
+      `You are on the base branch '${baseBranch}'. Karajan needs a feature branch to compute the diff for review. ` +
+      `Create a new branch first (e.g. 'git checkout -b feat/<task-description>') and then run this command again. ` +
+      `Do NOT run kj_code directly — create the branch first so the full pipeline (code + review) works correctly.`
+    );
+  }
+}
 export function enrichedFailPayload(error, toolName) {
   const msg = error?.message || String(error);
   const { category, suggestion } = classifyError(error);
@@ -149,6 +186,7 @@ export function buildAskQuestion(server) {
 export async function handleRunDirect(a, server, extra) {
   const config = await buildConfig(a);
+  await assertNotOnBaseBranch(config);
   const logger = createLogger(config.output.log_level, "mcp");
   const requiredProviders = [
@@ -235,7 +273,15 @@ export async function handlePlanDirect(a, server, extra) {
   const projectDir = await resolveProjectDir(server);
   const runLog = createRunLog(projectDir);
-  runLog.logText(`[kj_plan] started — provider=${plannerRole.provider}`);
+  const silenceTimeoutMs = Number(config?.session?.max_agent_silence_minutes) > 0
+    ? Math.round(Number(config.session.max_agent_silence_minutes) * 60 * 1000)
+    : undefined;
+  const plannerTimeoutMs = Number(config?.session?.max_planner_minutes) > 0
+    ? Math.round(Number(config.session.max_planner_minutes) * 60 * 1000)
+    : undefined;
+  runLog.logText(
+    `[kj_plan] started — provider=${plannerRole.provider}, max_silence=${silenceTimeoutMs ? `${Math.round(silenceTimeoutMs / 1000)}s` : "disabled"}, max_runtime=${plannerTimeoutMs ? `${Math.round(plannerTimeoutMs / 1000)}s` : "disabled"}`
+  );
   const emitter = buildDirectEmitter(server, runLog);
   const eventBase = { sessionId: null, iteration: 0, startedAt: Date.now() };
   const onOutput = ({ stream, line }) => {
@@ -250,18 +296,31 @@ export async function handlePlanDirect(a, server, extra) {
   sendTrackerLog(server, "planner", "running", plannerRole.provider);
   runLog.logText(`[planner] agent launched, waiting for response...`);
   let result;
+  let plannerStats = null;
   try {
-    result = await planner.runTask({ prompt, role: "planner", onOutput: stallDetector.onOutput });
+    result = await planner.runTask({
+      prompt,
+      role: "planner",
+      onOutput: stallDetector.onOutput,
+      silenceTimeoutMs,
+      timeoutMs: plannerTimeoutMs
+    });
   } finally {
     stallDetector.stop();
-    const stats = stallDetector.stats();
-    runLog.logText(`[planner] finished — lines=${stats.lineCount}, bytes=${stats.bytesReceived}, elapsed=${Math.round(stats.elapsedMs / 1000)}s`);
+    plannerStats = stallDetector.stats();
+    runLog.logText(
+      `[planner] finished — lines=${plannerStats.lineCount}, bytes=${plannerStats.bytesReceived}, elapsed=${Math.round(plannerStats.elapsedMs / 1000)}s`
+    );
     runLog.close();
   }
   if (!result.ok) {
     sendTrackerLog(server, "planner", "failed");
-    throw new Error(result.error || result.output || "Planner failed");
+    const baseError = result.error || result.output || "Planner failed";
+    const statsSuffix = plannerStats
+      ? ` [lines=${plannerStats.lineCount}, bytes=${plannerStats.bytesReceived}, elapsed=${Math.round(plannerStats.elapsedMs / 1000)}s]`
+      : "";
+    throw new Error(`${baseError}${statsSuffix}`);
   }
   sendTrackerLog(server, "planner", "done");
@@ -271,6 +330,7 @@ export async function handlePlanDirect(a, server, extra) {
 export async function handleCodeDirect(a, server, extra) {
   const config = await buildConfig(a, "code");
+  await assertNotOnBaseBranch(config);
   const logger = createLogger(config.output.log_level, "mcp");
   const coderRole = resolveRole(config, "coder");
@@ -319,6 +379,7 @@ export async function handleCodeDirect(a, server, extra) {
 export async function handleReviewDirect(a, server, extra) {
   const config = await buildConfig(a, "review");
+  await assertNotOnBaseBranch(config);
   const logger = createLogger(config.output.log_level, "mcp");
   const reviewerRole = resolveRole(config, "reviewer");

package/src/roles/planner-role.js CHANGED Viewed

@@ -9,6 +9,18 @@ function resolveProvider(config) {
   );
 }
+function resolvePlannerSilenceTimeoutMs(config) { // Converts config.session.max_agent_silence_minutes to milliseconds; returns null when the setting is absent or not a positive finite number.
+  const minutes = Number(config?.session?.max_agent_silence_minutes); // a missing config/session yields Number(undefined) === NaN
+  if (!Number.isFinite(minutes) || minutes <= 0) return null; // NaN, Infinity, zero and negative values all mean "no silence limit"
+  return Math.round(minutes * 60 * 1000); // minutes -> milliseconds, rounded to an integer
+}
+function resolvePlannerRuntimeTimeoutMs(config) { // Converts config.session.max_planner_minutes to milliseconds; returns null when the setting is absent or not a positive finite number.
+  const minutes = Number(config?.session?.max_planner_minutes); // a missing config/session yields Number(undefined) === NaN
+  if (!Number.isFinite(minutes) || minutes <= 0) return null; // NaN, Infinity, zero and negative values all mean "no runtime cap"
+  return Math.round(minutes * 60 * 1000); // minutes -> milliseconds, rounded to an integer
+}
 function buildPrompt({ task, instructions, research, triageDecomposition }) {
   const sections = [];
@@ -78,6 +90,10 @@ export class PlannerRole extends BaseRole {
     const runArgs = { prompt, role: "planner" };
     if (onOutput) runArgs.onOutput = onOutput;
+    const silenceTimeoutMs = resolvePlannerSilenceTimeoutMs(this.config);
+    if (silenceTimeoutMs) runArgs.silenceTimeoutMs = silenceTimeoutMs;
+    const timeoutMs = resolvePlannerRuntimeTimeoutMs(this.config);
+    if (timeoutMs) runArgs.timeoutMs = timeoutMs;
     const result = await agent.runTask(runArgs);
     if (!result.ok) {

package/src/utils/process.js CHANGED Viewed

@@ -1,7 +1,7 @@
 import { execa } from "execa";
 export async function runCommand(command, args = [], options = {}) {
-  const { timeout, onOutput, ...rest } = options;
+  const { timeout, onOutput, silenceTimeoutMs, partialOutputFlushMs, ...rest } = options;
   const subprocess = execa(command, args, {
     reject: false,
     ...rest
@@ -9,37 +9,98 @@ export async function runCommand(command, args = [], options = {}) {
   let stdoutAccum = "";
   let stderrAccum = "";
+  let outputSilenceTimer = null;
+  let silenceTimedOut = false;
+  function clearSilenceTimer() { // Cancels the pending silence countdown, if any, and forgets its handle.
+    if (outputSilenceTimer) {
+      clearTimeout(outputSilenceTimer);
+      outputSilenceTimer = null; // reset so a later call is a no-op until the timer is armed again
+    }
+  }
+  function armSilenceTimer() { // (Re)starts the countdown that kills the subprocess once silenceTimeoutMs elapses; called initially and on every stdout/stderr chunk.
+    const ms = Number(silenceTimeoutMs);
+    if (!Number.isFinite(ms) || ms <= 0 || silenceTimedOut) return; // disabled when unset/invalid, and never re-armed once the timeout has fired
+    clearSilenceTimer(); // drop the previous countdown so each call restarts the full window
+    outputSilenceTimer = setTimeout(() => {
+      silenceTimedOut = true; // checked after the subprocess settles to return the "without output" result
+      try {
+        subprocess.kill("SIGKILL", { forceKillAfterDelay: 1000 }); // NOTE(review): confirm the installed execa version accepts an options object as the second kill() argument — a throw here is swallowed by the catch below
+      } catch {
+        // no-op
+      }
+    }, ms);
+  }
   if (subprocess.stdout) {
     subprocess.stdout.on("data", (chunk) => {
       stdoutAccum += chunk.toString();
+      armSilenceTimer();
     });
   }
   if (subprocess.stderr) {
     subprocess.stderr.on("data", (chunk) => {
       stderrAccum += chunk.toString();
+      armSilenceTimer();
     });
   }
+  let flushInterval = null;
   if (onOutput) {
-    const handler = (stream) => {
-      let partial = "";
+    const flushMs = Number(partialOutputFlushMs) > 0 ? Number(partialOutputFlushMs) : 2000;
+    const streams = {};
+    const makeHandler = (stream) => {
+      const state = { partial: "", dirty: false };
+      streams[stream] = state;
       return (chunk) => {
-        partial += chunk.toString();
-        const lines = partial.split("\n");
-        partial = lines.pop();
+        state.partial += chunk.toString();
+        const lines = state.partial.split(/\r\n|\n|\r/);
+        state.partial = lines.pop() ?? "";
+        state.dirty = state.partial.length > 0;
         for (const line of lines) {
           if (line) onOutput({ stream, line });
         }
       };
     };
-    if (subprocess.stdout) subprocess.stdout.on("data", handler("stdout"));
-    if (subprocess.stderr) subprocess.stderr.on("data", handler("stderr"));
+    const flushPartials = () => {
+      for (const [stream, state] of Object.entries(streams)) {
+        if (!state.dirty || !state.partial) continue;
+        onOutput({ stream, line: state.partial });
+        state.partial = "";
+        state.dirty = false;
+      }
+    };
+    if (subprocess.stdout) subprocess.stdout.on("data", makeHandler("stdout"));
+    if (subprocess.stderr) subprocess.stderr.on("data", makeHandler("stderr"));
+    flushInterval = setInterval(flushPartials, flushMs);
+    flushInterval.unref?.();
+    subprocess.finally(() => {
+      flushPartials();
+      if (flushInterval) {
+        clearInterval(flushInterval);
+        flushInterval = null;
+      }
+    });
   }
+  armSilenceTimer();
   try {
     if (!timeout) {
       const result = await subprocess;
+      clearSilenceTimer();
+      if (silenceTimedOut) {
+        return {
+          exitCode: 143,
+          stdout: stdoutAccum,
+          stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
+          timedOut: true,
+          signal: "SIGKILL"
+        };
+      }
       return enrichResult(result, stdoutAccum, stderrAccum);
     }
@@ -63,8 +124,28 @@ export async function runCommand(command, args = [], options = {}) {
     const result = await Promise.race([subprocess, timeoutResult]);
     if (timer) clearTimeout(timer);
+    clearSilenceTimer();
+    if (silenceTimedOut) {
+      return {
+        exitCode: 143,
+        stdout: stdoutAccum,
+        stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
+        timedOut: true,
+        signal: "SIGKILL"
+      };
+    }
     return enrichResult(result, stdoutAccum, stderrAccum);
   } catch (error) {
+    clearSilenceTimer();
+    if (silenceTimedOut) {
+      return {
+        exitCode: 143,
+        stdout: error?.stdout || stdoutAccum,
+        stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
+        timedOut: true,
+        signal: error?.signal || "SIGKILL"
+      };
+    }
     const details = [
       error?.shortMessage,
       error?.originalMessage,

package/src/utils/stall-detector.js CHANGED Viewed

@@ -15,6 +15,7 @@ import { emitProgress, makeEvent } from "./events.js";
 const DEFAULT_HEARTBEAT_INTERVAL_MS = 30_000;   // heartbeat every 30s
 const DEFAULT_STALL_TIMEOUT_MS      = 120_000;  // warn after 2min silence
 const DEFAULT_CRITICAL_TIMEOUT_MS   = 300_000;  // critical after 5min silence
+const DEFAULT_STALL_REPEAT_MS       = 60_000;   // repeat stall notices every 60s
 export function createStallDetector({
   onOutput,
@@ -24,23 +25,30 @@ export function createStallDetector({
   provider,
   heartbeatIntervalMs = DEFAULT_HEARTBEAT_INTERVAL_MS,
   stallTimeoutMs      = DEFAULT_STALL_TIMEOUT_MS,
-  criticalTimeoutMs   = DEFAULT_CRITICAL_TIMEOUT_MS
+  criticalTimeoutMs   = DEFAULT_CRITICAL_TIMEOUT_MS,
+  stallRepeatMs       = DEFAULT_STALL_REPEAT_MS,
+  maxSilenceMs        = null,
+  onMaxSilence        = null
 }) {
   let lastActivityAt = Date.now();
   let lineCount = 0;
   let bytesReceived = 0;
-  let stallWarned = false;
-  let criticalWarned = false;
   let heartbeatTimer = null;
   const startedAt = Date.now();
+  let lastStallWarnAt = 0;
+  let lastCriticalWarnAt = 0;
+  let maxSilenceTriggered = false;
   function emitHeartbeat() {
     const now = Date.now();
     const silenceMs = now - lastActivityAt;
     const elapsedMs = now - startedAt;
+    const shouldWarn = silenceMs >= stallTimeoutMs;
+    const shouldCritical = silenceMs >= criticalTimeoutMs;
+    const repeatWindow = Math.max(1000, Number(stallRepeatMs) || DEFAULT_STALL_REPEAT_MS);
-    if (silenceMs >= criticalTimeoutMs && !criticalWarned) {
-      criticalWarned = true;
+    if (shouldCritical && (now - lastCriticalWarnAt >= repeatWindow)) {
+      lastCriticalWarnAt = now;
       emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
         status: "critical",
         message: `Agent ${provider} unresponsive for ${Math.round(silenceMs / 1000)}s — may be hung`,
@@ -53,8 +61,8 @@ export function createStallDetector({
           severity: "critical"
         }
       }));
-    } else if (silenceMs >= stallTimeoutMs && !stallWarned) {
-      stallWarned = true;
+    } else if (shouldWarn && (now - lastStallWarnAt >= repeatWindow)) {
+      lastStallWarnAt = now;
       emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
         status: "warning",
         message: `Agent ${provider} silent for ${Math.round(silenceMs / 1000)}s — still waiting`,
@@ -67,20 +75,49 @@ export function createStallDetector({
           severity: "warning"
         }
       }));
-    } else if (silenceMs < stallTimeoutMs) {
-      // Reset warning flags when activity resumes
-      stallWarned = false;
-      criticalWarned = false;
+    }
+    emitProgress(emitter, makeEvent("agent:heartbeat", { ...eventBase, stage }, {
+      message: silenceMs < stallTimeoutMs
+        ? `Agent ${provider} active — ${lineCount} lines, ${Math.round(elapsedMs / 1000)}s elapsed`
+        : `Agent ${provider} waiting — silent ${Math.round(silenceMs / 1000)}s, ${Math.round(elapsedMs / 1000)}s elapsed`,
+      detail: {
+        provider,
+        elapsedMs,
+        silenceMs,
+        lineCount,
+        bytesReceived,
+        status: silenceMs < stallTimeoutMs ? "active" : "waiting"
+      }
+    }));
-      emitProgress(emitter, makeEvent("agent:heartbeat", { ...eventBase, stage }, {
-        message: `Agent ${provider} active — ${lineCount} lines, ${Math.round(elapsedMs / 1000)}s elapsed`,
+    const hardLimit = Number(maxSilenceMs);
+    if (!maxSilenceTriggered && Number.isFinite(hardLimit) && hardLimit > 0 && silenceMs >= hardLimit) {
+      maxSilenceTriggered = true;
+      emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
+        status: "fail",
+        message: `Agent ${provider} exceeded max silence (${Math.round(hardLimit / 1000)}s)`,
         detail: {
           provider,
+          silenceMs,
           elapsedMs,
           lineCount,
-          bytesReceived
+          bytesReceived,
+          severity: "fatal",
+          maxSilenceMs: hardLimit
         }
       }));
+      if (typeof onMaxSilence === "function") {
+        onMaxSilence({
+          provider,
+          stage,
+          silenceMs,
+          elapsedMs,
+          lineCount,
+          bytesReceived,
+          maxSilenceMs: hardLimit
+        });
+      }
     }
   }
@@ -92,10 +129,6 @@ export function createStallDetector({
     lineCount++;
     bytesReceived += data.line?.length || 0;
-    // Reset stall flags on new activity
-    stallWarned = false;
-    criticalWarned = false;
     // Forward to the original callback
     if (onOutput) {
       onOutput(data);