karajan-code 1.9.0 → 1.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -1
- package/docs/README.es.md +2 -1
- package/package.json +1 -1
- package/src/agents/aider-agent.js +10 -2
- package/src/agents/claude-agent.js +10 -2
- package/src/agents/codex-agent.js +14 -4
- package/src/agents/gemini-agent.js +10 -2
- package/src/cli.js +23 -0
- package/src/commands/plan.js +7 -1
- package/src/config.js +2 -0
- package/src/mcp/server-handlers.js +38 -5
- package/src/roles/planner-role.js +16 -0
- package/src/utils/process.js +89 -8
- package/src/utils/stall-detector.js +51 -18
package/README.md
CHANGED
|
@@ -43,6 +43,7 @@ Instead of running one AI agent and manually reviewing its output, `kj` chains a
|
|
|
43
43
|
- **Task decomposition** — triage detects when tasks should be split and recommends subtasks; with Planning Game integration, creates linked cards with sequential blocking
|
|
44
44
|
- **Retry with backoff** — automatic recovery from transient API errors (429, 5xx) with exponential backoff and jitter
|
|
45
45
|
- **Pipeline stage tracker** — cumulative progress view during `kj_run` showing which stages are done, running, or pending — both in CLI and via MCP events for real-time host rendering
|
|
46
|
+
- **Planner observability guardrails** — continuous heartbeat/stall telemetry, configurable max-silence protection (`session.max_agent_silence_minutes`), and hard runtime cap (`session.max_planner_minutes`) to avoid long stuck planner runs
|
|
46
47
|
- **Planning Game integration** — optionally pair with [Planning Game](https://github.com/AgenteIA-Geniova/planning-game) for agile project management (tasks, sprints, estimation) — like Jira, but open-source and XP-native
|
|
47
48
|
|
|
48
49
|
> **Best with MCP** — Karajan Code is designed to be used as an MCP server inside your AI agent (Claude, Codex, etc.). The agent sends tasks to `kj_run`, gets real-time progress notifications, and receives structured results — no copy-pasting needed.
|
|
@@ -417,7 +418,7 @@ After `npm install -g karajan-code`, the MCP server is auto-registered in Claude
|
|
|
417
418
|
| `kj_roles` | List roles or show role templates |
|
|
418
419
|
| `kj_code` | Run coder-only mode |
|
|
419
420
|
| `kj_review` | Run reviewer-only mode |
|
|
420
|
-
| `kj_plan` | Generate implementation plan |
|
|
421
|
+
| `kj_plan` | Generate implementation plan with heartbeat/stall telemetry and clearer diagnostics |
|
|
421
422
|
|
|
422
423
|
### Recommended Companion MCPs
|
|
423
424
|
|
package/docs/README.es.md
CHANGED
|
@@ -42,6 +42,7 @@ En lugar de ejecutar un agente de IA y revisar manualmente su output, `kj` encad
|
|
|
42
42
|
- **Descomposicion de tareas** — triage detecta cuando una tarea debe dividirse y recomienda subtareas; con integracion Planning Game, crea cards vinculadas con bloqueo secuencial
|
|
43
43
|
- **Retry con backoff** — recuperacion automatica ante errores transitorios de API (429, 5xx) con backoff exponencial y jitter
|
|
44
44
|
- **Pipeline stage tracker** — vista de progreso acumulativo durante `kj_run` mostrando que stages estan completadas, en ejecucion o pendientes — tanto en CLI como via eventos MCP para renderizado en tiempo real en el host
|
|
45
|
+
- **Guardarrailes de observabilidad del planner** — telemetria continua de heartbeat/stall, proteccion configurable por silencio maximo (`session.max_agent_silence_minutes`) y limite duro de ejecucion (`session.max_planner_minutes`) para evitar bloqueos prolongados en `kj_plan`/planner
|
|
45
46
|
- **Integracion con Planning Game** — combina opcionalmente con [Planning Game](https://github.com/AgenteIA-Geniova/planning-game) para gestion agil de proyectos (tareas, sprints, estimacion) — como Jira, pero open-source y nativo XP
|
|
46
47
|
|
|
47
48
|
> **Mejor con MCP** — Karajan Code esta disenado para usarse como servidor MCP dentro de tu agente de IA (Claude, Codex, etc.). El agente envia tareas a `kj_run`, recibe notificaciones de progreso en tiempo real, y obtiene resultados estructurados — sin copiar y pegar.
|
|
@@ -201,7 +202,7 @@ Tras `npm install -g karajan-code`, el servidor MCP se auto-registra en las conf
|
|
|
201
202
|
| `kj_roles` | Listar roles o mostrar templates |
|
|
202
203
|
| `kj_code` | Modo solo coder |
|
|
203
204
|
| `kj_review` | Modo solo reviewer |
|
|
204
|
-
| `kj_plan` | Generar plan de implementacion |
|
|
205
|
+
| `kj_plan` | Generar plan de implementacion con telemetria heartbeat/stall y diagnostico mas claro |
|
|
205
206
|
|
|
206
207
|
### MCPs complementarios recomendados
|
|
207
208
|
|
package/package.json
CHANGED
|
@@ -8,7 +8,11 @@ export class AiderAgent extends BaseAgent {
|
|
|
8
8
|
const args = ["--yes", "--message", task.prompt];
|
|
9
9
|
const model = this.getRoleModel(role);
|
|
10
10
|
if (model) args.push("--model", model);
|
|
11
|
-
const res = await runCommand(resolveBin("aider"), args, {
|
|
11
|
+
const res = await runCommand(resolveBin("aider"), args, {
|
|
12
|
+
onOutput: task.onOutput,
|
|
13
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
14
|
+
timeout: task.timeoutMs
|
|
15
|
+
});
|
|
12
16
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
13
17
|
}
|
|
14
18
|
|
|
@@ -17,7 +21,11 @@ export class AiderAgent extends BaseAgent {
|
|
|
17
21
|
const args = ["--yes", "--message", task.prompt];
|
|
18
22
|
const model = this.getRoleModel(role);
|
|
19
23
|
if (model) args.push("--model", model);
|
|
20
|
-
const res = await runCommand(resolveBin("aider"), args, {
|
|
24
|
+
const res = await runCommand(resolveBin("aider"), args, {
|
|
25
|
+
onOutput: task.onOutput,
|
|
26
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
27
|
+
timeout: task.timeoutMs
|
|
28
|
+
});
|
|
21
29
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
22
30
|
}
|
|
23
31
|
}
|
|
@@ -81,7 +81,11 @@ export class ClaudeAgent extends BaseAgent {
|
|
|
81
81
|
if (task.onOutput) {
|
|
82
82
|
args.push("--output-format", "stream-json");
|
|
83
83
|
const streamFilter = createStreamJsonFilter(task.onOutput);
|
|
84
|
-
const res = await runCommand(resolveBin("claude"), args, {
|
|
84
|
+
const res = await runCommand(resolveBin("claude"), args, {
|
|
85
|
+
onOutput: streamFilter,
|
|
86
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
87
|
+
timeout: task.timeoutMs
|
|
88
|
+
});
|
|
85
89
|
const output = extractTextFromStreamJson(res.stdout);
|
|
86
90
|
return { ok: res.exitCode === 0, output, error: res.stderr, exitCode: res.exitCode };
|
|
87
91
|
}
|
|
@@ -94,7 +98,11 @@ export class ClaudeAgent extends BaseAgent {
|
|
|
94
98
|
const args = ["-p", task.prompt, "--output-format", "json"];
|
|
95
99
|
const model = this.getRoleModel(task.role || "reviewer");
|
|
96
100
|
if (model) args.push("--model", model);
|
|
97
|
-
const res = await runCommand(resolveBin("claude"), args, {
|
|
101
|
+
const res = await runCommand(resolveBin("claude"), args, {
|
|
102
|
+
onOutput: task.onOutput,
|
|
103
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
104
|
+
timeout: task.timeoutMs
|
|
105
|
+
});
|
|
98
106
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
99
107
|
}
|
|
100
108
|
}
|
|
@@ -9,8 +9,13 @@ export class CodexAgent extends BaseAgent {
|
|
|
9
9
|
const model = this.getRoleModel(role);
|
|
10
10
|
if (model) args.push("--model", model);
|
|
11
11
|
if (this.isAutoApproveEnabled(role)) args.push("--full-auto");
|
|
12
|
-
args.push(
|
|
13
|
-
const res = await runCommand(resolveBin("codex"), args, {
|
|
12
|
+
args.push("-");
|
|
13
|
+
const res = await runCommand(resolveBin("codex"), args, {
|
|
14
|
+
onOutput: task.onOutput,
|
|
15
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
16
|
+
timeout: task.timeoutMs,
|
|
17
|
+
input: task.prompt
|
|
18
|
+
});
|
|
14
19
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
15
20
|
}
|
|
16
21
|
|
|
@@ -18,8 +23,13 @@ export class CodexAgent extends BaseAgent {
|
|
|
18
23
|
const args = ["exec"];
|
|
19
24
|
const model = this.getRoleModel(task.role || "reviewer");
|
|
20
25
|
if (model) args.push("--model", model);
|
|
21
|
-
args.push(
|
|
22
|
-
const res = await runCommand(resolveBin("codex"), args, {
|
|
26
|
+
args.push("-");
|
|
27
|
+
const res = await runCommand(resolveBin("codex"), args, {
|
|
28
|
+
onOutput: task.onOutput,
|
|
29
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
30
|
+
timeout: task.timeoutMs,
|
|
31
|
+
input: task.prompt
|
|
32
|
+
});
|
|
23
33
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
24
34
|
}
|
|
25
35
|
}
|
|
@@ -8,7 +8,11 @@ export class GeminiAgent extends BaseAgent {
|
|
|
8
8
|
const args = ["-p", task.prompt];
|
|
9
9
|
const model = this.getRoleModel(role);
|
|
10
10
|
if (model) args.push("--model", model);
|
|
11
|
-
const res = await runCommand(resolveBin("gemini"), args, {
|
|
11
|
+
const res = await runCommand(resolveBin("gemini"), args, {
|
|
12
|
+
onOutput: task.onOutput,
|
|
13
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
14
|
+
timeout: task.timeoutMs
|
|
15
|
+
});
|
|
12
16
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
13
17
|
}
|
|
14
18
|
|
|
@@ -17,7 +21,11 @@ export class GeminiAgent extends BaseAgent {
|
|
|
17
21
|
const args = ["-p", task.prompt, "--output-format", "json"];
|
|
18
22
|
const model = this.getRoleModel(role);
|
|
19
23
|
if (model) args.push("--model", model);
|
|
20
|
-
const res = await runCommand(resolveBin("gemini"), args, {
|
|
24
|
+
const res = await runCommand(resolveBin("gemini"), args, {
|
|
25
|
+
onOutput: task.onOutput,
|
|
26
|
+
silenceTimeoutMs: task.silenceTimeoutMs,
|
|
27
|
+
timeout: task.timeoutMs
|
|
28
|
+
});
|
|
21
29
|
return { ok: res.exitCode === 0, output: res.stdout, error: res.stderr, exitCode: res.exitCode };
|
|
22
30
|
}
|
|
23
31
|
}
|
package/src/cli.js
CHANGED
|
@@ -187,6 +187,29 @@ program
|
|
|
187
187
|
});
|
|
188
188
|
});
|
|
189
189
|
|
|
190
|
+
program
|
|
191
|
+
.command("update")
|
|
192
|
+
.description("Update karajan-code to the latest version from npm")
|
|
193
|
+
.action(async () => {
|
|
194
|
+
const { execaCommand } = await import("execa");
|
|
195
|
+
console.log(`Current version: ${PKG_VERSION}`);
|
|
196
|
+
console.log("Checking for updates...");
|
|
197
|
+
try {
|
|
198
|
+
const { stdout } = await execaCommand("npm view karajan-code version");
|
|
199
|
+
const latest = stdout.trim();
|
|
200
|
+
if (latest === PKG_VERSION) {
|
|
201
|
+
console.log(`Already on the latest version (${PKG_VERSION}).`);
|
|
202
|
+
return;
|
|
203
|
+
}
|
|
204
|
+
console.log(`Updating ${PKG_VERSION} → ${latest}...`);
|
|
205
|
+
await execaCommand("npm install -g karajan-code@latest", { stdio: "inherit" });
|
|
206
|
+
console.log(`Updated to ${latest}. Restart Claude to pick up the new MCP server.`);
|
|
207
|
+
} catch (err) {
|
|
208
|
+
console.error(`Update failed: ${err.message}`);
|
|
209
|
+
process.exit(1);
|
|
210
|
+
}
|
|
211
|
+
});
|
|
212
|
+
|
|
190
213
|
const sonar = program.command("sonar").description("Manage SonarQube container");
|
|
191
214
|
sonar.command("status").action(async () => sonarCommand({ action: "status" }));
|
|
192
215
|
sonar.command("start").action(async () => sonarCommand({ action: "start" }));
|
package/src/commands/plan.js
CHANGED
|
@@ -46,7 +46,13 @@ export async function planCommand({ task, config, logger, json, context }) {
|
|
|
46
46
|
|
|
47
47
|
const planner = createAgent(plannerRole.provider, config, logger);
|
|
48
48
|
const prompt = buildPlannerPrompt({ task, context });
|
|
49
|
-
const
|
|
49
|
+
const silenceTimeoutMs = Number(config?.session?.max_agent_silence_minutes) > 0
|
|
50
|
+
? Math.round(Number(config.session.max_agent_silence_minutes) * 60 * 1000)
|
|
51
|
+
: undefined;
|
|
52
|
+
const timeoutMs = Number(config?.session?.max_planner_minutes) > 0
|
|
53
|
+
? Math.round(Number(config.session.max_planner_minutes) * 60 * 1000)
|
|
54
|
+
: undefined;
|
|
55
|
+
const result = await planner.runTask({ prompt, role: "planner", silenceTimeoutMs, timeoutMs });
|
|
50
56
|
|
|
51
57
|
if (!result.ok) {
|
|
52
58
|
throw new Error(result.error || result.output || "Planner failed");
|
package/src/config.js
CHANGED
|
@@ -114,7 +114,9 @@ const DEFAULTS = {
|
|
|
114
114
|
session: {
|
|
115
115
|
max_iteration_minutes: 30,
|
|
116
116
|
max_total_minutes: 120,
|
|
117
|
+
max_planner_minutes: 60,
|
|
117
118
|
checkpoint_interval_minutes: 5,
|
|
119
|
+
max_agent_silence_minutes: 20,
|
|
118
120
|
fail_fast_repeats: 2,
|
|
119
121
|
repeat_detection_threshold: 2,
|
|
120
122
|
max_sonar_retries: 3,
|
|
@@ -62,6 +62,18 @@ export function classifyError(error) {
|
|
|
62
62
|
const msg = error?.message || String(error);
|
|
63
63
|
const lower = msg.toLowerCase();
|
|
64
64
|
|
|
65
|
+
if (
|
|
66
|
+
lower.includes("without output")
|
|
67
|
+
|| lower.includes("silent for")
|
|
68
|
+
|| lower.includes("unresponsive")
|
|
69
|
+
|| lower.includes("exceeded max silence")
|
|
70
|
+
) {
|
|
71
|
+
return {
|
|
72
|
+
category: "agent_stall",
|
|
73
|
+
suggestion: "Agent output stalled. Check live details with kj_status, then retry with a smaller prompt or increase session.max_agent_silence_minutes if needed."
|
|
74
|
+
};
|
|
75
|
+
}
|
|
76
|
+
|
|
65
77
|
if (lower.includes("sonar") && (lower.includes("connect") || lower.includes("econnrefused") || lower.includes("not available") || lower.includes("not running"))) {
|
|
66
78
|
return {
|
|
67
79
|
category: "sonar_unavailable",
|
|
@@ -235,7 +247,15 @@ export async function handlePlanDirect(a, server, extra) {
|
|
|
235
247
|
|
|
236
248
|
const projectDir = await resolveProjectDir(server);
|
|
237
249
|
const runLog = createRunLog(projectDir);
|
|
238
|
-
|
|
250
|
+
const silenceTimeoutMs = Number(config?.session?.max_agent_silence_minutes) > 0
|
|
251
|
+
? Math.round(Number(config.session.max_agent_silence_minutes) * 60 * 1000)
|
|
252
|
+
: undefined;
|
|
253
|
+
const plannerTimeoutMs = Number(config?.session?.max_planner_minutes) > 0
|
|
254
|
+
? Math.round(Number(config.session.max_planner_minutes) * 60 * 1000)
|
|
255
|
+
: undefined;
|
|
256
|
+
runLog.logText(
|
|
257
|
+
`[kj_plan] started — provider=${plannerRole.provider}, max_silence=${silenceTimeoutMs ? `${Math.round(silenceTimeoutMs / 1000)}s` : "disabled"}, max_runtime=${plannerTimeoutMs ? `${Math.round(plannerTimeoutMs / 1000)}s` : "disabled"}`
|
|
258
|
+
);
|
|
239
259
|
const emitter = buildDirectEmitter(server, runLog);
|
|
240
260
|
const eventBase = { sessionId: null, iteration: 0, startedAt: Date.now() };
|
|
241
261
|
const onOutput = ({ stream, line }) => {
|
|
@@ -250,18 +270,31 @@ export async function handlePlanDirect(a, server, extra) {
|
|
|
250
270
|
sendTrackerLog(server, "planner", "running", plannerRole.provider);
|
|
251
271
|
runLog.logText(`[planner] agent launched, waiting for response...`);
|
|
252
272
|
let result;
|
|
273
|
+
let plannerStats = null;
|
|
253
274
|
try {
|
|
254
|
-
result = await planner.runTask({
|
|
275
|
+
result = await planner.runTask({
|
|
276
|
+
prompt,
|
|
277
|
+
role: "planner",
|
|
278
|
+
onOutput: stallDetector.onOutput,
|
|
279
|
+
silenceTimeoutMs,
|
|
280
|
+
timeoutMs: plannerTimeoutMs
|
|
281
|
+
});
|
|
255
282
|
} finally {
|
|
256
283
|
stallDetector.stop();
|
|
257
|
-
|
|
258
|
-
runLog.logText(
|
|
284
|
+
plannerStats = stallDetector.stats();
|
|
285
|
+
runLog.logText(
|
|
286
|
+
`[planner] finished — lines=${plannerStats.lineCount}, bytes=${plannerStats.bytesReceived}, elapsed=${Math.round(plannerStats.elapsedMs / 1000)}s`
|
|
287
|
+
);
|
|
259
288
|
runLog.close();
|
|
260
289
|
}
|
|
261
290
|
|
|
262
291
|
if (!result.ok) {
|
|
263
292
|
sendTrackerLog(server, "planner", "failed");
|
|
264
|
-
|
|
293
|
+
const baseError = result.error || result.output || "Planner failed";
|
|
294
|
+
const statsSuffix = plannerStats
|
|
295
|
+
? ` [lines=${plannerStats.lineCount}, bytes=${plannerStats.bytesReceived}, elapsed=${Math.round(plannerStats.elapsedMs / 1000)}s]`
|
|
296
|
+
: "";
|
|
297
|
+
throw new Error(`${baseError}${statsSuffix}`);
|
|
265
298
|
}
|
|
266
299
|
|
|
267
300
|
sendTrackerLog(server, "planner", "done");
|
|
@@ -9,6 +9,18 @@ function resolveProvider(config) {
|
|
|
9
9
|
);
|
|
10
10
|
}
|
|
11
11
|
|
|
12
|
+
function resolvePlannerSilenceTimeoutMs(config) {
|
|
13
|
+
const minutes = Number(config?.session?.max_agent_silence_minutes);
|
|
14
|
+
if (!Number.isFinite(minutes) || minutes <= 0) return null;
|
|
15
|
+
return Math.round(minutes * 60 * 1000);
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
function resolvePlannerRuntimeTimeoutMs(config) {
|
|
19
|
+
const minutes = Number(config?.session?.max_planner_minutes);
|
|
20
|
+
if (!Number.isFinite(minutes) || minutes <= 0) return null;
|
|
21
|
+
return Math.round(minutes * 60 * 1000);
|
|
22
|
+
}
|
|
23
|
+
|
|
12
24
|
function buildPrompt({ task, instructions, research, triageDecomposition }) {
|
|
13
25
|
const sections = [];
|
|
14
26
|
|
|
@@ -78,6 +90,10 @@ export class PlannerRole extends BaseRole {
|
|
|
78
90
|
|
|
79
91
|
const runArgs = { prompt, role: "planner" };
|
|
80
92
|
if (onOutput) runArgs.onOutput = onOutput;
|
|
93
|
+
const silenceTimeoutMs = resolvePlannerSilenceTimeoutMs(this.config);
|
|
94
|
+
if (silenceTimeoutMs) runArgs.silenceTimeoutMs = silenceTimeoutMs;
|
|
95
|
+
const timeoutMs = resolvePlannerRuntimeTimeoutMs(this.config);
|
|
96
|
+
if (timeoutMs) runArgs.timeoutMs = timeoutMs;
|
|
81
97
|
const result = await agent.runTask(runArgs);
|
|
82
98
|
|
|
83
99
|
if (!result.ok) {
|
package/src/utils/process.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { execa } from "execa";
|
|
2
2
|
|
|
3
3
|
export async function runCommand(command, args = [], options = {}) {
|
|
4
|
-
const { timeout, onOutput, ...rest } = options;
|
|
4
|
+
const { timeout, onOutput, silenceTimeoutMs, partialOutputFlushMs, ...rest } = options;
|
|
5
5
|
const subprocess = execa(command, args, {
|
|
6
6
|
reject: false,
|
|
7
7
|
...rest
|
|
@@ -9,37 +9,98 @@ export async function runCommand(command, args = [], options = {}) {
|
|
|
9
9
|
|
|
10
10
|
let stdoutAccum = "";
|
|
11
11
|
let stderrAccum = "";
|
|
12
|
+
let outputSilenceTimer = null;
|
|
13
|
+
let silenceTimedOut = false;
|
|
14
|
+
|
|
15
|
+
function clearSilenceTimer() {
|
|
16
|
+
if (outputSilenceTimer) {
|
|
17
|
+
clearTimeout(outputSilenceTimer);
|
|
18
|
+
outputSilenceTimer = null;
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
function armSilenceTimer() {
|
|
23
|
+
const ms = Number(silenceTimeoutMs);
|
|
24
|
+
if (!Number.isFinite(ms) || ms <= 0 || silenceTimedOut) return;
|
|
25
|
+
clearSilenceTimer();
|
|
26
|
+
outputSilenceTimer = setTimeout(() => {
|
|
27
|
+
silenceTimedOut = true;
|
|
28
|
+
try {
|
|
29
|
+
subprocess.kill("SIGKILL", { forceKillAfterDelay: 1000 });
|
|
30
|
+
} catch {
|
|
31
|
+
// no-op
|
|
32
|
+
}
|
|
33
|
+
}, ms);
|
|
34
|
+
}
|
|
12
35
|
|
|
13
36
|
if (subprocess.stdout) {
|
|
14
37
|
subprocess.stdout.on("data", (chunk) => {
|
|
15
38
|
stdoutAccum += chunk.toString();
|
|
39
|
+
armSilenceTimer();
|
|
16
40
|
});
|
|
17
41
|
}
|
|
18
42
|
if (subprocess.stderr) {
|
|
19
43
|
subprocess.stderr.on("data", (chunk) => {
|
|
20
44
|
stderrAccum += chunk.toString();
|
|
45
|
+
armSilenceTimer();
|
|
21
46
|
});
|
|
22
47
|
}
|
|
23
48
|
|
|
49
|
+
let flushInterval = null;
|
|
24
50
|
if (onOutput) {
|
|
25
|
-
const
|
|
26
|
-
|
|
51
|
+
const flushMs = Number(partialOutputFlushMs) > 0 ? Number(partialOutputFlushMs) : 2000;
|
|
52
|
+
const streams = {};
|
|
53
|
+
const makeHandler = (stream) => {
|
|
54
|
+
const state = { partial: "", dirty: false };
|
|
55
|
+
streams[stream] = state;
|
|
27
56
|
return (chunk) => {
|
|
28
|
-
partial += chunk.toString();
|
|
29
|
-
const lines = partial.split(
|
|
30
|
-
partial = lines.pop();
|
|
57
|
+
state.partial += chunk.toString();
|
|
58
|
+
const lines = state.partial.split(/\r\n|\n|\r/);
|
|
59
|
+
state.partial = lines.pop() ?? "";
|
|
60
|
+
state.dirty = state.partial.length > 0;
|
|
31
61
|
for (const line of lines) {
|
|
32
62
|
if (line) onOutput({ stream, line });
|
|
33
63
|
}
|
|
34
64
|
};
|
|
35
65
|
};
|
|
36
|
-
|
|
37
|
-
|
|
66
|
+
|
|
67
|
+
const flushPartials = () => {
|
|
68
|
+
for (const [stream, state] of Object.entries(streams)) {
|
|
69
|
+
if (!state.dirty || !state.partial) continue;
|
|
70
|
+
onOutput({ stream, line: state.partial });
|
|
71
|
+
state.partial = "";
|
|
72
|
+
state.dirty = false;
|
|
73
|
+
}
|
|
74
|
+
};
|
|
75
|
+
|
|
76
|
+
if (subprocess.stdout) subprocess.stdout.on("data", makeHandler("stdout"));
|
|
77
|
+
if (subprocess.stderr) subprocess.stderr.on("data", makeHandler("stderr"));
|
|
78
|
+
flushInterval = setInterval(flushPartials, flushMs);
|
|
79
|
+
flushInterval.unref?.();
|
|
80
|
+
|
|
81
|
+
subprocess.finally(() => {
|
|
82
|
+
flushPartials();
|
|
83
|
+
if (flushInterval) {
|
|
84
|
+
clearInterval(flushInterval);
|
|
85
|
+
flushInterval = null;
|
|
86
|
+
}
|
|
87
|
+
});
|
|
38
88
|
}
|
|
89
|
+
armSilenceTimer();
|
|
39
90
|
|
|
40
91
|
try {
|
|
41
92
|
if (!timeout) {
|
|
42
93
|
const result = await subprocess;
|
|
94
|
+
clearSilenceTimer();
|
|
95
|
+
if (silenceTimedOut) {
|
|
96
|
+
return {
|
|
97
|
+
exitCode: 143,
|
|
98
|
+
stdout: stdoutAccum,
|
|
99
|
+
stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
|
|
100
|
+
timedOut: true,
|
|
101
|
+
signal: "SIGKILL"
|
|
102
|
+
};
|
|
103
|
+
}
|
|
43
104
|
return enrichResult(result, stdoutAccum, stderrAccum);
|
|
44
105
|
}
|
|
45
106
|
|
|
@@ -63,8 +124,28 @@ export async function runCommand(command, args = [], options = {}) {
|
|
|
63
124
|
|
|
64
125
|
const result = await Promise.race([subprocess, timeoutResult]);
|
|
65
126
|
if (timer) clearTimeout(timer);
|
|
127
|
+
clearSilenceTimer();
|
|
128
|
+
if (silenceTimedOut) {
|
|
129
|
+
return {
|
|
130
|
+
exitCode: 143,
|
|
131
|
+
stdout: stdoutAccum,
|
|
132
|
+
stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
|
|
133
|
+
timedOut: true,
|
|
134
|
+
signal: "SIGKILL"
|
|
135
|
+
};
|
|
136
|
+
}
|
|
66
137
|
return enrichResult(result, stdoutAccum, stderrAccum);
|
|
67
138
|
} catch (error) {
|
|
139
|
+
clearSilenceTimer();
|
|
140
|
+
if (silenceTimedOut) {
|
|
141
|
+
return {
|
|
142
|
+
exitCode: 143,
|
|
143
|
+
stdout: error?.stdout || stdoutAccum,
|
|
144
|
+
stderr: `Command killed after ${Number(silenceTimeoutMs)}ms without output`,
|
|
145
|
+
timedOut: true,
|
|
146
|
+
signal: error?.signal || "SIGKILL"
|
|
147
|
+
};
|
|
148
|
+
}
|
|
68
149
|
const details = [
|
|
69
150
|
error?.shortMessage,
|
|
70
151
|
error?.originalMessage,
|
|
@@ -15,6 +15,7 @@ import { emitProgress, makeEvent } from "./events.js";
|
|
|
15
15
|
const DEFAULT_HEARTBEAT_INTERVAL_MS = 30_000; // heartbeat every 30s
|
|
16
16
|
const DEFAULT_STALL_TIMEOUT_MS = 120_000; // warn after 2min silence
|
|
17
17
|
const DEFAULT_CRITICAL_TIMEOUT_MS = 300_000; // critical after 5min silence
|
|
18
|
+
const DEFAULT_STALL_REPEAT_MS = 60_000; // repeat stall notices every 60s
|
|
18
19
|
|
|
19
20
|
export function createStallDetector({
|
|
20
21
|
onOutput,
|
|
@@ -24,23 +25,30 @@ export function createStallDetector({
|
|
|
24
25
|
provider,
|
|
25
26
|
heartbeatIntervalMs = DEFAULT_HEARTBEAT_INTERVAL_MS,
|
|
26
27
|
stallTimeoutMs = DEFAULT_STALL_TIMEOUT_MS,
|
|
27
|
-
criticalTimeoutMs = DEFAULT_CRITICAL_TIMEOUT_MS
|
|
28
|
+
criticalTimeoutMs = DEFAULT_CRITICAL_TIMEOUT_MS,
|
|
29
|
+
stallRepeatMs = DEFAULT_STALL_REPEAT_MS,
|
|
30
|
+
maxSilenceMs = null,
|
|
31
|
+
onMaxSilence = null
|
|
28
32
|
}) {
|
|
29
33
|
let lastActivityAt = Date.now();
|
|
30
34
|
let lineCount = 0;
|
|
31
35
|
let bytesReceived = 0;
|
|
32
|
-
let stallWarned = false;
|
|
33
|
-
let criticalWarned = false;
|
|
34
36
|
let heartbeatTimer = null;
|
|
35
37
|
const startedAt = Date.now();
|
|
38
|
+
let lastStallWarnAt = 0;
|
|
39
|
+
let lastCriticalWarnAt = 0;
|
|
40
|
+
let maxSilenceTriggered = false;
|
|
36
41
|
|
|
37
42
|
function emitHeartbeat() {
|
|
38
43
|
const now = Date.now();
|
|
39
44
|
const silenceMs = now - lastActivityAt;
|
|
40
45
|
const elapsedMs = now - startedAt;
|
|
46
|
+
const shouldWarn = silenceMs >= stallTimeoutMs;
|
|
47
|
+
const shouldCritical = silenceMs >= criticalTimeoutMs;
|
|
48
|
+
const repeatWindow = Math.max(1000, Number(stallRepeatMs) || DEFAULT_STALL_REPEAT_MS);
|
|
41
49
|
|
|
42
|
-
if (
|
|
43
|
-
|
|
50
|
+
if (shouldCritical && (now - lastCriticalWarnAt >= repeatWindow)) {
|
|
51
|
+
lastCriticalWarnAt = now;
|
|
44
52
|
emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
|
|
45
53
|
status: "critical",
|
|
46
54
|
message: `Agent ${provider} unresponsive for ${Math.round(silenceMs / 1000)}s — may be hung`,
|
|
@@ -53,8 +61,8 @@ export function createStallDetector({
|
|
|
53
61
|
severity: "critical"
|
|
54
62
|
}
|
|
55
63
|
}));
|
|
56
|
-
} else if (
|
|
57
|
-
|
|
64
|
+
} else if (shouldWarn && (now - lastStallWarnAt >= repeatWindow)) {
|
|
65
|
+
lastStallWarnAt = now;
|
|
58
66
|
emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
|
|
59
67
|
status: "warning",
|
|
60
68
|
message: `Agent ${provider} silent for ${Math.round(silenceMs / 1000)}s — still waiting`,
|
|
@@ -67,20 +75,49 @@ export function createStallDetector({
|
|
|
67
75
|
severity: "warning"
|
|
68
76
|
}
|
|
69
77
|
}));
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
emitProgress(emitter, makeEvent("agent:heartbeat", { ...eventBase, stage }, {
|
|
81
|
+
message: silenceMs < stallTimeoutMs
|
|
82
|
+
? `Agent ${provider} active — ${lineCount} lines, ${Math.round(elapsedMs / 1000)}s elapsed`
|
|
83
|
+
: `Agent ${provider} waiting — silent ${Math.round(silenceMs / 1000)}s, ${Math.round(elapsedMs / 1000)}s elapsed`,
|
|
84
|
+
detail: {
|
|
85
|
+
provider,
|
|
86
|
+
elapsedMs,
|
|
87
|
+
silenceMs,
|
|
88
|
+
lineCount,
|
|
89
|
+
bytesReceived,
|
|
90
|
+
status: silenceMs < stallTimeoutMs ? "active" : "waiting"
|
|
91
|
+
}
|
|
92
|
+
}));
|
|
74
93
|
|
|
75
|
-
|
|
76
|
-
|
|
94
|
+
const hardLimit = Number(maxSilenceMs);
|
|
95
|
+
if (!maxSilenceTriggered && Number.isFinite(hardLimit) && hardLimit > 0 && silenceMs >= hardLimit) {
|
|
96
|
+
maxSilenceTriggered = true;
|
|
97
|
+
emitProgress(emitter, makeEvent("agent:stall", { ...eventBase, stage }, {
|
|
98
|
+
status: "fail",
|
|
99
|
+
message: `Agent ${provider} exceeded max silence (${Math.round(hardLimit / 1000)}s)`,
|
|
77
100
|
detail: {
|
|
78
101
|
provider,
|
|
102
|
+
silenceMs,
|
|
79
103
|
elapsedMs,
|
|
80
104
|
lineCount,
|
|
81
|
-
bytesReceived
|
|
105
|
+
bytesReceived,
|
|
106
|
+
severity: "fatal",
|
|
107
|
+
maxSilenceMs: hardLimit
|
|
82
108
|
}
|
|
83
109
|
}));
|
|
110
|
+
if (typeof onMaxSilence === "function") {
|
|
111
|
+
onMaxSilence({
|
|
112
|
+
provider,
|
|
113
|
+
stage,
|
|
114
|
+
silenceMs,
|
|
115
|
+
elapsedMs,
|
|
116
|
+
lineCount,
|
|
117
|
+
bytesReceived,
|
|
118
|
+
maxSilenceMs: hardLimit
|
|
119
|
+
});
|
|
120
|
+
}
|
|
84
121
|
}
|
|
85
122
|
}
|
|
86
123
|
|
|
@@ -92,10 +129,6 @@ export function createStallDetector({
|
|
|
92
129
|
lineCount++;
|
|
93
130
|
bytesReceived += data.line?.length || 0;
|
|
94
131
|
|
|
95
|
-
// Reset stall flags on new activity
|
|
96
|
-
stallWarned = false;
|
|
97
|
-
criticalWarned = false;
|
|
98
|
-
|
|
99
132
|
// Forward to the original callback
|
|
100
133
|
if (onOutput) {
|
|
101
134
|
onOutput(data);
|