@tjamescouch/niki 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 James Couch
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/bin/niki CHANGED
@@ -19,8 +19,8 @@
19
19
  * - Diagnostics only contain counters, never message content
20
20
  */
21
21
 
22
- import { spawn } from 'node:child_process';
23
- import { createWriteStream, writeFileSync, mkdirSync, existsSync } from 'node:fs';
22
+ import { spawn, execSync } from 'node:child_process';
23
+ import { createWriteStream, writeFileSync, mkdirSync, existsSync, readFileSync } from 'node:fs';
24
24
  import { dirname, resolve } from 'node:path';
25
25
  import { parseArgs } from 'node:util';
26
26
 
@@ -37,6 +37,10 @@ Options:
37
37
  --timeout <seconds> Max wall-clock runtime before SIGTERM (default: 3600)
38
38
  --max-sends <n> Max agentchat_send calls per minute (default: 10)
39
39
  --max-tool-calls <n> Max total tool calls per minute (default: 30)
40
+ --stall-timeout <secs> Kill after N seconds of no output (default: 60, 0=disabled)
41
+ --startup-timeout <s> Longer stall timeout until first output (default: 180, 0=use stall-timeout)
42
+ --dead-air-timeout <m> Minutes of zero CPU + zero output before kill (default: 5, 0=disabled)
43
+ --max-nudges <n> Max stdin nudge attempts before kill on stall (default: 3)
40
44
  --log <file> Write diagnostics log to file
41
45
  --state <file> Write state JSON on exit (budget used, reason, etc.)
42
46
  --cooldown <seconds> Grace period after SIGTERM before SIGKILL (default: 5)
@@ -60,6 +64,10 @@ const { values: opts } = parseArgs({
60
64
  timeout: { type: 'string', default: '3600' },
61
65
  'max-sends': { type: 'string', default: '10' },
62
66
  'max-tool-calls': { type: 'string', default: '30' },
67
+ 'stall-timeout': { type: 'string', default: '60' },
68
+ 'startup-timeout': { type: 'string', default: '180' },
69
+ 'dead-air-timeout': { type: 'string', default: '5' },
70
+ 'max-nudges': { type: 'string', default: '3' },
63
71
  log: { type: 'string' },
64
72
  state: { type: 'string' },
65
73
  cooldown: { type: 'string', default: '5' },
@@ -72,6 +80,10 @@ const BUDGET = parseInt(opts.budget, 10);
72
80
  const TIMEOUT_S = parseInt(opts.timeout, 10);
73
81
  const MAX_SENDS = parseInt(opts['max-sends'], 10);
74
82
  const MAX_TOOL_CALLS = parseInt(opts['max-tool-calls'], 10);
83
+ const STALL_TIMEOUT_S = parseInt(opts['stall-timeout'], 10);
84
+ const STARTUP_TIMEOUT_S = parseInt(opts['startup-timeout'], 10);
85
+ const DEAD_AIR_TIMEOUT_M = parseFloat(opts['dead-air-timeout']);
86
+ const MAX_NUDGES = parseInt(opts['max-nudges'], 10);
75
87
  const COOLDOWN_S = parseInt(opts.cooldown, 10);
76
88
  const ABORT_FILE = opts['abort-file'] ? resolve(opts['abort-file']) : null;
77
89
  const POLL_INTERVAL = parseInt(opts['poll-interval'], 10);
@@ -92,8 +104,11 @@ const state = {
92
104
  sendCallsThisMinute: 0,
93
105
  exitCode: null,
94
106
  exitSignal: null,
95
- killedBy: null, // 'budget' | 'timeout' | 'rate-sends' | 'rate-tools' | 'abort' | null
107
+ killedBy: null, // 'budget' | 'timeout' | 'rate-sends' | 'rate-tools' | 'abort' | 'stall' | 'dead-air' | null
96
108
  duration: 0,
109
+ stallEvents: 0,
110
+ nudges: 0,
111
+ deadAirChecks: 0,
97
112
  };
98
113
 
99
114
  // Sliding window for per-minute rate limiting
@@ -231,18 +246,241 @@ function killChild(reason) {
231
246
  }, COOLDOWN_S * 1000);
232
247
  }
233
248
 
249
+ // --- Prompt pattern detection ---
250
+
// Case-insensitive regexes that identify an interactive prompt in child output.
// checkForPrompts() reports a prompt as soon as any one of them matches.
const PROMPT_PATTERNS = [
  /\(y\/n\)/i,
  // One entry covers both "[Y/n]" and "[y/N]": the `i` flag already ignores
  // case, so listing the two spellings separately tested the same regex twice.
  /\[y\/n\]/i,
  /\(yes\/no\)/i,
  /Do you want to trust/i,
  /Do you want to allow/i,
  /Press Enter to continue/i,
  /Are you sure/i,
];
261
+
/**
 * Scans a chunk of child stdout for an interactive prompt.
 *
 * On a match it logs the first 100 characters of the trimmed chunk, counts a
 * stall event, and closes the child's stdin so the prompt receives EOF.
 *
 * @param {string} text - Decoded stdout chunk.
 * @returns {boolean} true when any PROMPT_PATTERNS entry matched.
 */
function checkForPrompts(text) {
  const looksLikePrompt = PROMPT_PATTERNS.some((re) => re.test(text));
  if (!looksLikePrompt) {
    return false;
  }

  const preview = text.trim().substring(0, 100);
  log(`PROMPT detected in stdout: ${preview}`);
  state.stallEvents += 1;
  // Close stdin to dismiss the prompt
  closeStdin();
  return true;
}
274
+
275
+ // --- CPU liveness sampling ---
276
+
277
+ // Reads cumulative CPU time (user+system) for a process tree.
278
+ // Returns total CPU milliseconds, or -1 if unavailable.
279
+ // On Linux: reads /proc/<pid>/stat (works in containers).
280
+ // On macOS: uses ps command as fallback.
281
+ let lastCpuMs = null; // null = no prior sample taken yet
282
+
/**
 * Converts the text printed by `ps -o cputime=` into milliseconds.
 *
 * Accepted shapes: `MM:SS`, `HH:MM:SS`, an optional `DD-` day prefix (the
 * form procps documents for long-running processes) and optional fractional
 * seconds (as printed by BSD-style ps). Anything else is rejected rather than
 * being turned into NaN.
 *
 * @param {string} text - Raw ps output.
 * @returns {number} CPU time in milliseconds, or -1 when the text is not understood.
 */
function parsePsCpuTimeMs(text) {
  const match = /^(?:(\d+)-)?(?:(\d+):)?(\d+):(\d+(?:\.\d+)?)$/.exec(String(text).trim());
  if (!match) return -1;

  const [, days = '0', hours = '0', minutes, seconds] = match;
  const totalSeconds =
    Number(days) * 86_400 + Number(hours) * 3_600 + Number(minutes) * 60 + Number(seconds);
  return Number.isFinite(totalSeconds) ? totalSeconds * 1000 : -1;
}

/**
 * Samples the cumulative CPU time consumed by a process.
 *
 * Reads /proc/<pid>/stat when it exists (utime + stime plus the cutime/cstime
 * accumulated from children); otherwise falls back to `ps -o cputime=`.
 *
 * @param {number} pid - Process id to sample.
 * @returns {number} Total CPU milliseconds, or -1 if it cannot be measured.
 *   Never returns NaN, so callers can rely on the `< 0` "unavailable" check.
 */
function sampleCpuMs(pid) {
  // A missing or non-integer pid can never be sampled; rejecting it here also
  // keeps anything but a plain integer out of the shell command below.
  if (!Number.isInteger(pid) || pid <= 0) return -1;

  try {
    // Linux: /proc/<pid>/stat fields 14 (utime) and 15 (stime) in clock ticks
    const statPath = `/proc/${pid}/stat`;
    if (existsSync(statPath)) {
      const stat = readFileSync(statPath, 'utf8');
      // comm (field 2) may contain spaces/parens, so skip past the LAST ')'
      // and split only what follows it.
      const afterComm = stat.substring(stat.lastIndexOf(')') + 2);
      const fields = afterComm.split(' ');
      // After comm: fields[11] = utime, fields[12] = stime,
      // fields[13] = cutime, fields[14] = cstime
      const ticks = [11, 12, 13, 14]
        .map((index) => parseInt(fields[index], 10) || 0)
        .reduce((sum, value) => sum + value, 0);
      // Convert clock ticks to ms (typically 100 ticks/sec on Linux)
      const ticksPerSec = 100;
      return (ticks / ticksPerSec) * 1000;
    }

    // macOS / fallback: use ps to get cumulative CPU time
    const output = execSync(`ps -o cputime= -p ${pid} 2>/dev/null`, {
      encoding: 'utf8',
      timeout: 3000,
    }).trim();
    if (!output) return -1;
    return parsePsCpuTimeMs(output);
  } catch {
    // Best effort: an unreadable stat file or a failing ps means "unavailable"
    return -1;
  }
}
316
+
/**
 * Reports whether the process has consumed CPU since the previous sample.
 *
 * Keeps the previous reading in the module-level `lastCpuMs`, so every call
 * (from any caller) advances the baseline.
 *
 * @param {number} pid - Process id to sample.
 * @returns {boolean} true if CPU time grew, or if that cannot be determined
 *   (first sample, or no usable measurement); false only when two valid
 *   consecutive samples show no increase.
 */
function hasConsumedCpu(pid) {
  const cpuMs = sampleCpuMs(pid);

  // Can't measure → assume alive (safe default). NaN must be caught here too:
  // it is not `< 0`, and letting it through would store NaN as the baseline
  // and make the comparison below report "no CPU".
  if (!Number.isFinite(cpuMs) || cpuMs < 0) return true;

  const prev = lastCpuMs;
  lastCpuMs = cpuMs;

  // First sample — no delta yet, assume alive
  if (prev === null) return true;

  // If CPU time increased at all, process is doing work
  return cpuMs > prev;
}
331
+
332
+ // --- Dead air detection ---
333
+
334
+ let deadAirStart = null; // Timestamp when dead air began (null = not in dead air)
335
+ let deadAirPollId = null;
336
+
/**
 * One dead-air poll: kills the child once it has shown no CPU growth for
 * DEAD_AIR_TIMEOUT_M minutes.
 *
 * `deadAirStart` marks when the current idle stretch began. It is cleared here
 * when CPU activity is seen (onChildOutput clears it when output arrives).
 * Does nothing if the child is already killed, not spawned, or the feature is
 * disabled.
 */
function checkDeadAir() {
  const disabled = DEAD_AIR_TIMEOUT_M <= 0;
  if (killed || !child || disabled) return;

  state.deadAirChecks += 1;
  const busy = hasConsumedCpu(child.pid);
  const quietSeconds = Math.round((Date.now() - lastOutputTime) / 1000);

  if (busy) {
    // Process is working — end any idle stretch and leave it alone
    if (deadAirStart) {
      const idleSeconds = Math.round((Date.now() - deadAirStart) / 1000);
      log(`Dead air cleared — CPU active after ${idleSeconds}s of silence`);
      deadAirStart = null;
    }
    return;
  }

  // No CPU growth: open an idle stretch if one is not already running
  if (!deadAirStart) {
    deadAirStart = Date.now();
    log(`Dead air started — zero CPU, ${quietSeconds}s silence`);
  }

  const idleMinutes = (Date.now() - deadAirStart) / 60_000;
  if (idleMinutes >= DEAD_AIR_TIMEOUT_M) {
    const wholeMinutes = Math.round(idleMinutes);
    log(`DEAD AIR — zero CPU + zero output for ${wholeMinutes}min (threshold: ${DEAD_AIR_TIMEOUT_M}min)`);
    killChild('dead-air');
    return;
  }

  // Still under the threshold: report progress to one decimal place
  const tenths = Math.round(idleMinutes * 10) / 10;
  log(`Dead air check — zero CPU, ${tenths}/${DEAD_AIR_TIMEOUT_M}min, ${quietSeconds}s silence`);
}
368
+
// Poll interval: a third of the dead-air threshold, clamped to the range
// [2s, 30s] — fast polls for short thresholds, 30s cap for production.
// Falls back to 30s when dead-air detection is disabled.
const DEAD_AIR_POLL_MS = (() => {
  if (!(DEAD_AIR_TIMEOUT_M > 0)) {
    return 30_000;
  }
  const thirdOfThresholdMs = (DEAD_AIR_TIMEOUT_M * 60_000) / 3;
  return Math.min(30_000, Math.max(2_000, thirdOfThresholdMs));
})();
373
+
/**
 * Arms the next dead-air poll and stores its timer id in `deadAirPollId`.
 * Each poll runs checkDeadAir() and re-arms itself unless the child has been
 * killed. No-op when the child is already killed or the feature is disabled.
 */
function scheduleDeadAirPoll() {
  if (killed) return;
  if (DEAD_AIR_TIMEOUT_M <= 0) return;

  const delayMs = jitteredDelay(DEAD_AIR_POLL_MS);
  const tick = () => {
    checkDeadAir();
    if (killed) return;
    scheduleDeadAirPoll();
  };
  deadAirPollId = setTimeout(tick, delayMs);
}
381
+
382
+ // --- Stall detection ---
383
+
384
+ let stallTimer = null;
385
+ let nudgeCount = 0;
386
+ let stdinClosed = false;
387
+ let lastOutputTime = Date.now();
388
+ let gotFirstOutput = false;
389
+
/**
 * Bookkeeping for every chunk the child writes (called by the stdout and
 * stderr handlers): stamps the last-output time, ends any dead-air stretch,
 * records the first output, and restarts the stall timer.
 */
function onChildOutput() {
  const secondsSince = (sinceMs) => Math.round((Date.now() - sinceMs) / 1000);

  lastOutputTime = Date.now();

  // Real output ends a dead-air stretch
  if (deadAirStart) {
    log(`Dead air cleared — received output after ${secondsSince(deadAirStart)}s`);
    deadAirStart = null;
  }

  if (!gotFirstOutput) {
    gotFirstOutput = true;
    const startedMs = new Date(state.startedAt).getTime();
    log(`First output received after ${secondsSince(startedMs)}s — switching to stall-timeout=${STALL_TIMEOUT_S}s`);
  }

  resetStallTimer();
}
403
+
/**
 * Picks the stall timeout that applies right now.
 *
 * @returns {number} STARTUP_TIMEOUT_S while no output has been seen yet and a
 *   positive startup timeout is configured; STALL_TIMEOUT_S otherwise.
 */
function currentStallTimeout() {
  const stillStartingUp = !gotFirstOutput && STARTUP_TIMEOUT_S > 0;
  return stillStartingUp ? STARTUP_TIMEOUT_S : STALL_TIMEOUT_S;
}
409
+
/**
 * Cancels any pending stall timer and, unless the child has been killed or
 * the applicable timeout is zero or negative, arms a new one that fires
 * onStallDetected after currentStallTimeout() seconds.
 */
function resetStallTimer() {
  if (stallTimer) {
    clearTimeout(stallTimer);
  }

  const seconds = currentStallTimeout();
  const shouldArm = !killed && !(seconds <= 0);
  if (shouldArm) {
    stallTimer = setTimeout(onStallDetected, seconds * 1000);
  }
}
416
+
/**
 * Ends the child's stdin (sends EOF) at most once.
 * No-op if stdin was already closed by this function or no child exists.
 * A failure while ending the stream is ignored; the close is logged either way.
 */
function closeStdin() {
  const canClose = Boolean(child) && !stdinClosed;
  if (!canClose) return;

  stdinClosed = true;
  try {
    child.stdin.end();
  } catch {
    /* already closed */
  }
  log('Stdin: closed (EOF)');
}
423
+
/**
 * Stall-timer callback: escalates one step each time the child has been
 * silent for a full stall timeout.
 *
 * Steps, in order: close stdin → count a nudge → defer if the CPU is active
 * (only when dead-air detection is enabled) → kill with reason 'stall'.
 * Every non-final step re-arms the stall timer and returns.
 *
 * NOTE(review): the nudge step only runs when `child.stdin.writableEnded` is
 * false, yet it is reached only after closeStdin() has ended stdin, and it
 * writes nothing to the child — it looks unreachable in practice; confirm the
 * intended behaviour.
 */
function onStallDetected() {
  if (killed) return;

  state.stallEvents += 1;
  const quietSeconds = Math.round((Date.now() - lastOutputTime) / 1000);
  log(`STALL — no output for ${quietSeconds}s (nudges: ${nudgeCount}/${MAX_NUDGES})`);

  // Step 1: make sure stdin is closed, then give the child another window
  if (!stdinClosed) {
    closeStdin();
    resetStallTimer();
    return;
  }

  // Step 2: spend a nudge, if any remain and stdin has not been ended
  const canNudge = nudgeCount < MAX_NUDGES && !child.stdin.writableEnded;
  if (canNudge) {
    nudgeCount += 1;
    state.nudges = nudgeCount;
    log(`Stall nudge #${nudgeCount}`);
    resetStallTimer();
    return;
  }

  // Step 3: with dead-air detection enabled, a child that is still using CPU
  // is left to the dead-air poller instead of being stall-killed.
  const deadAirEnabled = DEAD_AIR_TIMEOUT_M > 0;
  if (deadAirEnabled && child && hasConsumedCpu(child.pid)) {
    log(`Stall deferred — process has CPU activity, deferring to dead-air detection`);
    resetStallTimer();
    return;
  }

  // Step 4: nothing left to try
  killChild('stall');
}
458
+
234
459
  // --- Spawn child process ---
235
460
 
236
461
  log(`Starting: ${childCmd} ${childArgs.join(' ').substring(0, 100)}...`);
237
- log(`Budget: ${BUDGET} tokens | Timeout: ${TIMEOUT_S}s | Max sends: ${MAX_SENDS}/min | Max tools: ${MAX_TOOL_CALLS}/min`);
462
+ log(`Budget: ${BUDGET} tokens | Timeout: ${TIMEOUT_S}s | Startup: ${STARTUP_TIMEOUT_S}s | Stall: ${STALL_TIMEOUT_S}s | Dead air: ${DEAD_AIR_TIMEOUT_M}min | Max sends: ${MAX_SENDS}/min | Max tools: ${MAX_TOOL_CALLS}/min`);
238
463
 
239
464
  child = spawn(childCmd, childArgs, {
240
- stdio: ['inherit', 'inherit', 'pipe'], // stdin/stdout pass through, stderr captured
465
+ stdio: ['pipe', 'pipe', 'pipe'], // All piped: niki controls stdin, monitors stdout+stderr
241
466
  env: process.env, // Inherit env (tokens stay in env, never logged)
242
467
  });
243
468
 
244
469
  state.pid = child.pid;
245
470
 
471
+ // Close stdin immediately — claude -p should never need interactive input.
472
+ // This prevents blocking on trust prompts, permission prompts, or stdin reads.
473
+ closeStdin();
474
+
475
+ // --- Monitor stdout ---
476
+
477
+ child.stdout.on('data', (chunk) => {
478
+ // Forward to our stdout (preserves runner's | tee pipeline)
479
+ process.stdout.write(chunk);
480
+ onChildOutput();
481
+ checkForPrompts(chunk.toString());
482
+ });
483
+
246
484
  // --- Monitor stderr ---
247
485
 
248
486
  let stderrBuffer = '';
@@ -252,6 +490,7 @@ child.stderr.on('data', (chunk) => {
252
490
 
253
491
  // Always forward stderr to our stderr (so supervisor captures it)
254
492
  process.stderr.write(chunk);
493
+ onChildOutput();
255
494
 
256
495
  // Buffer and parse line by line
257
496
  stderrBuffer += text;
@@ -277,6 +516,18 @@ child.stderr.on('data', (chunk) => {
277
516
  }
278
517
  });
279
518
 
519
+ // Start stall detection
520
+ if (STALL_TIMEOUT_S > 0 || STARTUP_TIMEOUT_S > 0) {
521
+ log(`Stall detection: startup-timeout=${STARTUP_TIMEOUT_S}s, stall-timeout=${STALL_TIMEOUT_S}s, max-nudges=${MAX_NUDGES}`);
522
+ resetStallTimer();
523
+ }
524
+
525
+ // Start dead air detection
526
+ if (DEAD_AIR_TIMEOUT_M > 0) {
527
+ log(`Dead air detection: ${DEAD_AIR_TIMEOUT_M}min threshold, ${Math.round(DEAD_AIR_POLL_MS / 1000)}s poll interval`);
528
+ scheduleDeadAirPoll();
529
+ }
530
+
280
531
  // --- Abort file polling (with jitter) ---
281
532
 
282
533
  let abortPollId = null;
@@ -315,12 +566,15 @@ const timeoutId = setTimeout(() => {
315
566
 
316
567
  child.on('exit', (code, signal) => {
317
568
  clearTimeout(timeoutId);
569
+ if (stallTimer) clearTimeout(stallTimer);
318
570
  if (abortPollId) clearTimeout(abortPollId);
571
+ if (deadAirPollId) clearTimeout(deadAirPollId);
319
572
  state.exitCode = code;
320
573
  state.exitSignal = signal;
321
574
  state.duration = Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000);
322
575
 
323
- log(`Exit — code: ${code} signal: ${signal} | tokens: ${state.tokensTotal} | tools: ${state.toolCalls} | sends: ${state.sendCalls} | duration: ${state.duration}s${state.killedBy ? ` | killed: ${state.killedBy}` : ''}`);
576
+ state.gotFirstOutput = gotFirstOutput;
577
+ log(`Exit — code: ${code} signal: ${signal} | tokens: ${state.tokensTotal} | tools: ${state.toolCalls} | sends: ${state.sendCalls} | duration: ${state.duration}s | output: ${gotFirstOutput}${state.killedBy ? ` | killed: ${state.killedBy}` : ''}`);
324
578
  writeState();
325
579
 
326
580
  if (logStream) logStream.end();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tjamescouch/niki",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "description": "Deterministic process supervisor for AI agents — token budgets, rate limits, and abort control",
5
5
  "bin": {
6
6
  "niki": "./bin/niki"
@@ -0,0 +1,275 @@
1
+ #!/bin/bash
2
+ # test-niki.sh — Unit tests for niki process supervisor
3
+ #
4
+ # Tests stdout forwarding, stdin management, stall detection, startup timeout,
5
+ # exit-code passthrough, abort file, SIGTERM forwarding, and dead air detection.
6
+ #
7
+ # Usage: ./tests/test-niki.sh [--verbose]
8
+
9
+ set -euo pipefail
10
+
11
+ NIKI="$(dirname "$0")/../bin/niki"
12
+ PASSED=0
13
+ FAILED=0
14
+ VERBOSE="${1:-}"
15
+
# ANSI colour helpers: print $1 wrapped in an SGR sequence, without a trailing newline.

# red TEXT — print TEXT in red
red() {
  printf '\033[31m%s\033[0m' "$1"
}

# green TEXT — print TEXT in green
green() {
  printf '\033[32m%s\033[0m' "$1"
}

# bold TEXT — print TEXT in bold
bold() {
  printf '\033[1m%s\033[0m' "$1"
}
19
+
# run_test NAME EXPECTED_EXIT CMD [ARGS...]
#
# Runs CMD with stdout and stderr captured together and compares its exit
# status with EXPECTED_EXIT. Prints a PASS/FAIL line, bumps the global PASSED
# or FAILED counter, and shows the captured output on failure (or on success
# when the script was started with --verbose).
run_test() {
  local name="$1" expected_exit="$2"
  shift 2

  printf " %-50s " "$name"

  # `|| actual_exit=$?` records the status and keeps `set -e` from aborting
  # the whole script when the command under test fails.
  local actual_exit=0
  local output
  output=$("$@" 2>&1) || actual_exit=$?

  if [ "$actual_exit" -ne "$expected_exit" ]; then
    red "FAIL"
    echo " (expected exit $expected_exit, got $actual_exit)"
    FAILED=$((FAILED + 1))
    echo "$output" | sed 's/^/ | /'
    return
  fi

  green "PASS"
  echo " (exit $actual_exit)"
  PASSED=$((PASSED + 1))
  if [ "$VERBOSE" = "--verbose" ]; then
    echo "$output" | sed 's/^/ | /'
  fi
}
46
+
# run_test_output NAME PATTERN CMD [ARGS...]
#
# Runs CMD with stdout and stderr captured together and passes when the
# captured text contains a line matching the extended regex PATTERN. The
# command's exit status is not part of the verdict. Prints a PASS/FAIL line,
# bumps the global PASSED or FAILED counter, and shows the captured output on
# failure (or on success when the script was started with --verbose).
run_test_output() {
  local name="$1"
  local expected_pattern="$2"
  shift 2

  printf " %-50s " "$name"

  local output
  # The status is recorded only so a failing command does not trip `set -e`.
  local actual_exit=0
  output=$("$@" 2>&1) || actual_exit=$?

  # Feed grep from a here-string rather than `echo ... | grep -q`: the script
  # runs with `set -o pipefail`, and `grep -q` exits on the first match, so a
  # large output could kill `echo` with SIGPIPE and turn a real match into a
  # false FAIL. `--` keeps a pattern starting with '-' from being read as an option.
  if grep -qE -- "$expected_pattern" <<<"$output"; then
    green "PASS"
    echo ""
    PASSED=$((PASSED + 1))
    if [ "$VERBOSE" = "--verbose" ]; then
      echo "$output" | sed 's/^/ | /'
    fi
  else
    red "FAIL"
    echo " (pattern '$expected_pattern' not found)"
    FAILED=$((FAILED + 1))
    echo "$output" | sed 's/^/ | /'
  fi
}
72
+
73
+ echo ""
74
+ bold "=== niki unit tests ==="; echo ""
75
+ echo ""
76
+
77
+ # ---- Stdout forwarding ----
78
+
79
+ bold "Stdout forwarding"; echo ""
80
+
81
+ run_test_output \
82
+ "echo passes through stdout" \
83
+ "^hello from niki$" \
84
+ timeout 10 node "$NIKI" --stall-timeout 5 -- echo "hello from niki"
85
+
86
+ run_test \
87
+ "echo exits cleanly (code 0)" \
88
+ 0 \
89
+ timeout 10 node "$NIKI" --stall-timeout 5 -- echo "test"
90
+
91
+ run_test_output \
92
+ "multi-line output preserved" \
93
+ "line2" \
94
+ timeout 10 node "$NIKI" --stall-timeout 5 -- sh -c 'echo line1; echo line2; echo line3'
95
+
96
+ echo ""
97
+
98
+ # ---- Stdin management ----
99
+
100
+ bold "Stdin management"; echo ""
101
+
102
+ run_test_output \
103
+ "stdin closed immediately on spawn" \
104
+ "Stdin: closed" \
105
+ timeout 10 node "$NIKI" --stall-timeout 5 -- echo "ok"
106
+
107
+ run_test \
108
+ "cat exits on EOF (stdin closed)" \
109
+ 0 \
110
+ timeout 10 node "$NIKI" --stall-timeout 5 -- cat
111
+
112
+ echo ""
113
+
114
+ # ---- Stall detection ----
115
+
116
+ bold "Stall detection"; echo ""
117
+
118
+ run_test_output \
119
+ "stall kills silent process" \
120
+ "STALL.*no output" \
121
+ timeout 10 node "$NIKI" --stall-timeout 2 --startup-timeout 0 --max-nudges 0 -- sleep 30
122
+
123
+ run_test_output \
124
+ "stall kill reason logged" \
125
+ "KILL.*reason: stall" \
126
+ timeout 10 node "$NIKI" --stall-timeout 2 --startup-timeout 0 --max-nudges 0 -- sleep 30
127
+
128
+ run_test \
129
+ "stall kill exits non-zero" \
130
+ 1 \
131
+ timeout 10 node "$NIKI" --stall-timeout 2 --startup-timeout 0 --max-nudges 0 -- sleep 30
132
+
133
+ run_test_output \
134
+ "stall disabled when timeout=0" \
135
+ "Exit.*code: 0" \
136
+ timeout 5 node "$NIKI" --stall-timeout 0 -- sh -c 'sleep 1; echo done'
137
+
138
+ echo ""
139
+
140
+ # ---- Stall timeout precision ----
141
+
142
+ bold "Stall timing"; echo ""
143
+
144
+ # Process that keeps emitting output inside the stall window — each line should reset the stall timer, so the child exits normally
145
+ run_test_output \
146
+ "stall timer resets on output" \
147
+ "Exit.*code: 0" \
148
+ timeout 10 node "$NIKI" --stall-timeout 3 --startup-timeout 0 -- sh -c 'echo tick; sleep 1; echo tick; sleep 1; echo done'
149
+
150
+ echo ""
151
+
152
+ # ---- Startup timeout ----
153
+
154
+ bold "Startup timeout"; echo ""
155
+
156
+ # Startup timeout gives longer grace period before first output
157
+ run_test_output \
158
+ "startup-timeout used before first output" \
159
+ "startup-timeout=5s" \
160
+ timeout 10 node "$NIKI" --stall-timeout 2 --startup-timeout 5 -- sh -c 'sleep 3; echo hello'
161
+
162
+ # After first output, switches to stall-timeout
163
+ run_test_output \
164
+ "switches to stall-timeout after first output" \
165
+ "switching to stall-timeout" \
166
+ timeout 10 node "$NIKI" --stall-timeout 3 --startup-timeout 10 -- sh -c 'echo first; sleep 1; echo done'
167
+
168
+ echo ""
169
+
170
+ # ---- Budget/timeout (existing features, regression) ----
171
+
172
+ bold "Budget and timeout (regression)"; echo ""
173
+
174
+ run_test_output \
175
+ "wall-clock timeout kills" \
176
+ "KILL.*reason: timeout" \
177
+ timeout 10 node "$NIKI" --timeout 2 --stall-timeout 0 -- sleep 30
178
+
179
+ run_test \
180
+ "timeout kill exits non-zero" \
181
+ 1 \
182
+ timeout 10 node "$NIKI" --timeout 2 --stall-timeout 0 -- sleep 30
183
+
184
+ echo ""
185
+
186
+ # ---- Exit code passthrough ----
187
+
188
+ bold "Exit code passthrough"; echo ""
189
+
190
+ run_test \
191
+ "child exit 0 → niki exit 0" \
192
+ 0 \
193
+ timeout 10 node "$NIKI" --stall-timeout 5 -- sh -c 'exit 0'
194
+
195
+ run_test \
196
+ "child exit 1 → niki exit 1" \
197
+ 1 \
198
+ timeout 10 node "$NIKI" --stall-timeout 5 -- sh -c 'exit 1'
199
+
200
+ run_test \
201
+ "child exit 42 → niki exit 42" \
202
+ 42 \
203
+ timeout 10 node "$NIKI" --stall-timeout 5 -- sh -c 'exit 42'
204
+
205
+ echo ""
206
+
207
+ # ---- Abort file ----
208
+
209
+ bold "Abort file"; echo ""
210
+
211
+ ABORT_FILE=$(mktemp)
212
+ rm -f "$ABORT_FILE"
213
+
214
+ run_test_output \
215
+ "abort file kills process" \
216
+ "KILL.*reason: abort" \
217
+ timeout 10 sh -c "node $NIKI --stall-timeout 0 --abort-file $ABORT_FILE -- sh -c 'sleep 1; echo still here; sleep 30' & PID=\$!; sleep 2; touch $ABORT_FILE; wait \$PID 2>/dev/null; echo done"
218
+
219
+ rm -f "$ABORT_FILE"
220
+
221
+ echo ""
222
+
223
+ # ---- SIGTERM forwarding ----
224
+
225
+ bold "SIGTERM forwarding"; echo ""
226
+
227
+ # niki should forward SIGTERM to child and exit
228
+ run_test_output \
229
+ "SIGTERM forwarded to child" \
230
+ "Received SIGTERM" \
231
+ timeout 10 sh -c "node $NIKI --stall-timeout 0 -- sh -c 'echo started; sleep 30' & PID=\$!; sleep 1; kill -TERM \$PID; wait \$PID 2>/dev/null; echo done"
232
+
233
+ echo ""
234
+
235
+ # ---- Dead air detection ----
236
+
237
+ bold "Dead air detection"; echo ""
238
+
239
+ # Dead air kills silent process with zero CPU (sleep has ~zero CPU)
240
+ run_test_output \
241
+ "dead air kills zero-CPU process" \
242
+ "DEAD AIR.*zero CPU" \
243
+ timeout 30 node "$NIKI" --stall-timeout 0 --dead-air-timeout 0.1 -- sleep 30
244
+
245
+ run_test_output \
246
+ "dead air kill reason logged" \
247
+ "KILL.*reason: dead-air" \
248
+ timeout 30 node "$NIKI" --stall-timeout 0 --dead-air-timeout 0.1 -- sleep 30
249
+
250
+ # Dead air defers for CPU-active processes (busy loop uses CPU)
251
+ run_test_output \
252
+ "dead air defers when CPU active" \
253
+ "Exit.*code: 0" \
254
+ timeout 15 node "$NIKI" --stall-timeout 0 --dead-air-timeout 0.1 -- sh -c 'i=0; while [ $i -lt 2000000 ]; do i=$((i+1)); done; echo done'
255
+
256
+ # Dead air disabled when timeout=0
257
+ run_test_output \
258
+ "dead air disabled when timeout=0" \
259
+ "Exit.*code: 0" \
260
+ timeout 10 node "$NIKI" --stall-timeout 0 --dead-air-timeout 0 -- sh -c 'sleep 1; echo done'
261
+
262
+ echo ""
263
+
264
+ # ---- Summary ----
265
+
266
+ echo "────────────────────────────────────────────────"
267
+ TOTAL=$((PASSED + FAILED))
268
+ if [ "$FAILED" -eq 0 ]; then
269
+ green "All $TOTAL tests passed"; echo ""
270
+ else
271
+ red "$FAILED/$TOTAL tests failed"; echo ""
272
+ fi
273
+ echo ""
274
+
275
+ exit "$FAILED"