@tjamescouch/niki 0.5.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 James Couch
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/bin/niki CHANGED
@@ -8,6 +8,7 @@
8
8
  * - Wall-clock timeout (kill if exceeded)
9
9
  * - Tool-call rate limiting (kill if agent floods)
10
10
  * - Diagnostics logging
11
+ * - Automatic restart on exit (optional)
11
12
  *
12
13
  * Usage:
13
14
  * niki [options] -- <command> [args...]
@@ -38,18 +39,24 @@ Options:
38
39
  --max-sends <n> Max agentchat_send calls per minute (default: 10)
39
40
  --max-tool-calls <n> Max total tool calls per minute (default: 30)
40
41
  --stall-timeout <secs> Kill after N seconds of no output (default: 60, 0=disabled)
41
- --startup-timeout <s> Longer stall timeout until first output (default: 180, 0=no stall until first output)
42
+ --startup-timeout <s> Longer stall timeout until first output (default: 180, 0=use stall-timeout)
42
43
  --dead-air-timeout <m> Minutes of zero CPU + zero output before kill (default: 5, 0=disabled)
43
44
  --max-nudges <n> Max stdin nudge attempts before kill on stall (default: 3)
44
45
  --log <file> Write diagnostics log to file
46
+ --log-level <level> Minimum log level: debug, info, warn, error (default: info)
47
+ --log-json Emit logs as JSON lines (for machine parsing)
45
48
  --state <file> Write state JSON on exit (budget used, reason, etc.)
46
49
  --cooldown <seconds> Grace period after SIGTERM before SIGKILL (default: 5)
47
50
  --abort-file <path> Poll this file for external abort signal
48
51
  --poll-interval <ms> Base poll interval in ms for abort file (default: 1000)
52
+ --restart Restart the child process when it exits (default: off)
53
+ --max-restarts <n> Max restart attempts, 0=unlimited (default: 0)
54
+ --restart-delay <secs> Delay between restarts with ±30% jitter (default: 5)
49
55
 
50
56
  Examples:
51
57
  niki --budget 500000 -- claude -p "your prompt" --verbose
52
- niki --timeout 1800 --max-sends 5 -- claude -p "..." --model sonnet --verbose`);
58
+ niki --timeout 1800 --max-sends 5 -- claude -p "..." --model sonnet --verbose
59
+ niki --restart --max-restarts 10 -- gro --model gpt-5.2 "your prompt"`);
53
60
  process.exit(1);
54
61
  }
55
62
 
@@ -69,10 +76,15 @@ const { values: opts } = parseArgs({
69
76
  'dead-air-timeout': { type: 'string', default: '5' },
70
77
  'max-nudges': { type: 'string', default: '3' },
71
78
  log: { type: 'string' },
79
+ 'log-level': { type: 'string', default: 'info' },
80
+ 'log-json': { type: 'boolean', default: false },
72
81
  state: { type: 'string' },
73
82
  cooldown: { type: 'string', default: '5' },
74
83
  'abort-file': { type: 'string' },
75
84
  'poll-interval': { type: 'string', default: '1000' },
85
+ restart: { type: 'boolean', default: false },
86
+ 'max-restarts': { type: 'string', default: '0' },
87
+ 'restart-delay': { type: 'string', default: '5' },
76
88
  },
77
89
  });
78
90
 
@@ -89,7 +101,15 @@ const ABORT_FILE = opts['abort-file'] ? resolve(opts['abort-file']) : null;
89
101
  const POLL_INTERVAL = parseInt(opts['poll-interval'], 10);
90
102
  const LOG_FILE = opts.log;
91
103
  const STATE_FILE = opts.state;
104
+ const RESTART = opts.restart;
105
+ const MAX_RESTARTS = parseInt(opts['max-restarts'], 10);
106
+ const RESTART_DELAY_S = parseFloat(opts['restart-delay']);
107
+ const LOG_JSON = opts['log-json'];
92
108
 
109
+ // --- Log levels ---
110
+
111
+ const LOG_LEVELS = { debug: 0, info: 1, warn: 2, error: 3 };
112
+ const LOG_LEVEL = LOG_LEVELS[opts['log-level']] ?? LOG_LEVELS.info;
93
113
  // --- State ---
94
114
 
95
115
  const state = {
@@ -109,12 +129,17 @@ const state = {
109
129
  stallEvents: 0,
110
130
  nudges: 0,
111
131
  deadAirChecks: 0,
132
+ restarts: 0,
112
133
  };
113
134
 
114
135
  // Sliding window for per-minute rate limiting
115
136
  const toolCallTimestamps = [];
116
137
  const sendCallTimestamps = [];
117
138
 
139
+ // Budget threshold tracking — warn once at each level
140
+ const BUDGET_THRESHOLDS = [0.5, 0.75, 0.9];
141
+ const budgetWarned = new Set();
142
+
118
143
  // --- Logging ---
119
144
 
120
145
  let logStream = null;
@@ -123,11 +148,24 @@ if (LOG_FILE) {
123
148
  logStream = createWriteStream(resolve(LOG_FILE), { flags: 'a' });
124
149
  }
125
150
 
126
- function log(msg) {
127
- const line = `[${new Date().toISOString()}] ${msg}`;
128
- if (logStream) logStream.write(line + '\n');
129
- // Also write to stderr so supervisor can capture it
130
- process.stderr.write(`[niki] ${line}\n`);
151
+ function log(msg, level = 'info', fields = null) {
152
+ const numLevel = LOG_LEVELS[level] ?? LOG_LEVELS.info;
153
+ if (numLevel < LOG_LEVEL) return;
154
+
155
+ const ts = new Date().toISOString();
156
+
157
+ if (LOG_JSON) {
158
+ const entry = { ts, level, msg, ...state };
159
+ if (fields) Object.assign(entry, fields);
160
+ const json = JSON.stringify(entry);
161
+ if (logStream) logStream.write(json + '\n');
162
+ process.stderr.write(json + '\n');
163
+ } else {
164
+ const prefix = level === 'info' ? '' : `[${level.toUpperCase()}] `;
165
+ const line = `[${ts}] ${prefix}${msg}`;
166
+ if (logStream) logStream.write(line + '\n');
167
+ process.stderr.write(`[niki] ${line}\n`);
168
+ }
131
169
  }
132
170
 
133
171
  function writeState() {
@@ -141,6 +179,20 @@ function writeState() {
141
179
  }
142
180
  }
143
181
 
182
+ // --- Budget threshold warnings ---
183
+
184
+ function checkBudgetThresholds() {
185
+ if (BUDGET <= 0) return;
186
+ const pct = state.tokensTotal / BUDGET;
187
+ for (const threshold of BUDGET_THRESHOLDS) {
188
+ if (pct >= threshold && !budgetWarned.has(threshold)) {
189
+ budgetWarned.add(threshold);
190
+ const used = state.tokensTotal;
191
+ const remaining = BUDGET - used;
192
+ log(`Budget ${Math.round(threshold * 100)}% — ${used.toLocaleString()}/${BUDGET.toLocaleString()} tokens used, ${remaining.toLocaleString()} remaining`, 'warn');
193
+ }
194
+ }
195
+ }
144
196
  // --- Token parsing from stderr ---
145
197
 
146
198
  // Claude --verbose outputs token usage in stderr. Patterns vary by version.
@@ -163,6 +215,7 @@ const TOKEN_PATTERNS = [
163
215
  ];
164
216
 
165
217
  function parseTokens(line) {
218
+ let changed = false;
166
219
  for (const { regex, field } of TOKEN_PATTERNS) {
167
220
  regex.lastIndex = 0;
168
221
  let match;
@@ -170,30 +223,39 @@ function parseTokens(line) {
170
223
  const count = parseInt(match[1], 10);
171
224
  if (isNaN(count) || count <= 0) continue;
172
225
  if (field === 'in') {
173
- state.tokensIn = Math.max(state.tokensIn, count);
226
+ if (count > state.tokensIn) { state.tokensIn = count; changed = true; }
174
227
  } else {
175
- state.tokensOut = Math.max(state.tokensOut, count);
228
+ if (count > state.tokensOut) { state.tokensOut = count; changed = true; }
176
229
  }
177
230
  state.tokensTotal = state.tokensIn + state.tokensOut;
178
231
  }
179
232
  }
233
+ if (changed) {
234
+ log(`Tokens — in: ${state.tokensIn.toLocaleString()} out: ${state.tokensOut.toLocaleString()} total: ${state.tokensTotal.toLocaleString()}/${BUDGET.toLocaleString()} (${Math.round(state.tokensTotal / BUDGET * 100)}%)`, 'debug');
235
+ checkBudgetThresholds();
236
+ }
180
237
  }
181
238
 
182
239
  // --- Tool call detection from stderr ---
183
240
 
184
241
  // Claude --verbose logs tool calls. We detect sends specifically.
185
- const TOOL_CALL_PATTERN = /(?:Using tool|Tool call|tool_use).*?(\w+)/i;
242
+ const TOOL_CALL_PATTERN = /(?:Using tool|Tool call|tool_use)[:\s]*(\S+)/i;
186
243
  const SEND_PATTERN = /agentchat_send/i;
187
244
 
188
245
  function parseToolCall(line) {
189
- if (TOOL_CALL_PATTERN.test(line)) {
246
+ const toolMatch = line.match(TOOL_CALL_PATTERN);
247
+ if (toolMatch) {
190
248
  const now = Date.now();
249
+ const toolName = toolMatch[1] || 'unknown';
191
250
  state.toolCalls++;
192
251
  toolCallTimestamps.push(now);
193
252
 
194
253
  if (SEND_PATTERN.test(line)) {
195
254
  state.sendCalls++;
196
255
  sendCallTimestamps.push(now);
256
+ log(`Tool call #${state.toolCalls}: ${toolName} (send #${state.sendCalls}, ${state.sendCallsThisMinute + 1}/${MAX_SENDS}/min)`, 'info');
257
+ } else {
258
+ log(`Tool call #${state.toolCalls}: ${toolName} (${state.toolCallsThisMinute + 1}/${MAX_TOOL_CALLS}/min)`, 'debug');
197
259
  }
198
260
  }
199
261
  }
@@ -214,6 +276,14 @@ function checkRateLimits() {
214
276
  state.toolCallsThisMinute = toolCallTimestamps.length;
215
277
  state.sendCallsThisMinute = sendCallTimestamps.length;
216
278
 
279
+ // Warn at 80% of rate limits
280
+ if (state.sendCallsThisMinute === Math.ceil(MAX_SENDS * 0.8)) {
281
+ log(`Rate warning — sends at ${state.sendCallsThisMinute}/${MAX_SENDS}/min (80% threshold)`, 'warn');
282
+ }
283
+ if (state.toolCallsThisMinute === Math.ceil(MAX_TOOL_CALLS * 0.8)) {
284
+ log(`Rate warning — tool calls at ${state.toolCallsThisMinute}/${MAX_TOOL_CALLS}/min (80% threshold)`, 'warn');
285
+ }
286
+
217
287
  if (sendCallTimestamps.length > MAX_SENDS) {
218
288
  return 'rate-sends';
219
289
  }
@@ -222,7 +292,6 @@ function checkRateLimits() {
222
292
  }
223
293
  return null;
224
294
  }
225
-
226
295
  // --- Kill logic ---
227
296
 
228
297
  let child = null;
@@ -232,7 +301,7 @@ function killChild(reason) {
232
301
  if (killed || !child) return;
233
302
  killed = true;
234
303
  state.killedBy = reason;
235
- log(`KILL — reason: ${reason} | tokens: ${state.tokensTotal}/${BUDGET} | sends: ${state.sendCallsThisMinute}/min | tools: ${state.toolCallsThisMinute}/min`);
304
+ log(`KILL — reason: ${reason} | tokens: ${state.tokensTotal}/${BUDGET} | sends: ${state.sendCallsThisMinute}/min | tools: ${state.toolCallsThisMinute}/min`, 'error');
236
305
 
237
306
  child.kill('SIGTERM');
238
307
 
@@ -262,7 +331,7 @@ const PROMPT_PATTERNS = [
262
331
  function checkForPrompts(text) {
263
332
  for (const pattern of PROMPT_PATTERNS) {
264
333
  if (pattern.test(text)) {
265
- log(`PROMPT detected in stdout: ${text.trim().substring(0, 100)}`);
334
+ log(`PROMPT detected in stdout: ${text.trim().substring(0, 100)}`, 'warn');
266
335
  state.stallEvents++;
267
336
  // Close stdin to dismiss the prompt
268
337
  closeStdin();
@@ -329,6 +398,13 @@ function hasConsumedCpu(pid) {
329
398
  return cpuMs > prev;
330
399
  }
331
400
 
401
+ // --- Jitter utility ---
402
+
403
+ function jitteredDelay(base) {
404
+ // ±30% jitter
405
+ const jitter = base * 0.3;
406
+ return base + (Math.random() * 2 * jitter - jitter);
407
+ }
332
408
  // --- Dead air detection ---
333
409
 
334
410
  let deadAirStart = null; // Timestamp when dead air began (null = not in dead air)
@@ -344,7 +420,7 @@ function checkDeadAir() {
344
420
  if (cpuActive) {
345
421
  // Process is working — reset dead air, let it cook
346
422
  if (deadAirStart) {
347
- log(`Dead air cleared — CPU active after ${Math.round((Date.now() - deadAirStart) / 1000)}s of silence`);
423
+ log(`Dead air cleared — CPU active after ${Math.round((Date.now() - deadAirStart) / 1000)}s of silence`, 'info');
348
424
  deadAirStart = null;
349
425
  }
350
426
  return;
@@ -353,17 +429,17 @@ function checkDeadAir() {
353
429
  // Zero CPU + zero output
354
430
  if (!deadAirStart) {
355
431
  deadAirStart = Date.now();
356
- log(`Dead air started — zero CPU, ${silenceSec}s silence`);
432
+ log(`Dead air started — zero CPU, ${silenceSec}s silence`, 'warn');
357
433
  }
358
434
 
359
435
  const deadAirMin = (Date.now() - deadAirStart) / 60_000;
360
436
  if (deadAirMin >= DEAD_AIR_TIMEOUT_M) {
361
- log(`DEAD AIR — zero CPU + zero output for ${Math.round(deadAirMin)}min (threshold: ${DEAD_AIR_TIMEOUT_M}min)`);
437
+ log(`DEAD AIR — zero CPU + zero output for ${Math.round(deadAirMin)}min (threshold: ${DEAD_AIR_TIMEOUT_M}min)`, 'error');
362
438
  killChild('dead-air');
363
439
  return;
364
440
  }
365
441
 
366
- log(`Dead air check — zero CPU, ${Math.round(deadAirMin * 10) / 10}/${DEAD_AIR_TIMEOUT_M}min, ${silenceSec}s silence`);
442
+ log(`Dead air check — zero CPU, ${Math.round(deadAirMin * 10) / 10}/${DEAD_AIR_TIMEOUT_M}min, ${silenceSec}s silence`, 'debug');
367
443
  }
368
444
 
369
445
  // Poll interval: min(30s, threshold/3) — fast polls for short thresholds, 30s cap for production
@@ -391,12 +467,12 @@ function onChildOutput() {
391
467
  lastOutputTime = Date.now();
392
468
  // Reset dead air — got real output
393
469
  if (deadAirStart) {
394
- log(`Dead air cleared — received output after ${Math.round((Date.now() - deadAirStart) / 1000)}s`);
470
+ log(`Dead air cleared — received output after ${Math.round((Date.now() - deadAirStart) / 1000)}s`, 'info');
395
471
  deadAirStart = null;
396
472
  }
397
473
  if (!gotFirstOutput) {
398
474
  gotFirstOutput = true;
399
- log(`First output received after ${Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000)}s — switching to stall-timeout=${STALL_TIMEOUT_S}s`);
475
+ log(`First output received after ${Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000)}s — switching to stall-timeout=${STALL_TIMEOUT_S}s`, 'info');
400
476
  }
401
477
  resetStallTimer();
402
478
  }
@@ -409,8 +485,6 @@ function currentStallTimeout() {
409
485
 
410
486
  function resetStallTimer() {
411
487
  if (stallTimer) clearTimeout(stallTimer);
412
- // startup-timeout=0 means "wait indefinitely for first output"
413
- if (!gotFirstOutput && STARTUP_TIMEOUT_S === 0) return;
414
488
  const timeout = currentStallTimeout();
415
489
  if (killed || timeout <= 0) return;
416
490
  stallTimer = setTimeout(onStallDetected, timeout * 1000);
@@ -420,14 +494,14 @@ function closeStdin() {
420
494
  if (stdinClosed || !child) return;
421
495
  stdinClosed = true;
422
496
  try { child.stdin.end(); } catch { /* already closed */ }
423
- log('Stdin: closed (EOF)');
497
+ log('Stdin: closed (EOF)', 'debug');
424
498
  }
425
499
 
426
500
  function onStallDetected() {
427
501
  if (killed) return;
428
502
  state.stallEvents++;
429
503
  const silence = Math.round((Date.now() - lastOutputTime) / 1000);
430
- log(`STALL — no output for ${silence}s (nudges: ${nudgeCount}/${MAX_NUDGES})`);
504
+ log(`STALL — no output for ${silence}s (nudges: ${nudgeCount}/${MAX_NUDGES})`, 'warn');
431
505
 
432
506
  // Escalation: close stdin → nudge → check CPU → kill
433
507
  if (!stdinClosed) {
@@ -439,7 +513,7 @@ function onStallDetected() {
439
513
  if (nudgeCount < MAX_NUDGES && !child.stdin.writableEnded) {
440
514
  nudgeCount++;
441
515
  state.nudges = nudgeCount;
442
- log(`Stall nudge #${nudgeCount}`);
516
+ log(`Stall nudge #${nudgeCount}`, 'info');
443
517
  resetStallTimer();
444
518
  return;
445
519
  }
@@ -449,7 +523,7 @@ function onStallDetected() {
449
523
  if (DEAD_AIR_TIMEOUT_M > 0 && child) {
450
524
  const cpuActive = hasConsumedCpu(child.pid);
451
525
  if (cpuActive) {
452
- log(`Stall deferred — process has CPU activity, deferring to dead-air detection`);
526
+ log(`Stall deferred — process has CPU activity, deferring to dead-air detection`, 'info');
453
527
  resetStallTimer();
454
528
  return;
455
529
  }
@@ -457,138 +531,195 @@ function onStallDetected() {
457
531
 
458
532
  killChild('stall');
459
533
  }
534
+ // --- Restart logic ---
535
+
536
+ // Reasons that should NOT trigger a restart (hard limits / operator intent)
537
+ const NO_RESTART_REASONS = new Set(['budget', 'rate-sends', 'rate-tools', 'abort']);
538
+ let nikiTerminated = false; // Set when niki itself receives SIGTERM/SIGINT
539
+
540
+ function shouldRestart(code, signal) {
541
+ if (!RESTART) return false;
542
+ if (nikiTerminated) return false;
543
+ if (state.killedBy && NO_RESTART_REASONS.has(state.killedBy)) return false;
544
+ if (MAX_RESTARTS > 0 && state.restarts >= MAX_RESTARTS) return false;
545
+ return true;
546
+ }
460
547
 
461
- // --- Spawn child process ---
462
-
463
- log(`Starting: ${childCmd} ${childArgs.join(' ').substring(0, 100)}...`);
464
- log(`Budget: ${BUDGET} tokens | Timeout: ${TIMEOUT_S}s | Startup: ${STARTUP_TIMEOUT_S}s | Stall: ${STALL_TIMEOUT_S}s | Dead air: ${DEAD_AIR_TIMEOUT_M}min | Max sends: ${MAX_SENDS}/min | Max tools: ${MAX_TOOL_CALLS}/min`);
465
-
466
- child = spawn(childCmd, childArgs, {
467
- stdio: ['pipe', 'pipe', 'pipe'], // All piped: niki controls stdin, monitors stdout+stderr
468
- env: process.env, // Inherit env (tokens stay in env, never logged)
469
- });
548
+ function resetPerRunState() {
549
+ // Reset per-run flags but keep cumulative counters (tokens, toolCalls, etc.)
550
+ killed = false;
551
+ state.killedBy = null;
552
+ state.exitCode = null;
553
+ state.exitSignal = null;
554
+ state.gotFirstOutput = undefined;
555
+
556
+ // Reset stall/dead-air detection
557
+ nudgeCount = 0;
558
+ stdinClosed = false;
559
+ lastOutputTime = Date.now();
560
+ gotFirstOutput = false;
561
+ lastCpuMs = null;
562
+ deadAirStart = null;
563
+
564
+ // Clear rate limit windows (fresh session)
565
+ toolCallTimestamps.length = 0;
566
+ sendCallTimestamps.length = 0;
567
+ state.toolCallsThisMinute = 0;
568
+ state.sendCallsThisMinute = 0;
569
+
570
+ // Reset budget warnings for fresh run
571
+ budgetWarned.clear();
572
+ }
470
573
 
471
- state.pid = child.pid;
574
+ // --- Abort file polling ---
472
575
 
473
- // Close stdin immediately — claude -p should never need interactive input.
474
- // This prevents blocking on trust prompts, permission prompts, or stdin reads.
475
- closeStdin();
576
+ let abortPollId = null;
476
577
 
477
- // --- Monitor stdout ---
578
+ function scheduleAbortPoll() {
579
+ if (!ABORT_FILE || killed) return;
580
+ abortPollId = setTimeout(() => {
581
+ if (killed) return;
582
+ if (existsSync(ABORT_FILE)) {
583
+ log(`Abort file detected: ${ABORT_FILE}`, 'warn');
584
+ killChild('abort');
585
+ return;
586
+ }
587
+ scheduleAbortPoll();
588
+ }, jitteredDelay(POLL_INTERVAL));
589
+ }
478
590
 
479
- child.stdout.on('data', (chunk) => {
480
- // Forward to our stdout (preserves runner's | tee pipeline)
481
- process.stdout.write(chunk);
482
- onChildOutput();
483
- checkForPrompts(chunk.toString());
484
- });
591
+ // --- Spawn and monitor child ---
485
592
 
486
- // --- Monitor stderr ---
593
+ let timeoutId = null;
487
594
 
488
- let stderrBuffer = '';
595
+ function startChild() {
596
+ log(`Starting: ${childCmd} ${childArgs.join(' ').substring(0, 100)}...`, 'info');
597
+ log(`Budget: ${BUDGET.toLocaleString()} tokens | Timeout: ${TIMEOUT_S}s | Startup: ${STARTUP_TIMEOUT_S}s | Stall: ${STALL_TIMEOUT_S}s | Dead air: ${DEAD_AIR_TIMEOUT_M}min | Max sends: ${MAX_SENDS}/min | Max tools: ${MAX_TOOL_CALLS}/min`, 'info');
598
+ if (RESTART) {
599
+ log(`Restart: enabled | max: ${MAX_RESTARTS || 'unlimited'} | delay: ${RESTART_DELAY_S}s ±30% | restarts so far: ${state.restarts}`, 'info');
600
+ }
489
601
 
490
- child.stderr.on('data', (chunk) => {
491
- const text = chunk.toString();
602
+ child = spawn(childCmd, childArgs, {
603
+ stdio: ['pipe', 'pipe', 'pipe'],
604
+ env: process.env,
605
+ });
492
606
 
493
- // Always forward stderr to our stderr (so supervisor captures it)
494
- process.stderr.write(chunk);
495
- onChildOutput();
607
+ state.pid = child.pid;
608
+ log(`Child PID: ${child.pid}`, 'debug');
496
609
 
497
- // Buffer and parse line by line
498
- stderrBuffer += text;
499
- const lines = stderrBuffer.split('\n');
500
- stderrBuffer = lines.pop(); // Keep incomplete last line in buffer
610
+ // Close stdin immediately — claude -p should never need interactive input.
611
+ closeStdin();
501
612
 
502
- for (const line of lines) {
503
- parseTokens(line);
504
- parseToolCall(line);
613
+ // --- Monitor stdout ---
505
614
 
506
- // Check budget
507
- if (state.tokensTotal > BUDGET) {
508
- killChild('budget');
509
- return;
510
- }
615
+ child.stdout.on('data', (chunk) => {
616
+ process.stdout.write(chunk);
617
+ onChildOutput();
618
+ checkForPrompts(chunk.toString());
619
+ });
511
620
 
512
- // Check rate limits
513
- const rateViolation = checkRateLimits();
514
- if (rateViolation) {
515
- killChild(rateViolation);
516
- return;
517
- }
518
- }
519
- });
621
+ // --- Monitor stderr ---
520
622
 
521
- // Start stall detection
522
- if (STALL_TIMEOUT_S > 0 || STARTUP_TIMEOUT_S > 0) {
523
- const startupMsg = STARTUP_TIMEOUT_S === 0 ? 'disabled (waiting for first output)' : `${STARTUP_TIMEOUT_S}s`;
524
- log(`Stall detection: startup-timeout=${startupMsg}, stall-timeout=${STALL_TIMEOUT_S}s, max-nudges=${MAX_NUDGES}`);
525
- resetStallTimer();
526
- }
623
+ let stderrBuffer = '';
527
624
 
528
- // Start dead air detection
529
- if (DEAD_AIR_TIMEOUT_M > 0) {
530
- log(`Dead air detection: ${DEAD_AIR_TIMEOUT_M}min threshold, ${Math.round(DEAD_AIR_POLL_MS / 1000)}s poll interval`);
531
- scheduleDeadAirPoll();
532
- }
625
+ child.stderr.on('data', (chunk) => {
626
+ const text = chunk.toString();
627
+ process.stderr.write(chunk);
628
+ onChildOutput();
533
629
 
534
- // --- Abort file polling (with jitter) ---
630
+ stderrBuffer += text;
631
+ const lines = stderrBuffer.split('\n');
632
+ stderrBuffer = lines.pop();
535
633
 
536
- let abortPollId = null;
634
+ for (const line of lines) {
635
+ parseTokens(line);
636
+ parseToolCall(line);
537
637
 
538
- function jitteredDelay(base) {
539
- // ±30% jitter
540
- const jitter = base * 0.3;
541
- return base + (Math.random() * 2 * jitter - jitter);
542
- }
638
+ if (state.tokensTotal > BUDGET) {
639
+ killChild('budget');
640
+ return;
641
+ }
543
642
 
544
- function scheduleAbortPoll() {
545
- if (!ABORT_FILE || killed) return;
546
- abortPollId = setTimeout(() => {
547
- if (killed) return;
548
- if (existsSync(ABORT_FILE)) {
549
- log(`Abort file detected: ${ABORT_FILE}`);
550
- killChild('abort');
551
- return;
643
+ const rateViolation = checkRateLimits();
644
+ if (rateViolation) {
645
+ killChild(rateViolation);
646
+ return;
647
+ }
552
648
  }
553
- scheduleAbortPoll();
554
- }, jitteredDelay(POLL_INTERVAL));
555
- }
556
-
557
- if (ABORT_FILE) {
558
- log(`Abort file: ${ABORT_FILE} (poll: ${POLL_INTERVAL}ms ±30% jitter)`);
559
- scheduleAbortPoll();
560
- }
649
+ });
561
650
 
562
- // --- Timeout ---
651
+ // Start stall detection
652
+ if (STALL_TIMEOUT_S > 0 || STARTUP_TIMEOUT_S > 0) {
653
+ log(`Stall detection: startup-timeout=${STARTUP_TIMEOUT_S}s, stall-timeout=${STALL_TIMEOUT_S}s, max-nudges=${MAX_NUDGES}`, 'info');
654
+ resetStallTimer();
655
+ }
563
656
 
564
- const timeoutId = setTimeout(() => {
565
- killChild('timeout');
566
- }, TIMEOUT_S * 1000);
657
+ // Start dead air detection
658
+ if (DEAD_AIR_TIMEOUT_M > 0) {
659
+ log(`Dead air detection: ${DEAD_AIR_TIMEOUT_M}min threshold, ${Math.round(DEAD_AIR_POLL_MS / 1000)}s poll interval`, 'info');
660
+ scheduleDeadAirPoll();
661
+ }
567
662
 
568
- // --- Clean exit ---
663
+ // Abort file polling
664
+ if (ABORT_FILE) {
665
+ log(`Abort file: ${ABORT_FILE} (poll: ${POLL_INTERVAL}ms ±30% jitter)`, 'info');
666
+ scheduleAbortPoll();
667
+ }
569
668
 
570
- child.on('exit', (code, signal) => {
571
- clearTimeout(timeoutId);
572
- if (stallTimer) clearTimeout(stallTimer);
573
- if (abortPollId) clearTimeout(abortPollId);
574
- if (deadAirPollId) clearTimeout(deadAirPollId);
575
- state.exitCode = code;
576
- state.exitSignal = signal;
577
- state.duration = Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000);
578
-
579
- state.gotFirstOutput = gotFirstOutput;
580
- log(`Exit — code: ${code} signal: ${signal} | tokens: ${state.tokensTotal} | tools: ${state.toolCalls} | sends: ${state.sendCalls} | duration: ${state.duration}s | output: ${gotFirstOutput}${state.killedBy ? ` | killed: ${state.killedBy}` : ''}`);
581
- writeState();
582
-
583
- if (logStream) logStream.end();
584
- process.exit(code ?? 1);
585
- });
669
+ // Per-run timeout
670
+ timeoutId = setTimeout(() => {
671
+ killChild('timeout');
672
+ }, TIMEOUT_S * 1000);
673
+
674
+ // --- Exit handler ---
675
+
676
+ child.on('exit', (code, signal) => {
677
+ clearTimeout(timeoutId);
678
+ if (stallTimer) clearTimeout(stallTimer);
679
+ if (abortPollId) clearTimeout(abortPollId);
680
+ if (deadAirPollId) clearTimeout(deadAirPollId);
681
+
682
+ state.exitCode = code;
683
+ state.exitSignal = signal;
684
+ state.duration = Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000);
685
+ state.gotFirstOutput = gotFirstOutput;
686
+
687
+ const level = state.killedBy ? 'error' : (code === 0 ? 'info' : 'warn');
688
+ log(`Exit — code: ${code} signal: ${signal} | tokens: ${state.tokensTotal.toLocaleString()} | tools: ${state.toolCalls} | sends: ${state.sendCalls} | duration: ${state.duration}s | output: ${gotFirstOutput}${state.killedBy ? ` | killed: ${state.killedBy}` : ''} | restarts: ${state.restarts}`, level);
689
+ writeState();
690
+
691
+ if (shouldRestart(code, signal)) {
692
+ state.restarts++;
693
+ const delay = jitteredDelay(RESTART_DELAY_S * 1000);
694
+ log(`RESTART — attempt ${state.restarts}${MAX_RESTARTS > 0 ? `/${MAX_RESTARTS}` : ''} in ${Math.round(delay / 1000)}s`, 'warn');
695
+ resetPerRunState();
696
+ setTimeout(() => {
697
+ startChild();
698
+ }, delay);
699
+ } else {
700
+ if (RESTART && !shouldRestart(code, signal)) {
701
+ const reason = nikiTerminated ? 'niki received signal' :
702
+ (state.killedBy && NO_RESTART_REASONS.has(state.killedBy)) ? `hard kill (${state.killedBy})` :
703
+ (MAX_RESTARTS > 0 && state.restarts >= MAX_RESTARTS) ? `max restarts reached (${MAX_RESTARTS})` :
704
+ 'restart not enabled';
705
+ log(`NOT RESTARTING — ${reason}`, 'warn');
706
+ }
707
+ if (logStream) logStream.end();
708
+ process.exit(code ?? 1);
709
+ }
710
+ });
711
+ }
586
712
 
587
713
  // --- Signal forwarding ---
588
714
 
589
715
  for (const sig of ['SIGINT', 'SIGTERM']) {
590
716
  process.on(sig, () => {
591
- log(`Received ${sig}, forwarding to child`);
717
+ log(`Received ${sig}, forwarding to child`, 'warn');
718
+ nikiTerminated = true;
592
719
  if (child) child.kill(sig);
593
720
  });
594
721
  }
722
+
723
+ // --- Start ---
724
+
725
+ startChild();
package/package.json CHANGED
@@ -1,10 +1,15 @@
1
1
  {
2
2
  "name": "@tjamescouch/niki",
3
- "version": "0.5.0",
3
+ "version": "0.5.2",
4
4
  "description": "Deterministic process supervisor for AI agents — token budgets, rate limits, and abort control",
5
5
  "bin": {
6
6
  "niki": "./bin/niki"
7
7
  },
8
+ "files": [
9
+ "bin/niki",
10
+ "README.md",
11
+ "LICENSE"
12
+ ],
8
13
  "type": "module",
9
14
  "license": "MIT",
10
15
  "repository": {
package/niki.png DELETED
Binary file
package/tests/test-niki.sh DELETED
@@ -1,275 +0,0 @@
1
- #!/bin/bash
2
- # test-niki.sh — Unit tests for niki process supervisor
3
- #
4
- # Tests stdin management, stdout forwarding, stall detection,
5
- # session handling, and prompt detection.
6
- #
7
- # Usage: ./tests/test-niki.sh [--verbose]
8
-
9
- set -euo pipefail
10
-
11
- NIKI="$(dirname "$0")/../bin/niki"
12
- PASSED=0
13
- FAILED=0
14
- VERBOSE="${1:-}"
15
-
16
- red() { printf "\033[31m%s\033[0m" "$1"; }
17
- green() { printf "\033[32m%s\033[0m" "$1"; }
18
- bold() { printf "\033[1m%s\033[0m" "$1"; }
19
-
20
- run_test() {
21
- local name="$1"
22
- shift
23
- local expected_exit="$1"
24
- shift
25
-
26
- printf " %-50s " "$name"
27
-
28
- local output
29
- local actual_exit=0
30
- output=$("$@" 2>&1) || actual_exit=$?
31
-
32
- if [ "$actual_exit" -eq "$expected_exit" ]; then
33
- green "PASS"
34
- echo " (exit $actual_exit)"
35
- PASSED=$((PASSED + 1))
36
- if [ "$VERBOSE" = "--verbose" ]; then
37
- echo "$output" | sed 's/^/ | /'
38
- fi
39
- else
40
- red "FAIL"
41
- echo " (expected exit $expected_exit, got $actual_exit)"
42
- FAILED=$((FAILED + 1))
43
- echo "$output" | sed 's/^/ | /'
44
- fi
45
- }
46
-
47
- run_test_output() {
48
- local name="$1"
49
- local expected_pattern="$2"
50
- shift 2
51
-
52
- printf " %-50s " "$name"
53
-
54
- local output
55
- local actual_exit=0
56
- output=$("$@" 2>&1) || actual_exit=$?
57
-
58
- if echo "$output" | grep -qE "$expected_pattern"; then
59
- green "PASS"
60
- echo ""
61
- PASSED=$((PASSED + 1))
62
- if [ "$VERBOSE" = "--verbose" ]; then
63
- echo "$output" | sed 's/^/ | /'
64
- fi
65
- else
66
- red "FAIL"
67
- echo " (pattern '$expected_pattern' not found)"
68
- FAILED=$((FAILED + 1))
69
- echo "$output" | sed 's/^/ | /'
70
- fi
71
- }
72
-
73
- echo ""
74
- bold "=== niki unit tests ==="; echo ""
75
- echo ""
76
-
77
- # ---- Stdout forwarding ----
78
-
79
- bold "Stdout forwarding"; echo ""
80
-
81
- run_test_output \
82
- "echo passes through stdout" \
83
- "^hello from niki$" \
84
- timeout 10 node "$NIKI" --stall-timeout 5 -- echo "hello from niki"
85
-
86
- run_test \
87
- "echo exits cleanly (code 0)" \
88
- 0 \
89
- timeout 10 node "$NIKI" --stall-timeout 5 -- echo "test"
90
-
91
- run_test_output \
92
- "multi-line output preserved" \
93
- "line2" \
94
- timeout 10 node "$NIKI" --stall-timeout 5 -- sh -c 'echo line1; echo line2; echo line3'
95
-
96
- echo ""
97
-
98
- # ---- Stdin management ----
99
-
100
- bold "Stdin management"; echo ""
101
-
102
- run_test_output \
103
- "stdin closed immediately on spawn" \
104
- "Stdin: closed" \
105
- timeout 10 node "$NIKI" --stall-timeout 5 -- echo "ok"
106
-
107
- run_test \
108
- "cat exits on EOF (stdin closed)" \
109
- 0 \
110
- timeout 10 node "$NIKI" --stall-timeout 5 -- cat
111
-
112
- echo ""
113
-
114
- # ---- Stall detection ----
115
-
116
- bold "Stall detection"; echo ""
117
-
118
- run_test_output \
119
- "stall kills silent process" \
120
- "STALL.*no output" \
121
- timeout 10 node "$NIKI" --stall-timeout 2 --startup-timeout 0 --max-nudges 0 -- sleep 30
122
-
123
- run_test_output \
124
- "stall kill reason logged" \
125
- "KILL.*reason: stall" \
126
- timeout 10 node "$NIKI" --stall-timeout 2 --startup-timeout 0 --max-nudges 0 -- sleep 30
127
-
128
- run_test \
129
- "stall kill exits non-zero" \
130
- 1 \
131
- timeout 10 node "$NIKI" --stall-timeout 2 --startup-timeout 0 --max-nudges 0 -- sleep 30
132
-
133
- run_test_output \
134
- "stall disabled when timeout=0" \
135
- "Exit.*code: 0" \
136
- timeout 5 node "$NIKI" --stall-timeout 0 -- sh -c 'sleep 1; echo done'
137
-
138
- echo ""
139
-
140
- # ---- Stall timeout precision ----
141
-
142
- bold "Stall timing"; echo ""
143
-
144
- # Process that outputs then goes silent — stall should fire after the silence
145
- run_test_output \
146
- "stall timer resets on output" \
147
- "Exit.*code: 0" \
148
- timeout 10 node "$NIKI" --stall-timeout 3 --startup-timeout 0 -- sh -c 'echo tick; sleep 1; echo tick; sleep 1; echo done'
149
-
150
- echo ""
151
-
152
- # ---- Startup timeout ----
153
-
154
- bold "Startup timeout"; echo ""
155
-
156
- # Startup timeout gives longer grace period before first output
157
- run_test_output \
158
- "startup-timeout used before first output" \
159
- "startup-timeout=5s" \
160
- timeout 10 node "$NIKI" --stall-timeout 2 --startup-timeout 5 -- sh -c 'sleep 3; echo hello'
161
-
162
- # After first output, switches to stall-timeout
163
- run_test_output \
164
- "switches to stall-timeout after first output" \
165
- "switching to stall-timeout" \
166
- timeout 10 node "$NIKI" --stall-timeout 3 --startup-timeout 10 -- sh -c 'echo first; sleep 1; echo done'
167
-
168
- echo ""
169
-
170
- # ---- Budget/timeout (existing features, regression) ----
171
-
172
- bold "Budget and timeout (regression)"; echo ""
173
-
174
- run_test_output \
175
- "wall-clock timeout kills" \
176
- "KILL.*reason: timeout" \
177
- timeout 10 node "$NIKI" --timeout 2 --stall-timeout 0 -- sleep 30
178
-
179
- run_test \
180
- "timeout kill exits non-zero" \
181
- 1 \
182
- timeout 10 node "$NIKI" --timeout 2 --stall-timeout 0 -- sleep 30
183
-
184
- echo ""
185
-
186
- # ---- Exit code passthrough ----
187
-
188
- bold "Exit code passthrough"; echo ""
189
-
190
- run_test \
191
- "child exit 0 → niki exit 0" \
192
- 0 \
193
- timeout 10 node "$NIKI" --stall-timeout 5 -- sh -c 'exit 0'
194
-
195
- run_test \
196
- "child exit 1 → niki exit 1" \
197
- 1 \
198
- timeout 10 node "$NIKI" --stall-timeout 5 -- sh -c 'exit 1'
199
-
200
- run_test \
201
- "child exit 42 → niki exit 42" \
202
- 42 \
203
- timeout 10 node "$NIKI" --stall-timeout 5 -- sh -c 'exit 42'
204
-
205
- echo ""
206
-
207
- # ---- Abort file ----
208
-
209
- bold "Abort file"; echo ""
210
-
211
- ABORT_FILE=$(mktemp)
212
- rm -f "$ABORT_FILE"
213
-
214
- run_test_output \
215
- "abort file kills process" \
216
- "KILL.*reason: abort" \
217
- timeout 10 sh -c "node $NIKI --stall-timeout 0 --abort-file $ABORT_FILE -- sh -c 'sleep 1; echo still here; sleep 30' & PID=\$!; sleep 2; touch $ABORT_FILE; wait \$PID 2>/dev/null; echo done"
218
-
219
- rm -f "$ABORT_FILE"
220
-
221
- echo ""
222
-
223
- # ---- SIGTERM forwarding ----
224
-
225
- bold "SIGTERM forwarding"; echo ""
226
-
227
- # niki should forward SIGTERM to child and exit
228
- run_test_output \
229
- "SIGTERM forwarded to child" \
230
- "Received SIGTERM" \
231
- timeout 10 sh -c "node $NIKI --stall-timeout 0 -- sh -c 'echo started; sleep 30' & PID=\$!; sleep 1; kill -TERM \$PID; wait \$PID 2>/dev/null; echo done"
232
-
233
- echo ""
234
-
235
- # ---- Dead air detection ----
236
-
237
- bold "Dead air detection"; echo ""
238
-
239
- # Dead air kills silent process with zero CPU (sleep has ~zero CPU)
240
- run_test_output \
241
- "dead air kills zero-CPU process" \
242
- "DEAD AIR.*zero CPU" \
243
- timeout 30 node "$NIKI" --stall-timeout 0 --dead-air-timeout 0.1 -- sleep 30
244
-
245
- run_test_output \
246
- "dead air kill reason logged" \
247
- "KILL.*reason: dead-air" \
248
- timeout 30 node "$NIKI" --stall-timeout 0 --dead-air-timeout 0.1 -- sleep 30
249
-
250
- # Dead air defers for CPU-active processes (busy loop uses CPU)
251
- run_test_output \
252
- "dead air defers when CPU active" \
253
- "Exit.*code: 0" \
254
- timeout 15 node "$NIKI" --stall-timeout 0 --dead-air-timeout 0.1 -- sh -c 'i=0; while [ $i -lt 2000000 ]; do i=$((i+1)); done; echo done'
255
-
256
- # Dead air disabled when timeout=0
257
- run_test_output \
258
- "dead air disabled when timeout=0" \
259
- "Exit.*code: 0" \
260
- timeout 10 node "$NIKI" --stall-timeout 0 --dead-air-timeout 0 -- sh -c 'sleep 1; echo done'
261
-
262
- echo ""
263
-
264
- # ---- Summary ----
265
-
266
- echo "────────────────────────────────────────────────"
267
- TOTAL=$((PASSED + FAILED))
268
- if [ "$FAILED" -eq 0 ]; then
269
- green "All $TOTAL tests passed"; echo ""
270
- else
271
- red "$FAILED/$TOTAL tests failed"; echo ""
272
- fi
273
- echo ""
274
-
275
- exit "$FAILED"