@tjamescouch/niki 0.3.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. package/LICENSE +1 -1
  2. package/bin/niki +259 -125
  3. package/bin/niki.bak +665 -0
  4. package/package.json +1 -1
package/LICENSE CHANGED
@@ -1,6 +1,6 @@
1
1
  MIT License
2
2
 
3
- Copyright (c) 2025 James Couch
3
+ Copyright (c) 2026 James Couch
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy
6
6
  of this software and associated documentation files (the "Software"), to deal
package/bin/niki CHANGED
@@ -8,6 +8,7 @@
8
8
  * - Wall-clock timeout (kill if exceeded)
9
9
  * - Tool-call rate limiting (kill if agent floods)
10
10
  * - Diagnostics logging
11
+ * - Automatic restart on exit (optional)
11
12
  *
12
13
  * Usage:
13
14
  * niki [options] -- <command> [args...]
@@ -42,14 +43,20 @@ Options:
42
43
  --dead-air-timeout <m> Minutes of zero CPU + zero output before kill (default: 5, 0=disabled)
43
44
  --max-nudges <n> Max stdin nudge attempts before kill on stall (default: 3)
44
45
  --log <file> Write diagnostics log to file
46
+ --log-level <level> Minimum log level: debug, info, warn, error (default: info)
47
+ --log-json Emit logs as JSON lines (for machine parsing)
45
48
  --state <file> Write state JSON on exit (budget used, reason, etc.)
46
49
  --cooldown <seconds> Grace period after SIGTERM before SIGKILL (default: 5)
47
50
  --abort-file <path> Poll this file for external abort signal
48
51
  --poll-interval <ms> Base poll interval in ms for abort file (default: 1000)
52
+ --restart Restart the child process when it exits (default: off)
53
+ --max-restarts <n> Max restart attempts, 0=unlimited (default: 0)
54
+ --restart-delay <secs> Delay between restarts with ±30% jitter (default: 5)
49
55
 
50
56
  Examples:
51
57
  niki --budget 500000 -- claude -p "your prompt" --verbose
52
- niki --timeout 1800 --max-sends 5 -- claude -p "..." --model sonnet --verbose`);
58
+ niki --timeout 1800 --max-sends 5 -- claude -p "..." --model sonnet --verbose
59
+ niki --restart --max-restarts 10 -- gro --model gpt-5.2 "your prompt"`);
53
60
  process.exit(1);
54
61
  }
55
62
 
@@ -69,10 +76,15 @@ const { values: opts } = parseArgs({
69
76
  'dead-air-timeout': { type: 'string', default: '5' },
70
77
  'max-nudges': { type: 'string', default: '3' },
71
78
  log: { type: 'string' },
79
+ 'log-level': { type: 'string', default: 'info' },
80
+ 'log-json': { type: 'boolean', default: false },
72
81
  state: { type: 'string' },
73
82
  cooldown: { type: 'string', default: '5' },
74
83
  'abort-file': { type: 'string' },
75
84
  'poll-interval': { type: 'string', default: '1000' },
85
+ restart: { type: 'boolean', default: false },
86
+ 'max-restarts': { type: 'string', default: '0' },
87
+ 'restart-delay': { type: 'string', default: '5' },
76
88
  },
77
89
  });
78
90
 
@@ -89,7 +101,15 @@ const ABORT_FILE = opts['abort-file'] ? resolve(opts['abort-file']) : null;
89
101
  const POLL_INTERVAL = parseInt(opts['poll-interval'], 10);
90
102
  const LOG_FILE = opts.log;
91
103
  const STATE_FILE = opts.state;
104
+ const RESTART = opts.restart;
105
+ const MAX_RESTARTS = parseInt(opts['max-restarts'], 10);
106
+ const RESTART_DELAY_S = parseFloat(opts['restart-delay']);
107
+ const LOG_JSON = opts['log-json'];
92
108
 
109
+ // --- Log levels ---
110
+
111
+ const LOG_LEVELS = { debug: 0, info: 1, warn: 2, error: 3 };
112
+ const LOG_LEVEL = LOG_LEVELS[opts['log-level']] ?? LOG_LEVELS.info;
93
113
  // --- State ---
94
114
 
95
115
  const state = {
@@ -109,12 +129,17 @@ const state = {
109
129
  stallEvents: 0,
110
130
  nudges: 0,
111
131
  deadAirChecks: 0,
132
+ restarts: 0,
112
133
  };
113
134
 
114
135
  // Sliding window for per-minute rate limiting
115
136
  const toolCallTimestamps = [];
116
137
  const sendCallTimestamps = [];
117
138
 
139
+ // Budget threshold tracking — warn once at each level
140
+ const BUDGET_THRESHOLDS = [0.5, 0.75, 0.9];
141
+ const budgetWarned = new Set();
142
+
118
143
  // --- Logging ---
119
144
 
120
145
  let logStream = null;
@@ -123,11 +148,24 @@ if (LOG_FILE) {
123
148
  logStream = createWriteStream(resolve(LOG_FILE), { flags: 'a' });
124
149
  }
125
150
 
126
- function log(msg) {
127
- const line = `[${new Date().toISOString()}] ${msg}`;
128
- if (logStream) logStream.write(line + '\n');
129
- // Also write to stderr so supervisor can capture it
130
- process.stderr.write(`[niki] ${line}\n`);
151
+ function log(msg, level = 'info', fields = null) {
152
+ const numLevel = LOG_LEVELS[level] ?? LOG_LEVELS.info;
153
+ if (numLevel < LOG_LEVEL) return;
154
+
155
+ const ts = new Date().toISOString();
156
+
157
+ if (LOG_JSON) {
158
+ const entry = { ts, level, msg, ...state };
159
+ if (fields) Object.assign(entry, fields);
160
+ const json = JSON.stringify(entry);
161
+ if (logStream) logStream.write(json + '\n');
162
+ process.stderr.write(json + '\n');
163
+ } else {
164
+ const prefix = level === 'info' ? '' : `[${level.toUpperCase()}] `;
165
+ const line = `[${ts}] ${prefix}${msg}`;
166
+ if (logStream) logStream.write(line + '\n');
167
+ process.stderr.write(`[niki] ${line}\n`);
168
+ }
131
169
  }
132
170
 
133
171
  function writeState() {
@@ -141,6 +179,20 @@ function writeState() {
141
179
  }
142
180
  }
143
181
 
182
+ // --- Budget threshold warnings ---
183
+
184
+ function checkBudgetThresholds() {
185
+ if (BUDGET <= 0) return;
186
+ const pct = state.tokensTotal / BUDGET;
187
+ for (const threshold of BUDGET_THRESHOLDS) {
188
+ if (pct >= threshold && !budgetWarned.has(threshold)) {
189
+ budgetWarned.add(threshold);
190
+ const used = state.tokensTotal;
191
+ const remaining = BUDGET - used;
192
+ log(`Budget ${Math.round(threshold * 100)}% — ${used.toLocaleString()}/${BUDGET.toLocaleString()} tokens used, ${remaining.toLocaleString()} remaining`, 'warn');
193
+ }
194
+ }
195
+ }
144
196
  // --- Token parsing from stderr ---
145
197
 
146
198
  // Claude --verbose outputs token usage in stderr. Patterns vary by version.
@@ -163,6 +215,7 @@ const TOKEN_PATTERNS = [
163
215
  ];
164
216
 
165
217
  function parseTokens(line) {
218
+ let changed = false;
166
219
  for (const { regex, field } of TOKEN_PATTERNS) {
167
220
  regex.lastIndex = 0;
168
221
  let match;
@@ -170,30 +223,39 @@ function parseTokens(line) {
170
223
  const count = parseInt(match[1], 10);
171
224
  if (isNaN(count) || count <= 0) continue;
172
225
  if (field === 'in') {
173
- state.tokensIn = Math.max(state.tokensIn, count);
226
+ if (count > state.tokensIn) { state.tokensIn = count; changed = true; }
174
227
  } else {
175
- state.tokensOut = Math.max(state.tokensOut, count);
228
+ if (count > state.tokensOut) { state.tokensOut = count; changed = true; }
176
229
  }
177
230
  state.tokensTotal = state.tokensIn + state.tokensOut;
178
231
  }
179
232
  }
233
+ if (changed) {
234
+ log(`Tokens — in: ${state.tokensIn.toLocaleString()} out: ${state.tokensOut.toLocaleString()} total: ${state.tokensTotal.toLocaleString()}/${BUDGET.toLocaleString()} (${Math.round(state.tokensTotal / BUDGET * 100)}%)`, 'debug');
235
+ checkBudgetThresholds();
236
+ }
180
237
  }
181
238
 
182
239
  // --- Tool call detection from stderr ---
183
240
 
184
241
  // Claude --verbose logs tool calls. We detect sends specifically.
185
- const TOOL_CALL_PATTERN = /(?:Using tool|Tool call|tool_use).*?(\w+)/i;
242
+ const TOOL_CALL_PATTERN = /(?:Using tool|Tool call|tool_use)[:\s]*(\S+)/i;
186
243
  const SEND_PATTERN = /agentchat_send/i;
187
244
 
188
245
  function parseToolCall(line) {
189
- if (TOOL_CALL_PATTERN.test(line)) {
246
+ const toolMatch = line.match(TOOL_CALL_PATTERN);
247
+ if (toolMatch) {
190
248
  const now = Date.now();
249
+ const toolName = toolMatch[1] || 'unknown';
191
250
  state.toolCalls++;
192
251
  toolCallTimestamps.push(now);
193
252
 
194
253
  if (SEND_PATTERN.test(line)) {
195
254
  state.sendCalls++;
196
255
  sendCallTimestamps.push(now);
256
+ log(`Tool call #${state.toolCalls}: ${toolName} (send #${state.sendCalls}, ${state.sendCallsThisMinute + 1}/${MAX_SENDS}/min)`, 'info');
257
+ } else {
258
+ log(`Tool call #${state.toolCalls}: ${toolName} (${state.toolCallsThisMinute + 1}/${MAX_TOOL_CALLS}/min)`, 'debug');
197
259
  }
198
260
  }
199
261
  }
@@ -214,6 +276,14 @@ function checkRateLimits() {
214
276
  state.toolCallsThisMinute = toolCallTimestamps.length;
215
277
  state.sendCallsThisMinute = sendCallTimestamps.length;
216
278
 
279
+ // Warn at 80% of rate limits
280
+ if (state.sendCallsThisMinute === Math.ceil(MAX_SENDS * 0.8)) {
281
+ log(`Rate warning — sends at ${state.sendCallsThisMinute}/${MAX_SENDS}/min (80% threshold)`, 'warn');
282
+ }
283
+ if (state.toolCallsThisMinute === Math.ceil(MAX_TOOL_CALLS * 0.8)) {
284
+ log(`Rate warning — tool calls at ${state.toolCallsThisMinute}/${MAX_TOOL_CALLS}/min (80% threshold)`, 'warn');
285
+ }
286
+
217
287
  if (sendCallTimestamps.length > MAX_SENDS) {
218
288
  return 'rate-sends';
219
289
  }
@@ -222,7 +292,6 @@ function checkRateLimits() {
222
292
  }
223
293
  return null;
224
294
  }
225
-
226
295
  // --- Kill logic ---
227
296
 
228
297
  let child = null;
@@ -232,7 +301,7 @@ function killChild(reason) {
232
301
  if (killed || !child) return;
233
302
  killed = true;
234
303
  state.killedBy = reason;
235
- log(`KILL — reason: ${reason} | tokens: ${state.tokensTotal}/${BUDGET} | sends: ${state.sendCallsThisMinute}/min | tools: ${state.toolCallsThisMinute}/min`);
304
+ log(`KILL — reason: ${reason} | tokens: ${state.tokensTotal}/${BUDGET} | sends: ${state.sendCallsThisMinute}/min | tools: ${state.toolCallsThisMinute}/min`, 'error');
236
305
 
237
306
  child.kill('SIGTERM');
238
307
 
@@ -262,7 +331,7 @@ const PROMPT_PATTERNS = [
262
331
  function checkForPrompts(text) {
263
332
  for (const pattern of PROMPT_PATTERNS) {
264
333
  if (pattern.test(text)) {
265
- log(`PROMPT detected in stdout: ${text.trim().substring(0, 100)}`);
334
+ log(`PROMPT detected in stdout: ${text.trim().substring(0, 100)}`, 'warn');
266
335
  state.stallEvents++;
267
336
  // Close stdin to dismiss the prompt
268
337
  closeStdin();
@@ -329,6 +398,13 @@ function hasConsumedCpu(pid) {
329
398
  return cpuMs > prev;
330
399
  }
331
400
 
401
+ // --- Jitter utility ---
402
+
403
+ function jitteredDelay(base) {
404
+ // ±30% jitter
405
+ const jitter = base * 0.3;
406
+ return base + (Math.random() * 2 * jitter - jitter);
407
+ }
332
408
  // --- Dead air detection ---
333
409
 
334
410
  let deadAirStart = null; // Timestamp when dead air began (null = not in dead air)
@@ -344,7 +420,7 @@ function checkDeadAir() {
344
420
  if (cpuActive) {
345
421
  // Process is working — reset dead air, let it cook
346
422
  if (deadAirStart) {
347
- log(`Dead air cleared — CPU active after ${Math.round((Date.now() - deadAirStart) / 1000)}s of silence`);
423
+ log(`Dead air cleared — CPU active after ${Math.round((Date.now() - deadAirStart) / 1000)}s of silence`, 'info');
348
424
  deadAirStart = null;
349
425
  }
350
426
  return;
@@ -353,17 +429,17 @@ function checkDeadAir() {
353
429
  // Zero CPU + zero output
354
430
  if (!deadAirStart) {
355
431
  deadAirStart = Date.now();
356
- log(`Dead air started — zero CPU, ${silenceSec}s silence`);
432
+ log(`Dead air started — zero CPU, ${silenceSec}s silence`, 'warn');
357
433
  }
358
434
 
359
435
  const deadAirMin = (Date.now() - deadAirStart) / 60_000;
360
436
  if (deadAirMin >= DEAD_AIR_TIMEOUT_M) {
361
- log(`DEAD AIR — zero CPU + zero output for ${Math.round(deadAirMin)}min (threshold: ${DEAD_AIR_TIMEOUT_M}min)`);
437
+ log(`DEAD AIR — zero CPU + zero output for ${Math.round(deadAirMin)}min (threshold: ${DEAD_AIR_TIMEOUT_M}min)`, 'error');
362
438
  killChild('dead-air');
363
439
  return;
364
440
  }
365
441
 
366
- log(`Dead air check — zero CPU, ${Math.round(deadAirMin * 10) / 10}/${DEAD_AIR_TIMEOUT_M}min, ${silenceSec}s silence`);
442
+ log(`Dead air check — zero CPU, ${Math.round(deadAirMin * 10) / 10}/${DEAD_AIR_TIMEOUT_M}min, ${silenceSec}s silence`, 'debug');
367
443
  }
368
444
 
369
445
  // Poll interval: min(30s, threshold/3) — fast polls for short thresholds, 30s cap for production
@@ -391,12 +467,12 @@ function onChildOutput() {
391
467
  lastOutputTime = Date.now();
392
468
  // Reset dead air — got real output
393
469
  if (deadAirStart) {
394
- log(`Dead air cleared — received output after ${Math.round((Date.now() - deadAirStart) / 1000)}s`);
470
+ log(`Dead air cleared — received output after ${Math.round((Date.now() - deadAirStart) / 1000)}s`, 'info');
395
471
  deadAirStart = null;
396
472
  }
397
473
  if (!gotFirstOutput) {
398
474
  gotFirstOutput = true;
399
- log(`First output received after ${Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000)}s — switching to stall-timeout=${STALL_TIMEOUT_S}s`);
475
+ log(`First output received after ${Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000)}s — switching to stall-timeout=${STALL_TIMEOUT_S}s`, 'info');
400
476
  }
401
477
  resetStallTimer();
402
478
  }
@@ -418,14 +494,14 @@ function closeStdin() {
418
494
  if (stdinClosed || !child) return;
419
495
  stdinClosed = true;
420
496
  try { child.stdin.end(); } catch { /* already closed */ }
421
- log('Stdin: closed (EOF)');
497
+ log('Stdin: closed (EOF)', 'debug');
422
498
  }
423
499
 
424
500
  function onStallDetected() {
425
501
  if (killed) return;
426
502
  state.stallEvents++;
427
503
  const silence = Math.round((Date.now() - lastOutputTime) / 1000);
428
- log(`STALL — no output for ${silence}s (nudges: ${nudgeCount}/${MAX_NUDGES})`);
504
+ log(`STALL — no output for ${silence}s (nudges: ${nudgeCount}/${MAX_NUDGES})`, 'warn');
429
505
 
430
506
  // Escalation: close stdin → nudge → check CPU → kill
431
507
  if (!stdinClosed) {
@@ -437,7 +513,7 @@ function onStallDetected() {
437
513
  if (nudgeCount < MAX_NUDGES && !child.stdin.writableEnded) {
438
514
  nudgeCount++;
439
515
  state.nudges = nudgeCount;
440
- log(`Stall nudge #${nudgeCount}`);
516
+ log(`Stall nudge #${nudgeCount}`, 'info');
441
517
  resetStallTimer();
442
518
  return;
443
519
  }
@@ -447,7 +523,7 @@ function onStallDetected() {
447
523
  if (DEAD_AIR_TIMEOUT_M > 0 && child) {
448
524
  const cpuActive = hasConsumedCpu(child.pid);
449
525
  if (cpuActive) {
450
- log(`Stall deferred — process has CPU activity, deferring to dead-air detection`);
526
+ log(`Stall deferred — process has CPU activity, deferring to dead-air detection`, 'info');
451
527
  resetStallTimer();
452
528
  return;
453
529
  }
@@ -455,137 +531,195 @@ function onStallDetected() {
455
531
 
456
532
  killChild('stall');
457
533
  }
534
+ // --- Restart logic ---
535
+
536
+ // Reasons that should NOT trigger a restart (hard limits / operator intent)
537
+ const NO_RESTART_REASONS = new Set(['budget', 'rate-sends', 'rate-tools', 'abort']);
538
+ let nikiTerminated = false; // Set when niki itself receives SIGTERM/SIGINT
539
+
540
+ function shouldRestart(code, signal) {
541
+ if (!RESTART) return false;
542
+ if (nikiTerminated) return false;
543
+ if (state.killedBy && NO_RESTART_REASONS.has(state.killedBy)) return false;
544
+ if (MAX_RESTARTS > 0 && state.restarts >= MAX_RESTARTS) return false;
545
+ return true;
546
+ }
458
547
 
459
- // --- Spawn child process ---
460
-
461
- log(`Starting: ${childCmd} ${childArgs.join(' ').substring(0, 100)}...`);
462
- log(`Budget: ${BUDGET} tokens | Timeout: ${TIMEOUT_S}s | Startup: ${STARTUP_TIMEOUT_S}s | Stall: ${STALL_TIMEOUT_S}s | Dead air: ${DEAD_AIR_TIMEOUT_M}min | Max sends: ${MAX_SENDS}/min | Max tools: ${MAX_TOOL_CALLS}/min`);
463
-
464
- child = spawn(childCmd, childArgs, {
465
- stdio: ['pipe', 'pipe', 'pipe'], // All piped: niki controls stdin, monitors stdout+stderr
466
- env: process.env, // Inherit env (tokens stay in env, never logged)
467
- });
548
+ function resetPerRunState() {
549
+ // Reset per-run flags but keep cumulative counters (tokens, toolCalls, etc.)
550
+ killed = false;
551
+ state.killedBy = null;
552
+ state.exitCode = null;
553
+ state.exitSignal = null;
554
+ state.gotFirstOutput = undefined;
555
+
556
+ // Reset stall/dead-air detection
557
+ nudgeCount = 0;
558
+ stdinClosed = false;
559
+ lastOutputTime = Date.now();
560
+ gotFirstOutput = false;
561
+ lastCpuMs = null;
562
+ deadAirStart = null;
563
+
564
+ // Clear rate limit windows (fresh session)
565
+ toolCallTimestamps.length = 0;
566
+ sendCallTimestamps.length = 0;
567
+ state.toolCallsThisMinute = 0;
568
+ state.sendCallsThisMinute = 0;
569
+
570
+ // Reset budget warnings for fresh run
571
+ budgetWarned.clear();
572
+ }
468
573
 
469
- state.pid = child.pid;
574
+ // --- Abort file polling ---
470
575
 
471
- // Close stdin immediately — claude -p should never need interactive input.
472
- // This prevents blocking on trust prompts, permission prompts, or stdin reads.
473
- closeStdin();
576
+ let abortPollId = null;
474
577
 
475
- // --- Monitor stdout ---
578
+ function scheduleAbortPoll() {
579
+ if (!ABORT_FILE || killed) return;
580
+ abortPollId = setTimeout(() => {
581
+ if (killed) return;
582
+ if (existsSync(ABORT_FILE)) {
583
+ log(`Abort file detected: ${ABORT_FILE}`, 'warn');
584
+ killChild('abort');
585
+ return;
586
+ }
587
+ scheduleAbortPoll();
588
+ }, jitteredDelay(POLL_INTERVAL));
589
+ }
476
590
 
477
- child.stdout.on('data', (chunk) => {
478
- // Forward to our stdout (preserves runner's | tee pipeline)
479
- process.stdout.write(chunk);
480
- onChildOutput();
481
- checkForPrompts(chunk.toString());
482
- });
591
+ // --- Spawn and monitor child ---
483
592
 
484
- // --- Monitor stderr ---
593
+ let timeoutId = null;
485
594
 
486
- let stderrBuffer = '';
595
+ function startChild() {
596
+ log(`Starting: ${childCmd} ${childArgs.join(' ').substring(0, 100)}...`, 'info');
597
+ log(`Budget: ${BUDGET.toLocaleString()} tokens | Timeout: ${TIMEOUT_S}s | Startup: ${STARTUP_TIMEOUT_S}s | Stall: ${STALL_TIMEOUT_S}s | Dead air: ${DEAD_AIR_TIMEOUT_M}min | Max sends: ${MAX_SENDS}/min | Max tools: ${MAX_TOOL_CALLS}/min`, 'info');
598
+ if (RESTART) {
599
+ log(`Restart: enabled | max: ${MAX_RESTARTS || 'unlimited'} | delay: ${RESTART_DELAY_S}s ±30% | restarts so far: ${state.restarts}`, 'info');
600
+ }
487
601
 
488
- child.stderr.on('data', (chunk) => {
489
- const text = chunk.toString();
602
+ child = spawn(childCmd, childArgs, {
603
+ stdio: ['pipe', 'pipe', 'pipe'],
604
+ env: process.env,
605
+ });
490
606
 
491
- // Always forward stderr to our stderr (so supervisor captures it)
492
- process.stderr.write(chunk);
493
- onChildOutput();
607
+ state.pid = child.pid;
608
+ log(`Child PID: ${child.pid}`, 'debug');
494
609
 
495
- // Buffer and parse line by line
496
- stderrBuffer += text;
497
- const lines = stderrBuffer.split('\n');
498
- stderrBuffer = lines.pop(); // Keep incomplete last line in buffer
610
+ // Close stdin immediately claude -p should never need interactive input.
611
+ closeStdin();
499
612
 
500
- for (const line of lines) {
501
- parseTokens(line);
502
- parseToolCall(line);
613
+ // --- Monitor stdout ---
503
614
 
504
- // Check budget
505
- if (state.tokensTotal > BUDGET) {
506
- killChild('budget');
507
- return;
508
- }
615
+ child.stdout.on('data', (chunk) => {
616
+ process.stdout.write(chunk);
617
+ onChildOutput();
618
+ checkForPrompts(chunk.toString());
619
+ });
509
620
 
510
- // Check rate limits
511
- const rateViolation = checkRateLimits();
512
- if (rateViolation) {
513
- killChild(rateViolation);
514
- return;
515
- }
516
- }
517
- });
621
+ // --- Monitor stderr ---
518
622
 
519
- // Start stall detection
520
- if (STALL_TIMEOUT_S > 0 || STARTUP_TIMEOUT_S > 0) {
521
- log(`Stall detection: startup-timeout=${STARTUP_TIMEOUT_S}s, stall-timeout=${STALL_TIMEOUT_S}s, max-nudges=${MAX_NUDGES}`);
522
- resetStallTimer();
523
- }
623
+ let stderrBuffer = '';
524
624
 
525
- // Start dead air detection
526
- if (DEAD_AIR_TIMEOUT_M > 0) {
527
- log(`Dead air detection: ${DEAD_AIR_TIMEOUT_M}min threshold, ${Math.round(DEAD_AIR_POLL_MS / 1000)}s poll interval`);
528
- scheduleDeadAirPoll();
529
- }
625
+ child.stderr.on('data', (chunk) => {
626
+ const text = chunk.toString();
627
+ process.stderr.write(chunk);
628
+ onChildOutput();
530
629
 
531
- // --- Abort file polling (with jitter) ---
630
+ stderrBuffer += text;
631
+ const lines = stderrBuffer.split('\n');
632
+ stderrBuffer = lines.pop();
532
633
 
533
- let abortPollId = null;
634
+ for (const line of lines) {
635
+ parseTokens(line);
636
+ parseToolCall(line);
534
637
 
535
- function jitteredDelay(base) {
536
- // ±30% jitter
537
- const jitter = base * 0.3;
538
- return base + (Math.random() * 2 * jitter - jitter);
539
- }
638
+ if (state.tokensTotal > BUDGET) {
639
+ killChild('budget');
640
+ return;
641
+ }
540
642
 
541
- function scheduleAbortPoll() {
542
- if (!ABORT_FILE || killed) return;
543
- abortPollId = setTimeout(() => {
544
- if (killed) return;
545
- if (existsSync(ABORT_FILE)) {
546
- log(`Abort file detected: ${ABORT_FILE}`);
547
- killChild('abort');
548
- return;
643
+ const rateViolation = checkRateLimits();
644
+ if (rateViolation) {
645
+ killChild(rateViolation);
646
+ return;
647
+ }
549
648
  }
550
- scheduleAbortPoll();
551
- }, jitteredDelay(POLL_INTERVAL));
552
- }
553
-
554
- if (ABORT_FILE) {
555
- log(`Abort file: ${ABORT_FILE} (poll: ${POLL_INTERVAL}ms ±30% jitter)`);
556
- scheduleAbortPoll();
557
- }
649
+ });
558
650
 
559
- // --- Timeout ---
651
+ // Start stall detection
652
+ if (STALL_TIMEOUT_S > 0 || STARTUP_TIMEOUT_S > 0) {
653
+ log(`Stall detection: startup-timeout=${STARTUP_TIMEOUT_S}s, stall-timeout=${STALL_TIMEOUT_S}s, max-nudges=${MAX_NUDGES}`, 'info');
654
+ resetStallTimer();
655
+ }
560
656
 
561
- const timeoutId = setTimeout(() => {
562
- killChild('timeout');
563
- }, TIMEOUT_S * 1000);
657
+ // Start dead air detection
658
+ if (DEAD_AIR_TIMEOUT_M > 0) {
659
+ log(`Dead air detection: ${DEAD_AIR_TIMEOUT_M}min threshold, ${Math.round(DEAD_AIR_POLL_MS / 1000)}s poll interval`, 'info');
660
+ scheduleDeadAirPoll();
661
+ }
564
662
 
565
- // --- Clean exit ---
663
+ // Abort file polling
664
+ if (ABORT_FILE) {
665
+ log(`Abort file: ${ABORT_FILE} (poll: ${POLL_INTERVAL}ms ±30% jitter)`, 'info');
666
+ scheduleAbortPoll();
667
+ }
566
668
 
567
- child.on('exit', (code, signal) => {
568
- clearTimeout(timeoutId);
569
- if (stallTimer) clearTimeout(stallTimer);
570
- if (abortPollId) clearTimeout(abortPollId);
571
- if (deadAirPollId) clearTimeout(deadAirPollId);
572
- state.exitCode = code;
573
- state.exitSignal = signal;
574
- state.duration = Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000);
575
-
576
- state.gotFirstOutput = gotFirstOutput;
577
- log(`Exit — code: ${code} signal: ${signal} | tokens: ${state.tokensTotal} | tools: ${state.toolCalls} | sends: ${state.sendCalls} | duration: ${state.duration}s | output: ${gotFirstOutput}${state.killedBy ? ` | killed: ${state.killedBy}` : ''}`);
578
- writeState();
579
-
580
- if (logStream) logStream.end();
581
- process.exit(code ?? 1);
582
- });
669
+ // Per-run timeout
670
+ timeoutId = setTimeout(() => {
671
+ killChild('timeout');
672
+ }, TIMEOUT_S * 1000);
673
+
674
+ // --- Exit handler ---
675
+
676
+ child.on('exit', (code, signal) => {
677
+ clearTimeout(timeoutId);
678
+ if (stallTimer) clearTimeout(stallTimer);
679
+ if (abortPollId) clearTimeout(abortPollId);
680
+ if (deadAirPollId) clearTimeout(deadAirPollId);
681
+
682
+ state.exitCode = code;
683
+ state.exitSignal = signal;
684
+ state.duration = Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000);
685
+ state.gotFirstOutput = gotFirstOutput;
686
+
687
+ const level = state.killedBy ? 'error' : (code === 0 ? 'info' : 'warn');
688
+ log(`Exit — code: ${code} signal: ${signal} | tokens: ${state.tokensTotal.toLocaleString()} | tools: ${state.toolCalls} | sends: ${state.sendCalls} | duration: ${state.duration}s | output: ${gotFirstOutput}${state.killedBy ? ` | killed: ${state.killedBy}` : ''} | restarts: ${state.restarts}`, level);
689
+ writeState();
690
+
691
+ if (shouldRestart(code, signal)) {
692
+ state.restarts++;
693
+ const delay = jitteredDelay(RESTART_DELAY_S * 1000);
694
+ log(`RESTART — attempt ${state.restarts}${MAX_RESTARTS > 0 ? `/${MAX_RESTARTS}` : ''} in ${Math.round(delay / 1000)}s`, 'warn');
695
+ resetPerRunState();
696
+ setTimeout(() => {
697
+ startChild();
698
+ }, delay);
699
+ } else {
700
+ if (RESTART && !shouldRestart(code, signal)) {
701
+ const reason = nikiTerminated ? 'niki received signal' :
702
+ (state.killedBy && NO_RESTART_REASONS.has(state.killedBy)) ? `hard kill (${state.killedBy})` :
703
+ (MAX_RESTARTS > 0 && state.restarts >= MAX_RESTARTS) ? `max restarts reached (${MAX_RESTARTS})` :
704
+ 'restart not enabled';
705
+ log(`NOT RESTARTING — ${reason}`, 'warn');
706
+ }
707
+ if (logStream) logStream.end();
708
+ process.exit(code ?? 1);
709
+ }
710
+ });
711
+ }
583
712
 
584
713
  // --- Signal forwarding ---
585
714
 
586
715
  for (const sig of ['SIGINT', 'SIGTERM']) {
587
716
  process.on(sig, () => {
588
- log(`Received ${sig}, forwarding to child`);
717
+ log(`Received ${sig}, forwarding to child`, 'warn');
718
+ nikiTerminated = true;
589
719
  if (child) child.kill(sig);
590
720
  });
591
721
  }
722
+
723
+ // --- Start ---
724
+
725
+ startChild();
package/bin/niki.bak ADDED
@@ -0,0 +1,665 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * niki — Deterministic process supervisor for AI agents.
5
+ *
6
+ * Wraps a child command (e.g. `claude -p`) and enforces:
7
+ * - Token budget (kill if exceeded)
8
+ * - Wall-clock timeout (kill if exceeded)
9
+ * - Tool-call rate limiting (kill if agent floods)
10
+ * - Diagnostics logging
11
+ * - Automatic restart on exit (optional)
12
+ *
13
+ * Usage:
14
+ * niki [options] -- <command> [args...]
15
+ * niki --budget 500000 --timeout 3600 -- claude -p "..." --verbose
16
+ *
17
+ * Security:
18
+ * - Never logs or exposes API tokens
19
+ * - Inherits env from parent (tokens stay in env, never in CLI args)
20
+ * - Diagnostics only contain counters, never message content
21
+ */
22
+
23
+ import { spawn, execSync } from 'node:child_process';
24
+ import { createWriteStream, writeFileSync, mkdirSync, existsSync, readFileSync } from 'node:fs';
25
+ import { dirname, resolve } from 'node:path';
26
+ import { parseArgs } from 'node:util';
27
+
28
+ // --- Argument parsing ---
29
+
30
+ const SEPARATOR = process.argv.indexOf('--');
31
+ if (SEPARATOR === -1 || SEPARATOR === process.argv.length - 1) {
32
+ console.error(`niki — deterministic agent supervisor
33
+
34
+ Usage: niki [options] -- <command> [args...]
35
+
36
+ Options:
37
+ --budget <tokens> Max total tokens (input+output) before SIGTERM (default: 1000000)
38
+ --timeout <seconds> Max wall-clock runtime before SIGTERM (default: 3600)
39
+ --max-sends <n> Max agentchat_send calls per minute (default: 10)
40
+ --max-tool-calls <n> Max total tool calls per minute (default: 30)
41
+ --stall-timeout <secs> Kill after N seconds of no output (default: 60, 0=disabled)
42
+ --startup-timeout <s> Longer stall timeout until first output (default: 180, 0=use stall-timeout)
43
+ --dead-air-timeout <m> Minutes of zero CPU + zero output before kill (default: 5, 0=disabled)
44
+ --max-nudges <n> Max stdin nudge attempts before kill on stall (default: 3)
45
+ --log <file> Write diagnostics log to file
46
+ --state <file> Write state JSON on exit (budget used, reason, etc.)
47
+ --cooldown <seconds> Grace period after SIGTERM before SIGKILL (default: 5)
48
+ --abort-file <path> Poll this file for external abort signal
49
+ --poll-interval <ms> Base poll interval in ms for abort file (default: 1000)
50
+ --restart Restart the child process when it exits (default: off)
51
+ --max-restarts <n> Max restart attempts, 0=unlimited (default: 0)
52
+ --restart-delay <secs> Delay between restarts with ±30% jitter (default: 5)
53
+
54
+ Examples:
55
+ niki --budget 500000 -- claude -p "your prompt" --verbose
56
+ niki --timeout 1800 --max-sends 5 -- claude -p "..." --model sonnet --verbose
57
+ niki --restart --max-restarts 10 -- gro --model gpt-5.2 "your prompt"`);
58
+ process.exit(1);
59
+ }
60
+
61
// Everything before `--` belongs to niki; the first token after it is the
// command to supervise and the remainder are that command's arguments.
const nikiArgs = process.argv.slice(2, SEPARATOR);
const [childCmd, ...childArgs] = process.argv.slice(SEPARATOR + 1);

// Builds a string-typed option descriptor, with a default when one is given.
const stringOption = (fallback) =>
  (fallback === undefined ? { type: 'string' } : { type: 'string', default: fallback });

// All numeric options are parsed as strings here and converted afterwards.
const { values: opts } = parseArgs({
  args: nikiArgs,
  options: {
    budget: stringOption('1000000'),
    timeout: stringOption('3600'),
    'max-sends': stringOption('10'),
    'max-tool-calls': stringOption('30'),
    'stall-timeout': stringOption('60'),
    'startup-timeout': stringOption('180'),
    'dead-air-timeout': stringOption('5'),
    'max-nudges': stringOption('3'),
    log: stringOption(),
    state: stringOption(),
    cooldown: stringOption('5'),
    'abort-file': stringOption(),
    'poll-interval': stringOption('1000'),
    restart: { type: 'boolean', default: false },
    'max-restarts': stringOption('0'),
    'restart-delay': stringOption('5'),
  },
});
86
+
87
/**
 * Converts a numeric CLI option and rejects values that would silently break
 * the supervisor: NaN makes every `>` / `<=` comparison false (so limits are
 * never enforced) and turns `setTimeout` delays into ~1ms timers.
 *
 * @param {string} name - Option name as declared for parseArgs (without `--`).
 * @param {(raw: string) => number} parse - String-to-number converter.
 * @returns {number} The parsed, finite, non-negative value.
 */
function numericOption(name, parse) {
  const raw = opts[name];
  const value = parse(raw);
  if (!Number.isFinite(value) || value < 0) {
    console.error(`niki: invalid value for --${name}: ${raw}`);
    process.exit(1);
  }
  return value;
}

// Integer and fractional variants of the validated conversion.
const intOption = (name) => numericOption(name, (raw) => Number.parseInt(raw, 10));
const floatOption = (name) => numericOption(name, (raw) => Number.parseFloat(raw));

const BUDGET = intOption('budget');
const TIMEOUT_S = intOption('timeout');
const MAX_SENDS = intOption('max-sends');
const MAX_TOOL_CALLS = intOption('max-tool-calls');
const STALL_TIMEOUT_S = intOption('stall-timeout');
const STARTUP_TIMEOUT_S = intOption('startup-timeout');
const DEAD_AIR_TIMEOUT_M = floatOption('dead-air-timeout');
const MAX_NUDGES = intOption('max-nudges');
const COOLDOWN_S = intOption('cooldown');
const ABORT_FILE = opts['abort-file'] ? resolve(opts['abort-file']) : null;
const POLL_INTERVAL = intOption('poll-interval');
const LOG_FILE = opts.log;
const STATE_FILE = opts.state;
const RESTART = opts.restart;
const MAX_RESTARTS = intOption('max-restarts');
const RESTART_DELAY_S = floatOption('restart-delay');
103
+
104
+ // --- State ---
105
+
106
// Supervisor bookkeeping. This object is what writeState() serializes to the
// --state file, so it holds only counters and status fields.
const state = {
  startedAt: new Date().toISOString(), // set once at module load
  pid: null, // pid of the most recently spawned child
  tokensIn: 0,
  tokensOut: 0,
  tokensTotal: 0, // tokensIn + tokensOut, compared against the budget
  toolCalls: 0,
  sendCalls: 0,
  toolCallsThisMinute: 0, // size of the sliding window, refreshed by checkRateLimits()
  sendCallsThisMinute: 0,
  exitCode: null,
  exitSignal: null,
  killedBy: null, // 'budget' | 'timeout' | 'rate-sends' | 'rate-tools' | 'abort' | 'stall' | 'dead-air' | null
  duration: 0, // seconds since startedAt, filled in when the child exits
  stallEvents: 0,
  nudges: 0,
  deadAirChecks: 0,
  restarts: 0,
};

// Sliding window for per-minute rate limiting
const toolCallTimestamps = [];
const sendCallTimestamps = [];
129
+
130
+ // --- Logging ---
131
+
132
// Optional append-mode log file; its directory is created on demand.
let logStream = null;
if (LOG_FILE) {
  const logPath = resolve(LOG_FILE);
  mkdirSync(dirname(logPath), { recursive: true });
  logStream = createWriteStream(logPath, { flags: 'a' });
}

/**
 * Emits one timestamped diagnostic line to the log file (when configured)
 * and to stderr, so whatever supervises niki can capture it.
 *
 * @param {string} msg - Message text.
 */
function log(msg) {
  const stamped = `[${new Date().toISOString()}] ${msg}`;
  if (logStream) {
    logStream.write(`${stamped}\n`);
  }
  process.stderr.write(`[niki] ${stamped}\n`);
}
144
+
145
/**
 * Persists the `state` object as pretty-printed JSON to the --state file.
 * Does nothing when no state file was requested. Failures are ignored on
 * purpose: the state file is a best-effort diagnostic.
 */
function writeState() {
  if (!STATE_FILE) {
    return;
  }
  try {
    const target = resolve(STATE_FILE);
    mkdirSync(dirname(target), { recursive: true });
    // Never include env, tokens, or message content — only counters
    const serialized = JSON.stringify(state, null, 2);
    writeFileSync(target, serialized + '\n');
  } catch {
    // Best effort
  }
}
155
+
156
+ // --- Token parsing from stderr ---
157
+
158
+ // Claude --verbose outputs token usage in stderr. Patterns vary by version.
159
+ // We look for common patterns and extract numbers.
160
+ //
161
+ // Known patterns:
162
+ // "input_tokens": 1234
163
+ // "output_tokens": 567
164
+ // tokens: { input: 1234, output: 567 }
165
+ // Input tokens: 1234
166
+ // Output tokens: 567
167
+
168
// Each entry pairs a global regex (capture group 1 = the count) with the
// direction it reports: 'in' for input tokens, 'out' for output tokens.
const TOKEN_PATTERNS = [
  // JSON-style: "input_tokens": 1234
  { regex: /"input_tokens"\s*:\s*(\d+)/g, field: 'in' },
  { regex: /"output_tokens"\s*:\s*(\d+)/g, field: 'out' },
  // Human-readable: Input tokens: 1234
  { regex: /Input tokens:\s*(\d+)/gi, field: 'in' },
  { regex: /Output tokens:\s*(\d+)/gi, field: 'out' },
];

/**
 * Scans one stderr line for token counts and folds them into `state`.
 * The highest value seen per direction is kept (not a running sum), and
 * `state.tokensTotal` is refreshed after every positive match.
 *
 * @param {string} line - A single line of the child's stderr.
 */
function parseTokens(line) {
  for (const { regex, field } of TOKEN_PATTERNS) {
    const slot = field === 'in' ? 'tokensIn' : 'tokensOut';
    for (const [, digits] of line.matchAll(regex)) {
      const count = parseInt(digits, 10);
      // Skip NaN, zero and negatives.
      if (!(count > 0)) continue;
      state[slot] = Math.max(state[slot], count);
      state.tokensTotal = state.tokensIn + state.tokensOut;
    }
  }
}
193
+
194
+ // --- Tool call detection from stderr ---
195
+
196
+ // Claude --verbose logs tool calls. We detect sends specifically.
197
// A line counts as a tool call when it matches TOOL_CALL_PATTERN; it is
// additionally counted as a "send" when it mentions agentchat_send.
const TOOL_CALL_PATTERN = /(?:Using tool|Tool call|tool_use).*?(\w+)/i;
const SEND_PATTERN = /agentchat_send/i;

/**
 * Records a tool call (and possibly a send) found on one stderr line:
 * bumps the cumulative counters and appends to the sliding windows.
 *
 * @param {string} line - A single line of the child's stderr.
 */
function parseToolCall(line) {
  if (!TOOL_CALL_PATTERN.test(line)) {
    return;
  }
  const seenAt = Date.now();
  state.toolCalls += 1;
  toolCallTimestamps.push(seenAt);

  if (!SEND_PATTERN.test(line)) {
    return;
  }
  state.sendCalls += 1;
  sendCallTimestamps.push(seenAt);
}
212
+
213
+ // --- Rate limit checking ---
214
+
215
/**
 * Drops, in place, the leading timestamps that are older than one minute.
 * Only the front of the array is examined, so entries are expected in the
 * order they were pushed.
 *
 * @param {number[]} timestamps - Epoch-millisecond timestamps, oldest first.
 */
function pruneWindow(timestamps) {
  const oldestAllowed = Date.now() - 60_000; // 1 minute window
  let expired = 0;
  while (expired < timestamps.length && timestamps[expired] < oldestAllowed) {
    expired += 1;
  }
  if (expired > 0) {
    timestamps.splice(0, expired);
  }
}
221
+
222
/**
 * Refreshes both sliding windows, publishes their sizes on `state`, and
 * reports which per-minute limit (if any) is exceeded. Sends are checked
 * before general tool calls.
 *
 * @returns {'rate-sends' | 'rate-tools' | null} Violated limit, or null.
 */
function checkRateLimits() {
  for (const window of [toolCallTimestamps, sendCallTimestamps]) {
    pruneWindow(window);
  }

  const toolsInWindow = toolCallTimestamps.length;
  const sendsInWindow = sendCallTimestamps.length;
  state.toolCallsThisMinute = toolsInWindow;
  state.sendCallsThisMinute = sendsInWindow;

  if (sendsInWindow > MAX_SENDS) return 'rate-sends';
  return toolsInWindow > MAX_TOOL_CALLS ? 'rate-tools' : null;
}
237
+
238
+ // --- Kill logic ---
239
+
240
let child = null; // currently supervised child process (reassigned on restart)
let killed = false; // true once a kill has been issued for the current run

/**
 * Terminates the current child: SIGTERM first, then SIGKILL once the
 * cooldown elapses if it still has not exited. Records the reason in
 * `state.killedBy`. A no-op when a kill is already in progress or no child
 * exists.
 *
 * @param {string} reason - Why the child is being killed (e.g. 'budget').
 */
function killChild(reason) {
  if (killed || !child) return;
  killed = true;
  state.killedBy = reason;
  log(`KILL — reason: ${reason} | tokens: ${state.tokensTotal}/${BUDGET} | sends: ${state.sendCallsThisMinute}/min | tools: ${state.toolCallsThisMinute}/min`);

  // Pin the process being killed. `child` is reassigned when niki restarts,
  // and the restart delay can be shorter than the cooldown, so the delayed
  // SIGKILL must not dereference the module-level variable.
  const target = child;
  target.kill('SIGTERM');

  // Grace period, then SIGKILL — only if this process is still running.
  const escalation = setTimeout(() => {
    if (target.exitCode !== null || target.signalCode !== null) return;
    try {
      target.kill('SIGKILL');
    } catch {
      // Already dead
    }
  }, COOLDOWN_S * 1000);

  // No escalation needed once the process is gone.
  target.once('exit', () => clearTimeout(escalation));
}
260
+
261
+ // --- Prompt pattern detection ---
262
+
263
// Output fragments that look like an interactive confirmation prompt.
const PROMPT_PATTERNS = [
  /\(y\/n\)/i,
  /\[Y\/n\]/i,
  /\[y\/N\]/i,
  /\(yes\/no\)/i,
  /Do you want to trust/i,
  /Do you want to allow/i,
  /Press Enter to continue/i,
  /Are you sure/i,
];

/**
 * Checks a chunk of child stdout for an interactive prompt. On a match it
 * logs the first 100 characters, counts a stall event and closes stdin.
 *
 * NOTE(review): this logs a slice of child output, while the file header
 * says diagnostics never contain message content — confirm this is intended.
 *
 * @param {string} text - Decoded stdout chunk.
 * @returns {boolean} True when a prompt pattern matched.
 */
function checkForPrompts(text) {
  const looksLikePrompt = PROMPT_PATTERNS.some((pattern) => pattern.test(text));
  if (!looksLikePrompt) {
    return false;
  }
  log(`PROMPT detected in stdout: ${text.trim().substring(0, 100)}`);
  state.stallEvents++;
  // Close stdin to dismiss the prompt
  closeStdin();
  return true;
}
286
+
287
+ // --- CPU liveness sampling ---
288
+
289
+ // Reads cumulative CPU time (user+system) for a process tree.
290
+ // Returns total CPU milliseconds, or -1 if unavailable.
291
+ // On Linux: reads /proc/<pid>/stat (works in containers).
292
+ // On macOS: uses ps command as fallback.
293
let lastCpuMs = null; // null = no prior sample taken yet

/**
 * Converts `ps -o cputime=` output to milliseconds. Accepts `M:SS`,
 * `HH:MM:SS`, an optional `DD-` day prefix, and fractional seconds.
 *
 * @param {string} output - Raw ps output.
 * @returns {number} Milliseconds, or -1 when the text cannot be parsed.
 */
function parsePsCpuTimeMs(output) {
  const text = output.trim();
  if (!text) return -1;

  // Long-running processes are reported as DD-HH:MM:SS.
  const dash = text.indexOf('-');
  const days = dash === -1 ? 0 : Number(text.slice(0, dash));
  const clock = dash === -1 ? text : text.slice(dash + 1);

  const parts = clock.split(':').map(Number);
  if (parts.length < 2 || parts.length > 3) return -1;
  if (!Number.isFinite(days) || parts.some((part) => !Number.isFinite(part))) return -1;

  const [seconds, minutes, hours = 0] = parts.reverse();
  return (((days * 24 + hours) * 60 + minutes) * 60 + seconds) * 1000;
}

/**
 * Reads the cumulative CPU time (user + system, including reaped children
 * on the /proc path) consumed by a process.
 *
 * Uses /proc/<pid>/stat when that file exists, otherwise falls back to `ps`.
 * Never returns NaN: anything unreadable or unparsable yields -1, which
 * callers treat as "cannot measure".
 *
 * @param {number} pid - Process id to sample.
 * @returns {number} CPU milliseconds, or -1 if unavailable.
 */
function sampleCpuMs(pid) {
  // The pid is interpolated into a shell command below; accept only a
  // positive integer.
  if (!Number.isInteger(pid) || pid <= 0) return -1;

  try {
    const statPath = `/proc/${pid}/stat`;
    if (existsSync(statPath)) {
      const stat = readFileSync(statPath, 'utf8');
      // comm (field 2) may contain spaces/parens, so split only what
      // follows the last ')'. After that, index 0 is field 3 (state).
      const fields = stat.substring(stat.lastIndexOf(')') + 2).split(' ');
      // Indices 11..14 = utime, stime, cutime, cstime (fields 14..17).
      const ticks = [11, 12, 13, 14].reduce(
        (sum, index) => sum + (Number.parseInt(fields[index], 10) || 0),
        0,
      );
      // Convert clock ticks to ms (typically 100 ticks/sec on Linux)
      const ticksPerSec = 100;
      return (ticks / ticksPerSec) * 1000;
    }

    // macOS / fallback: use ps to get cumulative CPU time
    const output = execSync(`ps -o cputime= -p ${pid} 2>/dev/null`, { encoding: 'utf8', timeout: 3000 });
    return parsePsCpuTimeMs(output);
  } catch {
    return -1;
  }
}
328
+
329
+ // Returns true if the child process has consumed CPU since the last sample.
330
/**
 * Reports whether the process used any CPU since the previous sample,
 * updating `lastCpuMs` along the way. Answers true ("assume alive") when
 * the CPU time cannot be measured or when this is the first sample.
 *
 * @param {number} pid - Process id to sample.
 * @returns {boolean} True if CPU time increased or could not be compared.
 */
function hasConsumedCpu(pid) {
  const sample = sampleCpuMs(pid);
  // Can't measure → assume alive (safe default); the stored sample is kept.
  if (sample < 0) {
    return true;
  }

  const previous = lastCpuMs;
  lastCpuMs = sample;

  // No baseline yet → assume alive; otherwise any increase means work.
  return previous === null ? true : sample > previous;
}
343
+
344
+ // --- Jitter utility ---
345
+
346
/**
 * Spreads a delay uniformly over ±30% of its base value.
 *
 * @param {number} base - Nominal delay.
 * @returns {number} A value in [0.7 * base, 1.3 * base).
 */
function jitteredDelay(base) {
  const spread = base * 0.3;
  const offset = Math.random() * 2 * spread - spread;
  return base + offset;
}
351
+
352
+ // --- Dead air detection ---
353
+
354
let deadAirStart = null; // Timestamp when dead air began (null = not in dead air)
let deadAirPollId = null; // handle of the pending dead-air poll timer

/**
 * One dead-air poll. CPU activity clears any running dead-air period; with
 * no CPU activity the period starts (or continues) and the child is killed
 * with reason 'dead-air' once it has lasted DEAD_AIR_TIMEOUT_M minutes.
 * Skipped when a kill is in progress, there is no child, or the feature is
 * disabled.
 */
function checkDeadAir() {
  if (killed || !child || DEAD_AIR_TIMEOUT_M <= 0) return;

  state.deadAirChecks++;
  const cpuActive = hasConsumedCpu(child.pid);
  const silenceSec = Math.round((Date.now() - lastOutputTime) / 1000);

  if (cpuActive) {
    // Process is working — end any dead-air period and leave it alone.
    if (deadAirStart) {
      const quietSec = Math.round((Date.now() - deadAirStart) / 1000);
      log(`Dead air cleared — CPU active after ${quietSec}s of silence`);
      deadAirStart = null;
    }
    return;
  }

  // No CPU activity: open a dead-air period if one is not already running.
  if (!deadAirStart) {
    deadAirStart = Date.now();
    log(`Dead air started — zero CPU, ${silenceSec}s silence`);
  }

  const deadAirMin = (Date.now() - deadAirStart) / 60_000;
  if (deadAirMin >= DEAD_AIR_TIMEOUT_M) {
    log(`DEAD AIR — zero CPU + zero output for ${Math.round(deadAirMin)}min (threshold: ${DEAD_AIR_TIMEOUT_M}min)`);
    killChild('dead-air');
    return;
  }

  const elapsedTenths = Math.round(deadAirMin * 10) / 10;
  log(`Dead air check — zero CPU, ${elapsedTenths}/${DEAD_AIR_TIMEOUT_M}min, ${silenceSec}s silence`);
}
388
+
389
// Poll interval: one third of the threshold, clamped to the range [2s, 30s].
// Falls back to 30s when dead-air detection is disabled.
const DEAD_AIR_POLL_MS = DEAD_AIR_TIMEOUT_M > 0
  ? Math.min(30_000, Math.max(2_000, (DEAD_AIR_TIMEOUT_M * 60_000) / 3))
  : 30_000;

/**
 * Arms the next dead-air poll with a jittered delay. Each poll re-arms
 * itself until a kill has been issued. No-op when detection is disabled.
 */
function scheduleDeadAirPoll() {
  if (killed || DEAD_AIR_TIMEOUT_M <= 0) {
    return;
  }
  const runPoll = () => {
    checkDeadAir();
    if (!killed) {
      scheduleDeadAirPoll();
    }
  };
  deadAirPollId = setTimeout(runPoll, jitteredDelay(DEAD_AIR_POLL_MS));
}
401
+
402
+ // --- Stall detection ---
403
+
404
let stallTimer = null; // pending stall timeout handle
let nudgeCount = 0; // nudges issued during the current run
let stdinClosed = false; // whether the child's stdin has been ended
let lastOutputTime = Date.now(); // when the child last produced output
let gotFirstOutput = false; // flips on the first output of a run

/**
 * Bookkeeping for every chunk of child output (stdout or stderr): stamps
 * the output time, ends any dead-air period, notes the first output of the
 * run, and re-arms the stall timer.
 */
function onChildOutput() {
  const now = Date.now();
  lastOutputTime = now;

  // Reset dead air — got real output
  if (deadAirStart) {
    log(`Dead air cleared — received output after ${Math.round((Date.now() - deadAirStart) / 1000)}s`);
    deadAirStart = null;
  }

  if (!gotFirstOutput) {
    gotFirstOutput = true;
    const sinceStartSec = Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000);
    log(`First output received after ${sinceStartSec}s — switching to stall-timeout=${STALL_TIMEOUT_S}s`);
  }

  resetStallTimer();
}
423
+
424
/**
 * Picks the stall timeout currently in force: the startup timeout until the
 * child has produced its first output (when that timeout is enabled),
 * otherwise the regular stall timeout.
 *
 * @returns {number} Timeout in seconds.
 */
function currentStallTimeout() {
  const stillStartingUp = !gotFirstOutput && STARTUP_TIMEOUT_S > 0;
  return stillStartingUp ? STARTUP_TIMEOUT_S : STALL_TIMEOUT_S;
}
429
+
430
/**
 * Cancels any pending stall timer and arms a new one using the timeout
 * currently in force. Nothing is armed once a kill has been issued or when
 * the timeout is zero or negative.
 */
function resetStallTimer() {
  if (stallTimer) {
    clearTimeout(stallTimer);
  }
  const seconds = currentStallTimeout();
  const disabled = killed || seconds <= 0;
  if (!disabled) {
    stallTimer = setTimeout(onStallDetected, seconds * 1000);
  }
}
436
+
437
/**
 * Ends the child's stdin (EOF) at most once per run and logs it. Errors
 * from ending an already-closed stream are ignored.
 */
function closeStdin() {
  const nothingToClose = stdinClosed || !child;
  if (nothingToClose) {
    return;
  }
  stdinClosed = true;
  try {
    child.stdin.end();
  } catch {
    /* already closed */
  }
  log('Stdin: closed (EOF)');
}
443
+
444
/**
 * Stall-timer callback. Escalates one step per firing:
 *   1. close stdin, if it is still open;
 *   2. count a nudge, while nudges remain and stdin is still writable;
 *   3. when dead-air detection is enabled and the child shows CPU activity,
 *      defer to the dead-air poller;
 *   4. otherwise kill the child with reason 'stall'.
 * Steps 1-3 re-arm the stall timer.
 */
function onStallDetected() {
  if (killed) return;

  state.stallEvents++;
  const silence = Math.round((Date.now() - lastOutputTime) / 1000);
  log(`STALL — no output for ${silence}s (nudges: ${nudgeCount}/${MAX_NUDGES})`);

  // Step 1: close stdin.
  if (!stdinClosed) {
    closeStdin();
    resetStallTimer();
    return;
  }

  // Step 2: nudge.
  const canNudge = nudgeCount < MAX_NUDGES && !child.stdin.writableEnded;
  if (canNudge) {
    nudgeCount++;
    state.nudges = nudgeCount;
    log(`Stall nudge #${nudgeCount}`);
    resetStallTimer();
    return;
  }

  // Step 3: with dead-air detection on, only stall-kill a child that shows
  // no CPU activity; a busy child is left to the dead-air poller.
  const deadAirEnabled = DEAD_AIR_TIMEOUT_M > 0 && child;
  if (deadAirEnabled && hasConsumedCpu(child.pid)) {
    log(`Stall deferred — process has CPU activity, deferring to dead-air detection`);
    resetStallTimer();
    return;
  }

  // Step 4: kill.
  killChild('stall');
}
478
+
479
+ // --- Restart logic ---
480
+
481
+ // Reasons that should NOT trigger a restart (hard limits / operator intent)
482
// Kill reasons that must never lead to a restart (hard limits / operator intent).
const NO_RESTART_REASONS = new Set(['budget', 'rate-sends', 'rate-tools', 'abort']);
let nikiTerminated = false; // Set when niki itself receives SIGTERM/SIGINT

/**
 * Decides whether the child should be started again after it exits.
 *
 * @param {number|null} code - Child exit code (not consulted).
 * @param {string|null} signal - Signal that ended the child (not consulted).
 * @returns {boolean} True when restarts are enabled, niki was not told to
 *   stop, the kill reason permits it, and the restart cap is not reached.
 */
function shouldRestart(code, signal) {
  if (!RESTART || nikiTerminated) {
    return false;
  }
  const hardKill = Boolean(state.killedBy) && NO_RESTART_REASONS.has(state.killedBy);
  const capReached = MAX_RESTARTS > 0 && state.restarts >= MAX_RESTARTS;
  return !(hardKill || capReached);
}
492
+
493
/**
 * Prepares module state for the next run after a restart. Per-run flags,
 * stall/dead-air tracking and the rate-limit windows are cleared; the
 * cumulative counters on `state` (tokens, toolCalls, restarts, …) are kept.
 */
function resetPerRunState() {
  // Stall / dead-air tracking starts over.
  lastOutputTime = Date.now();
  gotFirstOutput = false;
  stdinClosed = false;
  nudgeCount = 0;
  lastCpuMs = null;
  deadAirStart = null;

  // Per-run outcome flags.
  killed = false;
  Object.assign(state, {
    killedBy: null,
    exitCode: null,
    exitSignal: null,
    gotFirstOutput: undefined,
  });

  // Rate-limit windows: a new run is a fresh session.
  for (const window of [toolCallTimestamps, sendCallTimestamps]) {
    window.length = 0;
  }
  state.toolCallsThisMinute = 0;
  state.sendCallsThisMinute = 0;
}
515
+
516
+ // --- Abort file polling ---
517
+
518
let abortPollId = null; // handle of the pending abort-file poll timer

/**
 * Arms a jittered poll for the abort file. When the file exists the child
 * is killed with reason 'abort'; otherwise the poll re-arms itself.
 * No-op without an abort file or once a kill has been issued.
 */
function scheduleAbortPoll() {
  if (!ABORT_FILE || killed) {
    return;
  }
  const poll = () => {
    if (killed) return;
    if (!existsSync(ABORT_FILE)) {
      scheduleAbortPoll();
      return;
    }
    log(`Abort file detected: ${ABORT_FILE}`);
    killChild('abort');
  };
  abortPollId = setTimeout(poll, jitteredDelay(POLL_INTERVAL));
}
532
+
533
+ // --- Spawn and monitor child ---
534
+
535
let timeoutId = null; // per-run wall-clock timeout handle

/**
 * Spawns the supervised command and wires up all monitoring for this run:
 * stdout/stderr passthrough and parsing, budget and rate-limit enforcement,
 * stall / dead-air / abort-file detection, the wall-clock timeout, and the
 * exit handler that either schedules a restart or ends niki.
 *
 * A spawn failure (e.g. command not found) is reported through the child's
 * 'error' event; it is handled here so niki logs it and exits with code 1
 * instead of crashing on an unhandled event.
 */
function startChild() {
  log(`Starting: ${childCmd} ${childArgs.join(' ').substring(0, 100)}...`);
  log(`Budget: ${BUDGET} tokens | Timeout: ${TIMEOUT_S}s | Startup: ${STARTUP_TIMEOUT_S}s | Stall: ${STALL_TIMEOUT_S}s | Dead air: ${DEAD_AIR_TIMEOUT_M}min | Max sends: ${MAX_SENDS}/min | Max tools: ${MAX_TOOL_CALLS}/min`);
  if (RESTART) {
    log(`Restart: enabled | max: ${MAX_RESTARTS || 'unlimited'} | delay: ${RESTART_DELAY_S}s ±30% | restarts so far: ${state.restarts}`);
  }

  // Keep a local reference: `child` is reassigned on restart, and the
  // handlers below must keep talking about the process of this run.
  const proc = spawn(childCmd, childArgs, {
    stdio: ['pipe', 'pipe', 'pipe'],
    env: process.env,
  });
  child = proc;
  state.pid = proc.pid;

  // Close stdin immediately — claude -p should never need interactive input.
  closeStdin();

  // Cancels every per-run timer.
  const stopMonitors = () => {
    clearTimeout(timeoutId);
    if (stallTimer) clearTimeout(stallTimer);
    if (abortPollId) clearTimeout(abortPollId);
    if (deadAirPollId) clearTimeout(deadAirPollId);
  };

  // Flushes the log stream and terminates niki.
  const shutdown = (exitCode) => {
    if (logStream) logStream.end();
    process.exit(exitCode);
  };

  // --- Spawn / process errors ---

  proc.on('error', (err) => {
    log(`Child process error: ${err.message}`);
    // A pid means the process did start (the error is e.g. a failed kill);
    // the 'exit' handler remains responsible for shutdown in that case.
    if (proc.pid !== undefined) return;

    stopMonitors();
    state.duration = Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000);
    state.gotFirstOutput = gotFirstOutput;
    writeState();
    shutdown(1);
  });

  // --- Monitor stdout ---

  proc.stdout.on('data', (chunk) => {
    process.stdout.write(chunk);
    onChildOutput();
    checkForPrompts(chunk.toString());
  });

  // --- Monitor stderr ---

  // Holds the trailing partial line until the rest of it arrives.
  let stderrBuffer = '';

  proc.stderr.on('data', (chunk) => {
    const text = chunk.toString();
    process.stderr.write(chunk);
    onChildOutput();

    stderrBuffer += text;
    const lines = stderrBuffer.split('\n');
    stderrBuffer = lines.pop();

    for (const line of lines) {
      parseTokens(line);
      parseToolCall(line);

      if (state.tokensTotal > BUDGET) {
        killChild('budget');
        return;
      }

      const rateViolation = checkRateLimits();
      if (rateViolation) {
        killChild(rateViolation);
        return;
      }
    }
  });

  // Start stall detection
  if (STALL_TIMEOUT_S > 0 || STARTUP_TIMEOUT_S > 0) {
    log(`Stall detection: startup-timeout=${STARTUP_TIMEOUT_S}s, stall-timeout=${STALL_TIMEOUT_S}s, max-nudges=${MAX_NUDGES}`);
    resetStallTimer();
  }

  // Start dead air detection
  if (DEAD_AIR_TIMEOUT_M > 0) {
    log(`Dead air detection: ${DEAD_AIR_TIMEOUT_M}min threshold, ${Math.round(DEAD_AIR_POLL_MS / 1000)}s poll interval`);
    scheduleDeadAirPoll();
  }

  // Abort file polling
  if (ABORT_FILE) {
    log(`Abort file: ${ABORT_FILE} (poll: ${POLL_INTERVAL}ms ±30% jitter)`);
    scheduleAbortPoll();
  }

  // Per-run timeout
  timeoutId = setTimeout(() => {
    killChild('timeout');
  }, TIMEOUT_S * 1000);

  // --- Exit handler ---

  proc.on('exit', (code, signal) => {
    stopMonitors();

    state.exitCode = code;
    state.exitSignal = signal;
    state.duration = Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000);
    state.gotFirstOutput = gotFirstOutput;

    log(`Exit — code: ${code} signal: ${signal} | tokens: ${state.tokensTotal} | tools: ${state.toolCalls} | sends: ${state.sendCalls} | duration: ${state.duration}s | output: ${gotFirstOutput}${state.killedBy ? ` | killed: ${state.killedBy}` : ''} | restarts: ${state.restarts}`);
    writeState();

    if (shouldRestart(code, signal)) {
      state.restarts++;
      const delay = jitteredDelay(RESTART_DELAY_S * 1000);
      log(`RESTART — attempt ${state.restarts}${MAX_RESTARTS > 0 ? `/${MAX_RESTARTS}` : ''} in ${Math.round(delay / 1000)}s`);
      resetPerRunState();
      setTimeout(() => {
        // A termination signal may have arrived during the delay; honour
        // it instead of spawning a new child.
        if (nikiTerminated) {
          log('NOT RESTARTING — niki received signal');
          shutdown(code ?? 1);
          return;
        }
        startChild();
      }, delay);
      return;
    }

    if (RESTART) {
      const reason = nikiTerminated ? 'niki received signal' :
        (state.killedBy && NO_RESTART_REASONS.has(state.killedBy)) ? `hard kill (${state.killedBy})` :
        (MAX_RESTARTS > 0 && state.restarts >= MAX_RESTARTS) ? `max restarts reached (${MAX_RESTARTS})` :
        'restart not enabled';
      log(`NOT RESTARTING — ${reason}`);
    }
    shutdown(code ?? 1);
  });
}
652
+
653
+ // --- Signal forwarding ---
654
+
655
// Forward termination signals to the child. If no child is currently running
// (for example during the delay between restarts) there is nothing to forward
// to, so niki exits right away instead of letting a pending restart timer
// spawn a fresh child after the operator asked it to stop.
for (const sig of ['SIGINT', 'SIGTERM']) {
  process.on(sig, () => {
    // Tells shouldRestart() that the operator wants niki to stop.
    nikiTerminated = true;

    const childAlive = Boolean(child) && child.exitCode === null && child.signalCode === null;
    if (childAlive) {
      log(`Received ${sig}, forwarding to child`);
      child.kill(sig);
      return;
    }

    log(`Received ${sig} with no running child, exiting`);
    if (logStream) logStream.end();
    // Conventional 128 + signal number exit status (SIGINT=2, SIGTERM=15).
    process.exit(sig === 'SIGINT' ? 130 : 143);
  });
}

// --- Start ---

startChild();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tjamescouch/niki",
3
- "version": "0.3.0",
3
+ "version": "0.5.1",
4
4
  "description": "Deterministic process supervisor for AI agents — token budgets, rate limits, and abort control",
5
5
  "bin": {
6
6
  "niki": "./bin/niki"