@tjamescouch/niki 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -1
- package/bin/niki.bak +0 -665
- package/niki.png +0 -0
- package/tests/test-niki.sh +0 -275
package/package.json
CHANGED
|
@@ -1,10 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tjamescouch/niki",
|
|
3
|
-
"version": "0.5.1",
|
|
3
|
+
"version": "0.5.2",
|
|
4
4
|
"description": "Deterministic process supervisor for AI agents — token budgets, rate limits, and abort control",
|
|
5
5
|
"bin": {
|
|
6
6
|
"niki": "./bin/niki"
|
|
7
7
|
},
|
|
8
|
+
"files": [
|
|
9
|
+
"bin/niki",
|
|
10
|
+
"README.md",
|
|
11
|
+
"LICENSE"
|
|
12
|
+
],
|
|
8
13
|
"type": "module",
|
|
9
14
|
"license": "MIT",
|
|
10
15
|
"repository": {
|
package/bin/niki.bak
DELETED
|
@@ -1,665 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* niki — Deterministic process supervisor for AI agents.
|
|
5
|
-
*
|
|
6
|
-
* Wraps a child command (e.g. `claude -p`) and enforces:
|
|
7
|
-
* - Token budget (kill if exceeded)
|
|
8
|
-
* - Wall-clock timeout (kill if exceeded)
|
|
9
|
-
* - Tool-call rate limiting (kill if agent floods)
|
|
10
|
-
* - Diagnostics logging
|
|
11
|
-
* - Automatic restart on exit (optional)
|
|
12
|
-
*
|
|
13
|
-
* Usage:
|
|
14
|
-
* niki [options] -- <command> [args...]
|
|
15
|
-
* niki --budget 500000 --timeout 3600 -- claude -p "..." --verbose
|
|
16
|
-
*
|
|
17
|
-
* Security:
|
|
18
|
-
* - Never logs or exposes API tokens
|
|
19
|
-
* - Inherits env from parent (tokens stay in env, never in CLI args)
|
|
20
|
-
* - Diagnostics only contain counters, never message content
|
|
21
|
-
*/
|
|
22
|
-
|
|
23
|
-
import { spawn, execSync } from 'node:child_process';
|
|
24
|
-
import { createWriteStream, writeFileSync, mkdirSync, existsSync, readFileSync } from 'node:fs';
|
|
25
|
-
import { dirname, resolve } from 'node:path';
|
|
26
|
-
import { parseArgs } from 'node:util';
|
|
27
|
-
|
|
28
|
-
// --- Argument parsing ---
|
|
29
|
-
|
|
30
|
-
// Position of the `--` token that splits niki's own options from the child command line.
const SEPARATOR = process.argv.indexOf('--');

// Without `--`, or with nothing after it, there is no command to supervise:
// print the usage text to stderr and exit with status 1.
if (SEPARATOR < 0 || SEPARATOR === process.argv.length - 1) {
  const usage = [
    'niki — deterministic agent supervisor',
    '',
    'Usage: niki [options] -- <command> [args...]',
    '',
    'Options:',
    '--budget <tokens> Max total tokens (input+output) before SIGTERM (default: 1000000)',
    '--timeout <seconds> Max wall-clock runtime before SIGTERM (default: 3600)',
    '--max-sends <n> Max agentchat_send calls per minute (default: 10)',
    '--max-tool-calls <n> Max total tool calls per minute (default: 30)',
    '--stall-timeout <secs> Kill after N seconds of no output (default: 60, 0=disabled)',
    '--startup-timeout <s> Longer stall timeout until first output (default: 180, 0=use stall-timeout)',
    '--dead-air-timeout <m> Minutes of zero CPU + zero output before kill (default: 5, 0=disabled)',
    '--max-nudges <n> Max stdin nudge attempts before kill on stall (default: 3)',
    '--log <file> Write diagnostics log to file',
    '--state <file> Write state JSON on exit (budget used, reason, etc.)',
    '--cooldown <seconds> Grace period after SIGTERM before SIGKILL (default: 5)',
    '--abort-file <path> Poll this file for external abort signal',
    '--poll-interval <ms> Base poll interval in ms for abort file (default: 1000)',
    '--restart Restart the child process when it exits (default: off)',
    '--max-restarts <n> Max restart attempts, 0=unlimited (default: 0)',
    '--restart-delay <secs> Delay between restarts with ±30% jitter (default: 5)',
    '',
    'Examples:',
    'niki --budget 500000 -- claude -p "your prompt" --verbose',
    'niki --timeout 1800 --max-sends 5 -- claude -p "..." --model sonnet --verbose',
    'niki --restart --max-restarts 10 -- gro --model gpt-5.2 "your prompt"',
  ].join('\n');
  console.error(usage);
  process.exit(1);
}
|
|
60
|
-
|
|
61
|
-
// Everything between the script name and `--` belongs to niki; the first token
// after `--` is the command to supervise and the rest are its arguments.
const nikiArgs = process.argv.slice(2, SEPARATOR);
const childCmd = process.argv[SEPARATOR + 1];
const childArgs = process.argv.slice(SEPARATOR + 2);

// All value options are declared as strings and converted below.
const { values: opts } = parseArgs({
  args: nikiArgs,
  options: {
    budget: { type: 'string', default: '1000000' },
    timeout: { type: 'string', default: '3600' },
    'max-sends': { type: 'string', default: '10' },
    'max-tool-calls': { type: 'string', default: '30' },
    'stall-timeout': { type: 'string', default: '60' },
    'startup-timeout': { type: 'string', default: '180' },
    'dead-air-timeout': { type: 'string', default: '5' },
    'max-nudges': { type: 'string', default: '3' },
    log: { type: 'string' },
    state: { type: 'string' },
    cooldown: { type: 'string', default: '5' },
    'abort-file': { type: 'string' },
    'poll-interval': { type: 'string', default: '1000' },
    restart: { type: 'boolean', default: false },
    'max-restarts': { type: 'string', default: '0' },
    'restart-delay': { type: 'string', default: '5' },
  },
});

/**
 * Convert a numeric command-line option to a number.
 *
 * A value that does not parse would otherwise become NaN and silently defeat
 * the limit it configures (every `x > NaN` comparison is false, and a NaN
 * timer delay fires almost immediately), so it is rejected up front.
 *
 * @param {string} name - Option name without the leading `--`.
 * @param {(raw: string) => number} parse - Converter applied to the raw string.
 * @returns {number} The parsed value; the process exits with status 1 if it is NaN.
 */
function numericOption(name, parse) {
  const raw = opts[name];
  const value = parse(raw);
  if (Number.isNaN(value)) {
    console.error(`niki: invalid value for --${name}: ${JSON.stringify(raw)} (expected a number)`);
    process.exit(1);
  }
  return value;
}

const asInteger = (raw) => Number.parseInt(raw, 10);
const asFloat = (raw) => Number.parseFloat(raw);

const BUDGET = numericOption('budget', asInteger);
const TIMEOUT_S = numericOption('timeout', asInteger);
const MAX_SENDS = numericOption('max-sends', asInteger);
const MAX_TOOL_CALLS = numericOption('max-tool-calls', asInteger);
const STALL_TIMEOUT_S = numericOption('stall-timeout', asInteger);
const STARTUP_TIMEOUT_S = numericOption('startup-timeout', asInteger);
const DEAD_AIR_TIMEOUT_M = numericOption('dead-air-timeout', asFloat);
const MAX_NUDGES = numericOption('max-nudges', asInteger);
const COOLDOWN_S = numericOption('cooldown', asInteger);
const ABORT_FILE = opts['abort-file'] ? resolve(opts['abort-file']) : null;
const POLL_INTERVAL = numericOption('poll-interval', asInteger);
const LOG_FILE = opts.log;
const STATE_FILE = opts.state;
const RESTART = opts.restart;
const MAX_RESTARTS = numericOption('max-restarts', asInteger);
const RESTART_DELAY_S = numericOption('restart-delay', asFloat);
|
|
103
|
-
|
|
104
|
-
// --- State ---
|
|
105
|
-
|
|
106
|
-
// Supervisor bookkeeping. This object is what writeState() serialises, so the
// key order below is also the key order of the state file.
const state = {
  // When niki itself started (ISO-8601 string).
  startedAt: new Date().toISOString(),
  // PID of the most recently spawned child.
  pid: null,

  // Token usage as reported on the child's stderr.
  tokensIn: 0,
  tokensOut: 0,
  tokensTotal: 0,

  // Tool-call counters: totals, then the size of the current one-minute window.
  toolCalls: 0,
  sendCalls: 0,
  toolCallsThisMinute: 0,
  sendCallsThisMinute: 0,

  // How the child ended.
  exitCode: null,
  exitSignal: null,
  killedBy: null, // 'budget' | 'timeout' | 'rate-sends' | 'rate-tools' | 'abort' | 'stall' | 'dead-air' | null
  duration: 0,

  // Liveness diagnostics.
  stallEvents: 0,
  nudges: 0,
  deadAirChecks: 0,
  restarts: 0,
};

// Timestamps (ms since epoch) of recent calls, used as sliding one-minute windows.
const toolCallTimestamps = [];
const sendCallTimestamps = [];
|
|
129
|
-
|
|
130
|
-
// --- Logging ---
|
|
131
|
-
|
|
132
|
-
// Append-mode stream for the diagnostics log; stays null when --log was not given.
let logStream = null;
if (LOG_FILE) {
  const logPath = resolve(LOG_FILE);
  // Create the parent directory first so the stream can open the file.
  mkdirSync(dirname(logPath), { recursive: true });
  logStream = createWriteStream(logPath, { flags: 'a' });
}
|
|
137
|
-
|
|
138
|
-
/**
 * Emit one timestamped diagnostics line.
 *
 * The line goes to the log file when one is open, and always to stderr with a
 * `[niki]` prefix so a parent supervisor can capture it.
 *
 * @param {string} msg - Message text.
 */
function log(msg) {
  const stamped = `[${new Date().toISOString()}] ${msg}`;
  logStream?.write(`${stamped}\n`);
  process.stderr.write(`[niki] ${stamped}\n`);
}
|
|
144
|
-
|
|
145
|
-
/**
 * Persist the `state` object as pretty-printed JSON to the --state file.
 *
 * Does nothing when no state file was configured. Failures are ignored on
 * purpose: the write is best-effort and must not take the supervisor down.
 */
function writeState() {
  if (!STATE_FILE) {
    return;
  }
  try {
    const target = resolve(STATE_FILE);
    mkdirSync(dirname(target), { recursive: true });
    // Only counters are serialised — never env, tokens, or message content.
    const payload = JSON.stringify(state, null, 2) + '\n';
    writeFileSync(target, payload);
  } catch {
    // Best effort
  }
}
|
|
155
|
-
|
|
156
|
-
// --- Token parsing from stderr ---
|
|
157
|
-
|
|
158
|
-
// Claude --verbose outputs token usage in stderr. Patterns vary by version.
|
|
159
|
-
// We look for common patterns and extract numbers.
|
|
160
|
-
//
|
|
161
|
-
// Known patterns:
|
|
162
|
-
// "input_tokens": 1234
|
|
163
|
-
// "output_tokens": 567
|
|
164
|
-
// tokens: { input: 1234, output: 567 }
|
|
165
|
-
// Input tokens: 1234
|
|
166
|
-
// Output tokens: 567
|
|
167
|
-
|
|
168
|
-
// Recognised token-usage formats. `field` says which counter a match feeds.
const TOKEN_PATTERNS = [
  // JSON-style: "input_tokens": 1234
  { regex: /"input_tokens"\s*:\s*(\d+)/g, field: 'in' },
  { regex: /"output_tokens"\s*:\s*(\d+)/g, field: 'out' },
  // Object-style: tokens: { input: 1234, output: 567 }
  { regex: /\btokens\s*:\s*\{[^}]*?\binput\s*:\s*(\d+)/gi, field: 'in' },
  { regex: /\btokens\s*:\s*\{[^}]*?\boutput\s*:\s*(\d+)/gi, field: 'out' },
  // Human-readable: Input tokens: 1234
  { regex: /Input tokens:\s*(\d+)/gi, field: 'in' },
  { regex: /Output tokens:\s*(\d+)/gi, field: 'out' },
];

/**
 * Scan one stderr line for token counts and fold them into `state`.
 *
 * Each counter keeps the largest value seen so far (it is not summed), and
 * `state.tokensTotal` is recomputed as input + output after every match.
 * Zero counts are ignored.
 *
 * @param {string} line - A single line of child stderr output.
 */
function parseTokens(line) {
  for (const { regex, field } of TOKEN_PATTERNS) {
    // matchAll works on a copy of the regex, so the shared /g patterns never
    // carry lastIndex state from one line to the next.
    for (const match of line.matchAll(regex)) {
      const count = Number.parseInt(match[1], 10);
      if (Number.isNaN(count) || count <= 0) continue;
      if (field === 'in') {
        state.tokensIn = Math.max(state.tokensIn, count);
      } else {
        state.tokensOut = Math.max(state.tokensOut, count);
      }
      state.tokensTotal = state.tokensIn + state.tokensOut;
    }
  }
}
|
|
193
|
-
|
|
194
|
-
// --- Tool call detection from stderr ---
|
|
195
|
-
|
|
196
|
-
// Claude --verbose logs tool calls. We detect sends specifically.
|
|
197
|
-
// A line counts as a tool call when it mentions one of these markers.
const TOOL_CALL_PATTERN = /(?:Using tool|Tool call|tool_use).*?(\w+)/i;
// Tool-call lines that also mention this name are counted as sends.
const SEND_PATTERN = /agentchat_send/i;

/**
 * Record a tool call (and, if applicable, a send) found on one stderr line.
 *
 * Bumps the cumulative counters in `state` and pushes the current time onto
 * the matching sliding-window arrays. Lines without a tool-call marker are
 * ignored.
 *
 * @param {string} line - A single line of child stderr output.
 */
function parseToolCall(line) {
  if (!TOOL_CALL_PATTERN.test(line)) {
    return;
  }

  const seenAt = Date.now();
  state.toolCalls += 1;
  toolCallTimestamps.push(seenAt);

  if (!SEND_PATTERN.test(line)) {
    return;
  }
  state.sendCalls += 1;
  sendCallTimestamps.push(seenAt);
}
|
|
212
|
-
|
|
213
|
-
// --- Rate limit checking ---
|
|
214
|
-
|
|
215
|
-
/**
 * Drop leading entries older than one minute from a timestamp window.
 *
 * The array is modified in place. Only the run of expired entries at the
 * front is removed; scanning stops at the first entry inside the window.
 *
 * @param {number[]} timestamps - Millisecond timestamps, oldest first.
 */
function pruneWindow(timestamps) {
  const oldestAllowed = Date.now() - 60_000; // 1 minute window
  let expired = 0;
  while (expired < timestamps.length && timestamps[expired] < oldestAllowed) {
    expired += 1;
  }
  if (expired > 0) {
    timestamps.splice(0, expired);
  }
}
|
|
221
|
-
|
|
222
|
-
/**
 * Refresh the one-minute windows and report the first exceeded rate limit.
 *
 * Updates `state.toolCallsThisMinute` and `state.sendCallsThisMinute` as a
 * side effect. The send limit is checked before the tool-call limit.
 *
 * @returns {'rate-sends' | 'rate-tools' | null} The violated limit, or null.
 */
function checkRateLimits() {
  for (const window of [toolCallTimestamps, sendCallTimestamps]) {
    pruneWindow(window);
  }

  const toolsInWindow = toolCallTimestamps.length;
  const sendsInWindow = sendCallTimestamps.length;
  state.toolCallsThisMinute = toolsInWindow;
  state.sendCallsThisMinute = sendsInWindow;

  if (sendsInWindow > MAX_SENDS) {
    return 'rate-sends';
  }
  return toolsInWindow > MAX_TOOL_CALLS ? 'rate-tools' : null;
}
|
|
237
|
-
|
|
238
|
-
// --- Kill logic ---
|
|
239
|
-
|
|
240
|
-
// The currently supervised child process, and whether a kill was already issued for it.
let child = null;
let killed = false;

/**
 * Terminate the current child: SIGTERM now, SIGKILL after the cooldown.
 *
 * Only the first call per run has any effect. The reason is recorded in
 * `state.killedBy` and logged together with the current counters.
 *
 * @param {string} reason - Why the child is being killed (e.g. 'budget', 'timeout').
 */
function killChild(reason) {
  if (killed || !child) return;
  killed = true;
  state.killedBy = reason;
  log(`KILL — reason: ${reason} | tokens: ${state.tokensTotal}/${BUDGET} | sends: ${state.sendCallsThisMinute}/min | tools: ${state.toolCallsThisMinute}/min`);

  // Pin the process being killed. `child` is reassigned on restart, and the
  // escalation below must never hit a freshly started replacement.
  const target = child;
  target.kill('SIGTERM');

  // Grace period, then SIGKILL — but only if this process is still running.
  setTimeout(() => {
    if (target.exitCode !== null || target.signalCode !== null) return;
    try {
      target.kill('SIGKILL');
    } catch {
      // Already dead
    }
  }, COOLDOWN_S * 1000);
}
|
|
260
|
-
|
|
261
|
-
// --- Prompt pattern detection ---
|
|
262
|
-
|
|
263
|
-
// Output fragments that indicate the child is waiting for interactive input.
// All patterns are case-insensitive, so one bracketed y/n form covers both
// the [Y/n] and [y/N] spellings.
const PROMPT_PATTERNS = [
  /\(y\/n\)/i,
  /\[y\/n\]/i,
  /\(yes\/no\)/i,
  /Do you want to trust/i,
  /Do you want to allow/i,
  /Press Enter to continue/i,
  /Are you sure/i,
];

/**
 * Detect an interactive prompt in child stdout and close stdin in response.
 *
 * On a match the event is logged, `state.stallEvents` is incremented and
 * closeStdin() is called. The log line names the matched pattern and the
 * chunk length only: diagnostics must never contain message content, so the
 * child's output itself is not written to the log.
 *
 * @param {string} text - A chunk of child stdout.
 * @returns {boolean} True when a prompt pattern matched.
 */
function checkForPrompts(text) {
  const matched = PROMPT_PATTERNS.find((pattern) => pattern.test(text));
  if (!matched) {
    return false;
  }
  log(`PROMPT detected in stdout: matched ${matched} (${text.length} chars)`);
  state.stallEvents++;
  // Close stdin to dismiss the prompt
  closeStdin();
  return true;
}
|
|
286
|
-
|
|
287
|
-
// --- CPU liveness sampling ---
|
|
288
|
-
|
|
289
|
-
// Reads cumulative CPU time (user+system) for a process tree.
|
|
290
|
-
// Returns total CPU milliseconds, or -1 if unavailable.
|
|
291
|
-
// On Linux: reads /proc/<pid>/stat (works in containers).
|
|
292
|
-
// On macOS: uses ps command as fallback.
|
|
293
|
-
let lastCpuMs = null; // null = no prior sample taken yet

/**
 * Extract cumulative CPU time from the contents of /proc/<pid>/stat.
 *
 * @param {string} stat - Raw contents of the stat file.
 * @returns {number} utime + stime + cutime + cstime in milliseconds, or -1 if
 *   the line has no closing parenthesis after the command name.
 */
function parseProcStatCpuMs(stat) {
  // comm (field 2) can contain spaces and parentheses, so locate its closing
  // paren and split only what follows it.
  const commEnd = stat.lastIndexOf(')');
  if (commEnd === -1) return -1;
  const fields = stat.substring(commEnd + 2).split(' ');
  // After comm: fields[11] = utime, [12] = stime, [13] = cutime, [14] = cstime.
  const ticks = [11, 12, 13, 14].reduce(
    (sum, index) => sum + (Number.parseInt(fields[index], 10) || 0),
    0,
  );
  // Assumes 100 clock ticks per second. Callers only compare successive
  // samples, so a different tick rate changes the scale but not the result.
  const ticksPerSec = 100;
  return (ticks / ticksPerSec) * 1000;
}

/**
 * Convert `ps -o cputime=` output to milliseconds.
 *
 * Accepts `[[DD-]HH:]MM:SS` with an optional fractional seconds part.
 *
 * @param {string} output - Text printed by ps.
 * @returns {number} CPU time in milliseconds, or -1 if the text is not recognised.
 */
function parsePsCpuTimeMs(output) {
  const match = /^(?:(?:(\d+)-)?(\d+):)?(\d+):(\d+(?:[.,]\d+)?)$/.exec(output.trim());
  if (!match) return -1;
  const [, days = '0', hours = '0', minutes, seconds] = match;
  const totalSeconds =
    Number(days) * 86_400 +
    Number(hours) * 3600 +
    Number(minutes) * 60 +
    Number(seconds.replace(',', '.'));
  return totalSeconds * 1000;
}

/**
 * Read the cumulative CPU time of a process.
 *
 * Uses /proc/<pid>/stat when that file exists, otherwise falls back to `ps`.
 *
 * @param {number} pid - Process id to sample.
 * @returns {number} CPU time in milliseconds, or -1 when it cannot be
 *   determined (invalid pid, process gone, unparsable output, command failure).
 */
function sampleCpuMs(pid) {
  // The pid is interpolated into a shell command below; accept integers only.
  if (!Number.isInteger(pid) || pid <= 0) return -1;
  try {
    const statPath = `/proc/${pid}/stat`;
    if (existsSync(statPath)) {
      return parseProcStatCpuMs(readFileSync(statPath, 'utf8'));
    }

    // macOS / fallback: use ps to get cumulative CPU time
    const output = execSync(`ps -o cputime= -p ${pid} 2>/dev/null`, { encoding: 'utf8', timeout: 3000 }).trim();
    if (!output) return -1;
    return parsePsCpuTimeMs(output);
  } catch {
    return -1;
  }
}
|
|
328
|
-
|
|
329
|
-
// Returns true if the child process has consumed CPU since the last sample.
|
|
330
|
-
/**
 * Report whether the process used any CPU time since the previous sample.
 *
 * The latest reading is stored in `lastCpuMs` for the next comparison. When
 * the reading is unusable (negative or not a finite number) the process is
 * assumed alive and the stored sample is left untouched, so one bad reading
 * cannot make a busy process look idle.
 *
 * @param {number} pid - Process id to sample.
 * @returns {boolean} True if CPU time increased, on the first sample, or when
 *   CPU time cannot be measured.
 */
function hasConsumedCpu(pid) {
  const cpuMs = sampleCpuMs(pid);
  // Can't measure → assume alive (safe default)
  if (!Number.isFinite(cpuMs) || cpuMs < 0) return true;

  const prev = lastCpuMs;
  lastCpuMs = cpuMs;

  // First sample — no delta yet, assume alive
  if (prev === null) return true;

  // If CPU time increased at all, process is doing work
  return cpuMs > prev;
}
|
|
343
|
-
|
|
344
|
-
// --- Jitter utility ---
|
|
345
|
-
|
|
346
|
-
/**
 * Spread a delay randomly by up to 30% in either direction.
 *
 * @param {number} base - Nominal delay.
 * @returns {number} A value between 0.7 × base and 1.3 × base, chosen with Math.random().
 */
function jitteredDelay(base) {
  const spread = base * 0.3;
  const offset = Math.random() * 2 * spread - spread;
  return base + offset;
}
|
|
351
|
-
|
|
352
|
-
// --- Dead air detection ---
|
|
353
|
-
|
|
354
|
-
let deadAirStart = null; // Timestamp when dead air began (null = not in dead air)
let deadAirPollId = null; // Handle of the pending dead-air poll timer

/**
 * Run one dead-air check: kill the child after a sustained period with no CPU
 * activity.
 *
 * CPU activity ends any dead-air period in progress. Otherwise the period is
 * started (or continued), and once it has lasted DEAD_AIR_TIMEOUT_M minutes
 * the child is killed with reason 'dead-air'. Does nothing when a kill is
 * already under way, there is no child, or the feature is disabled.
 */
function checkDeadAir() {
  if (killed || !child || DEAD_AIR_TIMEOUT_M <= 0) {
    return;
  }

  state.deadAirChecks += 1;
  const cpuBusy = hasConsumedCpu(child.pid);
  const quietSeconds = Math.round((Date.now() - lastOutputTime) / 1000);

  if (cpuBusy) {
    // Process is working — reset dead air, let it cook
    if (deadAirStart) {
      log(`Dead air cleared — CPU active after ${Math.round((Date.now() - deadAirStart) / 1000)}s of silence`);
      deadAirStart = null;
    }
    return;
  }

  // Zero CPU + zero output: open a dead-air period if none is running.
  if (!deadAirStart) {
    deadAirStart = Date.now();
    log(`Dead air started — zero CPU, ${quietSeconds}s silence`);
  }

  const elapsedMinutes = (Date.now() - deadAirStart) / 60_000;
  if (elapsedMinutes < DEAD_AIR_TIMEOUT_M) {
    log(`Dead air check — zero CPU, ${Math.round(elapsedMinutes * 10) / 10}/${DEAD_AIR_TIMEOUT_M}min, ${quietSeconds}s silence`);
    return;
  }

  log(`DEAD AIR — zero CPU + zero output for ${Math.round(elapsedMinutes)}min (threshold: ${DEAD_AIR_TIMEOUT_M}min)`);
  killChild('dead-air');
}
|
|
388
|
-
|
|
389
|
-
// Poll interval: min(30s, threshold/3) — fast polls for short thresholds, 30s cap for production
|
|
390
|
-
// Base interval between dead-air checks: one third of the threshold, kept
// between 2s and 30s. Falls back to 30s when the threshold is not positive.
const DEAD_AIR_POLL_MS = (() => {
  if (!(DEAD_AIR_TIMEOUT_M > 0)) {
    return 30_000;
  }
  const thirdOfThreshold = (DEAD_AIR_TIMEOUT_M * 60_000) / 3;
  return Math.min(30_000, Math.max(2_000, thirdOfThreshold));
})();

/**
 * Arm the next dead-air check.
 *
 * Each check re-arms the following one with a jittered delay until a kill has
 * been issued. Does nothing when dead-air detection is disabled.
 */
function scheduleDeadAirPoll() {
  if (killed) return;
  if (DEAD_AIR_TIMEOUT_M <= 0) return;

  const runCheck = () => {
    checkDeadAir();
    if (!killed) {
      scheduleDeadAirPoll();
    }
  };
  deadAirPollId = setTimeout(runCheck, jitteredDelay(DEAD_AIR_POLL_MS));
}
|
|
401
|
-
|
|
402
|
-
// --- Stall detection ---
|
|
403
|
-
|
|
404
|
-
// Per-run stall-detection state.
let stallTimer = null; // Handle of the pending stall timer
let nudgeCount = 0; // Nudges recorded during the current run
let stdinClosed = false; // Whether the child's stdin has been ended
let lastOutputTime = Date.now(); // When the child last produced output
let gotFirstOutput = false; // Whether the current run has produced any output

/**
 * Note that the child just produced output.
 *
 * Refreshes the last-output timestamp, ends any dead-air period, logs the
 * first output of a run, and restarts the stall timer.
 */
function onChildOutput() {
  const now = Date.now();
  lastOutputTime = now;

  // Reset dead air — got real output
  if (deadAirStart) {
    log(`Dead air cleared — received output after ${Math.round((Date.now() - deadAirStart) / 1000)}s`);
    deadAirStart = null;
  }

  if (!gotFirstOutput) {
    gotFirstOutput = true;
    const sinceStartSeconds = Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000);
    log(`First output received after ${sinceStartSeconds}s — switching to stall-timeout=${STALL_TIMEOUT_S}s`);
  }

  resetStallTimer();
}
|
|
423
|
-
|
|
424
|
-
/**
 * Pick the stall timeout that applies right now.
 *
 * @returns {number} STARTUP_TIMEOUT_S while the run has produced no output
 *   (and that timeout is positive), otherwise STALL_TIMEOUT_S. In seconds.
 */
function currentStallTimeout() {
  const awaitingFirstOutput = !gotFirstOutput;
  return awaitingFirstOutput && STARTUP_TIMEOUT_S > 0 ? STARTUP_TIMEOUT_S : STALL_TIMEOUT_S;
}
|
|
429
|
-
|
|
430
|
-
/**
 * Cancel the pending stall timer and arm a fresh one.
 *
 * No new timer is armed once a kill has been issued or when the applicable
 * timeout is zero or negative.
 */
function resetStallTimer() {
  if (stallTimer) {
    clearTimeout(stallTimer);
  }
  const seconds = currentStallTimeout();
  const shouldArm = !killed && !(seconds <= 0);
  if (shouldArm) {
    stallTimer = setTimeout(onStallDetected, seconds * 1000);
  }
}
|
|
436
|
-
|
|
437
|
-
/**
 * End the child's stdin (send EOF) once per run.
 *
 * Later calls in the same run, and calls made before a child exists, do nothing.
 */
function closeStdin() {
  const canClose = child && !stdinClosed;
  if (!canClose) {
    return;
  }
  stdinClosed = true;
  try {
    child.stdin.end();
  } catch {
    /* already closed */
  }
  log('Stdin: closed (EOF)');
}
|
|
443
|
-
|
|
444
|
-
/**
 * Stall-timer callback: escalate step by step towards killing a silent child.
 *
 * Order of escalation, one step per timer expiry:
 *   1. close stdin if it is still open;
 *   2. record a nudge while the nudge budget allows and stdin is still writable;
 *   3. if dead-air detection is enabled and the child is using CPU, defer;
 *   4. otherwise kill the child with reason 'stall'.
 *
 * NOTE(review): step 2 looks unreachable — it requires stdin to be closed
 * (step 1) and still writable at the same time, and nothing is written to
 * stdin when a nudge is recorded. Confirm the intended behaviour.
 */
function onStallDetected() {
  if (killed) {
    return;
  }

  state.stallEvents += 1;
  const quietSeconds = Math.round((Date.now() - lastOutputTime) / 1000);
  log(`STALL — no output for ${quietSeconds}s (nudges: ${nudgeCount}/${MAX_NUDGES})`);

  // Escalation: close stdin → nudge → check CPU → kill
  if (!stdinClosed) {
    closeStdin();
    resetStallTimer();
    return;
  }

  const canNudge = nudgeCount < MAX_NUDGES && !child.stdin.writableEnded;
  if (canNudge) {
    nudgeCount += 1;
    state.nudges = nudgeCount;
    log(`Stall nudge #${nudgeCount}`);
    resetStallTimer();
    return;
  }

  // With dead-air detection enabled, a child that is still using CPU is left
  // to the dead-air poller; only a child with no CPU activity is stall-killed.
  const deadAirEnabled = DEAD_AIR_TIMEOUT_M > 0 && child;
  if (deadAirEnabled && hasConsumedCpu(child.pid)) {
    log(`Stall deferred — process has CPU activity, deferring to dead-air detection`);
    resetStallTimer();
    return;
  }

  killChild('stall');
}
|
|
478
|
-
|
|
479
|
-
// --- Restart logic ---
|
|
480
|
-
|
|
481
|
-
// Reasons that should NOT trigger a restart (hard limits / operator intent)
|
|
482
|
-
// Kill reasons that must never lead to a restart (hard limits / operator intent).
const NO_RESTART_REASONS = new Set(['budget', 'rate-sends', 'rate-tools', 'abort']);
let nikiTerminated = false; // Set when niki itself receives SIGTERM/SIGINT

/**
 * Decide whether the child should be started again after it exited.
 *
 * @param {number|null} code - Child exit code (not used by the decision).
 * @param {string|null} signal - Signal that ended the child (not used by the decision).
 * @returns {boolean} True only when restarting is enabled, niki was not told
 *   to terminate, the kill reason is not a hard stop, and the restart budget
 *   is not exhausted.
 */
function shouldRestart(code, signal) {
  const hardStop = Boolean(state.killedBy) && NO_RESTART_REASONS.has(state.killedBy);
  const budgetExhausted = MAX_RESTARTS > 0 && state.restarts >= MAX_RESTARTS;
  return Boolean(RESTART) && !nikiTerminated && !hardStop && !budgetExhausted;
}
|
|
492
|
-
|
|
493
|
-
/**
 * Prepare module state for a fresh run of the child.
 *
 * Clears the per-run flags, the stall and dead-air trackers and the rate
 * windows. Cumulative counters (tokens, tool calls, restarts, …) are kept.
 */
function resetPerRunState() {
  // Kill / exit bookkeeping of the previous run.
  killed = false;
  Object.assign(state, {
    killedBy: null,
    exitCode: null,
    exitSignal: null,
    gotFirstOutput: undefined,
  });

  // Stall and dead-air detection start over.
  nudgeCount = 0;
  stdinClosed = false;
  lastOutputTime = Date.now();
  gotFirstOutput = false;
  lastCpuMs = null;
  deadAirStart = null;

  // Rate-limit windows start empty (fresh session).
  toolCallTimestamps.length = 0;
  sendCallTimestamps.length = 0;
  Object.assign(state, { toolCallsThisMinute: 0, sendCallsThisMinute: 0 });
}
|
|
515
|
-
|
|
516
|
-
// --- Abort file polling ---
|
|
517
|
-
|
|
518
|
-
let abortPollId = null; // Handle of the pending abort-file poll timer

/**
 * Arm the next check for the external abort file.
 *
 * When the file exists at poll time the child is killed with reason 'abort';
 * otherwise the next poll is armed with a jittered delay. Does nothing when no
 * abort file is configured or a kill was already issued.
 */
function scheduleAbortPoll() {
  if (killed || !ABORT_FILE) {
    return;
  }

  const poll = () => {
    if (killed) return;
    if (!existsSync(ABORT_FILE)) {
      scheduleAbortPoll();
      return;
    }
    log(`Abort file detected: ${ABORT_FILE}`);
    killChild('abort');
  };
  abortPollId = setTimeout(poll, jitteredDelay(POLL_INTERVAL));
}
|
|
532
|
-
|
|
533
|
-
// --- Spawn and monitor child ---
|
|
534
|
-
|
|
535
|
-
let timeoutId = null; // Handle of the per-run wall-clock timeout

/**
 * Spawn the child command and wire up all supervision for one run.
 *
 * Forwards the child's stdout/stderr, parses stderr for token usage and tool
 * calls, enforces the budget and rate limits, arms the stall, dead-air,
 * abort-file and wall-clock timers, and on termination either schedules a
 * restart or exits niki with the child's exit code (1 if there is none).
 */
function startChild() {
  log(`Starting: ${childCmd} ${childArgs.join(' ').substring(0, 100)}...`);
  log(`Budget: ${BUDGET} tokens | Timeout: ${TIMEOUT_S}s | Startup: ${STARTUP_TIMEOUT_S}s | Stall: ${STALL_TIMEOUT_S}s | Dead air: ${DEAD_AIR_TIMEOUT_M}min | Max sends: ${MAX_SENDS}/min | Max tools: ${MAX_TOOL_CALLS}/min`);
  if (RESTART) {
    log(`Restart: enabled | max: ${MAX_RESTARTS || 'unlimited'} | delay: ${RESTART_DELAY_S}s ±30% | restarts so far: ${state.restarts}`);
  }

  const proc = spawn(childCmd, childArgs, {
    stdio: ['pipe', 'pipe', 'pipe'],
    env: process.env,
  });
  child = proc;
  state.pid = proc.pid;

  // Close stdin immediately — claude -p should never need interactive input.
  closeStdin();

  // Exit niki only after the log file has been flushed: process.exit() does
  // not wait for pending stream writes, so the final lines could be lost.
  const exitAfterLogFlush = (exitCode) => {
    if (!logStream) {
      process.exit(exitCode);
      return;
    }
    const stream = logStream;
    logStream = null; // later log() calls must not write to an ended stream
    stream.end(() => process.exit(exitCode));
  };

  // --- Monitor stdout ---

  proc.stdout.on('data', (chunk) => {
    process.stdout.write(chunk);
    onChildOutput();
    checkForPrompts(chunk.toString());
  });

  // --- Monitor stderr ---

  let stderrBuffer = '';

  // Parse one complete stderr line and enforce limits.
  // Returns true when a kill was issued.
  const inspectStderrLine = (line) => {
    parseTokens(line);
    parseToolCall(line);

    if (state.tokensTotal > BUDGET) {
      killChild('budget');
      return true;
    }

    const rateViolation = checkRateLimits();
    if (rateViolation) {
      killChild(rateViolation);
      return true;
    }
    return false;
  };

  proc.stderr.on('data', (chunk) => {
    process.stderr.write(chunk);
    onChildOutput();

    // Only complete lines are parsed; the trailing partial line waits for more data.
    stderrBuffer += chunk.toString();
    const lines = stderrBuffer.split('\n');
    stderrBuffer = lines.pop();

    for (const line of lines) {
      if (inspectStderrLine(line)) return;
    }
  });

  // Start stall detection
  if (STALL_TIMEOUT_S > 0 || STARTUP_TIMEOUT_S > 0) {
    log(`Stall detection: startup-timeout=${STARTUP_TIMEOUT_S}s, stall-timeout=${STALL_TIMEOUT_S}s, max-nudges=${MAX_NUDGES}`);
    resetStallTimer();
  }

  // Start dead air detection
  if (DEAD_AIR_TIMEOUT_M > 0) {
    log(`Dead air detection: ${DEAD_AIR_TIMEOUT_M}min threshold, ${Math.round(DEAD_AIR_POLL_MS / 1000)}s poll interval`);
    scheduleDeadAirPoll();
  }

  // Abort file polling
  if (ABORT_FILE) {
    log(`Abort file: ${ABORT_FILE} (poll: ${POLL_INTERVAL}ms ±30% jitter)`);
    scheduleAbortPoll();
  }

  // Per-run timeout. Timer delays above 2^31-1 ms are treated as 1 ms by
  // Node, which would kill the child at once, so clamp to the maximum.
  timeoutId = setTimeout(() => {
    killChild('timeout');
  }, Math.min(TIMEOUT_S * 1000, 2_147_483_647));

  // --- Termination handling ---

  let finished = false;

  // Runs once per child, whether it exited or never started.
  const finish = (code, signal, spawnFailed) => {
    if (finished) return;
    finished = true;

    clearTimeout(timeoutId);
    if (stallTimer) clearTimeout(stallTimer);
    if (abortPollId) clearTimeout(abortPollId);
    if (deadAirPollId) clearTimeout(deadAirPollId);

    // Account for a final stderr line that had no trailing newline.
    if (stderrBuffer) {
      parseTokens(stderrBuffer);
      parseToolCall(stderrBuffer);
      stderrBuffer = '';
    }

    state.exitCode = code;
    state.exitSignal = signal;
    state.duration = Math.round((Date.now() - new Date(state.startedAt).getTime()) / 1000);
    state.gotFirstOutput = gotFirstOutput;

    log(`Exit — code: ${code} signal: ${signal} | tokens: ${state.tokensTotal} | tools: ${state.toolCalls} | sends: ${state.sendCalls} | duration: ${state.duration}s | output: ${gotFirstOutput}${state.killedBy ? ` | killed: ${state.killedBy}` : ''} | restarts: ${state.restarts}`);
    writeState();

    // A command that cannot be spawned will not spawn on the next attempt either.
    if (!spawnFailed && shouldRestart(code, signal)) {
      state.restarts++;
      const delay = jitteredDelay(RESTART_DELAY_S * 1000);
      log(`RESTART — attempt ${state.restarts}${MAX_RESTARTS > 0 ? `/${MAX_RESTARTS}` : ''} in ${Math.round(delay / 1000)}s`);
      resetPerRunState();
      setTimeout(() => {
        // A termination signal may have arrived while waiting to restart.
        if (nikiTerminated) {
          log('NOT RESTARTING — niki received signal');
          exitAfterLogFlush(code ?? 1);
          return;
        }
        startChild();
      }, delay);
      return;
    }

    if (RESTART) {
      const reason = spawnFailed ? 'child failed to start' :
        nikiTerminated ? 'niki received signal' :
        (state.killedBy && NO_RESTART_REASONS.has(state.killedBy)) ? `hard kill (${state.killedBy})` :
        (MAX_RESTARTS > 0 && state.restarts >= MAX_RESTARTS) ? `max restarts reached (${MAX_RESTARTS})` :
        'restart not enabled';
      log(`NOT RESTARTING — ${reason}`);
    }
    exitAfterLogFlush(code ?? 1);
  };

  proc.on('exit', (code, signal) => finish(code, signal, false));

  // Without an 'error' listener a failed spawn (e.g. command not found) is
  // thrown as an uncaught exception. 'exit' may or may not follow 'error',
  // hence the run-once guard in finish().
  proc.on('error', (err) => {
    log(`Child process error — ${err.message}`);
    // A child that has a pid did start; this error came from a later
    // operation on it, and 'exit' will still report its termination.
    if (proc.pid === undefined) finish(null, null, true);
  });
}
|
|
652
|
-
|
|
653
|
-
// --- Signal forwarding ---
|
|
654
|
-
|
|
655
|
-
// Forward SIGINT/SIGTERM to the child and remember that niki itself was asked
// to stop, so the child is not restarted afterwards. When no child is running
// (for example while waiting out a restart delay) there is nothing to forward
// the signal to, so niki exits right away instead of starting another run.
for (const sig of ['SIGINT', 'SIGTERM']) {
  process.on(sig, () => {
    nikiTerminated = true;

    const childRunning = Boolean(child) && child.exitCode === null && child.signalCode === null;
    if (childRunning) {
      log(`Received ${sig}, forwarding to child`);
      child.kill(sig);
      return;
    }

    log(`Received ${sig} with no running child, exiting`);
    // Conventional shell status for death by signal: 128 + signal number.
    const exitCode = sig === 'SIGINT' ? 130 : 143;
    if (logStream) {
      const stream = logStream;
      logStream = null; // no further writes to a stream that is being closed
      stream.end(() => process.exit(exitCode)); // exit once the log is flushed
    } else {
      process.exit(exitCode);
    }
  });
}
|
|
662
|
-
|
|
663
|
-
// --- Start ---
|
|
664
|
-
|
|
665
|
-
startChild();
|
package/niki.png
DELETED
|
Binary file
|
package/tests/test-niki.sh
DELETED
|
@@ -1,275 +0,0 @@
|
|
|
1
|
-
#!/bin/bash
|
|
2
|
-
# test-niki.sh — Unit tests for niki process supervisor
|
|
3
|
-
#
|
|
4
|
-
# Tests stdin management, stdout forwarding, stall detection,
|
|
5
|
-
# startup and wall-clock timeouts, exit-code passthrough, the abort file,
# SIGTERM forwarding, and dead-air detection.
|
|
6
|
-
#
|
|
7
|
-
# Usage: ./tests/test-niki.sh [--verbose]
|
|
8
|
-
|
|
9
|
-
# Strict mode: abort on errors, on unset variables, and on failures anywhere
# in a pipeline.
set -e -u -o pipefail

# Path to the niki executable under test, resolved relative to this script.
NIKI="$(dirname "$0")/../bin/niki"

# Running tallies maintained by run_test / run_test_output.
PASSED=0 FAILED=0

# First CLI argument (empty when absent); "--verbose" echoes output of passing tests.
VERBOSE="${1:-}"
|
|
15
|
-
|
|
16
|
-
# ANSI helpers: print $1 wrapped in an SGR escape sequence followed by a
# reset, with no trailing newline.

# Red foreground (SGR 31).
red() {
  printf '\033[31m%s\033[0m' "$1"
}

# Green foreground (SGR 32).
green() {
  printf '\033[32m%s\033[0m' "$1"
}

# Bold (SGR 1).
bold() {
  printf '\033[1m%s\033[0m' "$1"
}
|
|
19
|
-
|
|
20
|
-
# run_test NAME EXPECTED_EXIT CMD [ARGS...]
#
# Runs CMD with stdout and stderr captured together and compares its exit
# status with EXPECTED_EXIT. Increments the global PASSED or FAILED counter
# and prints a one-line verdict. The captured output is printed on failure,
# and on success only when VERBOSE is "--verbose".
run_test() {
  local name="$1" expected_exit="$2"
  shift 2

  printf " %-50s " "$name"

  # The assignment is kept separate from `local` so that `|| actual_exit=$?`
  # records the command's own status.
  local output actual_exit=0
  output=$("$@" 2>&1) || actual_exit=$?

  if ! [ "$actual_exit" -eq "$expected_exit" ]; then
    red "FAIL"
    echo " (expected exit $expected_exit, got $actual_exit)"
    FAILED=$((FAILED + 1))
    echo "$output" | sed 's/^/ | /'
    return
  fi

  green "PASS"
  echo " (exit $actual_exit)"
  PASSED=$((PASSED + 1))
  if [ "$VERBOSE" = "--verbose" ]; then
    echo "$output" | sed 's/^/ | /'
  fi
}
|
|
46
|
-
|
|
47
|
-
# run_test_output NAME PATTERN CMD [ARGS...]
#
# Runs CMD with stdout and stderr captured together and passes when some line
# of the captured output matches PATTERN (an extended regex, as for grep -E).
# The command's exit status is deliberately ignored; only the output counts.
# Increments the global PASSED or FAILED counter and prints a one-line
# verdict. The captured output is printed on failure, and on success only
# when VERBOSE is "--verbose".
run_test_output() {
  local name="$1"
  local expected_pattern="$2"
  shift 2

  printf " %-50s " "$name"

  local output
  # `|| true`: a failing command must not abort the suite under `set -e`.
  output=$("$@" 2>&1) || true

  # grep reads from a here-string instead of `echo ... | grep -q`. With
  # `set -o pipefail`, grep -q quitting at the first match can SIGPIPE the
  # writer on large output, which makes the pipeline fail and a matching test
  # report FAIL. `-e` keeps a pattern starting with "-" from being parsed as
  # a grep option.
  if grep -qE -e "$expected_pattern" <<<"$output"; then
    green "PASS"
    echo ""
    PASSED=$((PASSED + 1))
    if [ "$VERBOSE" = "--verbose" ]; then
      sed 's/^/ | /' <<<"$output"
    fi
  else
    red "FAIL"
    echo " (pattern '$expected_pattern' not found)"
    FAILED=$((FAILED + 1))
    sed 's/^/ | /' <<<"$output"
  fi
}
|
|
72
|
-
|
|
73
|
-
echo ""
bold "=== niki unit tests ==="
echo ""
echo ""

# ---- Stdout forwarding ----
# Output of the wrapped command must show up in niki's own output, and a
# clean child exit must yield exit status 0.

bold "Stdout forwarding"
echo ""

run_test_output "echo passes through stdout" "^hello from niki$" \
  timeout 10 node "$NIKI" --stall-timeout 5 -- echo "hello from niki"

run_test "echo exits cleanly (code 0)" 0 \
  timeout 10 node "$NIKI" --stall-timeout 5 -- echo "test"

run_test_output "multi-line output preserved" "line2" \
  timeout 10 node "$NIKI" --stall-timeout 5 -- sh -c 'echo line1; echo line2; echo line3'

echo ""
|
|
97
|
-
|
|
98
|
-
# ---- Stdin management ----
|
|
99
|
-
|
|
100
|
-
bold "Stdin management"
echo ""

# Expects a line containing "Stdin: closed" in niki's combined output.
run_test_output "stdin closed immediately on spawn" "Stdin: closed" \
  timeout 10 node "$NIKI" --stall-timeout 5 -- echo "ok"

# cat only finishes once it reads EOF, so an exit status of 0 here indicates
# that the child's stdin was closed.
run_test "cat exits on EOF (stdin closed)" 0 \
  timeout 10 node "$NIKI" --stall-timeout 5 -- cat

echo ""
|
|
113
|
-
|
|
114
|
-
# ---- Stall detection ----
|
|
115
|
-
|
|
116
|
-
bold "Stall detection"
echo ""

# Shared command line for the three stall checks: a child that never prints
# anything, supervised with a 2s stall timeout, no startup grace, no nudges.
STALLED_SLEEP=(timeout 10 node "$NIKI" --stall-timeout 2 --startup-timeout 0 --max-nudges 0 -- sleep 30)

run_test_output "stall kills silent process" "STALL.*no output" "${STALLED_SLEEP[@]}"

run_test_output "stall kill reason logged" "KILL.*reason: stall" "${STALLED_SLEEP[@]}"

run_test "stall kill exits non-zero" 1 "${STALLED_SLEEP[@]}"

# With --stall-timeout 0 a briefly silent child must be left to finish.
run_test_output "stall disabled when timeout=0" "Exit.*code: 0" \
  timeout 5 node "$NIKI" --stall-timeout 0 -- sh -c 'sleep 1; echo done'

echo ""
|
|
139
|
-
|
|
140
|
-
# ---- Stall timeout precision ----
|
|
141
|
-
|
|
142
|
-
bold "Stall timing"
echo ""

# The child prints every second, which is more often than the 3s stall
# timeout, so it is expected to run to completion and exit 0.
run_test_output "stall timer resets on output" "Exit.*code: 0" \
  timeout 10 node "$NIKI" --stall-timeout 3 --startup-timeout 0 -- sh -c 'echo tick; sleep 1; echo tick; sleep 1; echo done'

echo ""
|
|
151
|
-
|
|
152
|
-
# ---- Startup timeout ----
|
|
153
|
-
|
|
154
|
-
bold "Startup timeout"
echo ""

# Before the first output: the child is silent for 3s, longer than the 2s
# stall timeout but shorter than the 5s startup timeout. The check looks for
# "startup-timeout=5s" in niki's output.
run_test_output "startup-timeout used before first output" "startup-timeout=5s" \
  timeout 10 node "$NIKI" --stall-timeout 2 --startup-timeout 5 -- sh -c 'sleep 3; echo hello'

# After the first output: niki is expected to report the switch to the
# stall timeout.
run_test_output "switches to stall-timeout after first output" "switching to stall-timeout" \
  timeout 10 node "$NIKI" --stall-timeout 3 --startup-timeout 10 -- sh -c 'echo first; sleep 1; echo done'

echo ""
|
|
169
|
-
|
|
170
|
-
# ---- Budget/timeout (existing features, regression) ----
|
|
171
|
-
|
|
172
|
-
bold "Budget and timeout (regression)"
echo ""

# A 30s sleep under a 2s wall-clock timeout (stall detection off) must be
# killed with reason "timeout" and produce exit status 1.
run_test_output "wall-clock timeout kills" "KILL.*reason: timeout" \
  timeout 10 node "$NIKI" --timeout 2 --stall-timeout 0 -- sleep 30

run_test "timeout kill exits non-zero" 1 \
  timeout 10 node "$NIKI" --timeout 2 --stall-timeout 0 -- sleep 30

echo ""
|
|
185
|
-
|
|
186
|
-
# ---- Exit code passthrough ----
|
|
187
|
-
|
|
188
|
-
bold "Exit code passthrough"
echo ""

# niki's exit status must equal the child's for each sampled code.
for child_code in 0 1 42; do
  run_test "child exit $child_code → niki exit $child_code" "$child_code" \
    timeout 10 node "$NIKI" --stall-timeout 5 -- sh -c "exit $child_code"
done

echo ""
|
|
206
|
-
|
|
207
|
-
# ---- Abort file ----
|
|
208
|
-
|
|
209
|
-
bold "Abort file"; echo ""

# mktemp is used only to obtain a unique path; the file is removed again so
# that no abort file exists when niki starts.
ABORT_FILE=$(mktemp)
rm -f "$ABORT_FILE"

# Start niki in the background, create the abort file after 2s, then wait for
# niki to exit. The niki path and abort-file path are passed to the inner
# shell as positional parameters ($1, $2) rather than spliced into the command
# string, so paths containing spaces or shell metacharacters are handled
# correctly.
run_test_output \
  "abort file kills process" \
  "KILL.*reason: abort" \
  timeout 10 sh -c 'node "$1" --stall-timeout 0 --abort-file "$2" -- sh -c "sleep 1; echo still here; sleep 30" & PID=$!; sleep 2; touch "$2"; wait "$PID" 2>/dev/null; echo done' sh "$NIKI" "$ABORT_FILE"

rm -f "$ABORT_FILE"

echo ""
|
|
222
|
-
|
|
223
|
-
# ---- SIGTERM forwarding ----
|
|
224
|
-
|
|
225
|
-
bold "SIGTERM forwarding"; echo ""

# Start niki in the background, send it SIGTERM after 1s, and expect its log
# to contain "Received SIGTERM". The niki path is passed to the inner shell as
# $1 rather than spliced into the command string, so a path containing spaces
# or shell metacharacters is handled correctly.
run_test_output \
  "SIGTERM forwarded to child" \
  "Received SIGTERM" \
  timeout 10 sh -c 'node "$1" --stall-timeout 0 -- sh -c "echo started; sleep 30" & PID=$!; sleep 1; kill -TERM "$PID"; wait "$PID" 2>/dev/null; echo done' sh "$NIKI"

echo ""
|
|
234
|
-
|
|
235
|
-
# ---- Dead air detection ----
|
|
236
|
-
|
|
237
|
-
bold "Dead air detection"
echo ""

# Shared command line for the two kill checks: `sleep` prints nothing and
# uses essentially no CPU.
DEAD_AIR_SLEEP=(timeout 30 node "$NIKI" --stall-timeout 0 --dead-air-timeout 0.1 -- sleep 30)

run_test_output "dead air kills zero-CPU process" "DEAD AIR.*zero CPU" "${DEAD_AIR_SLEEP[@]}"

run_test_output "dead air kill reason logged" "KILL.*reason: dead-air" "${DEAD_AIR_SLEEP[@]}"

# A busy loop is silent but burns CPU, so it is expected to be left alone
# until it finishes with exit code 0.
run_test_output "dead air defers when CPU active" "Exit.*code: 0" \
  timeout 15 node "$NIKI" --stall-timeout 0 --dead-air-timeout 0.1 -- sh -c 'i=0; while [ $i -lt 2000000 ]; do i=$((i+1)); done; echo done'

# --dead-air-timeout 0 turns the check off.
run_test_output "dead air disabled when timeout=0" "Exit.*code: 0" \
  timeout 10 node "$NIKI" --stall-timeout 0 --dead-air-timeout 0 -- sh -c 'sleep 1; echo done'

echo ""
|
|
263
|
-
|
|
264
|
-
# ---- Summary ----
|
|
265
|
-
|
|
266
|
-
echo "────────────────────────────────────────────────"
TOTAL=$((PASSED + FAILED))

# One coloured verdict line, followed by a line break and a blank line.
if [ "$FAILED" -ne 0 ]; then
  red "$FAILED/$TOTAL tests failed"
else
  green "All $TOTAL tests passed"
fi
echo ""
echo ""

# The script's exit status is the number of failed tests (0 when all passed).
exit "$FAILED"
|