ai-or-die 0.1.77 → 0.1.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ai-or-die.js +1 -1
- package/package.json +1 -1
- package/src/server.js +18 -1
- package/src/sticky-note-engine.js +39 -14
- package/src/sticky-note-summarizer.js +1 -1
- package/src/sticky-note-threads.js +25 -0
- package/src/sticky-note-worker.js +20 -3
package/bin/ai-or-die.js
CHANGED
|
@@ -43,7 +43,7 @@ program
|
|
|
43
43
|
.option('--no-sticky-notes', 'disable per-tab AI session summaries + auto tab titles (on by default)')
|
|
44
44
|
.option('--sticky-notes-model-dir <path>', 'custom directory for the sticky-note model file')
|
|
45
45
|
.option('--sticky-notes-model <url>', 'override the sticky-note model GGUF download URL')
|
|
46
|
-
.option('--sticky-notes-threads <number>', 'CPU threads for sticky-note inference (default: auto,
|
|
46
|
+
.option('--sticky-notes-threads <number>', 'CPU threads for sticky-note inference (default: auto — three-quarters of the cores on CPU, gentle on GPU)')
|
|
47
47
|
.option('--no-keepalive', 'disable keeping the machine awake while the server runs (Windows only; on by default)')
|
|
48
48
|
.option('--keepalive-display', 'also keep the display on (default keeps the system awake but lets the monitor sleep)');
|
|
49
49
|
|
package/package.json
CHANGED
package/src/server.js
CHANGED
|
@@ -4276,7 +4276,24 @@ class ClaudeCodeWebServer {
|
|
|
4276
4276
|
percent: progress.percent,
|
|
4277
4277
|
});
|
|
4278
4278
|
})
|
|
4279
|
-
.then(() =>
|
|
4279
|
+
.then(() => {
|
|
4280
|
+
// One-time visibility into the inference backend. On CPU (no GPU — common
|
|
4281
|
+
// on Windows when the Vulkan/CUDA prebuilt is incompatible) summaries are
|
|
4282
|
+
// materially slower; the worker compensates with more threads + a generous
|
|
4283
|
+
// watchdog timeout, but a note can still take a couple of minutes.
|
|
4284
|
+
const rt = this.stickyNoteEngine.getRuntimeInfo && this.stickyNoteEngine.getRuntimeInfo();
|
|
4285
|
+
if (this.dev && rt) {
|
|
4286
|
+
if (rt.gpu) {
|
|
4287
|
+
console.log(`[sticky-notes] engine ready (GPU backend, ${rt.threads} threads)`);
|
|
4288
|
+
} else {
|
|
4289
|
+
console.log(
|
|
4290
|
+
`[sticky-notes] engine ready (CPU backend, ${rt.threads} threads) — ` +
|
|
4291
|
+
'summaries run on CPU and may take a couple of minutes; a Vulkan/CUDA driver would accelerate them'
|
|
4292
|
+
);
|
|
4293
|
+
}
|
|
4294
|
+
}
|
|
4295
|
+
this._broadcastStickyStatus();
|
|
4296
|
+
})
|
|
4280
4297
|
.catch((err) => {
|
|
4281
4298
|
// Allow a later AI-session start to retry after a transient failure
|
|
4282
4299
|
// (download blip). A permanent failure (no binding) just fails fast.
|
|
@@ -9,21 +9,31 @@
|
|
|
9
9
|
|
|
10
10
|
const { Worker } = require('worker_threads');
|
|
11
11
|
const path = require('path');
|
|
12
|
-
const os = require('os');
|
|
13
12
|
const GgufModelManager = require('./utils/gguf-model-manager');
|
|
14
13
|
const { isBun } = require('./utils/runtime');
|
|
15
14
|
|
|
16
15
|
const MAX_QUEUE_SIZE = 3;
|
|
17
|
-
|
|
16
|
+
// Watchdog-grade, unconditional per-request timeout. Real grammar-constrained
|
|
17
|
+
// summaries on a CPU backend (no GPU — common on Windows when the Vulkan/CUDA
|
|
18
|
+
// prebuilt is incompatible) take ~90s on half-core threading and up to ~160s on
|
|
19
|
+
// 2 threads. This is a true catastrophic watchdog set well above that, NOT an
|
|
20
|
+
// expected boundary: correctness over speed (a slow note must still complete).
|
|
21
|
+
// GPU runs finish in ~7s and return immediately, so the high cap costs them
|
|
22
|
+
// nothing. The summarizer's backstop sits strictly above this (one timeout
|
|
23
|
+
// owner). An explicit inferTimeoutMs still overrides.
|
|
24
|
+
const DEFAULT_INFER_TIMEOUT_MS = 300000;
|
|
18
25
|
const MAX_RESTART_DELAY_MS = 15000;
|
|
19
26
|
const MAX_RESTART_ATTEMPTS = 5;
|
|
20
27
|
|
|
21
28
|
class StickyNoteEngine {
|
|
22
29
|
constructor(options = {}) {
|
|
23
30
|
this._enabled = !!options.enabled;
|
|
24
|
-
//
|
|
25
|
-
//
|
|
26
|
-
|
|
31
|
+
// Thread count is auto-selected by the worker once it knows whether a GPU
|
|
32
|
+
// backend loaded (see sticky-note-threads.pickThreads), UNLESS the caller
|
|
33
|
+
// pins it explicitly (--sticky-notes-threads). Auto is signalled by leaving
|
|
34
|
+
// numThreads out of the worker data entirely.
|
|
35
|
+
this._numThreadsExplicit = Number.isFinite(Number(options.numThreads)) && Number(options.numThreads) > 0;
|
|
36
|
+
this._numThreads = this._numThreadsExplicit ? Math.floor(Number(options.numThreads)) : null;
|
|
27
37
|
this._contextSize = options.contextSize || 8192;
|
|
28
38
|
this._inferTimeoutMs = options.inferTimeoutMs || DEFAULT_INFER_TIMEOUT_MS;
|
|
29
39
|
this._maxQueue = options.maxQueue || MAX_QUEUE_SIZE;
|
|
@@ -39,22 +49,30 @@ class StickyNoteEngine {
|
|
|
39
49
|
this._stopping = false;
|
|
40
50
|
this._initPromise = null;
|
|
41
51
|
this._downloadProgress = null;
|
|
52
|
+
this._runtimeInfo = null; // { gpu, threads } reported by the worker on ready
|
|
42
53
|
|
|
43
54
|
this._modelManager =
|
|
44
55
|
options.modelManager ||
|
|
45
56
|
new GgufModelManager({ model: options.model, modelsDir: options.modelsDir });
|
|
46
57
|
|
|
47
|
-
// Injectable for tests; default spawns the real worker thread.
|
|
58
|
+
// Injectable for tests; default spawns the real worker thread. numThreads is
|
|
59
|
+
// included ONLY when explicitly pinned — otherwise the worker auto-picks.
|
|
48
60
|
this._createWorker =
|
|
49
61
|
options.createWorker ||
|
|
50
|
-
(() =>
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
62
|
+
(() => new Worker(path.join(__dirname, 'sticky-note-worker.js'), { workerData: this._workerData() }));
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Build the worker's workerData. numThreads is OMITTED when auto (not pinned)
|
|
67
|
+
* so the worker auto-selects based on the GPU backend it detects; pinning it
|
|
68
|
+
* here would defeat that. Kept as a method so it is unit-testable.
|
|
69
|
+
*/
|
|
70
|
+
_workerData() {
|
|
71
|
+
return {
|
|
72
|
+
modelPath: this._modelManager.getModelFile(),
|
|
73
|
+
...(this._numThreadsExplicit ? { numThreads: this._numThreads } : {}),
|
|
74
|
+
contextSize: this._contextSize,
|
|
75
|
+
};
|
|
58
76
|
}
|
|
59
77
|
|
|
60
78
|
async initialize(onProgress) {
|
|
@@ -109,6 +127,11 @@ class StickyNoteEngine {
|
|
|
109
127
|
return this._downloadProgress;
|
|
110
128
|
}
|
|
111
129
|
|
|
130
|
+
/** { gpu, threads } reported by the worker on ready, or null before ready. */
|
|
131
|
+
getRuntimeInfo() {
|
|
132
|
+
return this._runtimeInfo;
|
|
133
|
+
}
|
|
134
|
+
|
|
112
135
|
/**
|
|
113
136
|
* Run one inference. Resolves with the model's raw output string.
|
|
114
137
|
* @param {string} prompt
|
|
@@ -173,6 +196,7 @@ class StickyNoteEngine {
|
|
|
173
196
|
this._queue = [];
|
|
174
197
|
this._currentRequest = null;
|
|
175
198
|
this._worker = null;
|
|
199
|
+
this._runtimeInfo = null; // dead worker — drop its reported backend/threads
|
|
176
200
|
|
|
177
201
|
if (this._stopping) {
|
|
178
202
|
this._status = 'unavailable';
|
|
@@ -230,6 +254,7 @@ class StickyNoteEngine {
|
|
|
230
254
|
this._status = 'ready';
|
|
231
255
|
this._restartAttempts = 0;
|
|
232
256
|
this._lastSpawnError = null;
|
|
257
|
+
this._runtimeInfo = { gpu: !!msg.gpu, threads: msg.threads || null };
|
|
233
258
|
worker.on('message', (m) => this._onWorkerMessage(m));
|
|
234
259
|
worker.on('exit', (c) => this._onWorkerExit(c));
|
|
235
260
|
this._processQueue();
|
|
@@ -18,7 +18,7 @@ const DEFAULTS = {
|
|
|
18
18
|
minIntervalMs: 20000, // floor between inferences for one session
|
|
19
19
|
intervalFactor: 3, // adaptive: minInterval = max(floor, factor * lastDurationMs)
|
|
20
20
|
turnDebounceMs: 1500, // (JSONL mode) coalesce a burst of appended turn lines
|
|
21
|
-
inferTimeoutMs:
|
|
21
|
+
inferTimeoutMs: 330000, // backstop ABOVE the engine's own 300s timeout, so the
|
|
22
22
|
// engine times out first (one timeout owner); this only fires if the engine
|
|
23
23
|
// promise hangs entirely. Worker-side serialisation prevents concurrent runs.
|
|
24
24
|
failureThreshold: 3, // consecutive failures -> open circuit breaker
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
// Thread-count policy for the sticky-note inference worker. Pure + dependency-
|
|
4
|
+
// free so it can be unit-tested without spawning a worker or loading a model.
|
|
5
|
+
//
|
|
6
|
+
// The worker decides its own thread count AFTER getLlama() reports whether a GPU
|
|
7
|
+
// backend actually loaded:
|
|
8
|
+
// - GPU present -> the GPU carries the inference (the worker also requests full
|
|
9
|
+
// layer offload); keep a low, gentle CPU thread count so it can't saturate CPU
|
|
10
|
+
// and starve the terminal / AI agent.
|
|
11
|
+
// - No GPU (CPU) -> common on Windows when the Vulkan/CUDA prebuilt binary is
|
|
12
|
+
// incompatible. At 2 threads one grammar-constrained summary takes ~160s on a
|
|
13
|
+
// 16-core box and blows every timeout; use THREE-QUARTERS of the cores (leaving
|
|
14
|
+
// a quarter for the terminal/agent) so it completes well inside the watchdog.
|
|
15
|
+
// An explicit override (--sticky-notes-threads) always wins, after validation.
|
|
16
|
+
// `explicit` is coerced with Number() so a numeric string (e.g. from a CLI/env
|
|
17
|
+
// arg) still counts as a valid pin rather than silently falling back to auto.
|
|
18
|
+
/**
 * Choose the CPU thread count for the sticky-note inference worker.
 *
 * @param {object} [options]
 * @param {number|string} [options.explicit] - Caller-pinned thread count. Coerced
 *   with Number(); honoured (floored) only when it is finite and floors to >= 1.
 * @param {boolean} [options.gpu] - Truthy when a GPU backend loaded.
 * @param {number} [options.cpus] - Usable core count; a non-finite or
 *   non-positive value is treated as 1.
 * @returns {number} A positive integer thread count (always >= 1).
 */
function pickThreads({ explicit, gpu, cpus } = {}) {
  // Floor BEFORE validating. Previously a fractional pin in (0, 1) such as 0.5
  // passed the `> 0` check and then floored to 0, returning a zero thread
  // count. Such a value is not a usable pin, so it now falls through to auto.
  // NOTE(review): a thread count of 0 may carry a special meaning in the
  // inference library (e.g. "use all cores") — confirm against its docs.
  const pinned = Math.floor(Number(explicit));
  if (Number.isFinite(pinned) && pinned >= 1) {
    return pinned;
  }

  const cores = Number.isFinite(cpus) && cpus > 0 ? Math.floor(cpus) : 1;

  if (gpu) {
    // GPU carries the inference: keep CPU usage low (at most 2 threads).
    return Math.max(1, Math.min(2, cores - 2));
  }

  // CPU-only: three-quarters of the cores, never fewer than one.
  return Math.max(1, Math.floor((cores * 3) / 4));
}
|
|
24
|
+
|
|
25
|
+
module.exports = { pickThreads };
|
|
@@ -9,10 +9,10 @@
|
|
|
9
9
|
const { parentPort, workerData } = require('worker_threads');
|
|
10
10
|
const os = require('os');
|
|
11
11
|
const { SYSTEM_PROMPT, NOTE_SCHEMA } = require('./sticky-note-prompt');
|
|
12
|
+
const { pickThreads } = require('./sticky-note-threads');
|
|
12
13
|
|
|
13
14
|
const modelPath = workerData.modelPath;
|
|
14
15
|
const contextSize = workerData.contextSize || 8192;
|
|
15
|
-
const numThreads = workerData.numThreads || Math.max(1, Math.min(2, os.cpus().length - 2));
|
|
16
16
|
const maxTokens = workerData.maxTokens || 320;
|
|
17
17
|
|
|
18
18
|
let llama;
|
|
@@ -42,12 +42,29 @@ async function init() {
|
|
|
42
42
|
LlamaChatSessionCtor = LlamaChatSession;
|
|
43
43
|
|
|
44
44
|
llama = await getLlama();
|
|
45
|
-
|
|
45
|
+
// availableParallelism() reflects usable parallelism better than cpus().length
|
|
46
|
+
// on Windows hybrid P/E-core machines; fall back where it's unavailable.
|
|
47
|
+
const cpus = (typeof os.availableParallelism === 'function' ? os.availableParallelism() : 0) || os.cpus().length;
|
|
48
|
+
// llama.gpu is false | 'cuda' | 'vulkan' | 'metal'; any non-empty string = GPU.
|
|
49
|
+
const gpu = !!llama.gpu;
|
|
50
|
+
const numThreads = pickThreads({ explicit: workerData.numThreads, gpu, cpus });
|
|
51
|
+
// Use the GPU fully when present: request all layers in VRAM ('max'). If the
|
|
52
|
+
// GPU can't fit them, 'max' throws — fall back to the default 'auto', which
|
|
53
|
+
// still offloads as many layers as fit (never worse than CPU-only).
|
|
54
|
+
if (gpu) {
|
|
55
|
+
try {
|
|
56
|
+
model = await llama.loadModel({ modelPath, gpuLayers: 'max' });
|
|
57
|
+
} catch {
|
|
58
|
+
model = await llama.loadModel({ modelPath });
|
|
59
|
+
}
|
|
60
|
+
} else {
|
|
61
|
+
model = await llama.loadModel({ modelPath });
|
|
62
|
+
}
|
|
46
63
|
context = await model.createContext({ contextSize, threads: numThreads });
|
|
47
64
|
sequence = context.getSequence();
|
|
48
65
|
grammar = await llama.createGrammarForJsonSchema(NOTE_SCHEMA);
|
|
49
66
|
|
|
50
|
-
parentPort.postMessage({ type: 'ready' });
|
|
67
|
+
parentPort.postMessage({ type: 'ready', gpu, threads: numThreads });
|
|
51
68
|
}
|
|
52
69
|
|
|
53
70
|
async function handleInfer(msg) {
|