ai-or-die 0.1.77 → 0.1.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/ai-or-die.js CHANGED
@@ -43,7 +43,7 @@ program
43
43
  .option('--no-sticky-notes', 'disable per-tab AI session summaries + auto tab titles (on by default)')
44
44
  .option('--sticky-notes-model-dir <path>', 'custom directory for the sticky-note model file')
45
45
  .option('--sticky-notes-model <url>', 'override the sticky-note model GGUF download URL')
46
- .option('--sticky-notes-threads <number>', 'CPU threads for sticky-note inference (default: auto, max 4)')
46
+ .option('--sticky-notes-threads <number>', 'CPU threads for sticky-note inference (default: auto — three-quarters of the cores on CPU, gentle on GPU)')
47
47
  .option('--no-keepalive', 'disable keeping the machine awake while the server runs (Windows only; on by default)')
48
48
  .option('--keepalive-display', 'also keep the display on (default keeps the system awake but lets the monitor sleep)');
49
49
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ai-or-die",
3
- "version": "0.1.77",
3
+ "version": "0.1.78",
4
4
  "description": "Universal AI coding terminal — Claude, Copilot, Gemini & more in your browser",
5
5
  "main": "src/server.js",
6
6
  "bin": {
package/src/server.js CHANGED
@@ -4276,7 +4276,24 @@ class ClaudeCodeWebServer {
4276
4276
  percent: progress.percent,
4277
4277
  });
4278
4278
  })
4279
- .then(() => this._broadcastStickyStatus())
4279
+ .then(() => {
4280
+ // One-time visibility into the inference backend. On CPU (no GPU — common
4281
+ // on Windows when the Vulkan/CUDA prebuilt is incompatible) summaries are
4282
+ // materially slower; the worker compensates with more threads + a generous
4283
+ // watchdog timeout, but a note can still take a couple of minutes.
4284
+ const rt = this.stickyNoteEngine.getRuntimeInfo && this.stickyNoteEngine.getRuntimeInfo();
4285
+ if (this.dev && rt) {
4286
+ if (rt.gpu) {
4287
+ console.log(`[sticky-notes] engine ready (GPU backend, ${rt.threads} threads)`);
4288
+ } else {
4289
+ console.log(
4290
+ `[sticky-notes] engine ready (CPU backend, ${rt.threads} threads) — ` +
4291
+ 'summaries run on CPU and may take a couple of minutes; a Vulkan/CUDA driver would accelerate them'
4292
+ );
4293
+ }
4294
+ }
4295
+ this._broadcastStickyStatus();
4296
+ })
4280
4297
  .catch((err) => {
4281
4298
  // Allow a later AI-session start to retry after a transient failure
4282
4299
  // (download blip). A permanent failure (no binding) just fails fast.
@@ -9,21 +9,31 @@
9
9
 
10
10
  const { Worker } = require('worker_threads');
11
11
  const path = require('path');
12
- const os = require('os');
13
12
  const GgufModelManager = require('./utils/gguf-model-manager');
14
13
  const { isBun } = require('./utils/runtime');
15
14
 
16
15
  const MAX_QUEUE_SIZE = 3;
17
- const DEFAULT_INFER_TIMEOUT_MS = 60000;
16
+ // Watchdog-grade, unconditional per-request timeout. Real grammar-constrained
17
+ // summaries on a CPU backend (no GPU — common on Windows when the Vulkan/CUDA
18
+ // prebuilt is incompatible) take ~90s on half-core threading and up to ~160s on
19
+ // 2 threads. This is a true catastrophic watchdog set well above that, NOT an
20
+ // expected boundary: correctness over speed (a slow note must still complete).
21
+ // GPU runs finish in ~7s and return immediately, so the high cap costs them
22
+ // nothing. The summarizer's backstop sits strictly above this (one timeout
23
+ // owner). An explicit inferTimeoutMs still overrides.
24
+ const DEFAULT_INFER_TIMEOUT_MS = 300000;
18
25
  const MAX_RESTART_DELAY_MS = 15000;
19
26
  const MAX_RESTART_ATTEMPTS = 5;
20
27
 
21
28
  class StickyNoteEngine {
22
29
  constructor(options = {}) {
23
30
  this._enabled = !!options.enabled;
24
- // Low thread cap keeps inference gentle so the model can't saturate CPU and
25
- // starve the terminal / AI agent. Summaries are infrequent + throttled.
26
- this._numThreads = options.numThreads || Math.max(1, Math.min(2, os.cpus().length - 2));
31
+ // Thread count is auto-selected by the worker once it knows whether a GPU
32
+ // backend loaded (see sticky-note-threads.pickThreads), UNLESS the caller
33
+ // pins it explicitly (--sticky-notes-threads). Auto is signalled by leaving
34
+ // numThreads out of the worker data entirely.
35
+ this._numThreadsExplicit = Number.isFinite(Number(options.numThreads)) && Number(options.numThreads) > 0;
36
+ this._numThreads = this._numThreadsExplicit ? Math.floor(Number(options.numThreads)) : null;
27
37
  this._contextSize = options.contextSize || 8192;
28
38
  this._inferTimeoutMs = options.inferTimeoutMs || DEFAULT_INFER_TIMEOUT_MS;
29
39
  this._maxQueue = options.maxQueue || MAX_QUEUE_SIZE;
@@ -39,22 +49,30 @@ class StickyNoteEngine {
39
49
  this._stopping = false;
40
50
  this._initPromise = null;
41
51
  this._downloadProgress = null;
52
+ this._runtimeInfo = null; // { gpu, threads } reported by the worker on ready
42
53
 
43
54
  this._modelManager =
44
55
  options.modelManager ||
45
56
  new GgufModelManager({ model: options.model, modelsDir: options.modelsDir });
46
57
 
47
- // Injectable for tests; default spawns the real worker thread.
58
+ // Injectable for tests; default spawns the real worker thread. numThreads is
59
+ // included ONLY when explicitly pinned — otherwise the worker auto-picks.
48
60
  this._createWorker =
49
61
  options.createWorker ||
50
- (() =>
51
- new Worker(path.join(__dirname, 'sticky-note-worker.js'), {
52
- workerData: {
53
- modelPath: this._modelManager.getModelFile(),
54
- numThreads: this._numThreads,
55
- contextSize: this._contextSize,
56
- },
57
- }));
62
+ (() => new Worker(path.join(__dirname, 'sticky-note-worker.js'), { workerData: this._workerData() }));
63
+ }
64
+
65
+ /**
66
+ * Build the worker's workerData. numThreads is OMITTED when auto (not pinned)
67
+ * so the worker auto-selects based on the GPU backend it detects; pinning it
68
+ * here would defeat that. Kept as a method so it is unit-testable.
69
+ */
70
+ _workerData() {
71
+ return {
72
+ modelPath: this._modelManager.getModelFile(),
73
+ ...(this._numThreadsExplicit ? { numThreads: this._numThreads } : {}),
74
+ contextSize: this._contextSize,
75
+ };
58
76
  }
59
77
 
60
78
  async initialize(onProgress) {
@@ -109,6 +127,11 @@ class StickyNoteEngine {
109
127
  return this._downloadProgress;
110
128
  }
111
129
 
130
+ /** { gpu, threads } reported by the worker on ready, or null before ready. */
131
+ getRuntimeInfo() {
132
+ return this._runtimeInfo;
133
+ }
134
+
112
135
  /**
113
136
  * Run one inference. Resolves with the model's raw output string.
114
137
  * @param {string} prompt
@@ -173,6 +196,7 @@ class StickyNoteEngine {
173
196
  this._queue = [];
174
197
  this._currentRequest = null;
175
198
  this._worker = null;
199
+ this._runtimeInfo = null; // dead worker — drop its reported backend/threads
176
200
 
177
201
  if (this._stopping) {
178
202
  this._status = 'unavailable';
@@ -230,6 +254,7 @@ class StickyNoteEngine {
230
254
  this._status = 'ready';
231
255
  this._restartAttempts = 0;
232
256
  this._lastSpawnError = null;
257
+ this._runtimeInfo = { gpu: !!msg.gpu, threads: msg.threads || null };
233
258
  worker.on('message', (m) => this._onWorkerMessage(m));
234
259
  worker.on('exit', (c) => this._onWorkerExit(c));
235
260
  this._processQueue();
@@ -18,7 +18,7 @@ const DEFAULTS = {
18
18
  minIntervalMs: 20000, // floor between inferences for one session
19
19
  intervalFactor: 3, // adaptive: minInterval = max(floor, factor * lastDurationMs)
20
20
  turnDebounceMs: 1500, // (JSONL mode) coalesce a burst of appended turn lines
21
- inferTimeoutMs: 75000, // backstop ABOVE the engine's own 60s timeout, so the
21
+ inferTimeoutMs: 330000, // backstop ABOVE the engine's own 300s timeout, so the
22
22
  // engine times out first (one timeout owner); this only fires if the engine
23
23
  // promise hangs entirely. Worker-side serialisation prevents concurrent runs.
24
24
  failureThreshold: 3, // consecutive failures -> open circuit breaker
@@ -0,0 +1,25 @@
1
+ 'use strict';
2
+
3
+ // Thread-count policy for the sticky-note inference worker. Pure + dependency-
4
+ // free so it can be unit-tested without spawning a worker or loading a model.
5
+ //
6
+ // The worker decides its own thread count AFTER getLlama() reports whether a GPU
7
+ // backend actually loaded:
8
+ // - GPU present -> the GPU carries the inference (the worker also requests full
9
+ // layer offload); keep a low, gentle CPU thread count so it can't saturate CPU
10
+ // and starve the terminal / AI agent.
11
+ // - No GPU (CPU) -> common on Windows when the Vulkan/CUDA prebuilt binary is
12
+ // incompatible. At 2 threads one grammar-constrained summary takes ~160s on a
13
+ // 16-core box and blows every timeout; use THREE-QUARTERS of the cores (leaving
14
+ // a quarter for the terminal/agent) so it completes well inside the watchdog.
15
+ // An explicit override (--sticky-notes-threads) always wins, after validation.
16
+ // `explicit` is coerced with Number() so a numeric string (e.g. from a CLI/env
17
+ // arg) still counts as a valid pin rather than silently falling back to auto.
18
/**
 * Choose the inference thread count for the sticky-note worker.
 *
 * @param {object} [opts]
 * @param {number|string|null} [opts.explicit] - User-pinned thread count. Coerced
 *   with Number() and floored; it is honoured only when the floored value is at
 *   least 1.
 * @param {boolean} [opts.gpu] - Whether a GPU backend loaded.
 * @param {number} [opts.cpus] - Usable core count; anything that is not a
 *   positive finite number is treated as 1.
 * @returns {number} Thread count, always an integer >= 1.
 */
function pickThreads({ explicit, gpu, cpus } = {}) {
  // Floor BEFORE validating: a fractional pin below 1 (e.g. 0.5) floors to 0,
  // which is not a usable thread count, so it must fall through to the
  // automatic policy instead of being returned as 0.
  const pinned = Math.floor(Number(explicit));
  if (Number.isFinite(pinned) && pinned >= 1) return pinned;

  const cores = Number.isFinite(cpus) && cpus > 0 ? Math.floor(cpus) : 1;

  if (gpu) {
    // GPU carries the inference: keep CPU usage low (at most 2 threads, and
    // leave two cores free where the machine has them).
    return Math.max(1, Math.min(2, cores - 2));
  }

  // CPU backend: three-quarters of the cores, never fewer than one.
  return Math.max(1, Math.floor((cores * 3) / 4));
}
24
+
25
+ module.exports = { pickThreads };
@@ -9,10 +9,10 @@
9
9
  const { parentPort, workerData } = require('worker_threads');
10
10
  const os = require('os');
11
11
  const { SYSTEM_PROMPT, NOTE_SCHEMA } = require('./sticky-note-prompt');
12
+ const { pickThreads } = require('./sticky-note-threads');
12
13
 
13
14
  const modelPath = workerData.modelPath;
14
15
  const contextSize = workerData.contextSize || 8192;
15
- const numThreads = workerData.numThreads || Math.max(1, Math.min(2, os.cpus().length - 2));
16
16
  const maxTokens = workerData.maxTokens || 320;
17
17
 
18
18
  let llama;
@@ -42,12 +42,29 @@ async function init() {
42
42
  LlamaChatSessionCtor = LlamaChatSession;
43
43
 
44
44
  llama = await getLlama();
45
- model = await llama.loadModel({ modelPath });
45
+ // availableParallelism() reflects usable parallelism better than cpus().length
46
+ // on Windows hybrid P/E-core machines; fall back where it's unavailable.
47
+ const cpus = (typeof os.availableParallelism === 'function' ? os.availableParallelism() : 0) || os.cpus().length;
48
+ // llama.gpu is false | 'cuda' | 'vulkan' | 'metal'; any non-empty string = GPU.
49
+ const gpu = !!llama.gpu;
50
+ const numThreads = pickThreads({ explicit: workerData.numThreads, gpu, cpus });
51
+ // Use the GPU fully when present: request all layers in VRAM ('max'). If the
52
+ // GPU can't fit them, 'max' throws — fall back to the default 'auto', which
53
+ // still offloads as many layers as fit (never worse than CPU-only).
54
+ if (gpu) {
55
+ try {
56
+ model = await llama.loadModel({ modelPath, gpuLayers: 'max' });
57
+ } catch {
58
+ model = await llama.loadModel({ modelPath });
59
+ }
60
+ } else {
61
+ model = await llama.loadModel({ modelPath });
62
+ }
46
63
  context = await model.createContext({ contextSize, threads: numThreads });
47
64
  sequence = context.getSequence();
48
65
  grammar = await llama.createGrammarForJsonSchema(NOTE_SCHEMA);
49
66
 
50
- parentPort.postMessage({ type: 'ready' });
67
+ parentPort.postMessage({ type: 'ready', gpu, threads: numThreads });
51
68
  }
52
69
 
53
70
  async function handleInfer(msg) {