npm - ai-or-die - Versions diffs - 0.1.77 → 0.1.78 - Mend

ai-or-die 0.1.77 → 0.1.78

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/bin/ai-or-die.js +1 -1
package/package.json +1 -1
package/src/server.js +18 -1
package/src/sticky-note-engine.js +39 -14
package/src/sticky-note-summarizer.js +1 -1
package/src/sticky-note-threads.js +25 -0
package/src/sticky-note-worker.js +20 -3

package/bin/ai-or-die.js CHANGED

@@ -43,7 +43,7 @@ program
   .option('--no-sticky-notes', 'disable per-tab AI session summaries + auto tab titles (on by default)')
   .option('--sticky-notes-model-dir <path>', 'custom directory for the sticky-note model file')
   .option('--sticky-notes-model <url>', 'override the sticky-note model GGUF download URL')
-  .option('--sticky-notes-threads <number>', 'CPU threads for sticky-note inference (default: auto, max 4)')
+  .option('--sticky-notes-threads <number>', 'CPU threads for sticky-note inference (default: auto — three-quarters of the cores on CPU, gentle on GPU)')
   .option('--no-keepalive', 'disable keeping the machine awake while the server runs (Windows only; on by default)')
   .option('--keepalive-display', 'also keep the display on (default keeps the system awake but lets the monitor sleep)');

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "ai-or-die",
-  "version": "0.1.77",
+  "version": "0.1.78",
   "description": "Universal AI coding terminal — Claude, Copilot, Gemini & more in your browser",
   "main": "src/server.js",
   "bin": {

package/src/server.js CHANGED

@@ -4276,7 +4276,24 @@ class ClaudeCodeWebServer {
           percent: progress.percent,
         });
       })
-      .then(() => this._broadcastStickyStatus())
+      .then(() => {
+        // One-time visibility into the inference backend. On CPU (no GPU — common
+        // on Windows when the Vulkan/CUDA prebuilt is incompatible) summaries are
+        // materially slower; the worker compensates with more threads + a generous
+        // watchdog timeout, but a note can still take a couple of minutes.
+        const rt = this.stickyNoteEngine.getRuntimeInfo && this.stickyNoteEngine.getRuntimeInfo();
+        if (this.dev && rt) {
+          if (rt.gpu) {
+            console.log(`[sticky-notes] engine ready (GPU backend, ${rt.threads} threads)`);
+          } else {
+            console.log(
+              `[sticky-notes] engine ready (CPU backend, ${rt.threads} threads) — ` +
+                'summaries run on CPU and may take a couple of minutes; a Vulkan/CUDA driver would accelerate them'
+            );
+          }
+        }
+        this._broadcastStickyStatus();
+      })
       .catch((err) => {
         // Allow a later AI-session start to retry after a transient failure
         // (download blip). A permanent failure (no binding) just fails fast.

package/src/sticky-note-engine.js CHANGED

@@ -9,21 +9,31 @@
 const { Worker } = require('worker_threads');
 const path = require('path');
-const os = require('os');
 const GgufModelManager = require('./utils/gguf-model-manager');
 const { isBun } = require('./utils/runtime');
 const MAX_QUEUE_SIZE = 3;
-const DEFAULT_INFER_TIMEOUT_MS = 60000;
+// Watchdog-grade, unconditional per-request timeout. Real grammar-constrained
+// summaries on a CPU backend (no GPU — common on Windows when the Vulkan/CUDA
+// prebuilt is incompatible) take ~90s on half-core threading and up to ~160s on
+// 2 threads. This is a true catastrophic watchdog set well above that, NOT an
+// expected boundary: correctness over speed (a slow note must still complete).
+// GPU runs finish in ~7s and return immediately, so the high cap costs them
+// nothing. The summarizer's backstop sits strictly above this (one timeout
+// owner). An explicit inferTimeoutMs still overrides.
+const DEFAULT_INFER_TIMEOUT_MS = 300000;
 const MAX_RESTART_DELAY_MS = 15000;
 const MAX_RESTART_ATTEMPTS = 5;
 class StickyNoteEngine {
   constructor(options = {}) {
     this._enabled = !!options.enabled;
-    // Low thread cap keeps inference gentle so the model can't saturate CPU and
-    // starve the terminal / AI agent. Summaries are infrequent + throttled.
-    this._numThreads = options.numThreads || Math.max(1, Math.min(2, os.cpus().length - 2));
+    // Thread count is auto-selected by the worker once it knows whether a GPU
+    // backend loaded (see sticky-note-threads.pickThreads), UNLESS the caller
+    // pins it explicitly (--sticky-notes-threads). Auto is signalled by leaving
+    // numThreads out of the worker data entirely.
+    this._numThreadsExplicit = Number.isFinite(Number(options.numThreads)) && Number(options.numThreads) > 0;
+    this._numThreads = this._numThreadsExplicit ? Math.floor(Number(options.numThreads)) : null;
     this._contextSize = options.contextSize || 8192;
     this._inferTimeoutMs = options.inferTimeoutMs || DEFAULT_INFER_TIMEOUT_MS;
     this._maxQueue = options.maxQueue || MAX_QUEUE_SIZE;
@@ -39,22 +49,30 @@ class StickyNoteEngine {
     this._stopping = false;
     this._initPromise = null;
     this._downloadProgress = null;
+    this._runtimeInfo = null; // { gpu, threads } reported by the worker on ready
     this._modelManager =
       options.modelManager ||
       new GgufModelManager({ model: options.model, modelsDir: options.modelsDir });
-    // Injectable for tests; default spawns the real worker thread.
+    // Injectable for tests; default spawns the real worker thread. numThreads is
+    // included ONLY when explicitly pinned — otherwise the worker auto-picks.
     this._createWorker =
       options.createWorker ||
-      (() =>
-        new Worker(path.join(__dirname, 'sticky-note-worker.js'), {
-          workerData: {
-            modelPath: this._modelManager.getModelFile(),
-            numThreads: this._numThreads,
-            contextSize: this._contextSize,
-          },
-        }));
+      (() => new Worker(path.join(__dirname, 'sticky-note-worker.js'), { workerData: this._workerData() }));
+  }
+  /**
+   * Build the worker's workerData. numThreads is OMITTED when auto (not pinned)
+   * so the worker auto-selects based on the GPU backend it detects; pinning it
+   * here would defeat that. Kept as a method so it is unit-testable.
+   */
+  _workerData() {
+    return {
+      modelPath: this._modelManager.getModelFile(),
+      ...(this._numThreadsExplicit ? { numThreads: this._numThreads } : {}),
+      contextSize: this._contextSize,
+    };
   }
   async initialize(onProgress) {
@@ -109,6 +127,11 @@ class StickyNoteEngine {
     return this._downloadProgress;
   }
+  /** { gpu, threads } reported by the worker on ready, or null before ready. */
+  getRuntimeInfo() {
+    return this._runtimeInfo;
+  }
   /**
    * Run one inference. Resolves with the model's raw output string.
    * @param {string} prompt
@@ -173,6 +196,7 @@ class StickyNoteEngine {
     this._queue = [];
     this._currentRequest = null;
     this._worker = null;
+    this._runtimeInfo = null; // dead worker — drop its reported backend/threads
     if (this._stopping) {
       this._status = 'unavailable';
@@ -230,6 +254,7 @@ class StickyNoteEngine {
           this._status = 'ready';
           this._restartAttempts = 0;
           this._lastSpawnError = null;
+          this._runtimeInfo = { gpu: !!msg.gpu, threads: msg.threads || null };
           worker.on('message', (m) => this._onWorkerMessage(m));
           worker.on('exit', (c) => this._onWorkerExit(c));
           this._processQueue();

package/src/sticky-note-summarizer.js CHANGED

@@ -18,7 +18,7 @@ const DEFAULTS = {
   minIntervalMs: 20000, // floor between inferences for one session
   intervalFactor: 3, // adaptive: minInterval = max(floor, factor * lastDurationMs)
   turnDebounceMs: 1500, // (JSONL mode) coalesce a burst of appended turn lines
-  inferTimeoutMs: 75000, // backstop ABOVE the engine's own 60s timeout, so the
+  inferTimeoutMs: 330000, // backstop ABOVE the engine's own 300s timeout, so the
   // engine times out first (one timeout owner); this only fires if the engine
   // promise hangs entirely. Worker-side serialisation prevents concurrent runs.
   failureThreshold: 3, // consecutive failures -> open circuit breaker

package/src/sticky-note-threads.js ADDED

@@ -0,0 +1,25 @@
+'use strict';
+// Thread-count policy for the sticky-note inference worker. Pure + dependency-
+// free so it can be unit-tested without spawning a worker or loading a model.
+//
+// The worker decides its own thread count AFTER getLlama() reports whether a GPU
+// backend actually loaded:
+//   - GPU present  -> the GPU carries the inference (the worker also requests full
+//     layer offload); keep a low, gentle CPU thread count so it can't saturate CPU
+//     and starve the terminal / AI agent.
+//   - No GPU (CPU) -> common on Windows when the Vulkan/CUDA prebuilt binary is
+//     incompatible. At 2 threads one grammar-constrained summary takes ~160s on a
+//     16-core box and blows every timeout; use THREE-QUARTERS of the cores (leaving
+//     a quarter for the terminal/agent) so it completes well inside the watchdog.
+// An explicit override (--sticky-notes-threads) always wins, after validation.
+// `explicit` is coerced with Number() so a numeric string (e.g. from a CLI/env
+// arg) still counts as a valid pin rather than silently falling back to auto.
+function pickThreads({ explicit, gpu, cpus } = {}) {
+  const pinned = Number(explicit);
+  if (Number.isFinite(pinned) && pinned > 0) return Math.floor(pinned);
+  const cores = Number.isFinite(cpus) && cpus > 0 ? Math.floor(cpus) : 1;
+  return gpu ? Math.max(1, Math.min(2, cores - 2)) : Math.max(1, Math.floor((cores * 3) / 4));
+}
+module.exports = { pickThreads };

package/src/sticky-note-worker.js CHANGED

@@ -9,10 +9,10 @@
 const { parentPort, workerData } = require('worker_threads');
 const os = require('os');
 const { SYSTEM_PROMPT, NOTE_SCHEMA } = require('./sticky-note-prompt');
+const { pickThreads } = require('./sticky-note-threads');
 const modelPath = workerData.modelPath;
 const contextSize = workerData.contextSize || 8192;
-const numThreads = workerData.numThreads || Math.max(1, Math.min(2, os.cpus().length - 2));
 const maxTokens = workerData.maxTokens || 320;
 let llama;
@@ -42,12 +42,29 @@ async function init() {
   LlamaChatSessionCtor = LlamaChatSession;
   llama = await getLlama();
-  model = await llama.loadModel({ modelPath });
+  // availableParallelism() reflects usable parallelism better than cpus().length
+  // on Windows hybrid P/E-core machines; fall back where it's unavailable.
+  const cpus = (typeof os.availableParallelism === 'function' ? os.availableParallelism() : 0) || os.cpus().length;
+  // llama.gpu is false | 'cuda' | 'vulkan' | 'metal'; any non-empty string = GPU.
+  const gpu = !!llama.gpu;
+  const numThreads = pickThreads({ explicit: workerData.numThreads, gpu, cpus });
+  // Use the GPU fully when present: request all layers in VRAM ('max'). If the
+  // GPU can't fit them, 'max' throws — fall back to the default 'auto', which
+  // still offloads as many layers as fit (never worse than CPU-only).
+  if (gpu) {
+    try {
+      model = await llama.loadModel({ modelPath, gpuLayers: 'max' });
+    } catch {
+      model = await llama.loadModel({ modelPath });
+    }
+  } else {
+    model = await llama.loadModel({ modelPath });
+  }
   context = await model.createContext({ contextSize, threads: numThreads });
   sequence = context.getSequence();
   grammar = await llama.createGrammarForJsonSchema(NOTE_SCHEMA);
-  parentPort.postMessage({ type: 'ready' });
+  parentPort.postMessage({ type: 'ready', gpu, threads: numThreads });
 }
 async function handleInfer(msg) {