npm - omnius - Versions diffs - 1.0.146 → 1.0.147 - Mend

omnius 1.0.146 → 1.0.147

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.js CHANGED Viewed

@@ -529012,38 +529012,136 @@ function inferHomeFromProcUid(pid) {
   }
   return null;
 }
+function detectPeerOmniusOllamaPool() {
+  if (!isDirectory("/proc"))
+    return false;
+  const selfPid = String(process.pid);
+  const selfPpid = String(process.ppid ?? "");
+  const peerNodePids = /* @__PURE__ */ new Set();
+  let entries;
+  try {
+    entries = readdirSync21("/proc", { withFileTypes: true }).filter((d2) => d2.isDirectory() && /^\d+$/.test(d2.name)).map((d2) => ({ name: d2.name }));
+  } catch {
+    return false;
+  }
+  for (const e2 of entries) {
+    if (e2.name === selfPid || e2.name === selfPpid)
+      continue;
+    try {
+      const cmdline = readFileSync50(`/proc/${e2.name}/cmdline`, "utf8");
+      if (!cmdline.includes("node"))
+        continue;
+      if (!/[/\\]omnius[/\\]dist[/\\]index\.js|[/\\]omnius[/\\]/i.test(cmdline))
+        continue;
+      peerNodePids.add(e2.name);
+    } catch {
+    }
+  }
+  if (peerNodePids.size === 0)
+    return false;
+  for (const e2 of entries) {
+    try {
+      const cmd = readFileSync50(`/proc/${e2.name}/cmdline`, "utf8");
+      if (!cmd.includes("ollama"))
+        continue;
+      if (!cmd.split("\0").includes("serve"))
+        continue;
+      const status = readFileSync50(`/proc/${e2.name}/status`, "utf8");
+      const ppid = status.match(/^PPid:\s+(\d+)/m)?.[1];
+      if (ppid && peerNodePids.has(ppid))
+        return true;
+    } catch {
+    }
+  }
+  return false;
+}
 async function detectGpus() {
   if (_nvidiaSmiAvailable === false)
     return [];
   return new Promise((resolve56) => {
-    exec2("nvidia-smi --query-gpu=index,uuid,name,memory.total,memory.free,utilization.gpu --format=csv,noheader,nounits 2>/dev/null", { encoding: "utf8", timeout: 3e3 }, (err, stdout) => {
+    const queryFields = "index,uuid,name,memory.total,memory.free,utilization.gpu,compute_cap";
+    exec2(`nvidia-smi --query-gpu=${queryFields} --format=csv,noheader,nounits 2>/dev/null`, { encoding: "utf8", timeout: 3e3 }, (err, stdout) => {
       if (err) {
-        _nvidiaSmiAvailable = false;
-        resolve56([]);
+        exec2("nvidia-smi --query-gpu=index,uuid,name,memory.total,memory.free,utilization.gpu --format=csv,noheader,nounits 2>/dev/null", { encoding: "utf8", timeout: 3e3 }, (err2, stdout2) => {
+          if (err2) {
+            _nvidiaSmiAvailable = false;
+            resolve56([]);
+            return;
+          }
+          _nvidiaSmiAvailable = true;
+          resolve56(parseGpuQueryOutput(
+            stdout2,
+            /* hasComputeCap */
+            false
+          ));
+        });
         return;
       }
       _nvidiaSmiAvailable = true;
-      const gpus = [];
-      for (const line of stdout.split("\n")) {
-        const parts = line.split(",").map((s2) => s2.trim());
-        if (parts.length < 6)
-          continue;
-        const idx = Number(parts[0]);
-        if (!Number.isFinite(idx))
-          continue;
-        gpus.push({
-          index: idx,
-          uuid: parts[1] ?? "",
-          name: parts[2] ?? "",
-          vramTotalMB: Number(parts[3]) || 0,
-          vramFreeMB: Number(parts[4]) || 0,
-          utilization: Number(parts[5]) || 0
-        });
-      }
-      resolve56(gpus);
+      resolve56(parseGpuQueryOutput(
+        stdout,
+        /* hasComputeCap */
+        true
+      ));
     });
   });
 }
+function parseGpuQueryOutput(stdout, hasComputeCap) {
+  const gpus = [];
+  const minFields = hasComputeCap ? 7 : 6;
+  for (const line of stdout.split("\n")) {
+    const parts = line.split(",").map((s2) => s2.trim());
+    if (parts.length < minFields)
+      continue;
+    const idx = Number(parts[0]);
+    if (!Number.isFinite(idx))
+      continue;
+    const info = {
+      index: idx,
+      uuid: parts[1] ?? "",
+      name: parts[2] ?? "",
+      vramTotalMB: Number(parts[3]) || 0,
+      vramFreeMB: Number(parts[4]) || 0,
+      utilization: Number(parts[5]) || 0
+    };
+    if (hasComputeCap) {
+      const cap = Number(parts[6]);
+      if (Number.isFinite(cap))
+        info.computeCapability = cap;
+    }
+    gpus.push(info);
+  }
+  return gpus;
+}
+function resolveMinGpuVramMB() {
+  const fromEnv = Number(process.env["OMNIUS_OLLAMA_MIN_GPU_MB"]);
+  return Number.isFinite(fromEnv) && fromEnv > 0 ? fromEnv : DEFAULT_MIN_GPU_VRAM_MB;
+}
+function resolveMinComputeCapability() {
+  const fromEnv = Number(process.env["OMNIUS_OLLAMA_MIN_COMPUTE_CAP"]);
+  return Number.isFinite(fromEnv) && fromEnv > 0 ? fromEnv : DEFAULT_MIN_COMPUTE_CAPABILITY;
+}
+function isCapableForLLM(gpu, thresholds = {}) {
+  const minVramMB = thresholds.minVramMB ?? resolveMinGpuVramMB();
+  const minComputeCap = thresholds.minComputeCap ?? resolveMinComputeCapability();
+  if (gpu.vramTotalMB < minVramMB)
+    return false;
+  if (gpu.computeCapability !== void 0 && gpu.computeCapability < minComputeCap)
+    return false;
+  return true;
+}
+function filterCapableGpus(gpus, thresholds) {
+  return gpus.filter((g) => isCapableForLLM(g, thresholds));
+}
+function recommendMaxParallelFromVram(minFreeMB) {
+  if (minFreeMB >= 60 * 1024)
+    return 8;
+  if (minFreeMB >= 40 * 1024)
+    return 4;
+  if (minFreeMB >= 24 * 1024)
+    return 2;
+  return 1;
+}
 async function getHardwareSnapshot() {
   const { totalmem: totalmem8, freemem: freemem7, cpus: cpus5 } = await import("node:os");
   const gpus = await detectGpus();
@@ -529127,10 +529225,15 @@ async function findFreePort(start2) {
 }
 function resolveDefaultPoolConfig() {
   const baseInstanceUrl = process.env["OMNIUS_OLLAMA_BASE_URL"] || process.env["OLLAMA_HOST"]?.replace(/^([^:/]+:[0-9]+)$/, "http://$1") || "http://127.0.0.1:11434";
-  const maxParallelPerInstance = Number(process.env["OMNIUS_OLLAMA_MAX_PARALLEL"]) || 1;
-  const maxSpawnedInstances = Number(process.env["OMNIUS_OLLAMA_MAX_INSTANCES"]) || 0;
+  const maxParallelExplicit = process.env["OMNIUS_OLLAMA_MAX_PARALLEL"] !== void 0;
+  const maxParallelPerInstance = maxParallelExplicit ? Number(process.env["OMNIUS_OLLAMA_MAX_PARALLEL"]) || 1 : 1;
+  const autoTuneMaxParallel = !maxParallelExplicit;
+  const gpuPlacementExplicit = process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] !== void 0;
+  const maxInstancesExplicit = process.env["OMNIUS_OLLAMA_MAX_INSTANCES"] !== void 0;
+  const peerPoolActive = !gpuPlacementExplicit && !maxInstancesExplicit && detectPeerOmniusOllamaPool();
+  const maxSpawnedInstances = maxInstancesExplicit ? Number(process.env["OMNIUS_OLLAMA_MAX_INSTANCES"]) || 0 : peerPoolActive ? 1 : 0;
   const targetGpuInstances = Number(process.env["OMNIUS_OLLAMA_TARGET_GPU_INSTANCES"]) || 0;
-  const gpuPlacementRaw = (process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] ?? "auto").toLowerCase();
+  const gpuPlacementRaw = (process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] ?? (peerPoolActive ? "elastic" : "auto")).toLowerCase();
   const gpuPlacement = gpuPlacementRaw === "dedicated" || gpuPlacementRaw === "elastic" || gpuPlacementRaw === "auto" ? gpuPlacementRaw : "auto";
   const idleMs = Number(process.env["OMNIUS_OLLAMA_IDLE_MS"]) || 3 * 60 * 60 * 1e3;
   const reaperIntervalMs = Number(process.env["OMNIUS_OLLAMA_REAPER_MS"]) || 3e4;
@@ -529155,7 +529258,8 @@ function resolveDefaultPoolConfig() {
     ollamaBinary,
     spawnReadyTimeoutSec,
     networkRxBudgetBytesPerSec,
-    networkTxBudgetBytesPerSec
+    networkTxBudgetBytesPerSec,
+    autoTuneMaxParallel
   };
 }
 function parseNullableNumber(value2) {
@@ -529223,11 +529327,13 @@ function setOllamaPool(pool3) {
     _poolByBaseUrl.set(pool3.statusConfig().baseInstanceUrl, pool3);
   }
 }
-var _nvidiaSmiAvailable, _lastNetworkSnapshot, OllamaInstance, realInstanceSpawner, _gpuCursor, OllamaPool, _poolSingleton, _poolByBaseUrl;
+var _nvidiaSmiAvailable, DEFAULT_MIN_GPU_VRAM_MB, DEFAULT_MIN_COMPUTE_CAPABILITY, _lastNetworkSnapshot, OllamaInstance, realInstanceSpawner, _gpuCursor, OllamaPool, _poolSingleton, _poolByBaseUrl;
 var init_ollama_pool = __esm({
   "packages/orchestrator/dist/ollama-pool.js"() {
     "use strict";
     _nvidiaSmiAvailable = null;
+    DEFAULT_MIN_GPU_VRAM_MB = 16 * 1024;
+    DEFAULT_MIN_COMPUTE_CAPABILITY = 7;
     _lastNetworkSnapshot = null;
     OllamaInstance = class {
       state;
@@ -529281,6 +529387,7 @@ var init_ollama_pool = __esm({
         env2["OLLAMA_MODELS"] = config.sharedModelStore;
       }
       env2["OLLAMA_NUM_PARALLEL"] = String(config.maxParallelPerInstance);
+      env2["OLLAMA_KEEP_ALIVE"] = process.env["OMNIUS_OLLAMA_SPAWN_KEEP_ALIVE"] ?? "-1";
       if (gpuUuid) {
         env2["CUDA_VISIBLE_DEVICES"] = gpuUuid;
         env2["GPU_DEVICE_ORDINAL"] = gpuIndex === null ? "" : String(gpuIndex);
@@ -529334,6 +529441,20 @@ var init_ollama_pool = __esm({
       dedicatedGpuPoolActive = false;
       activePlacementMode = "constrained";
       gpuCache = null;
+      /**
+       * Set once after the first capability-filtered GPU detection. Prevents the
+       * auto-tune from oscillating maxParallelPerInstance as free VRAM fluctuates
+       * during normal inference.
+       */
+      _autoTuned = false;
+      /** UUIDs we've already emitted gpu-excluded for. Prevents log spam. */
+      _excludedGpusReported = /* @__PURE__ */ new Set();
+      /**
+       * Cached model footprint in MiB (model name → estimated VRAM required).
+       * Populated lazily via /api/show on the base instance. null sentinel means
+       * "we tried but failed" so we don't re-probe in a tight loop.
+       */
+      _modelVramEstimateMB = /* @__PURE__ */ new Map();
       slotWaiters = [];
       /**
        * Agent → preferred instance id. Set whenever an acquire resolves an
@@ -529533,7 +529654,18 @@ var init_ollama_pool = __esm({
           const freedPick = this.pickInstance({ model });
           if (freedPick)
             return freedPick;
-          const gpu = this.pickGpuForSpawn(gpus);
+          const vramNeededMB = await this.estimateModelVramMB(model);
+          const capable = this.gpusWithCapacityForModel(gpus, vramNeededMB);
+          if (capable.length === 0 && vramNeededMB !== null) {
+            this.emit("spawn-skipped", {
+              reason: "insufficient-vram",
+              model,
+              vramNeededMB,
+              gpuFreeMBs: gpus.map((g) => g.vramFreeMB)
+            });
+            return null;
+          }
+          const gpu = this.pickGpuForSpawn(capable.length > 0 ? capable : gpus);
           return this.spawnInstance(model, gpu);
         });
       }
@@ -529544,8 +529676,20 @@ var init_ollama_pool = __esm({
           if (!this.canSpawnWithSharedModelStore(model))
             return;
           const target = this.dedicatedTargetCount(gpus);
+          const vramNeededMB = await this.estimateModelVramMB(model);
           while (this.instances.filter((i2) => i2.state.poolOwned).length < target) {
-            const gpu = this.pickGpuForSpawn(gpus);
+            const candidates = this.gpusWithCapacityForModel(gpus, vramNeededMB);
+            const pool3 = candidates.length > 0 ? candidates : vramNeededMB === null ? gpus : [];
+            if (pool3.length === 0) {
+              this.emit("spawn-skipped", {
+                reason: "insufficient-vram",
+                model,
+                vramNeededMB,
+                gpuFreeMBs: gpus.map((g) => g.vramFreeMB)
+              });
+              break;
+            }
+            const gpu = this.pickGpuForSpawn(pool3);
             if (!gpu)
               break;
             const inst = await this.spawnInstance(model, gpu);
@@ -529603,9 +529747,59 @@ var init_ollama_pool = __esm({
         if (this.gpuCache && now - this.gpuCache.takenAtMs <= maxAgeMs) {
           return this.gpuCache.gpus;
         }
-        const gpus = await this.gpuDetector();
-        this.gpuCache = { gpus, takenAtMs: now };
-        return gpus;
+        const rawGpus = await this.gpuDetector();
+        const filtered = filterCapableGpus(rawGpus);
+        const filteredUuids = new Set(filtered.map((g) => g.uuid));
+        for (const g of rawGpus) {
+          if (filteredUuids.has(g.uuid))
+            continue;
+          if (this._excludedGpusReported.has(g.uuid))
+            continue;
+          this._excludedGpusReported.add(g.uuid);
+          const reason = g.vramTotalMB < resolveMinGpuVramMB() ? "insufficient-vram" : "insufficient-compute-capability";
+          this.emit("gpu-excluded", {
+            uuid: g.uuid,
+            index: g.index,
+            name: g.name,
+            vramTotalMB: g.vramTotalMB,
+            computeCapability: g.computeCapability,
+            reason
+          });
+        }
+        this.gpuCache = { gpus: filtered, takenAtMs: now };
+        this.maybeAutoTuneMaxParallel(filtered);
+        return filtered;
+      }
+      /**
+       * One-shot: bump `maxParallelPerInstance` from the worst-case free VRAM
+       * across capable GPUs the first time we see them. We never tune down (a
+       * subsequent low-VRAM read shouldn't strip concurrency from in-flight
+       * requests), and we never tune again once successful — the recommendation
+       * ladder is stable enough that a single read at startup is correct.
+       */
+      maybeAutoTuneMaxParallel(filtered) {
+        if (!this.config.autoTuneMaxParallel)
+          return;
+        if (this._autoTuned)
+          return;
+        if (filtered.length === 0)
+          return;
+        const minFreeMB = filtered.reduce((m2, g) => Math.min(m2, g.vramFreeMB), Number.POSITIVE_INFINITY);
+        const recommended = recommendMaxParallelFromVram(minFreeMB);
+        if (recommended > this.config.maxParallelPerInstance) {
+          const previous = this.config.maxParallelPerInstance;
+          this.config.maxParallelPerInstance = recommended;
+          for (const inst of this.instances) {
+            inst.state.maxParallel = recommended;
+          }
+          this.emit("max-parallel-tuned", {
+            previous,
+            recommended,
+            minFreeMB,
+            capableGpuCount: filtered.length
+          });
+        }
+        this._autoTuned = true;
       }
       async spawnInstance(model, gpu) {
         let port;
@@ -529668,6 +529862,56 @@ var init_ollama_pool = __esm({
         });
         return inst;
       }
+      /**
+       * Best-effort: estimate the VRAM (in MiB) a model needs to be served
+       * without CPU spill. Hits the base instance's `/api/show` once per model
+       * and caches the result. Returns null when the probe fails (the caller
+       * then falls back to "no estimate" semantics — capacity check is skipped).
+       *
+       * The number returned is `disk_size * 1.15 + maxParallel * 1024` (1 GiB of
+       * KV cache per parallel slot — conservative for 30B-class models). Newer
+       * model families may exceed this margin slightly; bumps are safe via
+       * OMNIUS_OLLAMA_VRAM_SAFETY_MARGIN.
+       */
+      async estimateModelVramMB(model) {
+        if (this._modelVramEstimateMB.has(model)) {
+          return this._modelVramEstimateMB.get(model) ?? null;
+        }
+        let bytesOnDisk = null;
+        try {
+          const url = `${this.config.baseInstanceUrl.replace(/\/+$/, "")}/api/show`;
+          const resp = await fetch(url, {
+            method: "POST",
+            headers: { "Content-Type": "application/json" },
+            body: JSON.stringify({ name: model }),
+            signal: AbortSignal.timeout(2e3)
+          });
+          if (resp.ok) {
+            const data = await resp.json();
+            if (typeof data.size === "number" && data.size > 0)
+              bytesOnDisk = data.size;
+          }
+        } catch {
+        }
+        if (bytesOnDisk === null) {
+          this._modelVramEstimateMB.set(model, null);
+          return null;
+        }
+        const safetyMargin = Number(process.env["OMNIUS_OLLAMA_VRAM_SAFETY_MARGIN"]) || 1.15;
+        const kvCacheMB = this.config.maxParallelPerInstance * 1024;
+        const estimateMB = Math.ceil(bytesOnDisk / (1024 * 1024) * safetyMargin + kvCacheMB);
+        this._modelVramEstimateMB.set(model, estimateMB);
+        return estimateMB;
+      }
+      /**
+       * Filter GPUs to those with enough free VRAM for the model. Caller decides
+       * how to react to an empty list (skip spawn vs degrade to constrained).
+       */
+      gpusWithCapacityForModel(gpus, vramNeededMB) {
+        if (vramNeededMB === null)
+          return gpus;
+        return gpus.filter((g) => g.vramFreeMB >= vramNeededMB);
+      }
       /**
        * Pick a GPU for a freshly-spawned instance. Prefers GPUs that no
        * pool-owned instance is already pinned to, then most free VRAM. Returns
@@ -529704,6 +529948,28 @@ var init_ollama_pool = __esm({
             survivors.push(inst);
             continue;
           }
+          const PROBE_GRACE_MS = 3e4;
+          if (inst.state.inflight === 0 && Date.now() - inst.state.spawnedAtMs > PROBE_GRACE_MS && await this.isPartialVramSpilled(inst)) {
+            const reapedAtMs = Date.now();
+            await inst.terminate();
+            this.dropAffinityFor(inst.state.id);
+            this.emit("instance-reaped", {
+              id: inst.state.id,
+              pid: inst.state.pid,
+              reason: "partial-vram",
+              totalRequests: inst.state.totalRequests,
+              peakInflight: inst.state.peakInflight,
+              ageMs: reapedAtMs - inst.state.spawnedAtMs,
+              idleMs: reapedAtMs - inst.state.lastUsedMs,
+              provenance: {
+                entity: `urn:omnius:ollama-instance:${inst.state.id}`,
+                activity: "ollama-instance-reap-partial-vram",
+                agent: "orchestrator.ollama-pool",
+                timestampMs: reapedAtMs
+              }
+            });
+            continue;
+          }
           if (inst.isIdleLongerThan(this.config.idleMs)) {
             const reapedAtMs = Date.now();
             await inst.terminate();
@@ -529711,6 +529977,7 @@ var init_ollama_pool = __esm({
             this.emit("instance-reaped", {
               id: inst.state.id,
               pid: inst.state.pid,
+              reason: "idle",
               totalRequests: inst.state.totalRequests,
               peakInflight: inst.state.peakInflight,
               ageMs: reapedAtMs - inst.state.spawnedAtMs,
@@ -529728,6 +529995,33 @@ var init_ollama_pool = __esm({
         }
         this.instances = survivors;
       }
+      /**
+       * Probe `/api/ps` on the instance and return true if any resident model has
+       * less than 95% of its weights in VRAM — the unmistakable CPU-offload
+       * signature that produces 50× slowdowns. Defensive: any HTTP failure
+       * returns false so a transient network blip never triggers a reap.
+       */
+      async isPartialVramSpilled(inst) {
+        const PARTIAL_VRAM_THRESHOLD = 0.95;
+        try {
+          const url = `${inst.state.baseUrl.replace(/\/+$/, "")}/api/ps`;
+          const resp = await fetch(url, { signal: AbortSignal.timeout(2e3) });
+          if (!resp.ok)
+            return false;
+          const data = await resp.json();
+          if (!data.models || data.models.length === 0)
+            return false;
+          return data.models.some((m2) => {
+            const total = m2.size ?? 0;
+            const vram = m2.size_vram ?? 0;
+            if (total <= 0)
+              return false;
+            return vram / total < PARTIAL_VRAM_THRESHOLD;
+          });
+        } catch {
+          return false;
+        }
+      }
       /** Stop the reaper and terminate every spawned instance. Call on process exit. */
       async shutdown() {
         if (this.reaperHandle) {

package/npm-shrinkwrap.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "omnius",
-  "version": "1.0.146",
+  "version": "1.0.147",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "omnius",
-      "version": "1.0.146",
+      "version": "1.0.147",
       "bundleDependencies": [
         "image-to-ascii"
       ],

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "omnius",
-  "version": "1.0.146",
+  "version": "1.0.147",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",