npm - omnius - Versions diffs - 1.0.82 → 1.0.84 - Mend

omnius 1.0.82 → 1.0.84

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.js CHANGED

@@ -524968,8 +524968,11 @@ async function findFreePort(start2) {
 }
 function resolveDefaultPoolConfig() {
   const baseInstanceUrl = process.env["OMNIUS_OLLAMA_BASE_URL"] || process.env["OLLAMA_HOST"]?.replace(/^([^:/]+:[0-9]+)$/, "http://$1") || "http://127.0.0.1:11434";
-  const maxParallelPerInstance = Number(process.env["OMNIUS_OLLAMA_MAX_PARALLEL"]) || 4;
+  const maxParallelPerInstance = Number(process.env["OMNIUS_OLLAMA_MAX_PARALLEL"]) || 1;
   const maxSpawnedInstances = Number(process.env["OMNIUS_OLLAMA_MAX_INSTANCES"]) || 0;
+  const targetGpuInstances = Number(process.env["OMNIUS_OLLAMA_TARGET_GPU_INSTANCES"]) || 0;
+  const gpuPlacementRaw = (process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] ?? "auto").toLowerCase();
+  const gpuPlacement = gpuPlacementRaw === "dedicated" || gpuPlacementRaw === "elastic" || gpuPlacementRaw === "auto" ? gpuPlacementRaw : "auto";
   const idleMs = Number(process.env["OMNIUS_OLLAMA_IDLE_MS"]) || 5 * 60 * 1e3;
   const reaperIntervalMs = Number(process.env["OMNIUS_OLLAMA_REAPER_MS"]) || 3e4;
   const spawnPortStart = Number(process.env["OMNIUS_OLLAMA_SPAWN_PORT"]) || 11435;
@@ -524983,6 +524986,8 @@ function resolveDefaultPoolConfig() {
     baseInstanceUrl: baseInstanceUrl.replace(/\/+$/, ""),
     maxParallelPerInstance,
     maxSpawnedInstances,
+    targetGpuInstances,
+    gpuPlacement,
     idleMs,
     reaperIntervalMs,
     spawnPortStart,
@@ -525110,15 +525115,17 @@ var init_ollama_pool = __esm({
         this.proc = null;
       }
     };
-    realInstanceSpawner = async ({ port, gpuUuid, config }) => {
+    realInstanceSpawner = async ({ port, gpuUuid, gpuIndex, config }) => {
       const env2 = { ...process.env };
       env2["OLLAMA_HOST"] = `127.0.0.1:${port}`;
       if (config.sharedModelStore) {
         env2["OLLAMA_MODELS"] = config.sharedModelStore;
       }
       env2["OLLAMA_NUM_PARALLEL"] = String(config.maxParallelPerInstance);
-      if (gpuUuid)
+      if (gpuUuid) {
         env2["CUDA_VISIBLE_DEVICES"] = gpuUuid;
+        env2["GPU_DEVICE_ORDINAL"] = gpuIndex === null ? "" : String(gpuIndex);
+      }
       const child = spawn21(config.ollamaBinary, ["serve"], {
         env: env2,
         stdio: ["ignore", "pipe", "pipe"],
@@ -525160,17 +525167,27 @@ var init_ollama_pool = __esm({
       instances = [];
       reaperHandle = null;
       spawner;
+      gpuDetector;
+      portAllocator;
       /** Serializes concurrent spawn requests so two callers don't both create instance N+1. */
       spawnGate = Promise.resolve();
+      /** True after dedicated mode has successfully started at least one pool-owned GPU runner. */
+      dedicatedGpuPoolActive = false;
+      activePlacementMode = "constrained";
+      gpuCache = null;
+      slotWaiters = [];
       constructor(config, opts) {
         super();
         this.config = { ...resolveDefaultPoolConfig(), ...config };
         this.spawner = opts?.spawner ?? realInstanceSpawner;
+        this.gpuDetector = opts?.gpuDetector ?? detectGpus;
+        this.portAllocator = opts?.portAllocator ?? findFreePort;
         this.instances.push(new OllamaInstance({
           id: "omnius-ollama-base",
           baseUrl: this.config.baseInstanceUrl,
           port: this.portFromUrl(this.config.baseInstanceUrl),
           gpuUuid: null,
+          gpuIndex: null,
           poolOwned: false,
           inflight: 0,
           peakInflight: 0,
@@ -525191,32 +525208,42 @@ var init_ollama_pool = __esm({
        *   2. Any instance with free slots (least-loaded first).
        *   3. Spawn a new instance pinned to the least-utilized GPU, when the
        *      pool hasn't hit `maxSpawnedInstances`.
-       *   4. Fall back to the least-loaded instance even if saturated — the
-       *      caller will block inside Ollama's internal queue rather than fail.
+       *   4. Queue at the pool boundary when all allowed lanes are busy.
        */
       async acquire(opts) {
+        const gpus = await this.getGpusForPlacement();
+        let placementMode = this.placementModeFor(gpus);
+        this.activePlacementMode = placementMode;
+        if (placementMode === "dedicated") {
+          await this.ensureDedicatedGpuPool(opts.model, gpus);
+          if (!this.instances.some((i2) => i2.state.poolOwned)) {
+            placementMode = "constrained";
+            this.activePlacementMode = placementMode;
+          }
+        }
         const pick = this.pickInstance(opts);
         if (pick) {
           pick.acquire(opts.model);
           return this.buildSlot(pick);
         }
-        const spawned = await this.maybeSpawnInstance(opts.model);
-        if (spawned) {
+        if (placementMode === "constrained") {
+          return this.acquireQueued(opts);
+        }
+        const spawned = placementMode === "elastic" ? await this.maybeSpawnInstance(opts.model) : null;
+        if (spawned && !spawned.isSaturated()) {
           spawned.acquire(opts.model);
           return this.buildSlot(spawned);
         }
-        const fallback = this.instances.slice().sort((a2, b) => a2.state.inflight - b.state.inflight)[0];
-        fallback.acquire(opts.model);
-        return this.buildSlot(fallback);
+        return this.acquireQueued(opts);
       }
       /** Synchronous routing decision; returns the instance or null if every one is saturated. */
       pickInstance(opts) {
-        const candidates = this.instances.filter((inst) => !inst.isSaturated());
+        const candidates = this.instances.filter((inst) => !this.isEffectivelySaturated(inst) && !(this.activePlacementMode === "dedicated" && this.dedicatedGpuPoolActive && !inst.state.poolOwned && !opts.preferBaseInstance));
         if (candidates.length === 0)
           return null;
         const scored = candidates.map((inst) => ({
           inst,
-          score: (inst.state.knownModels.has(opts.model) ? 100 : 0) + (opts.preferBaseInstance && !inst.state.poolOwned ? 25 : 0) + inst.freeSlots() * 10 - inst.state.inflight
+          score: (inst.state.knownModels.has(opts.model) ? 100 : 0) + (opts.preferBaseInstance && !inst.state.poolOwned ? 25 : 0) + this.effectiveFreeSlots(inst) * 10 - inst.state.inflight
         }));
         scored.sort((a2, b) => b.score - a2.score);
         return scored[0].inst;
@@ -525227,9 +525254,39 @@ var init_ollama_pool = __esm({
           baseUrl: inst.state.baseUrl,
           poolOwned: inst.state.poolOwned,
           gpuUuid: inst.state.gpuUuid,
-          release: (success) => inst.release(success)
+          gpuIndex: inst.state.gpuIndex,
+          release: (success) => {
+            inst.release(success);
+            this.wakeNextSlotWaiter();
+          }
         };
       }
+      async acquireQueued(opts) {
+        for (; ; ) {
+          const pick = this.pickInstance(opts);
+          if (pick) {
+            pick.acquire(opts.model);
+            return this.buildSlot(pick);
+          }
+          await new Promise((resolve52) => this.slotWaiters.push(resolve52));
+        }
+      }
+      wakeNextSlotWaiter() {
+        const waiter = this.slotWaiters.shift();
+        if (waiter)
+          waiter();
+      }
+      effectiveMaxParallel(inst) {
+        if (this.activePlacementMode === "constrained")
+          return 1;
+        return Math.max(1, inst.state.maxParallel);
+      }
+      isEffectivelySaturated(inst) {
+        return inst.state.inflight >= this.effectiveMaxParallel(inst);
+      }
+      effectiveFreeSlots(inst) {
+        return Math.max(0, this.effectiveMaxParallel(inst) - inst.state.inflight);
+      }
       /**
        * Spawn a new instance pinned to a GPU when policy allows. Returns the
        * spawned instance or null when:
@@ -525241,6 +525298,42 @@ var init_ollama_pool = __esm({
        * over-allocate.
        */
       async maybeSpawnInstance(model) {
+        return this.withSpawnGate(async () => {
+          if (!this.canSpawnWithSharedModelStore(model))
+            return null;
+          const poolOwnedCount = this.instances.filter((i2) => i2.state.poolOwned).length;
+          const gpus = await this.getGpusForPlacement();
+          const cap = this.elasticSpawnCap(gpus);
+          if (poolOwnedCount >= cap)
+            return null;
+          const freedPick = this.pickInstance({ model });
+          if (freedPick)
+            return freedPick;
+          const gpu = this.pickGpuForSpawn(gpus);
+          return this.spawnInstance(model, gpu);
+        });
+      }
+      async ensureDedicatedGpuPool(model, gpus) {
+        if (this.placementModeFor(gpus) !== "dedicated")
+          return;
+        await this.withSpawnGate(async () => {
+          if (!this.canSpawnWithSharedModelStore(model))
+            return;
+          const target = this.dedicatedTargetCount(gpus);
+          while (this.instances.filter((i2) => i2.state.poolOwned).length < target) {
+            const gpu = this.pickGpuForSpawn(gpus);
+            if (!gpu)
+              break;
+            const inst = await this.spawnInstance(model, gpu);
+            if (!inst)
+              break;
+          }
+          if (this.instances.some((i2) => i2.state.poolOwned)) {
+            this.dedicatedGpuPoolActive = true;
+          }
+        });
+      }
+      async withSpawnGate(fn) {
         let resolveGate = () => {
         };
         const myTurn = new Promise((r2) => {
@@ -525250,55 +525343,90 @@ var init_ollama_pool = __esm({
         this.spawnGate = myTurn;
         await prev;
         try {
-          if (!this.config.sharedModelStore && !this.config.allowUnsharedModelStore) {
-            this.emit("spawn-skipped", {
-              reason: "missing-shared-model-store",
-              model,
-              baseInstanceUrl: this.config.baseInstanceUrl
-            });
-            return null;
-          }
-          const poolOwnedCount = this.instances.filter((i2) => i2.state.poolOwned).length;
-          const gpus = await detectGpus();
-          const cap = this.config.maxSpawnedInstances > 0 ? this.config.maxSpawnedInstances : Math.max(0, gpus.length - 1);
-          if (poolOwnedCount >= cap)
-            return null;
-          const freedPick = this.pickInstance({ model });
-          if (freedPick)
-            return freedPick;
-          const port = await findFreePort(this.config.spawnPortStart);
-          const gpuUuid = this.pickGpuForSpawn(gpus);
-          const { proc, ready } = await this.spawner({ port, gpuUuid, config: this.config });
-          try {
-            await ready;
-          } catch (err) {
-            try {
-              proc.kill();
-            } catch {
-            }
-            this.emit("spawn-failed", { port, gpuUuid, error: err });
-            return null;
-          }
-          const inst = new OllamaInstance({
-            id: `omnius-ollama-${port}`,
-            baseUrl: `http://127.0.0.1:${port}`,
-            port,
-            gpuUuid,
-            poolOwned: true,
-            inflight: 0,
-            peakInflight: 0,
-            lastUsedMs: Date.now(),
-            knownModels: /* @__PURE__ */ new Set(),
-            maxParallel: this.config.maxParallelPerInstance,
-            totalRequests: 0
-          }, proc);
-          this.instances.push(inst);
-          this.emit("instance-spawned", { id: inst.state.id, port, gpuUuid });
-          return inst;
+          return await fn();
         } finally {
           resolveGate();
         }
       }
+      canSpawnWithSharedModelStore(model) {
+        if (this.config.sharedModelStore || this.config.allowUnsharedModelStore)
+          return true;
+        this.emit("spawn-skipped", {
+          reason: "missing-shared-model-store",
+          model,
+          baseInstanceUrl: this.config.baseInstanceUrl
+        });
+        return false;
+      }
+      placementModeFor(gpus) {
+        const canShareModelStore = Boolean(this.config.sharedModelStore) || this.config.allowUnsharedModelStore;
+        if (!canShareModelStore || gpus.length < 2)
+          return "constrained";
+        if (this.config.gpuPlacement === "elastic")
+          return "elastic";
+        return "dedicated";
+      }
+      dedicatedTargetCount(gpus) {
+        const requested = this.config.targetGpuInstances > 0 ? this.config.targetGpuInstances : gpus.length;
+        const cappedByGpuCount = Math.min(requested, gpus.length);
+        return this.config.maxSpawnedInstances > 0 ? Math.min(cappedByGpuCount, this.config.maxSpawnedInstances) : cappedByGpuCount;
+      }
+      elasticSpawnCap(gpus) {
+        return this.config.maxSpawnedInstances > 0 ? this.config.maxSpawnedInstances : Math.max(0, gpus.length - 1);
+      }
+      async getGpusForPlacement(maxAgeMs = 3e3) {
+        const now = Date.now();
+        if (this.gpuCache && now - this.gpuCache.takenAtMs <= maxAgeMs) {
+          return this.gpuCache.gpus;
+        }
+        const gpus = await this.gpuDetector();
+        this.gpuCache = { gpus, takenAtMs: now };
+        return gpus;
+      }
+      async spawnInstance(model, gpu) {
+        let port;
+        try {
+          port = await this.portAllocator(this.config.spawnPortStart);
+        } catch (err) {
+          this.emit("spawn-failed", {
+            reason: "port-allocation-failed",
+            gpuUuid: gpu?.uuid ?? null,
+            gpuIndex: gpu?.index ?? null,
+            error: err
+          });
+          return null;
+        }
+        const gpuUuid = gpu?.uuid || null;
+        const gpuIndex = gpu?.index ?? null;
+        const { proc, ready } = await this.spawner({ port, gpuUuid, gpuIndex, config: this.config });
+        try {
+          await ready;
+        } catch (err) {
+          try {
+            proc.kill();
+          } catch {
+          }
+          this.emit("spawn-failed", { port, gpuUuid, gpuIndex, error: err });
+          return null;
+        }
+        const inst = new OllamaInstance({
+          id: `omnius-ollama-${port}`,
+          baseUrl: `http://127.0.0.1:${port}`,
+          port,
+          gpuUuid,
+          gpuIndex,
+          poolOwned: true,
+          inflight: 0,
+          peakInflight: 0,
+          lastUsedMs: Date.now(),
+          knownModels: /* @__PURE__ */ new Set([model]),
+          maxParallel: this.config.maxParallelPerInstance,
+          totalRequests: 0
+        }, proc);
+        this.instances.push(inst);
+        this.emit("instance-spawned", { id: inst.state.id, port, gpuUuid, gpuIndex });
+        return inst;
+      }
       /**
        * Pick a GPU for a freshly-spawned instance. Prefers GPUs that no
        * pool-owned instance is already pinned to, then most free VRAM. Returns
@@ -525313,7 +525441,7 @@ var init_ollama_pool = __esm({
         pool3.sort((a2, b) => b.vramFreeMB - a2.vramFreeMB);
         const best = pool3[_gpuCursor % pool3.length];
         _gpuCursor++;
-        return best.uuid;
+        return best;
       }
       /**
        * Periodically reap pool-owned instances that have been idle past the
@@ -525360,13 +525488,24 @@ var init_ollama_pool = __esm({
       }
       async status() {
         const hardware = await getHardwareSnapshot();
+        const placementGpus = this.gpuCache?.gpus ?? hardware.gpus;
+        const placementMode = this.placementModeFor(placementGpus);
+        const targetGpuInstances = placementMode === "dedicated" ? this.dedicatedTargetCount(placementGpus) : placementMode === "elastic" ? this.elasticSpawnCap(placementGpus) : 1;
+        const readyGpuInstances = this.instances.filter((inst) => inst.state.poolOwned).length;
         return {
           config: this.config,
+          placement: {
+            mode: placementMode,
+            targetGpuInstances,
+            readyGpuInstances,
+            sharedModelStore: this.config.sharedModelStore
+          },
           instances: this.instances.map((inst) => ({
             id: inst.state.id,
             baseUrl: inst.state.baseUrl,
             poolOwned: inst.state.poolOwned,
             gpuUuid: inst.state.gpuUuid,
+            gpuIndex: inst.state.gpuIndex,
             inflight: inst.state.inflight,
             peakInflight: inst.state.peakInflight,
             maxParallel: inst.state.maxParallel,
@@ -569697,29 +569836,60 @@ async function collectNetworkMetrics() {
   return { rxBytesPerSec: 0, txBytesPerSec: 0 };
 }
 async function collectGpuMetrics() {
-  const noGpu = { available: false, name: "", utilization: 0, vramUsedMB: 0, vramTotalMB: 0, vramUtilization: 0 };
+  const noGpu = {
+    available: false,
+    count: 0,
+    name: "",
+    utilization: 0,
+    vramUsedMB: 0,
+    vramTotalMB: 0,
+    vramUtilization: 0,
+    devices: []
+  };
   if (_nvidiaSmiAvailable2 === false) return noGpu;
   try {
     const smi = await new Promise((resolve52, reject) => {
       exec3(
-        "nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total,name --format=csv,noheader,nounits 2>/dev/null",
+        "nvidia-smi --query-gpu=index,uuid,utilization.gpu,memory.used,memory.total,name --format=csv,noheader,nounits 2>/dev/null",
         { encoding: "utf8", timeout: 3e3 },
         (err, stdout) => err ? reject(err) : resolve52(stdout)
       );
     });
     _nvidiaSmiAvailable2 = true;
-    const line = smi.trim().split("\n")[0];
-    if (!line) return noGpu;
-    const parts = line.split(",").map((s2) => s2.trim());
-    const vramUsed = parseInt(parts[1] ?? "0", 10) || 0;
-    const vramTotal = parseInt(parts[2] ?? "0", 10) || 0;
+    const devices = [];
+    for (const line of smi.trim().split("\n")) {
+      if (!line.trim()) continue;
+      const parts = line.split(",").map((s2) => s2.trim());
+      const index = parseInt(parts[0] ?? "-1", 10);
+      const utilization = parseInt(parts[2] ?? "0", 10) || 0;
+      const vramUsed2 = parseInt(parts[3] ?? "0", 10) || 0;
+      const vramTotal2 = parseInt(parts[4] ?? "0", 10) || 0;
+      if (!Number.isFinite(index) || index < 0) continue;
+      devices.push({
+        index,
+        uuid: parts[1] ?? "",
+        utilization,
+        vramUsedMB: vramUsed2,
+        vramTotalMB: vramTotal2,
+        name: parts.slice(5).join(", ") || "",
+        vramUtilization: vramTotal2 > 0 ? Math.round(vramUsed2 / vramTotal2 * 100) : 0
+      });
+    }
+    if (devices.length === 0) return noGpu;
+    const vramUsed = devices.reduce((sum, gpu) => sum + gpu.vramUsedMB, 0);
+    const vramTotal = devices.reduce((sum, gpu) => sum + gpu.vramTotalMB, 0);
+    const avgUtil = Math.round(devices.reduce((sum, gpu) => sum + gpu.utilization, 0) / devices.length);
+    const firstName = devices[0]?.name ?? "";
+    const allSameName = devices.every((gpu) => gpu.name === firstName);
     return {
       available: true,
-      utilization: parseInt(parts[0] ?? "0", 10) || 0,
+      count: devices.length,
+      utilization: avgUtil,
       vramUsedMB: vramUsed,
       vramTotalMB: vramTotal,
-      name: parts[3] ?? "",
-      vramUtilization: vramTotal > 0 ? Math.round(vramUsed / vramTotal * 100) : 0
+      name: devices.length > 1 && allSameName ? `${devices.length}x ${firstName}` : firstName,
+      vramUtilization: vramTotal > 0 ? Math.round(vramUsed / vramTotal * 100) : 0,
+      devices
     };
   } catch {
     _nvidiaSmiAvailable2 = false;
@@ -569736,7 +569906,9 @@ function getInstantSnapshot() {
       cpuCores: cr.cpuCores,
       cpuModel: cr.cpuModel,
       gpuUtil: -1,
+      gpuCount: 0,
       gpuName: "",
+      gpuDevices: [],
       vramUtil: -1,
       vramUsedMB: 0,
       vramTotalMB: 0,
@@ -569794,10 +569966,11 @@ function collectCpuRam() {
 }
 async function collectLocalMetrics() {
   const cpuRam = collectCpuRam();
-  const [gpu, disk, network] = await Promise.all([
+  const [gpu, disk, network, ollamaPool] = await Promise.all([
     collectGpuMetrics(),
     collectDiskMetrics(),
-    collectNetworkMetrics()
+    collectNetworkMetrics(),
+    collectOllamaPoolMetrics()
   ]);
   return {
     source: "local",
@@ -569806,7 +569979,9 @@ async function collectLocalMetrics() {
       cpuCores: cpuRam.cpuCores,
       cpuModel: cpuRam.cpuModel,
       gpuUtil: gpu.available ? gpu.utilization : -1,
+      gpuCount: gpu.count,
       gpuName: gpu.name,
+      gpuDevices: gpu.devices,
       vramUtil: gpu.available ? gpu.vramUtilization : -1,
       vramUsedMB: gpu.vramUsedMB,
       vramTotalMB: gpu.vramTotalMB,
@@ -569817,15 +569992,43 @@ async function collectLocalMetrics() {
       diskUsedGB: disk.usedGB,
       diskTotalGB: disk.totalGB,
       diskFreeGB: disk.freeGB,
-      diskPath: disk.path
+      diskPath: disk.path,
+      ollamaPool
     },
     network
   };
 }
+async function collectOllamaPoolMetrics() {
+  try {
+    const config = resolveDefaultPoolConfig();
+    if (!shouldUseOllamaPoolForBaseUrl(config.baseInstanceUrl)) return null;
+    const status = await getOllamaPool({ baseInstanceUrl: config.baseInstanceUrl }).status();
+    return {
+      enabled: true,
+      mode: status.placement.mode,
+      targetGpuInstances: status.placement.targetGpuInstances,
+      readyGpuInstances: status.placement.readyGpuInstances,
+      sharedModelStore: status.placement.sharedModelStore,
+      instances: status.instances.map((inst) => ({
+        id: inst.id,
+        baseUrl: inst.baseUrl,
+        poolOwned: inst.poolOwned,
+        gpuUuid: inst.gpuUuid,
+        gpuIndex: inst.gpuIndex,
+        inflight: inst.inflight,
+        maxParallel: inst.maxParallel,
+        totalRequests: inst.totalRequests
+      }))
+    };
+  } catch {
+    return null;
+  }
+}
 var _lastNetSnapshot, _nvidiaSmiAvailable2, _cpuPrevSnapshot, SystemMetricsCollector;
 var init_system_metrics = __esm({
   "packages/cli/src/tui/system-metrics.ts"() {
     "use strict";
+    init_dist8();
     init_disk_monitor();
     _lastNetSnapshot = null;
     _nvidiaSmiAvailable2 = null;
@@ -569881,7 +570084,9 @@ var init_system_metrics = __esm({
           cpuCores: hw.cpuCores ?? 0,
           cpuModel: hw.cpuModel ?? "",
           gpuUtil: hw.gpuUtil ?? -1,
+          gpuCount: hw.gpuCount ?? 0,
           gpuName: hw.gpuName ?? "",
+          gpuDevices: hw.gpuDevices ?? [],
           vramUtil: hw.vramUtil ?? -1,
           vramUsedMB: hw.vramUsedMB ?? 0,
           vramTotalMB: hw.vramTotalMB ?? 0,
@@ -569892,7 +570097,8 @@ var init_system_metrics = __esm({
           diskUsedGB: hw.diskUsedGB ?? 0,
           diskTotalGB: hw.diskTotalGB ?? 0,
           diskFreeGB: hw.diskFreeGB ?? 0,
-          diskPath: hw.diskPath ?? ""
+          diskPath: hw.diskPath ?? "",
+          ollamaPool: hw.ollamaPool ?? null
         };
         this._latest = {
           source: "remote",
@@ -573751,6 +573957,19 @@ ${CONTENT_BG_SEQ}`);
             hwExpW += 6 + `${rm4.vramUtil}%`.length + vramDetail.length;
             hwCompW += 6 + `${rm4.vramUtil}%`.length;
           }
+          if (rm4.ollamaPool?.enabled) {
+            const pool3 = rm4.ollamaPool;
+            const ready = pool3.readyGpuInstances;
+            const target = pool3.targetGpuInstances;
+            const poolColor = pool3.mode === "constrained" ? c3.yellow : target > 0 && ready < target ? c3.yellow : c3.green;
+            const poolDetail = pool3.mode === "constrained" ? "queue" : `${ready}/${target}`;
+            const poolText = ` OLLAMA ${poolColor(`${pool3.mode}:${poolDetail}`)}`;
+            const compactText = ` OLLAMA ${poolColor(pool3.mode === "constrained" ? "queue" : `${ready}/${target}`)}`;
+            hwExpStr += poolText;
+            hwCompStr += compactText;
+            hwExpW += 8 + `${pool3.mode}:${poolDetail}`.length;
+            hwCompW += 8 + (pool3.mode === "constrained" ? "queue".length : `${ready}/${target}`.length);
+          }
           if (!isLocal && hwExpW === 0) {
             const statusMsg = rm4.gpuName && rm4.gpuName !== "peer" ? rm4.gpuName : "awaiting metrics...";
             hwExpStr = c3.dim(statusMsg);
@@ -610459,9 +610678,9 @@ function telegramDecisionRecoverableFlag(text) {
   }
   return void 0;
 }
-function telegramRouterTimeoutMs(configTimeoutMs, minMs = 15e3, _legacyMaxMs) {
+function telegramRouterTimeoutMs(configTimeoutMs, minMs = 12e4, _legacyMaxMs) {
   const configured = Number.isFinite(configTimeoutMs) && (configTimeoutMs ?? 0) > 0 ? configTimeoutMs : 3e5;
-  return Math.max(configured, minMs);
+  return Math.max(configured, minMs, 12e4);
 }
 function parseTelegramInteractionDecision(text, forcedRoute, options2 = {}) {
   for (const jsonText of telegramDecisionJsonCandidates(text)) {
@@ -617135,7 +617354,7 @@ ${conversationStream}`
           tools: [],
           temperature: 0.4,
           maxTokens: 700,
-          timeoutMs: Math.max(config.timeoutMs ?? 3e5, 6e4),
+          timeoutMs: Math.max(config.timeoutMs ?? 3e5, 12e4),
           think: false
         };
         let accumulated = "";
@@ -627073,17 +627292,50 @@ async function handleAimsResources(ctx3) {
   try {
     const os9 = __require("node:os");
     const config = loadConfig();
+    let ollamaPool = null;
+    let hardware = null;
+    try {
+      const {
+        getHardwareSnapshot: getHardwareSnapshot2,
+        getOllamaPool: getOllamaPool2,
+        resolveDefaultPoolConfig: resolveDefaultPoolConfig2,
+        shouldUseOllamaPoolForBaseUrl: shouldUseOllamaPoolForBaseUrl2
+      } = await Promise.resolve().then(() => (init_dist8(), dist_exports3));
+      hardware = await getHardwareSnapshot2();
+      const poolConfig = resolveDefaultPoolConfig2();
+      if (shouldUseOllamaPoolForBaseUrl2(poolConfig.baseInstanceUrl)) {
+        const status = await getOllamaPool2({ baseInstanceUrl: poolConfig.baseInstanceUrl }).status();
+        ollamaPool = {
+          placement: status.placement,
+          instances: status.instances.map((inst) => ({
+            id: inst.id,
+            base_url: inst.baseUrl,
+            pool_owned: inst.poolOwned,
+            gpu_uuid: inst.gpuUuid,
+            gpu_index: inst.gpuIndex,
+            inflight: inst.inflight,
+            max_parallel: inst.maxParallel,
+            total_requests: inst.totalRequests
+          }))
+        };
+      }
+    } catch {
+      hardware = null;
+      ollamaPool = null;
+    }
     sendJson(res, 200, {
       compute: {
         cpu: os9.cpus()[0]?.model ?? "unknown",
         cores: os9.cpus().length,
         ram_gb: Math.round(os9.totalmem() / 1024 ** 3),
-        platform: process.platform
+        platform: process.platform,
+        hardware
       },
       backend: {
         type: config.backendType,
         url: config.backendUrl,
-        model: config.model
+        model: config.model,
+        ollama_pool: ollamaPool
       },
       "aims:control": "A.4"
     });
@@ -641443,6 +641695,32 @@ async function handleRequest(req2, res, ollamaUrl, verbose) {
         }
       } catch {
       }
+      let ollamaPool = null;
+      try {
+        const {
+          getOllamaPool: getOllamaPool2,
+          resolveDefaultPoolConfig: resolveDefaultPoolConfig2,
+          shouldUseOllamaPoolForBaseUrl: shouldUseOllamaPoolForBaseUrl2
+        } = await Promise.resolve().then(() => (init_dist8(), dist_exports3));
+        const poolConfig = resolveDefaultPoolConfig2();
+        if (shouldUseOllamaPoolForBaseUrl2(poolConfig.baseInstanceUrl)) {
+          const status2 = await getOllamaPool2({ baseInstanceUrl: poolConfig.baseInstanceUrl }).status();
+          ollamaPool = {
+            placement: status2.placement,
+            instances: status2.instances.map((inst) => ({
+              id: inst.id,
+              base_url: inst.baseUrl,
+              pool_owned: inst.poolOwned,
+              gpu_uuid: inst.gpuUuid,
+              gpu_index: inst.gpuIndex,
+              inflight: inst.inflight,
+              max_parallel: inst.maxParallel,
+              total_requests: inst.totalRequests
+            }))
+          };
+        }
+      } catch {
+      }
       let latestVersion = null;
       try {
         const ver = es("npm view omnius version 2>/dev/null", { encoding: "utf8", timeout: 5e3, stdio: "pipe" }).trim();
@@ -641452,6 +641730,7 @@ async function handleRequest(req2, res, ollamaUrl, verbose) {
       jsonResponse(res, 200, {
         gpu: gpus,
         gpu_utilization: gpuUtil,
+        ollama_pool: ollamaPool,
         total_vram_gb: totalVram,
         ram_gb: Math.round(totalMem / 1024 ** 3),
         ram_used_pct: ramUsedPct,

package/npm-shrinkwrap.json CHANGED

@@ -1,12 +1,12 @@
 {
   "name": "omnius",
-  "version": "1.0.82",
+  "version": "1.0.84",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "omnius",
-      "version": "1.0.82",
+      "version": "1.0.84",
       "bundleDependencies": [
         "image-to-ascii"
       ],

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "omnius",
-  "version": "1.0.82",
+  "version": "1.0.84",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",