omnius 1.0.82 → 1.0.84

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -524968,8 +524968,11 @@ async function findFreePort(start2) {
524968
524968
  }
524969
524969
  function resolveDefaultPoolConfig() {
524970
524970
  const baseInstanceUrl = process.env["OMNIUS_OLLAMA_BASE_URL"] || process.env["OLLAMA_HOST"]?.replace(/^([^:/]+:[0-9]+)$/, "http://$1") || "http://127.0.0.1:11434";
524971
- const maxParallelPerInstance = Number(process.env["OMNIUS_OLLAMA_MAX_PARALLEL"]) || 4;
524971
+ const maxParallelPerInstance = Number(process.env["OMNIUS_OLLAMA_MAX_PARALLEL"]) || 1;
524972
524972
  const maxSpawnedInstances = Number(process.env["OMNIUS_OLLAMA_MAX_INSTANCES"]) || 0;
524973
+ const targetGpuInstances = Number(process.env["OMNIUS_OLLAMA_TARGET_GPU_INSTANCES"]) || 0;
524974
+ const gpuPlacementRaw = (process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] ?? "auto").toLowerCase();
524975
+ const gpuPlacement = gpuPlacementRaw === "dedicated" || gpuPlacementRaw === "elastic" || gpuPlacementRaw === "auto" ? gpuPlacementRaw : "auto";
524973
524976
  const idleMs = Number(process.env["OMNIUS_OLLAMA_IDLE_MS"]) || 5 * 60 * 1e3;
524974
524977
  const reaperIntervalMs = Number(process.env["OMNIUS_OLLAMA_REAPER_MS"]) || 3e4;
524975
524978
  const spawnPortStart = Number(process.env["OMNIUS_OLLAMA_SPAWN_PORT"]) || 11435;
@@ -524983,6 +524986,8 @@ function resolveDefaultPoolConfig() {
524983
524986
  baseInstanceUrl: baseInstanceUrl.replace(/\/+$/, ""),
524984
524987
  maxParallelPerInstance,
524985
524988
  maxSpawnedInstances,
524989
+ targetGpuInstances,
524990
+ gpuPlacement,
524986
524991
  idleMs,
524987
524992
  reaperIntervalMs,
524988
524993
  spawnPortStart,
@@ -525110,15 +525115,17 @@ var init_ollama_pool = __esm({
525110
525115
  this.proc = null;
525111
525116
  }
525112
525117
  };
525113
- realInstanceSpawner = async ({ port, gpuUuid, config }) => {
525118
+ realInstanceSpawner = async ({ port, gpuUuid, gpuIndex, config }) => {
525114
525119
  const env2 = { ...process.env };
525115
525120
  env2["OLLAMA_HOST"] = `127.0.0.1:${port}`;
525116
525121
  if (config.sharedModelStore) {
525117
525122
  env2["OLLAMA_MODELS"] = config.sharedModelStore;
525118
525123
  }
525119
525124
  env2["OLLAMA_NUM_PARALLEL"] = String(config.maxParallelPerInstance);
525120
- if (gpuUuid)
525125
+ if (gpuUuid) {
525121
525126
  env2["CUDA_VISIBLE_DEVICES"] = gpuUuid;
525127
+ env2["GPU_DEVICE_ORDINAL"] = gpuIndex === null ? "" : String(gpuIndex);
525128
+ }
525122
525129
  const child = spawn21(config.ollamaBinary, ["serve"], {
525123
525130
  env: env2,
525124
525131
  stdio: ["ignore", "pipe", "pipe"],
@@ -525160,17 +525167,27 @@ var init_ollama_pool = __esm({
525160
525167
  instances = [];
525161
525168
  reaperHandle = null;
525162
525169
  spawner;
525170
+ gpuDetector;
525171
+ portAllocator;
525163
525172
  /** Serializes concurrent spawn requests so two callers don't both create instance N+1. */
525164
525173
  spawnGate = Promise.resolve();
525174
+ /** True after dedicated mode has successfully started at least one pool-owned GPU runner. */
525175
+ dedicatedGpuPoolActive = false;
525176
+ activePlacementMode = "constrained";
525177
+ gpuCache = null;
525178
+ slotWaiters = [];
525165
525179
  constructor(config, opts) {
525166
525180
  super();
525167
525181
  this.config = { ...resolveDefaultPoolConfig(), ...config };
525168
525182
  this.spawner = opts?.spawner ?? realInstanceSpawner;
525183
+ this.gpuDetector = opts?.gpuDetector ?? detectGpus;
525184
+ this.portAllocator = opts?.portAllocator ?? findFreePort;
525169
525185
  this.instances.push(new OllamaInstance({
525170
525186
  id: "omnius-ollama-base",
525171
525187
  baseUrl: this.config.baseInstanceUrl,
525172
525188
  port: this.portFromUrl(this.config.baseInstanceUrl),
525173
525189
  gpuUuid: null,
525190
+ gpuIndex: null,
525174
525191
  poolOwned: false,
525175
525192
  inflight: 0,
525176
525193
  peakInflight: 0,
@@ -525191,32 +525208,42 @@ var init_ollama_pool = __esm({
525191
525208
  * 2. Any instance with free slots (least-loaded first).
525192
525209
  * 3. Spawn a new instance pinned to the least-utilized GPU, when the
525193
525210
  * pool hasn't hit `maxSpawnedInstances`.
525194
- * 4. Fall back to the least-loaded instance even if saturated the
525195
- * caller will block inside Ollama's internal queue rather than fail.
525211
+ * 4. Queue at the pool boundary when all allowed lanes are busy.
525196
525212
  */
525197
525213
  async acquire(opts) {
525214
+ const gpus = await this.getGpusForPlacement();
525215
+ let placementMode = this.placementModeFor(gpus);
525216
+ this.activePlacementMode = placementMode;
525217
+ if (placementMode === "dedicated") {
525218
+ await this.ensureDedicatedGpuPool(opts.model, gpus);
525219
+ if (!this.instances.some((i2) => i2.state.poolOwned)) {
525220
+ placementMode = "constrained";
525221
+ this.activePlacementMode = placementMode;
525222
+ }
525223
+ }
525198
525224
  const pick = this.pickInstance(opts);
525199
525225
  if (pick) {
525200
525226
  pick.acquire(opts.model);
525201
525227
  return this.buildSlot(pick);
525202
525228
  }
525203
- const spawned = await this.maybeSpawnInstance(opts.model);
525204
- if (spawned) {
525229
+ if (placementMode === "constrained") {
525230
+ return this.acquireQueued(opts);
525231
+ }
525232
+ const spawned = placementMode === "elastic" ? await this.maybeSpawnInstance(opts.model) : null;
525233
+ if (spawned && !spawned.isSaturated()) {
525205
525234
  spawned.acquire(opts.model);
525206
525235
  return this.buildSlot(spawned);
525207
525236
  }
525208
- const fallback = this.instances.slice().sort((a2, b) => a2.state.inflight - b.state.inflight)[0];
525209
- fallback.acquire(opts.model);
525210
- return this.buildSlot(fallback);
525237
+ return this.acquireQueued(opts);
525211
525238
  }
525212
525239
  /** Synchronous routing decision; returns the instance or null if every one is saturated. */
525213
525240
  pickInstance(opts) {
525214
- const candidates = this.instances.filter((inst) => !inst.isSaturated());
525241
+ const candidates = this.instances.filter((inst) => !this.isEffectivelySaturated(inst) && !(this.activePlacementMode === "dedicated" && this.dedicatedGpuPoolActive && !inst.state.poolOwned && !opts.preferBaseInstance));
525215
525242
  if (candidates.length === 0)
525216
525243
  return null;
525217
525244
  const scored = candidates.map((inst) => ({
525218
525245
  inst,
525219
- score: (inst.state.knownModels.has(opts.model) ? 100 : 0) + (opts.preferBaseInstance && !inst.state.poolOwned ? 25 : 0) + inst.freeSlots() * 10 - inst.state.inflight
525246
+ score: (inst.state.knownModels.has(opts.model) ? 100 : 0) + (opts.preferBaseInstance && !inst.state.poolOwned ? 25 : 0) + this.effectiveFreeSlots(inst) * 10 - inst.state.inflight
525220
525247
  }));
525221
525248
  scored.sort((a2, b) => b.score - a2.score);
525222
525249
  return scored[0].inst;
@@ -525227,9 +525254,39 @@ var init_ollama_pool = __esm({
525227
525254
  baseUrl: inst.state.baseUrl,
525228
525255
  poolOwned: inst.state.poolOwned,
525229
525256
  gpuUuid: inst.state.gpuUuid,
525230
- release: (success) => inst.release(success)
525257
+ gpuIndex: inst.state.gpuIndex,
525258
+ release: (success) => {
525259
+ inst.release(success);
525260
+ this.wakeNextSlotWaiter();
525261
+ }
525231
525262
  };
525232
525263
  }
525264
+ async acquireQueued(opts) {
525265
+ for (; ; ) {
525266
+ const pick = this.pickInstance(opts);
525267
+ if (pick) {
525268
+ pick.acquire(opts.model);
525269
+ return this.buildSlot(pick);
525270
+ }
525271
+ await new Promise((resolve52) => this.slotWaiters.push(resolve52));
525272
+ }
525273
+ }
525274
+ wakeNextSlotWaiter() {
525275
+ const waiter = this.slotWaiters.shift();
525276
+ if (waiter)
525277
+ waiter();
525278
+ }
525279
+ effectiveMaxParallel(inst) {
525280
+ if (this.activePlacementMode === "constrained")
525281
+ return 1;
525282
+ return Math.max(1, inst.state.maxParallel);
525283
+ }
525284
+ isEffectivelySaturated(inst) {
525285
+ return inst.state.inflight >= this.effectiveMaxParallel(inst);
525286
+ }
525287
+ effectiveFreeSlots(inst) {
525288
+ return Math.max(0, this.effectiveMaxParallel(inst) - inst.state.inflight);
525289
+ }
525233
525290
  /**
525234
525291
  * Spawn a new instance pinned to a GPU when policy allows. Returns the
525235
525292
  * spawned instance or null when:
@@ -525241,6 +525298,42 @@ var init_ollama_pool = __esm({
525241
525298
  * over-allocate.
525242
525299
  */
525243
525300
  async maybeSpawnInstance(model) {
525301
+ return this.withSpawnGate(async () => {
525302
+ if (!this.canSpawnWithSharedModelStore(model))
525303
+ return null;
525304
+ const poolOwnedCount = this.instances.filter((i2) => i2.state.poolOwned).length;
525305
+ const gpus = await this.getGpusForPlacement();
525306
+ const cap = this.elasticSpawnCap(gpus);
525307
+ if (poolOwnedCount >= cap)
525308
+ return null;
525309
+ const freedPick = this.pickInstance({ model });
525310
+ if (freedPick)
525311
+ return freedPick;
525312
+ const gpu = this.pickGpuForSpawn(gpus);
525313
+ return this.spawnInstance(model, gpu);
525314
+ });
525315
+ }
525316
+ async ensureDedicatedGpuPool(model, gpus) {
525317
+ if (this.placementModeFor(gpus) !== "dedicated")
525318
+ return;
525319
+ await this.withSpawnGate(async () => {
525320
+ if (!this.canSpawnWithSharedModelStore(model))
525321
+ return;
525322
+ const target = this.dedicatedTargetCount(gpus);
525323
+ while (this.instances.filter((i2) => i2.state.poolOwned).length < target) {
525324
+ const gpu = this.pickGpuForSpawn(gpus);
525325
+ if (!gpu)
525326
+ break;
525327
+ const inst = await this.spawnInstance(model, gpu);
525328
+ if (!inst)
525329
+ break;
525330
+ }
525331
+ if (this.instances.some((i2) => i2.state.poolOwned)) {
525332
+ this.dedicatedGpuPoolActive = true;
525333
+ }
525334
+ });
525335
+ }
525336
+ async withSpawnGate(fn) {
525244
525337
  let resolveGate = () => {
525245
525338
  };
525246
525339
  const myTurn = new Promise((r2) => {
@@ -525250,55 +525343,90 @@ var init_ollama_pool = __esm({
525250
525343
  this.spawnGate = myTurn;
525251
525344
  await prev;
525252
525345
  try {
525253
- if (!this.config.sharedModelStore && !this.config.allowUnsharedModelStore) {
525254
- this.emit("spawn-skipped", {
525255
- reason: "missing-shared-model-store",
525256
- model,
525257
- baseInstanceUrl: this.config.baseInstanceUrl
525258
- });
525259
- return null;
525260
- }
525261
- const poolOwnedCount = this.instances.filter((i2) => i2.state.poolOwned).length;
525262
- const gpus = await detectGpus();
525263
- const cap = this.config.maxSpawnedInstances > 0 ? this.config.maxSpawnedInstances : Math.max(0, gpus.length - 1);
525264
- if (poolOwnedCount >= cap)
525265
- return null;
525266
- const freedPick = this.pickInstance({ model });
525267
- if (freedPick)
525268
- return freedPick;
525269
- const port = await findFreePort(this.config.spawnPortStart);
525270
- const gpuUuid = this.pickGpuForSpawn(gpus);
525271
- const { proc, ready } = await this.spawner({ port, gpuUuid, config: this.config });
525272
- try {
525273
- await ready;
525274
- } catch (err) {
525275
- try {
525276
- proc.kill();
525277
- } catch {
525278
- }
525279
- this.emit("spawn-failed", { port, gpuUuid, error: err });
525280
- return null;
525281
- }
525282
- const inst = new OllamaInstance({
525283
- id: `omnius-ollama-${port}`,
525284
- baseUrl: `http://127.0.0.1:${port}`,
525285
- port,
525286
- gpuUuid,
525287
- poolOwned: true,
525288
- inflight: 0,
525289
- peakInflight: 0,
525290
- lastUsedMs: Date.now(),
525291
- knownModels: /* @__PURE__ */ new Set(),
525292
- maxParallel: this.config.maxParallelPerInstance,
525293
- totalRequests: 0
525294
- }, proc);
525295
- this.instances.push(inst);
525296
- this.emit("instance-spawned", { id: inst.state.id, port, gpuUuid });
525297
- return inst;
525346
+ return await fn();
525298
525347
  } finally {
525299
525348
  resolveGate();
525300
525349
  }
525301
525350
  }
525351
+ canSpawnWithSharedModelStore(model) {
525352
+ if (this.config.sharedModelStore || this.config.allowUnsharedModelStore)
525353
+ return true;
525354
+ this.emit("spawn-skipped", {
525355
+ reason: "missing-shared-model-store",
525356
+ model,
525357
+ baseInstanceUrl: this.config.baseInstanceUrl
525358
+ });
525359
+ return false;
525360
+ }
525361
+ placementModeFor(gpus) {
525362
+ const canShareModelStore = Boolean(this.config.sharedModelStore) || this.config.allowUnsharedModelStore;
525363
+ if (!canShareModelStore || gpus.length < 2)
525364
+ return "constrained";
525365
+ if (this.config.gpuPlacement === "elastic")
525366
+ return "elastic";
525367
+ return "dedicated";
525368
+ }
525369
+ dedicatedTargetCount(gpus) {
525370
+ const requested = this.config.targetGpuInstances > 0 ? this.config.targetGpuInstances : gpus.length;
525371
+ const cappedByGpuCount = Math.min(requested, gpus.length);
525372
+ return this.config.maxSpawnedInstances > 0 ? Math.min(cappedByGpuCount, this.config.maxSpawnedInstances) : cappedByGpuCount;
525373
+ }
525374
+ elasticSpawnCap(gpus) {
525375
+ return this.config.maxSpawnedInstances > 0 ? this.config.maxSpawnedInstances : Math.max(0, gpus.length - 1);
525376
+ }
525377
+ async getGpusForPlacement(maxAgeMs = 3e3) {
525378
+ const now = Date.now();
525379
+ if (this.gpuCache && now - this.gpuCache.takenAtMs <= maxAgeMs) {
525380
+ return this.gpuCache.gpus;
525381
+ }
525382
+ const gpus = await this.gpuDetector();
525383
+ this.gpuCache = { gpus, takenAtMs: now };
525384
+ return gpus;
525385
+ }
525386
+ async spawnInstance(model, gpu) {
525387
+ let port;
525388
+ try {
525389
+ port = await this.portAllocator(this.config.spawnPortStart);
525390
+ } catch (err) {
525391
+ this.emit("spawn-failed", {
525392
+ reason: "port-allocation-failed",
525393
+ gpuUuid: gpu?.uuid ?? null,
525394
+ gpuIndex: gpu?.index ?? null,
525395
+ error: err
525396
+ });
525397
+ return null;
525398
+ }
525399
+ const gpuUuid = gpu?.uuid || null;
525400
+ const gpuIndex = gpu?.index ?? null;
525401
+ const { proc, ready } = await this.spawner({ port, gpuUuid, gpuIndex, config: this.config });
525402
+ try {
525403
+ await ready;
525404
+ } catch (err) {
525405
+ try {
525406
+ proc.kill();
525407
+ } catch {
525408
+ }
525409
+ this.emit("spawn-failed", { port, gpuUuid, gpuIndex, error: err });
525410
+ return null;
525411
+ }
525412
+ const inst = new OllamaInstance({
525413
+ id: `omnius-ollama-${port}`,
525414
+ baseUrl: `http://127.0.0.1:${port}`,
525415
+ port,
525416
+ gpuUuid,
525417
+ gpuIndex,
525418
+ poolOwned: true,
525419
+ inflight: 0,
525420
+ peakInflight: 0,
525421
+ lastUsedMs: Date.now(),
525422
+ knownModels: /* @__PURE__ */ new Set([model]),
525423
+ maxParallel: this.config.maxParallelPerInstance,
525424
+ totalRequests: 0
525425
+ }, proc);
525426
+ this.instances.push(inst);
525427
+ this.emit("instance-spawned", { id: inst.state.id, port, gpuUuid, gpuIndex });
525428
+ return inst;
525429
+ }
525302
525430
  /**
525303
525431
  * Pick a GPU for a freshly-spawned instance. Prefers GPUs that no
525304
525432
  * pool-owned instance is already pinned to, then most free VRAM. Returns
@@ -525313,7 +525441,7 @@ var init_ollama_pool = __esm({
525313
525441
  pool3.sort((a2, b) => b.vramFreeMB - a2.vramFreeMB);
525314
525442
  const best = pool3[_gpuCursor % pool3.length];
525315
525443
  _gpuCursor++;
525316
- return best.uuid;
525444
+ return best;
525317
525445
  }
525318
525446
  /**
525319
525447
  * Periodically reap pool-owned instances that have been idle past the
@@ -525360,13 +525488,24 @@ var init_ollama_pool = __esm({
525360
525488
  }
525361
525489
  async status() {
525362
525490
  const hardware = await getHardwareSnapshot();
525491
+ const placementGpus = this.gpuCache?.gpus ?? hardware.gpus;
525492
+ const placementMode = this.placementModeFor(placementGpus);
525493
+ const targetGpuInstances = placementMode === "dedicated" ? this.dedicatedTargetCount(placementGpus) : placementMode === "elastic" ? this.elasticSpawnCap(placementGpus) : 1;
525494
+ const readyGpuInstances = this.instances.filter((inst) => inst.state.poolOwned).length;
525363
525495
  return {
525364
525496
  config: this.config,
525497
+ placement: {
525498
+ mode: placementMode,
525499
+ targetGpuInstances,
525500
+ readyGpuInstances,
525501
+ sharedModelStore: this.config.sharedModelStore
525502
+ },
525365
525503
  instances: this.instances.map((inst) => ({
525366
525504
  id: inst.state.id,
525367
525505
  baseUrl: inst.state.baseUrl,
525368
525506
  poolOwned: inst.state.poolOwned,
525369
525507
  gpuUuid: inst.state.gpuUuid,
525508
+ gpuIndex: inst.state.gpuIndex,
525370
525509
  inflight: inst.state.inflight,
525371
525510
  peakInflight: inst.state.peakInflight,
525372
525511
  maxParallel: inst.state.maxParallel,
@@ -569697,29 +569836,60 @@ async function collectNetworkMetrics() {
569697
569836
  return { rxBytesPerSec: 0, txBytesPerSec: 0 };
569698
569837
  }
569699
569838
  async function collectGpuMetrics() {
569700
- const noGpu = { available: false, name: "", utilization: 0, vramUsedMB: 0, vramTotalMB: 0, vramUtilization: 0 };
569839
+ const noGpu = {
569840
+ available: false,
569841
+ count: 0,
569842
+ name: "",
569843
+ utilization: 0,
569844
+ vramUsedMB: 0,
569845
+ vramTotalMB: 0,
569846
+ vramUtilization: 0,
569847
+ devices: []
569848
+ };
569701
569849
  if (_nvidiaSmiAvailable2 === false) return noGpu;
569702
569850
  try {
569703
569851
  const smi = await new Promise((resolve52, reject) => {
569704
569852
  exec3(
569705
- "nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total,name --format=csv,noheader,nounits 2>/dev/null",
569853
+ "nvidia-smi --query-gpu=index,uuid,utilization.gpu,memory.used,memory.total,name --format=csv,noheader,nounits 2>/dev/null",
569706
569854
  { encoding: "utf8", timeout: 3e3 },
569707
569855
  (err, stdout) => err ? reject(err) : resolve52(stdout)
569708
569856
  );
569709
569857
  });
569710
569858
  _nvidiaSmiAvailable2 = true;
569711
- const line = smi.trim().split("\n")[0];
569712
- if (!line) return noGpu;
569713
- const parts = line.split(",").map((s2) => s2.trim());
569714
- const vramUsed = parseInt(parts[1] ?? "0", 10) || 0;
569715
- const vramTotal = parseInt(parts[2] ?? "0", 10) || 0;
569859
+ const devices = [];
569860
+ for (const line of smi.trim().split("\n")) {
569861
+ if (!line.trim()) continue;
569862
+ const parts = line.split(",").map((s2) => s2.trim());
569863
+ const index = parseInt(parts[0] ?? "-1", 10);
569864
+ const utilization = parseInt(parts[2] ?? "0", 10) || 0;
569865
+ const vramUsed2 = parseInt(parts[3] ?? "0", 10) || 0;
569866
+ const vramTotal2 = parseInt(parts[4] ?? "0", 10) || 0;
569867
+ if (!Number.isFinite(index) || index < 0) continue;
569868
+ devices.push({
569869
+ index,
569870
+ uuid: parts[1] ?? "",
569871
+ utilization,
569872
+ vramUsedMB: vramUsed2,
569873
+ vramTotalMB: vramTotal2,
569874
+ name: parts.slice(5).join(", ") || "",
569875
+ vramUtilization: vramTotal2 > 0 ? Math.round(vramUsed2 / vramTotal2 * 100) : 0
569876
+ });
569877
+ }
569878
+ if (devices.length === 0) return noGpu;
569879
+ const vramUsed = devices.reduce((sum, gpu) => sum + gpu.vramUsedMB, 0);
569880
+ const vramTotal = devices.reduce((sum, gpu) => sum + gpu.vramTotalMB, 0);
569881
+ const avgUtil = Math.round(devices.reduce((sum, gpu) => sum + gpu.utilization, 0) / devices.length);
569882
+ const firstName = devices[0]?.name ?? "";
569883
+ const allSameName = devices.every((gpu) => gpu.name === firstName);
569716
569884
  return {
569717
569885
  available: true,
569718
- utilization: parseInt(parts[0] ?? "0", 10) || 0,
569886
+ count: devices.length,
569887
+ utilization: avgUtil,
569719
569888
  vramUsedMB: vramUsed,
569720
569889
  vramTotalMB: vramTotal,
569721
- name: parts[3] ?? "",
569722
- vramUtilization: vramTotal > 0 ? Math.round(vramUsed / vramTotal * 100) : 0
569890
+ name: devices.length > 1 && allSameName ? `${devices.length}x ${firstName}` : firstName,
569891
+ vramUtilization: vramTotal > 0 ? Math.round(vramUsed / vramTotal * 100) : 0,
569892
+ devices
569723
569893
  };
569724
569894
  } catch {
569725
569895
  _nvidiaSmiAvailable2 = false;
@@ -569736,7 +569906,9 @@ function getInstantSnapshot() {
569736
569906
  cpuCores: cr.cpuCores,
569737
569907
  cpuModel: cr.cpuModel,
569738
569908
  gpuUtil: -1,
569909
+ gpuCount: 0,
569739
569910
  gpuName: "",
569911
+ gpuDevices: [],
569740
569912
  vramUtil: -1,
569741
569913
  vramUsedMB: 0,
569742
569914
  vramTotalMB: 0,
@@ -569794,10 +569966,11 @@ function collectCpuRam() {
569794
569966
  }
569795
569967
  async function collectLocalMetrics() {
569796
569968
  const cpuRam = collectCpuRam();
569797
- const [gpu, disk, network] = await Promise.all([
569969
+ const [gpu, disk, network, ollamaPool] = await Promise.all([
569798
569970
  collectGpuMetrics(),
569799
569971
  collectDiskMetrics(),
569800
- collectNetworkMetrics()
569972
+ collectNetworkMetrics(),
569973
+ collectOllamaPoolMetrics()
569801
569974
  ]);
569802
569975
  return {
569803
569976
  source: "local",
@@ -569806,7 +569979,9 @@ async function collectLocalMetrics() {
569806
569979
  cpuCores: cpuRam.cpuCores,
569807
569980
  cpuModel: cpuRam.cpuModel,
569808
569981
  gpuUtil: gpu.available ? gpu.utilization : -1,
569982
+ gpuCount: gpu.count,
569809
569983
  gpuName: gpu.name,
569984
+ gpuDevices: gpu.devices,
569810
569985
  vramUtil: gpu.available ? gpu.vramUtilization : -1,
569811
569986
  vramUsedMB: gpu.vramUsedMB,
569812
569987
  vramTotalMB: gpu.vramTotalMB,
@@ -569817,15 +569992,43 @@ async function collectLocalMetrics() {
569817
569992
  diskUsedGB: disk.usedGB,
569818
569993
  diskTotalGB: disk.totalGB,
569819
569994
  diskFreeGB: disk.freeGB,
569820
- diskPath: disk.path
569995
+ diskPath: disk.path,
569996
+ ollamaPool
569821
569997
  },
569822
569998
  network
569823
569999
  };
569824
570000
  }
570001
+ async function collectOllamaPoolMetrics() {
570002
+ try {
570003
+ const config = resolveDefaultPoolConfig();
570004
+ if (!shouldUseOllamaPoolForBaseUrl(config.baseInstanceUrl)) return null;
570005
+ const status = await getOllamaPool({ baseInstanceUrl: config.baseInstanceUrl }).status();
570006
+ return {
570007
+ enabled: true,
570008
+ mode: status.placement.mode,
570009
+ targetGpuInstances: status.placement.targetGpuInstances,
570010
+ readyGpuInstances: status.placement.readyGpuInstances,
570011
+ sharedModelStore: status.placement.sharedModelStore,
570012
+ instances: status.instances.map((inst) => ({
570013
+ id: inst.id,
570014
+ baseUrl: inst.baseUrl,
570015
+ poolOwned: inst.poolOwned,
570016
+ gpuUuid: inst.gpuUuid,
570017
+ gpuIndex: inst.gpuIndex,
570018
+ inflight: inst.inflight,
570019
+ maxParallel: inst.maxParallel,
570020
+ totalRequests: inst.totalRequests
570021
+ }))
570022
+ };
570023
+ } catch {
570024
+ return null;
570025
+ }
570026
+ }
569825
570027
  var _lastNetSnapshot, _nvidiaSmiAvailable2, _cpuPrevSnapshot, SystemMetricsCollector;
569826
570028
  var init_system_metrics = __esm({
569827
570029
  "packages/cli/src/tui/system-metrics.ts"() {
569828
570030
  "use strict";
570031
+ init_dist8();
569829
570032
  init_disk_monitor();
569830
570033
  _lastNetSnapshot = null;
569831
570034
  _nvidiaSmiAvailable2 = null;
@@ -569881,7 +570084,9 @@ var init_system_metrics = __esm({
569881
570084
  cpuCores: hw.cpuCores ?? 0,
569882
570085
  cpuModel: hw.cpuModel ?? "",
569883
570086
  gpuUtil: hw.gpuUtil ?? -1,
570087
+ gpuCount: hw.gpuCount ?? 0,
569884
570088
  gpuName: hw.gpuName ?? "",
570089
+ gpuDevices: hw.gpuDevices ?? [],
569885
570090
  vramUtil: hw.vramUtil ?? -1,
569886
570091
  vramUsedMB: hw.vramUsedMB ?? 0,
569887
570092
  vramTotalMB: hw.vramTotalMB ?? 0,
@@ -569892,7 +570097,8 @@ var init_system_metrics = __esm({
569892
570097
  diskUsedGB: hw.diskUsedGB ?? 0,
569893
570098
  diskTotalGB: hw.diskTotalGB ?? 0,
569894
570099
  diskFreeGB: hw.diskFreeGB ?? 0,
569895
- diskPath: hw.diskPath ?? ""
570100
+ diskPath: hw.diskPath ?? "",
570101
+ ollamaPool: hw.ollamaPool ?? null
569896
570102
  };
569897
570103
  this._latest = {
569898
570104
  source: "remote",
@@ -573751,6 +573957,19 @@ ${CONTENT_BG_SEQ}`);
573751
573957
  hwExpW += 6 + `${rm4.vramUtil}%`.length + vramDetail.length;
573752
573958
  hwCompW += 6 + `${rm4.vramUtil}%`.length;
573753
573959
  }
573960
+ if (rm4.ollamaPool?.enabled) {
573961
+ const pool3 = rm4.ollamaPool;
573962
+ const ready = pool3.readyGpuInstances;
573963
+ const target = pool3.targetGpuInstances;
573964
+ const poolColor = pool3.mode === "constrained" ? c3.yellow : target > 0 && ready < target ? c3.yellow : c3.green;
573965
+ const poolDetail = pool3.mode === "constrained" ? "queue" : `${ready}/${target}`;
573966
+ const poolText = ` OLLAMA ${poolColor(`${pool3.mode}:${poolDetail}`)}`;
573967
+ const compactText = ` OLLAMA ${poolColor(pool3.mode === "constrained" ? "queue" : `${ready}/${target}`)}`;
573968
+ hwExpStr += poolText;
573969
+ hwCompStr += compactText;
573970
+ hwExpW += 8 + `${pool3.mode}:${poolDetail}`.length;
573971
+ hwCompW += 8 + (pool3.mode === "constrained" ? "queue".length : `${ready}/${target}`.length);
573972
+ }
573754
573973
  if (!isLocal && hwExpW === 0) {
573755
573974
  const statusMsg = rm4.gpuName && rm4.gpuName !== "peer" ? rm4.gpuName : "awaiting metrics...";
573756
573975
  hwExpStr = c3.dim(statusMsg);
@@ -610459,9 +610678,9 @@ function telegramDecisionRecoverableFlag(text) {
610459
610678
  }
610460
610679
  return void 0;
610461
610680
  }
610462
- function telegramRouterTimeoutMs(configTimeoutMs, minMs = 15e3, _legacyMaxMs) {
610681
+ function telegramRouterTimeoutMs(configTimeoutMs, minMs = 12e4, _legacyMaxMs) {
610463
610682
  const configured = Number.isFinite(configTimeoutMs) && (configTimeoutMs ?? 0) > 0 ? configTimeoutMs : 3e5;
610464
- return Math.max(configured, minMs);
610683
+ return Math.max(configured, minMs, 12e4);
610465
610684
  }
610466
610685
  function parseTelegramInteractionDecision(text, forcedRoute, options2 = {}) {
610467
610686
  for (const jsonText of telegramDecisionJsonCandidates(text)) {
@@ -617135,7 +617354,7 @@ ${conversationStream}`
617135
617354
  tools: [],
617136
617355
  temperature: 0.4,
617137
617356
  maxTokens: 700,
617138
- timeoutMs: Math.max(config.timeoutMs ?? 3e5, 6e4),
617357
+ timeoutMs: Math.max(config.timeoutMs ?? 3e5, 12e4),
617139
617358
  think: false
617140
617359
  };
617141
617360
  let accumulated = "";
@@ -627073,17 +627292,50 @@ async function handleAimsResources(ctx3) {
627073
627292
  try {
627074
627293
  const os9 = __require("node:os");
627075
627294
  const config = loadConfig();
627295
+ let ollamaPool = null;
627296
+ let hardware = null;
627297
+ try {
627298
+ const {
627299
+ getHardwareSnapshot: getHardwareSnapshot2,
627300
+ getOllamaPool: getOllamaPool2,
627301
+ resolveDefaultPoolConfig: resolveDefaultPoolConfig2,
627302
+ shouldUseOllamaPoolForBaseUrl: shouldUseOllamaPoolForBaseUrl2
627303
+ } = await Promise.resolve().then(() => (init_dist8(), dist_exports3));
627304
+ hardware = await getHardwareSnapshot2();
627305
+ const poolConfig = resolveDefaultPoolConfig2();
627306
+ if (shouldUseOllamaPoolForBaseUrl2(poolConfig.baseInstanceUrl)) {
627307
+ const status = await getOllamaPool2({ baseInstanceUrl: poolConfig.baseInstanceUrl }).status();
627308
+ ollamaPool = {
627309
+ placement: status.placement,
627310
+ instances: status.instances.map((inst) => ({
627311
+ id: inst.id,
627312
+ base_url: inst.baseUrl,
627313
+ pool_owned: inst.poolOwned,
627314
+ gpu_uuid: inst.gpuUuid,
627315
+ gpu_index: inst.gpuIndex,
627316
+ inflight: inst.inflight,
627317
+ max_parallel: inst.maxParallel,
627318
+ total_requests: inst.totalRequests
627319
+ }))
627320
+ };
627321
+ }
627322
+ } catch {
627323
+ hardware = null;
627324
+ ollamaPool = null;
627325
+ }
627076
627326
  sendJson(res, 200, {
627077
627327
  compute: {
627078
627328
  cpu: os9.cpus()[0]?.model ?? "unknown",
627079
627329
  cores: os9.cpus().length,
627080
627330
  ram_gb: Math.round(os9.totalmem() / 1024 ** 3),
627081
- platform: process.platform
627331
+ platform: process.platform,
627332
+ hardware
627082
627333
  },
627083
627334
  backend: {
627084
627335
  type: config.backendType,
627085
627336
  url: config.backendUrl,
627086
- model: config.model
627337
+ model: config.model,
627338
+ ollama_pool: ollamaPool
627087
627339
  },
627088
627340
  "aims:control": "A.4"
627089
627341
  });
@@ -641443,6 +641695,32 @@ async function handleRequest(req2, res, ollamaUrl, verbose) {
641443
641695
  }
641444
641696
  } catch {
641445
641697
  }
641698
+ let ollamaPool = null;
641699
+ try {
641700
+ const {
641701
+ getOllamaPool: getOllamaPool2,
641702
+ resolveDefaultPoolConfig: resolveDefaultPoolConfig2,
641703
+ shouldUseOllamaPoolForBaseUrl: shouldUseOllamaPoolForBaseUrl2
641704
+ } = await Promise.resolve().then(() => (init_dist8(), dist_exports3));
641705
+ const poolConfig = resolveDefaultPoolConfig2();
641706
+ if (shouldUseOllamaPoolForBaseUrl2(poolConfig.baseInstanceUrl)) {
641707
+ const status2 = await getOllamaPool2({ baseInstanceUrl: poolConfig.baseInstanceUrl }).status();
641708
+ ollamaPool = {
641709
+ placement: status2.placement,
641710
+ instances: status2.instances.map((inst) => ({
641711
+ id: inst.id,
641712
+ base_url: inst.baseUrl,
641713
+ pool_owned: inst.poolOwned,
641714
+ gpu_uuid: inst.gpuUuid,
641715
+ gpu_index: inst.gpuIndex,
641716
+ inflight: inst.inflight,
641717
+ max_parallel: inst.maxParallel,
641718
+ total_requests: inst.totalRequests
641719
+ }))
641720
+ };
641721
+ }
641722
+ } catch {
641723
+ }
641446
641724
  let latestVersion = null;
641447
641725
  try {
641448
641726
  const ver = es("npm view omnius version 2>/dev/null", { encoding: "utf8", timeout: 5e3, stdio: "pipe" }).trim();
@@ -641452,6 +641730,7 @@ async function handleRequest(req2, res, ollamaUrl, verbose) {
641452
641730
  jsonResponse(res, 200, {
641453
641731
  gpu: gpus,
641454
641732
  gpu_utilization: gpuUtil,
641733
+ ollama_pool: ollamaPool,
641455
641734
  total_vram_gb: totalVram,
641456
641735
  ram_gb: Math.round(totalMem / 1024 ** 3),
641457
641736
  ram_used_pct: ramUsedPct,
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.82",
3
+ "version": "1.0.84",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.82",
9
+ "version": "1.0.84",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.82",
3
+ "version": "1.0.84",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",