omnius 1.0.146 → 1.0.147

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -529012,38 +529012,136 @@ function inferHomeFromProcUid(pid) {
529012
529012
  }
529013
529013
  return null;
529014
529014
  }
529015
+ function detectPeerOmniusOllamaPool() {
529016
+ if (!isDirectory("/proc"))
529017
+ return false;
529018
+ const selfPid = String(process.pid);
529019
+ const selfPpid = String(process.ppid ?? "");
529020
+ const peerNodePids = /* @__PURE__ */ new Set();
529021
+ let entries;
529022
+ try {
529023
+ entries = readdirSync21("/proc", { withFileTypes: true }).filter((d2) => d2.isDirectory() && /^\d+$/.test(d2.name)).map((d2) => ({ name: d2.name }));
529024
+ } catch {
529025
+ return false;
529026
+ }
529027
+ for (const e2 of entries) {
529028
+ if (e2.name === selfPid || e2.name === selfPpid)
529029
+ continue;
529030
+ try {
529031
+ const cmdline = readFileSync50(`/proc/${e2.name}/cmdline`, "utf8");
529032
+ if (!cmdline.includes("node"))
529033
+ continue;
529034
+ if (!/[/\\]omnius[/\\]dist[/\\]index\.js|[/\\]omnius[/\\]/i.test(cmdline))
529035
+ continue;
529036
+ peerNodePids.add(e2.name);
529037
+ } catch {
529038
+ }
529039
+ }
529040
+ if (peerNodePids.size === 0)
529041
+ return false;
529042
+ for (const e2 of entries) {
529043
+ try {
529044
+ const cmd = readFileSync50(`/proc/${e2.name}/cmdline`, "utf8");
529045
+ if (!cmd.includes("ollama"))
529046
+ continue;
529047
+ if (!cmd.split("\0").includes("serve"))
529048
+ continue;
529049
+ const status = readFileSync50(`/proc/${e2.name}/status`, "utf8");
529050
+ const ppid = status.match(/^PPid:\s+(\d+)/m)?.[1];
529051
+ if (ppid && peerNodePids.has(ppid))
529052
+ return true;
529053
+ } catch {
529054
+ }
529055
+ }
529056
+ return false;
529057
+ }
529015
529058
  async function detectGpus() {
529016
529059
  if (_nvidiaSmiAvailable === false)
529017
529060
  return [];
529018
529061
  return new Promise((resolve56) => {
529019
- exec2("nvidia-smi --query-gpu=index,uuid,name,memory.total,memory.free,utilization.gpu --format=csv,noheader,nounits 2>/dev/null", { encoding: "utf8", timeout: 3e3 }, (err, stdout) => {
529062
+ const queryFields = "index,uuid,name,memory.total,memory.free,utilization.gpu,compute_cap";
529063
+ exec2(`nvidia-smi --query-gpu=${queryFields} --format=csv,noheader,nounits 2>/dev/null`, { encoding: "utf8", timeout: 3e3 }, (err, stdout) => {
529020
529064
  if (err) {
529021
- _nvidiaSmiAvailable = false;
529022
- resolve56([]);
529065
+ exec2("nvidia-smi --query-gpu=index,uuid,name,memory.total,memory.free,utilization.gpu --format=csv,noheader,nounits 2>/dev/null", { encoding: "utf8", timeout: 3e3 }, (err2, stdout2) => {
529066
+ if (err2) {
529067
+ _nvidiaSmiAvailable = false;
529068
+ resolve56([]);
529069
+ return;
529070
+ }
529071
+ _nvidiaSmiAvailable = true;
529072
+ resolve56(parseGpuQueryOutput(
529073
+ stdout2,
529074
+ /* hasComputeCap */
529075
+ false
529076
+ ));
529077
+ });
529023
529078
  return;
529024
529079
  }
529025
529080
  _nvidiaSmiAvailable = true;
529026
- const gpus = [];
529027
- for (const line of stdout.split("\n")) {
529028
- const parts = line.split(",").map((s2) => s2.trim());
529029
- if (parts.length < 6)
529030
- continue;
529031
- const idx = Number(parts[0]);
529032
- if (!Number.isFinite(idx))
529033
- continue;
529034
- gpus.push({
529035
- index: idx,
529036
- uuid: parts[1] ?? "",
529037
- name: parts[2] ?? "",
529038
- vramTotalMB: Number(parts[3]) || 0,
529039
- vramFreeMB: Number(parts[4]) || 0,
529040
- utilization: Number(parts[5]) || 0
529041
- });
529042
- }
529043
- resolve56(gpus);
529081
+ resolve56(parseGpuQueryOutput(
529082
+ stdout,
529083
+ /* hasComputeCap */
529084
+ true
529085
+ ));
529044
529086
  });
529045
529087
  });
529046
529088
  }
529089
+ function parseGpuQueryOutput(stdout, hasComputeCap) {
529090
+ const gpus = [];
529091
+ const minFields = hasComputeCap ? 7 : 6;
529092
+ for (const line of stdout.split("\n")) {
529093
+ const parts = line.split(",").map((s2) => s2.trim());
529094
+ if (parts.length < minFields)
529095
+ continue;
529096
+ const idx = Number(parts[0]);
529097
+ if (!Number.isFinite(idx))
529098
+ continue;
529099
+ const info = {
529100
+ index: idx,
529101
+ uuid: parts[1] ?? "",
529102
+ name: parts[2] ?? "",
529103
+ vramTotalMB: Number(parts[3]) || 0,
529104
+ vramFreeMB: Number(parts[4]) || 0,
529105
+ utilization: Number(parts[5]) || 0
529106
+ };
529107
+ if (hasComputeCap) {
529108
+ const cap = Number(parts[6]);
529109
+ if (Number.isFinite(cap))
529110
+ info.computeCapability = cap;
529111
+ }
529112
+ gpus.push(info);
529113
+ }
529114
+ return gpus;
529115
+ }
529116
+ function resolveMinGpuVramMB() {
529117
+ const fromEnv = Number(process.env["OMNIUS_OLLAMA_MIN_GPU_MB"]);
529118
+ return Number.isFinite(fromEnv) && fromEnv > 0 ? fromEnv : DEFAULT_MIN_GPU_VRAM_MB;
529119
+ }
529120
+ function resolveMinComputeCapability() {
529121
+ const fromEnv = Number(process.env["OMNIUS_OLLAMA_MIN_COMPUTE_CAP"]);
529122
+ return Number.isFinite(fromEnv) && fromEnv > 0 ? fromEnv : DEFAULT_MIN_COMPUTE_CAPABILITY;
529123
+ }
529124
+ function isCapableForLLM(gpu, thresholds = {}) {
529125
+ const minVramMB = thresholds.minVramMB ?? resolveMinGpuVramMB();
529126
+ const minComputeCap = thresholds.minComputeCap ?? resolveMinComputeCapability();
529127
+ if (gpu.vramTotalMB < minVramMB)
529128
+ return false;
529129
+ if (gpu.computeCapability !== void 0 && gpu.computeCapability < minComputeCap)
529130
+ return false;
529131
+ return true;
529132
+ }
529133
+ function filterCapableGpus(gpus, thresholds) {
529134
+ return gpus.filter((g) => isCapableForLLM(g, thresholds));
529135
+ }
529136
+ function recommendMaxParallelFromVram(minFreeMB) {
529137
+ if (minFreeMB >= 60 * 1024)
529138
+ return 8;
529139
+ if (minFreeMB >= 40 * 1024)
529140
+ return 4;
529141
+ if (minFreeMB >= 24 * 1024)
529142
+ return 2;
529143
+ return 1;
529144
+ }
529047
529145
  async function getHardwareSnapshot() {
529048
529146
  const { totalmem: totalmem8, freemem: freemem7, cpus: cpus5 } = await import("node:os");
529049
529147
  const gpus = await detectGpus();
@@ -529127,10 +529225,15 @@ async function findFreePort(start2) {
529127
529225
  }
529128
529226
  function resolveDefaultPoolConfig() {
529129
529227
  const baseInstanceUrl = process.env["OMNIUS_OLLAMA_BASE_URL"] || process.env["OLLAMA_HOST"]?.replace(/^([^:/]+:[0-9]+)$/, "http://$1") || "http://127.0.0.1:11434";
529130
- const maxParallelPerInstance = Number(process.env["OMNIUS_OLLAMA_MAX_PARALLEL"]) || 1;
529131
- const maxSpawnedInstances = Number(process.env["OMNIUS_OLLAMA_MAX_INSTANCES"]) || 0;
529228
+ const maxParallelExplicit = process.env["OMNIUS_OLLAMA_MAX_PARALLEL"] !== void 0;
529229
+ const maxParallelPerInstance = maxParallelExplicit ? Number(process.env["OMNIUS_OLLAMA_MAX_PARALLEL"]) || 1 : 1;
529230
+ const autoTuneMaxParallel = !maxParallelExplicit;
529231
+ const gpuPlacementExplicit = process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] !== void 0;
529232
+ const maxInstancesExplicit = process.env["OMNIUS_OLLAMA_MAX_INSTANCES"] !== void 0;
529233
+ const peerPoolActive = !gpuPlacementExplicit && !maxInstancesExplicit && detectPeerOmniusOllamaPool();
529234
+ const maxSpawnedInstances = maxInstancesExplicit ? Number(process.env["OMNIUS_OLLAMA_MAX_INSTANCES"]) || 0 : peerPoolActive ? 1 : 0;
529132
529235
  const targetGpuInstances = Number(process.env["OMNIUS_OLLAMA_TARGET_GPU_INSTANCES"]) || 0;
529133
- const gpuPlacementRaw = (process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] ?? "auto").toLowerCase();
529236
+ const gpuPlacementRaw = (process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] ?? (peerPoolActive ? "elastic" : "auto")).toLowerCase();
529134
529237
  const gpuPlacement = gpuPlacementRaw === "dedicated" || gpuPlacementRaw === "elastic" || gpuPlacementRaw === "auto" ? gpuPlacementRaw : "auto";
529135
529238
  const idleMs = Number(process.env["OMNIUS_OLLAMA_IDLE_MS"]) || 3 * 60 * 60 * 1e3;
529136
529239
  const reaperIntervalMs = Number(process.env["OMNIUS_OLLAMA_REAPER_MS"]) || 3e4;
@@ -529155,7 +529258,8 @@ function resolveDefaultPoolConfig() {
529155
529258
  ollamaBinary,
529156
529259
  spawnReadyTimeoutSec,
529157
529260
  networkRxBudgetBytesPerSec,
529158
- networkTxBudgetBytesPerSec
529261
+ networkTxBudgetBytesPerSec,
529262
+ autoTuneMaxParallel
529159
529263
  };
529160
529264
  }
529161
529265
  function parseNullableNumber(value2) {
@@ -529223,11 +529327,13 @@ function setOllamaPool(pool3) {
529223
529327
  _poolByBaseUrl.set(pool3.statusConfig().baseInstanceUrl, pool3);
529224
529328
  }
529225
529329
  }
529226
- var _nvidiaSmiAvailable, _lastNetworkSnapshot, OllamaInstance, realInstanceSpawner, _gpuCursor, OllamaPool, _poolSingleton, _poolByBaseUrl;
529330
+ var _nvidiaSmiAvailable, DEFAULT_MIN_GPU_VRAM_MB, DEFAULT_MIN_COMPUTE_CAPABILITY, _lastNetworkSnapshot, OllamaInstance, realInstanceSpawner, _gpuCursor, OllamaPool, _poolSingleton, _poolByBaseUrl;
529227
529331
  var init_ollama_pool = __esm({
529228
529332
  "packages/orchestrator/dist/ollama-pool.js"() {
529229
529333
  "use strict";
529230
529334
  _nvidiaSmiAvailable = null;
529335
+ DEFAULT_MIN_GPU_VRAM_MB = 16 * 1024;
529336
+ DEFAULT_MIN_COMPUTE_CAPABILITY = 7;
529231
529337
  _lastNetworkSnapshot = null;
529232
529338
  OllamaInstance = class {
529233
529339
  state;
@@ -529281,6 +529387,7 @@ var init_ollama_pool = __esm({
529281
529387
  env2["OLLAMA_MODELS"] = config.sharedModelStore;
529282
529388
  }
529283
529389
  env2["OLLAMA_NUM_PARALLEL"] = String(config.maxParallelPerInstance);
529390
+ env2["OLLAMA_KEEP_ALIVE"] = process.env["OMNIUS_OLLAMA_SPAWN_KEEP_ALIVE"] ?? "-1";
529284
529391
  if (gpuUuid) {
529285
529392
  env2["CUDA_VISIBLE_DEVICES"] = gpuUuid;
529286
529393
  env2["GPU_DEVICE_ORDINAL"] = gpuIndex === null ? "" : String(gpuIndex);
@@ -529334,6 +529441,20 @@ var init_ollama_pool = __esm({
529334
529441
  dedicatedGpuPoolActive = false;
529335
529442
  activePlacementMode = "constrained";
529336
529443
  gpuCache = null;
529444
+ /**
529445
+ * Set once after the first capability-filtered GPU detection. Prevents the
529446
+ * auto-tune from oscillating maxParallelPerInstance as free VRAM fluctuates
529447
+ * during normal inference.
529448
+ */
529449
+ _autoTuned = false;
529450
+ /** UUIDs we've already emitted gpu-excluded for. Prevents log spam. */
529451
+ _excludedGpusReported = /* @__PURE__ */ new Set();
529452
+ /**
529453
+ * Cached model footprint in MiB (model name → estimated VRAM required).
529454
+ * Populated lazily via /api/show on the base instance. null sentinel means
529455
+ * "we tried but failed" so we don't re-probe in a tight loop.
529456
+ */
529457
+ _modelVramEstimateMB = /* @__PURE__ */ new Map();
529337
529458
  slotWaiters = [];
529338
529459
  /**
529339
529460
  * Agent → preferred instance id. Set whenever an acquire resolves an
@@ -529533,7 +529654,18 @@ var init_ollama_pool = __esm({
529533
529654
  const freedPick = this.pickInstance({ model });
529534
529655
  if (freedPick)
529535
529656
  return freedPick;
529536
- const gpu = this.pickGpuForSpawn(gpus);
529657
+ const vramNeededMB = await this.estimateModelVramMB(model);
529658
+ const capable = this.gpusWithCapacityForModel(gpus, vramNeededMB);
529659
+ if (capable.length === 0 && vramNeededMB !== null) {
529660
+ this.emit("spawn-skipped", {
529661
+ reason: "insufficient-vram",
529662
+ model,
529663
+ vramNeededMB,
529664
+ gpuFreeMBs: gpus.map((g) => g.vramFreeMB)
529665
+ });
529666
+ return null;
529667
+ }
529668
+ const gpu = this.pickGpuForSpawn(capable.length > 0 ? capable : gpus);
529537
529669
  return this.spawnInstance(model, gpu);
529538
529670
  });
529539
529671
  }
@@ -529544,8 +529676,20 @@ var init_ollama_pool = __esm({
529544
529676
  if (!this.canSpawnWithSharedModelStore(model))
529545
529677
  return;
529546
529678
  const target = this.dedicatedTargetCount(gpus);
529679
+ const vramNeededMB = await this.estimateModelVramMB(model);
529547
529680
  while (this.instances.filter((i2) => i2.state.poolOwned).length < target) {
529548
- const gpu = this.pickGpuForSpawn(gpus);
529681
+ const candidates = this.gpusWithCapacityForModel(gpus, vramNeededMB);
529682
+ const pool3 = candidates.length > 0 ? candidates : vramNeededMB === null ? gpus : [];
529683
+ if (pool3.length === 0) {
529684
+ this.emit("spawn-skipped", {
529685
+ reason: "insufficient-vram",
529686
+ model,
529687
+ vramNeededMB,
529688
+ gpuFreeMBs: gpus.map((g) => g.vramFreeMB)
529689
+ });
529690
+ break;
529691
+ }
529692
+ const gpu = this.pickGpuForSpawn(pool3);
529549
529693
  if (!gpu)
529550
529694
  break;
529551
529695
  const inst = await this.spawnInstance(model, gpu);
@@ -529603,9 +529747,59 @@ var init_ollama_pool = __esm({
529603
529747
  if (this.gpuCache && now - this.gpuCache.takenAtMs <= maxAgeMs) {
529604
529748
  return this.gpuCache.gpus;
529605
529749
  }
529606
- const gpus = await this.gpuDetector();
529607
- this.gpuCache = { gpus, takenAtMs: now };
529608
- return gpus;
529750
+ const rawGpus = await this.gpuDetector();
529751
+ const filtered = filterCapableGpus(rawGpus);
529752
+ const filteredUuids = new Set(filtered.map((g) => g.uuid));
529753
+ for (const g of rawGpus) {
529754
+ if (filteredUuids.has(g.uuid))
529755
+ continue;
529756
+ if (this._excludedGpusReported.has(g.uuid))
529757
+ continue;
529758
+ this._excludedGpusReported.add(g.uuid);
529759
+ const reason = g.vramTotalMB < resolveMinGpuVramMB() ? "insufficient-vram" : "insufficient-compute-capability";
529760
+ this.emit("gpu-excluded", {
529761
+ uuid: g.uuid,
529762
+ index: g.index,
529763
+ name: g.name,
529764
+ vramTotalMB: g.vramTotalMB,
529765
+ computeCapability: g.computeCapability,
529766
+ reason
529767
+ });
529768
+ }
529769
+ this.gpuCache = { gpus: filtered, takenAtMs: now };
529770
+ this.maybeAutoTuneMaxParallel(filtered);
529771
+ return filtered;
529772
+ }
529773
+ /**
529774
+ * One-shot: bump `maxParallelPerInstance` from the worst-case free VRAM
529775
+ * across capable GPUs the first time we see them. We never tune down (a
529776
+ * subsequent low-VRAM read shouldn't strip concurrency from in-flight
529777
+ * requests), and we never tune again once successful — the recommendation
529778
+ * ladder is stable enough that a single read at startup is correct.
529779
+ */
529780
+ maybeAutoTuneMaxParallel(filtered) {
529781
+ if (!this.config.autoTuneMaxParallel)
529782
+ return;
529783
+ if (this._autoTuned)
529784
+ return;
529785
+ if (filtered.length === 0)
529786
+ return;
529787
+ const minFreeMB = filtered.reduce((m2, g) => Math.min(m2, g.vramFreeMB), Number.POSITIVE_INFINITY);
529788
+ const recommended = recommendMaxParallelFromVram(minFreeMB);
529789
+ if (recommended > this.config.maxParallelPerInstance) {
529790
+ const previous = this.config.maxParallelPerInstance;
529791
+ this.config.maxParallelPerInstance = recommended;
529792
+ for (const inst of this.instances) {
529793
+ inst.state.maxParallel = recommended;
529794
+ }
529795
+ this.emit("max-parallel-tuned", {
529796
+ previous,
529797
+ recommended,
529798
+ minFreeMB,
529799
+ capableGpuCount: filtered.length
529800
+ });
529801
+ }
529802
+ this._autoTuned = true;
529609
529803
  }
529610
529804
  async spawnInstance(model, gpu) {
529611
529805
  let port;
@@ -529668,6 +529862,56 @@ var init_ollama_pool = __esm({
529668
529862
  });
529669
529863
  return inst;
529670
529864
  }
529865
+ /**
529866
+ * Best-effort: estimate the VRAM (in MiB) a model needs to be served
529867
+ * without CPU spill. Hits the base instance's `/api/show` once per model
529868
+ * and caches the result. Returns null when the probe fails (the caller
529869
+ * then falls back to "no estimate" semantics — capacity check is skipped).
529870
+ *
529871
+ * The number returned is `disk_size * 1.15 + maxParallel * 1024` (1 GiB of
529872
+ * KV cache per parallel slot — conservative for 30B-class models). Newer
529873
+ * model families may exceed this margin slightly; bumps are safe via
529874
+ * OMNIUS_OLLAMA_VRAM_SAFETY_MARGIN.
529875
+ */
529876
+ async estimateModelVramMB(model) {
529877
+ if (this._modelVramEstimateMB.has(model)) {
529878
+ return this._modelVramEstimateMB.get(model) ?? null;
529879
+ }
529880
+ let bytesOnDisk = null;
529881
+ try {
529882
+ const url = `${this.config.baseInstanceUrl.replace(/\/+$/, "")}/api/show`;
529883
+ const resp = await fetch(url, {
529884
+ method: "POST",
529885
+ headers: { "Content-Type": "application/json" },
529886
+ body: JSON.stringify({ name: model }),
529887
+ signal: AbortSignal.timeout(2e3)
529888
+ });
529889
+ if (resp.ok) {
529890
+ const data = await resp.json();
529891
+ if (typeof data.size === "number" && data.size > 0)
529892
+ bytesOnDisk = data.size;
529893
+ }
529894
+ } catch {
529895
+ }
529896
+ if (bytesOnDisk === null) {
529897
+ this._modelVramEstimateMB.set(model, null);
529898
+ return null;
529899
+ }
529900
+ const safetyMargin = Number(process.env["OMNIUS_OLLAMA_VRAM_SAFETY_MARGIN"]) || 1.15;
529901
+ const kvCacheMB = this.config.maxParallelPerInstance * 1024;
529902
+ const estimateMB = Math.ceil(bytesOnDisk / (1024 * 1024) * safetyMargin + kvCacheMB);
529903
+ this._modelVramEstimateMB.set(model, estimateMB);
529904
+ return estimateMB;
529905
+ }
529906
+ /**
529907
+ * Filter GPUs to those with enough free VRAM for the model. Caller decides
529908
+ * how to react to an empty list (skip spawn vs degrade to constrained).
529909
+ */
529910
+ gpusWithCapacityForModel(gpus, vramNeededMB) {
529911
+ if (vramNeededMB === null)
529912
+ return gpus;
529913
+ return gpus.filter((g) => g.vramFreeMB >= vramNeededMB);
529914
+ }
529671
529915
  /**
529672
529916
  * Pick a GPU for a freshly-spawned instance. Prefers GPUs that no
529673
529917
  * pool-owned instance is already pinned to, then most free VRAM. Returns
@@ -529704,6 +529948,28 @@ var init_ollama_pool = __esm({
529704
529948
  survivors.push(inst);
529705
529949
  continue;
529706
529950
  }
529951
+ const PROBE_GRACE_MS = 3e4;
529952
+ if (inst.state.inflight === 0 && Date.now() - inst.state.spawnedAtMs > PROBE_GRACE_MS && await this.isPartialVramSpilled(inst)) {
529953
+ const reapedAtMs = Date.now();
529954
+ await inst.terminate();
529955
+ this.dropAffinityFor(inst.state.id);
529956
+ this.emit("instance-reaped", {
529957
+ id: inst.state.id,
529958
+ pid: inst.state.pid,
529959
+ reason: "partial-vram",
529960
+ totalRequests: inst.state.totalRequests,
529961
+ peakInflight: inst.state.peakInflight,
529962
+ ageMs: reapedAtMs - inst.state.spawnedAtMs,
529963
+ idleMs: reapedAtMs - inst.state.lastUsedMs,
529964
+ provenance: {
529965
+ entity: `urn:omnius:ollama-instance:${inst.state.id}`,
529966
+ activity: "ollama-instance-reap-partial-vram",
529967
+ agent: "orchestrator.ollama-pool",
529968
+ timestampMs: reapedAtMs
529969
+ }
529970
+ });
529971
+ continue;
529972
+ }
529707
529973
  if (inst.isIdleLongerThan(this.config.idleMs)) {
529708
529974
  const reapedAtMs = Date.now();
529709
529975
  await inst.terminate();
@@ -529711,6 +529977,7 @@ var init_ollama_pool = __esm({
529711
529977
  this.emit("instance-reaped", {
529712
529978
  id: inst.state.id,
529713
529979
  pid: inst.state.pid,
529980
+ reason: "idle",
529714
529981
  totalRequests: inst.state.totalRequests,
529715
529982
  peakInflight: inst.state.peakInflight,
529716
529983
  ageMs: reapedAtMs - inst.state.spawnedAtMs,
@@ -529728,6 +529995,33 @@ var init_ollama_pool = __esm({
529728
529995
  }
529729
529996
  this.instances = survivors;
529730
529997
  }
529998
+ /**
529999
+ * Probe `/api/ps` on the instance and return true if any resident model has
530000
+ * less than 95% of its weights in VRAM — the unmistakable CPU-offload
530001
+ * signature that produces 50× slowdowns. Defensive: any HTTP failure
530002
+ * returns false so a transient network blip never triggers a reap.
530003
+ */
530004
+ async isPartialVramSpilled(inst) {
530005
+ const PARTIAL_VRAM_THRESHOLD = 0.95;
530006
+ try {
530007
+ const url = `${inst.state.baseUrl.replace(/\/+$/, "")}/api/ps`;
530008
+ const resp = await fetch(url, { signal: AbortSignal.timeout(2e3) });
530009
+ if (!resp.ok)
530010
+ return false;
530011
+ const data = await resp.json();
530012
+ if (!data.models || data.models.length === 0)
530013
+ return false;
530014
+ return data.models.some((m2) => {
530015
+ const total = m2.size ?? 0;
530016
+ const vram = m2.size_vram ?? 0;
530017
+ if (total <= 0)
530018
+ return false;
530019
+ return vram / total < PARTIAL_VRAM_THRESHOLD;
530020
+ });
530021
+ } catch {
530022
+ return false;
530023
+ }
530024
+ }
529731
530025
  /** Stop the reaper and terminate every spawned instance. Call on process exit. */
529732
530026
  async shutdown() {
529733
530027
  if (this.reaperHandle) {
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.146",
3
+ "version": "1.0.147",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.146",
9
+ "version": "1.0.147",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.146",
3
+ "version": "1.0.147",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",