omnius 1.0.146 → 1.0.147
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +325 -31
- package/npm-shrinkwrap.json +2 -2
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -529012,38 +529012,136 @@ function inferHomeFromProcUid(pid) {
|
|
|
529012
529012
|
}
|
|
529013
529013
|
return null;
|
|
529014
529014
|
}
|
|
529015
|
+
/**
 * Detect whether another omnius process on this machine already owns an
 * `ollama serve` child process.
 *
 * Scans the numeric entries of `/proc`. A process counts as a peer when its
 * command line contains "node" and a path segment named "omnius"
 * (case-insensitive); the current process and its parent are never peers.
 * Returns true when some process whose command line contains "ollama" and has
 * a literal `serve` argument reports one of those peers as its `PPid`.
 *
 * Best-effort by design: a missing `/proc`, an unreadable directory listing,
 * or a process that vanishes mid-scan all count as "nothing found".
 *
 * @returns {boolean} true when a peer-owned `ollama serve` process was found.
 */
function detectPeerOmniusOllamaPool() {
  if (!isDirectory("/proc"))
    return false;
  const selfPid = String(process.pid);
  const selfPpid = String(process.ppid ?? "");
  let pids;
  try {
    pids = readdirSync21("/proc", { withFileTypes: true })
      .filter((d2) => d2.isDirectory() && /^\d+$/.test(d2.name))
      .map((d2) => d2.name);
  } catch {
    return false;
  }
  const peerNodePids = new Set();
  const ollamaServePids = [];
  // Single pass: each cmdline is read once and classified for both roles.
  for (const pid of pids) {
    let cmdline;
    try {
      cmdline = readFileSync50(`/proc/${pid}/cmdline`, "utf8");
    } catch {
      // Process exited or is not readable by us; ignore it.
      continue;
    }
    const isSelfOrParent = pid === selfPid || pid === selfPpid;
    // Any path containing an "omnius" segment qualifies, which also covers
    // ".../omnius/dist/index.js".
    if (!isSelfOrParent && cmdline.includes("node") && /[/\\]omnius[/\\]/i.test(cmdline))
      peerNodePids.add(pid);
    // cmdline arguments are NUL-separated; require `serve` as a whole argument.
    if (cmdline.includes("ollama") && cmdline.split("\0").includes("serve"))
      ollamaServePids.push(pid);
  }
  if (peerNodePids.size === 0)
    return false;
  for (const pid of ollamaServePids) {
    try {
      const status = readFileSync50(`/proc/${pid}/status`, "utf8");
      const ppid = status.match(/^PPid:\s+(\d+)/m)?.[1];
      if (ppid && peerNodePids.has(ppid))
        return true;
    } catch {
      // Process exited between the two reads; treat as not matching.
    }
  }
  return false;
}
|
|
529015
529058
|
async function detectGpus() {
|
|
529016
529059
|
if (_nvidiaSmiAvailable === false)
|
|
529017
529060
|
return [];
|
|
529018
529061
|
return new Promise((resolve56) => {
|
|
529019
|
-
|
|
529062
|
+
const queryFields = "index,uuid,name,memory.total,memory.free,utilization.gpu,compute_cap";
|
|
529063
|
+
exec2(`nvidia-smi --query-gpu=${queryFields} --format=csv,noheader,nounits 2>/dev/null`, { encoding: "utf8", timeout: 3e3 }, (err, stdout) => {
|
|
529020
529064
|
if (err) {
|
|
529021
|
-
|
|
529022
|
-
|
|
529065
|
+
exec2("nvidia-smi --query-gpu=index,uuid,name,memory.total,memory.free,utilization.gpu --format=csv,noheader,nounits 2>/dev/null", { encoding: "utf8", timeout: 3e3 }, (err2, stdout2) => {
|
|
529066
|
+
if (err2) {
|
|
529067
|
+
_nvidiaSmiAvailable = false;
|
|
529068
|
+
resolve56([]);
|
|
529069
|
+
return;
|
|
529070
|
+
}
|
|
529071
|
+
_nvidiaSmiAvailable = true;
|
|
529072
|
+
resolve56(parseGpuQueryOutput(
|
|
529073
|
+
stdout2,
|
|
529074
|
+
/* hasComputeCap */
|
|
529075
|
+
false
|
|
529076
|
+
));
|
|
529077
|
+
});
|
|
529023
529078
|
return;
|
|
529024
529079
|
}
|
|
529025
529080
|
_nvidiaSmiAvailable = true;
|
|
529026
|
-
|
|
529027
|
-
|
|
529028
|
-
|
|
529029
|
-
|
|
529030
|
-
|
|
529031
|
-
const idx = Number(parts[0]);
|
|
529032
|
-
if (!Number.isFinite(idx))
|
|
529033
|
-
continue;
|
|
529034
|
-
gpus.push({
|
|
529035
|
-
index: idx,
|
|
529036
|
-
uuid: parts[1] ?? "",
|
|
529037
|
-
name: parts[2] ?? "",
|
|
529038
|
-
vramTotalMB: Number(parts[3]) || 0,
|
|
529039
|
-
vramFreeMB: Number(parts[4]) || 0,
|
|
529040
|
-
utilization: Number(parts[5]) || 0
|
|
529041
|
-
});
|
|
529042
|
-
}
|
|
529043
|
-
resolve56(gpus);
|
|
529081
|
+
resolve56(parseGpuQueryOutput(
|
|
529082
|
+
stdout,
|
|
529083
|
+
/* hasComputeCap */
|
|
529084
|
+
true
|
|
529085
|
+
));
|
|
529044
529086
|
});
|
|
529045
529087
|
});
|
|
529046
529088
|
}
|
|
529089
|
+
/**
 * Parse the CSV text produced by an `nvidia-smi --query-gpu=... --format=csv,noheader,nounits`
 * call into GPU records.
 *
 * Expected column order: index, uuid, name, memory.total, memory.free,
 * utilization.gpu and, when `hasComputeCap` is true, compute_cap.
 * Lines with too few columns or a non-numeric / blank index are skipped.
 * Non-numeric memory or utilization values fall back to 0. A blank or
 * non-numeric compute capability leaves `computeCapability` unset.
 *
 * @param {string} stdout - Raw command output; anything that is not a string yields `[]`.
 * @param {boolean} hasComputeCap - Whether a seventh `compute_cap` column was requested.
 * @returns {Array<{index: number, uuid: string, name: string, vramTotalMB: number, vramFreeMB: number, utilization: number, computeCapability?: number}>}
 */
function parseGpuQueryOutput(stdout, hasComputeCap) {
  if (typeof stdout !== "string")
    return [];
  const minFields = hasComputeCap ? 7 : 6;
  const gpus = [];
  for (const line of stdout.split("\n")) {
    const parts = line.split(",").map((s2) => s2.trim());
    if (parts.length < minFields)
      continue;
    // Number("") is 0, so a blank index must be rejected explicitly or it
    // would be reported as GPU 0.
    const idx = parts[0] === "" ? Number.NaN : Number(parts[0]);
    if (!Number.isFinite(idx))
      continue;
    const info = {
      index: idx,
      uuid: parts[1] ?? "",
      name: parts[2] ?? "",
      vramTotalMB: Number(parts[3]) || 0,
      vramFreeMB: Number(parts[4]) || 0,
      utilization: Number(parts[5]) || 0
    };
    // Same Number("") pitfall: a blank capability means "unknown", not 0.
    if (hasComputeCap && parts[6] !== "") {
      const cap = Number(parts[6]);
      if (Number.isFinite(cap))
        info.computeCapability = cap;
    }
    gpus.push(info);
  }
  return gpus;
}
|
|
529116
|
+
/**
 * Resolve the minimum GPU VRAM threshold.
 *
 * Reads `OMNIUS_OLLAMA_MIN_GPU_MB` from the environment and uses it when it
 * parses to a finite number greater than zero; any other value (unset, blank,
 * non-numeric, zero, negative) falls back to `DEFAULT_MIN_GPU_VRAM_MB`.
 *
 * @returns {number} The threshold to apply.
 */
function resolveMinGpuVramMB() {
  const override = Number(process.env["OMNIUS_OLLAMA_MIN_GPU_MB"]);
  if (!Number.isFinite(override) || override <= 0) {
    return DEFAULT_MIN_GPU_VRAM_MB;
  }
  return override;
}
|
|
529120
|
+
/**
 * Resolve the minimum GPU compute capability threshold.
 *
 * Reads `OMNIUS_OLLAMA_MIN_COMPUTE_CAP` from the environment and uses it when
 * it parses to a finite number greater than zero; any other value falls back
 * to `DEFAULT_MIN_COMPUTE_CAPABILITY`.
 *
 * @returns {number} The threshold to apply.
 */
function resolveMinComputeCapability() {
  const override = Number(process.env["OMNIUS_OLLAMA_MIN_COMPUTE_CAP"]);
  if (!Number.isFinite(override) || override <= 0) {
    return DEFAULT_MIN_COMPUTE_CAPABILITY;
  }
  return override;
}
|
|
529124
|
+
/**
 * Decide whether a GPU record meets the VRAM and compute-capability floors.
 *
 * @param {{vramTotalMB: number, computeCapability?: number}} gpu - GPU record to check.
 * @param {{minVramMB?: number, minComputeCap?: number}} [thresholds] - Explicit
 *   floors; a missing (null/undefined) entry is resolved through
 *   `resolveMinGpuVramMB()` / `resolveMinComputeCapability()`.
 * @returns {boolean} false when total VRAM is below the floor, or when a
 *   compute capability is present and below the floor; true otherwise. A GPU
 *   with no `computeCapability` is only judged on VRAM.
 */
function isCapableForLLM(gpu, thresholds = {}) {
  const vramFloorMB = thresholds.minVramMB ?? resolveMinGpuVramMB();
  const computeCapFloor = thresholds.minComputeCap ?? resolveMinComputeCapability();
  const tooLittleVram = gpu.vramTotalMB < vramFloorMB;
  const capabilityKnown = gpu.computeCapability !== undefined;
  const capabilityTooLow = capabilityKnown && gpu.computeCapability < computeCapFloor;
  return !(tooLittleVram || capabilityTooLow);
}
|
|
529133
|
+
/**
 * Keep only the GPUs that `isCapableForLLM` accepts.
 *
 * @param {Array<object>} gpus - GPU records to screen.
 * @param {{minVramMB?: number, minComputeCap?: number}} [thresholds] - Passed
 *   through unchanged to `isCapableForLLM` for every GPU.
 * @returns {Array<object>} A new array holding the accepted records in their
 *   original order.
 */
function filterCapableGpus(gpus, thresholds) {
  const meetsThresholds = (gpu) => isCapableForLLM(gpu, thresholds);
  return gpus.filter(meetsThresholds);
}
|
|
529136
|
+
/**
 * Map an amount of free VRAM (in MB) to a recommended parallel-slot count.
 *
 * Ladder: at least 60 GiB → 8, at least 40 GiB → 4, at least 24 GiB → 2,
 * anything else (including NaN) → 1.
 *
 * @param {number} minFreeMB - Free VRAM in MB.
 * @returns {number} Recommended number of parallel slots (1, 2, 4 or 8).
 */
function recommendMaxParallelFromVram(minFreeMB) {
  const MB_PER_GIB = 1024;
  // [minimum free GiB, slots], ordered from the largest tier down.
  const tiers = [
    [60, 8],
    [40, 4],
    [24, 2]
  ];
  for (const [minGiB, slots] of tiers) {
    if (minFreeMB >= minGiB * MB_PER_GIB) {
      return slots;
    }
  }
  return 1;
}
|
|
529047
529145
|
async function getHardwareSnapshot() {
|
|
529048
529146
|
const { totalmem: totalmem8, freemem: freemem7, cpus: cpus5 } = await import("node:os");
|
|
529049
529147
|
const gpus = await detectGpus();
|
|
@@ -529127,10 +529225,15 @@ async function findFreePort(start2) {
|
|
|
529127
529225
|
}
|
|
529128
529226
|
function resolveDefaultPoolConfig() {
|
|
529129
529227
|
const baseInstanceUrl = process.env["OMNIUS_OLLAMA_BASE_URL"] || process.env["OLLAMA_HOST"]?.replace(/^([^:/]+:[0-9]+)$/, "http://$1") || "http://127.0.0.1:11434";
|
|
529130
|
-
const
|
|
529131
|
-
const
|
|
529228
|
+
const maxParallelExplicit = process.env["OMNIUS_OLLAMA_MAX_PARALLEL"] !== void 0;
|
|
529229
|
+
const maxParallelPerInstance = maxParallelExplicit ? Number(process.env["OMNIUS_OLLAMA_MAX_PARALLEL"]) || 1 : 1;
|
|
529230
|
+
const autoTuneMaxParallel = !maxParallelExplicit;
|
|
529231
|
+
const gpuPlacementExplicit = process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] !== void 0;
|
|
529232
|
+
const maxInstancesExplicit = process.env["OMNIUS_OLLAMA_MAX_INSTANCES"] !== void 0;
|
|
529233
|
+
const peerPoolActive = !gpuPlacementExplicit && !maxInstancesExplicit && detectPeerOmniusOllamaPool();
|
|
529234
|
+
const maxSpawnedInstances = maxInstancesExplicit ? Number(process.env["OMNIUS_OLLAMA_MAX_INSTANCES"]) || 0 : peerPoolActive ? 1 : 0;
|
|
529132
529235
|
const targetGpuInstances = Number(process.env["OMNIUS_OLLAMA_TARGET_GPU_INSTANCES"]) || 0;
|
|
529133
|
-
const gpuPlacementRaw = (process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] ?? "auto").toLowerCase();
|
|
529236
|
+
const gpuPlacementRaw = (process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] ?? (peerPoolActive ? "elastic" : "auto")).toLowerCase();
|
|
529134
529237
|
const gpuPlacement = gpuPlacementRaw === "dedicated" || gpuPlacementRaw === "elastic" || gpuPlacementRaw === "auto" ? gpuPlacementRaw : "auto";
|
|
529135
529238
|
const idleMs = Number(process.env["OMNIUS_OLLAMA_IDLE_MS"]) || 3 * 60 * 60 * 1e3;
|
|
529136
529239
|
const reaperIntervalMs = Number(process.env["OMNIUS_OLLAMA_REAPER_MS"]) || 3e4;
|
|
@@ -529155,7 +529258,8 @@ function resolveDefaultPoolConfig() {
|
|
|
529155
529258
|
ollamaBinary,
|
|
529156
529259
|
spawnReadyTimeoutSec,
|
|
529157
529260
|
networkRxBudgetBytesPerSec,
|
|
529158
|
-
networkTxBudgetBytesPerSec
|
|
529261
|
+
networkTxBudgetBytesPerSec,
|
|
529262
|
+
autoTuneMaxParallel
|
|
529159
529263
|
};
|
|
529160
529264
|
}
|
|
529161
529265
|
function parseNullableNumber(value2) {
|
|
@@ -529223,11 +529327,13 @@ function setOllamaPool(pool3) {
|
|
|
529223
529327
|
_poolByBaseUrl.set(pool3.statusConfig().baseInstanceUrl, pool3);
|
|
529224
529328
|
}
|
|
529225
529329
|
}
|
|
529226
|
-
var _nvidiaSmiAvailable, _lastNetworkSnapshot, OllamaInstance, realInstanceSpawner, _gpuCursor, OllamaPool, _poolSingleton, _poolByBaseUrl;
|
|
529330
|
+
var _nvidiaSmiAvailable, DEFAULT_MIN_GPU_VRAM_MB, DEFAULT_MIN_COMPUTE_CAPABILITY, _lastNetworkSnapshot, OllamaInstance, realInstanceSpawner, _gpuCursor, OllamaPool, _poolSingleton, _poolByBaseUrl;
|
|
529227
529331
|
var init_ollama_pool = __esm({
|
|
529228
529332
|
"packages/orchestrator/dist/ollama-pool.js"() {
|
|
529229
529333
|
"use strict";
|
|
529230
529334
|
_nvidiaSmiAvailable = null;
|
|
529335
|
+
DEFAULT_MIN_GPU_VRAM_MB = 16 * 1024;
|
|
529336
|
+
DEFAULT_MIN_COMPUTE_CAPABILITY = 7;
|
|
529231
529337
|
_lastNetworkSnapshot = null;
|
|
529232
529338
|
OllamaInstance = class {
|
|
529233
529339
|
state;
|
|
@@ -529281,6 +529387,7 @@ var init_ollama_pool = __esm({
|
|
|
529281
529387
|
env2["OLLAMA_MODELS"] = config.sharedModelStore;
|
|
529282
529388
|
}
|
|
529283
529389
|
env2["OLLAMA_NUM_PARALLEL"] = String(config.maxParallelPerInstance);
|
|
529390
|
+
env2["OLLAMA_KEEP_ALIVE"] = process.env["OMNIUS_OLLAMA_SPAWN_KEEP_ALIVE"] ?? "-1";
|
|
529284
529391
|
if (gpuUuid) {
|
|
529285
529392
|
env2["CUDA_VISIBLE_DEVICES"] = gpuUuid;
|
|
529286
529393
|
env2["GPU_DEVICE_ORDINAL"] = gpuIndex === null ? "" : String(gpuIndex);
|
|
@@ -529334,6 +529441,20 @@ var init_ollama_pool = __esm({
|
|
|
529334
529441
|
dedicatedGpuPoolActive = false;
|
|
529335
529442
|
activePlacementMode = "constrained";
|
|
529336
529443
|
gpuCache = null;
|
|
529444
|
+
/**
|
|
529445
|
+
* Set once after the first capability-filtered GPU detection. Prevents the
|
|
529446
|
+
* auto-tune from oscillating maxParallelPerInstance as free VRAM fluctuates
|
|
529447
|
+
* during normal inference.
|
|
529448
|
+
*/
|
|
529449
|
+
_autoTuned = false;
|
|
529450
|
+
/** UUIDs we've already emitted gpu-excluded for. Prevents log spam. */
|
|
529451
|
+
_excludedGpusReported = /* @__PURE__ */ new Set();
|
|
529452
|
+
/**
|
|
529453
|
+
* Cached model footprint in MiB (model name → estimated VRAM required).
|
|
529454
|
+
* Populated lazily via /api/show on the base instance. null sentinel means
|
|
529455
|
+
* "we tried but failed" so we don't re-probe in a tight loop.
|
|
529456
|
+
*/
|
|
529457
|
+
_modelVramEstimateMB = /* @__PURE__ */ new Map();
|
|
529337
529458
|
slotWaiters = [];
|
|
529338
529459
|
/**
|
|
529339
529460
|
* Agent → preferred instance id. Set whenever an acquire resolves an
|
|
@@ -529533,7 +529654,18 @@ var init_ollama_pool = __esm({
|
|
|
529533
529654
|
const freedPick = this.pickInstance({ model });
|
|
529534
529655
|
if (freedPick)
|
|
529535
529656
|
return freedPick;
|
|
529536
|
-
const
|
|
529657
|
+
const vramNeededMB = await this.estimateModelVramMB(model);
|
|
529658
|
+
const capable = this.gpusWithCapacityForModel(gpus, vramNeededMB);
|
|
529659
|
+
if (capable.length === 0 && vramNeededMB !== null) {
|
|
529660
|
+
this.emit("spawn-skipped", {
|
|
529661
|
+
reason: "insufficient-vram",
|
|
529662
|
+
model,
|
|
529663
|
+
vramNeededMB,
|
|
529664
|
+
gpuFreeMBs: gpus.map((g) => g.vramFreeMB)
|
|
529665
|
+
});
|
|
529666
|
+
return null;
|
|
529667
|
+
}
|
|
529668
|
+
const gpu = this.pickGpuForSpawn(capable.length > 0 ? capable : gpus);
|
|
529537
529669
|
return this.spawnInstance(model, gpu);
|
|
529538
529670
|
});
|
|
529539
529671
|
}
|
|
@@ -529544,8 +529676,20 @@ var init_ollama_pool = __esm({
|
|
|
529544
529676
|
if (!this.canSpawnWithSharedModelStore(model))
|
|
529545
529677
|
return;
|
|
529546
529678
|
const target = this.dedicatedTargetCount(gpus);
|
|
529679
|
+
const vramNeededMB = await this.estimateModelVramMB(model);
|
|
529547
529680
|
while (this.instances.filter((i2) => i2.state.poolOwned).length < target) {
|
|
529548
|
-
const
|
|
529681
|
+
const candidates = this.gpusWithCapacityForModel(gpus, vramNeededMB);
|
|
529682
|
+
const pool3 = candidates.length > 0 ? candidates : vramNeededMB === null ? gpus : [];
|
|
529683
|
+
if (pool3.length === 0) {
|
|
529684
|
+
this.emit("spawn-skipped", {
|
|
529685
|
+
reason: "insufficient-vram",
|
|
529686
|
+
model,
|
|
529687
|
+
vramNeededMB,
|
|
529688
|
+
gpuFreeMBs: gpus.map((g) => g.vramFreeMB)
|
|
529689
|
+
});
|
|
529690
|
+
break;
|
|
529691
|
+
}
|
|
529692
|
+
const gpu = this.pickGpuForSpawn(pool3);
|
|
529549
529693
|
if (!gpu)
|
|
529550
529694
|
break;
|
|
529551
529695
|
const inst = await this.spawnInstance(model, gpu);
|
|
@@ -529603,9 +529747,59 @@ var init_ollama_pool = __esm({
|
|
|
529603
529747
|
if (this.gpuCache && now - this.gpuCache.takenAtMs <= maxAgeMs) {
|
|
529604
529748
|
return this.gpuCache.gpus;
|
|
529605
529749
|
}
|
|
529606
|
-
const
|
|
529607
|
-
|
|
529608
|
-
|
|
529750
|
+
const rawGpus = await this.gpuDetector();
|
|
529751
|
+
const filtered = filterCapableGpus(rawGpus);
|
|
529752
|
+
const filteredUuids = new Set(filtered.map((g) => g.uuid));
|
|
529753
|
+
for (const g of rawGpus) {
|
|
529754
|
+
if (filteredUuids.has(g.uuid))
|
|
529755
|
+
continue;
|
|
529756
|
+
if (this._excludedGpusReported.has(g.uuid))
|
|
529757
|
+
continue;
|
|
529758
|
+
this._excludedGpusReported.add(g.uuid);
|
|
529759
|
+
const reason = g.vramTotalMB < resolveMinGpuVramMB() ? "insufficient-vram" : "insufficient-compute-capability";
|
|
529760
|
+
this.emit("gpu-excluded", {
|
|
529761
|
+
uuid: g.uuid,
|
|
529762
|
+
index: g.index,
|
|
529763
|
+
name: g.name,
|
|
529764
|
+
vramTotalMB: g.vramTotalMB,
|
|
529765
|
+
computeCapability: g.computeCapability,
|
|
529766
|
+
reason
|
|
529767
|
+
});
|
|
529768
|
+
}
|
|
529769
|
+
this.gpuCache = { gpus: filtered, takenAtMs: now };
|
|
529770
|
+
this.maybeAutoTuneMaxParallel(filtered);
|
|
529771
|
+
return filtered;
|
|
529772
|
+
}
|
|
529773
|
+
/**
|
|
529774
|
+
* One-shot: bump `maxParallelPerInstance` from the worst-case free VRAM
|
|
529775
|
+
* across capable GPUs the first time we see them. We never tune down (a
|
|
529776
|
+
* subsequent low-VRAM read shouldn't strip concurrency from in-flight
|
|
529777
|
+
* requests), and we never tune again once successful — the recommendation
|
|
529778
|
+
* ladder is stable enough that a single read at startup is correct.
|
|
529779
|
+
*/
|
|
529780
|
+
maybeAutoTuneMaxParallel(filtered) {
|
|
529781
|
+
if (!this.config.autoTuneMaxParallel)
|
|
529782
|
+
return;
|
|
529783
|
+
if (this._autoTuned)
|
|
529784
|
+
return;
|
|
529785
|
+
if (filtered.length === 0)
|
|
529786
|
+
return;
|
|
529787
|
+
const minFreeMB = filtered.reduce((m2, g) => Math.min(m2, g.vramFreeMB), Number.POSITIVE_INFINITY);
|
|
529788
|
+
const recommended = recommendMaxParallelFromVram(minFreeMB);
|
|
529789
|
+
if (recommended > this.config.maxParallelPerInstance) {
|
|
529790
|
+
const previous = this.config.maxParallelPerInstance;
|
|
529791
|
+
this.config.maxParallelPerInstance = recommended;
|
|
529792
|
+
for (const inst of this.instances) {
|
|
529793
|
+
inst.state.maxParallel = recommended;
|
|
529794
|
+
}
|
|
529795
|
+
this.emit("max-parallel-tuned", {
|
|
529796
|
+
previous,
|
|
529797
|
+
recommended,
|
|
529798
|
+
minFreeMB,
|
|
529799
|
+
capableGpuCount: filtered.length
|
|
529800
|
+
});
|
|
529801
|
+
}
|
|
529802
|
+
this._autoTuned = true;
|
|
529609
529803
|
}
|
|
529610
529804
|
async spawnInstance(model, gpu) {
|
|
529611
529805
|
let port;
|
|
@@ -529668,6 +529862,56 @@ var init_ollama_pool = __esm({
|
|
|
529668
529862
|
});
|
|
529669
529863
|
return inst;
|
|
529670
529864
|
}
|
|
529865
|
+
/**
|
|
529866
|
+
* Best-effort: estimate the VRAM (in MiB) a model needs to be served
|
|
529867
|
+
* without CPU spill. Hits the base instance's `/api/show` once per model
|
|
529868
|
+
* and caches the result. Returns null when the probe fails (the caller
|
|
529869
|
+
* then falls back to "no estimate" semantics — capacity check is skipped).
|
|
529870
|
+
*
|
|
529871
|
+
* The number returned is `disk_size * 1.15 + maxParallel * 1024` (1 GiB of
|
|
529872
|
+
* KV cache per parallel slot — conservative for 30B-class models). Newer
|
|
529873
|
+
* model families may exceed this margin slightly; bumps are safe via
|
|
529874
|
+
* OMNIUS_OLLAMA_VRAM_SAFETY_MARGIN.
|
|
529875
|
+
*/
|
|
529876
|
+
async estimateModelVramMB(model) {
|
|
529877
|
+
if (this._modelVramEstimateMB.has(model)) {
|
|
529878
|
+
return this._modelVramEstimateMB.get(model) ?? null;
|
|
529879
|
+
}
|
|
529880
|
+
let bytesOnDisk = null;
|
|
529881
|
+
try {
|
|
529882
|
+
const url = `${this.config.baseInstanceUrl.replace(/\/+$/, "")}/api/show`;
|
|
529883
|
+
const resp = await fetch(url, {
|
|
529884
|
+
method: "POST",
|
|
529885
|
+
headers: { "Content-Type": "application/json" },
|
|
529886
|
+
body: JSON.stringify({ name: model }),
|
|
529887
|
+
signal: AbortSignal.timeout(2e3)
|
|
529888
|
+
});
|
|
529889
|
+
if (resp.ok) {
|
|
529890
|
+
const data = await resp.json();
|
|
529891
|
+
if (typeof data.size === "number" && data.size > 0)
|
|
529892
|
+
bytesOnDisk = data.size;
|
|
529893
|
+
}
|
|
529894
|
+
} catch {
|
|
529895
|
+
}
|
|
529896
|
+
if (bytesOnDisk === null) {
|
|
529897
|
+
this._modelVramEstimateMB.set(model, null);
|
|
529898
|
+
return null;
|
|
529899
|
+
}
|
|
529900
|
+
const safetyMargin = Number(process.env["OMNIUS_OLLAMA_VRAM_SAFETY_MARGIN"]) || 1.15;
|
|
529901
|
+
const kvCacheMB = this.config.maxParallelPerInstance * 1024;
|
|
529902
|
+
const estimateMB = Math.ceil(bytesOnDisk / (1024 * 1024) * safetyMargin + kvCacheMB);
|
|
529903
|
+
this._modelVramEstimateMB.set(model, estimateMB);
|
|
529904
|
+
return estimateMB;
|
|
529905
|
+
}
|
|
529906
|
+
/**
|
|
529907
|
+
* Filter GPUs to those with enough free VRAM for the model. Caller decides
|
|
529908
|
+
* how to react to an empty list (skip spawn vs degrade to constrained).
|
|
529909
|
+
*/
|
|
529910
|
+
gpusWithCapacityForModel(gpus, vramNeededMB) {
|
|
529911
|
+
if (vramNeededMB === null)
|
|
529912
|
+
return gpus;
|
|
529913
|
+
return gpus.filter((g) => g.vramFreeMB >= vramNeededMB);
|
|
529914
|
+
}
|
|
529671
529915
|
/**
|
|
529672
529916
|
* Pick a GPU for a freshly-spawned instance. Prefers GPUs that no
|
|
529673
529917
|
* pool-owned instance is already pinned to, then most free VRAM. Returns
|
|
@@ -529704,6 +529948,28 @@ var init_ollama_pool = __esm({
|
|
|
529704
529948
|
survivors.push(inst);
|
|
529705
529949
|
continue;
|
|
529706
529950
|
}
|
|
529951
|
+
const PROBE_GRACE_MS = 3e4;
|
|
529952
|
+
if (inst.state.inflight === 0 && Date.now() - inst.state.spawnedAtMs > PROBE_GRACE_MS && await this.isPartialVramSpilled(inst)) {
|
|
529953
|
+
const reapedAtMs = Date.now();
|
|
529954
|
+
await inst.terminate();
|
|
529955
|
+
this.dropAffinityFor(inst.state.id);
|
|
529956
|
+
this.emit("instance-reaped", {
|
|
529957
|
+
id: inst.state.id,
|
|
529958
|
+
pid: inst.state.pid,
|
|
529959
|
+
reason: "partial-vram",
|
|
529960
|
+
totalRequests: inst.state.totalRequests,
|
|
529961
|
+
peakInflight: inst.state.peakInflight,
|
|
529962
|
+
ageMs: reapedAtMs - inst.state.spawnedAtMs,
|
|
529963
|
+
idleMs: reapedAtMs - inst.state.lastUsedMs,
|
|
529964
|
+
provenance: {
|
|
529965
|
+
entity: `urn:omnius:ollama-instance:${inst.state.id}`,
|
|
529966
|
+
activity: "ollama-instance-reap-partial-vram",
|
|
529967
|
+
agent: "orchestrator.ollama-pool",
|
|
529968
|
+
timestampMs: reapedAtMs
|
|
529969
|
+
}
|
|
529970
|
+
});
|
|
529971
|
+
continue;
|
|
529972
|
+
}
|
|
529707
529973
|
if (inst.isIdleLongerThan(this.config.idleMs)) {
|
|
529708
529974
|
const reapedAtMs = Date.now();
|
|
529709
529975
|
await inst.terminate();
|
|
@@ -529711,6 +529977,7 @@ var init_ollama_pool = __esm({
|
|
|
529711
529977
|
this.emit("instance-reaped", {
|
|
529712
529978
|
id: inst.state.id,
|
|
529713
529979
|
pid: inst.state.pid,
|
|
529980
|
+
reason: "idle",
|
|
529714
529981
|
totalRequests: inst.state.totalRequests,
|
|
529715
529982
|
peakInflight: inst.state.peakInflight,
|
|
529716
529983
|
ageMs: reapedAtMs - inst.state.spawnedAtMs,
|
|
@@ -529728,6 +529995,33 @@ var init_ollama_pool = __esm({
|
|
|
529728
529995
|
}
|
|
529729
529996
|
this.instances = survivors;
|
|
529730
529997
|
}
|
|
529998
|
+
/**
|
|
529999
|
+
* Probe `/api/ps` on the instance and return true if any resident model has
|
|
530000
|
+
* less than 95% of its weights in VRAM — the unmistakable CPU-offload
|
|
530001
|
+
* signature that produces 50× slowdowns. Defensive: any HTTP failure
|
|
530002
|
+
* returns false so a transient network blip never triggers a reap.
|
|
530003
|
+
*/
|
|
530004
|
+
async isPartialVramSpilled(inst) {
|
|
530005
|
+
const PARTIAL_VRAM_THRESHOLD = 0.95;
|
|
530006
|
+
try {
|
|
530007
|
+
const url = `${inst.state.baseUrl.replace(/\/+$/, "")}/api/ps`;
|
|
530008
|
+
const resp = await fetch(url, { signal: AbortSignal.timeout(2e3) });
|
|
530009
|
+
if (!resp.ok)
|
|
530010
|
+
return false;
|
|
530011
|
+
const data = await resp.json();
|
|
530012
|
+
if (!data.models || data.models.length === 0)
|
|
530013
|
+
return false;
|
|
530014
|
+
return data.models.some((m2) => {
|
|
530015
|
+
const total = m2.size ?? 0;
|
|
530016
|
+
const vram = m2.size_vram ?? 0;
|
|
530017
|
+
if (total <= 0)
|
|
530018
|
+
return false;
|
|
530019
|
+
return vram / total < PARTIAL_VRAM_THRESHOLD;
|
|
530020
|
+
});
|
|
530021
|
+
} catch {
|
|
530022
|
+
return false;
|
|
530023
|
+
}
|
|
530024
|
+
}
|
|
529731
530025
|
/** Stop the reaper and terminate every spawned instance. Call on process exit. */
|
|
529732
530026
|
async shutdown() {
|
|
529733
530027
|
if (this.reaperHandle) {
|
package/npm-shrinkwrap.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "omnius",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.147",
|
|
4
4
|
"lockfileVersion": 3,
|
|
5
5
|
"requires": true,
|
|
6
6
|
"packages": {
|
|
7
7
|
"": {
|
|
8
8
|
"name": "omnius",
|
|
9
|
-
"version": "1.0.
|
|
9
|
+
"version": "1.0.147",
|
|
10
10
|
"bundleDependencies": [
|
|
11
11
|
"image-to-ascii"
|
|
12
12
|
],
|
package/package.json
CHANGED