omnius 1.0.82 → 1.0.84
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +357 -78
- package/npm-shrinkwrap.json +2 -2
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -524968,8 +524968,11 @@ async function findFreePort(start2) {
|
|
|
524968
524968
|
}
|
|
524969
524969
|
function resolveDefaultPoolConfig() {
|
|
524970
524970
|
const baseInstanceUrl = process.env["OMNIUS_OLLAMA_BASE_URL"] || process.env["OLLAMA_HOST"]?.replace(/^([^:/]+:[0-9]+)$/, "http://$1") || "http://127.0.0.1:11434";
|
|
524971
|
-
const maxParallelPerInstance = Number(process.env["OMNIUS_OLLAMA_MAX_PARALLEL"]) ||
|
|
524971
|
+
const maxParallelPerInstance = Number(process.env["OMNIUS_OLLAMA_MAX_PARALLEL"]) || 1;
|
|
524972
524972
|
const maxSpawnedInstances = Number(process.env["OMNIUS_OLLAMA_MAX_INSTANCES"]) || 0;
|
|
524973
|
+
const targetGpuInstances = Number(process.env["OMNIUS_OLLAMA_TARGET_GPU_INSTANCES"]) || 0;
|
|
524974
|
+
const gpuPlacementRaw = (process.env["OMNIUS_OLLAMA_GPU_PLACEMENT"] ?? "auto").toLowerCase();
|
|
524975
|
+
const gpuPlacement = gpuPlacementRaw === "dedicated" || gpuPlacementRaw === "elastic" || gpuPlacementRaw === "auto" ? gpuPlacementRaw : "auto";
|
|
524973
524976
|
const idleMs = Number(process.env["OMNIUS_OLLAMA_IDLE_MS"]) || 5 * 60 * 1e3;
|
|
524974
524977
|
const reaperIntervalMs = Number(process.env["OMNIUS_OLLAMA_REAPER_MS"]) || 3e4;
|
|
524975
524978
|
const spawnPortStart = Number(process.env["OMNIUS_OLLAMA_SPAWN_PORT"]) || 11435;
|
|
@@ -524983,6 +524986,8 @@ function resolveDefaultPoolConfig() {
|
|
|
524983
524986
|
baseInstanceUrl: baseInstanceUrl.replace(/\/+$/, ""),
|
|
524984
524987
|
maxParallelPerInstance,
|
|
524985
524988
|
maxSpawnedInstances,
|
|
524989
|
+
targetGpuInstances,
|
|
524990
|
+
gpuPlacement,
|
|
524986
524991
|
idleMs,
|
|
524987
524992
|
reaperIntervalMs,
|
|
524988
524993
|
spawnPortStart,
|
|
@@ -525110,15 +525115,17 @@ var init_ollama_pool = __esm({
|
|
|
525110
525115
|
this.proc = null;
|
|
525111
525116
|
}
|
|
525112
525117
|
};
|
|
525113
|
-
realInstanceSpawner = async ({ port, gpuUuid, config }) => {
|
|
525118
|
+
realInstanceSpawner = async ({ port, gpuUuid, gpuIndex, config }) => {
|
|
525114
525119
|
const env2 = { ...process.env };
|
|
525115
525120
|
env2["OLLAMA_HOST"] = `127.0.0.1:${port}`;
|
|
525116
525121
|
if (config.sharedModelStore) {
|
|
525117
525122
|
env2["OLLAMA_MODELS"] = config.sharedModelStore;
|
|
525118
525123
|
}
|
|
525119
525124
|
env2["OLLAMA_NUM_PARALLEL"] = String(config.maxParallelPerInstance);
|
|
525120
|
-
if (gpuUuid)
|
|
525125
|
+
if (gpuUuid) {
|
|
525121
525126
|
env2["CUDA_VISIBLE_DEVICES"] = gpuUuid;
|
|
525127
|
+
env2["GPU_DEVICE_ORDINAL"] = gpuIndex === null ? "" : String(gpuIndex);
|
|
525128
|
+
}
|
|
525122
525129
|
const child = spawn21(config.ollamaBinary, ["serve"], {
|
|
525123
525130
|
env: env2,
|
|
525124
525131
|
stdio: ["ignore", "pipe", "pipe"],
|
|
@@ -525160,17 +525167,27 @@ var init_ollama_pool = __esm({
|
|
|
525160
525167
|
instances = [];
|
|
525161
525168
|
reaperHandle = null;
|
|
525162
525169
|
spawner;
|
|
525170
|
+
gpuDetector;
|
|
525171
|
+
portAllocator;
|
|
525163
525172
|
/** Serializes concurrent spawn requests so two callers don't both create instance N+1. */
|
|
525164
525173
|
spawnGate = Promise.resolve();
|
|
525174
|
+
/** True after dedicated mode has successfully started at least one pool-owned GPU runner. */
|
|
525175
|
+
dedicatedGpuPoolActive = false;
|
|
525176
|
+
activePlacementMode = "constrained";
|
|
525177
|
+
gpuCache = null;
|
|
525178
|
+
slotWaiters = [];
|
|
525165
525179
|
constructor(config, opts) {
|
|
525166
525180
|
super();
|
|
525167
525181
|
this.config = { ...resolveDefaultPoolConfig(), ...config };
|
|
525168
525182
|
this.spawner = opts?.spawner ?? realInstanceSpawner;
|
|
525183
|
+
this.gpuDetector = opts?.gpuDetector ?? detectGpus;
|
|
525184
|
+
this.portAllocator = opts?.portAllocator ?? findFreePort;
|
|
525169
525185
|
this.instances.push(new OllamaInstance({
|
|
525170
525186
|
id: "omnius-ollama-base",
|
|
525171
525187
|
baseUrl: this.config.baseInstanceUrl,
|
|
525172
525188
|
port: this.portFromUrl(this.config.baseInstanceUrl),
|
|
525173
525189
|
gpuUuid: null,
|
|
525190
|
+
gpuIndex: null,
|
|
525174
525191
|
poolOwned: false,
|
|
525175
525192
|
inflight: 0,
|
|
525176
525193
|
peakInflight: 0,
|
|
@@ -525191,32 +525208,42 @@ var init_ollama_pool = __esm({
|
|
|
525191
525208
|
* 2. Any instance with free slots (least-loaded first).
|
|
525192
525209
|
* 3. Spawn a new instance pinned to the least-utilized GPU, when the
|
|
525193
525210
|
* pool hasn't hit `maxSpawnedInstances`.
|
|
525194
|
-
* 4.
|
|
525195
|
-
* caller will block inside Ollama's internal queue rather than fail.
|
|
525211
|
+
* 4. Queue at the pool boundary when all allowed lanes are busy.
|
|
525196
525212
|
*/
|
|
525197
525213
|
async acquire(opts) {
|
|
525214
|
+
const gpus = await this.getGpusForPlacement();
|
|
525215
|
+
let placementMode = this.placementModeFor(gpus);
|
|
525216
|
+
this.activePlacementMode = placementMode;
|
|
525217
|
+
if (placementMode === "dedicated") {
|
|
525218
|
+
await this.ensureDedicatedGpuPool(opts.model, gpus);
|
|
525219
|
+
if (!this.instances.some((i2) => i2.state.poolOwned)) {
|
|
525220
|
+
placementMode = "constrained";
|
|
525221
|
+
this.activePlacementMode = placementMode;
|
|
525222
|
+
}
|
|
525223
|
+
}
|
|
525198
525224
|
const pick = this.pickInstance(opts);
|
|
525199
525225
|
if (pick) {
|
|
525200
525226
|
pick.acquire(opts.model);
|
|
525201
525227
|
return this.buildSlot(pick);
|
|
525202
525228
|
}
|
|
525203
|
-
|
|
525204
|
-
|
|
525229
|
+
if (placementMode === "constrained") {
|
|
525230
|
+
return this.acquireQueued(opts);
|
|
525231
|
+
}
|
|
525232
|
+
const spawned = placementMode === "elastic" ? await this.maybeSpawnInstance(opts.model) : null;
|
|
525233
|
+
if (spawned && !spawned.isSaturated()) {
|
|
525205
525234
|
spawned.acquire(opts.model);
|
|
525206
525235
|
return this.buildSlot(spawned);
|
|
525207
525236
|
}
|
|
525208
|
-
|
|
525209
|
-
fallback.acquire(opts.model);
|
|
525210
|
-
return this.buildSlot(fallback);
|
|
525237
|
+
return this.acquireQueued(opts);
|
|
525211
525238
|
}
|
|
525212
525239
|
/** Synchronous routing decision; returns the instance or null if every one is saturated. */
|
|
525213
525240
|
pickInstance(opts) {
|
|
525214
|
-
const candidates = this.instances.filter((inst) => !
|
|
525241
|
+
const candidates = this.instances.filter((inst) => !this.isEffectivelySaturated(inst) && !(this.activePlacementMode === "dedicated" && this.dedicatedGpuPoolActive && !inst.state.poolOwned && !opts.preferBaseInstance));
|
|
525215
525242
|
if (candidates.length === 0)
|
|
525216
525243
|
return null;
|
|
525217
525244
|
const scored = candidates.map((inst) => ({
|
|
525218
525245
|
inst,
|
|
525219
|
-
score: (inst.state.knownModels.has(opts.model) ? 100 : 0) + (opts.preferBaseInstance && !inst.state.poolOwned ? 25 : 0) +
|
|
525246
|
+
score: (inst.state.knownModels.has(opts.model) ? 100 : 0) + (opts.preferBaseInstance && !inst.state.poolOwned ? 25 : 0) + this.effectiveFreeSlots(inst) * 10 - inst.state.inflight
|
|
525220
525247
|
}));
|
|
525221
525248
|
scored.sort((a2, b) => b.score - a2.score);
|
|
525222
525249
|
return scored[0].inst;
|
|
@@ -525227,9 +525254,39 @@ var init_ollama_pool = __esm({
|
|
|
525227
525254
|
baseUrl: inst.state.baseUrl,
|
|
525228
525255
|
poolOwned: inst.state.poolOwned,
|
|
525229
525256
|
gpuUuid: inst.state.gpuUuid,
|
|
525230
|
-
|
|
525257
|
+
gpuIndex: inst.state.gpuIndex,
|
|
525258
|
+
release: (success) => {
|
|
525259
|
+
inst.release(success);
|
|
525260
|
+
this.wakeNextSlotWaiter();
|
|
525261
|
+
}
|
|
525231
525262
|
};
|
|
525232
525263
|
}
|
|
525264
|
+
async acquireQueued(opts) {
|
|
525265
|
+
for (; ; ) {
|
|
525266
|
+
const pick = this.pickInstance(opts);
|
|
525267
|
+
if (pick) {
|
|
525268
|
+
pick.acquire(opts.model);
|
|
525269
|
+
return this.buildSlot(pick);
|
|
525270
|
+
}
|
|
525271
|
+
await new Promise((resolve52) => this.slotWaiters.push(resolve52));
|
|
525272
|
+
}
|
|
525273
|
+
}
|
|
525274
|
+
wakeNextSlotWaiter() {
|
|
525275
|
+
const waiter = this.slotWaiters.shift();
|
|
525276
|
+
if (waiter)
|
|
525277
|
+
waiter();
|
|
525278
|
+
}
|
|
525279
|
+
effectiveMaxParallel(inst) {
|
|
525280
|
+
if (this.activePlacementMode === "constrained")
|
|
525281
|
+
return 1;
|
|
525282
|
+
return Math.max(1, inst.state.maxParallel);
|
|
525283
|
+
}
|
|
525284
|
+
isEffectivelySaturated(inst) {
|
|
525285
|
+
return inst.state.inflight >= this.effectiveMaxParallel(inst);
|
|
525286
|
+
}
|
|
525287
|
+
effectiveFreeSlots(inst) {
|
|
525288
|
+
return Math.max(0, this.effectiveMaxParallel(inst) - inst.state.inflight);
|
|
525289
|
+
}
|
|
525233
525290
|
/**
|
|
525234
525291
|
* Spawn a new instance pinned to a GPU when policy allows. Returns the
|
|
525235
525292
|
* spawned instance or null when:
|
|
@@ -525241,6 +525298,42 @@ var init_ollama_pool = __esm({
|
|
|
525241
525298
|
* over-allocate.
|
|
525242
525299
|
*/
|
|
525243
525300
|
async maybeSpawnInstance(model) {
|
|
525301
|
+
return this.withSpawnGate(async () => {
|
|
525302
|
+
if (!this.canSpawnWithSharedModelStore(model))
|
|
525303
|
+
return null;
|
|
525304
|
+
const poolOwnedCount = this.instances.filter((i2) => i2.state.poolOwned).length;
|
|
525305
|
+
const gpus = await this.getGpusForPlacement();
|
|
525306
|
+
const cap = this.elasticSpawnCap(gpus);
|
|
525307
|
+
if (poolOwnedCount >= cap)
|
|
525308
|
+
return null;
|
|
525309
|
+
const freedPick = this.pickInstance({ model });
|
|
525310
|
+
if (freedPick)
|
|
525311
|
+
return freedPick;
|
|
525312
|
+
const gpu = this.pickGpuForSpawn(gpus);
|
|
525313
|
+
return this.spawnInstance(model, gpu);
|
|
525314
|
+
});
|
|
525315
|
+
}
|
|
525316
|
+
async ensureDedicatedGpuPool(model, gpus) {
|
|
525317
|
+
if (this.placementModeFor(gpus) !== "dedicated")
|
|
525318
|
+
return;
|
|
525319
|
+
await this.withSpawnGate(async () => {
|
|
525320
|
+
if (!this.canSpawnWithSharedModelStore(model))
|
|
525321
|
+
return;
|
|
525322
|
+
const target = this.dedicatedTargetCount(gpus);
|
|
525323
|
+
while (this.instances.filter((i2) => i2.state.poolOwned).length < target) {
|
|
525324
|
+
const gpu = this.pickGpuForSpawn(gpus);
|
|
525325
|
+
if (!gpu)
|
|
525326
|
+
break;
|
|
525327
|
+
const inst = await this.spawnInstance(model, gpu);
|
|
525328
|
+
if (!inst)
|
|
525329
|
+
break;
|
|
525330
|
+
}
|
|
525331
|
+
if (this.instances.some((i2) => i2.state.poolOwned)) {
|
|
525332
|
+
this.dedicatedGpuPoolActive = true;
|
|
525333
|
+
}
|
|
525334
|
+
});
|
|
525335
|
+
}
|
|
525336
|
+
async withSpawnGate(fn) {
|
|
525244
525337
|
let resolveGate = () => {
|
|
525245
525338
|
};
|
|
525246
525339
|
const myTurn = new Promise((r2) => {
|
|
@@ -525250,55 +525343,90 @@ var init_ollama_pool = __esm({
|
|
|
525250
525343
|
this.spawnGate = myTurn;
|
|
525251
525344
|
await prev;
|
|
525252
525345
|
try {
|
|
525253
|
-
|
|
525254
|
-
this.emit("spawn-skipped", {
|
|
525255
|
-
reason: "missing-shared-model-store",
|
|
525256
|
-
model,
|
|
525257
|
-
baseInstanceUrl: this.config.baseInstanceUrl
|
|
525258
|
-
});
|
|
525259
|
-
return null;
|
|
525260
|
-
}
|
|
525261
|
-
const poolOwnedCount = this.instances.filter((i2) => i2.state.poolOwned).length;
|
|
525262
|
-
const gpus = await detectGpus();
|
|
525263
|
-
const cap = this.config.maxSpawnedInstances > 0 ? this.config.maxSpawnedInstances : Math.max(0, gpus.length - 1);
|
|
525264
|
-
if (poolOwnedCount >= cap)
|
|
525265
|
-
return null;
|
|
525266
|
-
const freedPick = this.pickInstance({ model });
|
|
525267
|
-
if (freedPick)
|
|
525268
|
-
return freedPick;
|
|
525269
|
-
const port = await findFreePort(this.config.spawnPortStart);
|
|
525270
|
-
const gpuUuid = this.pickGpuForSpawn(gpus);
|
|
525271
|
-
const { proc, ready } = await this.spawner({ port, gpuUuid, config: this.config });
|
|
525272
|
-
try {
|
|
525273
|
-
await ready;
|
|
525274
|
-
} catch (err) {
|
|
525275
|
-
try {
|
|
525276
|
-
proc.kill();
|
|
525277
|
-
} catch {
|
|
525278
|
-
}
|
|
525279
|
-
this.emit("spawn-failed", { port, gpuUuid, error: err });
|
|
525280
|
-
return null;
|
|
525281
|
-
}
|
|
525282
|
-
const inst = new OllamaInstance({
|
|
525283
|
-
id: `omnius-ollama-${port}`,
|
|
525284
|
-
baseUrl: `http://127.0.0.1:${port}`,
|
|
525285
|
-
port,
|
|
525286
|
-
gpuUuid,
|
|
525287
|
-
poolOwned: true,
|
|
525288
|
-
inflight: 0,
|
|
525289
|
-
peakInflight: 0,
|
|
525290
|
-
lastUsedMs: Date.now(),
|
|
525291
|
-
knownModels: /* @__PURE__ */ new Set(),
|
|
525292
|
-
maxParallel: this.config.maxParallelPerInstance,
|
|
525293
|
-
totalRequests: 0
|
|
525294
|
-
}, proc);
|
|
525295
|
-
this.instances.push(inst);
|
|
525296
|
-
this.emit("instance-spawned", { id: inst.state.id, port, gpuUuid });
|
|
525297
|
-
return inst;
|
|
525346
|
+
return await fn();
|
|
525298
525347
|
} finally {
|
|
525299
525348
|
resolveGate();
|
|
525300
525349
|
}
|
|
525301
525350
|
}
|
|
525351
|
+
canSpawnWithSharedModelStore(model) {
|
|
525352
|
+
if (this.config.sharedModelStore || this.config.allowUnsharedModelStore)
|
|
525353
|
+
return true;
|
|
525354
|
+
this.emit("spawn-skipped", {
|
|
525355
|
+
reason: "missing-shared-model-store",
|
|
525356
|
+
model,
|
|
525357
|
+
baseInstanceUrl: this.config.baseInstanceUrl
|
|
525358
|
+
});
|
|
525359
|
+
return false;
|
|
525360
|
+
}
|
|
525361
|
+
placementModeFor(gpus) {
|
|
525362
|
+
const canShareModelStore = Boolean(this.config.sharedModelStore) || this.config.allowUnsharedModelStore;
|
|
525363
|
+
if (!canShareModelStore || gpus.length < 2)
|
|
525364
|
+
return "constrained";
|
|
525365
|
+
if (this.config.gpuPlacement === "elastic")
|
|
525366
|
+
return "elastic";
|
|
525367
|
+
return "dedicated";
|
|
525368
|
+
}
|
|
525369
|
+
dedicatedTargetCount(gpus) {
|
|
525370
|
+
const requested = this.config.targetGpuInstances > 0 ? this.config.targetGpuInstances : gpus.length;
|
|
525371
|
+
const cappedByGpuCount = Math.min(requested, gpus.length);
|
|
525372
|
+
return this.config.maxSpawnedInstances > 0 ? Math.min(cappedByGpuCount, this.config.maxSpawnedInstances) : cappedByGpuCount;
|
|
525373
|
+
}
|
|
525374
|
+
elasticSpawnCap(gpus) {
|
|
525375
|
+
return this.config.maxSpawnedInstances > 0 ? this.config.maxSpawnedInstances : Math.max(0, gpus.length - 1);
|
|
525376
|
+
}
|
|
525377
|
+
async getGpusForPlacement(maxAgeMs = 3e3) {
|
|
525378
|
+
const now = Date.now();
|
|
525379
|
+
if (this.gpuCache && now - this.gpuCache.takenAtMs <= maxAgeMs) {
|
|
525380
|
+
return this.gpuCache.gpus;
|
|
525381
|
+
}
|
|
525382
|
+
const gpus = await this.gpuDetector();
|
|
525383
|
+
this.gpuCache = { gpus, takenAtMs: now };
|
|
525384
|
+
return gpus;
|
|
525385
|
+
}
|
|
525386
|
+
async spawnInstance(model, gpu) {
|
|
525387
|
+
let port;
|
|
525388
|
+
try {
|
|
525389
|
+
port = await this.portAllocator(this.config.spawnPortStart);
|
|
525390
|
+
} catch (err) {
|
|
525391
|
+
this.emit("spawn-failed", {
|
|
525392
|
+
reason: "port-allocation-failed",
|
|
525393
|
+
gpuUuid: gpu?.uuid ?? null,
|
|
525394
|
+
gpuIndex: gpu?.index ?? null,
|
|
525395
|
+
error: err
|
|
525396
|
+
});
|
|
525397
|
+
return null;
|
|
525398
|
+
}
|
|
525399
|
+
const gpuUuid = gpu?.uuid || null;
|
|
525400
|
+
const gpuIndex = gpu?.index ?? null;
|
|
525401
|
+
const { proc, ready } = await this.spawner({ port, gpuUuid, gpuIndex, config: this.config });
|
|
525402
|
+
try {
|
|
525403
|
+
await ready;
|
|
525404
|
+
} catch (err) {
|
|
525405
|
+
try {
|
|
525406
|
+
proc.kill();
|
|
525407
|
+
} catch {
|
|
525408
|
+
}
|
|
525409
|
+
this.emit("spawn-failed", { port, gpuUuid, gpuIndex, error: err });
|
|
525410
|
+
return null;
|
|
525411
|
+
}
|
|
525412
|
+
const inst = new OllamaInstance({
|
|
525413
|
+
id: `omnius-ollama-${port}`,
|
|
525414
|
+
baseUrl: `http://127.0.0.1:${port}`,
|
|
525415
|
+
port,
|
|
525416
|
+
gpuUuid,
|
|
525417
|
+
gpuIndex,
|
|
525418
|
+
poolOwned: true,
|
|
525419
|
+
inflight: 0,
|
|
525420
|
+
peakInflight: 0,
|
|
525421
|
+
lastUsedMs: Date.now(),
|
|
525422
|
+
knownModels: /* @__PURE__ */ new Set([model]),
|
|
525423
|
+
maxParallel: this.config.maxParallelPerInstance,
|
|
525424
|
+
totalRequests: 0
|
|
525425
|
+
}, proc);
|
|
525426
|
+
this.instances.push(inst);
|
|
525427
|
+
this.emit("instance-spawned", { id: inst.state.id, port, gpuUuid, gpuIndex });
|
|
525428
|
+
return inst;
|
|
525429
|
+
}
|
|
525302
525430
|
/**
|
|
525303
525431
|
* Pick a GPU for a freshly-spawned instance. Prefers GPUs that no
|
|
525304
525432
|
* pool-owned instance is already pinned to, then most free VRAM. Returns
|
|
@@ -525313,7 +525441,7 @@ var init_ollama_pool = __esm({
|
|
|
525313
525441
|
pool3.sort((a2, b) => b.vramFreeMB - a2.vramFreeMB);
|
|
525314
525442
|
const best = pool3[_gpuCursor % pool3.length];
|
|
525315
525443
|
_gpuCursor++;
|
|
525316
|
-
return best
|
|
525444
|
+
return best;
|
|
525317
525445
|
}
|
|
525318
525446
|
/**
|
|
525319
525447
|
* Periodically reap pool-owned instances that have been idle past the
|
|
@@ -525360,13 +525488,24 @@ var init_ollama_pool = __esm({
|
|
|
525360
525488
|
}
|
|
525361
525489
|
async status() {
|
|
525362
525490
|
const hardware = await getHardwareSnapshot();
|
|
525491
|
+
const placementGpus = this.gpuCache?.gpus ?? hardware.gpus;
|
|
525492
|
+
const placementMode = this.placementModeFor(placementGpus);
|
|
525493
|
+
const targetGpuInstances = placementMode === "dedicated" ? this.dedicatedTargetCount(placementGpus) : placementMode === "elastic" ? this.elasticSpawnCap(placementGpus) : 1;
|
|
525494
|
+
const readyGpuInstances = this.instances.filter((inst) => inst.state.poolOwned).length;
|
|
525363
525495
|
return {
|
|
525364
525496
|
config: this.config,
|
|
525497
|
+
placement: {
|
|
525498
|
+
mode: placementMode,
|
|
525499
|
+
targetGpuInstances,
|
|
525500
|
+
readyGpuInstances,
|
|
525501
|
+
sharedModelStore: this.config.sharedModelStore
|
|
525502
|
+
},
|
|
525365
525503
|
instances: this.instances.map((inst) => ({
|
|
525366
525504
|
id: inst.state.id,
|
|
525367
525505
|
baseUrl: inst.state.baseUrl,
|
|
525368
525506
|
poolOwned: inst.state.poolOwned,
|
|
525369
525507
|
gpuUuid: inst.state.gpuUuid,
|
|
525508
|
+
gpuIndex: inst.state.gpuIndex,
|
|
525370
525509
|
inflight: inst.state.inflight,
|
|
525371
525510
|
peakInflight: inst.state.peakInflight,
|
|
525372
525511
|
maxParallel: inst.state.maxParallel,
|
|
@@ -569697,29 +569836,60 @@ async function collectNetworkMetrics() {
|
|
|
569697
569836
|
return { rxBytesPerSec: 0, txBytesPerSec: 0 };
|
|
569698
569837
|
}
|
|
569699
569838
|
async function collectGpuMetrics() {
|
|
569700
|
-
const noGpu = {
|
|
569839
|
+
const noGpu = {
|
|
569840
|
+
available: false,
|
|
569841
|
+
count: 0,
|
|
569842
|
+
name: "",
|
|
569843
|
+
utilization: 0,
|
|
569844
|
+
vramUsedMB: 0,
|
|
569845
|
+
vramTotalMB: 0,
|
|
569846
|
+
vramUtilization: 0,
|
|
569847
|
+
devices: []
|
|
569848
|
+
};
|
|
569701
569849
|
if (_nvidiaSmiAvailable2 === false) return noGpu;
|
|
569702
569850
|
try {
|
|
569703
569851
|
const smi = await new Promise((resolve52, reject) => {
|
|
569704
569852
|
exec3(
|
|
569705
|
-
"nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total,name --format=csv,noheader,nounits 2>/dev/null",
|
|
569853
|
+
"nvidia-smi --query-gpu=index,uuid,utilization.gpu,memory.used,memory.total,name --format=csv,noheader,nounits 2>/dev/null",
|
|
569706
569854
|
{ encoding: "utf8", timeout: 3e3 },
|
|
569707
569855
|
(err, stdout) => err ? reject(err) : resolve52(stdout)
|
|
569708
569856
|
);
|
|
569709
569857
|
});
|
|
569710
569858
|
_nvidiaSmiAvailable2 = true;
|
|
569711
|
-
const
|
|
569712
|
-
|
|
569713
|
-
|
|
569714
|
-
|
|
569715
|
-
|
|
569859
|
+
const devices = [];
|
|
569860
|
+
for (const line of smi.trim().split("\n")) {
|
|
569861
|
+
if (!line.trim()) continue;
|
|
569862
|
+
const parts = line.split(",").map((s2) => s2.trim());
|
|
569863
|
+
const index = parseInt(parts[0] ?? "-1", 10);
|
|
569864
|
+
const utilization = parseInt(parts[2] ?? "0", 10) || 0;
|
|
569865
|
+
const vramUsed2 = parseInt(parts[3] ?? "0", 10) || 0;
|
|
569866
|
+
const vramTotal2 = parseInt(parts[4] ?? "0", 10) || 0;
|
|
569867
|
+
if (!Number.isFinite(index) || index < 0) continue;
|
|
569868
|
+
devices.push({
|
|
569869
|
+
index,
|
|
569870
|
+
uuid: parts[1] ?? "",
|
|
569871
|
+
utilization,
|
|
569872
|
+
vramUsedMB: vramUsed2,
|
|
569873
|
+
vramTotalMB: vramTotal2,
|
|
569874
|
+
name: parts.slice(5).join(", ") || "",
|
|
569875
|
+
vramUtilization: vramTotal2 > 0 ? Math.round(vramUsed2 / vramTotal2 * 100) : 0
|
|
569876
|
+
});
|
|
569877
|
+
}
|
|
569878
|
+
if (devices.length === 0) return noGpu;
|
|
569879
|
+
const vramUsed = devices.reduce((sum, gpu) => sum + gpu.vramUsedMB, 0);
|
|
569880
|
+
const vramTotal = devices.reduce((sum, gpu) => sum + gpu.vramTotalMB, 0);
|
|
569881
|
+
const avgUtil = Math.round(devices.reduce((sum, gpu) => sum + gpu.utilization, 0) / devices.length);
|
|
569882
|
+
const firstName = devices[0]?.name ?? "";
|
|
569883
|
+
const allSameName = devices.every((gpu) => gpu.name === firstName);
|
|
569716
569884
|
return {
|
|
569717
569885
|
available: true,
|
|
569718
|
-
|
|
569886
|
+
count: devices.length,
|
|
569887
|
+
utilization: avgUtil,
|
|
569719
569888
|
vramUsedMB: vramUsed,
|
|
569720
569889
|
vramTotalMB: vramTotal,
|
|
569721
|
-
name:
|
|
569722
|
-
vramUtilization: vramTotal > 0 ? Math.round(vramUsed / vramTotal * 100) : 0
|
|
569890
|
+
name: devices.length > 1 && allSameName ? `${devices.length}x ${firstName}` : firstName,
|
|
569891
|
+
vramUtilization: vramTotal > 0 ? Math.round(vramUsed / vramTotal * 100) : 0,
|
|
569892
|
+
devices
|
|
569723
569893
|
};
|
|
569724
569894
|
} catch {
|
|
569725
569895
|
_nvidiaSmiAvailable2 = false;
|
|
@@ -569736,7 +569906,9 @@ function getInstantSnapshot() {
|
|
|
569736
569906
|
cpuCores: cr.cpuCores,
|
|
569737
569907
|
cpuModel: cr.cpuModel,
|
|
569738
569908
|
gpuUtil: -1,
|
|
569909
|
+
gpuCount: 0,
|
|
569739
569910
|
gpuName: "",
|
|
569911
|
+
gpuDevices: [],
|
|
569740
569912
|
vramUtil: -1,
|
|
569741
569913
|
vramUsedMB: 0,
|
|
569742
569914
|
vramTotalMB: 0,
|
|
@@ -569794,10 +569966,11 @@ function collectCpuRam() {
|
|
|
569794
569966
|
}
|
|
569795
569967
|
async function collectLocalMetrics() {
|
|
569796
569968
|
const cpuRam = collectCpuRam();
|
|
569797
|
-
const [gpu, disk, network] = await Promise.all([
|
|
569969
|
+
const [gpu, disk, network, ollamaPool] = await Promise.all([
|
|
569798
569970
|
collectGpuMetrics(),
|
|
569799
569971
|
collectDiskMetrics(),
|
|
569800
|
-
collectNetworkMetrics()
|
|
569972
|
+
collectNetworkMetrics(),
|
|
569973
|
+
collectOllamaPoolMetrics()
|
|
569801
569974
|
]);
|
|
569802
569975
|
return {
|
|
569803
569976
|
source: "local",
|
|
@@ -569806,7 +569979,9 @@ async function collectLocalMetrics() {
|
|
|
569806
569979
|
cpuCores: cpuRam.cpuCores,
|
|
569807
569980
|
cpuModel: cpuRam.cpuModel,
|
|
569808
569981
|
gpuUtil: gpu.available ? gpu.utilization : -1,
|
|
569982
|
+
gpuCount: gpu.count,
|
|
569809
569983
|
gpuName: gpu.name,
|
|
569984
|
+
gpuDevices: gpu.devices,
|
|
569810
569985
|
vramUtil: gpu.available ? gpu.vramUtilization : -1,
|
|
569811
569986
|
vramUsedMB: gpu.vramUsedMB,
|
|
569812
569987
|
vramTotalMB: gpu.vramTotalMB,
|
|
@@ -569817,15 +569992,43 @@ async function collectLocalMetrics() {
|
|
|
569817
569992
|
diskUsedGB: disk.usedGB,
|
|
569818
569993
|
diskTotalGB: disk.totalGB,
|
|
569819
569994
|
diskFreeGB: disk.freeGB,
|
|
569820
|
-
diskPath: disk.path
|
|
569995
|
+
diskPath: disk.path,
|
|
569996
|
+
ollamaPool
|
|
569821
569997
|
},
|
|
569822
569998
|
network
|
|
569823
569999
|
};
|
|
569824
570000
|
}
|
|
570001
|
+
async function collectOllamaPoolMetrics() {
|
|
570002
|
+
try {
|
|
570003
|
+
const config = resolveDefaultPoolConfig();
|
|
570004
|
+
if (!shouldUseOllamaPoolForBaseUrl(config.baseInstanceUrl)) return null;
|
|
570005
|
+
const status = await getOllamaPool({ baseInstanceUrl: config.baseInstanceUrl }).status();
|
|
570006
|
+
return {
|
|
570007
|
+
enabled: true,
|
|
570008
|
+
mode: status.placement.mode,
|
|
570009
|
+
targetGpuInstances: status.placement.targetGpuInstances,
|
|
570010
|
+
readyGpuInstances: status.placement.readyGpuInstances,
|
|
570011
|
+
sharedModelStore: status.placement.sharedModelStore,
|
|
570012
|
+
instances: status.instances.map((inst) => ({
|
|
570013
|
+
id: inst.id,
|
|
570014
|
+
baseUrl: inst.baseUrl,
|
|
570015
|
+
poolOwned: inst.poolOwned,
|
|
570016
|
+
gpuUuid: inst.gpuUuid,
|
|
570017
|
+
gpuIndex: inst.gpuIndex,
|
|
570018
|
+
inflight: inst.inflight,
|
|
570019
|
+
maxParallel: inst.maxParallel,
|
|
570020
|
+
totalRequests: inst.totalRequests
|
|
570021
|
+
}))
|
|
570022
|
+
};
|
|
570023
|
+
} catch {
|
|
570024
|
+
return null;
|
|
570025
|
+
}
|
|
570026
|
+
}
|
|
569825
570027
|
var _lastNetSnapshot, _nvidiaSmiAvailable2, _cpuPrevSnapshot, SystemMetricsCollector;
|
|
569826
570028
|
var init_system_metrics = __esm({
|
|
569827
570029
|
"packages/cli/src/tui/system-metrics.ts"() {
|
|
569828
570030
|
"use strict";
|
|
570031
|
+
init_dist8();
|
|
569829
570032
|
init_disk_monitor();
|
|
569830
570033
|
_lastNetSnapshot = null;
|
|
569831
570034
|
_nvidiaSmiAvailable2 = null;
|
|
@@ -569881,7 +570084,9 @@ var init_system_metrics = __esm({
|
|
|
569881
570084
|
cpuCores: hw.cpuCores ?? 0,
|
|
569882
570085
|
cpuModel: hw.cpuModel ?? "",
|
|
569883
570086
|
gpuUtil: hw.gpuUtil ?? -1,
|
|
570087
|
+
gpuCount: hw.gpuCount ?? 0,
|
|
569884
570088
|
gpuName: hw.gpuName ?? "",
|
|
570089
|
+
gpuDevices: hw.gpuDevices ?? [],
|
|
569885
570090
|
vramUtil: hw.vramUtil ?? -1,
|
|
569886
570091
|
vramUsedMB: hw.vramUsedMB ?? 0,
|
|
569887
570092
|
vramTotalMB: hw.vramTotalMB ?? 0,
|
|
@@ -569892,7 +570097,8 @@ var init_system_metrics = __esm({
|
|
|
569892
570097
|
diskUsedGB: hw.diskUsedGB ?? 0,
|
|
569893
570098
|
diskTotalGB: hw.diskTotalGB ?? 0,
|
|
569894
570099
|
diskFreeGB: hw.diskFreeGB ?? 0,
|
|
569895
|
-
diskPath: hw.diskPath ?? ""
|
|
570100
|
+
diskPath: hw.diskPath ?? "",
|
|
570101
|
+
ollamaPool: hw.ollamaPool ?? null
|
|
569896
570102
|
};
|
|
569897
570103
|
this._latest = {
|
|
569898
570104
|
source: "remote",
|
|
@@ -573751,6 +573957,19 @@ ${CONTENT_BG_SEQ}`);
|
|
|
573751
573957
|
hwExpW += 6 + `${rm4.vramUtil}%`.length + vramDetail.length;
|
|
573752
573958
|
hwCompW += 6 + `${rm4.vramUtil}%`.length;
|
|
573753
573959
|
}
|
|
573960
|
+
if (rm4.ollamaPool?.enabled) {
|
|
573961
|
+
const pool3 = rm4.ollamaPool;
|
|
573962
|
+
const ready = pool3.readyGpuInstances;
|
|
573963
|
+
const target = pool3.targetGpuInstances;
|
|
573964
|
+
const poolColor = pool3.mode === "constrained" ? c3.yellow : target > 0 && ready < target ? c3.yellow : c3.green;
|
|
573965
|
+
const poolDetail = pool3.mode === "constrained" ? "queue" : `${ready}/${target}`;
|
|
573966
|
+
const poolText = ` OLLAMA ${poolColor(`${pool3.mode}:${poolDetail}`)}`;
|
|
573967
|
+
const compactText = ` OLLAMA ${poolColor(pool3.mode === "constrained" ? "queue" : `${ready}/${target}`)}`;
|
|
573968
|
+
hwExpStr += poolText;
|
|
573969
|
+
hwCompStr += compactText;
|
|
573970
|
+
hwExpW += 8 + `${pool3.mode}:${poolDetail}`.length;
|
|
573971
|
+
hwCompW += 8 + (pool3.mode === "constrained" ? "queue".length : `${ready}/${target}`.length);
|
|
573972
|
+
}
|
|
573754
573973
|
if (!isLocal && hwExpW === 0) {
|
|
573755
573974
|
const statusMsg = rm4.gpuName && rm4.gpuName !== "peer" ? rm4.gpuName : "awaiting metrics...";
|
|
573756
573975
|
hwExpStr = c3.dim(statusMsg);
|
|
@@ -610459,9 +610678,9 @@ function telegramDecisionRecoverableFlag(text) {
|
|
|
610459
610678
|
}
|
|
610460
610679
|
return void 0;
|
|
610461
610680
|
}
|
|
610462
|
-
function telegramRouterTimeoutMs(configTimeoutMs, minMs =
|
|
610681
|
+
function telegramRouterTimeoutMs(configTimeoutMs, minMs = 12e4, _legacyMaxMs) {
|
|
610463
610682
|
const configured = Number.isFinite(configTimeoutMs) && (configTimeoutMs ?? 0) > 0 ? configTimeoutMs : 3e5;
|
|
610464
|
-
return Math.max(configured, minMs);
|
|
610683
|
+
return Math.max(configured, minMs, 12e4);
|
|
610465
610684
|
}
|
|
610466
610685
|
function parseTelegramInteractionDecision(text, forcedRoute, options2 = {}) {
|
|
610467
610686
|
for (const jsonText of telegramDecisionJsonCandidates(text)) {
|
|
@@ -617135,7 +617354,7 @@ ${conversationStream}`
|
|
|
617135
617354
|
tools: [],
|
|
617136
617355
|
temperature: 0.4,
|
|
617137
617356
|
maxTokens: 700,
|
|
617138
|
-
timeoutMs: Math.max(config.timeoutMs ?? 3e5,
|
|
617357
|
+
timeoutMs: Math.max(config.timeoutMs ?? 3e5, 12e4),
|
|
617139
617358
|
think: false
|
|
617140
617359
|
};
|
|
617141
617360
|
let accumulated = "";
|
|
@@ -627073,17 +627292,50 @@ async function handleAimsResources(ctx3) {
|
|
|
627073
627292
|
try {
|
|
627074
627293
|
const os9 = __require("node:os");
|
|
627075
627294
|
const config = loadConfig();
|
|
627295
|
+
let ollamaPool = null;
|
|
627296
|
+
let hardware = null;
|
|
627297
|
+
try {
|
|
627298
|
+
const {
|
|
627299
|
+
getHardwareSnapshot: getHardwareSnapshot2,
|
|
627300
|
+
getOllamaPool: getOllamaPool2,
|
|
627301
|
+
resolveDefaultPoolConfig: resolveDefaultPoolConfig2,
|
|
627302
|
+
shouldUseOllamaPoolForBaseUrl: shouldUseOllamaPoolForBaseUrl2
|
|
627303
|
+
} = await Promise.resolve().then(() => (init_dist8(), dist_exports3));
|
|
627304
|
+
hardware = await getHardwareSnapshot2();
|
|
627305
|
+
const poolConfig = resolveDefaultPoolConfig2();
|
|
627306
|
+
if (shouldUseOllamaPoolForBaseUrl2(poolConfig.baseInstanceUrl)) {
|
|
627307
|
+
const status = await getOllamaPool2({ baseInstanceUrl: poolConfig.baseInstanceUrl }).status();
|
|
627308
|
+
ollamaPool = {
|
|
627309
|
+
placement: status.placement,
|
|
627310
|
+
instances: status.instances.map((inst) => ({
|
|
627311
|
+
id: inst.id,
|
|
627312
|
+
base_url: inst.baseUrl,
|
|
627313
|
+
pool_owned: inst.poolOwned,
|
|
627314
|
+
gpu_uuid: inst.gpuUuid,
|
|
627315
|
+
gpu_index: inst.gpuIndex,
|
|
627316
|
+
inflight: inst.inflight,
|
|
627317
|
+
max_parallel: inst.maxParallel,
|
|
627318
|
+
total_requests: inst.totalRequests
|
|
627319
|
+
}))
|
|
627320
|
+
};
|
|
627321
|
+
}
|
|
627322
|
+
} catch {
|
|
627323
|
+
hardware = null;
|
|
627324
|
+
ollamaPool = null;
|
|
627325
|
+
}
|
|
627076
627326
|
sendJson(res, 200, {
|
|
627077
627327
|
compute: {
|
|
627078
627328
|
cpu: os9.cpus()[0]?.model ?? "unknown",
|
|
627079
627329
|
cores: os9.cpus().length,
|
|
627080
627330
|
ram_gb: Math.round(os9.totalmem() / 1024 ** 3),
|
|
627081
|
-
platform: process.platform
|
|
627331
|
+
platform: process.platform,
|
|
627332
|
+
hardware
|
|
627082
627333
|
},
|
|
627083
627334
|
backend: {
|
|
627084
627335
|
type: config.backendType,
|
|
627085
627336
|
url: config.backendUrl,
|
|
627086
|
-
model: config.model
|
|
627337
|
+
model: config.model,
|
|
627338
|
+
ollama_pool: ollamaPool
|
|
627087
627339
|
},
|
|
627088
627340
|
"aims:control": "A.4"
|
|
627089
627341
|
});
|
|
@@ -641443,6 +641695,32 @@ async function handleRequest(req2, res, ollamaUrl, verbose) {
|
|
|
641443
641695
|
}
|
|
641444
641696
|
} catch {
|
|
641445
641697
|
}
|
|
641698
|
+
let ollamaPool = null;
|
|
641699
|
+
try {
|
|
641700
|
+
const {
|
|
641701
|
+
getOllamaPool: getOllamaPool2,
|
|
641702
|
+
resolveDefaultPoolConfig: resolveDefaultPoolConfig2,
|
|
641703
|
+
shouldUseOllamaPoolForBaseUrl: shouldUseOllamaPoolForBaseUrl2
|
|
641704
|
+
} = await Promise.resolve().then(() => (init_dist8(), dist_exports3));
|
|
641705
|
+
const poolConfig = resolveDefaultPoolConfig2();
|
|
641706
|
+
if (shouldUseOllamaPoolForBaseUrl2(poolConfig.baseInstanceUrl)) {
|
|
641707
|
+
const status2 = await getOllamaPool2({ baseInstanceUrl: poolConfig.baseInstanceUrl }).status();
|
|
641708
|
+
ollamaPool = {
|
|
641709
|
+
placement: status2.placement,
|
|
641710
|
+
instances: status2.instances.map((inst) => ({
|
|
641711
|
+
id: inst.id,
|
|
641712
|
+
base_url: inst.baseUrl,
|
|
641713
|
+
pool_owned: inst.poolOwned,
|
|
641714
|
+
gpu_uuid: inst.gpuUuid,
|
|
641715
|
+
gpu_index: inst.gpuIndex,
|
|
641716
|
+
inflight: inst.inflight,
|
|
641717
|
+
max_parallel: inst.maxParallel,
|
|
641718
|
+
total_requests: inst.totalRequests
|
|
641719
|
+
}))
|
|
641720
|
+
};
|
|
641721
|
+
}
|
|
641722
|
+
} catch {
|
|
641723
|
+
}
|
|
641446
641724
|
let latestVersion = null;
|
|
641447
641725
|
try {
|
|
641448
641726
|
const ver = es("npm view omnius version 2>/dev/null", { encoding: "utf8", timeout: 5e3, stdio: "pipe" }).trim();
|
|
@@ -641452,6 +641730,7 @@ async function handleRequest(req2, res, ollamaUrl, verbose) {
|
|
|
641452
641730
|
jsonResponse(res, 200, {
|
|
641453
641731
|
gpu: gpus,
|
|
641454
641732
|
gpu_utilization: gpuUtil,
|
|
641733
|
+
ollama_pool: ollamaPool,
|
|
641455
641734
|
total_vram_gb: totalVram,
|
|
641456
641735
|
ram_gb: Math.round(totalMem / 1024 ** 3),
|
|
641457
641736
|
ram_used_pct: ramUsedPct,
|
package/npm-shrinkwrap.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "omnius",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.84",
|
|
4
4
|
"lockfileVersion": 3,
|
|
5
5
|
"requires": true,
|
|
6
6
|
"packages": {
|
|
7
7
|
"": {
|
|
8
8
|
"name": "omnius",
|
|
9
|
-
"version": "1.0.
|
|
9
|
+
"version": "1.0.84",
|
|
10
10
|
"bundleDependencies": [
|
|
11
11
|
"image-to-ascii"
|
|
12
12
|
],
|
package/package.json
CHANGED