omnius 1.0.136 → 1.0.137

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1337,29 +1337,36 @@ function ramSnapshotMB() {
1337
1337
  const free = Math.round(freemem() / (1024 * 1024));
1338
1338
  return { total, free, used: total - free };
1339
1339
  }
1340
- async function vramSnapshotMB() {
1340
+ async function vramSnapshotPerDevice() {
1341
1341
  if (_nvSmiAvailable === false)
1342
- return null;
1342
+ return [];
1343
1343
  try {
1344
1344
  const out = await new Promise((resolve55, reject) => {
1345
- exec("nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv,noheader,nounits 2>/dev/null", { encoding: "utf8", timeout: 3e3 }, (err, stdout) => err ? reject(err) : resolve55(stdout));
1345
+ exec("nvidia-smi --query-gpu=index,uuid,memory.total,memory.used,memory.free --format=csv,noheader,nounits 2>/dev/null", { encoding: "utf8", timeout: 3e3 }, (err, stdout) => err ? reject(err) : resolve55(stdout));
1346
1346
  });
1347
1347
  _nvSmiAvailable = true;
1348
- let total = 0, used = 0, free = 0;
1348
+ const devices = [];
1349
1349
  for (const line of out.trim().split("\n")) {
1350
+ if (!line.trim())
1351
+ continue;
1350
1352
  const parts = line.split(",").map((s2) => s2.trim());
1351
- if (parts.length < 3)
1353
+ if (parts.length < 5)
1354
+ continue;
1355
+ const index = parseInt(parts[0] ?? "-1", 10);
1356
+ if (!Number.isFinite(index) || index < 0)
1352
1357
  continue;
1353
- total += parseInt(parts[0] ?? "0", 10) || 0;
1354
- used += parseInt(parts[1] ?? "0", 10) || 0;
1355
- free += parseInt(parts[2] ?? "0", 10) || 0;
1358
+ devices.push({
1359
+ index,
1360
+ uuid: parts[1] ?? "",
1361
+ total: parseInt(parts[2] ?? "0", 10) || 0,
1362
+ used: parseInt(parts[3] ?? "0", 10) || 0,
1363
+ free: parseInt(parts[4] ?? "0", 10) || 0
1364
+ });
1356
1365
  }
1357
- if (total <= 0)
1358
- return null;
1359
- return { total, used, free };
1366
+ return devices;
1360
1367
  } catch {
1361
1368
  _nvSmiAvailable = false;
1362
- return null;
1369
+ return [];
1363
1370
  }
1364
1371
  }
1365
1372
  function getModelBroker() {
@@ -1403,7 +1410,9 @@ var init_model_broker = __esm({
1403
1410
  ramHeadroomMB = DEFAULT_RAM_HEADROOM_MB;
1404
1411
  vramHeadroomMB = DEFAULT_VRAM_HEADROOM_MB;
1405
1412
  idleEvictMs = DEFAULT_IDLE_EVICT_MS;
1406
- /** Inference slot capacity (auto-tunes from Ollama pool size when known). */
1413
+ /** Inference slot capacity (shared pool aggregate; auto-tunes from Ollama
1414
+ * pool size when known). Per-device cap defaults to ceil(slotCapacity/N)
1415
+ * unless overridden via setPerGpuSlotCapacity. */
1407
1416
  slotCapacity = DEFAULT_SLOT_CAPACITY;
1408
1417
  /** Maximum queue depth before queue pressure is emitted. */
1409
1418
  queueCapacity = DEFAULT_QUEUE_CAPACITY;
@@ -1419,6 +1428,15 @@ var init_model_broker = __esm({
1419
1428
  _throughput = /* @__PURE__ */ new Map();
1420
1429
  /** Monotonic counter for slot ids. */
1421
1430
  _slotIdSeq = 0;
1431
+ /** Per-GPU slot capacity override. When unset, broker derives a per-GPU
1432
+ * cap from slotCapacity / detected device count. */
1433
+ _perGpuSlotCapacity = /* @__PURE__ */ new Map();
1434
+ /** Cached per-device VRAM (refreshed by pollOnce). */
1435
+ _vramByDevice = [];
1436
+ /** Optional provider that maps an Ollama model name to its current GPU.
1437
+ * Wired by the CLI/orchestrator at startup so the broker can copy pool
1438
+ * affinity onto LoadedModel records without importing the pool directly. */
1439
+ _ollamaAffinityProvider = null;
1422
1440
  static getInstance() {
1423
1441
  if (!_ModelBroker._instance)
1424
1442
  _ModelBroker._instance = new _ModelBroker();
@@ -1471,6 +1489,18 @@ var init_model_broker = __esm({
1471
1489
  setOllamaBaseUrl(url) {
1472
1490
  this._ollamaBaseUrl = url;
1473
1491
  }
1492
+ /**
1493
+ * Wire a function that resolves an Ollama model name to its current GPU
1494
+ * affinity (from the Ollama pool's per-instance state). The CLI calls
1495
+ * this at startup with a closure over `getOllamaPool().status()` so the
1496
+ * broker can copy gpuIndex/gpuUuid onto LoadedModel records without
1497
+ * importing from @omnius/orchestrator (which would create a circular dep).
1498
+ *
1499
+ * Pass null to clear.
1500
+ */
1501
+ setOllamaAffinityProvider(provider) {
1502
+ this._ollamaAffinityProvider = provider;
1503
+ }
1474
1504
  /** One poll cycle — refreshes /api/ps and emits snapshot. */
1475
1505
  async pollOnce() {
1476
1506
  await Promise.all([
@@ -1539,30 +1569,44 @@ var init_model_broker = __esm({
1539
1569
  const estVram = spec.estimatedVramMB ?? this.estimateFootprintVramMB(spec);
1540
1570
  const estRam = spec.estimatedRamMB ?? this.estimateFootprintRamMB(spec);
1541
1571
  const ram = ramSnapshotMB();
1542
- const vram = await vramSnapshotMB();
1543
1572
  const ramFitsAfter = ram.free - estRam >= this.ramHeadroomMB;
1544
- const vramFitsAfter = vram ? vram.free - estVram >= this.vramHeadroomMB : true;
1573
+ const devices = await vramSnapshotPerDevice();
1574
+ this._vramByDevice = devices;
1575
+ let chosenGpu = null;
1576
+ let vramFitsAfter = devices.length === 0;
1577
+ if (devices.length > 0) {
1578
+ const candidates = devices.filter((d2) => spec.preferredGpuIndex === void 0 || d2.index === spec.preferredGpuIndex).filter((d2) => d2.free - estVram >= this.vramHeadroomMB).sort((a2, b) => b.free - a2.free);
1579
+ if (candidates.length > 0) {
1580
+ chosenGpu = candidates[0].index;
1581
+ vramFitsAfter = true;
1582
+ }
1583
+ }
1545
1584
  if (ramFitsAfter && vramFitsAfter) {
1546
- const promise = Promise.resolve({ kind: "ok", effectiveNumCtx });
1585
+ const decision2 = { kind: "ok", effectiveNumCtx, gpuIndex: chosenGpu };
1586
+ const promise = Promise.resolve(decision2);
1547
1587
  this._inflight.set(key, { startedMs: Date.now(), owner: spec.owner, promise });
1548
1588
  setTimeout(() => this._inflight.delete(key), spec.loadTimeoutMs ?? DEFAULT_INFLIGHT_WAIT_MS).unref?.();
1549
- return { kind: "ok", effectiveNumCtx };
1589
+ return decision2;
1550
1590
  }
1591
+ const targetGpu = chosenGpu ?? this.deviceWithMostPressureRelativeTo(devices, estVram);
1592
+ const needVramMB = vramFitsAfter ? 0 : targetGpu !== null ? estVram + this.vramHeadroomMB - (devices.find((d2) => d2.index === targetGpu)?.free ?? 0) : estVram + this.vramHeadroomMB;
1551
1593
  const evictTargets = this.pickEvictionCandidates({
1552
- needVramMB: vramFitsAfter ? 0 : estVram + this.vramHeadroomMB - (vram?.free ?? 0),
1594
+ needVramMB,
1553
1595
  needRamMB: ramFitsAfter ? 0 : estRam + this.ramHeadroomMB - ram.free,
1554
1596
  requestingPriority: spec.priority ?? 0,
1555
- requestingDomain: spec.domain
1597
+ requestingDomain: spec.domain,
1598
+ targetGpu
1556
1599
  });
1557
1600
  if (evictTargets.length > 0) {
1558
- return { kind: "evict", evictTargets, effectiveNumCtx };
1601
+ return { kind: "evict", evictTargets, effectiveNumCtx, gpuIndex: targetGpu };
1559
1602
  }
1560
1603
  const fallback = await this.findRunnableFallback(spec);
1561
1604
  if (fallback) {
1562
1605
  this.emit("degraded", spec, fallback, "insufficient-memory-no-evictable");
1563
1606
  return { kind: "degrade", fallback, reason: "insufficient-memory-no-evictable" };
1564
1607
  }
1565
- const reason = `insufficient resources (need ~${estRam}MB RAM, ~${estVram}MB VRAM; free ${ram.free}MB RAM, ${vram ? vram.free : "?"}MB VRAM) and no evictable / fallback models`;
1608
+ const perDeviceSummary = devices.length === 0 ? "no GPU" : devices.map((d2) => `gpu${d2.index}=${d2.free}MB`).join(", ");
1609
+ const reason = `insufficient resources (need ~${estRam}MB RAM, ~${estVram}MB VRAM; free ${ram.free}MB RAM; VRAM ${perDeviceSummary}) and no evictable / fallback models`;
1566
1610
  this.emit("rejected", spec, reason);
1567
1611
  return { kind: "reject", reason };
1568
1612
  }
@@ -1638,10 +1682,22 @@ var init_model_broker = __esm({
1638
1682
  seen.add(key);
1639
1683
  const vramMB = Math.round((m2.size_vram ?? 0) / (1024 * 1024));
1640
1684
  const ramMB = Math.round(((m2.size ?? 0) - (m2.size_vram ?? 0)) / (1024 * 1024));
1685
+ let affinity = null;
1686
+ try {
1687
+ affinity = this._ollamaAffinityProvider ? this._ollamaAffinityProvider(m2.name) : null;
1688
+ } catch {
1689
+ affinity = null;
1690
+ }
1641
1691
  const existing = this._loaded.get(key);
1642
1692
  if (existing) {
1643
1693
  existing.vramMB = vramMB || existing.vramMB;
1644
1694
  existing.ramMB = ramMB || existing.ramMB;
1695
+ if (affinity) {
1696
+ if (affinity.gpuIndex !== null)
1697
+ existing.gpuIndex = affinity.gpuIndex;
1698
+ if (affinity.gpuUuid !== null)
1699
+ existing.gpuUuid = affinity.gpuUuid;
1700
+ }
1645
1701
  } else {
1646
1702
  const tracked = this.registerLoaded({
1647
1703
  key,
@@ -1653,7 +1709,9 @@ var init_model_broker = __esm({
1653
1709
  ramMB,
1654
1710
  priority: 0,
1655
1711
  loadedAt: now,
1656
- lastUsedAt: now
1712
+ lastUsedAt: now,
1713
+ gpuIndex: affinity?.gpuIndex ?? null,
1714
+ gpuUuid: affinity?.gpuUuid ?? null
1657
1715
  });
1658
1716
  void tracked;
1659
1717
  }
@@ -1746,7 +1804,8 @@ var init_model_broker = __esm({
1746
1804
  m2.domain !== req2.requestingDomain || this.countByDomain(req2.requestingDomain) > 1
1747
1805
  );
1748
1806
  const idle = (m2) => now - m2.lastUsedAt > this.idleEvictMs;
1749
- const evictable = [...this._loaded.values()].filter((m2) => m2.priority <= req2.requestingPriority).filter(sameDomainOk).sort((a2, b) => {
1807
+ const onTargetGpu = (m2) => req2.targetGpu === void 0 || req2.targetGpu === null ? true : m2.gpuIndex === req2.targetGpu;
1808
+ const evictable = [...this._loaded.values()].filter((m2) => m2.priority <= req2.requestingPriority).filter(sameDomainOk).filter(onTargetGpu).sort((a2, b) => {
1750
1809
  const aIdle = idle(a2) ? 0 : 1;
1751
1810
  const bIdle = idle(b) ? 0 : 1;
1752
1811
  if (aIdle !== bIdle)
@@ -1767,6 +1826,24 @@ var init_model_broker = __esm({
1767
1826
  return targets;
1768
1827
  return [];
1769
1828
  }
1829
+ /** Pick the GPU whose free-VRAM gap to the requested footprint is smallest
1830
+ * (i.e. closest to fitting). Used when no device cleanly fits — eviction
1831
+ * on this device has the best chance of opening room. Returns null when
1832
+ * no GPUs are detected. */
1833
+ deviceWithMostPressureRelativeTo(devices, needMB) {
1834
+ if (devices.length === 0)
1835
+ return null;
1836
+ let best = null;
1837
+ let bestGap = Infinity;
1838
+ for (const d2 of devices) {
1839
+ const gap = needMB - d2.free;
1840
+ if (gap < bestGap) {
1841
+ bestGap = gap;
1842
+ best = d2;
1843
+ }
1844
+ }
1845
+ return best?.index ?? null;
1846
+ }
1770
1847
  countByDomain(domain) {
1771
1848
  let n2 = 0;
1772
1849
  for (const m2 of this._loaded.values())
@@ -1897,17 +1974,31 @@ var init_model_broker = __esm({
1897
1974
  inflight: [...this._inflight.entries()].map(([key, v]) => ({ key, owner: v.owner, startedMs: v.startedMs })),
1898
1975
  ramMB: ram,
1899
1976
  vramMB: vram,
1977
+ vramPerDevice: [...this._vramByDevice],
1900
1978
  lastPollAt: Date.now(),
1901
1979
  slots: this.buildSlotsSnapshot()
1902
1980
  };
1903
1981
  }
1904
1982
  buildSlotsSnapshot() {
1905
1983
  const byModel = {};
1984
+ const byGpu = {};
1906
1985
  for (const slot of this._activeSlots.values()) {
1907
1986
  const k = slot.model;
1908
1987
  if (!byModel[k])
1909
1988
  byModel[k] = { inUse: 0, tokensPerSec: 0, samples: 0 };
1910
1989
  byModel[k].inUse += 1;
1990
+ if (slot.gpuIndex !== null && slot.gpuIndex !== void 0) {
1991
+ if (!byGpu[slot.gpuIndex])
1992
+ byGpu[slot.gpuIndex] = { inUse: 0, capacity: this.perGpuSlotCapacity(slot.gpuIndex), loadedMB: 0 };
1993
+ byGpu[slot.gpuIndex].inUse += 1;
1994
+ }
1995
+ }
1996
+ for (const m2 of this._loaded.values()) {
1997
+ if (m2.gpuIndex !== null && m2.gpuIndex !== void 0) {
1998
+ if (!byGpu[m2.gpuIndex])
1999
+ byGpu[m2.gpuIndex] = { inUse: 0, capacity: this.perGpuSlotCapacity(m2.gpuIndex), loadedMB: 0 };
2000
+ byGpu[m2.gpuIndex].loadedMB += m2.vramMB;
2001
+ }
1911
2002
  }
1912
2003
  for (const [model, tp] of this._throughput) {
1913
2004
  if (!byModel[model])
@@ -1915,23 +2006,46 @@ var init_model_broker = __esm({
1915
2006
  byModel[model].tokensPerSec = tp.tokensPerSec;
1916
2007
  byModel[model].samples = tp.samples;
1917
2008
  }
2009
+ for (const d2 of this._vramByDevice) {
2010
+ if (!byGpu[d2.index])
2011
+ byGpu[d2.index] = { inUse: 0, capacity: this.perGpuSlotCapacity(d2.index), loadedMB: 0 };
2012
+ }
1918
2013
  return {
1919
2014
  inUse: this._activeSlots.size,
1920
2015
  capacity: this.slotCapacity,
1921
2016
  queueDepth: this._slotQueue.length,
1922
2017
  queueCapacity: this.queueCapacity,
1923
- byModel
2018
+ byModel,
2019
+ byGpu
1924
2020
  };
1925
2021
  }
2022
+ /** Per-GPU slot capacity. Returns the override when set, else ceil(slotCapacity / deviceCount). */
2023
+ perGpuSlotCapacity(gpuIndex) {
2024
+ const override = this._perGpuSlotCapacity.get(gpuIndex);
2025
+ if (override !== void 0)
2026
+ return override;
2027
+ const n2 = Math.max(1, this._vramByDevice.length);
2028
+ return Math.max(1, Math.ceil(this.slotCapacity / n2));
2029
+ }
1926
2030
  async checkPressure(snap) {
1927
2031
  if (snap.ramMB.free < this.ramHeadroomMB) {
1928
2032
  this.emit("pressure", "ram", snap.ramMB.free, this.ramHeadroomMB);
1929
2033
  }
1930
- const v = await vramSnapshotMB();
1931
- if (v) {
1932
- snap.vramMB = v;
1933
- if (v.free < this.vramHeadroomMB) {
1934
- this.emit("pressure", "vram", v.free, this.vramHeadroomMB);
2034
+ const devices = await vramSnapshotPerDevice();
2035
+ this._vramByDevice = devices;
2036
+ if (devices.length > 0) {
2037
+ let total = 0, used = 0, free = 0;
2038
+ for (const d2 of devices) {
2039
+ total += d2.total;
2040
+ used += d2.used;
2041
+ free += d2.free;
2042
+ }
2043
+ snap.vramMB = { total, used, free };
2044
+ snap.vramPerDevice = devices;
2045
+ for (const d2 of devices) {
2046
+ if (d2.free < this.vramHeadroomMB) {
2047
+ this.emit("pressure", "vram", d2.free, this.vramHeadroomMB);
2048
+ }
1935
2049
  }
1936
2050
  }
1937
2051
  const queueThreshold = Math.floor(this.queueCapacity * 0.8);
@@ -1960,18 +2074,22 @@ var init_model_broker = __esm({
1960
2074
  * upstream callers (e.g. Telegram poll loop) should slow ingress.
1961
2075
  */
1962
2076
  acquireInferenceSlot(spec) {
1963
- if (this._activeSlots.size < this.slotCapacity) {
2077
+ const chosenGpu = this.pickGpuForSlot(spec);
2078
+ const gpuOk = chosenGpu === null || this.activeSlotsOnGpu(chosenGpu) < this.perGpuSlotCapacity(chosenGpu);
2079
+ if (gpuOk && this._activeSlots.size < this.slotCapacity) {
1964
2080
  return Promise.resolve(this.admitSlot(
1965
2081
  spec,
1966
2082
  /*reserved*/
1967
- false
2083
+ false,
2084
+ chosenGpu
1968
2085
  ));
1969
2086
  }
1970
2087
  if (spec.sessionKey && !this._reservedBySession.has(spec.sessionKey) && this._activeSlots.size < this.slotCapacity + 1) {
1971
2088
  const slot = this.admitSlot(
1972
2089
  spec,
1973
2090
  /*reserved*/
1974
- true
2091
+ true,
2092
+ chosenGpu
1975
2093
  );
1976
2094
  this._reservedBySession.set(spec.sessionKey, slot.info.id);
1977
2095
  return Promise.resolve(slot);
@@ -2038,7 +2156,7 @@ var init_model_broker = __esm({
2038
2156
  });
2039
2157
  }
2040
2158
  /** Admit a slot — internal, called from acquire fast path and from drainQueue. */
2041
- admitSlot(spec, reserved) {
2159
+ admitSlot(spec, reserved, gpuIndex = null) {
2042
2160
  const id = `slot-${++this._slotIdSeq}-${Date.now().toString(36)}`;
2043
2161
  const info = {
2044
2162
  id,
@@ -2048,7 +2166,8 @@ var init_model_broker = __esm({
2048
2166
  sessionKey: spec.sessionKey,
2049
2167
  acquiredAt: Date.now(),
2050
2168
  promptTokens: spec.promptTokens ?? 0,
2051
- reserved
2169
+ reserved,
2170
+ gpuIndex
2052
2171
  };
2053
2172
  this._activeSlots.set(id, info);
2054
2173
  this.emit("slotAcquired", info);
@@ -2064,6 +2183,35 @@ var init_model_broker = __esm({
2064
2183
  }
2065
2184
  };
2066
2185
  }
2186
+ /** Count of active slots pinned to a given GPU. */
2187
+ activeSlotsOnGpu(gpuIndex) {
2188
+ let n2 = 0;
2189
+ for (const s2 of this._activeSlots.values()) {
2190
+ if (s2.gpuIndex === gpuIndex)
2191
+ n2++;
2192
+ }
2193
+ return n2;
2194
+ }
2195
+ /**
2196
+ * Pick a GPU for a new inference slot. Honors caller's preferredGpuIndex
2197
+ * when set; otherwise picks the GPU with the highest free VRAM that has
2198
+ * room for the estimated footprint and an open per-device slot.
2199
+ *
2200
+ * Returns null when no GPU is detected (CPU-only) or no device fits — in
2201
+ * the latter case the slot is admitted unpinned and the underlying
2202
+ * subprocess will pick whatever CUDA exposes by default.
2203
+ */
2204
+ pickGpuForSlot(spec) {
2205
+ if (this._vramByDevice.length === 0)
2206
+ return null;
2207
+ const candidates = this._vramByDevice.filter((d2) => spec.preferredGpuIndex === void 0 || d2.index === spec.preferredGpuIndex).filter((d2) => this.activeSlotsOnGpu(d2.index) < this.perGpuSlotCapacity(d2.index)).filter((d2) => spec.estimatedVramMB === void 0 || d2.free >= spec.estimatedVramMB).sort((a2, b) => b.free - a2.free);
2208
+ return candidates[0]?.index ?? null;
2209
+ }
2210
+ /** Configure per-GPU slot capacity. Overrides the slotCapacity-derived default. */
2211
+ setPerGpuSlotCapacity(gpuIndex, capacity) {
2212
+ this._perGpuSlotCapacity.set(gpuIndex, Math.max(1, Math.floor(capacity)));
2213
+ this.drainSlotQueue();
2214
+ }
2067
2215
  releaseSlot(info, outcome) {
2068
2216
  this._activeSlots.delete(info.id);
2069
2217
  if (info.sessionKey && this._reservedBySession.get(info.sessionKey) === info.id) {
@@ -2089,8 +2237,18 @@ var init_model_broker = __esm({
2089
2237
  this.drainSlotQueue();
2090
2238
  }
2091
2239
  drainSlotQueue() {
2092
- while (this._slotQueue.length > 0 && this._activeSlots.size < this.slotCapacity) {
2093
- const entry = this._slotQueue.shift();
2240
+ const queueCopy = [...this._slotQueue];
2241
+ for (const entry of queueCopy) {
2242
+ if (this._activeSlots.size >= this.slotCapacity)
2243
+ break;
2244
+ const chosenGpu = this.pickGpuForSlot(entry.spec);
2245
+ const gpuOk = chosenGpu === null || this.activeSlotsOnGpu(chosenGpu) < this.perGpuSlotCapacity(chosenGpu);
2246
+ if (!gpuOk)
2247
+ continue;
2248
+ const idx = this._slotQueue.indexOf(entry);
2249
+ if (idx < 0)
2250
+ continue;
2251
+ this._slotQueue.splice(idx, 1);
2094
2252
  if (entry.onSignalAbort && entry.spec.signal) {
2095
2253
  entry.spec.signal.removeEventListener("abort", entry.onSignalAbort);
2096
2254
  }
@@ -2104,7 +2262,8 @@ var init_model_broker = __esm({
2104
2262
  const slot = this.admitSlot(
2105
2263
  entry.spec,
2106
2264
  /*reserved*/
2107
- false
2265
+ false,
2266
+ chosenGpu
2108
2267
  );
2109
2268
  try {
2110
2269
  entry.resolve(slot);
@@ -19608,26 +19767,16 @@ function extractSkillForQuery(skill, content, query, budgetTokens = 900) {
19608
19767
  function buildSkillsSummary(skills) {
19609
19768
  if (skills.length === 0)
19610
19769
  return "";
19611
- const lines = [
19612
- "## Skills Index",
19613
- "",
19614
- `${skills.length} skills available. Call \`skill_list\` to search, \`skill_execute <name>\` to load full instructions.`,
19615
- ""
19616
- ];
19617
19770
  const bySource = /* @__PURE__ */ new Map();
19618
19771
  for (const s2 of skills) {
19619
- const group = bySource.get(s2.source) ?? [];
19620
- group.push(s2);
19621
- bySource.set(s2.source, group);
19622
- }
19623
- for (const [source, group] of bySource) {
19624
- const names = group.map((s2) => {
19625
- const t2 = s2.triggers[0];
19626
- return t2 ? `${s2.name}(${t2})` : s2.name;
19627
- });
19628
- lines.push(`**${source}** (${group.length}): ${names.join(", ")}`);
19772
+ bySource.set(s2.source, (bySource.get(s2.source) ?? 0) + 1);
19629
19773
  }
19630
- return lines.join("\n");
19774
+ const sourcesSummary = [...bySource.entries()].sort((a2, b) => b[1] - a2[1]).map(([source, count]) => `${source}=${count}`).join(", ");
19775
+ return [
19776
+ "## Skills Index",
19777
+ `${skills.length} skills available across ${bySource.size} sources (${sourcesSummary}).`,
19778
+ "Use `skill_list` (with optional `filter` or `source`) to search; `skill_execute <name>` to load full instructions."
19779
+ ].join("\n");
19631
19780
  }
19632
19781
  function safeReaddir2(dir, dirsOnly = false) {
19633
19782
  try {
@@ -255439,6 +255588,11 @@ import sys
255439
255588
  import time
255440
255589
  from pathlib import Path
255441
255590
 
255591
+ # Broker-picked GPU pinning — MUST run before importing torch.
255592
+ _omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
255593
+ if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
255594
+ os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
255595
+
255442
255596
  def _progress(stage, message, percent=None):
255443
255597
  payload = {"omnius_progress": True, "stage": stage, "message": message}
255444
255598
  if percent is not None:
@@ -255597,9 +255751,15 @@ if __name__ == "__main__":
255597
255751
  SDCPP_RUNNER = String.raw`#!/usr/bin/env python3
255598
255752
  import argparse
255599
255753
  import json
255754
+ import os
255600
255755
  import time
255601
255756
  from pathlib import Path
255602
255757
 
255758
+ # Broker-picked GPU pinning — sd-cpp's CUDA backend honors CUDA_VISIBLE_DEVICES.
255759
+ _omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
255760
+ if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
255761
+ os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
255762
+
255603
255763
  def main():
255604
255764
  parser = argparse.ArgumentParser()
255605
255765
  parser.add_argument("--model-path", required=True)
@@ -255713,6 +255873,9 @@ if __name__ == "__main__":
255713
255873
  defaultModel;
255714
255874
  defaultBackend;
255715
255875
  promptExpander = null;
255876
+ /** Broker-chosen GPU pinning for the in-flight generation. Read by the
255877
+ * spawn path to set OMNIUS_GPU_INDEX in the subprocess env. */
255878
+ _brokerGpuIndex = null;
255716
255879
  constructor(cwd4, ollamaUrl = "http://localhost:11434", defaults3 = {}) {
255717
255880
  this.cwd = cwd4;
255718
255881
  this.ollamaUrl = ollamaUrl.replace(/\/v1\/?$/, "").replace(/\/$/, "");
@@ -255788,6 +255951,7 @@ if __name__ == "__main__":
255788
255951
  const candidates = imageGenerationFallbackCandidates(requestedModel, requestedBackend, generationFallbackEnabled(args));
255789
255952
  const broker = getModelBroker();
255790
255953
  const firstCandidate = candidates[0];
255954
+ let brokerGpuIndex = null;
255791
255955
  if (firstCandidate) {
255792
255956
  const decision2 = await broker.ensureModelLoadable({
255793
255957
  name: firstCandidate.model,
@@ -255799,6 +255963,9 @@ if __name__ == "__main__":
255799
255963
  for (const target of decision2.evictTargets) {
255800
255964
  await broker.evict(target.host, target.name, "image-gen-needs-room");
255801
255965
  }
255966
+ brokerGpuIndex = decision2.gpuIndex ?? null;
255967
+ } else if (decision2.kind === "ok") {
255968
+ brokerGpuIndex = decision2.gpuIndex ?? null;
255802
255969
  } else if (decision2.kind === "reject") {
255803
255970
  return {
255804
255971
  success: false,
@@ -255808,6 +255975,7 @@ if __name__ == "__main__":
255808
255975
  };
255809
255976
  }
255810
255977
  }
255978
+ this._brokerGpuIndex = brokerGpuIndex;
255811
255979
  try {
255812
255980
  return await this.generateCandidateLadder({ candidates, prompt, args, seed, start: start2 });
255813
255981
  } catch (err) {
@@ -256310,10 +256478,14 @@ ${errText.slice(0, 800)}`,
256310
256478
  }
256311
256479
  ensureUnifiedCacheDirs();
256312
256480
  this.emitProgress({ stage: "load", message: `Starting image generation with ${args.model}` });
256481
+ const runnerEnv = { ...python.env };
256482
+ if (this._brokerGpuIndex !== null) {
256483
+ runnerEnv["OMNIUS_GPU_INDEX"] = String(this._brokerGpuIndex);
256484
+ }
256313
256485
  const result = await runProcess2(python.command, argv, {
256314
256486
  cwd: this.cwd,
256315
256487
  timeoutMs: 9e5,
256316
- env: python.env,
256488
+ env: runnerEnv,
256317
256489
  progressLabel: `Downloading/loading ${args.model}`,
256318
256490
  onProgress: (event) => this.emitProgress(event)
256319
256491
  });
@@ -257609,9 +257781,14 @@ var init_audio_generate = __esm({
257609
257781
  DEFAULT_MUSIC_MODEL
257610
257782
  ];
257611
257783
  DIFFUSERS_AUDIO_RUNNER = String.raw`#!/usr/bin/env python3
257612
- import argparse, json, sys, time
257784
+ import argparse, json, os, sys, time
257613
257785
  from pathlib import Path
257614
257786
 
257787
+ # Broker-picked GPU pinning — must run before importing torch.
257788
+ _omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
257789
+ if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
257790
+ os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
257791
+
257615
257792
  def _format_bytes(value):
257616
257793
  try:
257617
257794
  n = float(value)
@@ -257805,9 +257982,14 @@ if __name__ == "__main__":
257805
257982
  main()
257806
257983
  `;
257807
257984
  TRANSFORMERS_AUDIO_RUNNER = String.raw`#!/usr/bin/env python3
257808
- import argparse, json, sys, time
257985
+ import argparse, json, os, sys, time
257809
257986
  from pathlib import Path
257810
257987
 
257988
+ # Broker-picked GPU pinning — must run before importing torch.
257989
+ _omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
257990
+ if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
257991
+ os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
257992
+
257811
257993
  def _format_bytes(value):
257812
257994
  try:
257813
257995
  n = float(value)
@@ -258033,6 +258215,8 @@ if __name__ == "__main__":
258033
258215
  progressHandler = null;
258034
258216
  lastProgressMessage = "";
258035
258217
  lastProgressAt = 0;
258218
+ /** Broker-chosen GPU pinning for the in-flight generation. */
258219
+ _brokerGpuIndex = null;
258036
258220
  constructor(cwd4, defaults3 = {}) {
258037
258221
  this.cwd = cwd4;
258038
258222
  this.defaults = defaults3;
@@ -258198,6 +258382,7 @@ if __name__ == "__main__":
258198
258382
  const playback = playbackRequested(args);
258199
258383
  const broker = getModelBroker();
258200
258384
  const firstCandidate = candidates[0];
258385
+ let brokerGpuIndex = null;
258201
258386
  if (firstCandidate) {
258202
258387
  const decision2 = await broker.ensureModelLoadable({
258203
258388
  name: firstCandidate.model,
@@ -258209,6 +258394,9 @@ if __name__ == "__main__":
258209
258394
  for (const target of decision2.evictTargets) {
258210
258395
  await broker.evict(target.host, target.name, `${kind}-gen-needs-room`);
258211
258396
  }
258397
+ brokerGpuIndex = decision2.gpuIndex ?? null;
258398
+ } else if (decision2.kind === "ok") {
258399
+ brokerGpuIndex = decision2.gpuIndex ?? null;
258212
258400
  } else if (decision2.kind === "reject") {
258213
258401
  return {
258214
258402
  success: false,
@@ -258218,6 +258406,7 @@ if __name__ == "__main__":
258218
258406
  };
258219
258407
  }
258220
258408
  }
258409
+ this._brokerGpuIndex = brokerGpuIndex;
258221
258410
  try {
258222
258411
  return await this.generateCandidateLadder({ kind, candidates, prompt, args, seed, playback, start: start2 });
258223
258412
  } catch (err) {
@@ -258384,10 +258573,14 @@ if __name__ == "__main__":
258384
258573
  }
258385
258574
  ensureUnifiedCacheDirs();
258386
258575
  this.emitProgress({ stage: "load", message: `Starting ${args.kind} generation with ${args.model}` });
258576
+ const runnerEnv = { ...python.env };
258577
+ if (this._brokerGpuIndex !== null) {
258578
+ runnerEnv["OMNIUS_GPU_INDEX"] = String(this._brokerGpuIndex);
258579
+ }
258387
258580
  const result = await runProcess3(python.command, argv, {
258388
258581
  cwd: this.cwd,
258389
258582
  timeoutMs: 9e5,
258390
- env: python.env,
258583
+ env: runnerEnv,
258391
258584
  progressLabel: `Downloading/loading ${args.model}`,
258392
258585
  onProgress: (event) => this.emitProgress(event)
258393
258586
  });
@@ -259157,7 +259350,7 @@ function parseRunnerJson3(stdout) {
259157
259350
  }
259158
259351
  return null;
259159
259352
  }
259160
- var DEFAULT_DIFFUSERS_VIDEO_MODEL, SANA_VIDEO_480P_MODEL, SANA_VIDEO_720P_MODEL, WAN_TI2V_5B_MODEL, WAN_T2V_A14B_MODEL, WAN_I2V_A14B_MODEL, WAN_S2V_14B_MODEL, COGVIDEOX_5B_MODEL, COGVIDEOX_2B_MODEL, COGVIDEOX_5B_I2V_MODEL, MOCHI_PREVIEW_MODEL, LTX_VIDEO_MODEL, LTX_2_3_MODEL, HUNYUAN_VIDEO_MODEL, DIFFUSERS_VIDEO_PACKAGES, VIDEO_GENERATION_MODEL_PRESETS, VIDEO_GENERATION_QUALITY_LADDER, VIDEO_AUDIO_QUALITY_LADDER, DIFFUSERS_VIDEO_RUNNER, COMFY_BOOTSTRAP_SCRIPT, COMFY_DEFAULT_WORKFLOWS, VideoGenerateTool;
259353
+ var DEFAULT_DIFFUSERS_VIDEO_MODEL, SANA_VIDEO_480P_MODEL, SANA_VIDEO_720P_MODEL, SANA_WM_BIDIRECTIONAL_MODEL, WAN_TI2V_5B_MODEL, WAN_T2V_A14B_MODEL, WAN_I2V_A14B_MODEL, WAN_S2V_14B_MODEL, COGVIDEOX_5B_MODEL, COGVIDEOX_2B_MODEL, COGVIDEOX_5B_I2V_MODEL, MOCHI_PREVIEW_MODEL, LTX_VIDEO_MODEL, LTX_2_3_MODEL, HUNYUAN_VIDEO_MODEL, DIFFUSERS_VIDEO_PACKAGES, VIDEO_GENERATION_MODEL_PRESETS, VIDEO_GENERATION_QUALITY_LADDER, VIDEO_AUDIO_QUALITY_LADDER, DIFFUSERS_VIDEO_RUNNER, COMFY_BOOTSTRAP_SCRIPT, COMFY_DEFAULT_WORKFLOWS, VideoGenerateTool;
259161
259354
  var init_video_generate = __esm({
259162
259355
  "packages/execution/dist/tools/video-generate.js"() {
259163
259356
  "use strict";
@@ -259167,6 +259360,7 @@ var init_video_generate = __esm({
259167
259360
  DEFAULT_DIFFUSERS_VIDEO_MODEL = "Efficient-Large-Model/SANA-Video_2B_480p";
259168
259361
  SANA_VIDEO_480P_MODEL = "Efficient-Large-Model/SANA-Video_2B_480p";
259169
259362
  SANA_VIDEO_720P_MODEL = "Efficient-Large-Model/SANA-Video_2B_720p";
259363
+ SANA_WM_BIDIRECTIONAL_MODEL = "Efficient-Large-Model/SANA-WM_bidirectional";
259170
259364
  WAN_TI2V_5B_MODEL = "Wan-AI/Wan2.2-TI2V-5B-Diffusers";
259171
259365
  WAN_T2V_A14B_MODEL = "Wan-AI/Wan2.2-T2V-A14B-Diffusers";
259172
259366
  WAN_I2V_A14B_MODEL = "Wan-AI/Wan2.2-I2V-A14B-Diffusers";
@@ -259460,6 +259654,41 @@ var init_video_generate = __esm({
259460
259654
  licenseNote: "Apache 2.0",
259461
259655
  note: "Premium Wan T2V; cloud GPU recommended."
259462
259656
  },
259657
+ {
259658
+ id: SANA_WM_BIDIRECTIONAL_MODEL,
259659
+ label: "SANA-WM bidirectional (world-model i2v)",
259660
+ kinds: ["i2v"],
259661
+ backend: "diffusers",
259662
+ // SANA-WM declares its concrete class in model_index.json; loaded via
259663
+ // generic DiffusionPipeline.from_pretrained — the runner's auto path
259664
+ // already does this for unknown model names.
259665
+ pipelineClass: "DiffusionPipeline",
259666
+ install: 'python3 .omnius/video-gen/diffusers_text2video.py --model Efficient-Large-Model/SANA-WM_bidirectional --mode i2v --num-frames 121 --fps 24 --width 704 --height 1280 --steps 30 --guidance 5.0 --image <input.png> --prompt "..." --output .omnius/videos/out.mp4',
259667
+ category: "Premium quality",
259668
+ sizeClass: "2.6B DiT + LTX-2 refiner (Sana World Model)",
259669
+ quality: "Image-to-video world model with optional camera-trajectory control. Two-stage generation (Sana DiT + LTX-2 refiner); hybrid linear attention; 6-DoF camera support via .npy matrices or WASD/IJKL action DSL.",
259670
+ output: "Up to ~13s 704×1280 (portrait 720p) MP4 at 24 fps; max 321 frames.",
259671
+ bestUse: "World-model / camera-controlled video from a single first-frame image. Best on H100/A100-class hardware.",
259672
+ minVramGB: 80,
259673
+ recommendedVramGB: 100,
259674
+ deployment: "Diffusers DiffusionPipeline.from_pretrained; bfloat16; aggressive CPU offload mandatory below 100 GB. Bundled LTX-2 refiner runs as stage 2.",
259675
+ steps: 30,
259676
+ guidance: 5,
259677
+ numFrames: 121,
259678
+ fps: 24,
259679
+ width: 704,
259680
+ height: 1280,
259681
+ dtype: "bfloat16",
259682
+ needsCpuOffload: true,
259683
+ frameQuantum: 1,
259684
+ pixelQuantum: 16,
259685
+ // Apache 2.0 base; bundled LTX-2 refiner + VAE inherit the LTX-2
259686
+ // non-commercial license. Surface that explicitly.
259687
+ licenseNote: "Apache 2.0 (bundled LTX-2 refiner/VAE inherit LTX-2 non-commercial terms)",
259688
+ approxDownloadGB: 99,
259689
+ fallbackFor: [WAN_I2V_A14B_MODEL],
259690
+ note: "Sana World Model bidirectional i2v; portrait 704×1280 fixed; camera control via --camera <matrices.npy> or --action <DSL> when the runner supports it."
259691
+ },
259463
259692
  {
259464
259693
  id: WAN_I2V_A14B_MODEL,
259465
259694
  label: "Wan2.2 I2V A14B",
@@ -259588,6 +259817,9 @@ var init_video_generate = __esm({
259588
259817
  COGVIDEOX_5B_MODEL,
259589
259818
  MOCHI_PREVIEW_MODEL,
259590
259819
  COGVIDEOX_2B_MODEL,
259820
+ // Heavy i2v / world-model tier — only attempted when an explicit model
259821
+ // is requested or the consumer-VRAM tier above has failed for an i2v ask.
259822
+ SANA_WM_BIDIRECTIONAL_MODEL,
259591
259823
  WAN_I2V_A14B_MODEL,
259592
259824
  WAN_T2V_A14B_MODEL,
259593
259825
  HUNYUAN_VIDEO_MODEL
@@ -259606,6 +259838,16 @@ import sys
259606
259838
  import time
259607
259839
  from pathlib import Path
259608
259840
 
259841
+ # ── GPU pinning ─────────────────────────────────────────────────────
259842
+ # The TS broker picks a GPU per generation via bin-packing across the
259843
+ # available CUDA devices. It passes the chosen index in OMNIUS_GPU_INDEX.
259844
+ # We MUST apply CUDA_VISIBLE_DEVICES BEFORE importing torch, otherwise
259845
+ # torch initializes the device list with all visible GPUs and the model
259846
+ # may land on a different device than the broker reserved capacity on.
259847
+ _omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
259848
+ if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
259849
+ os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
259850
+
259609
259851
  def _progress(stage, message, percent=None):
259610
259852
  payload = {"omnius_progress": True, "stage": stage, "message": message}
259611
259853
  if percent is not None:
@@ -260412,6 +260654,9 @@ if __name__ == "__main__":
260412
260654
  defaultBackend;
260413
260655
  defaultKind;
260414
260656
  promptExpander = null;
260657
+ /** GPU index chosen by the broker for the in-flight generation. Read
260658
+ * by the spawn path to set OMNIUS_GPU_INDEX in the subprocess env. */
260659
+ _brokerGpuIndex = null;
260415
260660
  constructor(cwd4, defaults3 = {}) {
260416
260661
  this.cwd = cwd4;
260417
260662
  this.defaultModel = defaults3.model;
@@ -260501,17 +260746,23 @@ if __name__ == "__main__":
260501
260746
  const candidates = videoGenerationFallbackCandidates(requestedModel, requestedBackend, inferredKind, generationFallbackEnabled3(args), { preferNativeAudioVideo: withAudio || Boolean(audioInput) });
260502
260747
  const broker = getModelBroker();
260503
260748
  const firstCandidate = candidates[0];
260749
+ let brokerGpuIndex = null;
260504
260750
  if (firstCandidate) {
260751
+ const preset = firstCandidate.preset;
260505
260752
  const decision2 = await broker.ensureModelLoadable({
260506
260753
  name: firstCandidate.model,
260507
260754
  domain: "video-gen",
260508
260755
  host: firstCandidate.backend === "comfyui" ? "comfyui" : "diffusers-py",
260509
- owner: "video-generate-tool"
260756
+ owner: "video-generate-tool",
260757
+ estimatedVramMB: preset ? preset.minVramGB * 1024 : void 0
260510
260758
  });
260511
260759
  if (decision2.kind === "evict") {
260512
260760
  for (const target of decision2.evictTargets) {
260513
260761
  await broker.evict(target.host, target.name, "video-gen-needs-room");
260514
260762
  }
260763
+ brokerGpuIndex = decision2.gpuIndex ?? null;
260764
+ } else if (decision2.kind === "ok") {
260765
+ brokerGpuIndex = decision2.gpuIndex ?? null;
260515
260766
  } else if (decision2.kind === "reject") {
260516
260767
  return {
260517
260768
  success: false,
@@ -260521,6 +260772,7 @@ if __name__ == "__main__":
260521
260772
  };
260522
260773
  }
260523
260774
  }
260775
+ this._brokerGpuIndex = brokerGpuIndex;
260524
260776
  if (candidates.length === 0) {
260525
260777
  return {
260526
260778
  success: false,
@@ -260942,6 +261194,9 @@ ${llmAnnotation}` : result.llmContent;
260942
261194
  runnerEnv["HF_TOKEN"] = effectiveToken;
260943
261195
  runnerEnv["HUGGING_FACE_HUB_TOKEN"] = effectiveToken;
260944
261196
  }
261197
+ if (this._brokerGpuIndex !== null) {
261198
+ runnerEnv["OMNIUS_GPU_INDEX"] = String(this._brokerGpuIndex);
261199
+ }
260945
261200
  const argv = [
260946
261201
  runner,
260947
261202
  "--model",
@@ -570291,18 +570546,6 @@ function formatReflection(notes2, scenario) {
570291
570546
  ];
570292
570547
  return lines.join("\n");
570293
570548
  }
570294
- function formatMemory(input, state) {
570295
- const lines = [];
570296
- if (input.memoryContext) lines.push(input.memoryContext);
570297
- if (state.dynamicState && Object.keys(state.dynamicState).length > 0) {
570298
- const entries = Object.entries(state.dynamicState).slice(0, 12).map(([key, value2]) => `- ${key}: ${compactText(JSON.stringify(value2) ?? String(value2), 220)}`);
570299
- lines.push(`Dynamic state:
570300
- ${entries.join("\n")}`);
570301
- }
570302
- if (state.updatedAt) lines.push(`State updated: ${state.updatedAt}`);
570303
- if (lines.length === 0) return "No additional retrieved voice-soul memory beyond scoped personality and runtime state.";
570304
- return lines.join("\n\n");
570305
- }
570306
570549
  function formatFinalVoice(input) {
570307
570550
  const voice = findProjectVoice(input.scope);
570308
570551
  const lines = [
@@ -570329,23 +570572,23 @@ function buildSoulContext(input) {
570329
570572
  const state = loadSoulRuntimeState(input);
570330
570573
  const scenario = resolveSoulScenario(input, state);
570331
570574
  const tree2 = resolveSoulDecisionTree(input, state, scenario);
570332
- return [
570333
- "## Voice Soul Context",
570334
- "### 1. Authority And Safety Scope",
570575
+ const sections = ["## Voice Soul Context"];
570576
+ const voiceAndScope = [
570335
570577
  formatAuthorityScope(input),
570336
- "### 2. Core Identity",
570337
570578
  formatCoreIdentity(input),
570338
- "### 3. Procedural Decision Tree",
570339
- formatProceduralConstraints(input, scenario, tree2, state),
570340
- "### 4. Relationship State",
570341
- formatRelationshipState(input),
570342
- "### 5. Current Reflection Notes",
570343
- formatReflection(input.currentReflection, scenario),
570344
- "### 6. Minimal Retrieved Memory",
570345
- formatMemory(input, state),
570346
- "### 7. Final Voice Guidance",
570347
570579
  formatFinalVoice(input)
570348
- ].join("\n\n");
570580
+ ].filter(Boolean).join("\n\n");
570581
+ sections.push("### Voice + Scope + Identity", voiceAndScope);
570582
+ const decisionSubstrate = [
570583
+ formatRelationshipState(input),
570584
+ formatProceduralConstraints(input, scenario, tree2, state)
570585
+ ].filter(Boolean).join("\n\n");
570586
+ sections.push("### Active Relationship + Scenario", decisionSubstrate);
570587
+ const reflection = formatReflection(input.currentReflection, scenario);
570588
+ if (reflection && reflection.trim().length > 0) {
570589
+ sections.push("### Current Reflection Notes", reflection);
570590
+ }
570591
+ return sections.join("\n\n");
570349
570592
  }
570350
570593
  var MAX_SOUL_CHARS, MAX_VOICE_CHARS, MAX_SCOPED_PERSONALITY_CHARS, UNCLASSIFIED_SCENARIO;
570351
570594
  var init_voice_soul = __esm({
@@ -577276,7 +577519,32 @@ var init_status_bar = __esm({
577276
577519
  if (this.active) this.renderFooterPreserveCursor();
577277
577520
  }, intervalMs);
577278
577521
  try {
577279
- getModelBroker().startPolling(Math.max(2e3, intervalMs * 2));
577522
+ const broker = getModelBroker();
577523
+ try {
577524
+ Promise.resolve().then(() => (init_dist8(), dist_exports3)).then(({ getOllamaPool: getOllamaPool2, resolveDefaultPoolConfig: resolveDefaultPoolConfig2 }) => {
577525
+ try {
577526
+ const config = resolveDefaultPoolConfig2();
577527
+ const pool3 = getOllamaPool2({ baseInstanceUrl: config.baseInstanceUrl });
577528
+ broker.setOllamaAffinityProvider((modelName) => {
577529
+ try {
577530
+ const status = pool3.status?.();
577531
+ if (!status) return null;
577532
+ for (const inst of status.instances ?? []) {
577533
+ void modelName;
577534
+ return { gpuIndex: inst.gpuIndex, gpuUuid: inst.gpuUuid };
577535
+ }
577536
+ return null;
577537
+ } catch {
577538
+ return null;
577539
+ }
577540
+ });
577541
+ } catch {
577542
+ }
577543
+ }).catch(() => {
577544
+ });
577545
+ } catch {
577546
+ }
577547
+ broker.startPolling(Math.max(2e3, intervalMs * 2));
577280
577548
  } catch {
577281
577549
  }
577282
577550
  }
@@ -604379,14 +604647,22 @@ async function handleBroker(arg, _ctx) {
604379
604647
  safeLog(` ${c3.bold("Resource Broker")}`);
604380
604648
  safeLog("");
604381
604649
  safeLog(` ${c3.dim("RAM:")} ${snap.ramMB.used} / ${snap.ramMB.total} MB used (${snap.ramMB.free} MB free)`);
604382
- if (snap.vramMB) {
604650
+ if (snap.vramPerDevice.length > 0) {
604651
+ safeLog(` ${c3.bold("GPUs:")}`);
604652
+ for (const d2 of snap.vramPerDevice) {
604653
+ const gpuSlots = snap.slots.byGpu[d2.index];
604654
+ const slotInfo = gpuSlots ? ` slots=${gpuSlots.inUse}/${gpuSlots.capacity}, loaded=${gpuSlots.loadedMB}MB` : "";
604655
+ safeLog(` gpu${d2.index} (${d2.uuid.slice(0, 12)}…) ${d2.used} / ${d2.total} MB used (${d2.free} MB free)${slotInfo}`);
604656
+ }
604657
+ } else if (snap.vramMB) {
604383
604658
  safeLog(` ${c3.dim("VRAM:")} ${snap.vramMB.used} / ${snap.vramMB.total} MB used (${snap.vramMB.free} MB free)`);
604384
604659
  } else {
604385
604660
  safeLog(` ${c3.dim("VRAM:")} ${c3.dim("(no GPU detected)")}`);
604386
604661
  }
604387
604662
  safeLog(` ${c3.dim("RAM headroom threshold:")} ${broker.ramHeadroomMB} MB`);
604388
- safeLog(` ${c3.dim("VRAM headroom threshold:")} ${broker.vramHeadroomMB} MB`);
604663
+ safeLog(` ${c3.dim("VRAM headroom threshold:")} ${broker.vramHeadroomMB} MB (per-device)`);
604389
604664
  safeLog(` ${c3.dim("Idle-evict threshold:")} ${Math.round(broker.idleEvictMs / 1e3)}s`);
604665
+ safeLog(` ${c3.dim("Slot capacity:")} ${snap.slots.inUse}/${snap.slots.capacity} active, queue ${snap.slots.queueDepth}/${snap.slots.queueCapacity}`);
604390
604666
  safeLog("");
604391
604667
  if (snap.loaded.length === 0) {
604392
604668
  safeLog(` ${c3.dim("No loaded models tracked.")}`);
@@ -604397,7 +604673,8 @@ async function handleBroker(arg, _ctx) {
604397
604673
  const idle = Math.round((now - m2.lastUsedAt) / 1e3);
604398
604674
  const owner = m2.owner ? c3.dim(` [owner=${m2.owner}]`) : "";
604399
604675
  const ctx3 = m2.numCtx ? c3.dim(` n_ctx=${m2.numCtx}`) : "";
604400
- safeLog(` ${c3.cyan(m2.name)} (${m2.host}/${m2.domain}) vram=${m2.vramMB}MB ram=${m2.ramMB}MB idle=${idle}s${ctx3}${owner}`);
604676
+ const gpu = m2.gpuIndex !== null && m2.gpuIndex !== void 0 ? c3.dim(` gpu=${m2.gpuIndex}`) : "";
604677
+ safeLog(` ${c3.cyan(m2.name)} (${m2.host}/${m2.domain}) vram=${m2.vramMB}MB ram=${m2.ramMB}MB${gpu} idle=${idle}s${ctx3}${owner}`);
604401
604678
  }
604402
604679
  }
604403
604680
  if (snap.inflight.length > 0) {
@@ -618149,6 +618426,95 @@ function parseTelegramSilentReflectionNotes(text) {
618149
618426
  }
618150
618427
  return null;
618151
618428
  }
618429
/**
 * Best-effort extraction of the `reply` string value from a buffer that may
 * hold an incomplete JSON object such as `{"reply": "partial tex`.
 *
 * The buffer is first passed through `stripTelegramHiddenThinking` and
 * left-trimmed. If the result does not start with `{`, it is treated as plain
 * text and returned as-is (or `null` when empty). Otherwise the first
 * `"reply"` key is located and its string value is decoded character by
 * character, so a truncated value still yields the text received so far.
 *
 * @param {string} buffer2 - Raw model output accumulated so far.
 * @returns {string|null} The decoded reply text (possibly partial), or `null`
 *   when no reply text can be recovered yet.
 */
function extractPartialTelegramReplyJson(buffer2) {
  const stripped = stripTelegramHiddenThinking(buffer2).trimStart();
  if (!stripped.startsWith("{")) {
    return stripped || null;
  }

  const replyKey = '"reply"';
  const keyMatch = stripped.indexOf(replyKey);
  if (keyMatch < 0) return null;

  // Advance past the key, the colon, and any whitespace before the value.
  let i2 = keyMatch + replyKey.length;
  while (i2 < stripped.length && stripped[i2] !== ":") i2++;
  if (i2 >= stripped.length) return null;
  i2++;
  while (i2 < stripped.length && /\s/.test(stripped[i2])) i2++;
  // Only string values are supported; anything else (or nothing yet) is a miss.
  if (i2 >= stripped.length || stripped[i2] !== '"') return null;
  i2++;

  let out = "";
  while (i2 < stripped.length) {
    const ch = stripped[i2];
    if (ch === '"') {
      // Unescaped closing quote: the value is complete.
      return out;
    }
    if (ch !== "\\") {
      out += ch;
      i2++;
      continue;
    }

    const next = stripped[i2 + 1];
    // Buffer ends in the middle of an escape: stop and return what we have.
    if (next === undefined) break;

    if (next === "u") {
      // Need all four hex digits (indices i2+2 .. i2+5) before decoding.
      if (i2 + 5 >= stripped.length) break;
      const hex = stripped.slice(i2 + 2, i2 + 6);
      // Validate strictly: parseInt alone would accept "12zz" as 0x12.
      if (/^[0-9a-fA-F]{4}$/.test(hex)) {
        out += String.fromCharCode(parseInt(hex, 16));
      }
      i2 += 6;
      continue;
    }

    switch (next) {
      case "n":
        out += "\n";
        break;
      case "t":
        out += "\t";
        break;
      case "r":
        out += "\r";
        break;
      case "b":
        out += "\b";
        break;
      case "f":
        out += "\f";
        break;
      default:
        // Covers \" \\ \/ and, leniently, any unknown escape: keep the char.
        out += next;
    }
    i2 += 2;
  }
  return out.length > 0 ? out : null;
}
618475
/**
 * Resolve the final `reply` text from a completed model response that is
 * expected to be a JSON object with a string `reply` field.
 *
 * Attempts, in order:
 *   1. parse the whole (thinking-stripped, trimmed) text as JSON;
 *   2. parse only the first balanced top-level `{...}` object, ignoring
 *      anything that follows it;
 *   3. fall back to `extractPartialTelegramReplyJson` on the text.
 *
 * @param {string} buffer2 - Full raw model output.
 * @returns {string|null} The trimmed reply, or `null` when the text is not
 *   JSON-shaped or no non-empty reply can be recovered by the fallback.
 */
function extractFinalTelegramReplyJson(buffer2) {
  const text = stripTelegramHiddenThinking(buffer2).trim();
  if (!text.startsWith("{")) return null;

  // Returns the trimmed reply when `candidate` parses and carries a string
  // `reply`; `undefined` signals "try the next strategy".
  const replyFrom = (candidate) => {
    try {
      const value = JSON.parse(candidate);
      return typeof value.reply === "string" ? value.reply.trim() : undefined;
    } catch {
      return undefined;
    }
  };

  const whole = replyFrom(text);
  if (whole !== undefined) return whole;

  // Find the index of the brace that closes the first top-level object,
  // skipping braces and quotes that appear inside string literals.
  let closeAt = -1;
  let nesting = 0;
  let quoted = false;
  let skipNext = false;
  for (let pos = 0; pos < text.length && closeAt < 0; pos++) {
    const ch = text[pos];
    if (skipNext) {
      skipNext = false;
    } else if (quoted) {
      if (ch === "\\") skipNext = true;
      else if (ch === '"') quoted = false;
    } else if (ch === '"') {
      quoted = true;
    } else if (ch === "{") {
      nesting++;
    } else if (ch === "}" && --nesting === 0) {
      closeAt = pos;
    }
  }

  if (closeAt > 0) {
    const leading = replyFrom(text.slice(0, closeAt + 1));
    if (leading !== undefined) return leading;
  }

  const partial = extractPartialTelegramReplyJson(text);
  const trimmed = partial ? partial.trim() : "";
  return trimmed.length > 0 ? trimmed : null;
}
618152
618518
  function estimatePromptTokensFromRequest(request) {
618153
618519
  let chars = 0;
618154
618520
  for (const m2 of request.messages ?? []) {
@@ -619636,7 +620002,7 @@ function renderTelegramSubAgentError(username, error) {
619636
620002
  process.stdout.write(` ${c3.dim("│")} ${c3.magenta("✘")} @${username}: ${c3.dim(preview)}
619637
620003
  `);
619638
620004
  }
619639
- var TELEGRAM_TOOL_ACTION_GROUPS, TELEGRAM_TOOL_ACTION_GROUP, TELEGRAM_TOOL_MUTATING_GROUPS, DEFAULT_TELEGRAM_TOOL_GROUP_POLICY, TELEGRAM_TOOL_BUTTON_LABELS, TELEGRAM_SAFETY_PROMPT, ADMIN_DM_PROMPT, ADMIN_GROUP_PROMPT, TELEGRAM_PUBLIC_SOUL_PROFILE, TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT, TELEGRAM_PUBLIC_MEMORY_SCOPE_CONTRACT, TELEGRAM_PUBLIC_VISION_STACK_CONTRACT, GROUP_REPLY_DISCRETION_PROMPT, TELEGRAM_CHAT_MODE_PROMPT, ADMIN_CHAT_PROFILE_PROMPT, TELEGRAM_ACTION_RESPONSE_CONTRACT, TELEGRAM_EXTERNAL_ACQUISITION_CONTRACT, TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT, TELEGRAM_STUCK_SELF_TALK_PREFIXES, TELEGRAM_CHAT_HISTORY_LIMIT, TELEGRAM_CONTEXT_RECENT_DEFAULT, TELEGRAM_CONTEXT_LINE_LIMIT, TELEGRAM_CONTEXT_SAMPLE_LIMIT, TELEGRAM_MEMORY_CARD_LIMIT, TELEGRAM_MEMORY_NOTE_LIMIT, TELEGRAM_ASSOCIATIVE_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_USER_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_ACTION_LIMIT, TELEGRAM_ASSOCIATIVE_RELATION_LIMIT, TELEGRAM_MEMORY_STOPWORDS, TELEGRAM_MEMORY_GENERIC_QUERY_TOKENS, TELEGRAM_SUB_AGENT_BOUNDED_OPTIONS, TELEGRAM_SUB_AGENT_DEFAULT_LIMIT, TELEGRAM_SUB_AGENT_MAX_LIMIT, TELEGRAM_SUB_AGENT_BURST_CONTEXT_LIMIT, TELEGRAM_PUBLIC_HELP_COMMANDS2, TELEGRAM_REMINDER_SLASH_COMMANDS, TELEGRAM_REFLECTION_SLASH_COMMANDS, TELEGRAM_PUBLIC_BOT_COMMAND_NAMES, TELEGRAM_IMAGE_EXTENSIONS, MEDIA_CACHE_TTL_MS, TELEGRAM_CHANNEL_DMN_SWEEP_MS, TELEGRAM_CHANNEL_DMN_IDLE_AFTER_MS, TELEGRAM_CHANNEL_DMN_MIN_INTERVAL_MS, TELEGRAM_CHANNEL_DMN_MIN_MESSAGES, TELEGRAM_ALLOWED_UPDATES, TELEGRAM_PUBLIC_TOOL_QUOTAS, TelegramBridge;
620005
+ var TELEGRAM_TOOL_ACTION_GROUPS, TELEGRAM_TOOL_ACTION_GROUP, TELEGRAM_TOOL_MUTATING_GROUPS, DEFAULT_TELEGRAM_TOOL_GROUP_POLICY, TELEGRAM_TOOL_BUTTON_LABELS, TELEGRAM_SAFETY_PROMPT, ADMIN_DM_PROMPT, ADMIN_GROUP_PROMPT, TELEGRAM_PUBLIC_SOUL_PROFILE, TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT, TELEGRAM_PUBLIC_MEMORY_SCOPE_CONTRACT, TELEGRAM_PUBLIC_VISION_STACK_CONTRACT, GROUP_REPLY_DISCRETION_PROMPT, TELEGRAM_CHAT_MODE_PROMPT, ADMIN_CHAT_PROFILE_PROMPT, TELEGRAM_ACTION_RESPONSE_CONTRACT, TELEGRAM_EXTERNAL_ACQUISITION_CONTRACT, TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT, TELEGRAM_CHAT_REPLY_RESPONSE_FORMAT, TELEGRAM_STUCK_SELF_TALK_PREFIXES, TELEGRAM_CHAT_HISTORY_LIMIT, TELEGRAM_CONTEXT_RECENT_DEFAULT, TELEGRAM_CONTEXT_LINE_LIMIT, TELEGRAM_CONTEXT_SAMPLE_LIMIT, TELEGRAM_MEMORY_CARD_LIMIT, TELEGRAM_MEMORY_NOTE_LIMIT, TELEGRAM_ASSOCIATIVE_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_USER_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_ACTION_LIMIT, TELEGRAM_ASSOCIATIVE_RELATION_LIMIT, TELEGRAM_MEMORY_STOPWORDS, TELEGRAM_MEMORY_GENERIC_QUERY_TOKENS, TELEGRAM_SUB_AGENT_BOUNDED_OPTIONS, TELEGRAM_SUB_AGENT_DEFAULT_LIMIT, TELEGRAM_SUB_AGENT_MAX_LIMIT, TELEGRAM_SUB_AGENT_BURST_CONTEXT_LIMIT, TELEGRAM_PUBLIC_HELP_COMMANDS2, TELEGRAM_REMINDER_SLASH_COMMANDS, TELEGRAM_REFLECTION_SLASH_COMMANDS, TELEGRAM_PUBLIC_BOT_COMMAND_NAMES, TELEGRAM_IMAGE_EXTENSIONS, MEDIA_CACHE_TTL_MS, TELEGRAM_CHANNEL_DMN_SWEEP_MS, TELEGRAM_CHANNEL_DMN_IDLE_AFTER_MS, TELEGRAM_CHANNEL_DMN_MIN_INTERVAL_MS, TELEGRAM_CHANNEL_DMN_MIN_MESSAGES, TELEGRAM_ALLOWED_UPDATES, TELEGRAM_PUBLIC_TOOL_QUOTAS, TelegramBridge;
619640
620006
  var init_telegram_bridge = __esm({
619641
620007
  "packages/cli/src/tui/telegram-bridge.ts"() {
619642
620008
  "use strict";
@@ -619870,6 +620236,12 @@ Rules:
619870
620236
  7. Do not claim older chat is unavailable when the context stream contains it. If asked what you see, summarize the supplied transcript, speakers, and relationship/tone signals.
619871
620237
  8. Mirror the current sender's tone and directness while staying safe and clear.
619872
620238
  9. Never send router decisions, skip explanations, memory-stage notes, task-complete summaries, or "no_reply" as chat text.
620239
+
620240
+ Output discipline (your assistant message is sent verbatim to Telegram, ALL of it):
620241
+ - Emit ONLY the final reply text. Do not narrate your reasoning, summarize what you found, organize bullet-point notes, or write phrases like "Let me summarize", "Let me send the reply", "Now I have enough", "Based on the research", "Here's my response:" before the actual reply. Those are scratch-pad phrases that leak when emitted as visible text.
620242
+ - Do not produce a draft followed by the final answer. The first character of your output should be the first character of the message the user will receive.
620243
+ - If you need to think, do it silently. Do not write your reasoning steps as visible prose. If you have an internal scratchpad, keep it internal.
620244
+ - A reply that begins by restating what you found, then says something like "Let me write the response" or "Here's the breakdown", then gives the answer, is wrong twice over: the user sees the restatement AND the answer, doubling the message. Skip the restatement.
619873
620245
  `.trim();
619874
620246
  ADMIN_CHAT_PROFILE_PROMPT = `
619875
620247
  You are replying to the authenticated Telegram admin in a private DM.
@@ -619902,6 +620274,24 @@ External acquisition contract:
619902
620274
  TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT = {
619903
620275
  type: "json_object"
619904
620276
  };
620277
+ TELEGRAM_CHAT_REPLY_RESPONSE_FORMAT = {
620278
+ type: "json_schema",
620279
+ json_schema: {
620280
+ name: "telegram_chat_reply",
620281
+ strict: true,
620282
+ schema: {
620283
+ type: "object",
620284
+ additionalProperties: false,
620285
+ required: ["reply"],
620286
+ properties: {
620287
+ reply: {
620288
+ type: "string",
620289
+ description: "The exact text to send to Telegram. No prefixes, no narration, no scratch reasoning, no bullet-point notes preceding the reply."
620290
+ }
620291
+ }
620292
+ }
620293
+ }
620294
+ };
619905
620295
  TELEGRAM_STUCK_SELF_TALK_PREFIXES = [
619906
620296
  /^i'?ve been stuck for\b/i,
619907
620297
  /^i am (still |currently )?stuck\b/i,
@@ -622128,6 +622518,14 @@ ${mediaContext}` : ""
622128
622518
  if (state.lastFollowupAt && now - state.lastFollowupAt < 60 * 6e4) {
622129
622519
  return { sent: false, reason: "rate limit held public follow-up" };
622130
622520
  }
622521
+ const cooldownEnv = Number.parseInt(process.env["OMNIUS_TG_FOLLOWUP_COOLDOWN_MS"] ?? "", 10);
622522
+ const cooldownMs = Number.isFinite(cooldownEnv) && cooldownEnv >= 6e4 ? cooldownEnv : 10 * 6e4;
622523
+ if (state.lastAssistantMessageAt && now - state.lastAssistantMessageAt < cooldownMs) {
622524
+ return {
622525
+ sent: false,
622526
+ reason: `recent assistant reply suppresses follow-up (${Math.round((now - state.lastAssistantMessageAt) / 1e3)}s ago, cooldown ${Math.round(cooldownMs / 1e3)}s)`
622527
+ };
622528
+ }
622131
622529
  const candidateMessageIds = Array.from(new Set([
622132
622530
  ...artifact.curiosityThreads.flatMap((thread) => thread.sourceMessages ?? []),
622133
622531
  ...artifact.memoryProposals.flatMap((proposal) => proposal.sourceMessages ?? []),
@@ -622458,6 +622856,10 @@ ${mediaContext}` : ""
622458
622856
  chatTitle: msg.chatTitle
622459
622857
  };
622460
622858
  this.recordChatHistory(sessionKey, entry);
622859
+ try {
622860
+ this.reflectionStateForSession(sessionKey).lastAssistantMessageAt = Date.now();
622861
+ } catch {
622862
+ }
622461
622863
  this.persistTelegramAssistantMessage(
622462
622864
  msg,
622463
622865
  clean5,
@@ -623685,32 +624087,16 @@ ${lines.join("\n")}`);
623685
624087
  sections.push(`### Participants And Relationship Signals${tierNote}
623686
624088
  ${participantLines.join("\n")}`);
623687
624089
  }
623688
- const associativeContext = this.relevantTelegramAssociativeMemoryContext(
623689
- sessionKey,
623690
- msg,
623691
- isGroup ? 14 : 8
623692
- );
623693
- if (associativeContext) {
623694
- sections.push(associativeContext);
623695
- }
623696
- const sqliteMirrorContext = this.relevantTelegramSqliteMirrorContext(
623697
- sessionKey,
623698
- msg,
623699
- isGroup ? 14 : 8
623700
- );
623701
- if (sqliteMirrorContext) {
623702
- sections.push(sqliteMirrorContext);
623703
- }
623704
- try {
623705
- const episodicContext = this.relevantTelegramEpisodicMemoryContext(
624090
+ const ASSOCIATIVE_MIN_TURNS = isGroup ? 8 : 4;
624091
+ if (retainedCount >= ASSOCIATIVE_MIN_TURNS) {
624092
+ const associativeContext = this.relevantTelegramAssociativeMemoryContext(
623706
624093
  sessionKey,
623707
624094
  msg,
623708
- isGroup ? 10 : 6
624095
+ isGroup ? 14 : 8
623709
624096
  );
623710
- if (episodicContext) {
623711
- sections.push(episodicContext);
624097
+ if (associativeContext) {
624098
+ sections.push(associativeContext);
623712
624099
  }
623713
- } catch {
623714
624100
  }
623715
624101
  const memoryCards = this.relevantTelegramMemoryCards(sessionKey, msg, isGroup ? 10 : 6);
623716
624102
  if (memoryCards.length > 0) {
@@ -623741,10 +624127,6 @@ ${notes2}`;
623741
624127
  ${cardLines.join("\n")}`);
623742
624128
  }
623743
624129
  }
623744
- const channelDaydream = this.formatLatestTelegramChannelDaydreamContext(sessionKey);
623745
- if (channelDaydream) {
623746
- sections.push(channelDaydream);
623747
- }
623748
624130
  const recentMedia = this.recentTelegramMediaEntries(msg.chatId, 10);
623749
624131
  if (recentMedia.length > 0) {
623750
624132
  const mediaLines = recentMedia.map((entry) => {
@@ -623763,26 +624145,33 @@ ${cardLines.join("\n")}`);
623763
624145
  ].join("\n"));
623764
624146
  }
623765
624147
  if (olderCount > 0) {
624148
+ const halfLifeMs = (isGroup ? 24 : 48) * 60 * 60 * 1e3;
624149
+ const now = Date.now();
623766
624150
  const older = history.slice(0, olderCount);
623767
624151
  const bySpeaker = /* @__PURE__ */ new Map();
623768
624152
  for (const entry of older) {
623769
624153
  if (!entry.text.trim()) continue;
623770
624154
  const speaker = telegramHistorySpeaker(entry);
624155
+ const ageMs = Math.max(0, now - (entry.ts ?? 0));
624156
+ const weight = Math.exp(-ageMs / halfLifeMs);
623771
624157
  const existing = bySpeaker.get(speaker);
623772
624158
  const text = truncateTelegramContextLine(entry.text, 180);
623773
624159
  if (existing) {
623774
624160
  existing.count += 1;
623775
624161
  existing.last = text;
624162
+ existing.weightSum += weight;
624163
+ existing.maxWeight = Math.max(existing.maxWeight, weight);
623776
624164
  } else {
623777
- bySpeaker.set(speaker, { count: 1, first: text, last: text });
624165
+ bySpeaker.set(speaker, { count: 1, first: text, last: text, weightSum: weight, maxWeight: weight });
623778
624166
  }
623779
624167
  }
623780
- const olderLines = [...bySpeaker.entries()].slice(0, 10).map(([speaker, info]) => {
624168
+ const olderLines = [...bySpeaker.entries()].sort(([, a2], [, b]) => b.maxWeight - a2.maxWeight).slice(0, 5).map(([speaker, info]) => {
623781
624169
  const range = info.first === info.last ? info.first : `${info.first} -> ${info.last}`;
623782
- return `- ${speaker}: ${info.count} earlier msg(s); digest=${telegramContextJsonString(range, 240)}`;
624170
+ const decayLabel = info.maxWeight >= 0.5 ? "fresh" : info.maxWeight >= 0.1 ? "decayed" : "stale";
624171
+ return `- ${speaker}: ${info.count} earlier msg(s) [${decayLabel}]; digest=${telegramContextJsonString(range, 200)}`;
623783
624172
  });
623784
624173
  if (olderLines.length > 0) {
623785
- sections.push(`### Earlier Retained Thread Digest
624174
+ sections.push(`### Earlier Retained Thread Digest (recency-weighted)
623786
624175
  ${olderLines.join("\n")}`);
623787
624176
  }
623788
624177
  }
@@ -624096,7 +624485,8 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
624096
624485
  const completionHeadroom = 4096;
624097
624486
  const targetCtx = trainCtx && trainCtx > 0 ? Math.min(trainCtx, Math.max(2048, promptTokens + completionHeadroom)) : Math.max(2048, promptTokens + completionHeadroom);
624098
624487
  const requestWithCtx = { ...request, numCtx: targetCtx };
624099
- const slot = await broker.acquireInferenceSlot({
624488
+ const brokerBypass = process.env["OMNIUS_DISABLE_BROKER_ADMISSION"] === "1";
624489
+ const slot = brokerBypass ? null : await broker.acquireInferenceSlot({
624100
624490
  model,
624101
624491
  domain: "chat",
624102
624492
  owner: `telegram-bridge/${kind}`,
@@ -624107,7 +624497,7 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
624107
624497
  if (process.env["OMNIUS_BROKER_TRACE"] === "1") {
624108
624498
  this.tuiWrite(() => renderTelegramSubAgentEvent(
624109
624499
  sessionKey,
624110
- `inference admitted [${kind}] model=${model} prompt~${promptTokens}t num_ctx=${targetCtx} slot=${slot.info.id}${slot.info.reserved ? " reserved" : ""}`
624500
+ `inference admitted [${kind}] model=${model} prompt~${promptTokens}t num_ctx=${targetCtx} slot=${slot ? slot.info.id : "bypass"}${slot?.info.reserved ? " reserved" : ""}`
624111
624501
  ));
624112
624502
  }
624113
624503
  const streamFn = backend.chatCompletionStream;
@@ -624136,10 +624526,10 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
624136
624526
  }
624137
624527
  const usage = result.usage;
624138
624528
  completionTokens = usage?.completion_tokens ?? 0;
624139
- slot.release({ ok: true, completionTokens });
624529
+ slot?.release({ ok: true, completionTokens });
624140
624530
  return result;
624141
624531
  } catch (err) {
624142
- slot.release({ ok: false, error: err instanceof Error ? err.message : String(err) });
624532
+ slot?.release({ ok: false, error: err instanceof Error ? err.message : String(err) });
624143
624533
  throw err;
624144
624534
  } finally {
624145
624535
  this.deregisterTelegramInference(id);
@@ -625140,34 +625530,25 @@ ${list}` : "No shared group target is currently known for this sender. Ask in th
625140
625530
  return join131(this.repoRoot, ".omnius", "telegram-runner-state", safe);
625141
625531
  }
625142
625532
  buildTelegramAdminOverviewContext(currentSessionKey) {
625143
- const sections = [];
625144
625533
  this.ensureAllTelegramConversationsLoaded();
625145
625534
  const chatEntries = [...this.chatHistory.entries()].filter(([sessionKey, history]) => sessionKey !== currentSessionKey && history.length > 0).sort(([, a2], [, b]) => (b[b.length - 1]?.ts ?? 0) - (a2[a2.length - 1]?.ts ?? 0)).slice(0, 18);
625535
+ if (chatEntries.length === 0) return "";
625536
+ const indexLines = [];
625146
625537
  for (const [sessionKey, history] of chatEntries) {
625147
625538
  const latest = history[history.length - 1];
625148
- const participants = [...this.chatParticipants.get(sessionKey)?.values() ?? []].sort((a2, b) => b.lastSeenTs - a2.lastSeenTs).slice(0, 8).map((profile) => {
625149
- const label = profile.username && profile.username !== "unknown" ? `@${profile.username}` : profile.firstName || `user:${profile.fromUserId}`;
625150
- return `${label} (${profile.messageCount} msg)`;
625151
- }).join(", ");
625152
- const recent = history.slice(-5).map(
625153
- (entry) => ` - ${telegramHistorySpeaker(entry)}: ${truncateTelegramContextLine(entry.text, 180)}`
625154
- ).join("\n");
625155
- const cards = (this.chatMemoryCards.get(sessionKey) ?? []).slice(0, 4).map((card) => ` - ${card.title}: ${card.notes.slice(-1)[0] ?? ""}`).join("\n");
625156
- sections.push([
625157
- `- ${sessionKey} (chat_id ${String(latest.chatId ?? "unknown")}; ${latest.chatType || "chat"}${latest.chatTitle ? `: ${latest.chatTitle}` : ""})`,
625158
- participants ? ` Participants: ${participants}` : "",
625159
- ` Latest: ${telegramHistorySpeaker(latest)}: ${truncateTelegramContextLine(latest.text, 180)}`,
625160
- recent ? ` Recent:
625161
- ${recent}` : "",
625162
- cards ? ` Memory cards:
625163
- ${cards}` : ""
625164
- ].filter(Boolean).join("\n"));
625165
- }
625166
- if (sections.length === 0) return "";
625539
+ const participantCount = this.chatParticipants.get(sessionKey)?.size ?? 0;
625540
+ const ageMs = Date.now() - (latest.ts ?? 0);
625541
+ const ageMin = Math.round(ageMs / 6e4);
625542
+ const ageStr = ageMin < 60 ? `${ageMin}m ago` : ageMin < 24 * 60 ? `${Math.round(ageMin / 60)}h ago` : `${Math.round(ageMin / (24 * 60))}d ago`;
625543
+ const label = latest.chatTitle ? `"${latest.chatTitle}"` : sessionKey;
625544
+ indexLines.push(`- ${label} (chat_id ${String(latest.chatId ?? "?")}; ${latest.chatType || "chat"}): ${participantCount} participants; last ${ageStr}; ${history.length} retained msgs`);
625545
+ }
625167
625546
  return [
625168
- "## Admin Telegram Omniscience",
625169
- "This section is one-way context for the authenticated admin private DM only. It summarizes public/group and other Telegram sessions the bot has observed. Never inject admin/private DM content into public groups.",
625170
- sections.join("\n")
625547
+ "## Admin Telegram Omniscience (index only)",
625548
+ "One-way context for the authenticated admin private DM. Other Telegram sessions the bot has observed are listed below with one line each.",
625549
+ "For details on a specific chat, use telegram_memory_search with the chat_id or topic — the always-loaded view is intentionally compact.",
625550
+ "Never inject admin/private DM content into public groups.",
625551
+ indexLines.join("\n")
625171
625552
  ].join("\n\n");
625172
625553
  }
625173
625554
  buildTelegramSessionContext(msg, toolContext, profile, modelTier) {
@@ -626287,8 +626668,9 @@ ${conversationStream}`
626287
626668
  messages: this.buildTelegramChatMessages(msg, toolContext, mediaContext),
626288
626669
  tools: [],
626289
626670
  temperature: 0.4,
626290
- maxTokens: 700,
626291
- timeoutMs: Math.max(config.timeoutMs ?? 3e5, 12e4)
626671
+ maxTokens: 1500,
626672
+ timeoutMs: Math.max(config.timeoutMs ?? 3e5, 12e4),
626673
+ responseFormat: TELEGRAM_CHAT_REPLY_RESPONSE_FORMAT
626292
626674
  });
626293
626675
  let accumulated = "";
626294
626676
  let streamError;
@@ -626315,7 +626697,8 @@ ${conversationStream}`
626315
626697
  } else {
626316
626698
  this.bumpTelegramInferenceTokens(inferenceId, 1, 0);
626317
626699
  accumulated += piece;
626318
- await onToken(accumulated);
626700
+ const partial = extractPartialTelegramReplyJson(accumulated);
626701
+ if (partial !== null) await onToken(partial);
626319
626702
  }
626320
626703
  }
626321
626704
  } catch (err) {
@@ -626337,11 +626720,14 @@ ${conversationStream}`
626337
626720
  }
626338
626721
  this.updateTelegramInferenceFinal(inferenceId, result);
626339
626722
  accumulated = result.choices[0]?.message?.content ?? "";
626340
- if (accumulated) await onToken(accumulated);
626723
+ const fullExtracted = extractPartialTelegramReplyJson(accumulated);
626724
+ if (fullExtracted) await onToken(fullExtracted);
626341
626725
  }
626342
626726
  } finally {
626343
626727
  this.deregisterTelegramInference(inferenceId);
626344
626728
  }
626729
+ const extracted = extractFinalTelegramReplyJson(accumulated);
626730
+ if (extracted) return extracted;
626345
626731
  return stripTelegramHiddenThinking(accumulated).trim();
626346
626732
  }
626347
626733
  retainTelegramVisibleReplyDraft(subAgent, draft, streamToolNames = subAgent.currentStreamToolNames) {
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.136",
3
+ "version": "1.0.137",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.136",
9
+ "version": "1.0.137",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.136",
3
+ "version": "1.0.137",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",