omnius 1.0.135 → 1.0.137

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1337,29 +1337,36 @@ function ramSnapshotMB() {
1337
1337
  const free = Math.round(freemem() / (1024 * 1024));
1338
1338
  return { total, free, used: total - free };
1339
1339
  }
1340
- async function vramSnapshotMB() {
1340
+ async function vramSnapshotPerDevice() {
1341
1341
  if (_nvSmiAvailable === false)
1342
- return null;
1342
+ return [];
1343
1343
  try {
1344
1344
  const out = await new Promise((resolve55, reject) => {
1345
- exec("nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv,noheader,nounits 2>/dev/null", { encoding: "utf8", timeout: 3e3 }, (err, stdout) => err ? reject(err) : resolve55(stdout));
1345
+ exec("nvidia-smi --query-gpu=index,uuid,memory.total,memory.used,memory.free --format=csv,noheader,nounits 2>/dev/null", { encoding: "utf8", timeout: 3e3 }, (err, stdout) => err ? reject(err) : resolve55(stdout));
1346
1346
  });
1347
1347
  _nvSmiAvailable = true;
1348
- let total = 0, used = 0, free = 0;
1348
+ const devices = [];
1349
1349
  for (const line of out.trim().split("\n")) {
1350
+ if (!line.trim())
1351
+ continue;
1350
1352
  const parts = line.split(",").map((s2) => s2.trim());
1351
- if (parts.length < 3)
1353
+ if (parts.length < 5)
1354
+ continue;
1355
+ const index = parseInt(parts[0] ?? "-1", 10);
1356
+ if (!Number.isFinite(index) || index < 0)
1352
1357
  continue;
1353
- total += parseInt(parts[0] ?? "0", 10) || 0;
1354
- used += parseInt(parts[1] ?? "0", 10) || 0;
1355
- free += parseInt(parts[2] ?? "0", 10) || 0;
1358
+ devices.push({
1359
+ index,
1360
+ uuid: parts[1] ?? "",
1361
+ total: parseInt(parts[2] ?? "0", 10) || 0,
1362
+ used: parseInt(parts[3] ?? "0", 10) || 0,
1363
+ free: parseInt(parts[4] ?? "0", 10) || 0
1364
+ });
1356
1365
  }
1357
- if (total <= 0)
1358
- return null;
1359
- return { total, used, free };
1366
+ return devices;
1360
1367
  } catch {
1361
1368
  _nvSmiAvailable = false;
1362
- return null;
1369
+ return [];
1363
1370
  }
1364
1371
  }
1365
1372
  function getModelBroker() {
@@ -1374,7 +1381,7 @@ var init_model_broker = __esm({
1374
1381
  DEFAULT_IDLE_EVICT_MS = 5 * 60 * 1e3;
1375
1382
  DEFAULT_POLL_MS = 4e3;
1376
1383
  DEFAULT_INFLIGHT_WAIT_MS = 6e4;
1377
- DEFAULT_SLOT_CAPACITY = 4;
1384
+ DEFAULT_SLOT_CAPACITY = 8;
1378
1385
  DEFAULT_QUEUE_CAPACITY = 50;
1379
1386
  THROUGHPUT_EMA_ALPHA = 0.2;
1380
1387
  THROUGHPUT_INITIAL_TPS = 25;
@@ -1403,7 +1410,9 @@ var init_model_broker = __esm({
1403
1410
  ramHeadroomMB = DEFAULT_RAM_HEADROOM_MB;
1404
1411
  vramHeadroomMB = DEFAULT_VRAM_HEADROOM_MB;
1405
1412
  idleEvictMs = DEFAULT_IDLE_EVICT_MS;
1406
- /** Inference slot capacity (auto-tunes from Ollama pool size when known). */
1413
+ /** Inference slot capacity (shared pool aggregate; auto-tunes from Ollama
1414
+ * pool size when known). Per-device cap defaults to ceil(slotCapacity/N)
1415
+ * unless overridden via setPerGpuSlotCapacity. */
1407
1416
  slotCapacity = DEFAULT_SLOT_CAPACITY;
1408
1417
  /** Maximum queue depth before queue pressure is emitted. */
1409
1418
  queueCapacity = DEFAULT_QUEUE_CAPACITY;
@@ -1419,6 +1428,15 @@ var init_model_broker = __esm({
1419
1428
  _throughput = /* @__PURE__ */ new Map();
1420
1429
  /** Monotonic counter for slot ids. */
1421
1430
  _slotIdSeq = 0;
1431
+ /** Per-GPU slot capacity override. When unset, broker derives a per-GPU
1432
+ * cap from slotCapacity / detected device count. */
1433
+ _perGpuSlotCapacity = /* @__PURE__ */ new Map();
1434
+ /** Cached per-device VRAM (refreshed by pollOnce). */
1435
+ _vramByDevice = [];
1436
+ /** Optional provider that maps an Ollama model name to its current GPU.
1437
+ * Wired by the CLI/orchestrator at startup so the broker can copy pool
1438
+ * affinity onto LoadedModel records without importing the pool directly. */
1439
+ _ollamaAffinityProvider = null;
1422
1440
  static getInstance() {
1423
1441
  if (!_ModelBroker._instance)
1424
1442
  _ModelBroker._instance = new _ModelBroker();
@@ -1471,6 +1489,18 @@ var init_model_broker = __esm({
1471
1489
  setOllamaBaseUrl(url) {
1472
1490
  this._ollamaBaseUrl = url;
1473
1491
  }
1492
+ /**
1493
+ * Wire a function that resolves an Ollama model name to its current GPU
1494
+ * affinity (from the Ollama pool's per-instance state). The CLI calls
1495
+ * this at startup with a closure over `getOllamaPool().status()` so the
1496
+ * broker can copy gpuIndex/gpuUuid onto LoadedModel records without
1497
+ * importing from @omnius/orchestrator (which would create a circular dep).
1498
+ *
1499
+ * Pass null to clear.
1500
+ */
1501
+ setOllamaAffinityProvider(provider) {
1502
+ this._ollamaAffinityProvider = provider;
1503
+ }
1474
1504
  /** One poll cycle — refreshes /api/ps and emits snapshot. */
1475
1505
  async pollOnce() {
1476
1506
  await Promise.all([
@@ -1539,30 +1569,44 @@ var init_model_broker = __esm({
1539
1569
  const estVram = spec.estimatedVramMB ?? this.estimateFootprintVramMB(spec);
1540
1570
  const estRam = spec.estimatedRamMB ?? this.estimateFootprintRamMB(spec);
1541
1571
  const ram = ramSnapshotMB();
1542
- const vram = await vramSnapshotMB();
1543
1572
  const ramFitsAfter = ram.free - estRam >= this.ramHeadroomMB;
1544
- const vramFitsAfter = vram ? vram.free - estVram >= this.vramHeadroomMB : true;
1573
+ const devices = await vramSnapshotPerDevice();
1574
+ this._vramByDevice = devices;
1575
+ let chosenGpu = null;
1576
+ let vramFitsAfter = devices.length === 0;
1577
+ if (devices.length > 0) {
1578
+ const candidates = devices.filter((d2) => spec.preferredGpuIndex === void 0 || d2.index === spec.preferredGpuIndex).filter((d2) => d2.free - estVram >= this.vramHeadroomMB).sort((a2, b) => b.free - a2.free);
1579
+ if (candidates.length > 0) {
1580
+ chosenGpu = candidates[0].index;
1581
+ vramFitsAfter = true;
1582
+ }
1583
+ }
1545
1584
  if (ramFitsAfter && vramFitsAfter) {
1546
- const promise = Promise.resolve({ kind: "ok", effectiveNumCtx });
1585
+ const decision2 = { kind: "ok", effectiveNumCtx, gpuIndex: chosenGpu };
1586
+ const promise = Promise.resolve(decision2);
1547
1587
  this._inflight.set(key, { startedMs: Date.now(), owner: spec.owner, promise });
1548
1588
  setTimeout(() => this._inflight.delete(key), spec.loadTimeoutMs ?? DEFAULT_INFLIGHT_WAIT_MS).unref?.();
1549
- return { kind: "ok", effectiveNumCtx };
1589
+ return decision2;
1550
1590
  }
1591
+ const targetGpu = chosenGpu ?? this.deviceWithMostPressureRelativeTo(devices, estVram);
1592
+ const needVramMB = vramFitsAfter ? 0 : targetGpu !== null ? estVram + this.vramHeadroomMB - (devices.find((d2) => d2.index === targetGpu)?.free ?? 0) : estVram + this.vramHeadroomMB;
1551
1593
  const evictTargets = this.pickEvictionCandidates({
1552
- needVramMB: vramFitsAfter ? 0 : estVram + this.vramHeadroomMB - (vram?.free ?? 0),
1594
+ needVramMB,
1553
1595
  needRamMB: ramFitsAfter ? 0 : estRam + this.ramHeadroomMB - ram.free,
1554
1596
  requestingPriority: spec.priority ?? 0,
1555
- requestingDomain: spec.domain
1597
+ requestingDomain: spec.domain,
1598
+ targetGpu
1556
1599
  });
1557
1600
  if (evictTargets.length > 0) {
1558
- return { kind: "evict", evictTargets, effectiveNumCtx };
1601
+ return { kind: "evict", evictTargets, effectiveNumCtx, gpuIndex: targetGpu };
1559
1602
  }
1560
1603
  const fallback = await this.findRunnableFallback(spec);
1561
1604
  if (fallback) {
1562
1605
  this.emit("degraded", spec, fallback, "insufficient-memory-no-evictable");
1563
1606
  return { kind: "degrade", fallback, reason: "insufficient-memory-no-evictable" };
1564
1607
  }
1565
- const reason = `insufficient resources (need ~${estRam}MB RAM, ~${estVram}MB VRAM; free ${ram.free}MB RAM, ${vram ? vram.free : "?"}MB VRAM) and no evictable / fallback models`;
1608
+ const perDeviceSummary = devices.length === 0 ? "no GPU" : devices.map((d2) => `gpu${d2.index}=${d2.free}MB`).join(", ");
1609
+ const reason = `insufficient resources (need ~${estRam}MB RAM, ~${estVram}MB VRAM; free ${ram.free}MB RAM; VRAM ${perDeviceSummary}) and no evictable / fallback models`;
1566
1610
  this.emit("rejected", spec, reason);
1567
1611
  return { kind: "reject", reason };
1568
1612
  }
@@ -1638,10 +1682,22 @@ var init_model_broker = __esm({
1638
1682
  seen.add(key);
1639
1683
  const vramMB = Math.round((m2.size_vram ?? 0) / (1024 * 1024));
1640
1684
  const ramMB = Math.round(((m2.size ?? 0) - (m2.size_vram ?? 0)) / (1024 * 1024));
1685
+ let affinity = null;
1686
+ try {
1687
+ affinity = this._ollamaAffinityProvider ? this._ollamaAffinityProvider(m2.name) : null;
1688
+ } catch {
1689
+ affinity = null;
1690
+ }
1641
1691
  const existing = this._loaded.get(key);
1642
1692
  if (existing) {
1643
1693
  existing.vramMB = vramMB || existing.vramMB;
1644
1694
  existing.ramMB = ramMB || existing.ramMB;
1695
+ if (affinity) {
1696
+ if (affinity.gpuIndex !== null)
1697
+ existing.gpuIndex = affinity.gpuIndex;
1698
+ if (affinity.gpuUuid !== null)
1699
+ existing.gpuUuid = affinity.gpuUuid;
1700
+ }
1645
1701
  } else {
1646
1702
  const tracked = this.registerLoaded({
1647
1703
  key,
@@ -1653,7 +1709,9 @@ var init_model_broker = __esm({
1653
1709
  ramMB,
1654
1710
  priority: 0,
1655
1711
  loadedAt: now,
1656
- lastUsedAt: now
1712
+ lastUsedAt: now,
1713
+ gpuIndex: affinity?.gpuIndex ?? null,
1714
+ gpuUuid: affinity?.gpuUuid ?? null
1657
1715
  });
1658
1716
  void tracked;
1659
1717
  }
@@ -1746,7 +1804,8 @@ var init_model_broker = __esm({
1746
1804
  m2.domain !== req2.requestingDomain || this.countByDomain(req2.requestingDomain) > 1
1747
1805
  );
1748
1806
  const idle = (m2) => now - m2.lastUsedAt > this.idleEvictMs;
1749
- const evictable = [...this._loaded.values()].filter((m2) => m2.priority <= req2.requestingPriority).filter(sameDomainOk).sort((a2, b) => {
1807
+ const onTargetGpu = (m2) => req2.targetGpu === void 0 || req2.targetGpu === null ? true : m2.gpuIndex === req2.targetGpu;
1808
+ const evictable = [...this._loaded.values()].filter((m2) => m2.priority <= req2.requestingPriority).filter(sameDomainOk).filter(onTargetGpu).sort((a2, b) => {
1750
1809
  const aIdle = idle(a2) ? 0 : 1;
1751
1810
  const bIdle = idle(b) ? 0 : 1;
1752
1811
  if (aIdle !== bIdle)
@@ -1767,6 +1826,24 @@ var init_model_broker = __esm({
1767
1826
  return targets;
1768
1827
  return [];
1769
1828
  }
1829
+ /** Pick the GPU whose free-VRAM gap to the requested footprint is smallest
1830
+ * (i.e. closest to fitting). Used when no device cleanly fits — eviction
1831
+ * on this device has the best chance of opening room. Returns null when
1832
+ * no GPUs are detected. */
1833
+ deviceWithMostPressureRelativeTo(devices, needMB) {
1834
+ if (devices.length === 0)
1835
+ return null;
1836
+ let best = null;
1837
+ let bestGap = Infinity;
1838
+ for (const d2 of devices) {
1839
+ const gap = needMB - d2.free;
1840
+ if (gap < bestGap) {
1841
+ bestGap = gap;
1842
+ best = d2;
1843
+ }
1844
+ }
1845
+ return best?.index ?? null;
1846
+ }
1770
1847
  countByDomain(domain) {
1771
1848
  let n2 = 0;
1772
1849
  for (const m2 of this._loaded.values())
@@ -1897,17 +1974,31 @@ var init_model_broker = __esm({
1897
1974
  inflight: [...this._inflight.entries()].map(([key, v]) => ({ key, owner: v.owner, startedMs: v.startedMs })),
1898
1975
  ramMB: ram,
1899
1976
  vramMB: vram,
1977
+ vramPerDevice: [...this._vramByDevice],
1900
1978
  lastPollAt: Date.now(),
1901
1979
  slots: this.buildSlotsSnapshot()
1902
1980
  };
1903
1981
  }
1904
1982
  buildSlotsSnapshot() {
1905
1983
  const byModel = {};
1984
+ const byGpu = {};
1906
1985
  for (const slot of this._activeSlots.values()) {
1907
1986
  const k = slot.model;
1908
1987
  if (!byModel[k])
1909
1988
  byModel[k] = { inUse: 0, tokensPerSec: 0, samples: 0 };
1910
1989
  byModel[k].inUse += 1;
1990
+ if (slot.gpuIndex !== null && slot.gpuIndex !== void 0) {
1991
+ if (!byGpu[slot.gpuIndex])
1992
+ byGpu[slot.gpuIndex] = { inUse: 0, capacity: this.perGpuSlotCapacity(slot.gpuIndex), loadedMB: 0 };
1993
+ byGpu[slot.gpuIndex].inUse += 1;
1994
+ }
1995
+ }
1996
+ for (const m2 of this._loaded.values()) {
1997
+ if (m2.gpuIndex !== null && m2.gpuIndex !== void 0) {
1998
+ if (!byGpu[m2.gpuIndex])
1999
+ byGpu[m2.gpuIndex] = { inUse: 0, capacity: this.perGpuSlotCapacity(m2.gpuIndex), loadedMB: 0 };
2000
+ byGpu[m2.gpuIndex].loadedMB += m2.vramMB;
2001
+ }
1911
2002
  }
1912
2003
  for (const [model, tp] of this._throughput) {
1913
2004
  if (!byModel[model])
@@ -1915,23 +2006,46 @@ var init_model_broker = __esm({
1915
2006
  byModel[model].tokensPerSec = tp.tokensPerSec;
1916
2007
  byModel[model].samples = tp.samples;
1917
2008
  }
2009
+ for (const d2 of this._vramByDevice) {
2010
+ if (!byGpu[d2.index])
2011
+ byGpu[d2.index] = { inUse: 0, capacity: this.perGpuSlotCapacity(d2.index), loadedMB: 0 };
2012
+ }
1918
2013
  return {
1919
2014
  inUse: this._activeSlots.size,
1920
2015
  capacity: this.slotCapacity,
1921
2016
  queueDepth: this._slotQueue.length,
1922
2017
  queueCapacity: this.queueCapacity,
1923
- byModel
2018
+ byModel,
2019
+ byGpu
1924
2020
  };
1925
2021
  }
2022
+ /** Per-GPU slot capacity. Returns the override when set, else ceil(slotCapacity / deviceCount). */
2023
+ perGpuSlotCapacity(gpuIndex) {
2024
+ const override = this._perGpuSlotCapacity.get(gpuIndex);
2025
+ if (override !== void 0)
2026
+ return override;
2027
+ const n2 = Math.max(1, this._vramByDevice.length);
2028
+ return Math.max(1, Math.ceil(this.slotCapacity / n2));
2029
+ }
1926
2030
  async checkPressure(snap) {
1927
2031
  if (snap.ramMB.free < this.ramHeadroomMB) {
1928
2032
  this.emit("pressure", "ram", snap.ramMB.free, this.ramHeadroomMB);
1929
2033
  }
1930
- const v = await vramSnapshotMB();
1931
- if (v) {
1932
- snap.vramMB = v;
1933
- if (v.free < this.vramHeadroomMB) {
1934
- this.emit("pressure", "vram", v.free, this.vramHeadroomMB);
2034
+ const devices = await vramSnapshotPerDevice();
2035
+ this._vramByDevice = devices;
2036
+ if (devices.length > 0) {
2037
+ let total = 0, used = 0, free = 0;
2038
+ for (const d2 of devices) {
2039
+ total += d2.total;
2040
+ used += d2.used;
2041
+ free += d2.free;
2042
+ }
2043
+ snap.vramMB = { total, used, free };
2044
+ snap.vramPerDevice = devices;
2045
+ for (const d2 of devices) {
2046
+ if (d2.free < this.vramHeadroomMB) {
2047
+ this.emit("pressure", "vram", d2.free, this.vramHeadroomMB);
2048
+ }
1935
2049
  }
1936
2050
  }
1937
2051
  const queueThreshold = Math.floor(this.queueCapacity * 0.8);
@@ -1960,23 +2074,54 @@ var init_model_broker = __esm({
1960
2074
  * upstream callers (e.g. Telegram poll loop) should slow ingress.
1961
2075
  */
1962
2076
  acquireInferenceSlot(spec) {
1963
- if (this._activeSlots.size < this.slotCapacity) {
2077
+ const chosenGpu = this.pickGpuForSlot(spec);
2078
+ const gpuOk = chosenGpu === null || this.activeSlotsOnGpu(chosenGpu) < this.perGpuSlotCapacity(chosenGpu);
2079
+ if (gpuOk && this._activeSlots.size < this.slotCapacity) {
1964
2080
  return Promise.resolve(this.admitSlot(
1965
2081
  spec,
1966
2082
  /*reserved*/
1967
- false
2083
+ false,
2084
+ chosenGpu
1968
2085
  ));
1969
2086
  }
1970
2087
  if (spec.sessionKey && !this._reservedBySession.has(spec.sessionKey) && this._activeSlots.size < this.slotCapacity + 1) {
1971
2088
  const slot = this.admitSlot(
1972
2089
  spec,
1973
2090
  /*reserved*/
1974
- true
2091
+ true,
2092
+ chosenGpu
1975
2093
  );
1976
2094
  this._reservedBySession.set(spec.sessionKey, slot.info.id);
1977
2095
  return Promise.resolve(slot);
1978
2096
  }
1979
2097
  return new Promise((resolve55, reject) => {
2098
+ if (this._slotQueue.length >= this.queueCapacity) {
2099
+ const newPrio = spec.priority ?? 0;
2100
+ let victim = -1;
2101
+ let victimPrio = Infinity;
2102
+ for (let i2 = this._slotQueue.length - 1; i2 >= 0; i2--) {
2103
+ const p2 = this._slotQueue[i2].spec.priority ?? 0;
2104
+ if (p2 < victimPrio) {
2105
+ victimPrio = p2;
2106
+ victim = i2;
2107
+ }
2108
+ if (victimPrio === 0)
2109
+ break;
2110
+ }
2111
+ if (victim >= 0 && victimPrio < newPrio) {
2112
+ const dropped = this._slotQueue.splice(victim, 1)[0];
2113
+ if (dropped.onSignalAbort && dropped.spec.signal) {
2114
+ dropped.spec.signal.removeEventListener("abort", dropped.onSignalAbort);
2115
+ }
2116
+ try {
2117
+ dropped.reject(new Error("broker queue shed: capacity reached, lower-priority entry displaced"));
2118
+ } catch {
2119
+ }
2120
+ } else {
2121
+ reject(new Error(`broker queue full (capacity=${this.queueCapacity}); caller priority ${newPrio} insufficient to displace`));
2122
+ return;
2123
+ }
2124
+ }
1980
2125
  const entry = { spec, resolve: resolve55, reject, enqueuedAt: Date.now() };
1981
2126
  if (spec.signal) {
1982
2127
  const onAbort = () => {
@@ -2011,7 +2156,7 @@ var init_model_broker = __esm({
2011
2156
  });
2012
2157
  }
2013
2158
  /** Admit a slot — internal, called from acquire fast path and from drainQueue. */
2014
- admitSlot(spec, reserved) {
2159
+ admitSlot(spec, reserved, gpuIndex = null) {
2015
2160
  const id = `slot-${++this._slotIdSeq}-${Date.now().toString(36)}`;
2016
2161
  const info = {
2017
2162
  id,
@@ -2021,7 +2166,8 @@ var init_model_broker = __esm({
2021
2166
  sessionKey: spec.sessionKey,
2022
2167
  acquiredAt: Date.now(),
2023
2168
  promptTokens: spec.promptTokens ?? 0,
2024
- reserved
2169
+ reserved,
2170
+ gpuIndex
2025
2171
  };
2026
2172
  this._activeSlots.set(id, info);
2027
2173
  this.emit("slotAcquired", info);
@@ -2037,6 +2183,35 @@ var init_model_broker = __esm({
2037
2183
  }
2038
2184
  };
2039
2185
  }
2186
+ /** Count of active slots pinned to a given GPU. */
2187
+ activeSlotsOnGpu(gpuIndex) {
2188
+ let n2 = 0;
2189
+ for (const s2 of this._activeSlots.values()) {
2190
+ if (s2.gpuIndex === gpuIndex)
2191
+ n2++;
2192
+ }
2193
+ return n2;
2194
+ }
2195
+ /**
2196
+ * Pick a GPU for a new inference slot. Honors caller's preferredGpuIndex
2197
+ * when set; otherwise picks the GPU with the highest free VRAM that has
2198
+ * room for the estimated footprint and an open per-device slot.
2199
+ *
2200
+ * Returns null when no GPU is detected (CPU-only) or no device fits — in
2201
+ * the latter case the slot is admitted unpinned and the underlying
2202
+ * subprocess will pick whatever CUDA exposes by default.
2203
+ */
2204
+ pickGpuForSlot(spec) {
2205
+ if (this._vramByDevice.length === 0)
2206
+ return null;
2207
+ const candidates = this._vramByDevice.filter((d2) => spec.preferredGpuIndex === void 0 || d2.index === spec.preferredGpuIndex).filter((d2) => this.activeSlotsOnGpu(d2.index) < this.perGpuSlotCapacity(d2.index)).filter((d2) => spec.estimatedVramMB === void 0 || d2.free >= spec.estimatedVramMB).sort((a2, b) => b.free - a2.free);
2208
+ return candidates[0]?.index ?? null;
2209
+ }
2210
+ /** Configure per-GPU slot capacity. Overrides the slotCapacity-derived default. */
2211
+ setPerGpuSlotCapacity(gpuIndex, capacity) {
2212
+ this._perGpuSlotCapacity.set(gpuIndex, Math.max(1, Math.floor(capacity)));
2213
+ this.drainSlotQueue();
2214
+ }
2040
2215
  releaseSlot(info, outcome) {
2041
2216
  this._activeSlots.delete(info.id);
2042
2217
  if (info.sessionKey && this._reservedBySession.get(info.sessionKey) === info.id) {
@@ -2062,8 +2237,18 @@ var init_model_broker = __esm({
2062
2237
  this.drainSlotQueue();
2063
2238
  }
2064
2239
  drainSlotQueue() {
2065
- while (this._slotQueue.length > 0 && this._activeSlots.size < this.slotCapacity) {
2066
- const entry = this._slotQueue.shift();
2240
+ const queueCopy = [...this._slotQueue];
2241
+ for (const entry of queueCopy) {
2242
+ if (this._activeSlots.size >= this.slotCapacity)
2243
+ break;
2244
+ const chosenGpu = this.pickGpuForSlot(entry.spec);
2245
+ const gpuOk = chosenGpu === null || this.activeSlotsOnGpu(chosenGpu) < this.perGpuSlotCapacity(chosenGpu);
2246
+ if (!gpuOk)
2247
+ continue;
2248
+ const idx = this._slotQueue.indexOf(entry);
2249
+ if (idx < 0)
2250
+ continue;
2251
+ this._slotQueue.splice(idx, 1);
2067
2252
  if (entry.onSignalAbort && entry.spec.signal) {
2068
2253
  entry.spec.signal.removeEventListener("abort", entry.onSignalAbort);
2069
2254
  }
@@ -2077,7 +2262,8 @@ var init_model_broker = __esm({
2077
2262
  const slot = this.admitSlot(
2078
2263
  entry.spec,
2079
2264
  /*reserved*/
2080
- false
2265
+ false,
2266
+ chosenGpu
2081
2267
  );
2082
2268
  try {
2083
2269
  entry.resolve(slot);
@@ -19581,26 +19767,16 @@ function extractSkillForQuery(skill, content, query, budgetTokens = 900) {
19581
19767
  function buildSkillsSummary(skills) {
19582
19768
  if (skills.length === 0)
19583
19769
  return "";
19584
- const lines = [
19585
- "## Skills Index",
19586
- "",
19587
- `${skills.length} skills available. Call \`skill_list\` to search, \`skill_execute <name>\` to load full instructions.`,
19588
- ""
19589
- ];
19590
19770
  const bySource = /* @__PURE__ */ new Map();
19591
19771
  for (const s2 of skills) {
19592
- const group = bySource.get(s2.source) ?? [];
19593
- group.push(s2);
19594
- bySource.set(s2.source, group);
19595
- }
19596
- for (const [source, group] of bySource) {
19597
- const names = group.map((s2) => {
19598
- const t2 = s2.triggers[0];
19599
- return t2 ? `${s2.name}(${t2})` : s2.name;
19600
- });
19601
- lines.push(`**${source}** (${group.length}): ${names.join(", ")}`);
19772
+ bySource.set(s2.source, (bySource.get(s2.source) ?? 0) + 1);
19602
19773
  }
19603
- return lines.join("\n");
19774
+ const sourcesSummary = [...bySource.entries()].sort((a2, b) => b[1] - a2[1]).map(([source, count]) => `${source}=${count}`).join(", ");
19775
+ return [
19776
+ "## Skills Index",
19777
+ `${skills.length} skills available across ${bySource.size} sources (${sourcesSummary}).`,
19778
+ "Use `skill_list` (with optional `filter` or `source`) to search; `skill_execute <name>` to load full instructions."
19779
+ ].join("\n");
19604
19780
  }
19605
19781
  function safeReaddir2(dir, dirsOnly = false) {
19606
19782
  try {
@@ -255412,6 +255588,11 @@ import sys
255412
255588
  import time
255413
255589
  from pathlib import Path
255414
255590
 
255591
+ # Broker-picked GPU pinning — MUST run before importing torch.
255592
+ _omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
255593
+ if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
255594
+ os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
255595
+
255415
255596
  def _progress(stage, message, percent=None):
255416
255597
  payload = {"omnius_progress": True, "stage": stage, "message": message}
255417
255598
  if percent is not None:
@@ -255570,9 +255751,15 @@ if __name__ == "__main__":
255570
255751
  SDCPP_RUNNER = String.raw`#!/usr/bin/env python3
255571
255752
  import argparse
255572
255753
  import json
255754
+ import os
255573
255755
  import time
255574
255756
  from pathlib import Path
255575
255757
 
255758
+ # Broker-picked GPU pinning — sd-cpp's CUDA backend honors CUDA_VISIBLE_DEVICES.
255759
+ _omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
255760
+ if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
255761
+ os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
255762
+
255576
255763
  def main():
255577
255764
  parser = argparse.ArgumentParser()
255578
255765
  parser.add_argument("--model-path", required=True)
@@ -255686,6 +255873,9 @@ if __name__ == "__main__":
255686
255873
  defaultModel;
255687
255874
  defaultBackend;
255688
255875
  promptExpander = null;
255876
+ /** Broker-chosen GPU pinning for the in-flight generation. Read by the
255877
+ * spawn path to set OMNIUS_GPU_INDEX in the subprocess env. */
255878
+ _brokerGpuIndex = null;
255689
255879
  constructor(cwd4, ollamaUrl = "http://localhost:11434", defaults3 = {}) {
255690
255880
  this.cwd = cwd4;
255691
255881
  this.ollamaUrl = ollamaUrl.replace(/\/v1\/?$/, "").replace(/\/$/, "");
@@ -255761,6 +255951,7 @@ if __name__ == "__main__":
255761
255951
  const candidates = imageGenerationFallbackCandidates(requestedModel, requestedBackend, generationFallbackEnabled(args));
255762
255952
  const broker = getModelBroker();
255763
255953
  const firstCandidate = candidates[0];
255954
+ let brokerGpuIndex = null;
255764
255955
  if (firstCandidate) {
255765
255956
  const decision2 = await broker.ensureModelLoadable({
255766
255957
  name: firstCandidate.model,
@@ -255772,6 +255963,9 @@ if __name__ == "__main__":
255772
255963
  for (const target of decision2.evictTargets) {
255773
255964
  await broker.evict(target.host, target.name, "image-gen-needs-room");
255774
255965
  }
255966
+ brokerGpuIndex = decision2.gpuIndex ?? null;
255967
+ } else if (decision2.kind === "ok") {
255968
+ brokerGpuIndex = decision2.gpuIndex ?? null;
255775
255969
  } else if (decision2.kind === "reject") {
255776
255970
  return {
255777
255971
  success: false,
@@ -255781,6 +255975,7 @@ if __name__ == "__main__":
255781
255975
  };
255782
255976
  }
255783
255977
  }
255978
+ this._brokerGpuIndex = brokerGpuIndex;
255784
255979
  try {
255785
255980
  return await this.generateCandidateLadder({ candidates, prompt, args, seed, start: start2 });
255786
255981
  } catch (err) {
@@ -256283,10 +256478,14 @@ ${errText.slice(0, 800)}`,
256283
256478
  }
256284
256479
  ensureUnifiedCacheDirs();
256285
256480
  this.emitProgress({ stage: "load", message: `Starting image generation with ${args.model}` });
256481
+ const runnerEnv = { ...python.env };
256482
+ if (this._brokerGpuIndex !== null) {
256483
+ runnerEnv["OMNIUS_GPU_INDEX"] = String(this._brokerGpuIndex);
256484
+ }
256286
256485
  const result = await runProcess2(python.command, argv, {
256287
256486
  cwd: this.cwd,
256288
256487
  timeoutMs: 9e5,
256289
- env: python.env,
256488
+ env: runnerEnv,
256290
256489
  progressLabel: `Downloading/loading ${args.model}`,
256291
256490
  onProgress: (event) => this.emitProgress(event)
256292
256491
  });
@@ -257582,9 +257781,14 @@ var init_audio_generate = __esm({
257582
257781
  DEFAULT_MUSIC_MODEL
257583
257782
  ];
257584
257783
  DIFFUSERS_AUDIO_RUNNER = String.raw`#!/usr/bin/env python3
257585
- import argparse, json, sys, time
257784
+ import argparse, json, os, sys, time
257586
257785
  from pathlib import Path
257587
257786
 
257787
+ # Broker-picked GPU pinning — must run before importing torch.
257788
+ _omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
257789
+ if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
257790
+ os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
257791
+
257588
257792
  def _format_bytes(value):
257589
257793
  try:
257590
257794
  n = float(value)
@@ -257778,9 +257982,14 @@ if __name__ == "__main__":
257778
257982
  main()
257779
257983
  `;
257780
257984
  TRANSFORMERS_AUDIO_RUNNER = String.raw`#!/usr/bin/env python3
257781
- import argparse, json, sys, time
257985
+ import argparse, json, os, sys, time
257782
257986
  from pathlib import Path
257783
257987
 
257988
+ # Broker-picked GPU pinning — must run before importing torch.
257989
+ _omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
257990
+ if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
257991
+ os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
257992
+
257784
257993
  def _format_bytes(value):
257785
257994
  try:
257786
257995
  n = float(value)
@@ -258006,6 +258215,8 @@ if __name__ == "__main__":
258006
258215
  progressHandler = null;
258007
258216
  lastProgressMessage = "";
258008
258217
  lastProgressAt = 0;
258218
+ /** Broker-chosen GPU pinning for the in-flight generation. */
258219
+ _brokerGpuIndex = null;
258009
258220
  constructor(cwd4, defaults3 = {}) {
258010
258221
  this.cwd = cwd4;
258011
258222
  this.defaults = defaults3;
@@ -258171,6 +258382,7 @@ if __name__ == "__main__":
258171
258382
  const playback = playbackRequested(args);
258172
258383
  const broker = getModelBroker();
258173
258384
  const firstCandidate = candidates[0];
258385
+ let brokerGpuIndex = null;
258174
258386
  if (firstCandidate) {
258175
258387
  const decision2 = await broker.ensureModelLoadable({
258176
258388
  name: firstCandidate.model,
@@ -258182,6 +258394,9 @@ if __name__ == "__main__":
258182
258394
  for (const target of decision2.evictTargets) {
258183
258395
  await broker.evict(target.host, target.name, `${kind}-gen-needs-room`);
258184
258396
  }
258397
+ brokerGpuIndex = decision2.gpuIndex ?? null;
258398
+ } else if (decision2.kind === "ok") {
258399
+ brokerGpuIndex = decision2.gpuIndex ?? null;
258185
258400
  } else if (decision2.kind === "reject") {
258186
258401
  return {
258187
258402
  success: false,
@@ -258191,6 +258406,7 @@ if __name__ == "__main__":
258191
258406
  };
258192
258407
  }
258193
258408
  }
258409
+ this._brokerGpuIndex = brokerGpuIndex;
258194
258410
  try {
258195
258411
  return await this.generateCandidateLadder({ kind, candidates, prompt, args, seed, playback, start: start2 });
258196
258412
  } catch (err) {
@@ -258357,10 +258573,14 @@ if __name__ == "__main__":
258357
258573
  }
258358
258574
  ensureUnifiedCacheDirs();
258359
258575
  this.emitProgress({ stage: "load", message: `Starting ${args.kind} generation with ${args.model}` });
258576
+ const runnerEnv = { ...python.env };
258577
+ if (this._brokerGpuIndex !== null) {
258578
+ runnerEnv["OMNIUS_GPU_INDEX"] = String(this._brokerGpuIndex);
258579
+ }
258360
258580
  const result = await runProcess3(python.command, argv, {
258361
258581
  cwd: this.cwd,
258362
258582
  timeoutMs: 9e5,
258363
- env: python.env,
258583
+ env: runnerEnv,
258364
258584
  progressLabel: `Downloading/loading ${args.model}`,
258365
258585
  onProgress: (event) => this.emitProgress(event)
258366
258586
  });
@@ -259130,7 +259350,7 @@ function parseRunnerJson3(stdout) {
259130
259350
  }
259131
259351
  return null;
259132
259352
  }
259133
- var DEFAULT_DIFFUSERS_VIDEO_MODEL, SANA_VIDEO_480P_MODEL, SANA_VIDEO_720P_MODEL, WAN_TI2V_5B_MODEL, WAN_T2V_A14B_MODEL, WAN_I2V_A14B_MODEL, WAN_S2V_14B_MODEL, COGVIDEOX_5B_MODEL, COGVIDEOX_2B_MODEL, COGVIDEOX_5B_I2V_MODEL, MOCHI_PREVIEW_MODEL, LTX_VIDEO_MODEL, LTX_2_3_MODEL, HUNYUAN_VIDEO_MODEL, DIFFUSERS_VIDEO_PACKAGES, VIDEO_GENERATION_MODEL_PRESETS, VIDEO_GENERATION_QUALITY_LADDER, VIDEO_AUDIO_QUALITY_LADDER, DIFFUSERS_VIDEO_RUNNER, COMFY_BOOTSTRAP_SCRIPT, COMFY_DEFAULT_WORKFLOWS, VideoGenerateTool;
259353
+ var DEFAULT_DIFFUSERS_VIDEO_MODEL, SANA_VIDEO_480P_MODEL, SANA_VIDEO_720P_MODEL, SANA_WM_BIDIRECTIONAL_MODEL, WAN_TI2V_5B_MODEL, WAN_T2V_A14B_MODEL, WAN_I2V_A14B_MODEL, WAN_S2V_14B_MODEL, COGVIDEOX_5B_MODEL, COGVIDEOX_2B_MODEL, COGVIDEOX_5B_I2V_MODEL, MOCHI_PREVIEW_MODEL, LTX_VIDEO_MODEL, LTX_2_3_MODEL, HUNYUAN_VIDEO_MODEL, DIFFUSERS_VIDEO_PACKAGES, VIDEO_GENERATION_MODEL_PRESETS, VIDEO_GENERATION_QUALITY_LADDER, VIDEO_AUDIO_QUALITY_LADDER, DIFFUSERS_VIDEO_RUNNER, COMFY_BOOTSTRAP_SCRIPT, COMFY_DEFAULT_WORKFLOWS, VideoGenerateTool;
259134
259354
  var init_video_generate = __esm({
259135
259355
  "packages/execution/dist/tools/video-generate.js"() {
259136
259356
  "use strict";
@@ -259140,6 +259360,7 @@ var init_video_generate = __esm({
259140
259360
  DEFAULT_DIFFUSERS_VIDEO_MODEL = "Efficient-Large-Model/SANA-Video_2B_480p";
259141
259361
  SANA_VIDEO_480P_MODEL = "Efficient-Large-Model/SANA-Video_2B_480p";
259142
259362
  SANA_VIDEO_720P_MODEL = "Efficient-Large-Model/SANA-Video_2B_720p";
259363
+ SANA_WM_BIDIRECTIONAL_MODEL = "Efficient-Large-Model/SANA-WM_bidirectional";
259143
259364
  WAN_TI2V_5B_MODEL = "Wan-AI/Wan2.2-TI2V-5B-Diffusers";
259144
259365
  WAN_T2V_A14B_MODEL = "Wan-AI/Wan2.2-T2V-A14B-Diffusers";
259145
259366
  WAN_I2V_A14B_MODEL = "Wan-AI/Wan2.2-I2V-A14B-Diffusers";
@@ -259433,6 +259654,41 @@ var init_video_generate = __esm({
259433
259654
  licenseNote: "Apache 2.0",
259434
259655
  note: "Premium Wan T2V; cloud GPU recommended."
259435
259656
  },
259657
+ {
259658
+ id: SANA_WM_BIDIRECTIONAL_MODEL,
259659
+ label: "SANA-WM bidirectional (world-model i2v)",
259660
+ kinds: ["i2v"],
259661
+ backend: "diffusers",
259662
+ // SANA-WM declares its concrete class in model_index.json; loaded via
259663
+ // generic DiffusionPipeline.from_pretrained — the runner's auto path
259664
+ // already does this for unknown model names.
259665
+ pipelineClass: "DiffusionPipeline",
259666
+ install: 'python3 .omnius/video-gen/diffusers_text2video.py --model Efficient-Large-Model/SANA-WM_bidirectional --mode i2v --num-frames 121 --fps 24 --width 704 --height 1280 --steps 30 --guidance 5.0 --image <input.png> --prompt "..." --output .omnius/videos/out.mp4',
259667
+ category: "Premium quality",
259668
+ sizeClass: "2.6B DiT + LTX-2 refiner (Sana World Model)",
259669
+ quality: "Image-to-video world model with optional camera-trajectory control. Two-stage generation (Sana DiT + LTX-2 refiner); hybrid linear attention; 6-DoF camera support via .npy matrices or WASD/IJKL action DSL.",
259670
+ output: "Up to ~13s 704×1280 (portrait 720p) MP4 at 24 fps; max 321 frames.",
259671
+ bestUse: "World-model / camera-controlled video from a single first-frame image. Best on H100/A100-class hardware.",
259672
+ minVramGB: 80,
259673
+ recommendedVramGB: 100,
259674
+ deployment: "Diffusers DiffusionPipeline.from_pretrained; bfloat16; aggressive CPU offload mandatory below 100 GB. Bundled LTX-2 refiner runs as stage 2.",
259675
+ steps: 30,
259676
+ guidance: 5,
259677
+ numFrames: 121,
259678
+ fps: 24,
259679
+ width: 704,
259680
+ height: 1280,
259681
+ dtype: "bfloat16",
259682
+ needsCpuOffload: true,
259683
+ frameQuantum: 1,
259684
+ pixelQuantum: 16,
259685
+ // Apache 2.0 base; bundled LTX-2 refiner + VAE inherit the LTX-2
259686
+ // non-commercial license. Surface that explicitly.
259687
+ licenseNote: "Apache 2.0 (bundled LTX-2 refiner/VAE inherit LTX-2 non-commercial terms)",
259688
+ approxDownloadGB: 99,
259689
+ fallbackFor: [WAN_I2V_A14B_MODEL],
259690
+ note: "Sana World Model bidirectional i2v; portrait 704×1280 fixed; camera control via --camera <matrices.npy> or --action <DSL> when the runner supports it."
259691
+ },
259436
259692
  {
259437
259693
  id: WAN_I2V_A14B_MODEL,
259438
259694
  label: "Wan2.2 I2V A14B",
@@ -259561,6 +259817,9 @@ var init_video_generate = __esm({
259561
259817
  COGVIDEOX_5B_MODEL,
259562
259818
  MOCHI_PREVIEW_MODEL,
259563
259819
  COGVIDEOX_2B_MODEL,
259820
+ // Heavy i2v / world-model tier — only attempted when an explicit model
259821
+ // is requested or the consumer-VRAM tier above has failed for an i2v ask.
259822
+ SANA_WM_BIDIRECTIONAL_MODEL,
259564
259823
  WAN_I2V_A14B_MODEL,
259565
259824
  WAN_T2V_A14B_MODEL,
259566
259825
  HUNYUAN_VIDEO_MODEL
@@ -259579,6 +259838,16 @@ import sys
259579
259838
  import time
259580
259839
  from pathlib import Path
259581
259840
 
259841
+ # ── GPU pinning ─────────────────────────────────────────────────────
259842
+ # The TS broker picks a GPU per generation via bin-packing across the
259843
+ # available CUDA devices. It passes the chosen index in OMNIUS_GPU_INDEX.
259844
+ # We MUST apply CUDA_VISIBLE_DEVICES BEFORE importing torch, otherwise
259845
+ # torch initializes the device list with all visible GPUs and the model
259846
+ # may land on a different device than the broker reserved capacity on.
259847
+ _omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
259848
+ if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
259849
+ os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
259850
+
259582
259851
  def _progress(stage, message, percent=None):
259583
259852
  payload = {"omnius_progress": True, "stage": stage, "message": message}
259584
259853
  if percent is not None:
@@ -260385,6 +260654,9 @@ if __name__ == "__main__":
260385
260654
  defaultBackend;
260386
260655
  defaultKind;
260387
260656
  promptExpander = null;
260657
+ /** GPU index chosen by the broker for the in-flight generation. Read
260658
+ * by the spawn path to set OMNIUS_GPU_INDEX in the subprocess env. */
260659
+ _brokerGpuIndex = null;
260388
260660
  constructor(cwd4, defaults3 = {}) {
260389
260661
  this.cwd = cwd4;
260390
260662
  this.defaultModel = defaults3.model;
@@ -260474,17 +260746,23 @@ if __name__ == "__main__":
260474
260746
  const candidates = videoGenerationFallbackCandidates(requestedModel, requestedBackend, inferredKind, generationFallbackEnabled3(args), { preferNativeAudioVideo: withAudio || Boolean(audioInput) });
260475
260747
  const broker = getModelBroker();
260476
260748
  const firstCandidate = candidates[0];
260749
+ let brokerGpuIndex = null;
260477
260750
  if (firstCandidate) {
260751
+ const preset = firstCandidate.preset;
260478
260752
  const decision2 = await broker.ensureModelLoadable({
260479
260753
  name: firstCandidate.model,
260480
260754
  domain: "video-gen",
260481
260755
  host: firstCandidate.backend === "comfyui" ? "comfyui" : "diffusers-py",
260482
- owner: "video-generate-tool"
260756
+ owner: "video-generate-tool",
260757
+ estimatedVramMB: preset ? preset.minVramGB * 1024 : void 0
260483
260758
  });
260484
260759
  if (decision2.kind === "evict") {
260485
260760
  for (const target of decision2.evictTargets) {
260486
260761
  await broker.evict(target.host, target.name, "video-gen-needs-room");
260487
260762
  }
260763
+ brokerGpuIndex = decision2.gpuIndex ?? null;
260764
+ } else if (decision2.kind === "ok") {
260765
+ brokerGpuIndex = decision2.gpuIndex ?? null;
260488
260766
  } else if (decision2.kind === "reject") {
260489
260767
  return {
260490
260768
  success: false,
@@ -260494,6 +260772,7 @@ if __name__ == "__main__":
260494
260772
  };
260495
260773
  }
260496
260774
  }
260775
+ this._brokerGpuIndex = brokerGpuIndex;
260497
260776
  if (candidates.length === 0) {
260498
260777
  return {
260499
260778
  success: false,
@@ -260915,6 +261194,9 @@ ${llmAnnotation}` : result.llmContent;
260915
261194
  runnerEnv["HF_TOKEN"] = effectiveToken;
260916
261195
  runnerEnv["HUGGING_FACE_HUB_TOKEN"] = effectiveToken;
260917
261196
  }
261197
+ if (this._brokerGpuIndex !== null) {
261198
+ runnerEnv["OMNIUS_GPU_INDEX"] = String(this._brokerGpuIndex);
261199
+ }
260918
261200
  const argv = [
260919
261201
  runner,
260920
261202
  "--model",
@@ -570264,18 +570546,6 @@ function formatReflection(notes2, scenario) {
570264
570546
  ];
570265
570547
  return lines.join("\n");
570266
570548
  }
570267
- function formatMemory(input, state) {
570268
- const lines = [];
570269
- if (input.memoryContext) lines.push(input.memoryContext);
570270
- if (state.dynamicState && Object.keys(state.dynamicState).length > 0) {
570271
- const entries = Object.entries(state.dynamicState).slice(0, 12).map(([key, value2]) => `- ${key}: ${compactText(JSON.stringify(value2) ?? String(value2), 220)}`);
570272
- lines.push(`Dynamic state:
570273
- ${entries.join("\n")}`);
570274
- }
570275
- if (state.updatedAt) lines.push(`State updated: ${state.updatedAt}`);
570276
- if (lines.length === 0) return "No additional retrieved voice-soul memory beyond scoped personality and runtime state.";
570277
- return lines.join("\n\n");
570278
- }
570279
570549
  function formatFinalVoice(input) {
570280
570550
  const voice = findProjectVoice(input.scope);
570281
570551
  const lines = [
@@ -570302,23 +570572,23 @@ function buildSoulContext(input) {
570302
570572
  const state = loadSoulRuntimeState(input);
570303
570573
  const scenario = resolveSoulScenario(input, state);
570304
570574
  const tree2 = resolveSoulDecisionTree(input, state, scenario);
570305
- return [
570306
- "## Voice Soul Context",
570307
- "### 1. Authority And Safety Scope",
570575
+ const sections = ["## Voice Soul Context"];
570576
+ const voiceAndScope = [
570308
570577
  formatAuthorityScope(input),
570309
- "### 2. Core Identity",
570310
570578
  formatCoreIdentity(input),
570311
- "### 3. Procedural Decision Tree",
570312
- formatProceduralConstraints(input, scenario, tree2, state),
570313
- "### 4. Relationship State",
570314
- formatRelationshipState(input),
570315
- "### 5. Current Reflection Notes",
570316
- formatReflection(input.currentReflection, scenario),
570317
- "### 6. Minimal Retrieved Memory",
570318
- formatMemory(input, state),
570319
- "### 7. Final Voice Guidance",
570320
570579
  formatFinalVoice(input)
570321
- ].join("\n\n");
570580
+ ].filter(Boolean).join("\n\n");
570581
+ sections.push("### Voice + Scope + Identity", voiceAndScope);
570582
+ const decisionSubstrate = [
570583
+ formatRelationshipState(input),
570584
+ formatProceduralConstraints(input, scenario, tree2, state)
570585
+ ].filter(Boolean).join("\n\n");
570586
+ sections.push("### Active Relationship + Scenario", decisionSubstrate);
570587
+ const reflection = formatReflection(input.currentReflection, scenario);
570588
+ if (reflection && reflection.trim().length > 0) {
570589
+ sections.push("### Current Reflection Notes", reflection);
570590
+ }
570591
+ return sections.join("\n\n");
570322
570592
  }
570323
570593
  var MAX_SOUL_CHARS, MAX_VOICE_CHARS, MAX_SCOPED_PERSONALITY_CHARS, UNCLASSIFIED_SCENARIO;
570324
570594
  var init_voice_soul = __esm({
@@ -577249,7 +577519,32 @@ var init_status_bar = __esm({
577249
577519
  if (this.active) this.renderFooterPreserveCursor();
577250
577520
  }, intervalMs);
577251
577521
  try {
577252
- getModelBroker().startPolling(Math.max(2e3, intervalMs * 2));
577522
+ const broker = getModelBroker();
577523
+ try {
577524
+ Promise.resolve().then(() => (init_dist8(), dist_exports3)).then(({ getOllamaPool: getOllamaPool2, resolveDefaultPoolConfig: resolveDefaultPoolConfig2 }) => {
577525
+ try {
577526
+ const config = resolveDefaultPoolConfig2();
577527
+ const pool3 = getOllamaPool2({ baseInstanceUrl: config.baseInstanceUrl });
577528
+ broker.setOllamaAffinityProvider((modelName) => {
577529
+ try {
577530
+ const status = pool3.status?.();
577531
+ if (!status) return null;
577532
+ for (const inst of status.instances ?? []) {
577533
+ void modelName;
577534
+ return { gpuIndex: inst.gpuIndex, gpuUuid: inst.gpuUuid };
577535
+ }
577536
+ return null;
577537
+ } catch {
577538
+ return null;
577539
+ }
577540
+ });
577541
+ } catch {
577542
+ }
577543
+ }).catch(() => {
577544
+ });
577545
+ } catch {
577546
+ }
577547
+ broker.startPolling(Math.max(2e3, intervalMs * 2));
577253
577548
  } catch {
577254
577549
  }
577255
577550
  }
@@ -604352,14 +604647,22 @@ async function handleBroker(arg, _ctx) {
604352
604647
  safeLog(` ${c3.bold("Resource Broker")}`);
604353
604648
  safeLog("");
604354
604649
  safeLog(` ${c3.dim("RAM:")} ${snap.ramMB.used} / ${snap.ramMB.total} MB used (${snap.ramMB.free} MB free)`);
604355
- if (snap.vramMB) {
604650
+ if (snap.vramPerDevice.length > 0) {
604651
+ safeLog(` ${c3.bold("GPUs:")}`);
604652
+ for (const d2 of snap.vramPerDevice) {
604653
+ const gpuSlots = snap.slots.byGpu[d2.index];
604654
+ const slotInfo = gpuSlots ? ` slots=${gpuSlots.inUse}/${gpuSlots.capacity}, loaded=${gpuSlots.loadedMB}MB` : "";
604655
+ safeLog(` gpu${d2.index} (${d2.uuid.slice(0, 12)}…) ${d2.used} / ${d2.total} MB used (${d2.free} MB free)${slotInfo}`);
604656
+ }
604657
+ } else if (snap.vramMB) {
604356
604658
  safeLog(` ${c3.dim("VRAM:")} ${snap.vramMB.used} / ${snap.vramMB.total} MB used (${snap.vramMB.free} MB free)`);
604357
604659
  } else {
604358
604660
  safeLog(` ${c3.dim("VRAM:")} ${c3.dim("(no GPU detected)")}`);
604359
604661
  }
604360
604662
  safeLog(` ${c3.dim("RAM headroom threshold:")} ${broker.ramHeadroomMB} MB`);
604361
- safeLog(` ${c3.dim("VRAM headroom threshold:")} ${broker.vramHeadroomMB} MB`);
604663
+ safeLog(` ${c3.dim("VRAM headroom threshold:")} ${broker.vramHeadroomMB} MB (per-device)`);
604362
604664
  safeLog(` ${c3.dim("Idle-evict threshold:")} ${Math.round(broker.idleEvictMs / 1e3)}s`);
604665
+ safeLog(` ${c3.dim("Slot capacity:")} ${snap.slots.inUse}/${snap.slots.capacity} active, queue ${snap.slots.queueDepth}/${snap.slots.queueCapacity}`);
604363
604666
  safeLog("");
604364
604667
  if (snap.loaded.length === 0) {
604365
604668
  safeLog(` ${c3.dim("No loaded models tracked.")}`);
@@ -604370,7 +604673,8 @@ async function handleBroker(arg, _ctx) {
604370
604673
  const idle = Math.round((now - m2.lastUsedAt) / 1e3);
604371
604674
  const owner = m2.owner ? c3.dim(` [owner=${m2.owner}]`) : "";
604372
604675
  const ctx3 = m2.numCtx ? c3.dim(` n_ctx=${m2.numCtx}`) : "";
604373
- safeLog(` ${c3.cyan(m2.name)} (${m2.host}/${m2.domain}) vram=${m2.vramMB}MB ram=${m2.ramMB}MB idle=${idle}s${ctx3}${owner}`);
604676
+ const gpu = m2.gpuIndex !== null && m2.gpuIndex !== void 0 ? c3.dim(` gpu=${m2.gpuIndex}`) : "";
604677
+ safeLog(` ${c3.cyan(m2.name)} (${m2.host}/${m2.domain}) vram=${m2.vramMB}MB ram=${m2.ramMB}MB${gpu} idle=${idle}s${ctx3}${owner}`);
604374
604678
  }
604375
604679
  }
604376
604680
  if (snap.inflight.length > 0) {
@@ -618122,6 +618426,95 @@ function parseTelegramSilentReflectionNotes(text) {
618122
618426
  }
618123
618427
  return null;
618124
618428
  }
618429
/**
 * Best-effort extraction of the `reply` string from a Telegram chat-reply
 * buffer that may still be streaming (i.e. the JSON may be cut off anywhere).
 *
 * Behaviour:
 * - The buffer is first passed through `stripTelegramHiddenThinking`
 *   (defined elsewhere in this file) and left-trimmed.
 * - If the result does not start with `{` it is treated as plain text and
 *   returned as-is (or `null` when empty).
 * - Otherwise the first `"reply"` key is located and its string value is
 *   decoded character by character. Decoding stops at the closing quote
 *   (complete value) or at the end of the buffer (partial value).
 *
 * JSON escapes are decoded per RFC 8259: `\"`, `\\`, `\/`, `\b`, `\f`, `\n`,
 * `\r`, `\t` and `\uXXXX`. A `\uXXXX` escape whose four digits are not all
 * hexadecimal is dropped rather than decoded from a lenient prefix parse.
 * Unknown escapes keep the escaped character verbatim.
 *
 * @param {string} buffer2 Raw (possibly incomplete) model output.
 * @returns {string|null} The decoded reply text so far, or `null` when no
 *   reply text can be recovered yet.
 */
function extractPartialTelegramReplyJson(buffer2) {
  const stripped = stripTelegramHiddenThinking(buffer2).trimStart();
  if (!stripped.startsWith("{")) {
    return stripped || null;
  }

  const keyMatch = stripped.indexOf('"reply"');
  if (keyMatch < 0) return null;

  // Advance past the key, the colon, and any whitespace up to the opening quote.
  let i2 = keyMatch + '"reply"'.length;
  while (i2 < stripped.length && stripped[i2] !== ":") i2++;
  if (i2 >= stripped.length) return null;
  i2++;
  while (i2 < stripped.length && /\s/.test(stripped[i2])) i2++;
  if (i2 >= stripped.length || stripped[i2] !== '"') return null;
  i2++;

  // Single-character JSON escapes and the characters they stand for.
  const simpleEscapes = new Map([
    ['"', '"'],
    ["\\", "\\"],
    ["/", "/"],
    ["b", "\b"],
    ["f", "\f"],
    ["n", "\n"],
    ["r", "\r"],
    ["t", "\t"]
  ]);

  let out = "";
  while (i2 < stripped.length) {
    const ch = stripped[i2];
    if (ch === '"') {
      // Unescaped closing quote: the value is complete.
      return out;
    }
    if (ch !== "\\") {
      out += ch;
      i2++;
      continue;
    }
    const next = stripped[i2 + 1];
    // Buffer ends on a lone backslash: wait for more data.
    if (next === void 0) break;
    if (next === "u") {
      // Need all four hex digits before decoding; otherwise wait for more data.
      if (i2 + 5 >= stripped.length) break;
      const hex = stripped.slice(i2 + 2, i2 + 6);
      if (/^[0-9a-fA-F]{4}$/.test(hex)) {
        out += String.fromCharCode(parseInt(hex, 16));
      }
      i2 += 6;
      continue;
    }
    out += simpleEscapes.get(next) ?? next;
    i2 += 2;
  }
  return out.length > 0 ? out : null;
}
618475
/**
 * Extracts the final `reply` text from a finished Telegram chat-reply buffer.
 *
 * Strategy, in order:
 *   1. Parse the whole (thinking-stripped, trimmed) buffer as JSON.
 *   2. Parse only the first balanced top-level `{ ... }` object, which
 *      tolerates trailing text after the object.
 *   3. Fall back to `extractPartialTelegramReplyJson` for truncated output.
 *
 * @param {string} buffer2 Raw model output.
 * @returns {string|null} The trimmed reply, or `null` when the buffer is not
 *   a JSON object or no non-empty reply could be recovered by the fallback.
 */
function extractFinalTelegramReplyJson(buffer2) {
  const text = stripTelegramHiddenThinking(buffer2).trim();
  if (!text.startsWith("{")) return null;

  // Returns the trimmed `reply` string of a JSON document, or undefined when
  // the text does not parse or carries no string `reply`.
  const replyOf = (jsonText) => {
    try {
      const parsed = JSON.parse(jsonText);
      return typeof parsed.reply === "string" ? parsed.reply.trim() : void 0;
    } catch {
      return void 0;
    }
  };

  const whole = replyOf(text);
  if (whole !== void 0) return whole;

  // Find the index of the brace closing the first top-level object, ignoring
  // braces that appear inside string literals.
  let nesting = 0;
  let quoted = false;
  let skipNext = false;
  let closeAt = -1;
  let pos = 0;
  while (pos < text.length && closeAt < 0) {
    const ch = text[pos];
    if (skipNext) {
      skipNext = false;
    } else if (quoted) {
      if (ch === "\\") skipNext = true;
      else if (ch === '"') quoted = false;
    } else if (ch === '"') {
      quoted = true;
    } else if (ch === "{") {
      nesting += 1;
    } else if (ch === "}") {
      nesting -= 1;
      if (nesting === 0) closeAt = pos;
    }
    pos += 1;
  }

  if (closeAt > 0) {
    const leading = replyOf(text.slice(0, closeAt + 1));
    if (leading !== void 0) return leading;
  }

  const partial = extractPartialTelegramReplyJson(text);
  if (partial && partial.trim().length > 0) return partial.trim();
  return null;
}
618125
618518
  function estimatePromptTokensFromRequest(request) {
618126
618519
  let chars = 0;
618127
618520
  for (const m2 of request.messages ?? []) {
@@ -618138,6 +618531,32 @@ function estimatePromptTokensFromRequest(request) {
618138
618531
  }
618139
618532
  return Math.ceil(chars / 4);
618140
618533
  }
618534
/**
 * Heuristic: does `text` look like a router-decision JSON object that was cut
 * off before its closing brace?
 *
 * A leading `<think>...</think>` block is removed first. The remainder must
 * start with `{`, must end with at least one `{` still unclosed (braces inside
 * string literals are ignored), and must mention at least three of the known
 * router-decision keys.
 *
 * @param {unknown} text Candidate model output.
 * @returns {boolean} `true` when the text resembles a truncated router JSON.
 */
function isLikelyTruncatedRouterJson(text) {
  if (typeof text !== "string") return false;

  const body = text.replace(/^\s*<think>[\s\S]*?<\/think>\s*/i, "").trim();
  if (!body.startsWith("{")) return false;

  // Count unclosed braces outside of string literals.
  let unclosed = 0;
  let quoted = false;
  let skipNext = false;
  for (const ch of body) {
    if (skipNext) {
      skipNext = false;
    } else if (quoted) {
      if (ch === "\\") skipNext = true;
      else if (ch === '"') quoted = false;
    } else if (ch === '"') {
      quoted = true;
    } else if (ch === "{") {
      unclosed += 1;
    } else if (ch === "}") {
      unclosed -= 1;
    }
  }
  if (unclosed <= 0) return false;

  const routerKeys = [
    '"route"',
    '"should_reply"',
    '"confidence"',
    '"reason"',
    '"silent_disposition"',
    '"mental_note"'
  ];
  const hits = routerKeys.filter((key) => body.includes(key)).length;
  return hits >= 3;
}
618141
618560
  function telegramRouterTimeoutMs(configTimeoutMs, _minMs, _legacyMaxMs) {
618142
618561
  void _minMs;
618143
618562
  void _legacyMaxMs;
@@ -619583,7 +620002,7 @@ function renderTelegramSubAgentError(username, error) {
619583
620002
  process.stdout.write(` ${c3.dim("│")} ${c3.magenta("✘")} @${username}: ${c3.dim(preview)}
619584
620003
  `);
619585
620004
  }
619586
- var TELEGRAM_TOOL_ACTION_GROUPS, TELEGRAM_TOOL_ACTION_GROUP, TELEGRAM_TOOL_MUTATING_GROUPS, DEFAULT_TELEGRAM_TOOL_GROUP_POLICY, TELEGRAM_TOOL_BUTTON_LABELS, TELEGRAM_SAFETY_PROMPT, ADMIN_DM_PROMPT, ADMIN_GROUP_PROMPT, TELEGRAM_PUBLIC_SOUL_PROFILE, TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT, TELEGRAM_PUBLIC_MEMORY_SCOPE_CONTRACT, TELEGRAM_PUBLIC_VISION_STACK_CONTRACT, GROUP_REPLY_DISCRETION_PROMPT, TELEGRAM_CHAT_MODE_PROMPT, ADMIN_CHAT_PROFILE_PROMPT, TELEGRAM_ACTION_RESPONSE_CONTRACT, TELEGRAM_EXTERNAL_ACQUISITION_CONTRACT, TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT, TELEGRAM_STUCK_SELF_TALK_PREFIXES, TELEGRAM_CHAT_HISTORY_LIMIT, TELEGRAM_CONTEXT_RECENT_DEFAULT, TELEGRAM_CONTEXT_LINE_LIMIT, TELEGRAM_CONTEXT_SAMPLE_LIMIT, TELEGRAM_MEMORY_CARD_LIMIT, TELEGRAM_MEMORY_NOTE_LIMIT, TELEGRAM_ASSOCIATIVE_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_USER_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_ACTION_LIMIT, TELEGRAM_ASSOCIATIVE_RELATION_LIMIT, TELEGRAM_MEMORY_STOPWORDS, TELEGRAM_MEMORY_GENERIC_QUERY_TOKENS, TELEGRAM_SUB_AGENT_BOUNDED_OPTIONS, TELEGRAM_SUB_AGENT_DEFAULT_LIMIT, TELEGRAM_SUB_AGENT_MAX_LIMIT, TELEGRAM_SUB_AGENT_BURST_CONTEXT_LIMIT, TELEGRAM_PUBLIC_HELP_COMMANDS2, TELEGRAM_REMINDER_SLASH_COMMANDS, TELEGRAM_REFLECTION_SLASH_COMMANDS, TELEGRAM_PUBLIC_BOT_COMMAND_NAMES, TELEGRAM_IMAGE_EXTENSIONS, MEDIA_CACHE_TTL_MS, TELEGRAM_CHANNEL_DMN_SWEEP_MS, TELEGRAM_CHANNEL_DMN_IDLE_AFTER_MS, TELEGRAM_CHANNEL_DMN_MIN_INTERVAL_MS, TELEGRAM_CHANNEL_DMN_MIN_MESSAGES, TELEGRAM_ALLOWED_UPDATES, TELEGRAM_PUBLIC_TOOL_QUOTAS, TelegramBridge;
620005
+ var TELEGRAM_TOOL_ACTION_GROUPS, TELEGRAM_TOOL_ACTION_GROUP, TELEGRAM_TOOL_MUTATING_GROUPS, DEFAULT_TELEGRAM_TOOL_GROUP_POLICY, TELEGRAM_TOOL_BUTTON_LABELS, TELEGRAM_SAFETY_PROMPT, ADMIN_DM_PROMPT, ADMIN_GROUP_PROMPT, TELEGRAM_PUBLIC_SOUL_PROFILE, TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT, TELEGRAM_PUBLIC_MEMORY_SCOPE_CONTRACT, TELEGRAM_PUBLIC_VISION_STACK_CONTRACT, GROUP_REPLY_DISCRETION_PROMPT, TELEGRAM_CHAT_MODE_PROMPT, ADMIN_CHAT_PROFILE_PROMPT, TELEGRAM_ACTION_RESPONSE_CONTRACT, TELEGRAM_EXTERNAL_ACQUISITION_CONTRACT, TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT, TELEGRAM_CHAT_REPLY_RESPONSE_FORMAT, TELEGRAM_STUCK_SELF_TALK_PREFIXES, TELEGRAM_CHAT_HISTORY_LIMIT, TELEGRAM_CONTEXT_RECENT_DEFAULT, TELEGRAM_CONTEXT_LINE_LIMIT, TELEGRAM_CONTEXT_SAMPLE_LIMIT, TELEGRAM_MEMORY_CARD_LIMIT, TELEGRAM_MEMORY_NOTE_LIMIT, TELEGRAM_ASSOCIATIVE_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_USER_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_ACTION_LIMIT, TELEGRAM_ASSOCIATIVE_RELATION_LIMIT, TELEGRAM_MEMORY_STOPWORDS, TELEGRAM_MEMORY_GENERIC_QUERY_TOKENS, TELEGRAM_SUB_AGENT_BOUNDED_OPTIONS, TELEGRAM_SUB_AGENT_DEFAULT_LIMIT, TELEGRAM_SUB_AGENT_MAX_LIMIT, TELEGRAM_SUB_AGENT_BURST_CONTEXT_LIMIT, TELEGRAM_PUBLIC_HELP_COMMANDS2, TELEGRAM_REMINDER_SLASH_COMMANDS, TELEGRAM_REFLECTION_SLASH_COMMANDS, TELEGRAM_PUBLIC_BOT_COMMAND_NAMES, TELEGRAM_IMAGE_EXTENSIONS, MEDIA_CACHE_TTL_MS, TELEGRAM_CHANNEL_DMN_SWEEP_MS, TELEGRAM_CHANNEL_DMN_IDLE_AFTER_MS, TELEGRAM_CHANNEL_DMN_MIN_INTERVAL_MS, TELEGRAM_CHANNEL_DMN_MIN_MESSAGES, TELEGRAM_ALLOWED_UPDATES, TELEGRAM_PUBLIC_TOOL_QUOTAS, TelegramBridge;
619587
620006
  var init_telegram_bridge = __esm({
619588
620007
  "packages/cli/src/tui/telegram-bridge.ts"() {
619589
620008
  "use strict";
@@ -619817,6 +620236,12 @@ Rules:
619817
620236
  7. Do not claim older chat is unavailable when the context stream contains it. If asked what you see, summarize the supplied transcript, speakers, and relationship/tone signals.
619818
620237
  8. Mirror the current sender's tone and directness while staying safe and clear.
619819
620238
  9. Never send router decisions, skip explanations, memory-stage notes, task-complete summaries, or "no_reply" as chat text.
620239
+
620240
+ Output discipline (your assistant message is sent verbatim to Telegram, ALL of it):
620241
+ - Emit ONLY the final reply text. Do not narrate your reasoning, summarize what you found, organize bullet-point notes, or write phrases like "Let me summarize", "Let me send the reply", "Now I have enough", "Based on the research", "Here's my response:" before the actual reply. Those are scratch-pad phrases that leak when emitted as visible text.
620242
+ - Do not produce a draft followed by the final answer. The first character of your output should be the first character of the message the user will receive.
620243
+ - If you need to think, do it silently. Do not write your reasoning steps as visible prose. If you have an internal scratchpad, keep it internal.
620244
+ - A reply that begins by restating what you found, then says something like "Let me write the response" or "Here's the breakdown", then gives the answer, is wrong twice over: the user sees the restatement AND the answer, doubling the message. Skip the restatement.
619820
620245
  `.trim();
619821
620246
  ADMIN_CHAT_PROFILE_PROMPT = `
619822
620247
  You are replying to the authenticated Telegram admin in a private DM.
@@ -619849,6 +620274,24 @@ External acquisition contract:
619849
620274
  TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT = {
619850
620275
  type: "json_object"
619851
620276
  };
620277
+ TELEGRAM_CHAT_REPLY_RESPONSE_FORMAT = {
620278
+ type: "json_schema",
620279
+ json_schema: {
620280
+ name: "telegram_chat_reply",
620281
+ strict: true,
620282
+ schema: {
620283
+ type: "object",
620284
+ additionalProperties: false,
620285
+ required: ["reply"],
620286
+ properties: {
620287
+ reply: {
620288
+ type: "string",
620289
+ description: "The exact text to send to Telegram. No prefixes, no narration, no scratch reasoning, no bullet-point notes preceding the reply."
620290
+ }
620291
+ }
620292
+ }
620293
+ }
620294
+ };
619852
620295
  TELEGRAM_STUCK_SELF_TALK_PREFIXES = [
619853
620296
  /^i'?ve been stuck for\b/i,
619854
620297
  /^i am (still |currently )?stuck\b/i,
@@ -622075,6 +622518,14 @@ ${mediaContext}` : ""
622075
622518
  if (state.lastFollowupAt && now - state.lastFollowupAt < 60 * 6e4) {
622076
622519
  return { sent: false, reason: "rate limit held public follow-up" };
622077
622520
  }
622521
+ const cooldownEnv = Number.parseInt(process.env["OMNIUS_TG_FOLLOWUP_COOLDOWN_MS"] ?? "", 10);
622522
+ const cooldownMs = Number.isFinite(cooldownEnv) && cooldownEnv >= 6e4 ? cooldownEnv : 10 * 6e4;
622523
+ if (state.lastAssistantMessageAt && now - state.lastAssistantMessageAt < cooldownMs) {
622524
+ return {
622525
+ sent: false,
622526
+ reason: `recent assistant reply suppresses follow-up (${Math.round((now - state.lastAssistantMessageAt) / 1e3)}s ago, cooldown ${Math.round(cooldownMs / 1e3)}s)`
622527
+ };
622528
+ }
622078
622529
  const candidateMessageIds = Array.from(new Set([
622079
622530
  ...artifact.curiosityThreads.flatMap((thread) => thread.sourceMessages ?? []),
622080
622531
  ...artifact.memoryProposals.flatMap((proposal) => proposal.sourceMessages ?? []),
@@ -622405,6 +622856,10 @@ ${mediaContext}` : ""
622405
622856
  chatTitle: msg.chatTitle
622406
622857
  };
622407
622858
  this.recordChatHistory(sessionKey, entry);
622859
+ try {
622860
+ this.reflectionStateForSession(sessionKey).lastAssistantMessageAt = Date.now();
622861
+ } catch {
622862
+ }
622408
622863
  this.persistTelegramAssistantMessage(
622409
622864
  msg,
622410
622865
  clean5,
@@ -623632,32 +624087,16 @@ ${lines.join("\n")}`);
623632
624087
  sections.push(`### Participants And Relationship Signals${tierNote}
623633
624088
  ${participantLines.join("\n")}`);
623634
624089
  }
623635
- const associativeContext = this.relevantTelegramAssociativeMemoryContext(
623636
- sessionKey,
623637
- msg,
623638
- isGroup ? 14 : 8
623639
- );
623640
- if (associativeContext) {
623641
- sections.push(associativeContext);
623642
- }
623643
- const sqliteMirrorContext = this.relevantTelegramSqliteMirrorContext(
623644
- sessionKey,
623645
- msg,
623646
- isGroup ? 14 : 8
623647
- );
623648
- if (sqliteMirrorContext) {
623649
- sections.push(sqliteMirrorContext);
623650
- }
623651
- try {
623652
- const episodicContext = this.relevantTelegramEpisodicMemoryContext(
624090
+ const ASSOCIATIVE_MIN_TURNS = isGroup ? 8 : 4;
624091
+ if (retainedCount >= ASSOCIATIVE_MIN_TURNS) {
624092
+ const associativeContext = this.relevantTelegramAssociativeMemoryContext(
623653
624093
  sessionKey,
623654
624094
  msg,
623655
- isGroup ? 10 : 6
624095
+ isGroup ? 14 : 8
623656
624096
  );
623657
- if (episodicContext) {
623658
- sections.push(episodicContext);
624097
+ if (associativeContext) {
624098
+ sections.push(associativeContext);
623659
624099
  }
623660
- } catch {
623661
624100
  }
623662
624101
  const memoryCards = this.relevantTelegramMemoryCards(sessionKey, msg, isGroup ? 10 : 6);
623663
624102
  if (memoryCards.length > 0) {
@@ -623688,10 +624127,6 @@ ${notes2}`;
623688
624127
  ${cardLines.join("\n")}`);
623689
624128
  }
623690
624129
  }
623691
- const channelDaydream = this.formatLatestTelegramChannelDaydreamContext(sessionKey);
623692
- if (channelDaydream) {
623693
- sections.push(channelDaydream);
623694
- }
623695
624130
  const recentMedia = this.recentTelegramMediaEntries(msg.chatId, 10);
623696
624131
  if (recentMedia.length > 0) {
623697
624132
  const mediaLines = recentMedia.map((entry) => {
@@ -623710,26 +624145,33 @@ ${cardLines.join("\n")}`);
623710
624145
  ].join("\n"));
623711
624146
  }
623712
624147
  if (olderCount > 0) {
624148
+ const halfLifeMs = (isGroup ? 24 : 48) * 60 * 60 * 1e3;
624149
+ const now = Date.now();
623713
624150
  const older = history.slice(0, olderCount);
623714
624151
  const bySpeaker = /* @__PURE__ */ new Map();
623715
624152
  for (const entry of older) {
623716
624153
  if (!entry.text.trim()) continue;
623717
624154
  const speaker = telegramHistorySpeaker(entry);
624155
+ const ageMs = Math.max(0, now - (entry.ts ?? 0));
624156
+ const weight = Math.exp(-ageMs / halfLifeMs);
623718
624157
  const existing = bySpeaker.get(speaker);
623719
624158
  const text = truncateTelegramContextLine(entry.text, 180);
623720
624159
  if (existing) {
623721
624160
  existing.count += 1;
623722
624161
  existing.last = text;
624162
+ existing.weightSum += weight;
624163
+ existing.maxWeight = Math.max(existing.maxWeight, weight);
623723
624164
  } else {
623724
- bySpeaker.set(speaker, { count: 1, first: text, last: text });
624165
+ bySpeaker.set(speaker, { count: 1, first: text, last: text, weightSum: weight, maxWeight: weight });
623725
624166
  }
623726
624167
  }
623727
- const olderLines = [...bySpeaker.entries()].slice(0, 10).map(([speaker, info]) => {
624168
+ const olderLines = [...bySpeaker.entries()].sort(([, a2], [, b]) => b.maxWeight - a2.maxWeight).slice(0, 5).map(([speaker, info]) => {
623728
624169
  const range = info.first === info.last ? info.first : `${info.first} -> ${info.last}`;
623729
- return `- ${speaker}: ${info.count} earlier msg(s); digest=${telegramContextJsonString(range, 240)}`;
624170
+ const decayLabel = info.maxWeight >= 0.5 ? "fresh" : info.maxWeight >= 0.1 ? "decayed" : "stale";
624171
+ return `- ${speaker}: ${info.count} earlier msg(s) [${decayLabel}]; digest=${telegramContextJsonString(range, 200)}`;
623730
624172
  });
623731
624173
  if (olderLines.length > 0) {
623732
- sections.push(`### Earlier Retained Thread Digest
624174
+ sections.push(`### Earlier Retained Thread Digest (recency-weighted)
623733
624175
  ${olderLines.join("\n")}`);
623734
624176
  }
623735
624177
  }
@@ -623949,7 +624391,8 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
623949
624391
  ],
623950
624392
  tools: [],
623951
624393
  temperature: 0,
623952
- maxTokens: 650,
624394
+ // Reflection has 12 string fields; 650 was tight enough to truncate.
624395
+ maxTokens: 1500,
623953
624396
  timeoutMs: telegramRouterTimeoutMs(timeoutMs),
623954
624397
  think: false
623955
624398
  },
@@ -624039,9 +624482,11 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
624039
624482
  const promptTokens = estimatePromptTokensFromRequest(request);
624040
624483
  const broker = getModelBroker();
624041
624484
  const trainCtx = await broker.getNctxTrain(model).catch(() => null);
624042
- const targetCtx = trainCtx && trainCtx > 0 ? Math.min(trainCtx, Math.max(2048, promptTokens + 1024)) : Math.max(2048, promptTokens + 1024);
624485
+ const completionHeadroom = 4096;
624486
+ const targetCtx = trainCtx && trainCtx > 0 ? Math.min(trainCtx, Math.max(2048, promptTokens + completionHeadroom)) : Math.max(2048, promptTokens + completionHeadroom);
624043
624487
  const requestWithCtx = { ...request, numCtx: targetCtx };
624044
- const slot = await broker.acquireInferenceSlot({
624488
+ const brokerBypass = process.env["OMNIUS_DISABLE_BROKER_ADMISSION"] === "1";
624489
+ const slot = brokerBypass ? null : await broker.acquireInferenceSlot({
624045
624490
  model,
624046
624491
  domain: "chat",
624047
624492
  owner: `telegram-bridge/${kind}`,
@@ -624049,10 +624494,12 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
624049
624494
  promptTokens,
624050
624495
  priority: kind === "router" || kind === "router-repair" || kind === "router-strict-retry" ? 1 : 0
624051
624496
  });
624052
- this.tuiWrite(() => renderTelegramSubAgentEvent(
624053
- sessionKey,
624054
- `inference admitted [${kind}] model=${model} prompt~${promptTokens}t num_ctx=${targetCtx} slot=${slot.info.id}${slot.info.reserved ? " reserved" : ""}`
624055
- ));
624497
+ if (process.env["OMNIUS_BROKER_TRACE"] === "1") {
624498
+ this.tuiWrite(() => renderTelegramSubAgentEvent(
624499
+ sessionKey,
624500
+ `inference admitted [${kind}] model=${model} prompt~${promptTokens}t num_ctx=${targetCtx} slot=${slot ? slot.info.id : "bypass"}${slot?.info.reserved ? " reserved" : ""}`
624501
+ ));
624502
+ }
624056
624503
  const streamFn = backend.chatCompletionStream;
624057
624504
  const id = this.registerTelegramInference(kind, sessionKey, model);
624058
624505
  let completionTokens = 0;
@@ -624079,10 +624526,10 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
624079
624526
  }
624080
624527
  const usage = result.usage;
624081
624528
  completionTokens = usage?.completion_tokens ?? 0;
624082
- slot.release({ ok: true, completionTokens });
624529
+ slot?.release({ ok: true, completionTokens });
624083
624530
  return result;
624084
624531
  } catch (err) {
624085
- slot.release({ ok: false, error: err instanceof Error ? err.message : String(err) });
624532
+ slot?.release({ ok: false, error: err instanceof Error ? err.message : String(err) });
624086
624533
  throw err;
624087
624534
  } finally {
624088
624535
  this.deregisterTelegramInference(id);
@@ -624274,7 +624721,7 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
624274
624721
  getTelegramThinkingVisible() {
624275
624722
  return this.telegramThinkingVisible;
624276
624723
  }
624277
- async repairTelegramInteractionDecision(backend, rawOutput, forcedRoute, timeoutMs, diagnostics) {
624724
+ async repairTelegramInteractionDecision(backend, rawOutput, forcedRoute, timeoutMs, diagnostics, sessionKey = "__router__") {
624278
624725
  const rawPreview = telegramRouterRawPreview(rawOutput, 4e3);
624279
624726
  if (!rawPreview || telegramDecisionOutputHasDanglingJson(rawOutput)) {
624280
624727
  if (diagnostics) {
@@ -624309,10 +624756,10 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
624309
624756
  ],
624310
624757
  tools: [],
624311
624758
  temperature: 0,
624312
- maxTokens: 500,
624759
+ maxTokens: 1500,
624313
624760
  timeoutMs: telegramRouterTimeoutMs(timeoutMs, 8e3, 2e4),
624314
624761
  think: false
624315
- });
624762
+ }, diagnostics, "router-repair", sessionKey);
624316
624763
  const repairedText = result.choices[0]?.message?.content ?? "";
624317
624764
  if (telegramDecisionRecoverableFlag(repairedText) === false) {
624318
624765
  if (diagnostics) diagnostics.repairStatus = "no-recoverable-output";
@@ -624344,7 +624791,7 @@ ${repairedText}`,
624344
624791
  return null;
624345
624792
  }
624346
624793
  }
624347
- async retryTelegramInteractionDecisionStrict(backend, userPrompt, rawOutput, forcedRoute, timeoutMs, diagnostics) {
624794
+ async retryTelegramInteractionDecisionStrict(backend, userPrompt, rawOutput, forcedRoute, timeoutMs, diagnostics, sessionKey = "__router__") {
624348
624795
  const invalidPreview = telegramRouterRawPreview(rawOutput, 1200) ?? "(empty assistant content)";
624349
624796
  const routeInstruction = forcedRoute ? `The operator selected Telegram mode "${forcedRoute}". The route field must be "${forcedRoute}", but should_reply must still be inferred from context.` : `Infer route live from context.`;
624350
624797
  const trimmedUserPrompt = userPrompt.length > 4e3 ? `…
@@ -624376,10 +624823,10 @@ ${userPrompt.slice(-4e3)}` : userPrompt;
624376
624823
  ],
624377
624824
  tools: [],
624378
624825
  temperature: 0,
624379
- maxTokens: 1200,
624826
+ maxTokens: 2400,
624380
624827
  timeoutMs: telegramRouterTimeoutMs(timeoutMs, 1e4, 3e4),
624381
624828
  think: false
624382
- });
624829
+ }, diagnostics, "router-strict-retry", sessionKey);
624383
624830
  const retryText = result.choices[0]?.message?.content ?? "";
624384
624831
  if (diagnostics) diagnostics.strictRetryPreview = telegramRouterRawPreview(retryText, 320);
624385
624832
  const parsed = parseTelegramInteractionDecision(retryText, forcedRoute, {
@@ -624762,10 +625209,14 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`
624762
625209
  ],
624763
625210
  tools: [],
624764
625211
  temperature: 0,
624765
- maxTokens: 1e3,
625212
+ // Router JSON schema has ~18 string-valued fields when reflection is
625213
+ // embedded (consolidated mode). 1000 tokens was the documented cause
625214
+ // of truncated JSON → repair → strict-retry cascade. 2400 is enough
625215
+ // for normal verbose values without slowing the call appreciably.
625216
+ maxTokens: 2400,
624766
625217
  timeoutMs: telegramRouterTimeoutMs(config.timeoutMs),
624767
625218
  think: false
624768
- }, diagnostics);
625219
+ }, diagnostics, "router", sessionKey);
624769
625220
  const text = result.choices[0]?.message?.content ?? "";
624770
625221
  const routerLatencyMs = Date.now() - routerStartMs;
624771
625222
  try {
@@ -624788,12 +625239,40 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`
624788
625239
  if (parsed) {
624789
625240
  return this.applyTelegramSilentReflectionNotes(parsed, reflectionNotes);
624790
625241
  }
625242
+ if (isLikelyTruncatedRouterJson(text)) {
625243
+ if (diagnostics) diagnostics.repairStatus = "skipped-truncation-rerun";
625244
+ try {
625245
+ const reissued = await this.telegramRouterJsonCompletion(backend, {
625246
+ messages: [
625247
+ {
625248
+ role: "system",
625249
+ content: "You perform live Telegram route and stimulation inference. Output strict JSON only."
625250
+ },
625251
+ { role: "user", content: userPrompt }
625252
+ ],
625253
+ tools: [],
625254
+ temperature: 0,
625255
+ maxTokens: 4096,
625256
+ timeoutMs: telegramRouterTimeoutMs(config.timeoutMs),
625257
+ think: false
625258
+ }, diagnostics, "router", sessionKey);
625259
+ const reissuedText = reissued.choices[0]?.message?.content ?? "";
625260
+ const reparsed = parseTelegramInteractionDecision(reissuedText, forcedRoute, {
625261
+ defaultShouldReply: false
625262
+ });
625263
+ if (reparsed) {
625264
+ return this.applyTelegramSilentReflectionNotes(reparsed, reflectionNotes);
625265
+ }
625266
+ } catch {
625267
+ }
625268
+ }
624791
625269
  const repaired = await this.repairTelegramInteractionDecision(
624792
625270
  backend,
624793
625271
  text,
624794
625272
  forcedRoute,
624795
625273
  config.timeoutMs ?? 3e4,
624796
- diagnostics
625274
+ diagnostics,
625275
+ sessionKey
624797
625276
  );
624798
625277
  if (repaired) {
624799
625278
  return this.applyTelegramSilentReflectionNotes(repaired, reflectionNotes);
@@ -624804,7 +625283,8 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`
624804
625283
  text,
624805
625284
  forcedRoute,
624806
625285
  config.timeoutMs ?? 3e4,
624807
- diagnostics
625286
+ diagnostics,
625287
+ sessionKey
624808
625288
  );
624809
625289
  if (strictRetry) {
624810
625290
  return this.applyTelegramSilentReflectionNotes(strictRetry, reflectionNotes);
@@ -625050,34 +625530,25 @@ ${list}` : "No shared group target is currently known for this sender. Ask in th
625050
625530
  return join131(this.repoRoot, ".omnius", "telegram-runner-state", safe);
625051
625531
  }
625052
625532
  buildTelegramAdminOverviewContext(currentSessionKey) {
625053
- const sections = [];
625054
625533
  this.ensureAllTelegramConversationsLoaded();
625055
625534
  const chatEntries = [...this.chatHistory.entries()].filter(([sessionKey, history]) => sessionKey !== currentSessionKey && history.length > 0).sort(([, a2], [, b]) => (b[b.length - 1]?.ts ?? 0) - (a2[a2.length - 1]?.ts ?? 0)).slice(0, 18);
625535
+ if (chatEntries.length === 0) return "";
625536
+ const indexLines = [];
625056
625537
  for (const [sessionKey, history] of chatEntries) {
625057
625538
  const latest = history[history.length - 1];
625058
- const participants = [...this.chatParticipants.get(sessionKey)?.values() ?? []].sort((a2, b) => b.lastSeenTs - a2.lastSeenTs).slice(0, 8).map((profile) => {
625059
- const label = profile.username && profile.username !== "unknown" ? `@${profile.username}` : profile.firstName || `user:${profile.fromUserId}`;
625060
- return `${label} (${profile.messageCount} msg)`;
625061
- }).join(", ");
625062
- const recent = history.slice(-5).map(
625063
- (entry) => ` - ${telegramHistorySpeaker(entry)}: ${truncateTelegramContextLine(entry.text, 180)}`
625064
- ).join("\n");
625065
- const cards = (this.chatMemoryCards.get(sessionKey) ?? []).slice(0, 4).map((card) => ` - ${card.title}: ${card.notes.slice(-1)[0] ?? ""}`).join("\n");
625066
- sections.push([
625067
- `- ${sessionKey} (chat_id ${String(latest.chatId ?? "unknown")}; ${latest.chatType || "chat"}${latest.chatTitle ? `: ${latest.chatTitle}` : ""})`,
625068
- participants ? ` Participants: ${participants}` : "",
625069
- ` Latest: ${telegramHistorySpeaker(latest)}: ${truncateTelegramContextLine(latest.text, 180)}`,
625070
- recent ? ` Recent:
625071
- ${recent}` : "",
625072
- cards ? ` Memory cards:
625073
- ${cards}` : ""
625074
- ].filter(Boolean).join("\n"));
625075
- }
625076
- if (sections.length === 0) return "";
625539
+ const participantCount = this.chatParticipants.get(sessionKey)?.size ?? 0;
625540
+ const ageMs = Date.now() - (latest.ts ?? 0);
625541
+ const ageMin = Math.round(ageMs / 6e4);
625542
+ const ageStr = ageMin < 60 ? `${ageMin}m ago` : ageMin < 24 * 60 ? `${Math.round(ageMin / 60)}h ago` : `${Math.round(ageMin / (24 * 60))}d ago`;
625543
+ const label = latest.chatTitle ? `"${latest.chatTitle}"` : sessionKey;
625544
+ indexLines.push(`- ${label} (chat_id ${String(latest.chatId ?? "?")}; ${latest.chatType || "chat"}): ${participantCount} participants; last ${ageStr}; ${history.length} retained msgs`);
625545
+ }
625077
625546
  return [
625078
- "## Admin Telegram Omniscience",
625079
- "This section is one-way context for the authenticated admin private DM only. It summarizes public/group and other Telegram sessions the bot has observed. Never inject admin/private DM content into public groups.",
625080
- sections.join("\n")
625547
+ "## Admin Telegram Omniscience (index only)",
625548
+ "One-way context for the authenticated admin private DM. Other Telegram sessions the bot has observed are listed below with one line each.",
625549
+ "For details on a specific chat, use telegram_memory_search with the chat_id or topic — the always-loaded view is intentionally compact.",
625550
+ "Never inject admin/private DM content into public groups.",
625551
+ indexLines.join("\n")
625081
625552
  ].join("\n\n");
625082
625553
  }
625083
625554
  buildTelegramSessionContext(msg, toolContext, profile, modelTier) {
@@ -626197,8 +626668,9 @@ ${conversationStream}`
626197
626668
  messages: this.buildTelegramChatMessages(msg, toolContext, mediaContext),
626198
626669
  tools: [],
626199
626670
  temperature: 0.4,
626200
- maxTokens: 700,
626201
- timeoutMs: Math.max(config.timeoutMs ?? 3e5, 12e4)
626671
+ maxTokens: 1500,
626672
+ timeoutMs: Math.max(config.timeoutMs ?? 3e5, 12e4),
626673
+ responseFormat: TELEGRAM_CHAT_REPLY_RESPONSE_FORMAT
626202
626674
  });
626203
626675
  let accumulated = "";
626204
626676
  let streamError;
@@ -626225,7 +626697,8 @@ ${conversationStream}`
626225
626697
  } else {
626226
626698
  this.bumpTelegramInferenceTokens(inferenceId, 1, 0);
626227
626699
  accumulated += piece;
626228
- await onToken(accumulated);
626700
+ const partial = extractPartialTelegramReplyJson(accumulated);
626701
+ if (partial !== null) await onToken(partial);
626229
626702
  }
626230
626703
  }
626231
626704
  } catch (err) {
@@ -626247,11 +626720,14 @@ ${conversationStream}`
626247
626720
  }
626248
626721
  this.updateTelegramInferenceFinal(inferenceId, result);
626249
626722
  accumulated = result.choices[0]?.message?.content ?? "";
626250
- if (accumulated) await onToken(accumulated);
626723
+ const fullExtracted = extractPartialTelegramReplyJson(accumulated);
626724
+ if (fullExtracted) await onToken(fullExtracted);
626251
626725
  }
626252
626726
  } finally {
626253
626727
  this.deregisterTelegramInference(inferenceId);
626254
626728
  }
626729
+ const extracted = extractFinalTelegramReplyJson(accumulated);
626730
+ if (extracted) return extracted;
626255
626731
  return stripTelegramHiddenThinking(accumulated).trim();
626256
626732
  }
626257
626733
  retainTelegramVisibleReplyDraft(subAgent, draft, streamToolNames = subAgent.currentStreamToolNames) {