npm - omnius - Versions diffs - 1.0.135 → 1.0.137 - Mend

omnius 1.0.135 → 1.0.137

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.js — CHANGED

@@ -1337,29 +1337,36 @@ function ramSnapshotMB() {
   const free = Math.round(freemem() / (1024 * 1024));
   return { total, free, used: total - free };
 }
-async function vramSnapshotMB() {
+async function vramSnapshotPerDevice() {
   if (_nvSmiAvailable === false)
-    return null;
+    return [];
   try {
     const out = await new Promise((resolve55, reject) => {
-      exec("nvidia-smi --query-gpu=memory.total,memory.used,memory.free --format=csv,noheader,nounits 2>/dev/null", { encoding: "utf8", timeout: 3e3 }, (err, stdout) => err ? reject(err) : resolve55(stdout));
+      exec("nvidia-smi --query-gpu=index,uuid,memory.total,memory.used,memory.free --format=csv,noheader,nounits 2>/dev/null", { encoding: "utf8", timeout: 3e3 }, (err, stdout) => err ? reject(err) : resolve55(stdout));
     });
     _nvSmiAvailable = true;
-    let total = 0, used = 0, free = 0;
+    const devices = [];
     for (const line of out.trim().split("\n")) {
+      if (!line.trim())
+        continue;
       const parts = line.split(",").map((s2) => s2.trim());
-      if (parts.length < 3)
+      if (parts.length < 5)
+        continue;
+      const index = parseInt(parts[0] ?? "-1", 10);
+      if (!Number.isFinite(index) || index < 0)
         continue;
-      total += parseInt(parts[0] ?? "0", 10) || 0;
-      used += parseInt(parts[1] ?? "0", 10) || 0;
-      free += parseInt(parts[2] ?? "0", 10) || 0;
+      devices.push({
+        index,
+        uuid: parts[1] ?? "",
+        total: parseInt(parts[2] ?? "0", 10) || 0,
+        used: parseInt(parts[3] ?? "0", 10) || 0,
+        free: parseInt(parts[4] ?? "0", 10) || 0
+      });
     }
-    if (total <= 0)
-      return null;
-    return { total, used, free };
+    return devices;
   } catch {
     _nvSmiAvailable = false;
-    return null;
+    return [];
   }
 }
 function getModelBroker() {
@@ -1374,7 +1381,7 @@ var init_model_broker = __esm({
     DEFAULT_IDLE_EVICT_MS = 5 * 60 * 1e3;
     DEFAULT_POLL_MS = 4e3;
     DEFAULT_INFLIGHT_WAIT_MS = 6e4;
-    DEFAULT_SLOT_CAPACITY = 4;
+    DEFAULT_SLOT_CAPACITY = 8;
     DEFAULT_QUEUE_CAPACITY = 50;
     THROUGHPUT_EMA_ALPHA = 0.2;
     THROUGHPUT_INITIAL_TPS = 25;
@@ -1403,7 +1410,9 @@ var init_model_broker = __esm({
       ramHeadroomMB = DEFAULT_RAM_HEADROOM_MB;
       vramHeadroomMB = DEFAULT_VRAM_HEADROOM_MB;
       idleEvictMs = DEFAULT_IDLE_EVICT_MS;
-      /** Inference slot capacity (auto-tunes from Ollama pool size when known). */
+      /** Inference slot capacity (shared pool aggregate; auto-tunes from Ollama
+       *  pool size when known). Per-device cap defaults to ceil(slotCapacity/N)
+       *  unless overridden via setPerGpuSlotCapacity. */
       slotCapacity = DEFAULT_SLOT_CAPACITY;
       /** Maximum queue depth before queue pressure is emitted. */
       queueCapacity = DEFAULT_QUEUE_CAPACITY;
@@ -1419,6 +1428,15 @@ var init_model_broker = __esm({
       _throughput = /* @__PURE__ */ new Map();
       /** Monotonic counter for slot ids. */
       _slotIdSeq = 0;
+      /** Per-GPU slot capacity override. When unset, broker derives a per-GPU
+       *  cap from slotCapacity / detected device count. */
+      _perGpuSlotCapacity = /* @__PURE__ */ new Map();
+      /** Cached per-device VRAM (refreshed by pollOnce). */
+      _vramByDevice = [];
+      /** Optional provider that maps an Ollama model name to its current GPU.
+       *  Wired by the CLI/orchestrator at startup so the broker can copy pool
+       *  affinity onto LoadedModel records without importing the pool directly. */
+      _ollamaAffinityProvider = null;
       static getInstance() {
         if (!_ModelBroker._instance)
           _ModelBroker._instance = new _ModelBroker();
@@ -1471,6 +1489,18 @@ var init_model_broker = __esm({
       setOllamaBaseUrl(url) {
         this._ollamaBaseUrl = url;
       }
+      /**
+       * Wire a function that resolves an Ollama model name to its current GPU
+       * affinity (from the Ollama pool's per-instance state). The CLI calls
+       * this at startup with a closure over `getOllamaPool().status()` so the
+       * broker can copy gpuIndex/gpuUuid onto LoadedModel records without
+       * importing from @omnius/orchestrator (which would create a circular dep).
+       *
+       * Pass null to clear.
+       */
+      setOllamaAffinityProvider(provider) {
+        this._ollamaAffinityProvider = provider;
+      }
       /** One poll cycle — refreshes /api/ps and emits snapshot. */
       async pollOnce() {
         await Promise.all([
@@ -1539,30 +1569,44 @@ var init_model_broker = __esm({
         const estVram = spec.estimatedVramMB ?? this.estimateFootprintVramMB(spec);
         const estRam = spec.estimatedRamMB ?? this.estimateFootprintRamMB(spec);
         const ram = ramSnapshotMB();
-        const vram = await vramSnapshotMB();
         const ramFitsAfter = ram.free - estRam >= this.ramHeadroomMB;
-        const vramFitsAfter = vram ? vram.free - estVram >= this.vramHeadroomMB : true;
+        const devices = await vramSnapshotPerDevice();
+        this._vramByDevice = devices;
+        let chosenGpu = null;
+        let vramFitsAfter = devices.length === 0;
+        if (devices.length > 0) {
+          const candidates = devices.filter((d2) => spec.preferredGpuIndex === void 0 || d2.index === spec.preferredGpuIndex).filter((d2) => d2.free - estVram >= this.vramHeadroomMB).sort((a2, b) => b.free - a2.free);
+          if (candidates.length > 0) {
+            chosenGpu = candidates[0].index;
+            vramFitsAfter = true;
+          }
+        }
         if (ramFitsAfter && vramFitsAfter) {
-          const promise = Promise.resolve({ kind: "ok", effectiveNumCtx });
+          const decision2 = { kind: "ok", effectiveNumCtx, gpuIndex: chosenGpu };
+          const promise = Promise.resolve(decision2);
           this._inflight.set(key, { startedMs: Date.now(), owner: spec.owner, promise });
           setTimeout(() => this._inflight.delete(key), spec.loadTimeoutMs ?? DEFAULT_INFLIGHT_WAIT_MS).unref?.();
-          return { kind: "ok", effectiveNumCtx };
+          return decision2;
         }
+        const targetGpu = chosenGpu ?? this.deviceWithMostPressureRelativeTo(devices, estVram);
+        const needVramMB = vramFitsAfter ? 0 : targetGpu !== null ? estVram + this.vramHeadroomMB - (devices.find((d2) => d2.index === targetGpu)?.free ?? 0) : estVram + this.vramHeadroomMB;
         const evictTargets = this.pickEvictionCandidates({
-          needVramMB: vramFitsAfter ? 0 : estVram + this.vramHeadroomMB - (vram?.free ?? 0),
+          needVramMB,
           needRamMB: ramFitsAfter ? 0 : estRam + this.ramHeadroomMB - ram.free,
           requestingPriority: spec.priority ?? 0,
-          requestingDomain: spec.domain
+          requestingDomain: spec.domain,
+          targetGpu
         });
         if (evictTargets.length > 0) {
-          return { kind: "evict", evictTargets, effectiveNumCtx };
+          return { kind: "evict", evictTargets, effectiveNumCtx, gpuIndex: targetGpu };
         }
         const fallback = await this.findRunnableFallback(spec);
         if (fallback) {
           this.emit("degraded", spec, fallback, "insufficient-memory-no-evictable");
           return { kind: "degrade", fallback, reason: "insufficient-memory-no-evictable" };
         }
-        const reason = `insufficient resources (need ~${estRam}MB RAM, ~${estVram}MB VRAM; free ${ram.free}MB RAM, ${vram ? vram.free : "?"}MB VRAM) and no evictable / fallback models`;
+        const perDeviceSummary = devices.length === 0 ? "no GPU" : devices.map((d2) => `gpu${d2.index}=${d2.free}MB`).join(", ");
+        const reason = `insufficient resources (need ~${estRam}MB RAM, ~${estVram}MB VRAM; free ${ram.free}MB RAM; VRAM ${perDeviceSummary}) and no evictable / fallback models`;
         this.emit("rejected", spec, reason);
         return { kind: "reject", reason };
       }
@@ -1638,10 +1682,22 @@ var init_model_broker = __esm({
             seen.add(key);
             const vramMB = Math.round((m2.size_vram ?? 0) / (1024 * 1024));
             const ramMB = Math.round(((m2.size ?? 0) - (m2.size_vram ?? 0)) / (1024 * 1024));
+            let affinity = null;
+            try {
+              affinity = this._ollamaAffinityProvider ? this._ollamaAffinityProvider(m2.name) : null;
+            } catch {
+              affinity = null;
+            }
             const existing = this._loaded.get(key);
             if (existing) {
               existing.vramMB = vramMB || existing.vramMB;
               existing.ramMB = ramMB || existing.ramMB;
+              if (affinity) {
+                if (affinity.gpuIndex !== null)
+                  existing.gpuIndex = affinity.gpuIndex;
+                if (affinity.gpuUuid !== null)
+                  existing.gpuUuid = affinity.gpuUuid;
+              }
             } else {
               const tracked = this.registerLoaded({
                 key,
@@ -1653,7 +1709,9 @@ var init_model_broker = __esm({
                 ramMB,
                 priority: 0,
                 loadedAt: now,
-                lastUsedAt: now
+                lastUsedAt: now,
+                gpuIndex: affinity?.gpuIndex ?? null,
+                gpuUuid: affinity?.gpuUuid ?? null
               });
               void tracked;
             }
@@ -1746,7 +1804,8 @@ var init_model_broker = __esm({
           m2.domain !== req2.requestingDomain || this.countByDomain(req2.requestingDomain) > 1
         );
         const idle = (m2) => now - m2.lastUsedAt > this.idleEvictMs;
-        const evictable = [...this._loaded.values()].filter((m2) => m2.priority <= req2.requestingPriority).filter(sameDomainOk).sort((a2, b) => {
+        const onTargetGpu = (m2) => req2.targetGpu === void 0 || req2.targetGpu === null ? true : m2.gpuIndex === req2.targetGpu;
+        const evictable = [...this._loaded.values()].filter((m2) => m2.priority <= req2.requestingPriority).filter(sameDomainOk).filter(onTargetGpu).sort((a2, b) => {
           const aIdle = idle(a2) ? 0 : 1;
           const bIdle = idle(b) ? 0 : 1;
           if (aIdle !== bIdle)
@@ -1767,6 +1826,24 @@ var init_model_broker = __esm({
           return targets;
         return [];
       }
+      /** Pick the GPU whose free-VRAM gap to the requested footprint is smallest
+       *  (i.e. closest to fitting). Used when no device cleanly fits — eviction
+       *  on this device has the best chance of opening room. Returns null when
+       *  no GPUs are detected. */
+      deviceWithMostPressureRelativeTo(devices, needMB) {
+        if (devices.length === 0)
+          return null;
+        let best = null;
+        let bestGap = Infinity;
+        for (const d2 of devices) {
+          const gap = needMB - d2.free;
+          if (gap < bestGap) {
+            bestGap = gap;
+            best = d2;
+          }
+        }
+        return best?.index ?? null;
+      }
       countByDomain(domain) {
         let n2 = 0;
         for (const m2 of this._loaded.values())
@@ -1897,17 +1974,31 @@ var init_model_broker = __esm({
           inflight: [...this._inflight.entries()].map(([key, v]) => ({ key, owner: v.owner, startedMs: v.startedMs })),
           ramMB: ram,
           vramMB: vram,
+          vramPerDevice: [...this._vramByDevice],
           lastPollAt: Date.now(),
           slots: this.buildSlotsSnapshot()
         };
       }
       buildSlotsSnapshot() {
         const byModel = {};
+        const byGpu = {};
         for (const slot of this._activeSlots.values()) {
           const k = slot.model;
           if (!byModel[k])
             byModel[k] = { inUse: 0, tokensPerSec: 0, samples: 0 };
           byModel[k].inUse += 1;
+          if (slot.gpuIndex !== null && slot.gpuIndex !== void 0) {
+            if (!byGpu[slot.gpuIndex])
+              byGpu[slot.gpuIndex] = { inUse: 0, capacity: this.perGpuSlotCapacity(slot.gpuIndex), loadedMB: 0 };
+            byGpu[slot.gpuIndex].inUse += 1;
+          }
+        }
+        for (const m2 of this._loaded.values()) {
+          if (m2.gpuIndex !== null && m2.gpuIndex !== void 0) {
+            if (!byGpu[m2.gpuIndex])
+              byGpu[m2.gpuIndex] = { inUse: 0, capacity: this.perGpuSlotCapacity(m2.gpuIndex), loadedMB: 0 };
+            byGpu[m2.gpuIndex].loadedMB += m2.vramMB;
+          }
         }
         for (const [model, tp] of this._throughput) {
           if (!byModel[model])
@@ -1915,23 +2006,46 @@ var init_model_broker = __esm({
           byModel[model].tokensPerSec = tp.tokensPerSec;
           byModel[model].samples = tp.samples;
         }
+        for (const d2 of this._vramByDevice) {
+          if (!byGpu[d2.index])
+            byGpu[d2.index] = { inUse: 0, capacity: this.perGpuSlotCapacity(d2.index), loadedMB: 0 };
+        }
         return {
           inUse: this._activeSlots.size,
           capacity: this.slotCapacity,
           queueDepth: this._slotQueue.length,
           queueCapacity: this.queueCapacity,
-          byModel
+          byModel,
+          byGpu
         };
       }
+      /** Per-GPU slot capacity. Returns the override when set, else ceil(slotCapacity / deviceCount). */
+      perGpuSlotCapacity(gpuIndex) {
+        const override = this._perGpuSlotCapacity.get(gpuIndex);
+        if (override !== void 0)
+          return override;
+        const n2 = Math.max(1, this._vramByDevice.length);
+        return Math.max(1, Math.ceil(this.slotCapacity / n2));
+      }
       async checkPressure(snap) {
         if (snap.ramMB.free < this.ramHeadroomMB) {
           this.emit("pressure", "ram", snap.ramMB.free, this.ramHeadroomMB);
         }
-        const v = await vramSnapshotMB();
-        if (v) {
-          snap.vramMB = v;
-          if (v.free < this.vramHeadroomMB) {
-            this.emit("pressure", "vram", v.free, this.vramHeadroomMB);
+        const devices = await vramSnapshotPerDevice();
+        this._vramByDevice = devices;
+        if (devices.length > 0) {
+          let total = 0, used = 0, free = 0;
+          for (const d2 of devices) {
+            total += d2.total;
+            used += d2.used;
+            free += d2.free;
+          }
+          snap.vramMB = { total, used, free };
+          snap.vramPerDevice = devices;
+          for (const d2 of devices) {
+            if (d2.free < this.vramHeadroomMB) {
+              this.emit("pressure", "vram", d2.free, this.vramHeadroomMB);
+            }
           }
         }
         const queueThreshold = Math.floor(this.queueCapacity * 0.8);
@@ -1960,23 +2074,54 @@ var init_model_broker = __esm({
        * upstream callers (e.g. Telegram poll loop) should slow ingress.
        */
       acquireInferenceSlot(spec) {
-        if (this._activeSlots.size < this.slotCapacity) {
+        const chosenGpu = this.pickGpuForSlot(spec);
+        const gpuOk = chosenGpu === null || this.activeSlotsOnGpu(chosenGpu) < this.perGpuSlotCapacity(chosenGpu);
+        if (gpuOk && this._activeSlots.size < this.slotCapacity) {
           return Promise.resolve(this.admitSlot(
             spec,
             /*reserved*/
-            false
+            false,
+            chosenGpu
           ));
         }
         if (spec.sessionKey && !this._reservedBySession.has(spec.sessionKey) && this._activeSlots.size < this.slotCapacity + 1) {
           const slot = this.admitSlot(
             spec,
             /*reserved*/
-            true
+            true,
+            chosenGpu
           );
           this._reservedBySession.set(spec.sessionKey, slot.info.id);
           return Promise.resolve(slot);
         }
         return new Promise((resolve55, reject) => {
+          if (this._slotQueue.length >= this.queueCapacity) {
+            const newPrio = spec.priority ?? 0;
+            let victim = -1;
+            let victimPrio = Infinity;
+            for (let i2 = this._slotQueue.length - 1; i2 >= 0; i2--) {
+              const p2 = this._slotQueue[i2].spec.priority ?? 0;
+              if (p2 < victimPrio) {
+                victimPrio = p2;
+                victim = i2;
+              }
+              if (victimPrio === 0)
+                break;
+            }
+            if (victim >= 0 && victimPrio < newPrio) {
+              const dropped = this._slotQueue.splice(victim, 1)[0];
+              if (dropped.onSignalAbort && dropped.spec.signal) {
+                dropped.spec.signal.removeEventListener("abort", dropped.onSignalAbort);
+              }
+              try {
+                dropped.reject(new Error("broker queue shed: capacity reached, lower-priority entry displaced"));
+              } catch {
+              }
+            } else {
+              reject(new Error(`broker queue full (capacity=${this.queueCapacity}); caller priority ${newPrio} insufficient to displace`));
+              return;
+            }
+          }
           const entry = { spec, resolve: resolve55, reject, enqueuedAt: Date.now() };
           if (spec.signal) {
             const onAbort = () => {
@@ -2011,7 +2156,7 @@ var init_model_broker = __esm({
         });
       }
       /** Admit a slot — internal, called from acquire fast path and from drainQueue. */
-      admitSlot(spec, reserved) {
+      admitSlot(spec, reserved, gpuIndex = null) {
         const id = `slot-${++this._slotIdSeq}-${Date.now().toString(36)}`;
         const info = {
           id,
@@ -2021,7 +2166,8 @@ var init_model_broker = __esm({
           sessionKey: spec.sessionKey,
           acquiredAt: Date.now(),
           promptTokens: spec.promptTokens ?? 0,
-          reserved
+          reserved,
+          gpuIndex
         };
         this._activeSlots.set(id, info);
         this.emit("slotAcquired", info);
@@ -2037,6 +2183,35 @@ var init_model_broker = __esm({
           }
         };
       }
+      /** Count of active slots pinned to a given GPU. */
+      activeSlotsOnGpu(gpuIndex) {
+        let n2 = 0;
+        for (const s2 of this._activeSlots.values()) {
+          if (s2.gpuIndex === gpuIndex)
+            n2++;
+        }
+        return n2;
+      }
+      /**
+       * Pick a GPU for a new inference slot. Honors caller's preferredGpuIndex
+       * when set; otherwise picks the GPU with the highest free VRAM that has
+       * room for the estimated footprint and an open per-device slot.
+       *
+       * Returns null when no GPU is detected (CPU-only) or no device fits — in
+       * the latter case the slot is admitted unpinned and the underlying
+       * subprocess will pick whatever CUDA exposes by default.
+       */
+      pickGpuForSlot(spec) {
+        if (this._vramByDevice.length === 0)
+          return null;
+        const candidates = this._vramByDevice.filter((d2) => spec.preferredGpuIndex === void 0 || d2.index === spec.preferredGpuIndex).filter((d2) => this.activeSlotsOnGpu(d2.index) < this.perGpuSlotCapacity(d2.index)).filter((d2) => spec.estimatedVramMB === void 0 || d2.free >= spec.estimatedVramMB).sort((a2, b) => b.free - a2.free);
+        return candidates[0]?.index ?? null;
+      }
+      /** Configure per-GPU slot capacity. Overrides the slotCapacity-derived default. */
+      setPerGpuSlotCapacity(gpuIndex, capacity) {
+        this._perGpuSlotCapacity.set(gpuIndex, Math.max(1, Math.floor(capacity)));
+        this.drainSlotQueue();
+      }
       releaseSlot(info, outcome) {
         this._activeSlots.delete(info.id);
         if (info.sessionKey && this._reservedBySession.get(info.sessionKey) === info.id) {
@@ -2062,8 +2237,18 @@ var init_model_broker = __esm({
         this.drainSlotQueue();
       }
       drainSlotQueue() {
-        while (this._slotQueue.length > 0 && this._activeSlots.size < this.slotCapacity) {
-          const entry = this._slotQueue.shift();
+        const queueCopy = [...this._slotQueue];
+        for (const entry of queueCopy) {
+          if (this._activeSlots.size >= this.slotCapacity)
+            break;
+          const chosenGpu = this.pickGpuForSlot(entry.spec);
+          const gpuOk = chosenGpu === null || this.activeSlotsOnGpu(chosenGpu) < this.perGpuSlotCapacity(chosenGpu);
+          if (!gpuOk)
+            continue;
+          const idx = this._slotQueue.indexOf(entry);
+          if (idx < 0)
+            continue;
+          this._slotQueue.splice(idx, 1);
           if (entry.onSignalAbort && entry.spec.signal) {
             entry.spec.signal.removeEventListener("abort", entry.onSignalAbort);
           }
@@ -2077,7 +2262,8 @@ var init_model_broker = __esm({
           const slot = this.admitSlot(
             entry.spec,
             /*reserved*/
-            false
+            false,
+            chosenGpu
           );
           try {
             entry.resolve(slot);
@@ -19581,26 +19767,16 @@ function extractSkillForQuery(skill, content, query, budgetTokens = 900) {
 function buildSkillsSummary(skills) {
   if (skills.length === 0)
     return "";
-  const lines = [
-    "## Skills Index",
-    "",
-    `${skills.length} skills available. Call \`skill_list\` to search, \`skill_execute <name>\` to load full instructions.`,
-    ""
-  ];
   const bySource = /* @__PURE__ */ new Map();
   for (const s2 of skills) {
-    const group = bySource.get(s2.source) ?? [];
-    group.push(s2);
-    bySource.set(s2.source, group);
-  }
-  for (const [source, group] of bySource) {
-    const names = group.map((s2) => {
-      const t2 = s2.triggers[0];
-      return t2 ? `${s2.name}(${t2})` : s2.name;
-    });
-    lines.push(`**${source}** (${group.length}): ${names.join(", ")}`);
+    bySource.set(s2.source, (bySource.get(s2.source) ?? 0) + 1);
   }
-  return lines.join("\n");
+  const sourcesSummary = [...bySource.entries()].sort((a2, b) => b[1] - a2[1]).map(([source, count]) => `${source}=${count}`).join(", ");
+  return [
+    "## Skills Index",
+    `${skills.length} skills available across ${bySource.size} sources (${sourcesSummary}).`,
+    "Use `skill_list` (with optional `filter` or `source`) to search; `skill_execute <name>` to load full instructions."
+  ].join("\n");
 }
 function safeReaddir2(dir, dirsOnly = false) {
   try {
@@ -255412,6 +255588,11 @@ import sys
 import time
 from pathlib import Path
+# Broker-picked GPU pinning — MUST run before importing torch.
+_omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
+if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
+    os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
 def _progress(stage, message, percent=None):
     payload = {"omnius_progress": True, "stage": stage, "message": message}
     if percent is not None:
@@ -255570,9 +255751,15 @@ if __name__ == "__main__":
     SDCPP_RUNNER = String.raw`#!/usr/bin/env python3
 import argparse
 import json
+import os
 import time
 from pathlib import Path
+# Broker-picked GPU pinning — sd-cpp's CUDA backend honors CUDA_VISIBLE_DEVICES.
+_omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
+if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
+    os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model-path", required=True)
@@ -255686,6 +255873,9 @@ if __name__ == "__main__":
       defaultModel;
       defaultBackend;
       promptExpander = null;
+      /** Broker-chosen GPU pinning for the in-flight generation. Read by the
+       *  spawn path to set OMNIUS_GPU_INDEX in the subprocess env. */
+      _brokerGpuIndex = null;
       constructor(cwd4, ollamaUrl = "http://localhost:11434", defaults3 = {}) {
         this.cwd = cwd4;
         this.ollamaUrl = ollamaUrl.replace(/\/v1\/?$/, "").replace(/\/$/, "");
@@ -255761,6 +255951,7 @@ if __name__ == "__main__":
         const candidates = imageGenerationFallbackCandidates(requestedModel, requestedBackend, generationFallbackEnabled(args));
         const broker = getModelBroker();
         const firstCandidate = candidates[0];
+        let brokerGpuIndex = null;
         if (firstCandidate) {
           const decision2 = await broker.ensureModelLoadable({
             name: firstCandidate.model,
@@ -255772,6 +255963,9 @@ if __name__ == "__main__":
             for (const target of decision2.evictTargets) {
               await broker.evict(target.host, target.name, "image-gen-needs-room");
             }
+            brokerGpuIndex = decision2.gpuIndex ?? null;
+          } else if (decision2.kind === "ok") {
+            brokerGpuIndex = decision2.gpuIndex ?? null;
           } else if (decision2.kind === "reject") {
             return {
               success: false,
@@ -255781,6 +255975,7 @@ if __name__ == "__main__":
             };
           }
         }
+        this._brokerGpuIndex = brokerGpuIndex;
         try {
           return await this.generateCandidateLadder({ candidates, prompt, args, seed, start: start2 });
         } catch (err) {
@@ -256283,10 +256478,14 @@ ${errText.slice(0, 800)}`,
         }
         ensureUnifiedCacheDirs();
         this.emitProgress({ stage: "load", message: `Starting image generation with ${args.model}` });
+        const runnerEnv = { ...python.env };
+        if (this._brokerGpuIndex !== null) {
+          runnerEnv["OMNIUS_GPU_INDEX"] = String(this._brokerGpuIndex);
+        }
         const result = await runProcess2(python.command, argv, {
           cwd: this.cwd,
           timeoutMs: 9e5,
-          env: python.env,
+          env: runnerEnv,
           progressLabel: `Downloading/loading ${args.model}`,
           onProgress: (event) => this.emitProgress(event)
         });
@@ -257582,9 +257781,14 @@ var init_audio_generate = __esm({
       DEFAULT_MUSIC_MODEL
     ];
     DIFFUSERS_AUDIO_RUNNER = String.raw`#!/usr/bin/env python3
-import argparse, json, sys, time
+import argparse, json, os, sys, time
 from pathlib import Path
+# Broker-picked GPU pinning — must run before importing torch.
+_omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
+if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
+    os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
 def _format_bytes(value):
     try:
         n = float(value)
@@ -257778,9 +257982,14 @@ if __name__ == "__main__":
     main()
 `;
     TRANSFORMERS_AUDIO_RUNNER = String.raw`#!/usr/bin/env python3
-import argparse, json, sys, time
+import argparse, json, os, sys, time
 from pathlib import Path
+# Broker-picked GPU pinning — must run before importing torch.
+_omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
+if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
+    os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
 def _format_bytes(value):
     try:
         n = float(value)
@@ -258006,6 +258215,8 @@ if __name__ == "__main__":
       progressHandler = null;
       lastProgressMessage = "";
       lastProgressAt = 0;
+      /** Broker-chosen GPU pinning for the in-flight generation. */
+      _brokerGpuIndex = null;
       constructor(cwd4, defaults3 = {}) {
         this.cwd = cwd4;
         this.defaults = defaults3;
@@ -258171,6 +258382,7 @@ if __name__ == "__main__":
         const playback = playbackRequested(args);
         const broker = getModelBroker();
         const firstCandidate = candidates[0];
+        let brokerGpuIndex = null;
         if (firstCandidate) {
           const decision2 = await broker.ensureModelLoadable({
             name: firstCandidate.model,
@@ -258182,6 +258394,9 @@ if __name__ == "__main__":
             for (const target of decision2.evictTargets) {
               await broker.evict(target.host, target.name, `${kind}-gen-needs-room`);
             }
+            brokerGpuIndex = decision2.gpuIndex ?? null;
+          } else if (decision2.kind === "ok") {
+            brokerGpuIndex = decision2.gpuIndex ?? null;
           } else if (decision2.kind === "reject") {
             return {
               success: false,
@@ -258191,6 +258406,7 @@ if __name__ == "__main__":
             };
           }
         }
+        this._brokerGpuIndex = brokerGpuIndex;
         try {
           return await this.generateCandidateLadder({ kind, candidates, prompt, args, seed, playback, start: start2 });
         } catch (err) {
@@ -258357,10 +258573,14 @@ if __name__ == "__main__":
         }
         ensureUnifiedCacheDirs();
         this.emitProgress({ stage: "load", message: `Starting ${args.kind} generation with ${args.model}` });
+        const runnerEnv = { ...python.env };
+        if (this._brokerGpuIndex !== null) {
+          runnerEnv["OMNIUS_GPU_INDEX"] = String(this._brokerGpuIndex);
+        }
         const result = await runProcess3(python.command, argv, {
           cwd: this.cwd,
           timeoutMs: 9e5,
-          env: python.env,
+          env: runnerEnv,
           progressLabel: `Downloading/loading ${args.model}`,
           onProgress: (event) => this.emitProgress(event)
         });
@@ -259130,7 +259350,7 @@ function parseRunnerJson3(stdout) {
   }
   return null;
 }
-var DEFAULT_DIFFUSERS_VIDEO_MODEL, SANA_VIDEO_480P_MODEL, SANA_VIDEO_720P_MODEL, WAN_TI2V_5B_MODEL, WAN_T2V_A14B_MODEL, WAN_I2V_A14B_MODEL, WAN_S2V_14B_MODEL, COGVIDEOX_5B_MODEL, COGVIDEOX_2B_MODEL, COGVIDEOX_5B_I2V_MODEL, MOCHI_PREVIEW_MODEL, LTX_VIDEO_MODEL, LTX_2_3_MODEL, HUNYUAN_VIDEO_MODEL, DIFFUSERS_VIDEO_PACKAGES, VIDEO_GENERATION_MODEL_PRESETS, VIDEO_GENERATION_QUALITY_LADDER, VIDEO_AUDIO_QUALITY_LADDER, DIFFUSERS_VIDEO_RUNNER, COMFY_BOOTSTRAP_SCRIPT, COMFY_DEFAULT_WORKFLOWS, VideoGenerateTool;
+var DEFAULT_DIFFUSERS_VIDEO_MODEL, SANA_VIDEO_480P_MODEL, SANA_VIDEO_720P_MODEL, SANA_WM_BIDIRECTIONAL_MODEL, WAN_TI2V_5B_MODEL, WAN_T2V_A14B_MODEL, WAN_I2V_A14B_MODEL, WAN_S2V_14B_MODEL, COGVIDEOX_5B_MODEL, COGVIDEOX_2B_MODEL, COGVIDEOX_5B_I2V_MODEL, MOCHI_PREVIEW_MODEL, LTX_VIDEO_MODEL, LTX_2_3_MODEL, HUNYUAN_VIDEO_MODEL, DIFFUSERS_VIDEO_PACKAGES, VIDEO_GENERATION_MODEL_PRESETS, VIDEO_GENERATION_QUALITY_LADDER, VIDEO_AUDIO_QUALITY_LADDER, DIFFUSERS_VIDEO_RUNNER, COMFY_BOOTSTRAP_SCRIPT, COMFY_DEFAULT_WORKFLOWS, VideoGenerateTool;
 var init_video_generate = __esm({
   "packages/execution/dist/tools/video-generate.js"() {
     "use strict";
@@ -259140,6 +259360,7 @@ var init_video_generate = __esm({
     DEFAULT_DIFFUSERS_VIDEO_MODEL = "Efficient-Large-Model/SANA-Video_2B_480p";
     SANA_VIDEO_480P_MODEL = "Efficient-Large-Model/SANA-Video_2B_480p";
     SANA_VIDEO_720P_MODEL = "Efficient-Large-Model/SANA-Video_2B_720p";
+    SANA_WM_BIDIRECTIONAL_MODEL = "Efficient-Large-Model/SANA-WM_bidirectional";
     WAN_TI2V_5B_MODEL = "Wan-AI/Wan2.2-TI2V-5B-Diffusers";
     WAN_T2V_A14B_MODEL = "Wan-AI/Wan2.2-T2V-A14B-Diffusers";
     WAN_I2V_A14B_MODEL = "Wan-AI/Wan2.2-I2V-A14B-Diffusers";
@@ -259433,6 +259654,41 @@ var init_video_generate = __esm({
         licenseNote: "Apache 2.0",
         note: "Premium Wan T2V; cloud GPU recommended."
       },
+      {
+        id: SANA_WM_BIDIRECTIONAL_MODEL,
+        label: "SANA-WM bidirectional (world-model i2v)",
+        kinds: ["i2v"],
+        backend: "diffusers",
+        // SANA-WM declares its concrete class in model_index.json; loaded via
+        // generic DiffusionPipeline.from_pretrained — the runner's auto path
+        // already does this for unknown model names.
+        pipelineClass: "DiffusionPipeline",
+        install: 'python3 .omnius/video-gen/diffusers_text2video.py --model Efficient-Large-Model/SANA-WM_bidirectional --mode i2v --num-frames 121 --fps 24 --width 704 --height 1280 --steps 30 --guidance 5.0 --image <input.png> --prompt "..." --output .omnius/videos/out.mp4',
+        category: "Premium quality",
+        sizeClass: "2.6B DiT + LTX-2 refiner (Sana World Model)",
+        quality: "Image-to-video world model with optional camera-trajectory control. Two-stage generation (Sana DiT + LTX-2 refiner); hybrid linear attention; 6-DoF camera support via .npy matrices or WASD/IJKL action DSL.",
+        output: "Up to ~13s 704×1280 (portrait 720p) MP4 at 24 fps; max 321 frames.",
+        bestUse: "World-model / camera-controlled video from a single first-frame image. Best on H100/A100-class hardware.",
+        minVramGB: 80,
+        recommendedVramGB: 100,
+        deployment: "Diffusers DiffusionPipeline.from_pretrained; bfloat16; aggressive CPU offload mandatory below 100 GB. Bundled LTX-2 refiner runs as stage 2.",
+        steps: 30,
+        guidance: 5,
+        numFrames: 121,
+        fps: 24,
+        width: 704,
+        height: 1280,
+        dtype: "bfloat16",
+        needsCpuOffload: true,
+        frameQuantum: 1,
+        pixelQuantum: 16,
+        // Apache 2.0 base; bundled LTX-2 refiner + VAE inherit the LTX-2
+        // non-commercial license. Surface that explicitly.
+        licenseNote: "Apache 2.0 (bundled LTX-2 refiner/VAE inherit LTX-2 non-commercial terms)",
+        approxDownloadGB: 99,
+        fallbackFor: [WAN_I2V_A14B_MODEL],
+        note: "Sana World Model bidirectional i2v; portrait 704×1280 fixed; camera control via --camera <matrices.npy> or --action <DSL> when the runner supports it."
+      },
       {
         id: WAN_I2V_A14B_MODEL,
         label: "Wan2.2 I2V A14B",
@@ -259561,6 +259817,9 @@ var init_video_generate = __esm({
       COGVIDEOX_5B_MODEL,
       MOCHI_PREVIEW_MODEL,
       COGVIDEOX_2B_MODEL,
+      // Heavy i2v / world-model tier — only attempted when an explicit model
+      // is requested or the consumer-VRAM tier above has failed for an i2v ask.
+      SANA_WM_BIDIRECTIONAL_MODEL,
       WAN_I2V_A14B_MODEL,
       WAN_T2V_A14B_MODEL,
       HUNYUAN_VIDEO_MODEL
@@ -259579,6 +259838,16 @@ import sys
 import time
 from pathlib import Path
+# ── GPU pinning ─────────────────────────────────────────────────────
+# The TS broker picks a GPU per generation via bin-packing across the
+# available CUDA devices. It passes the chosen index in OMNIUS_GPU_INDEX.
+# We MUST apply CUDA_VISIBLE_DEVICES BEFORE importing torch, otherwise
+# torch initializes the device list with all visible GPUs and the model
+# may land on a different device than the broker reserved capacity on.
+_omnius_gpu = os.environ.get("OMNIUS_GPU_INDEX", "").strip()
+if _omnius_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
+    os.environ["CUDA_VISIBLE_DEVICES"] = _omnius_gpu
 def _progress(stage, message, percent=None):
     payload = {"omnius_progress": True, "stage": stage, "message": message}
     if percent is not None:
@@ -260385,6 +260654,9 @@ if __name__ == "__main__":
       defaultBackend;
       defaultKind;
       promptExpander = null;
+      /** GPU index chosen by the broker for the in-flight generation. Read
+       *  by the spawn path to set OMNIUS_GPU_INDEX in the subprocess env. */
+      _brokerGpuIndex = null;
       constructor(cwd4, defaults3 = {}) {
         this.cwd = cwd4;
         this.defaultModel = defaults3.model;
@@ -260474,17 +260746,23 @@ if __name__ == "__main__":
         const candidates = videoGenerationFallbackCandidates(requestedModel, requestedBackend, inferredKind, generationFallbackEnabled3(args), { preferNativeAudioVideo: withAudio || Boolean(audioInput) });
         const broker = getModelBroker();
         const firstCandidate = candidates[0];
+        let brokerGpuIndex = null;
         if (firstCandidate) {
+          const preset = firstCandidate.preset;
           const decision2 = await broker.ensureModelLoadable({
             name: firstCandidate.model,
             domain: "video-gen",
             host: firstCandidate.backend === "comfyui" ? "comfyui" : "diffusers-py",
-            owner: "video-generate-tool"
+            owner: "video-generate-tool",
+            estimatedVramMB: preset ? preset.minVramGB * 1024 : void 0
           });
           if (decision2.kind === "evict") {
             for (const target of decision2.evictTargets) {
               await broker.evict(target.host, target.name, "video-gen-needs-room");
             }
+            brokerGpuIndex = decision2.gpuIndex ?? null;
+          } else if (decision2.kind === "ok") {
+            brokerGpuIndex = decision2.gpuIndex ?? null;
           } else if (decision2.kind === "reject") {
             return {
               success: false,
@@ -260494,6 +260772,7 @@ if __name__ == "__main__":
             };
           }
         }
+        this._brokerGpuIndex = brokerGpuIndex;
         if (candidates.length === 0) {
           return {
             success: false,
@@ -260915,6 +261194,9 @@ ${llmAnnotation}` : result.llmContent;
           runnerEnv["HF_TOKEN"] = effectiveToken;
           runnerEnv["HUGGING_FACE_HUB_TOKEN"] = effectiveToken;
         }
+        if (this._brokerGpuIndex !== null) {
+          runnerEnv["OMNIUS_GPU_INDEX"] = String(this._brokerGpuIndex);
+        }
         const argv = [
           runner,
           "--model",
@@ -570264,18 +570546,6 @@ function formatReflection(notes2, scenario) {
   ];
   return lines.join("\n");
 }
-function formatMemory(input, state) {
-  const lines = [];
-  if (input.memoryContext) lines.push(input.memoryContext);
-  if (state.dynamicState && Object.keys(state.dynamicState).length > 0) {
-    const entries = Object.entries(state.dynamicState).slice(0, 12).map(([key, value2]) => `- ${key}: ${compactText(JSON.stringify(value2) ?? String(value2), 220)}`);
-    lines.push(`Dynamic state:
-${entries.join("\n")}`);
-  }
-  if (state.updatedAt) lines.push(`State updated: ${state.updatedAt}`);
-  if (lines.length === 0) return "No additional retrieved voice-soul memory beyond scoped personality and runtime state.";
-  return lines.join("\n\n");
-}
 function formatFinalVoice(input) {
   const voice = findProjectVoice(input.scope);
   const lines = [
@@ -570302,23 +570572,23 @@ function buildSoulContext(input) {
   const state = loadSoulRuntimeState(input);
   const scenario = resolveSoulScenario(input, state);
   const tree2 = resolveSoulDecisionTree(input, state, scenario);
-  return [
-    "## Voice Soul Context",
-    "### 1. Authority And Safety Scope",
+  const sections = ["## Voice Soul Context"];
+  const voiceAndScope = [
     formatAuthorityScope(input),
-    "### 2. Core Identity",
     formatCoreIdentity(input),
-    "### 3. Procedural Decision Tree",
-    formatProceduralConstraints(input, scenario, tree2, state),
-    "### 4. Relationship State",
-    formatRelationshipState(input),
-    "### 5. Current Reflection Notes",
-    formatReflection(input.currentReflection, scenario),
-    "### 6. Minimal Retrieved Memory",
-    formatMemory(input, state),
-    "### 7. Final Voice Guidance",
     formatFinalVoice(input)
-  ].join("\n\n");
+  ].filter(Boolean).join("\n\n");
+  sections.push("### Voice + Scope + Identity", voiceAndScope);
+  const decisionSubstrate = [
+    formatRelationshipState(input),
+    formatProceduralConstraints(input, scenario, tree2, state)
+  ].filter(Boolean).join("\n\n");
+  sections.push("### Active Relationship + Scenario", decisionSubstrate);
+  const reflection = formatReflection(input.currentReflection, scenario);
+  if (reflection && reflection.trim().length > 0) {
+    sections.push("### Current Reflection Notes", reflection);
+  }
+  return sections.join("\n\n");
 }
 var MAX_SOUL_CHARS, MAX_VOICE_CHARS, MAX_SCOPED_PERSONALITY_CHARS, UNCLASSIFIED_SCENARIO;
 var init_voice_soul = __esm({
@@ -577249,7 +577519,32 @@ var init_status_bar = __esm({
           if (this.active) this.renderFooterPreserveCursor();
         }, intervalMs);
         try {
-          getModelBroker().startPolling(Math.max(2e3, intervalMs * 2));
+          const broker = getModelBroker();
+          try {
+            Promise.resolve().then(() => (init_dist8(), dist_exports3)).then(({ getOllamaPool: getOllamaPool2, resolveDefaultPoolConfig: resolveDefaultPoolConfig2 }) => {
+              try {
+                const config = resolveDefaultPoolConfig2();
+                const pool3 = getOllamaPool2({ baseInstanceUrl: config.baseInstanceUrl });
+                broker.setOllamaAffinityProvider((modelName) => {
+                  try {
+                    const status = pool3.status?.();
+                    if (!status) return null;
+                    for (const inst of status.instances ?? []) {
+                      void modelName;
+                      return { gpuIndex: inst.gpuIndex, gpuUuid: inst.gpuUuid };
+                    }
+                    return null;
+                  } catch {
+                    return null;
+                  }
+                });
+              } catch {
+              }
+            }).catch(() => {
+            });
+          } catch {
+          }
+          broker.startPolling(Math.max(2e3, intervalMs * 2));
         } catch {
         }
       }
@@ -604352,14 +604647,22 @@ async function handleBroker(arg, _ctx) {
     safeLog(`  ${c3.bold("Resource Broker")}`);
     safeLog("");
     safeLog(`  ${c3.dim("RAM:")}  ${snap.ramMB.used} / ${snap.ramMB.total} MB used (${snap.ramMB.free} MB free)`);
-    if (snap.vramMB) {
+    if (snap.vramPerDevice.length > 0) {
+      safeLog(`  ${c3.bold("GPUs:")}`);
+      for (const d2 of snap.vramPerDevice) {
+        const gpuSlots = snap.slots.byGpu[d2.index];
+        const slotInfo = gpuSlots ? ` slots=${gpuSlots.inUse}/${gpuSlots.capacity}, loaded=${gpuSlots.loadedMB}MB` : "";
+        safeLog(`    gpu${d2.index} (${d2.uuid.slice(0, 12)}…)  ${d2.used} / ${d2.total} MB used (${d2.free} MB free)${slotInfo}`);
+      }
+    } else if (snap.vramMB) {
       safeLog(`  ${c3.dim("VRAM:")} ${snap.vramMB.used} / ${snap.vramMB.total} MB used (${snap.vramMB.free} MB free)`);
     } else {
       safeLog(`  ${c3.dim("VRAM:")} ${c3.dim("(no GPU detected)")}`);
     }
     safeLog(`  ${c3.dim("RAM headroom threshold:")}  ${broker.ramHeadroomMB} MB`);
-    safeLog(`  ${c3.dim("VRAM headroom threshold:")} ${broker.vramHeadroomMB} MB`);
+    safeLog(`  ${c3.dim("VRAM headroom threshold:")} ${broker.vramHeadroomMB} MB (per-device)`);
     safeLog(`  ${c3.dim("Idle-evict threshold:")} ${Math.round(broker.idleEvictMs / 1e3)}s`);
+    safeLog(`  ${c3.dim("Slot capacity:")} ${snap.slots.inUse}/${snap.slots.capacity} active, queue ${snap.slots.queueDepth}/${snap.slots.queueCapacity}`);
     safeLog("");
     if (snap.loaded.length === 0) {
       safeLog(`  ${c3.dim("No loaded models tracked.")}`);
@@ -604370,7 +604673,8 @@ async function handleBroker(arg, _ctx) {
         const idle = Math.round((now - m2.lastUsedAt) / 1e3);
         const owner = m2.owner ? c3.dim(` [owner=${m2.owner}]`) : "";
         const ctx3 = m2.numCtx ? c3.dim(` n_ctx=${m2.numCtx}`) : "";
-        safeLog(`    ${c3.cyan(m2.name)} (${m2.host}/${m2.domain})  vram=${m2.vramMB}MB ram=${m2.ramMB}MB  idle=${idle}s${ctx3}${owner}`);
+        const gpu = m2.gpuIndex !== null && m2.gpuIndex !== void 0 ? c3.dim(` gpu=${m2.gpuIndex}`) : "";
+        safeLog(`    ${c3.cyan(m2.name)} (${m2.host}/${m2.domain})  vram=${m2.vramMB}MB ram=${m2.ramMB}MB${gpu}  idle=${idle}s${ctx3}${owner}`);
       }
     }
     if (snap.inflight.length > 0) {
@@ -618122,6 +618426,95 @@ function parseTelegramSilentReflectionNotes(text) {
   }
   return null;
 }
+/**
+ * Pulls the text of the "reply" field out of a streamed chat-reply buffer
+ * that may still be incomplete JSON.
+ *
+ * The buffer is first passed through stripTelegramHiddenThinking and has its
+ * leading whitespace removed. If what remains does not start with "{", it is
+ * treated as plain text and returned as-is (null when empty). Otherwise the
+ * string value following the first `"reply"` occurrence is decoded character
+ * by character up to its closing quote; when the buffer ends mid-string,
+ * whatever has been decoded so far is returned.
+ *
+ * @param buffer2 Model output accumulated so far.
+ * @returns Decoded reply text, or null when nothing usable is present yet
+ *   (no "reply" key, no ":" or opening quote after it, or an unterminated
+ *   value that is still empty). A terminated empty value returns "".
+ */
+function extractPartialTelegramReplyJson(buffer2) {
+  const stripped = stripTelegramHiddenThinking(buffer2).trimStart();
+  if (!stripped.startsWith("{")) {
+    return stripped || null;
+  }
+  const keyMatch = stripped.indexOf('"reply"');
+  if (keyMatch < 0) return null;
+  let i2 = keyMatch + '"reply"'.length;
+  while (i2 < stripped.length && stripped[i2] !== ":") i2++;
+  if (i2 >= stripped.length) return null;
+  i2++;
+  while (i2 < stripped.length && /\s/.test(stripped[i2])) i2++;
+  if (i2 >= stripped.length || stripped[i2] !== '"') return null;
+  i2++;
+  // Every single-character escape defined by the JSON grammar, including
+  // \b and \f, which would otherwise decode to the letters "b" and "f".
+  const simpleEscapes = /* @__PURE__ */ new Map([
+    ['"', '"'],
+    ["\\", "\\"],
+    ["/", "/"],
+    ["n", "\n"],
+    ["t", "\t"],
+    ["r", "\r"],
+    ["b", "\b"],
+    ["f", "\f"]
+  ]);
+  let out = "";
+  while (i2 < stripped.length) {
+    const ch = stripped[i2];
+    if (ch === "\\") {
+      const next = stripped[i2 + 1];
+      // A trailing backslash means the escape has not fully arrived yet.
+      if (next === void 0) break;
+      if (next === "u") {
+        // Need all four hex digits before decoding; otherwise wait for more.
+        if (i2 + 5 >= stripped.length) break;
+        const hex = stripped.slice(i2 + 2, i2 + 6);
+        // Require exactly four hex digits: a bare parseInt would accept a
+        // valid prefix (e.g. "12zz" -> 0x12) and emit a bogus character.
+        if (/^[0-9a-fA-F]{4}$/.test(hex)) out += String.fromCharCode(parseInt(hex, 16));
+        i2 += 6;
+        continue;
+      }
+      // Unknown escapes keep the escaped character verbatim.
+      out += simpleEscapes.get(next) ?? next;
+      i2 += 2;
+      continue;
+    }
+    if (ch === '"') {
+      // Closing quote: the value is complete.
+      return out;
+    }
+    out += ch;
+    i2++;
+  }
+  // Ran off the end of the buffer inside the string: return the partial text.
+  return out.length > 0 ? out : null;
+}
+/**
+ * Resolves the final reply text from a completed chat-reply buffer.
+ *
+ * Attempts, in order: parsing the whole (thinking-stripped, trimmed) buffer
+ * as JSON; parsing only the first balanced top-level `{...}` object; and
+ * finally the character-level extraction done by
+ * extractPartialTelegramReplyJson.
+ *
+ * @param buffer2 Complete model output.
+ * @returns The trimmed reply string, or null when the buffer does not start
+ *   with "{" or no non-empty reply can be recovered by the last fallback.
+ */
+function extractFinalTelegramReplyJson(buffer2) {
+  const text = stripTelegramHiddenThinking(buffer2).trim();
+  if (!text.startsWith("{")) return null;
+  // Trimmed "reply" of a JSON candidate, or null when the candidate does not
+  // parse or has no string "reply".
+  const replyOf = (candidate) => {
+    try {
+      const doc = JSON.parse(candidate);
+      return typeof doc.reply === "string" ? doc.reply.trim() : null;
+    } catch {
+      return null;
+    }
+  };
+  const whole = replyOf(text);
+  if (whole !== null) return whole;
+  // Locate the brace that closes the first top-level object, ignoring braces
+  // inside string literals and characters that follow a backslash in them.
+  let closeAt = -1;
+  let nesting = 0;
+  let quoted = false;
+  let skipNext = false;
+  for (let pos = 0; pos < text.length && closeAt < 0; pos++) {
+    const ch = text[pos];
+    if (skipNext) {
+      skipNext = false;
+    } else if (quoted) {
+      if (ch === "\\") skipNext = true;
+      else if (ch === '"') quoted = false;
+    } else if (ch === '"') {
+      quoted = true;
+    } else if (ch === "{") {
+      nesting += 1;
+    } else if (ch === "}" && --nesting === 0) {
+      closeAt = pos;
+    }
+  }
+  if (closeAt > 0) {
+    const leading = replyOf(text.slice(0, closeAt + 1));
+    if (leading !== null) return leading;
+  }
+  // Last resort: decode the reply value by hand.
+  const salvaged = extractPartialTelegramReplyJson(text);
+  if (!salvaged) return null;
+  const trimmed = salvaged.trim();
+  return trimmed.length > 0 ? trimmed : null;
+}
 function estimatePromptTokensFromRequest(request) {
   let chars = 0;
   for (const m2 of request.messages ?? []) {
@@ -618138,6 +618531,32 @@ function estimatePromptTokensFromRequest(request) {
   }
   return Math.ceil(chars / 4);
 }
+/**
+ * Heuristic: does `text` look like a router decision JSON object that was
+ * cut off before its closing brace?
+ *
+ * One leading <think>...</think> block is dropped, then braces are counted
+ * outside of string literals. The result is true only when at least one
+ * brace is still open and at least three of the known router keys appear.
+ *
+ * @param text Candidate model output; non-strings yield false.
+ * @returns {boolean} true when the output looks like truncated router JSON.
+ */
+function isLikelyTruncatedRouterJson(text) {
+  if (typeof text !== "string") return false;
+  const body = text.replace(/^\s*<think>[\s\S]*?<\/think>\s*/i, "").trim();
+  if (!body.startsWith("{")) return false;
+  let openBraces = 0;
+  let quoted = false;
+  let skipNext = false;
+  for (const ch of body) {
+    if (skipNext) {
+      // Character following a backslash inside a string: consume it blindly.
+      skipNext = false;
+    } else if (quoted) {
+      if (ch === "\\") skipNext = true;
+      else if (ch === '"') quoted = false;
+    } else if (ch === '"') {
+      quoted = true;
+    } else if (ch === "{") {
+      openBraces += 1;
+    } else if (ch === "}") {
+      openBraces -= 1;
+    }
+  }
+  // Balanced (or over-closed) output is not a truncation.
+  if (openBraces <= 0) return false;
+  const routerKeys = ['"route"', '"should_reply"', '"confidence"', '"reason"', '"silent_disposition"', '"mental_note"'];
+  const seen = routerKeys.filter((key) => body.includes(key)).length;
+  return seen >= 3;
+}
 function telegramRouterTimeoutMs(configTimeoutMs, _minMs, _legacyMaxMs) {
   void _minMs;
   void _legacyMaxMs;
@@ -619583,7 +620002,7 @@ function renderTelegramSubAgentError(username, error) {
   process.stdout.write(`    ${c3.dim("│")} ${c3.magenta("✘")} @${username}: ${c3.dim(preview)}
 `);
 }
-var TELEGRAM_TOOL_ACTION_GROUPS, TELEGRAM_TOOL_ACTION_GROUP, TELEGRAM_TOOL_MUTATING_GROUPS, DEFAULT_TELEGRAM_TOOL_GROUP_POLICY, TELEGRAM_TOOL_BUTTON_LABELS, TELEGRAM_SAFETY_PROMPT, ADMIN_DM_PROMPT, ADMIN_GROUP_PROMPT, TELEGRAM_PUBLIC_SOUL_PROFILE, TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT, TELEGRAM_PUBLIC_MEMORY_SCOPE_CONTRACT, TELEGRAM_PUBLIC_VISION_STACK_CONTRACT, GROUP_REPLY_DISCRETION_PROMPT, TELEGRAM_CHAT_MODE_PROMPT, ADMIN_CHAT_PROFILE_PROMPT, TELEGRAM_ACTION_RESPONSE_CONTRACT, TELEGRAM_EXTERNAL_ACQUISITION_CONTRACT, TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT, TELEGRAM_STUCK_SELF_TALK_PREFIXES, TELEGRAM_CHAT_HISTORY_LIMIT, TELEGRAM_CONTEXT_RECENT_DEFAULT, TELEGRAM_CONTEXT_LINE_LIMIT, TELEGRAM_CONTEXT_SAMPLE_LIMIT, TELEGRAM_MEMORY_CARD_LIMIT, TELEGRAM_MEMORY_NOTE_LIMIT, TELEGRAM_ASSOCIATIVE_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_USER_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_ACTION_LIMIT, TELEGRAM_ASSOCIATIVE_RELATION_LIMIT, TELEGRAM_MEMORY_STOPWORDS, TELEGRAM_MEMORY_GENERIC_QUERY_TOKENS, TELEGRAM_SUB_AGENT_BOUNDED_OPTIONS, TELEGRAM_SUB_AGENT_DEFAULT_LIMIT, TELEGRAM_SUB_AGENT_MAX_LIMIT, TELEGRAM_SUB_AGENT_BURST_CONTEXT_LIMIT, TELEGRAM_PUBLIC_HELP_COMMANDS2, TELEGRAM_REMINDER_SLASH_COMMANDS, TELEGRAM_REFLECTION_SLASH_COMMANDS, TELEGRAM_PUBLIC_BOT_COMMAND_NAMES, TELEGRAM_IMAGE_EXTENSIONS, MEDIA_CACHE_TTL_MS, TELEGRAM_CHANNEL_DMN_SWEEP_MS, TELEGRAM_CHANNEL_DMN_IDLE_AFTER_MS, TELEGRAM_CHANNEL_DMN_MIN_INTERVAL_MS, TELEGRAM_CHANNEL_DMN_MIN_MESSAGES, TELEGRAM_ALLOWED_UPDATES, TELEGRAM_PUBLIC_TOOL_QUOTAS, TelegramBridge;
+var TELEGRAM_TOOL_ACTION_GROUPS, TELEGRAM_TOOL_ACTION_GROUP, TELEGRAM_TOOL_MUTATING_GROUPS, DEFAULT_TELEGRAM_TOOL_GROUP_POLICY, TELEGRAM_TOOL_BUTTON_LABELS, TELEGRAM_SAFETY_PROMPT, ADMIN_DM_PROMPT, ADMIN_GROUP_PROMPT, TELEGRAM_PUBLIC_SOUL_PROFILE, TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT, TELEGRAM_PUBLIC_MEMORY_SCOPE_CONTRACT, TELEGRAM_PUBLIC_VISION_STACK_CONTRACT, GROUP_REPLY_DISCRETION_PROMPT, TELEGRAM_CHAT_MODE_PROMPT, ADMIN_CHAT_PROFILE_PROMPT, TELEGRAM_ACTION_RESPONSE_CONTRACT, TELEGRAM_EXTERNAL_ACQUISITION_CONTRACT, TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT, TELEGRAM_CHAT_REPLY_RESPONSE_FORMAT, TELEGRAM_STUCK_SELF_TALK_PREFIXES, TELEGRAM_CHAT_HISTORY_LIMIT, TELEGRAM_CONTEXT_RECENT_DEFAULT, TELEGRAM_CONTEXT_LINE_LIMIT, TELEGRAM_CONTEXT_SAMPLE_LIMIT, TELEGRAM_MEMORY_CARD_LIMIT, TELEGRAM_MEMORY_NOTE_LIMIT, TELEGRAM_ASSOCIATIVE_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_USER_FACT_LIMIT, TELEGRAM_ASSOCIATIVE_ACTION_LIMIT, TELEGRAM_ASSOCIATIVE_RELATION_LIMIT, TELEGRAM_MEMORY_STOPWORDS, TELEGRAM_MEMORY_GENERIC_QUERY_TOKENS, TELEGRAM_SUB_AGENT_BOUNDED_OPTIONS, TELEGRAM_SUB_AGENT_DEFAULT_LIMIT, TELEGRAM_SUB_AGENT_MAX_LIMIT, TELEGRAM_SUB_AGENT_BURST_CONTEXT_LIMIT, TELEGRAM_PUBLIC_HELP_COMMANDS2, TELEGRAM_REMINDER_SLASH_COMMANDS, TELEGRAM_REFLECTION_SLASH_COMMANDS, TELEGRAM_PUBLIC_BOT_COMMAND_NAMES, TELEGRAM_IMAGE_EXTENSIONS, MEDIA_CACHE_TTL_MS, TELEGRAM_CHANNEL_DMN_SWEEP_MS, TELEGRAM_CHANNEL_DMN_IDLE_AFTER_MS, TELEGRAM_CHANNEL_DMN_MIN_INTERVAL_MS, TELEGRAM_CHANNEL_DMN_MIN_MESSAGES, TELEGRAM_ALLOWED_UPDATES, TELEGRAM_PUBLIC_TOOL_QUOTAS, TelegramBridge;
 var init_telegram_bridge = __esm({
   "packages/cli/src/tui/telegram-bridge.ts"() {
     "use strict";
@@ -619817,6 +620236,12 @@ Rules:
 7. Do not claim older chat is unavailable when the context stream contains it. If asked what you see, summarize the supplied transcript, speakers, and relationship/tone signals.
 8. Mirror the current sender's tone and directness while staying safe and clear.
 9. Never send router decisions, skip explanations, memory-stage notes, task-complete summaries, or "no_reply" as chat text.
+Output discipline (your assistant message is sent verbatim to Telegram, ALL of it):
+- Emit ONLY the final reply text. Do not narrate your reasoning, summarize what you found, organize bullet-point notes, or write phrases like "Let me summarize", "Let me send the reply", "Now I have enough", "Based on the research", "Here's my response:" before the actual reply. Those are scratch-pad phrases that leak when emitted as visible text.
+- Do not produce a draft followed by the final answer. The first character of your output should be the first character of the message the user will receive.
+- If you need to think, do it silently. Do not write your reasoning steps as visible prose. If you have an internal scratchpad, keep it internal.
+- A reply that begins by restating what you found, then says something like "Let me write the response" or "Here's the breakdown", then gives the answer, is wrong twice over: the user sees the restatement AND the answer, doubling the message. Skip the restatement.
 `.trim();
     ADMIN_CHAT_PROFILE_PROMPT = `
 You are replying to the authenticated Telegram admin in a private DM.
@@ -619849,6 +620274,24 @@ External acquisition contract:
     TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT = {
       type: "json_object"
     };
+    TELEGRAM_CHAT_REPLY_RESPONSE_FORMAT = {
+      type: "json_schema",
+      json_schema: {
+        name: "telegram_chat_reply",
+        strict: true,
+        schema: {
+          type: "object",
+          additionalProperties: false,
+          required: ["reply"],
+          properties: {
+            reply: {
+              type: "string",
+              description: "The exact text to send to Telegram. No prefixes, no narration, no scratch reasoning, no bullet-point notes preceding the reply."
+            }
+          }
+        }
+      }
+    };
     TELEGRAM_STUCK_SELF_TALK_PREFIXES = [
       /^i'?ve been stuck for\b/i,
       /^i am (still |currently )?stuck\b/i,
@@ -622075,6 +622518,14 @@ ${mediaContext}` : ""
         if (state.lastFollowupAt && now - state.lastFollowupAt < 60 * 6e4) {
           return { sent: false, reason: "rate limit held public follow-up" };
         }
+        const cooldownEnv = Number.parseInt(process.env["OMNIUS_TG_FOLLOWUP_COOLDOWN_MS"] ?? "", 10);
+        const cooldownMs = Number.isFinite(cooldownEnv) && cooldownEnv >= 6e4 ? cooldownEnv : 10 * 6e4;
+        if (state.lastAssistantMessageAt && now - state.lastAssistantMessageAt < cooldownMs) {
+          return {
+            sent: false,
+            reason: `recent assistant reply suppresses follow-up (${Math.round((now - state.lastAssistantMessageAt) / 1e3)}s ago, cooldown ${Math.round(cooldownMs / 1e3)}s)`
+          };
+        }
         const candidateMessageIds = Array.from(new Set([
           ...artifact.curiosityThreads.flatMap((thread) => thread.sourceMessages ?? []),
           ...artifact.memoryProposals.flatMap((proposal) => proposal.sourceMessages ?? []),
@@ -622405,6 +622856,10 @@ ${mediaContext}` : ""
           chatTitle: msg.chatTitle
         };
         this.recordChatHistory(sessionKey, entry);
+        try {
+          this.reflectionStateForSession(sessionKey).lastAssistantMessageAt = Date.now();
+        } catch {
+        }
         this.persistTelegramAssistantMessage(
           msg,
           clean5,
@@ -623632,32 +624087,16 @@ ${lines.join("\n")}`);
           sections.push(`### Participants And Relationship Signals${tierNote}
 ${participantLines.join("\n")}`);
         }
-        const associativeContext = this.relevantTelegramAssociativeMemoryContext(
-          sessionKey,
-          msg,
-          isGroup ? 14 : 8
-        );
-        if (associativeContext) {
-          sections.push(associativeContext);
-        }
-        const sqliteMirrorContext = this.relevantTelegramSqliteMirrorContext(
-          sessionKey,
-          msg,
-          isGroup ? 14 : 8
-        );
-        if (sqliteMirrorContext) {
-          sections.push(sqliteMirrorContext);
-        }
-        try {
-          const episodicContext = this.relevantTelegramEpisodicMemoryContext(
+        const ASSOCIATIVE_MIN_TURNS = isGroup ? 8 : 4;
+        if (retainedCount >= ASSOCIATIVE_MIN_TURNS) {
+          const associativeContext = this.relevantTelegramAssociativeMemoryContext(
             sessionKey,
             msg,
-            isGroup ? 10 : 6
+            isGroup ? 14 : 8
           );
-          if (episodicContext) {
-            sections.push(episodicContext);
+          if (associativeContext) {
+            sections.push(associativeContext);
           }
-        } catch {
         }
         const memoryCards = this.relevantTelegramMemoryCards(sessionKey, msg, isGroup ? 10 : 6);
         if (memoryCards.length > 0) {
@@ -623688,10 +624127,6 @@ ${notes2}`;
 ${cardLines.join("\n")}`);
           }
         }
-        const channelDaydream = this.formatLatestTelegramChannelDaydreamContext(sessionKey);
-        if (channelDaydream) {
-          sections.push(channelDaydream);
-        }
         const recentMedia = this.recentTelegramMediaEntries(msg.chatId, 10);
         if (recentMedia.length > 0) {
           const mediaLines = recentMedia.map((entry) => {
@@ -623710,26 +624145,33 @@ ${cardLines.join("\n")}`);
           ].join("\n"));
         }
         if (olderCount > 0) {
+          const halfLifeMs = (isGroup ? 24 : 48) * 60 * 60 * 1e3;
+          const now = Date.now();
           const older = history.slice(0, olderCount);
           const bySpeaker = /* @__PURE__ */ new Map();
           for (const entry of older) {
             if (!entry.text.trim()) continue;
             const speaker = telegramHistorySpeaker(entry);
+            const ageMs = Math.max(0, now - (entry.ts ?? 0));
+            const weight = Math.exp(-ageMs / halfLifeMs);
             const existing = bySpeaker.get(speaker);
             const text = truncateTelegramContextLine(entry.text, 180);
             if (existing) {
               existing.count += 1;
               existing.last = text;
+              existing.weightSum += weight;
+              existing.maxWeight = Math.max(existing.maxWeight, weight);
             } else {
-              bySpeaker.set(speaker, { count: 1, first: text, last: text });
+              bySpeaker.set(speaker, { count: 1, first: text, last: text, weightSum: weight, maxWeight: weight });
             }
           }
-          const olderLines = [...bySpeaker.entries()].slice(0, 10).map(([speaker, info]) => {
+          const olderLines = [...bySpeaker.entries()].sort(([, a2], [, b]) => b.maxWeight - a2.maxWeight).slice(0, 5).map(([speaker, info]) => {
             const range = info.first === info.last ? info.first : `${info.first} -> ${info.last}`;
-            return `- ${speaker}: ${info.count} earlier msg(s); digest=${telegramContextJsonString(range, 240)}`;
+            const decayLabel = info.maxWeight >= 0.5 ? "fresh" : info.maxWeight >= 0.1 ? "decayed" : "stale";
+            return `- ${speaker}: ${info.count} earlier msg(s) [${decayLabel}]; digest=${telegramContextJsonString(range, 200)}`;
           });
           if (olderLines.length > 0) {
-            sections.push(`### Earlier Retained Thread Digest
+            sections.push(`### Earlier Retained Thread Digest (recency-weighted)
 ${olderLines.join("\n")}`);
           }
         }
@@ -623949,7 +624391,8 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
               ],
               tools: [],
               temperature: 0,
-              maxTokens: 650,
+              // Reflection has 12 string fields; 650 was tight enough to truncate.
+              maxTokens: 1500,
               timeoutMs: telegramRouterTimeoutMs(timeoutMs),
               think: false
             },
@@ -624039,9 +624482,11 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
         const promptTokens = estimatePromptTokensFromRequest(request);
         const broker = getModelBroker();
         const trainCtx = await broker.getNctxTrain(model).catch(() => null);
-        const targetCtx = trainCtx && trainCtx > 0 ? Math.min(trainCtx, Math.max(2048, promptTokens + 1024)) : Math.max(2048, promptTokens + 1024);
+        const completionHeadroom = 4096;
+        const targetCtx = trainCtx && trainCtx > 0 ? Math.min(trainCtx, Math.max(2048, promptTokens + completionHeadroom)) : Math.max(2048, promptTokens + completionHeadroom);
         const requestWithCtx = { ...request, numCtx: targetCtx };
-        const slot = await broker.acquireInferenceSlot({
+        const brokerBypass = process.env["OMNIUS_DISABLE_BROKER_ADMISSION"] === "1";
+        const slot = brokerBypass ? null : await broker.acquireInferenceSlot({
           model,
           domain: "chat",
           owner: `telegram-bridge/${kind}`,
@@ -624049,10 +624494,12 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
           promptTokens,
           priority: kind === "router" || kind === "router-repair" || kind === "router-strict-retry" ? 1 : 0
         });
-        this.tuiWrite(() => renderTelegramSubAgentEvent(
-          sessionKey,
-          `inference admitted [${kind}] model=${model} prompt~${promptTokens}t num_ctx=${targetCtx} slot=${slot.info.id}${slot.info.reserved ? " reserved" : ""}`
-        ));
+        if (process.env["OMNIUS_BROKER_TRACE"] === "1") {
+          this.tuiWrite(() => renderTelegramSubAgentEvent(
+            sessionKey,
+            `inference admitted [${kind}] model=${model} prompt~${promptTokens}t num_ctx=${targetCtx} slot=${slot ? slot.info.id : "bypass"}${slot?.info.reserved ? " reserved" : ""}`
+          ));
+        }
         const streamFn = backend.chatCompletionStream;
         const id = this.registerTelegramInference(kind, sessionKey, model);
         let completionTokens = 0;
@@ -624079,10 +624526,10 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
           }
           const usage = result.usage;
           completionTokens = usage?.completion_tokens ?? 0;
-          slot.release({ ok: true, completionTokens });
+          slot?.release({ ok: true, completionTokens });
           return result;
         } catch (err) {
-          slot.release({ ok: false, error: err instanceof Error ? err.message : String(err) });
+          slot?.release({ ok: false, error: err instanceof Error ? err.message : String(err) });
           throw err;
         } finally {
           this.deregisterTelegramInference(id);
@@ -624274,7 +624721,7 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
       getTelegramThinkingVisible() {
         return this.telegramThinkingVisible;
       }
-      async repairTelegramInteractionDecision(backend, rawOutput, forcedRoute, timeoutMs, diagnostics) {
+      async repairTelegramInteractionDecision(backend, rawOutput, forcedRoute, timeoutMs, diagnostics, sessionKey = "__router__") {
         const rawPreview = telegramRouterRawPreview(rawOutput, 4e3);
         if (!rawPreview || telegramDecisionOutputHasDanglingJson(rawOutput)) {
           if (diagnostics) {
@@ -624309,10 +624756,10 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`,
             ],
             tools: [],
             temperature: 0,
-            maxTokens: 500,
+            maxTokens: 1500,
             timeoutMs: telegramRouterTimeoutMs(timeoutMs, 8e3, 2e4),
             think: false
-          });
+          }, diagnostics, "router-repair", sessionKey);
           const repairedText = result.choices[0]?.message?.content ?? "";
           if (telegramDecisionRecoverableFlag(repairedText) === false) {
             if (diagnostics) diagnostics.repairStatus = "no-recoverable-output";
@@ -624344,7 +624791,7 @@ ${repairedText}`,
           return null;
         }
       }
-      async retryTelegramInteractionDecisionStrict(backend, userPrompt, rawOutput, forcedRoute, timeoutMs, diagnostics) {
+      async retryTelegramInteractionDecisionStrict(backend, userPrompt, rawOutput, forcedRoute, timeoutMs, diagnostics, sessionKey = "__router__") {
         const invalidPreview = telegramRouterRawPreview(rawOutput, 1200) ?? "(empty assistant content)";
         const routeInstruction = forcedRoute ? `The operator selected Telegram mode "${forcedRoute}". The route field must be "${forcedRoute}", but should_reply must still be inferred from context.` : `Infer route live from context.`;
         const trimmedUserPrompt = userPrompt.length > 4e3 ? `…
@@ -624376,10 +624823,10 @@ ${userPrompt.slice(-4e3)}` : userPrompt;
             ],
             tools: [],
             temperature: 0,
-            maxTokens: 1200,
+            maxTokens: 2400,
             timeoutMs: telegramRouterTimeoutMs(timeoutMs, 1e4, 3e4),
             think: false
-          });
+          }, diagnostics, "router-strict-retry", sessionKey);
           const retryText = result.choices[0]?.message?.content ?? "";
           if (diagnostics) diagnostics.strictRetryPreview = telegramRouterRawPreview(retryText, 320);
           const parsed = parseTelegramInteractionDecision(retryText, forcedRoute, {
@@ -624762,10 +625209,14 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`
             ],
             tools: [],
             temperature: 0,
-            maxTokens: 1e3,
+            // Router JSON schema has ~18 string-valued fields when reflection is
+            // embedded (consolidated mode). 1000 tokens was the documented cause
+            // of truncated JSON → repair → strict-retry cascade. 2400 is enough
+            // for normal verbose values without slowing the call appreciably.
+            maxTokens: 2400,
             timeoutMs: telegramRouterTimeoutMs(config.timeoutMs),
             think: false
-          }, diagnostics);
+          }, diagnostics, "router", sessionKey);
           const text = result.choices[0]?.message?.content ?? "";
           const routerLatencyMs = Date.now() - routerStartMs;
           try {
@@ -624788,12 +625239,40 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`
           if (parsed) {
             return this.applyTelegramSilentReflectionNotes(parsed, reflectionNotes);
           }
+          if (isLikelyTruncatedRouterJson(text)) {
+            if (diagnostics) diagnostics.repairStatus = "skipped-truncation-rerun";
+            try {
+              const reissued = await this.telegramRouterJsonCompletion(backend, {
+                messages: [
+                  {
+                    role: "system",
+                    content: "You perform live Telegram route and stimulation inference. Output strict JSON only."
+                  },
+                  { role: "user", content: userPrompt }
+                ],
+                tools: [],
+                temperature: 0,
+                maxTokens: 4096,
+                timeoutMs: telegramRouterTimeoutMs(config.timeoutMs),
+                think: false
+              }, diagnostics, "router", sessionKey);
+              const reissuedText = reissued.choices[0]?.message?.content ?? "";
+              const reparsed = parseTelegramInteractionDecision(reissuedText, forcedRoute, {
+                defaultShouldReply: false
+              });
+              if (reparsed) {
+                return this.applyTelegramSilentReflectionNotes(reparsed, reflectionNotes);
+              }
+            } catch {
+            }
+          }
           const repaired = await this.repairTelegramInteractionDecision(
             backend,
             text,
             forcedRoute,
             config.timeoutMs ?? 3e4,
-            diagnostics
+            diagnostics,
+            sessionKey
           );
           if (repaired) {
             return this.applyTelegramSilentReflectionNotes(repaired, reflectionNotes);
@@ -624804,7 +625283,8 @@ ${this.quoteTelegramContextBlock(msg.text, 1200)}`
             text,
             forcedRoute,
             config.timeoutMs ?? 3e4,
-            diagnostics
+            diagnostics,
+            sessionKey
           );
           if (strictRetry) {
             return this.applyTelegramSilentReflectionNotes(strictRetry, reflectionNotes);
@@ -625050,34 +625530,25 @@ ${list}` : "No shared group target is currently known for this sender. Ask in th
         return join131(this.repoRoot, ".omnius", "telegram-runner-state", safe);
       }
       buildTelegramAdminOverviewContext(currentSessionKey) {
-        const sections = [];
         this.ensureAllTelegramConversationsLoaded();
         const chatEntries = [...this.chatHistory.entries()].filter(([sessionKey, history]) => sessionKey !== currentSessionKey && history.length > 0).sort(([, a2], [, b]) => (b[b.length - 1]?.ts ?? 0) - (a2[a2.length - 1]?.ts ?? 0)).slice(0, 18);
+        if (chatEntries.length === 0) return "";
+        const indexLines = [];
         for (const [sessionKey, history] of chatEntries) {
           const latest = history[history.length - 1];
-          const participants = [...this.chatParticipants.get(sessionKey)?.values() ?? []].sort((a2, b) => b.lastSeenTs - a2.lastSeenTs).slice(0, 8).map((profile) => {
-            const label = profile.username && profile.username !== "unknown" ? `@${profile.username}` : profile.firstName || `user:${profile.fromUserId}`;
-            return `${label} (${profile.messageCount} msg)`;
-          }).join(", ");
-          const recent = history.slice(-5).map(
-            (entry) => `    - ${telegramHistorySpeaker(entry)}: ${truncateTelegramContextLine(entry.text, 180)}`
-          ).join("\n");
-          const cards = (this.chatMemoryCards.get(sessionKey) ?? []).slice(0, 4).map((card) => `    - ${card.title}: ${card.notes.slice(-1)[0] ?? ""}`).join("\n");
-          sections.push([
-            `- ${sessionKey} (chat_id ${String(latest.chatId ?? "unknown")}; ${latest.chatType || "chat"}${latest.chatTitle ? `: ${latest.chatTitle}` : ""})`,
-            participants ? `  Participants: ${participants}` : "",
-            `  Latest: ${telegramHistorySpeaker(latest)}: ${truncateTelegramContextLine(latest.text, 180)}`,
-            recent ? `  Recent:
-${recent}` : "",
-            cards ? `  Memory cards:
-${cards}` : ""
-          ].filter(Boolean).join("\n"));
-        }
-        if (sections.length === 0) return "";
+          const participantCount = this.chatParticipants.get(sessionKey)?.size ?? 0;
+          const ageMs = Date.now() - (latest.ts ?? 0);
+          const ageMin = Math.round(ageMs / 6e4);
+          const ageStr = ageMin < 60 ? `${ageMin}m ago` : ageMin < 24 * 60 ? `${Math.round(ageMin / 60)}h ago` : `${Math.round(ageMin / (24 * 60))}d ago`;
+          const label = latest.chatTitle ? `"${latest.chatTitle}"` : sessionKey;
+          indexLines.push(`- ${label} (chat_id ${String(latest.chatId ?? "?")}; ${latest.chatType || "chat"}): ${participantCount} participants; last ${ageStr}; ${history.length} retained msgs`);
+        }
         return [
-          "## Admin Telegram Omniscience",
-          "This section is one-way context for the authenticated admin private DM only. It summarizes public/group and other Telegram sessions the bot has observed. Never inject admin/private DM content into public groups.",
-          sections.join("\n")
+          "## Admin Telegram Omniscience (index only)",
+          "One-way context for the authenticated admin private DM. Other Telegram sessions the bot has observed are listed below with one line each.",
+          "For details on a specific chat, use telegram_memory_search with the chat_id or topic — the always-loaded view is intentionally compact.",
+          "Never inject admin/private DM content into public groups.",
+          indexLines.join("\n")
         ].join("\n\n");
       }
       buildTelegramSessionContext(msg, toolContext, profile, modelTier) {
@@ -626197,8 +626668,9 @@ ${conversationStream}`
           messages: this.buildTelegramChatMessages(msg, toolContext, mediaContext),
           tools: [],
           temperature: 0.4,
-          maxTokens: 700,
-          timeoutMs: Math.max(config.timeoutMs ?? 3e5, 12e4)
+          maxTokens: 1500,
+          timeoutMs: Math.max(config.timeoutMs ?? 3e5, 12e4),
+          responseFormat: TELEGRAM_CHAT_REPLY_RESPONSE_FORMAT
         });
         let accumulated = "";
         let streamError;
@@ -626225,7 +626697,8 @@ ${conversationStream}`
                 } else {
                   this.bumpTelegramInferenceTokens(inferenceId, 1, 0);
                   accumulated += piece;
-                  await onToken(accumulated);
+                  const partial = extractPartialTelegramReplyJson(accumulated);
+                  if (partial !== null) await onToken(partial);
                 }
               }
             } catch (err) {
@@ -626247,11 +626720,14 @@ ${conversationStream}`
             }
             this.updateTelegramInferenceFinal(inferenceId, result);
             accumulated = result.choices[0]?.message?.content ?? "";
-            if (accumulated) await onToken(accumulated);
+            const fullExtracted = extractPartialTelegramReplyJson(accumulated);
+            if (fullExtracted) await onToken(fullExtracted);
           }
         } finally {
           this.deregisterTelegramInference(inferenceId);
         }
+        const extracted = extractFinalTelegramReplyJson(accumulated);
+        if (extracted) return extracted;
         return stripTelegramHiddenThinking(accumulated).trim();
       }
       retainTelegramVisibleReplyDraft(subAgent, draft, streamToolNames = subAgent.currentStreamToolNames) {