npm - omnius - Versions diffs - 1.0.115 → 1.0.116 - Mend

omnius 1.0.115 → 1.0.116

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.js CHANGED Viewed

@@ -616349,6 +616349,34 @@ External acquisition contract:
       telegramRouterSessionState = /* @__PURE__ */ new Map();
       /** Telegram interaction routing profile */
       interactionMode = "auto";
+      /**
+       * Toggle for surfacing qwen3 `<think>` content streamed by Telegram-side
+       * inferences (router, chat fast-path, follow-up). Mirrors the main TUI's
+       * Ctrl+O thinking-visibility toggle but applies to the bridge's stream
+       * surface (which has its own write path through tuiWrite + view
+       * callbacks). Default off; flip via env `OMNIUS_TG_SHOW_THINKING=1` or
+       * setTelegramThinkingVisible(). Independent of the model-side
+       * `think:false` directive — that controls whether the model emits
+       * thinking content at all; this controls whether the operator sees it
+       * when it IS emitted.
+       */
+      telegramThinkingVisible = process.env["OMNIUS_TG_SHOW_THINKING"] === "1";
+      /**
+       * Live telemetry of every in-flight Ollama call originating from the
+       * bridge. Lets the operator see WHY multiple GPUs are spun up at once
+       * and HOW each call is progressing — which is the only way to debug a
+       * 180s hard-deadline firing event without grepping logs.
+       *
+       * Each entry tracks:
+       *   - kind: router | chat-fast-path | followup | sub-agent
+       *   - sessionKey: which chat
+       *   - startTs: wall-clock start
+       *   - contentTokens / thinkingTokens: cumulative per-chunk counts from the
+       *     stream (a length/4 estimate when the call was not streamed)
+       *   - lastTokenAt: timestamp of the most-recent chunk (staleness signal)
+       *   - model: the model being called (helps differentiate concurrent calls)
+       */
+      telegramActiveInferences = /* @__PURE__ */ new Map();
+      /** Counter incremented by registerTelegramInference() to mint `inf-<n>` registry ids. */
+      telegramInferenceCounter = 0;
       /** Actual model context window discovered by the main TUI. */
       contextWindowSize = 0;
       _metricsProvider = null;
@@ -617982,16 +618010,21 @@ ${mediaContext}` : ""
             this.agentConfig.model,
             this.agentConfig.apiKey
           );
-          const result = await backend.chatCompletion(telegramThinkSuppressedRequest({
-            messages: [
-              { role: "system", content: "You are a Telegram public-follow-up discretion model. Output strict JSON only." },
-              { role: "user", content: prompt }
-            ],
-            tools: [],
-            temperature: 0.2,
-            maxTokens: 300,
-            timeoutMs: Math.min(Math.max(this.agentConfig.timeoutMs ?? 3e4, 5e3), 2e4)
-          }));
+          const result = await this.telegramObservableInference(
+            backend,
+            telegramThinkSuppressedRequest({
+              messages: [
+                { role: "system", content: "You are a Telegram public-follow-up discretion model. Output strict JSON only." },
+                { role: "user", content: prompt }
+              ],
+              tools: [],
+              temperature: 0.2,
+              maxTokens: 300,
+              timeoutMs: Math.min(Math.max(this.agentConfig.timeoutMs ?? 3e4, 5e3), 2e4)
+            }),
+            "followup",
+            sessionKey
+          );
           const decision2 = parseTelegramReflectionFollowupDecision(result.choices[0]?.message?.content ?? "");
           state.lastFollowupArtifactAt = artifact.generatedAt;
           if (!decision2) {
@@ -619612,15 +619645,17 @@ ${lines.join("\n")}`);
           nextAnalysisAfterMessages: decision2.nextCheckAfterMessages
         });
       }
-      async telegramRouterJsonCompletion(backend, request, diagnostics) {
+      async telegramRouterJsonCompletion(backend, request, diagnostics, inferenceKind = "router", sessionKey = "__router__") {
         let jsonModeResult;
         let jsonModeError;
         const suppressed = telegramThinkSuppressedRequest(request);
         try {
-          jsonModeResult = await backend.chatCompletion({
-            ...suppressed,
-            responseFormat: TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT
-          });
+          jsonModeResult = await this.telegramObservableInference(
+            backend,
+            { ...suppressed, responseFormat: TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT },
+            inferenceKind,
+            sessionKey
+          );
           const visible = jsonModeResult.choices.some(
             (choice) => stripTelegramHiddenThinking(choice.message.content ?? "").trim().length > 0
           );
@@ -619637,7 +619672,12 @@ ${lines.join("\n")}`);
           }
         }
         try {
-          const plainResult = await backend.chatCompletion(suppressed);
+          const plainResult = await this.telegramObservableInference(
+            backend,
+            suppressed,
+            inferenceKind,
+            sessionKey
+          );
           if (diagnostics) {
             const plainVisible = plainResult.choices.some(
               (choice) => stripTelegramHiddenThinking(choice.message.content ?? "").trim().length > 0
@@ -619654,6 +619694,205 @@ ${lines.join("\n")}`);
           throw err;
         }
       }
+      // ─────────────────────────────────────────────────────────────────
+      // Observable inference — streams chatCompletion-shaped calls so the
+      // operator can SEE what's happening during a long-running router or
+      // chat-fast-path call instead of waiting 180s for a hard-deadline.
+      // ─────────────────────────────────────────────────────────────────
+      /**
+       * Wrap a chatCompletion-shaped call so the bridge can observe its token
+       * stream and surface telemetry. Falls back to non-streaming if the
+       * backend doesn't expose chatCompletionStream (older test stubs) or if
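+       * streaming throws. A streamed call is rebuilt into a chatCompletion-style
+       * object (`choices[0].message.content` plus optional `usage`) so callers
+       * can read it the same way whether or not streaming was used; fields
+       * outside that subset are not reconstructed from the stream.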
+       *
+       * What this gives us:
+       *   1. Per-call entry in the active-inferences registry (visible to the
+       *      operator — answers "why are 2 GPUs spun up at once?")
+       *   2. Live emission of thinking + content tokens to the TUI when
+       *      telegramThinkingVisible is true (mirror of Ctrl+O for the bridge)
+       *   3. Wall-clock observability — if the call hangs at 60s with zero
+       *      content tokens emitted, the registry shows it, and the
+       *      hard-deadline retire path becomes diagnosable instead of opaque
+       */
+      async telegramObservableInference(backend, request, kind, sessionKey) {
+        const streamFn = backend.chatCompletionStream;
+        const id = this.registerTelegramInference(kind, sessionKey, this.agentConfig?.model ?? "?");
+        try {
+          if (typeof streamFn !== "function") {
+            const r2 = await backend.chatCompletion(request);
+            this.updateTelegramInferenceFinal(id, r2);
+            return r2;
+          }
+          try {
+            const result = await this.streamTelegramInferenceToCompletion(
+              streamFn.bind(backend),
+              request,
+              id
+            );
+            return result;
+          } catch (streamErr) {
+            const r2 = await backend.chatCompletion(request);
+            this.updateTelegramInferenceFinal(id, r2);
+            this.tuiWrite(() => renderTelegramSubAgentEvent(
+              sessionKey,
+              `inference ${id}: stream errored (${streamErr instanceof Error ? streamErr.message : String(streamErr)}); fell back to non-stream`
+            ));
+            return r2;
+          }
+        } finally {
+          this.deregisterTelegramInference(id);
+        }
+      }
+      /**
+       * Drive a chatCompletionStream to exhaustion, accumulating tokens into a
+       * chatCompletion-shaped result. Live-emits content + thinking tokens
+       * through the TUI when telegramThinkingVisible is true, throttled to
+       * avoid spamming the waterfall on fast streams.
+       */
+      async streamTelegramInferenceToCompletion(streamFn, request, inferenceId) {
+        let contentBuf = "";
+        let thinkingBuf = "";
+        let finishReason;
+        let usage;
+        let lastEmitMs = 0;
+        const EMIT_THROTTLE_MS = 500;
+        const flushPreview = (force) => {
+          if (!this.telegramThinkingVisible) return;
+          const now = Date.now();
+          if (!force && now - lastEmitMs < EMIT_THROTTLE_MS) return;
+          lastEmitMs = now;
+          const entry = this.telegramActiveInferences.get(inferenceId);
+          if (!entry) return;
+          const elapsed = ((performance.now() - entry.startTs) / 1e3).toFixed(1);
+          const thinkRatio = entry.contentTokens + entry.thinkingTokens > 0 ? Math.round(entry.thinkingTokens * 100 / (entry.contentTokens + entry.thinkingTokens)) : 0;
+          const preview = (thinkingBuf || contentBuf).slice(-180).replace(/\s+/g, " ");
+          this.tuiWrite(() => renderTelegramSubAgentEvent(
+            entry.sessionKey,
+            `inference ${inferenceId} [${entry.kind}] ${elapsed}s content=${entry.contentTokens}t thinking=${entry.thinkingTokens}t (${thinkRatio}% think) live=${JSON.stringify(preview)}`
+          ));
+        };
+        for await (const chunk of streamFn(request)) {
+          if (chunk.type === "content" && chunk.content) {
+            if (chunk.thinking) {
+              thinkingBuf += chunk.content;
+              this.bumpTelegramInferenceTokens(inferenceId, 0, 1);
+            } else {
+              contentBuf += chunk.content;
+              this.bumpTelegramInferenceTokens(inferenceId, 1, 0);
+            }
+            flushPreview(false);
+          } else if (chunk.type === "finish") {
+            finishReason = chunk.finishReason;
+          } else if (chunk.type === "usage") {
+            usage = {
+              prompt_tokens: chunk.promptTokens,
+              completion_tokens: chunk.completionTokens,
+              total_tokens: chunk.totalTokens
+            };
+          }
+        }
+        flushPreview(true);
+        void finishReason;
+        return {
+          choices: [
+            {
+              message: {
+                content: thinkingBuf ? `<think>${thinkingBuf}</think>${contentBuf}` : contentBuf
+              }
+            }
+          ],
+          usage: usage ? {
+            totalTokens: usage.total_tokens ?? 0,
+            promptTokens: usage.prompt_tokens,
+            completionTokens: usage.completion_tokens
+          } : void 0
+        };
+      }
+      // ─────────────────────────────────────────────────────────────────
+      // Inference telemetry registry
+      // ─────────────────────────────────────────────────────────────────
+      registerTelegramInference(kind, sessionKey, model) {
+        const id = `inf-${++this.telegramInferenceCounter}`;
+        const now = performance.now();
+        this.telegramActiveInferences.set(id, {
+          id,
+          kind,
+          sessionKey,
+          model,
+          startTs: now,
+          lastTokenAt: now,
+          contentTokens: 0,
+          thinkingTokens: 0,
+          streaming: true
+        });
+        return id;
+      }
+      bumpTelegramInferenceTokens(id, contentDelta, thinkingDelta) {
+        const entry = this.telegramActiveInferences.get(id);
+        if (!entry) return;
+        entry.contentTokens += contentDelta;
+        entry.thinkingTokens += thinkingDelta;
+        entry.lastTokenAt = performance.now();
+      }
+      /**
+       * Called when a non-streaming chatCompletion returns. Walks the completion
+       * to extract a rough token count from the visible content so the registry
+       * has SOME size signal even for non-streamed calls.
+       */
+      updateTelegramInferenceFinal(id, result) {
+        const entry = this.telegramActiveInferences.get(id);
+        if (!entry) return;
+        entry.streaming = false;
+        const text = result.choices[0]?.message?.content ?? "";
+        const thinkMatch = text.match(/<think>([\s\S]*?)<\/think>/);
+        const thinkingText = thinkMatch ? thinkMatch[1] : "";
+        const contentText = thinkMatch ? text.replace(thinkMatch[0], "") : text;
+        entry.thinkingTokens = Math.ceil(thinkingText.length / 4);
+        entry.contentTokens = Math.ceil(contentText.length / 4);
+        entry.lastTokenAt = performance.now();
+      }
+      deregisterTelegramInference(id) {
+        const entry = this.telegramActiveInferences.get(id);
+        if (!entry) return;
+        this.telegramActiveInferences.delete(id);
+        if (this.telegramThinkingVisible) {
+          const dur = ((performance.now() - entry.startTs) / 1e3).toFixed(1);
+          const totalTokens = entry.contentTokens + entry.thinkingTokens;
+          const ratio = totalTokens > 0 ? Math.round(entry.thinkingTokens * 100 / totalTokens) : 0;
+          this.tuiWrite(() => renderTelegramSubAgentEvent(
+            entry.sessionKey,
+            `inference ${id} [${entry.kind}] done in ${dur}s — ${entry.contentTokens}t content / ${entry.thinkingTokens}t thinking (${ratio}% think)`
+          ));
+        }
+      }
+      /**
+       * Snapshot of every in-flight Telegram-originated inference. The TUI
+       * dashboard / status line can call this to display "why are 2 GPUs spun
+       * up?" — each entry includes the kind, session, model, elapsed seconds,
+       * and token counts so the operator can correlate Ollama load to bridge
+       * activity.
+       */
+      getTelegramActiveInferences() {
+        const now = performance.now();
+        return Array.from(this.telegramActiveInferences.values()).map((e2) => ({
+          ...e2,
+          elapsedSec: (now - e2.startTs) / 1e3,
+          idleSec: (now - e2.lastTokenAt) / 1e3
+        }));
+      }
+      /**
+       * Toggle thinking visibility for the Telegram bridge. Mirrors the main
+       * TUI's Ctrl+O semantics but applies to bridge-side streams. Returns the
+       * new state so a binding can echo it back to the operator.
+       */
+      setTelegramThinkingVisible(visible) {
+        this.telegramThinkingVisible = visible;
+        return this.telegramThinkingVisible;
+      }
+      getTelegramThinkingVisible() {
+        return this.telegramThinkingVisible;
+      }
       async repairTelegramInteractionDecision(backend, rawOutput, forcedRoute, timeoutMs, diagnostics) {
         const rawPreview = telegramRouterRawPreview(rawOutput, 4e3);
         if (!rawPreview || telegramDecisionOutputHasDanglingJson(rawOutput)) {
@@ -620666,6 +620905,7 @@ ${TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT}`);
         }
         this.stopTelegramSubAgentWatchdog();
         this.cancelTelegramRouterSessionState("bridge stop");
+        this.telegramActiveInferences.clear();
         if (this.telegramSqliteDb && this.telegramSqliteDb !== false) {
           try {
             this.telegramSqliteDb.close();
@@ -621482,35 +621722,55 @@ ${conversationStream}`
         });
         let accumulated = "";
         let streamError;
+        const sessionKey = this.sessionKeyForMessage(msg);
+        const inferenceId = this.registerTelegramInference("chat-fast-path", sessionKey, config.model);
         const streamable = backend;
         const stream = typeof streamable.chatCompletionStream === "function" ? streamable.chatCompletionStream(request) : null;
-        if (stream && typeof stream[Symbol.asyncIterator] === "function") {
-          try {
-            for await (const chunk of stream) {
-              if (chunk.type === "content" && !chunk.thinking && chunk.content) {
-                accumulated += chunk.content;
-                await onToken(accumulated);
+        try {
+          if (stream && typeof stream[Symbol.asyncIterator] === "function") {
+            try {
+              for await (const chunk of stream) {
+                if (chunk.type !== "content") continue;
+                const piece = chunk.content;
+                if (!piece) continue;
+                if (chunk.thinking) {
+                  this.bumpTelegramInferenceTokens(inferenceId, 0, 1);
+                  if (this.telegramThinkingVisible) {
+                    const preview = piece.slice(0, 120);
+                    this.tuiWrite(() => renderTelegramSubAgentEvent(
+                      msg.username,
+                      `chat-fast-path thinking: ${JSON.stringify(preview)}`
+                    ));
+                  }
+                } else {
+                  this.bumpTelegramInferenceTokens(inferenceId, 1, 0);
+                  accumulated += piece;
+                  await onToken(accumulated);
+                }
               }
+            } catch (err) {
+              streamError = err;
+              accumulated = "";
             }
-          } catch (err) {
-            streamError = err;
-            accumulated = "";
           }
-        }
-        if (!accumulated.trim()) {
-          let result;
-          try {
-            result = await backend.chatCompletion(request);
-          } catch (err) {
-            if (streamError) {
-              const streamMsg = streamError instanceof Error ? streamError.message : String(streamError);
-              const retryMsg = err instanceof Error ? err.message : String(err);
-              throw new Error(`streaming failed (${streamMsg}); non-stream retry failed (${retryMsg})`);
+          if (!accumulated.trim()) {
+            let result;
+            try {
+              result = await backend.chatCompletion(request);
+            } catch (err) {
+              if (streamError) {
+                const streamMsg = streamError instanceof Error ? streamError.message : String(streamError);
+                const retryMsg = err instanceof Error ? err.message : String(err);
+                throw new Error(`streaming failed (${streamMsg}); non-stream retry failed (${retryMsg})`);
+              }
+              throw err;
             }
-            throw err;
+            this.updateTelegramInferenceFinal(inferenceId, result);
+            accumulated = result.choices[0]?.message?.content ?? "";
+            if (accumulated) await onToken(accumulated);
           }
-          accumulated = result.choices[0]?.message?.content ?? "";
-          if (accumulated) await onToken(accumulated);
+        } finally {
+          this.deregisterTelegramInference(inferenceId);
         }
         return stripTelegramHiddenThinking(accumulated).trim();
       }
@@ -621665,6 +621925,13 @@ ${conversationStream}`
           if (event.type === "stream_token" && event.streamKind === "content" && event.content) {
             subAgent.accumulated += event.content;
           }
+          if (event.type === "stream_token" && event.streamKind === "thinking" && event.content && this.telegramThinkingVisible) {
+            const trimmed = event.content.replace(/\s+/g, " ").slice(0, 200);
+            this.subAgentViewCallbacks?.onWrite(
+              subAgent.viewId,
+              `thinking: ${trimmed}`
+            );
+          }
           const intermediateLine = formatTelegramProgressEvent(event);
           if (intermediateLine && (isAdminDM || event.type !== "status")) {
             subAgent.intermediateLines.push(intermediateLine);

package/npm-shrinkwrap.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "omnius",
-  "version": "1.0.115",
+  "version": "1.0.116",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "omnius",
-      "version": "1.0.115",
+      "version": "1.0.116",
       "bundleDependencies": [
         "image-to-ascii"
       ],

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "omnius",
-  "version": "1.0.115",
+  "version": "1.0.116",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",