npm - omnius - Versions diffs - 1.0.120 → 1.0.121 - Mend

omnius 1.0.120 → 1.0.121

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/index.js CHANGED Viewed

@@ -553442,14 +553442,29 @@ ${description}`
           poolSlot.release(success);
           poolSlot = null;
         };
+        const streamTimeoutMs = Number.isFinite(request.timeoutMs) && request.timeoutMs > 0 ? Math.max(request.timeoutMs, 1e4) : 3e5;
+        const streamAbort = new AbortController();
+        const streamTimeoutHandle = setTimeout(() => {
+          streamAbort.abort(new Error(`stream timeout: no response or chunk within ${(streamTimeoutMs / 1e3).toFixed(0)}s`));
+        }, streamTimeoutMs);
+        if (typeof streamTimeoutHandle.unref === "function") {
+          streamTimeoutHandle.unref();
+        }
+        const externalAbortListener = this._abortSignal ? () => streamAbort.abort(this._abortSignal?.reason ?? new Error("external abort")) : null;
+        if (this._abortSignal && externalAbortListener) {
+          if (this._abortSignal.aborted) {
+            externalAbortListener();
+          } else {
+            this._abortSignal.addEventListener("abort", externalAbortListener, { once: true });
+          }
+        }
         try {
           const streamFetchOpts = {
             method: "POST",
             headers: this.authHeaders(),
-            body: JSON.stringify(body)
+            body: JSON.stringify(body),
+            signal: streamAbort.signal
           };
-          if (this._abortSignal)
-            streamFetchOpts.signal = this._abortSignal;
           let resp = await fetch(`${requestBaseUrl}/v1/chat/completions`, streamFetchOpts);
           if (!resp.ok) {
             const text = await resp.text().catch(() => "");
@@ -553548,6 +553563,13 @@ ${description}`
           this._finalizeStreamGuard(effectiveThink, accumulatedContent, accumulatedThinking, sawReasoningTokens);
           poolSuccess = true;
         } finally {
+          clearTimeout(streamTimeoutHandle);
+          if (this._abortSignal && externalAbortListener) {
+            try {
+              this._abortSignal.removeEventListener("abort", externalAbortListener);
+            } catch {
+            }
+          }
           releasePoolSlot(poolSuccess);
         }
       }
@@ -619882,24 +619904,57 @@ ${lines.join("\n")}`);
             `inference ${inferenceId} [${entry.kind}] ${elapsed}s content=${entry.contentTokens}t thinking=${entry.thinkingTokens}t (${thinkRatio}% think) live=${JSON.stringify(preview)}`
           ));
         };
-        for await (const chunk of streamFn(request)) {
-          if (chunk.type === "content" && chunk.content) {
-            if (chunk.thinking) {
-              thinkingBuf += chunk.content;
-              this.bumpTelegramInferenceTokens(inferenceId, 0, 1);
-            } else {
-              contentBuf += chunk.content;
-              this.bumpTelegramInferenceTokens(inferenceId, 1, 0);
-            }
-            flushPreview(false);
-          } else if (chunk.type === "finish") {
-            finishReason = chunk.finishReason;
-          } else if (chunk.type === "usage") {
-            usage = {
-              prompt_tokens: chunk.promptTokens,
-              completion_tokens: chunk.completionTokens,
-              total_tokens: chunk.totalTokens
-            };
+        const inactivityMs = this.telegramStreamInactivityMs();
+        const iter = streamFn(request)[Symbol.asyncIterator]();
+        try {
+          while (true) {
+            let timeoutHandle = null;
+            const inactivityPromise = new Promise((_, reject) => {
+              timeoutHandle = setTimeout(
+                () => reject(new Error(
+                  `stream-inactivity: no chunks for ${(inactivityMs / 1e3).toFixed(0)}s (content=${contentBuf.length}c thinking=${thinkingBuf.length}c so far) — Ollama likely cold-loading the model or wedged; falling back to non-stream`
+                )),
+                inactivityMs
+              );
+              if (typeof timeoutHandle.unref === "function") {
+                timeoutHandle.unref();
+              }
+            });
+            let next;
+            try {
+              next = await Promise.race([iter.next(), inactivityPromise]);
+            } finally {
+              if (timeoutHandle) clearTimeout(timeoutHandle);
+            }
+            if (next.done) break;
+            const chunk = next.value;
+            if (chunk.type === "content" && chunk.content) {
+              const entry = this.telegramActiveInferences.get(inferenceId);
+              if (entry && entry.firstChunkAt === void 0) {
+                entry.firstChunkAt = performance.now();
+              }
+              if (chunk.thinking) {
+                thinkingBuf += chunk.content;
+                this.bumpTelegramInferenceTokens(inferenceId, 0, 1);
+              } else {
+                contentBuf += chunk.content;
+                this.bumpTelegramInferenceTokens(inferenceId, 1, 0);
+              }
+              flushPreview(false);
+            } else if (chunk.type === "finish") {
+              finishReason = chunk.finishReason;
+            } else if (chunk.type === "usage") {
+              usage = {
+                prompt_tokens: chunk.promptTokens,
+                completion_tokens: chunk.completionTokens,
+                total_tokens: chunk.totalTokens
+              };
+            }
+          }
+        } finally {
+          try {
+            await iter.return?.(void 0);
+          } catch {
           }
         }
         flushPreview(true);
@@ -619970,9 +620025,10 @@ ${lines.join("\n")}`);
           const dur = ((performance.now() - entry.startTs) / 1e3).toFixed(1);
           const totalTokens = entry.contentTokens + entry.thinkingTokens;
           const ratio = totalTokens > 0 ? Math.round(entry.thinkingTokens * 100 / totalTokens) : 0;
+          const ttfb = entry.firstChunkAt !== void 0 ? `${((entry.firstChunkAt - entry.startTs) / 1e3).toFixed(1)}s` : "never";
           this.tuiWrite(() => renderTelegramSubAgentEvent(
             entry.sessionKey,
-            `inference ${id} [${entry.kind}] done in ${dur}s — ${entry.contentTokens}t content / ${entry.thinkingTokens}t thinking (${ratio}% think)`
+            `inference ${id} [${entry.kind}] done in ${dur}s (ttfb=${ttfb}) — ${entry.contentTokens}t content / ${entry.thinkingTokens}t thinking (${ratio}% think)`
           ));
         }
       }
@@ -619988,7 +620044,10 @@ ${lines.join("\n")}`);
         return Array.from(this.telegramActiveInferences.values()).map((e2) => ({
           ...e2,
           elapsedSec: (now - e2.startTs) / 1e3,
-          idleSec: (now - e2.lastTokenAt) / 1e3
+          idleSec: (now - e2.lastTokenAt) / 1e3,
+          // Undefined until the first non-empty content chunk arrives (still cold-loading or wedged).
+          // A dashboard renderer should display "—" or "waiting" in that case.
+          ttfbSec: e2.firstChunkAt !== void 0 ? (e2.firstChunkAt - e2.startTs) / 1e3 : void 0
         }));
       }
       /**
@@ -620260,6 +620319,25 @@ ${retryText}`,
       telegramSubAgentWatchdogIntervalMs() {
         return 3e4;
       }
+      /**
+       * Per-chunk inactivity window for the bridge's stream consumer. If no
+       * chunk arrives within this window, the streaming consumer in
+       * streamTelegramInferenceToCompletion aborts via Promise.race + clears
+       * the iterator, and telegramObservableInference falls back to the
+       * non-streaming chatCompletion path. This gives operators a clean
+       * "stream silent for 60s, falling back" signal instead of the opaque
+       * 180s coalescer hard-deadline.
+       *
+       * Default 60s — comfortably longer than a healthy cold-load of a 35B
+       * model on a warm VRAM cache (typically <30s) but short enough to
+       * surface a real wedge before the 180s coalescer fires. Override via
+       * OMNIUS_TG_STREAM_INACTIVITY_MS (ignored unless within [10s, 5min]).
+       */
+      telegramStreamInactivityMs() {
+        const raw = Number.parseInt(process.env["OMNIUS_TG_STREAM_INACTIVITY_MS"] ?? "", 10);
+        if (Number.isFinite(raw) && raw >= 1e4 && raw <= 3e5) return raw;
+        return 6e4;
+      }
       /**
        * Start the periodic stale-sub-agent reaper. Idempotent — safe to call
        * multiple times (no-op if already running). Stopped by stop() and on

package/npm-shrinkwrap.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "name": "omnius",
-  "version": "1.0.120",
+  "version": "1.0.121",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "omnius",
-      "version": "1.0.120",
+      "version": "1.0.121",
       "bundleDependencies": [
         "image-to-ascii"
       ],

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "omnius",
-  "version": "1.0.120",
+  "version": "1.0.121",
   "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
   "type": "module",
   "main": "./dist/index.js",