omnius 1.0.115 → 1.0.116

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -616349,6 +616349,34 @@ External acquisition contract:
616349
616349
  telegramRouterSessionState = /* @__PURE__ */ new Map();
616350
616350
  /** Telegram interaction routing profile */
616351
616351
  interactionMode = "auto";
616352
+ /**
616353
+ * Toggle for surfacing qwen3 `<think>` content streamed by Telegram-side
616354
+ * inferences (router, chat fast-path, follow-up). Mirrors the main TUI's
616355
+ * Ctrl+O thinking-visibility toggle but applies to the bridge's stream
616356
+ * surface (which has its own write path through tuiWrite + view
616357
+ * callbacks). Default off; flip via env `OMNIUS_TG_SHOW_THINKING=1` or
616358
+ * setTelegramThinkingVisible(). Independent of the model-side
616359
+ * `think:false` directive — that controls whether the model emits
616360
+ * thinking content at all; this controls whether the operator sees it
616361
+ * when it IS emitted.
616362
+ */
616363
+ telegramThinkingVisible = process.env["OMNIUS_TG_SHOW_THINKING"] === "1";
616364
+ /**
616365
+ * Live telemetry of every in-flight Ollama call originating from the
616366
+ * bridge. Lets the operator see WHY multiple GPUs are spun up at once
616367
+ * and HOW each call is progressing — which is the only way to debug a
616368
+ * 180s hard-deadline firing event without grepping logs.
616369
+ *
616370
+ * Each entry tracks:
616371
+ * - kind: router | chat-fast-path | followup | sub-agent
616372
+ * - sessionKey: which chat
616373
+ * - startTs: performance.now() reading taken at registration (monotonic, not wall-clock)
616374
+ * - contentTokens / thinkingTokens: cumulative counts from the stream (one per chunk received; estimated at ~4 characters per token for non-streamed calls)
616375
+ * - lastTokenAt: performance.now() reading at the most-recent chunk (staleness signal)
616376
+ * - model: the model being called (helps differentiate concurrent calls)
616377
+ */
616378
+ telegramActiveInferences = /* @__PURE__ */ new Map();
616379
+ telegramInferenceCounter = 0;
616352
616380
  /** Actual model context window discovered by the main TUI. */
616353
616381
  contextWindowSize = 0;
616354
616382
  _metricsProvider = null;
@@ -617982,16 +618010,21 @@ ${mediaContext}` : ""
617982
618010
  this.agentConfig.model,
617983
618011
  this.agentConfig.apiKey
617984
618012
  );
617985
- const result = await backend.chatCompletion(telegramThinkSuppressedRequest({
617986
- messages: [
617987
- { role: "system", content: "You are a Telegram public-follow-up discretion model. Output strict JSON only." },
617988
- { role: "user", content: prompt }
617989
- ],
617990
- tools: [],
617991
- temperature: 0.2,
617992
- maxTokens: 300,
617993
- timeoutMs: Math.min(Math.max(this.agentConfig.timeoutMs ?? 3e4, 5e3), 2e4)
617994
- }));
618013
+ const result = await this.telegramObservableInference(
618014
+ backend,
618015
+ telegramThinkSuppressedRequest({
618016
+ messages: [
618017
+ { role: "system", content: "You are a Telegram public-follow-up discretion model. Output strict JSON only." },
618018
+ { role: "user", content: prompt }
618019
+ ],
618020
+ tools: [],
618021
+ temperature: 0.2,
618022
+ maxTokens: 300,
618023
+ timeoutMs: Math.min(Math.max(this.agentConfig.timeoutMs ?? 3e4, 5e3), 2e4)
618024
+ }),
618025
+ "followup",
618026
+ sessionKey
618027
+ );
617995
618028
  const decision2 = parseTelegramReflectionFollowupDecision(result.choices[0]?.message?.content ?? "");
617996
618029
  state.lastFollowupArtifactAt = artifact.generatedAt;
617997
618030
  if (!decision2) {
@@ -619612,15 +619645,17 @@ ${lines.join("\n")}`);
619612
619645
  nextAnalysisAfterMessages: decision2.nextCheckAfterMessages
619613
619646
  });
619614
619647
  }
619615
- async telegramRouterJsonCompletion(backend, request, diagnostics) {
619648
+ async telegramRouterJsonCompletion(backend, request, diagnostics, inferenceKind = "router", sessionKey = "__router__") {
619616
619649
  let jsonModeResult;
619617
619650
  let jsonModeError;
619618
619651
  const suppressed = telegramThinkSuppressedRequest(request);
619619
619652
  try {
619620
- jsonModeResult = await backend.chatCompletion({
619621
- ...suppressed,
619622
- responseFormat: TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT
619623
- });
619653
+ jsonModeResult = await this.telegramObservableInference(
619654
+ backend,
619655
+ { ...suppressed, responseFormat: TELEGRAM_INTERACTION_DECISION_RESPONSE_FORMAT },
619656
+ inferenceKind,
619657
+ sessionKey
619658
+ );
619624
619659
  const visible = jsonModeResult.choices.some(
619625
619660
  (choice) => stripTelegramHiddenThinking(choice.message.content ?? "").trim().length > 0
619626
619661
  );
@@ -619637,7 +619672,12 @@ ${lines.join("\n")}`);
619637
619672
  }
619638
619673
  }
619639
619674
  try {
619640
- const plainResult = await backend.chatCompletion(suppressed);
619675
+ const plainResult = await this.telegramObservableInference(
619676
+ backend,
619677
+ suppressed,
619678
+ inferenceKind,
619679
+ sessionKey
619680
+ );
619641
619681
  if (diagnostics) {
619642
619682
  const plainVisible = plainResult.choices.some(
619643
619683
  (choice) => stripTelegramHiddenThinking(choice.message.content ?? "").trim().length > 0
@@ -619654,6 +619694,205 @@ ${lines.join("\n")}`);
619654
619694
  throw err;
619655
619695
  }
619656
619696
  }
619697
+ // ─────────────────────────────────────────────────────────────────
619698
+ // Observable inference — streams chatCompletion-shaped calls so the
619699
+ // operator can SEE what's happening during a long-running router or
619700
+ // chat-fast-path call instead of waiting 180s for a hard-deadline.
619701
+ // ─────────────────────────────────────────────────────────────────
619702
+ /**
619703
+ * Wrap a chatCompletion-shaped call so the bridge can observe its token
619704
+ * stream and surface telemetry. Falls back to non-streaming if the
619705
+ * backend doesn't expose chatCompletionStream (older test stubs) or if
619706
+ * streaming throws. The streamed path returns a chatCompletion-style object
619707
+ * (choices[0].message.content plus usage; the finish reason is not carried over), so callers can read it without knowing whether streaming was used.
619708
+ *
619709
+ * What this gives us:
619710
+ * 1. Per-call entry in the active-inferences registry (visible to the
619711
+ * operator — answers "why are 2 GPUs spun up at once?")
619712
+ * 2. Live emission of thinking + content tokens to the TUI when
619713
+ * telegramThinkingVisible is true (mirror of Ctrl+O for the bridge)
619714
+ * 3. Wall-clock observability — if the call hangs at 60s with zero
619715
+ * content tokens emitted, the registry shows it, and the
619716
+ * hard-deadline retire path becomes diagnosable instead of opaque
619717
+ */
619718
+ async telegramObservableInference(backend, request, kind, sessionKey) {
619719
+ const streamFn = backend.chatCompletionStream;
619720
+ const id = this.registerTelegramInference(kind, sessionKey, this.agentConfig?.model ?? "?");
619721
+ try {
619722
+ if (typeof streamFn !== "function") {
619723
+ const r2 = await backend.chatCompletion(request);
619724
+ this.updateTelegramInferenceFinal(id, r2);
619725
+ return r2;
619726
+ }
619727
+ try {
619728
+ const result = await this.streamTelegramInferenceToCompletion(
619729
+ streamFn.bind(backend),
619730
+ request,
619731
+ id
619732
+ );
619733
+ return result;
619734
+ } catch (streamErr) {
619735
+ const r2 = await backend.chatCompletion(request);
619736
+ this.updateTelegramInferenceFinal(id, r2);
619737
+ this.tuiWrite(() => renderTelegramSubAgentEvent(
619738
+ sessionKey,
619739
+ `inference ${id}: stream errored (${streamErr instanceof Error ? streamErr.message : String(streamErr)}); fell back to non-stream`
619740
+ ));
619741
+ return r2;
619742
+ }
619743
+ } finally {
619744
+ this.deregisterTelegramInference(id);
619745
+ }
619746
+ }
619747
+ /**
619748
+ * Drive a chatCompletionStream to exhaustion, accumulating tokens into a
619749
+ * chatCompletion-shaped result. Live-emits content + thinking tokens
619750
+ * through the TUI when telegramThinkingVisible is true, throttled to
619751
+ * avoid spamming the waterfall on fast streams.
619752
+ */
619753
+ async streamTelegramInferenceToCompletion(streamFn, request, inferenceId) {
619754
+ let contentBuf = "";
619755
+ let thinkingBuf = "";
619756
+ let finishReason;
619757
+ let usage;
619758
+ let lastEmitMs = 0;
619759
+ const EMIT_THROTTLE_MS = 500;
619760
+ const flushPreview = (force) => {
619761
+ if (!this.telegramThinkingVisible) return;
619762
+ const now = Date.now();
619763
+ if (!force && now - lastEmitMs < EMIT_THROTTLE_MS) return;
619764
+ lastEmitMs = now;
619765
+ const entry = this.telegramActiveInferences.get(inferenceId);
619766
+ if (!entry) return;
619767
+ const elapsed = ((performance.now() - entry.startTs) / 1e3).toFixed(1);
619768
+ const thinkRatio = entry.contentTokens + entry.thinkingTokens > 0 ? Math.round(entry.thinkingTokens * 100 / (entry.contentTokens + entry.thinkingTokens)) : 0;
619769
+ const preview = (thinkingBuf || contentBuf).slice(-180).replace(/\s+/g, " ");
619770
+ this.tuiWrite(() => renderTelegramSubAgentEvent(
619771
+ entry.sessionKey,
619772
+ `inference ${inferenceId} [${entry.kind}] ${elapsed}s content=${entry.contentTokens}t thinking=${entry.thinkingTokens}t (${thinkRatio}% think) live=${JSON.stringify(preview)}`
619773
+ ));
619774
+ };
619775
+ for await (const chunk of streamFn(request)) {
619776
+ if (chunk.type === "content" && chunk.content) {
619777
+ if (chunk.thinking) {
619778
+ thinkingBuf += chunk.content;
619779
+ this.bumpTelegramInferenceTokens(inferenceId, 0, 1);
619780
+ } else {
619781
+ contentBuf += chunk.content;
619782
+ this.bumpTelegramInferenceTokens(inferenceId, 1, 0);
619783
+ }
619784
+ flushPreview(false);
619785
+ } else if (chunk.type === "finish") {
619786
+ finishReason = chunk.finishReason;
619787
+ } else if (chunk.type === "usage") {
619788
+ usage = {
619789
+ prompt_tokens: chunk.promptTokens,
619790
+ completion_tokens: chunk.completionTokens,
619791
+ total_tokens: chunk.totalTokens
619792
+ };
619793
+ }
619794
+ }
619795
+ flushPreview(true);
619796
+ void finishReason;
619797
+ return {
619798
+ choices: [
619799
+ {
619800
+ message: {
619801
+ content: thinkingBuf ? `<think>${thinkingBuf}</think>${contentBuf}` : contentBuf
619802
+ }
619803
+ }
619804
+ ],
619805
+ usage: usage ? {
619806
+ totalTokens: usage.total_tokens ?? 0,
619807
+ promptTokens: usage.prompt_tokens,
619808
+ completionTokens: usage.completion_tokens
619809
+ } : void 0
619810
+ };
619811
+ }
619812
+ // ─────────────────────────────────────────────────────────────────
619813
+ // Inference telemetry registry
619814
+ // ─────────────────────────────────────────────────────────────────
619815
+ registerTelegramInference(kind, sessionKey, model) {
619816
+ const id = `inf-${++this.telegramInferenceCounter}`;
619817
+ const now = performance.now();
619818
+ this.telegramActiveInferences.set(id, {
619819
+ id,
619820
+ kind,
619821
+ sessionKey,
619822
+ model,
619823
+ startTs: now,
619824
+ lastTokenAt: now,
619825
+ contentTokens: 0,
619826
+ thinkingTokens: 0,
619827
+ streaming: true
619828
+ });
619829
+ return id;
619830
+ }
619831
+ bumpTelegramInferenceTokens(id, contentDelta, thinkingDelta) {
619832
+ const entry = this.telegramActiveInferences.get(id);
619833
+ if (!entry) return;
619834
+ entry.contentTokens += contentDelta;
619835
+ entry.thinkingTokens += thinkingDelta;
619836
+ entry.lastTokenAt = performance.now();
619837
+ }
619838
+ /**
619839
+ * Called when a non-streaming chatCompletion returns. Estimates token counts
619840
+ * (~4 characters per token) from the first choice's content, counting the first <think>…</think> block as thinking and the rest as content, so the registry
619841
+ * has SOME size signal even for non-streamed calls.
619842
+ */
619843
+ updateTelegramInferenceFinal(id, result) {
619844
+ const entry = this.telegramActiveInferences.get(id);
619845
+ if (!entry) return;
619846
+ entry.streaming = false;
619847
+ const text = result.choices[0]?.message?.content ?? "";
619848
+ const thinkMatch = text.match(/<think>([\s\S]*?)<\/think>/);
619849
+ const thinkingText = thinkMatch ? thinkMatch[1] : "";
619850
+ const contentText = thinkMatch ? text.replace(thinkMatch[0], "") : text;
619851
+ entry.thinkingTokens = Math.ceil(thinkingText.length / 4);
619852
+ entry.contentTokens = Math.ceil(contentText.length / 4);
619853
+ entry.lastTokenAt = performance.now();
619854
+ }
619855
+ deregisterTelegramInference(id) {
619856
+ const entry = this.telegramActiveInferences.get(id);
619857
+ if (!entry) return;
619858
+ this.telegramActiveInferences.delete(id);
619859
+ if (this.telegramThinkingVisible) {
619860
+ const dur = ((performance.now() - entry.startTs) / 1e3).toFixed(1);
619861
+ const totalTokens = entry.contentTokens + entry.thinkingTokens;
619862
+ const ratio = totalTokens > 0 ? Math.round(entry.thinkingTokens * 100 / totalTokens) : 0;
619863
+ this.tuiWrite(() => renderTelegramSubAgentEvent(
619864
+ entry.sessionKey,
619865
+ `inference ${id} [${entry.kind}] done in ${dur}s — ${entry.contentTokens}t content / ${entry.thinkingTokens}t thinking (${ratio}% think)`
619866
+ ));
619867
+ }
619868
+ }
619869
+ /**
619870
+ * Snapshot of every in-flight Telegram-originated inference. The TUI
619871
+ * dashboard / status line can call this to display "why are 2 GPUs spun
619872
+ * up?" — each entry includes the kind, session, model, elapsed seconds,
619873
+ * and token counts so the operator can correlate Ollama load to bridge
619874
+ * activity.
619875
+ */
619876
+ getTelegramActiveInferences() {
619877
+ const now = performance.now();
619878
+ return Array.from(this.telegramActiveInferences.values()).map((e2) => ({
619879
+ ...e2,
619880
+ elapsedSec: (now - e2.startTs) / 1e3,
619881
+ idleSec: (now - e2.lastTokenAt) / 1e3
619882
+ }));
619883
+ }
619884
+ /**
619885
+ * Toggle thinking visibility for the Telegram bridge. Mirrors the main
619886
+ * TUI's Ctrl+O semantics but applies to bridge-side streams. Returns the
619887
+ * new state so a binding can echo it back to the operator.
619888
+ */
619889
+ setTelegramThinkingVisible(visible) {
619890
+ this.telegramThinkingVisible = visible;
619891
+ return this.telegramThinkingVisible;
619892
+ }
619893
+ getTelegramThinkingVisible() {
619894
+ return this.telegramThinkingVisible;
619895
+ }
619657
619896
  async repairTelegramInteractionDecision(backend, rawOutput, forcedRoute, timeoutMs, diagnostics) {
619658
619897
  const rawPreview = telegramRouterRawPreview(rawOutput, 4e3);
619659
619898
  if (!rawPreview || telegramDecisionOutputHasDanglingJson(rawOutput)) {
@@ -620666,6 +620905,7 @@ ${TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT}`);
620666
620905
  }
620667
620906
  this.stopTelegramSubAgentWatchdog();
620668
620907
  this.cancelTelegramRouterSessionState("bridge stop");
620908
+ this.telegramActiveInferences.clear();
620669
620909
  if (this.telegramSqliteDb && this.telegramSqliteDb !== false) {
620670
620910
  try {
620671
620911
  this.telegramSqliteDb.close();
@@ -621482,35 +621722,55 @@ ${conversationStream}`
621482
621722
  });
621483
621723
  let accumulated = "";
621484
621724
  let streamError;
621725
+ const sessionKey = this.sessionKeyForMessage(msg);
621726
+ const inferenceId = this.registerTelegramInference("chat-fast-path", sessionKey, config.model);
621485
621727
  const streamable = backend;
621486
621728
  const stream = typeof streamable.chatCompletionStream === "function" ? streamable.chatCompletionStream(request) : null;
621487
- if (stream && typeof stream[Symbol.asyncIterator] === "function") {
621488
- try {
621489
- for await (const chunk of stream) {
621490
- if (chunk.type === "content" && !chunk.thinking && chunk.content) {
621491
- accumulated += chunk.content;
621492
- await onToken(accumulated);
621729
+ try {
621730
+ if (stream && typeof stream[Symbol.asyncIterator] === "function") {
621731
+ try {
621732
+ for await (const chunk of stream) {
621733
+ if (chunk.type !== "content") continue;
621734
+ const piece = chunk.content;
621735
+ if (!piece) continue;
621736
+ if (chunk.thinking) {
621737
+ this.bumpTelegramInferenceTokens(inferenceId, 0, 1);
621738
+ if (this.telegramThinkingVisible) {
621739
+ const preview = piece.slice(0, 120);
621740
+ this.tuiWrite(() => renderTelegramSubAgentEvent(
621741
+ msg.username,
621742
+ `chat-fast-path thinking: ${JSON.stringify(preview)}`
621743
+ ));
621744
+ }
621745
+ } else {
621746
+ this.bumpTelegramInferenceTokens(inferenceId, 1, 0);
621747
+ accumulated += piece;
621748
+ await onToken(accumulated);
621749
+ }
621493
621750
  }
621751
+ } catch (err) {
621752
+ streamError = err;
621753
+ accumulated = "";
621494
621754
  }
621495
- } catch (err) {
621496
- streamError = err;
621497
- accumulated = "";
621498
621755
  }
621499
- }
621500
- if (!accumulated.trim()) {
621501
- let result;
621502
- try {
621503
- result = await backend.chatCompletion(request);
621504
- } catch (err) {
621505
- if (streamError) {
621506
- const streamMsg = streamError instanceof Error ? streamError.message : String(streamError);
621507
- const retryMsg = err instanceof Error ? err.message : String(err);
621508
- throw new Error(`streaming failed (${streamMsg}); non-stream retry failed (${retryMsg})`);
621756
+ if (!accumulated.trim()) {
621757
+ let result;
621758
+ try {
621759
+ result = await backend.chatCompletion(request);
621760
+ } catch (err) {
621761
+ if (streamError) {
621762
+ const streamMsg = streamError instanceof Error ? streamError.message : String(streamError);
621763
+ const retryMsg = err instanceof Error ? err.message : String(err);
621764
+ throw new Error(`streaming failed (${streamMsg}); non-stream retry failed (${retryMsg})`);
621765
+ }
621766
+ throw err;
621509
621767
  }
621510
- throw err;
621768
+ this.updateTelegramInferenceFinal(inferenceId, result);
621769
+ accumulated = result.choices[0]?.message?.content ?? "";
621770
+ if (accumulated) await onToken(accumulated);
621511
621771
  }
621512
- accumulated = result.choices[0]?.message?.content ?? "";
621513
- if (accumulated) await onToken(accumulated);
621772
+ } finally {
621773
+ this.deregisterTelegramInference(inferenceId);
621514
621774
  }
621515
621775
  return stripTelegramHiddenThinking(accumulated).trim();
621516
621776
  }
@@ -621665,6 +621925,13 @@ ${conversationStream}`
621665
621925
  if (event.type === "stream_token" && event.streamKind === "content" && event.content) {
621666
621926
  subAgent.accumulated += event.content;
621667
621927
  }
621928
+ if (event.type === "stream_token" && event.streamKind === "thinking" && event.content && this.telegramThinkingVisible) {
621929
+ const trimmed = event.content.replace(/\s+/g, " ").slice(0, 200);
621930
+ this.subAgentViewCallbacks?.onWrite(
621931
+ subAgent.viewId,
621932
+ `thinking: ${trimmed}`
621933
+ );
621934
+ }
621668
621935
  const intermediateLine = formatTelegramProgressEvent(event);
621669
621936
  if (intermediateLine && (isAdminDM || event.type !== "status")) {
621670
621937
  subAgent.intermediateLines.push(intermediateLine);
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.115",
3
+ "version": "1.0.116",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.115",
9
+ "version": "1.0.116",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.115",
3
+ "version": "1.0.116",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",