omnius 1.0.357 → 1.0.358

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -567542,6 +567542,27 @@ var init_completion_resolution_verifier = __esm({
567542
567542
  });
567543
567543
 
567544
567544
  // packages/orchestrator/dist/evidenceBranch.js
567545
+ function buildStructuralPreview2(lines, path12, query) {
567546
+ const n2 = lines.length;
567547
+ const clip3 = (l2) => l2.length > 180 ? l2.slice(0, 180) + "…" : l2;
567548
+ const head = lines.slice(0, HEAD_LINES2).map((l2, i2) => `${i2 + 1}: ${clip3(l2)}`);
567549
+ const isStructural = (l2) => /^\s*(<[A-Za-z!]|#{1,6}\s|def |class |function |export |interface |type |async |public |private |\[[^\]]+\]|[A-Za-z_][\w.]*\s*=)/.test(l2) && l2.trim().length > 0 && l2.trim().length <= 180;
567550
+ const markers = [];
567551
+ for (let i2 = HEAD_LINES2; i2 < n2; i2++) {
567552
+ const l2 = lines[i2];
567553
+ if (isStructural(l2))
567554
+ markers.push(`${i2 + 1}: ${clip3(l2.trim())}`);
567555
+ }
567556
+ const MAX_MARKERS = 30;
567557
+ const sampled = markers.length > MAX_MARKERS ? Array.from({ length: MAX_MARKERS }, (_, k) => markers[Math.floor(k * markers.length / MAX_MARKERS)]) : markers;
567558
+ return [
567559
+ `[STRUCTURAL PREVIEW] "${query}" was not directly located in ${path12} (${n2} lines). Navigate by the structure below and re-read a SPECIFIC region with offset/limit — do NOT re-read the whole file:`,
567560
+ "Head:",
567561
+ ...head,
567562
+ sampled.length ? "Section markers (line: content):" : "(no clear section markers)",
567563
+ ...sampled
567564
+ ].filter(Boolean).join("\n").slice(0, 1600);
567565
+ }
567545
567566
  function queryTerms(query) {
567546
567567
  return [
567547
567568
  ...new Set(query.toLowerCase().replace(/[^a-z0-9_<>./-]+/g, " ").split(/\s+/).filter((w) => w.length > 2 && !STOPWORDS2.has(w)))
@@ -567692,17 +567713,20 @@ async function extractEvidence(opts) {
567692
567713
  const ends = kept.map((s2) => s2.end);
567693
567714
  const snippetLower = claim2.toLowerCase();
567694
567715
  const covered = terms2.filter((t2) => snippetLower.includes(t2)).length;
567695
- return {
567696
- path: path12,
567697
- query,
567698
- claim: claim2,
567699
- sourceStart: starts.length ? Math.min(...starts) : null,
567700
- sourceEnd: ends.length ? Math.max(...ends) : null,
567701
- fileVersion,
567702
- confidence: Math.min(1, covered / Math.max(1, terms2.length)),
567703
- exploredLines: lines.length,
567704
- injectedChars: claim2.length
567705
- };
567716
+ const grepConfidence = Math.min(1, covered / Math.max(1, terms2.length));
567717
+ if (grepConfidence >= EXTRACT_CONFIDENCE_FLOOR) {
567718
+ return {
567719
+ path: path12,
567720
+ query,
567721
+ claim: claim2,
567722
+ sourceStart: starts.length ? Math.min(...starts) : null,
567723
+ sourceEnd: ends.length ? Math.max(...ends) : null,
567724
+ fileVersion,
567725
+ confidence: grepConfidence,
567726
+ exploredLines: lines.length,
567727
+ injectedChars: claim2.length
567728
+ };
567729
+ }
567706
567730
  }
567707
567731
  }
567708
567732
  const windows = lines.length <= WINDOW_LINES * 2 ? [{ start: 1, end: lines.length, text: content, score: 1 }] : selectWindows(lines, terms2);
@@ -567725,7 +567749,7 @@ async function extractEvidence(opts) {
567725
567749
  parsed = null;
567726
567750
  }
567727
567751
  }
567728
- const claim = parsed && parsed.found && parsed.claim ? parsed.claim : `No evidence for "${query}" found in the explored windows of ${path12}.`;
567752
+ const claim = parsed && parsed.found && parsed.claim ? parsed.claim : buildStructuralPreview2(lines, path12, query);
567729
567753
  return {
567730
567754
  path: path12,
567731
567755
  query,
@@ -567743,7 +567767,7 @@ function shouldBranchRead(contentLength, lineCount, hasExplicitSmallRange, thres
567743
567767
  return false;
567744
567768
  return contentLength > thresholdChars || lineCount > 200;
567745
567769
  }
567746
- var WINDOW_LINES, SNIPPET_CONTEXT, HEAD_LINES2, MAX_SNIPPET_LINES, STOPWORDS2;
567770
+ var WINDOW_LINES, SNIPPET_CONTEXT, HEAD_LINES2, MAX_SNIPPET_LINES, EXTRACT_CONFIDENCE_FLOOR, STOPWORDS2;
567747
567771
  var init_evidenceBranch = __esm({
567748
567772
  "packages/orchestrator/dist/evidenceBranch.js"() {
567749
567773
  "use strict";
@@ -567751,6 +567775,7 @@ var init_evidenceBranch = __esm({
567751
567775
  SNIPPET_CONTEXT = 4;
567752
567776
  HEAD_LINES2 = 10;
567753
567777
  MAX_SNIPPET_LINES = 220;
567778
+ EXTRACT_CONFIDENCE_FLOOR = 0.3;
567754
567779
  STOPWORDS2 = /* @__PURE__ */ new Set([
567755
567780
  "the",
567756
567781
  "and",
@@ -615029,35 +615054,105 @@ ${CONTENT_BG_SEQ}`);
615029
615054
  (seq) => seq.endsWith("m") ? seq : ""
615030
615055
  );
615031
615056
  }
615032
- reflowContentLines(livePartialLine, width) {
615057
+ /** Resolve a dynamic-block sentinel line to its registered renderer's lines
615058
+ * at the given width, or null if it is not a (live) sentinel. */
615059
+ dynamicBlockLines(line, maxWidth) {
615060
+ if (!line.startsWith(this.DYNAMIC_BLOCK_MARK_PREFIX) || !line.endsWith(this.DYNAMIC_BLOCK_MARK_SUFFIX)) {
615061
+ return null;
615062
+ }
615063
+ const id = line.slice(
615064
+ this.DYNAMIC_BLOCK_MARK_PREFIX.length,
615065
+ line.length - this.DYNAMIC_BLOCK_MARK_SUFFIX.length
615066
+ );
615067
+ const renderer = this._dynamicBlocks.get(id);
615068
+ if (!renderer) return [];
615069
+ try {
615070
+ return renderer(maxWidth);
615071
+ } catch {
615072
+ return [];
615073
+ }
615074
+ }
615075
+ /** Reflowed-row COUNT for one buffer line at width — dynamic-block aware.
615076
+ * Static lines are a cache-hit `.length` (no allocation), so the count pass
615077
+ * over the whole backlog is cheap. */
615078
+ rowCountForSourceLine(line, maxWidth) {
615079
+ const block = this.dynamicBlockLines(line, maxWidth);
615080
+ if (block !== null) {
615081
+ let n2 = 0;
615082
+ for (const seg of block) n2 += this.reflowContentLine(seg, maxWidth).length;
615083
+ return n2;
615084
+ }
615085
+ return this.reflowContentLine(line, maxWidth).length;
615086
+ }
615087
+ /** Reflowed rows (with bufferIdx) for one buffer line — built ONLY for the
615088
+ * lines actually inside the viewport window. */
615089
+ rowsForSourceLine(line, idx, maxWidth) {
615090
+ const block = this.dynamicBlockLines(line, maxWidth);
615091
+ if (block !== null) {
615092
+ return block.flatMap(
615093
+ (seg) => this.reflowContentLine(seg, maxWidth).map((s2) => ({
615094
+ line: s2,
615095
+ bufferIdx: idx
615096
+ }))
615097
+ );
615098
+ }
615099
+ return this.reflowContentLine(line, maxWidth).map((segment) => ({
615100
+ line: segment,
615101
+ bufferIdx: idx
615102
+ }));
615103
+ }
615104
+ /** Total reflowed row count at width (cheap — cache-hit counts, no big array
615105
+ * allocation). Used for scroll bounds. */
615106
+ reflowedRowCount(livePartialLine, width) {
615033
615107
  const maxWidth = Math.max(16, width);
615034
615108
  const source = livePartialLine ? [...this._contentLines, livePartialLine] : this._contentLines;
615035
- return source.flatMap((line, idx) => {
615036
- if (line.startsWith(this.DYNAMIC_BLOCK_MARK_PREFIX) && line.endsWith(this.DYNAMIC_BLOCK_MARK_SUFFIX)) {
615037
- const id = line.slice(
615038
- this.DYNAMIC_BLOCK_MARK_PREFIX.length,
615039
- line.length - this.DYNAMIC_BLOCK_MARK_SUFFIX.length
615040
- );
615041
- const renderer = this._dynamicBlocks.get(id);
615042
- if (!renderer) return [];
615043
- let blockLines;
615044
- try {
615045
- blockLines = renderer(maxWidth);
615046
- } catch {
615047
- return [];
615048
- }
615049
- return blockLines.flatMap(
615050
- (segment) => this.reflowContentLine(segment, maxWidth).map((s2) => ({
615051
- line: s2,
615052
- bufferIdx: idx
615053
- }))
615054
- );
615055
- }
615056
- return this.reflowContentLine(line, maxWidth).map((segment) => ({
615057
- line: segment,
615058
- bufferIdx: idx
615059
- }));
615060
- });
615109
+ let total = 0;
615110
+ for (let i2 = 0; i2 < source.length; i2++) {
615111
+ total += this.rowCountForSourceLine(source[i2], maxWidth);
615112
+ }
615113
+ return total;
615114
+ }
615115
+ /**
615116
+ * VIRTUALIZED reflow — produce ONLY the viewport window's rows (the visible
615117
+ * `viewportRows` reflowed rows at the given scroll offset). Repaint cost is
615118
+ * O(viewport + dynamic blocks) instead of O(scrollback), so a long session
615119
+ * never re-wraps the whole 10k-line backlog on every paint (the lag-after-
615120
+ * thousands-of-lines stall). Returns the window rows IN ORDER (index 0 = top
615121
+ * visible row), the total row count (for the scrollbar/bounds), and the
615122
+ * clamped scroll offset.
615123
+ */
615124
+ reflowContentWindow(livePartialLine, width, viewportRows, scrollOffset) {
615125
+ const maxWidth = Math.max(16, width);
615126
+ const source = livePartialLine ? [...this._contentLines, livePartialLine] : this._contentLines;
615127
+ const h = Math.max(0, viewportRows);
615128
+ let totalRows = 0;
615129
+ const counts = new Array(source.length);
615130
+ for (let i2 = 0; i2 < source.length; i2++) {
615131
+ const c8 = this.rowCountForSourceLine(source[i2], maxWidth);
615132
+ counts[i2] = c8;
615133
+ totalRows += c8;
615134
+ }
615135
+ const maxOffset = Math.max(0, totalRows - h);
615136
+ const off = scrollOffset < 0 ? 0 : scrollOffset > maxOffset ? maxOffset : scrollOffset;
615137
+ const startIdx = Math.max(0, totalRows - h - off);
615138
+ const endIdx = startIdx + h;
615139
+ const rows = [];
615140
+ let cursor = 0;
615141
+ for (let i2 = 0; i2 < source.length && cursor < endIdx; i2++) {
615142
+ const c8 = counts[i2];
615143
+ const lineStart = cursor;
615144
+ cursor += c8;
615145
+ if (cursor <= startIdx) continue;
615146
+ if (lineStart >= endIdx) break;
615147
+ const lr = this.rowsForSourceLine(source[i2], i2, maxWidth);
615148
+ const from3 = Math.max(0, startIdx - lineStart);
615149
+ const to = Math.min(c8, endIdx - lineStart);
615150
+ for (let k = from3; k < to; k++) {
615151
+ const r2 = lr[k];
615152
+ if (r2) rows.push(r2);
615153
+ }
615154
+ }
615155
+ return { rows, totalRows, startIdx, scrollOffset: off };
615061
615156
  }
615062
615157
  // Memoize per-line reflow: it is a PURE function of (line, width), and
615063
615158
  // reflowContentLines re-wraps the entire scrollback every repaint. Caching
@@ -615201,7 +615296,7 @@ ${CONTENT_BG_SEQ}`);
615201
615296
  maxContentScrollOffset(width = termCols(), livePartialLine = this.getLiveBufferedLine()) {
615202
615297
  return Math.max(
615203
615298
  0,
615204
- this.reflowContentLines(livePartialLine, width).length - this.contentHeight
615299
+ this.reflowedRowCount(livePartialLine, width) - this.contentHeight
615205
615300
  );
615206
615301
  }
615207
615302
  clampContentScrollOffset(width = termCols()) {
@@ -615330,26 +615425,30 @@ ${CONTENT_BG_SEQ}`);
615330
615425
  const w = termCols();
615331
615426
  const _perfOn = process.env["OMNIUS_TUI_PERF"] === "1";
615332
615427
  const _t0 = _perfOn ? performance.now() : 0;
615333
- const reflowedLines = this.reflowContentLines(livePartialLine, w);
615428
+ const win = this.reflowContentWindow(
615429
+ livePartialLine,
615430
+ w,
615431
+ h,
615432
+ this._contentScrollOffset
615433
+ );
615434
+ const reflowedLines = win.rows;
615334
615435
  if (_perfOn) {
615335
615436
  const _ms = performance.now() - _t0;
615336
615437
  if (_ms > 8) {
615337
615438
  try {
615338
615439
  process.stderr.write(
615339
- `[TUI-PERF] reflow ${_ms.toFixed(1)}ms (lines=${reflowedLines.length}, w=${w})
615440
+ `[TUI-PERF] reflow ${_ms.toFixed(1)}ms (window=${win.rows.length}/${win.totalRows} rows, w=${w})
615340
615441
  `
615341
615442
  );
615342
615443
  } catch {
615343
615444
  }
615344
615445
  }
615345
615446
  }
615346
- const totalLines = reflowedLines.length;
615347
- const maxOffset = Math.max(0, totalLines - h);
615348
- if (this._contentScrollOffset > maxOffset) {
615349
- this._contentScrollOffset = maxOffset;
615447
+ if (win.scrollOffset !== this._contentScrollOffset) {
615448
+ this._contentScrollOffset = win.scrollOffset;
615350
615449
  if (this._contentScrollOffset === 0) this._autoScroll = true;
615351
615450
  }
615352
- const startIdx = Math.max(0, totalLines - h - this._contentScrollOffset);
615451
+ const startIdx = 0;
615353
615452
  this._lastPaintReflow = reflowedLines;
615354
615453
  this._lastPaintStartIdx = startIdx;
615355
615454
  const headerSafeFloor = layout().headerBottom + 1;
@@ -615402,8 +615501,8 @@ ${CONTENT_BG_SEQ}`);
615402
615501
  }
615403
615502
  }
615404
615503
  if (this._contentScrollOffset > 0) {
615405
- const linesAbove = startIdx;
615406
- const pct = totalLines > 0 ? Math.round((startIdx + h) / totalLines * 100) : 100;
615504
+ const linesAbove = win.startIdx;
615505
+ const pct = win.totalRows > 0 ? Math.round((win.startIdx + h) / win.totalRows * 100) : 100;
615407
615506
  const indicator = ` ↑ ${linesAbove} lines above · ${pct}% · PgDn/End to return `;
615408
615507
  const pad = Math.max(0, w - indicator.length);
615409
615508
  buf += `\x1B[${this.scrollRegionTop};1H\x1B[7m${indicator}${" ".repeat(pad)}\x1B[0m`;
@@ -662307,18 +662406,27 @@ function deriveVisualEvidencePlan(request) {
662307
662406
  const needsText = /\b(text|read|ocr|extract|label|word|number|what does it say|transcript|character|letter|digit|spell|transcribe|copy|quote|type|what is written)\b/i.test(prompt);
662308
662407
  const needsScene = /\b(what|who|where|describe|scene|object|person|identify|tell me about|explain|see|show|happening|look like|recogniz)\b/i.test(prompt) && !needsText;
662309
662408
  const needsUI = /\b(ui|button|menu|dialog|window|interface|screen|dashboard|form|field|input|select|option|dropdown)\b/i.test(prompt);
662409
+ const comprehensive = (reason) => ({
662410
+ stages: [
662411
+ { kind: "low_fidelity_observation", required: false },
662412
+ { kind: "ocr", required: true },
662413
+ { kind: "auxiliary_vision", required: true }
662414
+ ],
662415
+ reason
662416
+ });
662310
662417
  switch (detail) {
662311
662418
  case "low":
662312
- return { stages: [{ kind: "low_fidelity_observation", required: true }], reason: "low detail requested" };
662419
+ return { stages: [{ kind: "low_fidelity_observation", required: true }], reason: "low detail explicitly requested" };
662313
662420
  case "text":
662314
- return { stages: [{ kind: "low_fidelity_observation", required: false }, { kind: "ocr", required: true }, { kind: "auxiliary_vision", required: false }], reason: "text extraction requested" };
662421
+ return comprehensive("text extraction requested full vision still runs for classification");
662422
+ case "visual":
662423
+ return comprehensive("visual analysis requested");
662315
662424
  case "full":
662316
- return { stages: [{ kind: "low_fidelity_observation", required: false }, { kind: "ocr", required: needsText || needsUI }, { kind: "auxiliary_vision", required: true }], reason: "full detail requested" };
662425
+ return comprehensive("full detail requested");
662317
662426
  default:
662318
- const stages = [{ kind: "low_fidelity_observation", required: false }];
662319
- if (needsText || needsUI) stages.push({ kind: "ocr", required: true });
662320
- if (needsScene || needsUI) stages.push({ kind: "auxiliary_vision", required: !needsText && !needsUI });
662321
- return { stages, reason: needsText ? "text evidence needed" : needsScene ? "scene analysis needed" : needsUI ? "UI/document analysis needed" : "auto" };
662427
+ return comprehensive(
662428
+ needsText ? "comprehensive (text emphasis)" : needsScene ? "comprehensive (scene emphasis)" : needsUI ? "comprehensive (UI/document emphasis)" : "comprehensive (full vision + OCR on all media)"
662429
+ );
662322
662430
  }
662323
662431
  }
662324
662432
  async function executeVisualEvidencePlan(resolution, plan, executor) {
@@ -662362,6 +662470,7 @@ __export(vision_ingress_exports, {
662362
662470
  isTesseractAvailable: () => isTesseractAvailable,
662363
662471
  isVisionModel: () => isVisionModel,
662364
662472
  queryVisionModel: () => queryVisionModel,
662473
+ resolveVisionModel: () => resolveVisionModel,
662365
662474
  runVisionIngress: () => runVisionIngress
662366
662475
  });
662367
662476
  import { execFileSync as execFileSync10 } from "node:child_process";
@@ -662490,25 +662599,32 @@ async function queryVisionModel(modelName, imagePath, prompt = "Describe what yo
662490
662599
  return "";
662491
662600
  }
662492
662601
  }
662602
+ function resolveVisionModel(currentModel) {
662603
+ if (currentModel && isVisionModel(currentModel)) return currentModel;
662604
+ const env2 = (process.env["OMNIUS_VISION_MODEL"] || "").trim();
662605
+ if (env2) return env2;
662606
+ return "moondream";
662607
+ }
662493
662608
  async function runVisionIngress(image, currentModel) {
662494
662609
  const ocrText = advancedOcr(image.path);
662495
662610
  let visionDescription = "";
662496
662611
  let visionUsed = false;
662497
- if (currentModel && isVisionModel(currentModel)) {
662498
- visionDescription = await queryVisionModel(currentModel, image.path);
662612
+ const visionModel = resolveVisionModel(currentModel);
662613
+ if (visionModel) {
662614
+ visionDescription = await queryVisionModel(visionModel, image.path);
662499
662615
  visionUsed = visionDescription.length > 0;
662500
662616
  }
662501
662617
  const parts = [];
662502
662618
  if (ocrText.length > 0) {
662503
- parts.push(`[OCR Text from pasted image]
662619
+ parts.push(`[OCR Text from image]
662504
662620
  ${ocrText}`);
662505
662621
  }
662506
662622
  if (visionDescription.length > 0) {
662507
- parts.push(`[Vision analysis of pasted image (model: ${currentModel})]
662623
+ parts.push(`[Vision analysis of image (model: ${visionModel})]
662508
662624
  ${visionDescription}`);
662509
662625
  }
662510
662626
  if (parts.length === 0) {
662511
- parts.push(`[Image pasted at ${image.path} — no text detected by OCR, no vision model available for analysis]`);
662627
+ parts.push(`[Image at ${image.path} — OCR found no text and the vision model (${visionModel}) returned no description; treat as UNCOMPREHENDED and re-run telegram_image_analyze with detail='full' before answering.]`);
662512
662628
  }
662513
662629
  const contextBlock = parts.join("\n\n");
662514
662630
  return {
@@ -665197,7 +665313,7 @@ Public Telegram vision and media stack
665197
665313
 
665198
665314
  Public Telegram runs have the full scoped media-analysis stack for media posted in this chat:
665199
665315
  - Use telegram_media_recent to find recent scoped media, then use path/media aliases 'reply' and 'latest' instead of exposing local paths to users.
665200
- - For image questions, prefer telegram_image_analyze first. It resolves omitted/reply/latest media, starts with low-fidelity image intake, uses basic OCR as the text extraction probe, escalates to advanced OCR when text is dense or under-extracted, and escalates to Moondream vision when visual QA/captioning is needed.
665316
+ - MANDATORY: whenever one or more images are present (posted, replied-to, or recent), run a FULL comprehension pass on EVERY image BEFORE responding — telegram_image_analyze with detail='full' (advanced OCR + Moondream vision) on each; if a burst of images was posted, enumerate them with telegram_media_recent and analyze ALL of them. Base the answer ONLY on the extracted content (objects, scene, any text). NEVER answer from metadata alone (count, size, timestamp, caption), never claim you can't say what's pictured without running vision, and never offer to analyze only "the ones you care about" — full vision on all of them IS the job. Do not stop until every image is fully comprehended.
665201
665317
  - Use ocr for quick image text extraction, ocr_image_advanced when basic OCR shows dense or degraded text, image_read for image metadata + multimodal image payload, and vision for direct Moondream captioning, visual QA, object detection, or pointing.
665202
665318
  - Use pdf_to_text for embedded-text PDFs and ocr_pdf for scanned PDFs.
665203
665319
  - Use video_understand and transcribe_file for video/audio media posted in this chat.
@@ -675910,7 +676026,7 @@ ${currentTelegramPrompt}`;
675910
676026
  TELEGRAM_LINK_INTEGRITY_CONTRACT,
675911
676027
  "If a user explicitly states a durable preference for reply cadence/order, call telegram_preference_set. Do not infer or classify reply-mode preferences from keywords, style, tone, or task type.",
675912
676028
  TELEGRAM_EVIDENCE_SUFFICIENCY_CONTRACT,
675913
- "You have the full scoped Telegram media-analysis stack by default: telegram_image_analyze, telegram_media_recent, image_read, ocr, ocr_image_advanced, vision, pdf_to_text, ocr_pdf, transcribe_file, video_understand, audio_analyze, and identity_memory. For image questions, prefer telegram_image_analyze first; it resolves omitted/reply/latest media, starts with low-fidelity image intake, uses basic OCR as the text extraction probe, escalates to advanced OCR when text is dense or under-extracted, and escalates to Moondream vision when visual QA/captioning is needed.",
676029
+ "You have the full scoped Telegram media-analysis stack by default: telegram_image_analyze, telegram_media_recent, image_read, ocr, ocr_image_advanced, vision, pdf_to_text, ocr_pdf, transcribe_file, video_understand, audio_analyze, and identity_memory. MANDATORY image handling: whenever one or more images are present in the message (or a referenced/recent message), you MUST run a FULL comprehension pass on EVERY image before you respond — call telegram_image_analyze with detail='full' (advanced OCR + Moondream vision) on each, and if multiple images were sent in a burst, analyze ALL of them (use telegram_media_recent to enumerate them). Base your answer ONLY on the actual extracted content (objects, scene, and any text). NEVER answer from metadata alone (count, file size, timestamp, caption) and NEVER say you 'can't say what's pictured without running vision' or offer to analyze 'the ones you care about' — running full vision on all of them IS your job. Do not stop until every image is fully comprehended; if a pass returns nothing, retry with detail='full' or image_read+ocr_image_advanced+vision before concluding.",
675914
676030
  formatIdentityMemoryContext(chatLabel || "Telegram private chat"),
675915
676031
  reminderToolContract,
675916
676032
  "If the user asks you to create an image, audio file, video, 3D/CAD model, or document artifact, create it with the scoped creative tools. Freshly generated artifacts are recorded and automatically attached to this Telegram chat when the turn completes, so do not call telegram_send_file for those same artifacts unless the user asked for a specific caption, existing/unrecorded file, or non-default target.",
@@ -679750,7 +679866,7 @@ ${knownList}` : "Private-user telegram_send_file target must be this DM or a kno
679750
679866
  * Downloads the file, runs it through the appropriate pipeline,
679751
679867
  * caches it, and returns a text description for the agent.
679752
679868
  */
679753
- async processMedia(msg, source = "message") {
679869
+ async processMedia(msg, source = "message", eager = false) {
679754
679870
  const media = source === "reply" ? msg.replyToMedia : msg.media;
679755
679871
  if (!media) return "";
679756
679872
  const { type, fileId, fileUniqueId, mimeType, caption } = media;
@@ -679759,6 +679875,12 @@ ${knownList}` : "Private-user telegram_send_file target must be this DM or a kno
679759
679875
  const sourceLabel = source === "reply" ? "replied-to " : "";
679760
679876
  const mediaAlias = sourceMessageId ? `message_id:${sourceMessageId}` : source === "reply" ? "reply" : "latest";
679761
679877
  const safeCaption = caption ? ` — caption: ${telegramContextJsonString(caption, 220)}` : "";
679878
+ const cacheKey = `${String(msg.chatId)}:${String(sourceMessageId ?? 0)}:${fileUniqueId}`;
679879
+ const existingEntry = this.mediaCache.get(cacheKey);
679880
+ if (existingEntry && existsSync146(existingEntry.localPath)) {
679881
+ existingEntry.cachedAt = Date.now();
679882
+ return existingEntry.extractedContent || `[${sourceLabel}${type} received: path_alias=${mediaAlias}${safeCaption}]`;
679883
+ }
679762
679884
  let ext = ".bin";
679763
679885
  if (isImageMedia) ext = telegramImageExtension(media);
679764
679886
  else if (type === "audio" || type === "voice") ext = ".ogg";
@@ -679782,10 +679904,7 @@ ${knownList}` : "Private-user telegram_send_file target must be this DM or a kno
679782
679904
  caption,
679783
679905
  cachedAt: Date.now()
679784
679906
  };
679785
- this.mediaCache.set(
679786
- `${String(msg.chatId)}:${String(sourceMessageId ?? 0)}:${fileUniqueId}`,
679787
- cacheEntry
679788
- );
679907
+ this.mediaCache.set(cacheKey, cacheEntry);
679789
679908
  const metadataKey = String(msg.chatId);
679790
679909
  if (!this.mediaMetadata.has(metadataKey)) {
679791
679910
  this.mediaMetadata.set(metadataKey, []);
@@ -679798,7 +679917,7 @@ ${knownList}` : "Private-user telegram_send_file target must be this DM or a kno
679798
679917
  username: msg.username
679799
679918
  });
679800
679919
  let description = `[${type}${caption ? `: ${caption}` : ""}]`;
679801
- if (isImageMedia) {
679920
+ if (isImageMedia && !eager) {
679802
679921
  let visionContext = "";
679803
679922
  try {
679804
679923
  const { runVisionIngress: runVisionIngress2, formatImageContextPrefix: formatImageContextPrefix2 } = await Promise.resolve().then(() => (init_vision_ingress(), vision_ingress_exports));
@@ -680897,6 +681016,10 @@ ${caption}\r
680897
681016
  if (this.adminUserId && !this.agentConfig) {
680898
681017
  if (!isAdmin) continue;
680899
681018
  }
681019
+ if (msg.media) {
681020
+ void this.processMedia(msg, "message", true).catch(() => {
681021
+ });
681022
+ }
680900
681023
  if (this.agentConfig && this.repoRoot) {
680901
681024
  this.handleMessageWithSubAgent(msg).catch((err) => {
680902
681025
  this.tuiWrite(
@@ -1,12 +1,12 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.357",
3
+ "version": "1.0.358",
4
4
  "lockfileVersion": 3,
5
5
  "requires": true,
6
6
  "packages": {
7
7
  "": {
8
8
  "name": "omnius",
9
- "version": "1.0.357",
9
+ "version": "1.0.358",
10
10
  "bundleDependencies": [
11
11
  "image-to-ascii"
12
12
  ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "omnius",
3
- "version": "1.0.357",
3
+ "version": "1.0.358",
4
4
  "description": "AI coding agent powered by open-source models (Ollama/vLLM) — interactive TUI with agentic tool-calling loop",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",