omnius 1.0.20 → 1.0.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1474,7 +1474,7 @@ var init_security_classifier = __esm({
1474
1474
  // ── Network reads (safe)
1475
1475
  { match: /^(web_search|web_fetch)$/, info: NETWORK_READ },
1476
1476
  // ── Network outbound (mutating or remote inference)
1477
- { match: /^(image_generate|generate_image|vision|video_understand)$/, info: NETWORK_OUTBOUND },
1477
+ { match: /^(image_generate|generate_image|generate_audio|generate_tts|create_audio_file|vision|video_understand|telegram_send_file)$/, info: NETWORK_OUTBOUND },
1478
1478
  { match: /^(transcribe_file|transcribe_url|youtube_download)$/, info: NETWORK_OUTBOUND },
1479
1479
  { match: /^(fortemi_bridge)$/, info: NETWORK_OUTBOUND },
1480
1480
  // ── Memory tools
@@ -1491,7 +1491,7 @@ var init_security_classifier = __esm({
1491
1491
  { match: /^(file_read|file_explore|list_directory|grep_search|glob_find|find_files)$/, info: LOCAL_READ },
1492
1492
  { match: /^(image_read|ocr|ocr_pdf|ocr_image_advanced|pdf_to_text|structured_read|read_structured_file)$/, info: LOCAL_READ },
1493
1493
  { match: /^(symbol_search|impact_analysis|code_neighbors|repo_map|codebase_map|semantic_map|import_graph)$/, info: LOCAL_READ },
1494
- { match: /^(diagnostic|git_info|environment_snapshot|process_health|todo_read|explore_tools)$/, info: LOCAL_READ },
1494
+ { match: /^(diagnostic|git_info|environment_snapshot|process_health|todo_read|explore_tools|telegram_media_recent)$/, info: LOCAL_READ },
1495
1495
  { match: /^(log_explore|log_packet|change_log|phase_recall|code_graph)$/, info: LOCAL_READ },
1496
1496
  { match: /^skill_(list|execute|read)$/, info: LOCAL_READ },
1497
1497
  // ── Task completion (neutral signal)
@@ -5733,13 +5733,20 @@ var init_explore_tools = __esm({
5733
5733
  diagnostic: "Run project diagnostics (build, test, lint)",
5734
5734
  image_read: "Read and describe image contents",
5735
5735
  screenshot: "Capture a screenshot of the desktop",
5736
+ ocr: "Extract text from images via OCR",
5736
5737
  ocr_image: "Extract text from images via OCR",
5738
+ ocr_image_advanced: "Advanced OCR for images with layout-aware extraction",
5737
5739
  ocr_pdf: "Extract text from PDF pages via OCR",
5738
5740
  pdf_to_text: "Convert PDF to plain text",
5739
5741
  vision: "Describe what's on screen using Moondream",
5742
+ video_understand: "Analyze a video file with transcription and keyframe understanding",
5743
+ audio_analyze: "Classify sounds, detect speech, inspect spectrum, or analyze audio files",
5740
5744
  desktop_click: "Click at coordinates on the desktop",
5741
5745
  desktop_describe: "Describe a region of the desktop",
5742
5746
  transcribe_file: "Transcribe audio/video files to text",
5747
+ telegram_media_recent: "List recent Telegram media available in the current chat scope",
5748
+ generate_audio: "Generate sound effects or music with local model backends",
5749
+ generate_tts: "Generate speech from text with configured voice/TTS backends",
5743
5750
  create_tool: "Create a new custom tool from a workflow",
5744
5751
  manage_tools: "List, inspect, or remove custom tools",
5745
5752
  skill_list: "List available AIWG skills",
@@ -84452,7 +84459,7 @@ var require_mime_types = __commonJS({
84452
84459
  "../node_modules/mime-types/index.js"(exports) {
84453
84460
  "use strict";
84454
84461
  var db = require_mime_db();
84455
- var extname16 = __require("path").extname;
84462
+ var extname17 = __require("path").extname;
84456
84463
  var EXTRACT_TYPE_REGEXP = /^\s*([^;\s]*)(?:;|\s|$)/;
84457
84464
  var TEXT_TYPE_REGEXP = /^text\//i;
84458
84465
  exports.charset = charset;
@@ -84506,7 +84513,7 @@ var require_mime_types = __commonJS({
84506
84513
  if (!path11 || typeof path11 !== "string") {
84507
84514
  return false;
84508
84515
  }
84509
- var extension4 = extname16("x." + path11).toLowerCase().substr(1);
84516
+ var extension4 = extname17("x." + path11).toLowerCase().substr(1);
84510
84517
  if (!extension4) {
84511
84518
  return false;
84512
84519
  }
@@ -250375,6 +250382,22 @@ function optionalNumberArg(value2) {
250375
250382
  const n2 = Number(value2);
250376
250383
  return Number.isFinite(n2) ? n2 : void 0;
250377
250384
  }
250385
/**
 * Coerce a loosely-typed tool argument into a boolean.
 *
 * Booleans pass through unchanged. Strings are trimmed and matched
 * case-insensitively against 1/true/yes/on and 0/false/no/off. The numbers
 * 1 and 0 are treated like their string forms "1" and "0", so a JSON caller
 * that sends a numeric flag is not silently ignored. Every other value
 * yields `fallback`.
 *
 * @param {unknown} value2 - Raw argument value.
 * @param {boolean} fallback - Returned when the value cannot be interpreted.
 * @returns {boolean} The interpreted flag, or `fallback`.
 */
function booleanArg(value2, fallback) {
  if (typeof value2 === "boolean") {
    return value2;
  }
  if (typeof value2 === "number") {
    if (value2 === 1) {
      return true;
    }
    if (value2 === 0) {
      return false;
    }
    return fallback;
  }
  if (typeof value2 === "string") {
    // Trim once and reuse the result for both pattern checks.
    const text = value2.trim();
    if (/^(1|true|yes|on)$/i.test(text)) {
      return true;
    }
    if (/^(0|false|no|off)$/i.test(text)) {
      return false;
    }
  }
  return fallback;
}
250396
/**
 * Decide whether the fallback ladder may be used for this tool call.
 *
 * A truthy strict flag (`strict_model`, `strictModel` or `strict`) always
 * disables fallback. Otherwise the first present fallback flag
 * (`fallback`, `allow_fallback`, `allowFallback`) decides, defaulting to true.
 *
 * @param {Record<string, unknown>} args - Raw tool arguments.
 * @returns {boolean} True when fallback candidates may be tried.
 */
function generationFallbackEnabled(args) {
  const strictFlag = args["strict_model"] ?? args["strictModel"] ?? args["strict"];
  const strictRequested = booleanArg(strictFlag, false);
  if (strictRequested) {
    return false;
  }
  const fallbackFlag = args["fallback"] ?? args["allow_fallback"] ?? args["allowFallback"];
  return booleanArg(fallbackFlag, true);
}
250378
250401
  function isBackend(value2) {
250379
250402
  return value2 === "auto" || value2 === "ollama" || value2 === "diffusers" || value2 === "sdcpp";
250380
250403
  }
@@ -250383,6 +250406,14 @@ function getImageGenerationPreset(model) {
250383
250406
  return void 0;
250384
250407
  return IMAGE_GENERATION_MODEL_PRESETS.find((preset) => preset.id === model);
250385
250408
  }
250409
/**
 * Resolve the quality ladder ids into their preset objects.
 *
 * Ids for which `getImageGenerationPreset` returns nothing are dropped; the
 * ladder order is kept.
 *
 * @returns {object[]} Presets in ladder order.
 */
function imageGenerationQualityLadder() {
  const resolved = [];
  for (const id of IMAGE_GENERATION_QUALITY_LADDER) {
    const preset = getImageGenerationPreset(id);
    if (preset) {
      resolved.push(preset);
    }
  }
  return resolved;
}
250412
/**
 * List the presets that declare themselves as a fallback for `model`.
 *
 * @param {string | undefined} model - Model id to look up.
 * @returns {object[]} Presets whose `fallbackFor` includes `model`, in preset
 *   order; empty when `model` is falsy.
 */
function imageGenerationFallbackAlternates(model) {
  const alternates = [];
  if (model) {
    for (const preset of IMAGE_GENERATION_MODEL_PRESETS) {
      if (preset.fallbackFor?.includes(model)) {
        alternates.push(preset);
      }
    }
  }
  return alternates;
}
250386
250417
  function inferImageGenerationBackend(model, requested) {
250387
250418
  if (requested && isBackend(requested))
250388
250419
  return requested;
@@ -250399,6 +250430,45 @@ function inferImageGenerationBackend(model, requested) {
250399
250430
  return "sdcpp";
250400
250431
  return "diffusers";
250401
250432
  }
250433
/**
 * Build a ladder candidate (model, concrete backend, preset) for a model id.
 *
 * `inferImageGenerationBackend` returns any valid requested backend as-is,
 * and "auto" is a valid backend. An explicit "auto" request would therefore
 * come back unresolved, so the inference is repeated without the request to
 * let the model itself decide. Only if that is still "auto" does the
 * candidate default to diffusers.
 *
 * @param {string | undefined} model - Model id or local model path.
 * @param {string} [requestedBackend] - Backend requested by the caller, if any.
 * @returns {{ model: string | undefined, backend: string, preset: object | undefined }}
 *   Candidate whose backend is never "auto".
 */
function imageCandidateFor(model, requestedBackend) {
  let backend = inferImageGenerationBackend(model, requestedBackend);
  if (backend === "auto") {
    // Re-infer from the model alone so an ollama or sdcpp model is not
    // forced onto diffusers just because the backend was left at "auto".
    backend = inferImageGenerationBackend(model, void 0);
  }
  if (backend === "auto") {
    backend = "diffusers";
  }
  return {
    model,
    backend,
    preset: getImageGenerationPreset(model),
  };
}
250443
/**
 * Build the ordered list of image-generation candidates to attempt.
 *
 * The primary candidate comes first: the requested model, or the first
 * ladder entry for an explicitly requested backend, or (in strict mode with
 * nothing requested) the default diffusers model. When `allowFallback` is
 * false, exactly that one candidate is returned. Alternates of the requested
 * model are fallbacks too, so they are only queued when fallback is allowed.
 * With fallback allowed, the list continues with the quality ladder from the
 * primary's position downward, each rung followed by its own alternates.
 * Candidates are de-duplicated by `backend:model`.
 *
 * @param {string | undefined} requestedModel - Explicit model id/path, if any.
 * @param {string | undefined} requestedBackend - Requested backend; "auto" or
 *   empty means no explicit backend.
 * @param {boolean} [allowFallback=true] - Whether fallback candidates may be added.
 * @returns {Array<{ model: string, backend: string, preset: object | undefined }>}
 *   Candidates in the order they should be tried.
 */
function imageGenerationFallbackCandidates(requestedModel, requestedBackend, allowFallback = true) {
  const ladder = imageGenerationQualityLadder();
  const candidates = [];
  const seenKeys = new Set();
  const add2 = (candidate) => {
    const key = `${candidate.backend}:${candidate.model}`;
    if (seenKeys.has(key)) {
      return;
    }
    seenKeys.add(key);
    candidates.push(candidate);
  };
  const explicitBackend = requestedBackend && requestedBackend !== "auto" ? requestedBackend : void 0;

  if (requestedModel) {
    add2(imageCandidateFor(requestedModel, requestedBackend));
  } else if (explicitBackend) {
    const firstForBackend = ladder.find((preset) => preset.backend === explicitBackend);
    const backendDefault = explicitBackend === "ollama" ? DEFAULT_OLLAMA_IMAGE_MODEL : DEFAULT_DIFFUSERS_IMAGE_MODEL;
    add2(imageCandidateFor(firstForBackend?.id ?? backendDefault, requestedBackend));
  } else if (!allowFallback) {
    add2(imageCandidateFor(DEFAULT_DIFFUSERS_IMAGE_MODEL, requestedBackend));
  }

  // Strict mode: every branch above queued exactly one candidate.
  if (!allowFallback) {
    return candidates;
  }

  if (requestedModel) {
    for (const alternate of imageGenerationFallbackAlternates(requestedModel)) {
      add2(imageCandidateFor(alternate.id));
    }
  }

  let primaryIndex = 0;
  if (requestedModel) {
    primaryIndex = ladder.findIndex((preset) => preset.id === requestedModel);
  } else if (explicitBackend) {
    primaryIndex = ladder.findIndex((preset) => preset.backend === explicitBackend);
  }
  // A primary that is not on the ladder falls back through the whole ladder.
  const fallbackTail = primaryIndex >= 0 ? ladder.slice(primaryIndex) : ladder;
  for (const preset of fallbackTail) {
    add2(imageCandidateFor(preset.id));
    for (const alternate of imageGenerationFallbackAlternates(preset.id)) {
      add2(imageCandidateFor(alternate.id));
    }
  }
  return candidates;
}
250402
250472
  function imageGenerationDir(repoRoot = ".") {
250403
250473
  return join36(repoRoot, ".omnius", "image-gen");
250404
250474
  }
@@ -250653,6 +250723,33 @@ function formatSuccessOutput(args) {
250653
250723
  ` Prompt: "${prompt}"`
250654
250724
  ].filter(Boolean).join("\n");
250655
250725
  }
250726
/**
 * Reduce a tool result to a single-line reason string.
 *
 * Takes the first truthy of `error`, `output`, or the text "unknown error",
 * passes it through `trimProcessText` with a 700 limit, then collapses every
 * whitespace run to one space.
 *
 * @param {{ error?: unknown, output?: unknown }} result - Tool result to summarize.
 * @returns {string} Single-line summary.
 */
function summarizeToolResult(result) {
  const rawText = String(result.error || result.output || "unknown error");
  const clipped = trimProcessText(rawText, 700);
  const singleLine = clipped.replace(/\s+/g, " ");
  return singleLine.trim();
}
250729
/**
 * Render one ladder attempt as a numbered line.
 *
 * @param {{ model: string, backend: string }} candidate - Candidate that was tried.
 * @param {string} reason - Failure reason.
 * @param {number} index - Zero-based position; shown one-based.
 * @returns {string} Text of the form "N. model [backend] - reason".
 */
function formatImageAttempt(candidate, reason, index) {
  const position = index + 1;
  const { model, backend } = candidate;
  return `${position}. ${model} [${backend}] - ${reason}`;
}
250732
/**
 * Build the message reported when every ladder candidate failed.
 *
 * @param {Array<{ candidate: object, reason: string }>} failed - Attempts in
 *   the order they were tried.
 * @returns {string} Two header lines followed by one numbered line per attempt.
 */
function formatImageFallbackFailure(failed) {
  const report = [
    "No image generation model in the fallback ladder completed successfully.",
    "Attempted, highest quality to lowest:"
  ];
  failed.forEach((attempt, index) => {
    report.push(` ${formatImageAttempt(attempt.candidate, attempt.reason, index)}`);
  });
  return report.join("\n");
}
250739
/**
 * Prefix a successful result with a summary of the attempts that failed first.
 *
 * With no failed attempts the result is returned untouched. Otherwise a copy
 * is returned whose `output` starts with the winner and the numbered
 * failures. A missing `output` is treated as empty text so the literal
 * "undefined" is never appended.
 *
 * @param {{ output?: string }} result - Successful tool result.
 * @param {Array<{ candidate: object, reason: string }>} failed - Earlier failures.
 * @param {{ model: string, backend: string }} winner - Candidate that succeeded.
 * @returns {object} `result` itself, or an annotated copy.
 */
function annotateImageFallbackSuccess(result, failed, winner) {
  if (failed.length === 0) {
    return result;
  }
  const prefix = [
    `Fallback ladder succeeded with ${winner.model} [${winner.backend}] after ${failed.length} failed attempt(s).`,
    "Failed attempts:",
    ...failed.map((attempt, index) => ` ${formatImageAttempt(attempt.candidate, attempt.reason, index)}`),
    ""
  ].join("\n");
  return {
    ...result,
    output: `${prefix}${result.output ?? ""}`,
  };
}
250656
250753
  function parseRunnerJson(stdout) {
250657
250754
  const lines = stdout.trim().split(/\r?\n/).reverse();
250658
250755
  for (const line of lines) {
@@ -250665,7 +250762,7 @@ function parseRunnerJson(stdout) {
250665
250762
  }
250666
250763
  return null;
250667
250764
  }
250668
- var DEFAULT_DIFFUSERS_IMAGE_MODEL, DEFAULT_OLLAMA_IMAGE_MODEL, DIFFUSERS_PYTHON_PACKAGES, SDCPP_PYTHON_PACKAGES, IMAGE_GENERATION_MODEL_PRESETS, OLLAMA_IMAGE_MODELS, DIFFUSERS_RUNNER, SDCPP_RUNNER, ImageGenerateTool;
250765
+ var DEFAULT_DIFFUSERS_IMAGE_MODEL, DEFAULT_OLLAMA_IMAGE_MODEL, DIFFUSERS_PYTHON_PACKAGES, SDCPP_PYTHON_PACKAGES, IMAGE_GENERATION_MODEL_PRESETS, IMAGE_GENERATION_QUALITY_LADDER, OLLAMA_IMAGE_MODELS, DIFFUSERS_RUNNER, SDCPP_RUNNER, ImageGenerateTool;
250669
250766
  var init_image_generate = __esm({
250670
250767
  "packages/execution/dist/tools/image-generate.js"() {
250671
250768
  "use strict";
@@ -250737,6 +250834,78 @@ var init_image_generate = __esm({
250737
250834
  height: 1024,
250738
250835
  note: "Primary serious-generation baseline for maximum photorealism."
250739
250836
  },
250837
+ {
250838
+ id: "black-forest-labs/FLUX.1-dev-FP8",
250839
+ label: "FLUX.1 dev FP8",
250840
+ backend: "diffusers",
250841
+ install: 'python3 .omnius/image-gen/diffusers_text2image.py --model black-forest-labs/FLUX.1-dev-FP8 --steps 28 --guidance 3.5 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
250842
+ category: "Official FLUX fallback",
250843
+ sizeClass: "12B FLUX.1 dev FP8",
250844
+ quality: "Official lower-precision FLUX.1 dev route; best first fallback when full FLUX.1 dev is unavailable or too heavy.",
250845
+ minVramGB: 16,
250846
+ recommendedVramGB: 24,
250847
+ deployment: "Prefer this before third-party mirrors when loader support is available.",
250848
+ steps: 28,
250849
+ guidance: 3.5,
250850
+ width: 1024,
250851
+ height: 1024,
250852
+ fallbackFor: ["black-forest-labs/FLUX.1-dev"],
250853
+ note: "Official BFL FP8 fallback for FLUX.1 dev."
250854
+ },
250855
+ {
250856
+ id: "black-forest-labs/FLUX.1-Krea-dev",
250857
+ label: "FLUX.1 Krea dev",
250858
+ backend: "diffusers",
250859
+ install: 'python3 .omnius/image-gen/diffusers_text2image.py --model black-forest-labs/FLUX.1-Krea-dev --steps 28 --guidance 3.5 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
250860
+ category: "Official FLUX fallback",
250861
+ sizeClass: "12B FLUX.1 dev-family",
250862
+ quality: "Official FLUX.1 dev-family aesthetic variant; useful when the base dev repo is unavailable and the requested task tolerates an opinionated realism bias.",
250863
+ minVramGB: 24,
250864
+ recommendedVramGB: 48,
250865
+ deployment: "Heavy Diffusers/ComfyUI route with FLUX.1 dev-family license considerations.",
250866
+ steps: 28,
250867
+ guidance: 3.5,
250868
+ width: 1024,
250869
+ height: 1024,
250870
+ fallbackFor: ["black-forest-labs/FLUX.1-dev"],
250871
+ note: "Official aesthetic FLUX.1 fallback."
250872
+ },
250873
+ {
250874
+ id: "lllyasviel/flux1-dev-bnb-nf4",
250875
+ label: "FLUX.1 dev BNB NF4",
250876
+ backend: "diffusers",
250877
+ install: 'python3 .omnius/image-gen/diffusers_text2image.py --model lllyasviel/flux1-dev-bnb-nf4 --steps 28 --guidance 3.5 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
250878
+ category: "Traceable FLUX fallback",
250879
+ sizeClass: "12B FLUX.1 dev NF4",
250880
+ quality: "Lower-memory community quantization; useful after official BFL sources, with some possible quality loss and loader brittleness.",
250881
+ minVramGB: 12,
250882
+ recommendedVramGB: 16,
250883
+ deployment: "Best with BNB-aware Diffusers/Forge-style runtimes. Falls through cleanly if the current runner cannot load it.",
250884
+ steps: 28,
250885
+ guidance: 3.5,
250886
+ width: 1024,
250887
+ height: 1024,
250888
+ fallbackFor: ["black-forest-labs/FLUX.1-dev", "black-forest-labs/FLUX.1-dev-FP8"],
250889
+ note: "Traceable low-VRAM NF4 fallback for FLUX.1 dev."
250890
+ },
250891
+ {
250892
+ id: "ChuckMcSneed/FLUX.1-dev",
250893
+ label: "FLUX.1 dev mirror",
250894
+ backend: "diffusers",
250895
+ install: 'python3 .omnius/image-gen/diffusers_text2image.py --model ChuckMcSneed/FLUX.1-dev --steps 28 --guidance 3.5 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
250896
+ category: "Traceable FLUX fallback",
250897
+ sizeClass: "12B FLUX.1 dev mirror",
250898
+ quality: "Lower-priority mirror fallback for FLUX.1 dev. Use only after official and reputable quantized options fail.",
250899
+ minVramGB: 24,
250900
+ recommendedVramGB: 48,
250901
+ deployment: "Treat as lower-trust than official BFL and well-known quantized conversions; verify provenance and license before relying on it.",
250902
+ steps: 28,
250903
+ guidance: 3.5,
250904
+ width: 1024,
250905
+ height: 1024,
250906
+ fallbackFor: ["black-forest-labs/FLUX.1-dev", "black-forest-labs/FLUX.1-dev-FP8"],
250907
+ note: "Traceable mirror fallback for FLUX.1 dev."
250908
+ },
250740
250909
  {
250741
250910
  id: "stabilityai/stable-diffusion-3.5-large",
250742
250911
  label: "Stable Diffusion 3.5 Large",
@@ -250837,6 +251006,40 @@ var init_image_generate = __esm({
250837
251006
  height: 1024,
250838
251007
  note: "More deployable compact FLUX-family model."
250839
251008
  },
251009
+ {
251010
+ id: "black-forest-labs/FLUX.2-klein-4b-fp8",
251011
+ label: "FLUX.2 Klein 4B FP8",
251012
+ backend: "diffusers",
251013
+ install: 'python3 .omnius/image-gen/diffusers_text2image.py --model black-forest-labs/FLUX.2-klein-4b-fp8 --steps 8 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
251014
+ category: "Official FLUX fallback",
251015
+ sizeClass: "4B compact FLUX-family FP8",
251016
+ quality: "Official lower-precision FLUX.2 Klein route with better deployment fit than full-precision 4B.",
251017
+ minVramGB: 8,
251018
+ recommendedVramGB: 12,
251019
+ deployment: "Preferred lower-memory official FLUX.2 fallback when compatible with the current loader.",
251020
+ steps: 8,
251021
+ width: 1024,
251022
+ height: 1024,
251023
+ fallbackFor: ["black-forest-labs/FLUX.2-klein-4B", "x/flux2-klein"],
251024
+ note: "Official FP8 fallback for FLUX.2 Klein."
251025
+ },
251026
+ {
251027
+ id: "black-forest-labs/FLUX.2-klein-4b-nvfp4",
251028
+ label: "FLUX.2 Klein 4B NVFP4",
251029
+ backend: "diffusers",
251030
+ install: 'python3 .omnius/image-gen/diffusers_text2image.py --model black-forest-labs/FLUX.2-klein-4b-nvfp4 --steps 8 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
251031
+ category: "Official FLUX fallback",
251032
+ sizeClass: "4B compact FLUX-family NVFP4",
251033
+ quality: "Official NVIDIA-oriented low-precision FLUX.2 Klein fallback.",
251034
+ minVramGB: 8,
251035
+ recommendedVramGB: 12,
251036
+ deployment: "Use when the runtime/GPU supports the NVFP4 path; otherwise the fallback ladder continues.",
251037
+ steps: 8,
251038
+ width: 1024,
251039
+ height: 1024,
251040
+ fallbackFor: ["black-forest-labs/FLUX.2-klein-4B", "x/flux2-klein", "black-forest-labs/FLUX.2-klein-4b-fp8"],
251041
+ note: "Official NVFP4 fallback for FLUX.2 Klein."
251042
+ },
250840
251043
  {
250841
251044
  id: "deepseek-ai/Janus-Pro-7B",
250842
251045
  label: "Janus-Pro-7B",
@@ -250989,6 +251192,21 @@ var init_image_generate = __esm({
250989
251192
  note: "CPU/GGUF/checkpoint route; requires a local model path."
250990
251193
  }
250991
251194
  ];
251195
+ IMAGE_GENERATION_QUALITY_LADDER = [
251196
+ "black-forest-labs/FLUX.1-dev",
251197
+ "stabilityai/stable-diffusion-3.5-large",
251198
+ DEFAULT_OLLAMA_IMAGE_MODEL,
251199
+ "black-forest-labs/FLUX.1-schnell",
251200
+ "stabilityai/stable-diffusion-3.5-large-turbo",
251201
+ "Tongyi-MAI/Z-Image-Turbo",
251202
+ "black-forest-labs/FLUX.2-klein-4B",
251203
+ DEFAULT_DIFFUSERS_IMAGE_MODEL,
251204
+ "Efficient-Large-Model/Sana_Sprint_0.6B_1024px_diffusers",
251205
+ "SimianLuo/LCM_Dreamshaper_v7",
251206
+ "stabilityai/sd-turbo",
251207
+ "segmind/tiny-sd",
251208
+ "nota-ai/bk-sdm-tiny-2m"
251209
+ ];
250992
251210
  OLLAMA_IMAGE_MODELS = IMAGE_GENERATION_MODEL_PRESETS.filter((preset) => preset.backend === "ollama").map((preset) => preset.id);
250993
251211
  DIFFUSERS_RUNNER = String.raw`#!/usr/bin/env python3
250994
251212
  import argparse
@@ -251170,7 +251388,7 @@ if __name__ == "__main__":
251170
251388
  `;
251171
251389
  ImageGenerateTool = class {
251172
251390
  name = "generate_image";
251173
- description = "Generate an image from a text prompt using a local image-generation backend. Supports Ollama image models (x/z-image-turbo, x/flux2-klein), Python Diffusers models (SDXL Turbo default, FLUX.1 dev, SD3.5 Large, Tiny-SD, LCM, Sana Sprint), and stable-diffusion.cpp local checkpoints/GGUF. Saves a PNG under .omnius/images and returns the file path.";
251391
+ description = "Generate an image from a text prompt using a local image-generation backend. Supports Ollama image models (x/z-image-turbo, x/flux2-klein), Python Diffusers models (SDXL Turbo default, FLUX.1 dev, SD3.5 Large, Tiny-SD, LCM, Sana Sprint), and stable-diffusion.cpp local checkpoints/GGUF. When fallback is enabled, auto generation tries ranked high-quality candidates first, including official/traceable FLUX fallbacks for Black Forest Labs models, and then falls back to smaller models if setup, download, or generation fails. Saves a PNG under .omnius/images and returns the file path.";
251174
251392
  parameters = {
251175
251393
  type: "object",
251176
251394
  properties: {
@@ -251215,6 +251433,14 @@ if __name__ == "__main__":
251215
251433
  type: "string",
251216
251434
  enum: ["generate", "list_models", "setup"],
251217
251435
  description: "Optional utility action. Default is generate."
251436
+ },
251437
+ fallback: {
251438
+ type: "boolean",
251439
+ description: "Whether to try the ranked quality ladder if the selected model/backend fails. Defaults true."
251440
+ },
251441
+ strict_model: {
251442
+ type: "boolean",
251443
+ description: "When true, use only the requested model/backend and do not fall back. Defaults false."
251218
251444
  }
251219
251445
  },
251220
251446
  required: ["prompt"]
@@ -251257,7 +251483,7 @@ if __name__ == "__main__":
251257
251483
  if (action === "list_models") {
251258
251484
  return {
251259
251485
  success: true,
251260
- output: IMAGE_GENERATION_MODEL_PRESETS.map((preset2) => `${preset2.id} [${preset2.backend}] - ${preset2.note}`).join("\n"),
251486
+ output: IMAGE_GENERATION_MODEL_PRESETS.map((preset) => `${preset.id} [${preset.backend}] - ${preset.note}`).join("\n"),
251261
251487
  durationMs: performance.now() - start2
251262
251488
  };
251263
251489
  }
@@ -251281,19 +251507,8 @@ if __name__ == "__main__":
251281
251507
  const rawModel2 = args["model_path"] ? String(args["model_path"]) : args["model"] ? String(args["model"]) : this.defaultModel;
251282
251508
  const requestedModel2 = rawModel2 === "auto" ? void 0 : rawModel2;
251283
251509
  const requestedBackend2 = args["backend"] ? String(args["backend"]) : this.defaultBackend;
251284
- let backend = inferImageGenerationBackend(requestedModel2, requestedBackend2);
251285
- if (backend === "auto") {
251286
- backend = inferImageGenerationBackend(requestedModel2, void 0);
251287
- if (backend === "auto")
251288
- backend = "diffusers";
251289
- }
251290
- const model = requestedModel2 ?? (backend === "diffusers" ? DEFAULT_DIFFUSERS_IMAGE_MODEL : DEFAULT_OLLAMA_IMAGE_MODEL);
251291
- this.emitProgress({ stage: "setup", message: `Preparing image model ${model} (${backend})` });
251292
- if (backend === "ollama")
251293
- return await this.prewarmOllama({ model, start: start2 });
251294
- if (backend === "sdcpp")
251295
- return await this.prewarmSdCpp({ model, start: start2, python: args["python"] });
251296
- return await this.prewarmDiffusers({ model, start: start2, python: args["python"] });
251510
+ const candidates2 = imageGenerationFallbackCandidates(requestedModel2, requestedBackend2, generationFallbackEnabled(args));
251511
+ return await this.prewarmCandidateLadder({ candidates: candidates2, args, start: start2 });
251297
251512
  }
251298
251513
  const prompt = String(args["prompt"] ?? "").trim();
251299
251514
  if (!prompt) {
@@ -251302,31 +251517,10 @@ if __name__ == "__main__":
251302
251517
  const rawModel = args["model_path"] ? String(args["model_path"]) : args["model"] ? String(args["model"]) : this.defaultModel;
251303
251518
  const requestedModel = rawModel === "auto" ? void 0 : rawModel;
251304
251519
  const requestedBackend = args["backend"] ? String(args["backend"]) : this.defaultBackend;
251305
- const preset = getImageGenerationPreset(requestedModel);
251306
- const width = numberArg(args["width"], preset?.width ?? 1024);
251307
- const height = numberArg(args["height"], preset?.height ?? 1024);
251308
- const steps = optionalNumberArg(args["steps"]) ?? preset?.steps;
251309
- const guidance = optionalNumberArg(args["guidance"]) ?? preset?.guidance;
251310
251520
  const seed = optionalNumberArg(args["seed"]);
251521
+ const candidates = imageGenerationFallbackCandidates(requestedModel, requestedBackend, generationFallbackEnabled(args));
251311
251522
  try {
251312
- let backend = inferImageGenerationBackend(requestedModel, requestedBackend);
251313
- let model = requestedModel;
251314
- if (backend === "auto") {
251315
- backend = inferImageGenerationBackend(model, void 0);
251316
- if (backend === "auto")
251317
- backend = "diffusers";
251318
- }
251319
- if (!model) {
251320
- model = backend === "diffusers" ? DEFAULT_DIFFUSERS_IMAGE_MODEL : DEFAULT_OLLAMA_IMAGE_MODEL;
251321
- }
251322
- this.emitProgress({ stage: "setup", message: `Using image model ${model} (${backend})` });
251323
- if (backend === "ollama") {
251324
- return await this.generateWithOllama({ prompt, model, width, height, steps, start: start2 });
251325
- }
251326
- if (backend === "sdcpp") {
251327
- return await this.generateWithSdCpp({ prompt, model, width, height, steps, seed, start: start2, python: args["python"] });
251328
- }
251329
- return await this.generateWithDiffusers({ prompt, model, width, height, steps, guidance, seed, start: start2, python: args["python"] });
251523
+ return await this.generateCandidateLadder({ candidates, prompt, args, seed, start: start2 });
251330
251524
  } catch (err) {
251331
251525
  return {
251332
251526
  success: false,
@@ -251335,6 +251529,64 @@ if __name__ == "__main__":
251335
251529
  };
251336
251530
  }
251337
251531
  }
251532
+ async prewarmCandidateLadder(args) {
251533
+ const failed = [];
251534
+ for (let index = 0; index < args.candidates.length; index++) {
251535
+ const candidate = args.candidates[index];
251536
+ this.emitProgress({
251537
+ stage: "setup",
251538
+ message: `Preparing image model ${candidate.model} (${candidate.backend}) [${index + 1}/${args.candidates.length}]`
251539
+ });
251540
+ const result = candidate.backend === "ollama" ? await this.prewarmOllama({ model: candidate.model, start: args.start }) : candidate.backend === "sdcpp" ? await this.prewarmSdCpp({ model: candidate.model, start: args.start, python: args.args["python"] }) : await this.prewarmDiffusers({ model: candidate.model, start: args.start, python: args.args["python"] });
251541
+ if (result.success)
251542
+ return annotateImageFallbackSuccess(result, failed, candidate);
251543
+ failed.push({ candidate, reason: summarizeToolResult(result) });
251544
+ if (index < args.candidates.length - 1) {
251545
+ this.emitProgress({
251546
+ stage: "setup",
251547
+ message: `${candidate.model} failed; trying ${args.candidates[index + 1].model}`
251548
+ });
251549
+ }
251550
+ }
251551
+ const output = formatImageFallbackFailure(failed);
251552
+ return {
251553
+ success: false,
251554
+ output,
251555
+ error: output,
251556
+ durationMs: performance.now() - args.start
251557
+ };
251558
+ }
251559
+ async generateCandidateLadder(args) {
251560
+ const failed = [];
251561
+ for (let index = 0; index < args.candidates.length; index++) {
251562
+ const candidate = args.candidates[index];
251563
+ const width = numberArg(args.args["width"], candidate.preset?.width ?? 1024);
251564
+ const height = numberArg(args.args["height"], candidate.preset?.height ?? 1024);
251565
+ const steps = optionalNumberArg(args.args["steps"]) ?? candidate.preset?.steps;
251566
+ const guidance = optionalNumberArg(args.args["guidance"]) ?? candidate.preset?.guidance;
251567
+ this.emitProgress({
251568
+ stage: "setup",
251569
+ message: `Using image model ${candidate.model} (${candidate.backend}) [${index + 1}/${args.candidates.length}]`
251570
+ });
251571
+ const result = candidate.backend === "ollama" ? await this.generateWithOllama({ prompt: args.prompt, model: candidate.model, width, height, steps, start: args.start }) : candidate.backend === "sdcpp" ? await this.generateWithSdCpp({ prompt: args.prompt, model: candidate.model, width, height, steps, seed: args.seed, start: args.start, python: args.args["python"] }) : await this.generateWithDiffusers({ prompt: args.prompt, model: candidate.model, width, height, steps, guidance, seed: args.seed, start: args.start, python: args.args["python"] });
251572
+ if (result.success)
251573
+ return annotateImageFallbackSuccess(result, failed, candidate);
251574
+ failed.push({ candidate, reason: summarizeToolResult(result) });
251575
+ if (index < args.candidates.length - 1) {
251576
+ this.emitProgress({
251577
+ stage: "setup",
251578
+ message: `${candidate.model} failed; falling back to ${args.candidates[index + 1].model}`
251579
+ });
251580
+ }
251581
+ }
251582
+ const output = formatImageFallbackFailure(failed);
251583
+ return {
251584
+ success: false,
251585
+ output,
251586
+ error: output,
251587
+ durationMs: performance.now() - args.start
251588
+ };
251589
+ }
251338
251590
  async prewarmOllama(args) {
251339
251591
  const model = args.model || DEFAULT_OLLAMA_IMAGE_MODEL;
251340
251592
  if (await this.ollamaHasModel(model)) {
@@ -251800,7 +252052,7 @@ ${errText.slice(0, 800)}`,
251800
252052
  });
251801
252053
 
251802
252054
  // packages/execution/dist/tools/audio-generate.js
251803
- import { spawn as spawn10 } from "node:child_process";
252055
+ import { execFileSync as execFileSync2, spawn as spawn10 } from "node:child_process";
251804
252056
  import { existsSync as existsSync24, readdirSync as readdirSync10, statSync as statSync9 } from "node:fs";
251805
252057
  import { chmod as chmod4, mkdir as mkdir12, writeFile as writeFile17 } from "node:fs/promises";
251806
252058
  import { join as join37 } from "node:path";
@@ -251824,13 +252076,63 @@ function backendPackages(backend) {
251824
252076
  return TANGOFLUX_PACKAGES;
251825
252077
  return DIFFUSERS_AUDIO_PACKAGES;
251826
252078
  }
252079
/**
 * Query `nvidia-smi` for the compute capability of the first listed GPU.
 *
 * Runs `nvidia-smi --query-gpu=compute_cap,name` with a 5 second timeout and
 * parses the first non-empty output line as "major.minor, name". Any failure
 * to run the command or parse the line yields null; this is deliberately
 * best-effort.
 *
 * @returns {{ major: number, minor: number, name: string | undefined } | null}
 *   Parsed capability, or null when it cannot be determined.
 */
function detectLegacyCudaComputeCapability() {
  try {
    const raw = execFileSync2("nvidia-smi", ["--query-gpu=compute_cap,name", "--format=csv,noheader,nounits"], {
      encoding: "utf8",
      timeout: 5e3,
      stdio: ["ignore", "pipe", "ignore"]
    });
    let firstLine;
    for (const line of raw.trim().split(/\r?\n/)) {
      const trimmed = line.trim();
      if (trimmed) {
        firstLine = trimmed;
        break;
      }
    }
    const parts = firstLine === void 0 ? null : firstLine.match(/^(\d+)\.(\d+)\s*,?\s*(.*)$/);
    if (!parts) {
      return null;
    }
    const [, majorText, minorText, label] = parts;
    const major = Number(majorText);
    const minor = Number(minorText);
    if (!(Number.isFinite(major) && Number.isFinite(minor))) {
      return null;
    }
    return { major, minor, name: label?.trim() || void 0 };
  } catch {
    return null;
  }
}
252099
/**
 * Report whether a CUDA compute capability is below 7.5.
 *
 * @param {number} major - Major capability number.
 * @param {number} minor - Minor capability number.
 * @returns {boolean} True for any capability lower than 7.5.
 */
function isLegacyCudaCapability(major, minor) {
  if (major < 7) {
    return true;
  }
  return major === 7 && minor < 5;
}
252102
/**
 * Choose the pip arguments used to install torch and torchaudio.
 *
 * Order of precedence:
 *  1. `OMNIUS_AUDIO_TORCH_INDEX_URL`, when set, is used as the index URL.
 *  2. `forceLegacyCuda` pins torch/torchaudio 2.3.1 from the cu118 index.
 *  3. On linux/x64, a GPU detected below capability 7.5 gets the same pin.
 *  4. Otherwise plain `torch torchaudio`.
 *
 * @param {boolean} [forceLegacyCuda=false] - Force the legacy CUDA pin.
 * @returns {{ args: string[], description: string }} Pip arguments and a
 *   human-readable explanation of the choice.
 */
function torchInstallPlan(forceLegacyCuda = false) {
  const indexOverride = process.env["OMNIUS_AUDIO_TORCH_INDEX_URL"];
  if (indexOverride) {
    return {
      args: ["torch", "torchaudio", "--index-url", indexOverride],
      description: `env override ${indexOverride}`
    };
  }

  const legacyPlan = (description) => ({
    args: ["torch==2.3.1", "torchaudio==2.3.1", "--index-url", "https://download.pytorch.org/whl/cu118"],
    description
  });
  if (forceLegacyCuda) {
    return legacyPlan("runtime-detected legacy CUDA GPU; using PyTorch 2.3.1 cu118 to avoid cuDNN 9 incompatibility");
  }

  const isLinuxX64 = process.platform === "linux" && process.arch === "x64";
  const gpu = isLinuxX64 ? detectLegacyCudaComputeCapability() : null;
  if (gpu && isLegacyCudaCapability(gpu.major, gpu.minor)) {
    const gpuLabel = gpu.name ? ` ${gpu.name}` : "";
    return legacyPlan(`CUDA legacy GPU SM ${gpu.major}.${gpu.minor}${gpuLabel}; using PyTorch 2.3.1 cu118 to avoid cuDNN 9 incompatibility`);
  }
  return { args: ["torch", "torchaudio"], description: "default PyTorch wheel selection" };
}
252126
/**
 * Drop torch and torchaudio requirements from a pip package list.
 *
 * Bare names are removed, and so are entries that carry a version
 * specifier, extras, marker or direct URL (e.g. "torch==2.3.1",
 * "torchaudio>=2", "torch[extra]"). The match is case-insensitive. Other
 * distributions whose names merely start with "torch" (torchvision,
 * torch-tensorrt, ...) are kept.
 *
 * @param {string[]} packages - Pip requirement strings.
 * @returns {string[]} New list without torch/torchaudio requirements.
 */
function withoutTorchPackages(packages) {
  // The name must be followed by end-of-string or a character that starts a
  // specifier, extras, marker, or URL, so longer names are not caught.
  const torchRequirement = /^\s*(?:torch|torchaudio)\s*(?:$|[<>=!~\[;@])/i;
  return packages.filter((pkg) => !torchRequirement.test(String(pkg)));
}
251827
252129
  function backendImportCheck(backend) {
251828
252130
  if (backend === "transformers")
251829
252131
  return "import torch, torchaudio, transformers, scipy\nfrom transformers import AutoProcessor, MusicgenForConditionalGeneration\n";
251830
252132
  if (backend === "audiocraft")
251831
252133
  return "import torch, torchaudio, audiocraft\nfrom audiocraft.models import MusicGen, AudioGen\n";
251832
252134
  if (backend === "stable-audio")
251833
- return "import torch, torchaudio, stable_audio_tools\n";
252135
+ return "import torch, torchaudio, diffusers, scipy\nfrom diffusers import StableAudioPipeline\n";
251834
252136
  if (backend === "tangoflux")
251835
252137
  return "import torch, torchaudio\nfrom tangoflux import TangoFluxInference\n";
251836
252138
  return "import torch, diffusers, scipy\nfrom diffusers import AudioLDMPipeline\n";
@@ -252022,6 +252324,69 @@ async function pythonCanImport2(command, code8, repoRoot, env2) {
252022
252324
  async function pythonImportResult(command, code8, repoRoot, env2) {
252023
252325
  return await runProcess3(command, ["-c", code8], { cwd: repoRoot, timeoutMs: 6e4, env: env2 });
252024
252326
  }
252327
/**
 * Probe the installed PyTorch runtime by running a small Python script.
 *
 * The script prints a JSON payload (torch version, CUDA availability and,
 * when CUDA is available, device capability / cuDNN version / device name).
 * It exits with status 42 when cuDNN >= 9 (version >= 90000) is paired with
 * a GPU whose compute capability is below 7.5; any other failure (for
 * example torch not importable) surfaces as whatever exit status Python
 * produces.
 *
 * @param {string} command Python executable to invoke.
 * @param {string} repoRoot Working directory for the child process.
 * @param {object} env2 Environment passed to the child process.
 * @returns {Promise<object>} The result object produced by runProcess3.
 */
async function torchRuntimeCompatibilityResult(command, repoRoot, env2) {
  // Python is indentation-sensitive: the nested `if` body must be indented
  // deeper than its header, otherwise the interpreter rejects the script.
  const code8 = [
    "import json, sys",
    "import torch",
    "payload={'torch': getattr(torch, '__version__', '?'), 'cuda_available': bool(torch.cuda.is_available())}",
    "if torch.cuda.is_available():",
    "    cap=torch.cuda.get_device_capability(0)",
    "    cudnn=torch.backends.cudnn.version() or 0",
    "    payload.update({'capability': list(cap), 'cudnn': int(cudnn), 'device': torch.cuda.get_device_name(0)})",
    "    if int(cudnn) >= 90000 and tuple(cap) < (7, 5):",
    "        print(json.dumps(payload))",
    "        raise SystemExit(42)",
    "print(json.dumps(payload))"
  ].join("\n");
  return await runProcess3(command, ["-c", code8], { cwd: repoRoot, timeoutMs: 6e4, env: env2 });
}
252343
/**
 * Force-reinstall PyTorch into the given Python environment using the
 * install plan returned by torchInstallPlan(forceLegacyCuda).
 *
 * @param {string} command Python executable to invoke.
 * @param {string} repoRoot Working directory for pip.
 * @param {object} env2 Environment passed to pip.
 * @param {boolean} [forceLegacyCuda=false] Forwarded to torchInstallPlan.
 * @param {Function} [onProgress] Optional progress callback.
 * @throws {Error} When pip exits with a non-zero status.
 */
async function repairTorchRuntime(command, repoRoot, env2, forceLegacyCuda = false, onProgress) {
  const plan = torchInstallPlan(forceLegacyCuda);
  const what = plan.description;
  if (onProgress !== undefined && onProgress !== null) {
    onProgress({ stage: "setup", message: `Installing PyTorch runtime: ${what}` });
  }
  const pipArgs = [
    "-m", "pip", "install",
    "--progress-bar", "on",
    "--prefer-binary",
    "--force-reinstall",
    ...plan.args
  ];
  const pipOptions = {
    cwd: repoRoot,
    timeoutMs: 18e5,
    env: env2,
    progressLabel: `Installing PyTorch runtime (${what})`,
    onProgress
  };
  const result = await runProcess3(command, pipArgs, pipOptions);
  if (result.code === 0) {
    return;
  }
  const details = trimProcessText2(result.stderr || result.stdout);
  throw new Error(`Failed to install compatible PyTorch runtime (${what}).\n${details}`);
}
252367
/**
 * Make sure the Python environment has a PyTorch runtime that passes
 * torchRuntimeCompatibilityResult, reinstalling it when necessary.
 *
 * Flow:
 *  1. Probe. Exit status 0 means nothing to do.
 *  2. Reinstall once: with the legacy-CUDA plan when the probe exited 42
 *     (cuDNN 9 on a pre-SM 7.5 GPU), otherwise with the default plan.
 *  3. Probe again. If the result is 42 and the legacy plan has not been
 *     tried yet, reinstall with the legacy plan and probe a final time.
 *
 * The legacy reinstall is attempted at most once: repeating an identical
 * forced reinstall cannot change the outcome and only costs another long
 * pip run.
 *
 * @param {string} command Python executable to invoke.
 * @param {string} repoRoot Working directory for child processes.
 * @param {object} env2 Environment passed to child processes.
 * @param {Function} [onProgress] Optional progress callback.
 * @throws {Error} When the runtime is still unusable after repair.
 */
async function ensureCompatibleTorchRuntime(command, repoRoot, env2, onProgress) {
  // Exit status the probe script uses for "cuDNN 9 on a legacy GPU".
  const LEGACY_CUDA_EXIT_CODE = 42;
  const probe = () => torchRuntimeCompatibilityResult(command, repoRoot, env2);

  const existing = await probe();
  if (existing.code === 0)
    return;

  const legacyAlreadyForced = existing.code === LEGACY_CUDA_EXIT_CODE;
  await repairTorchRuntime(command, repoRoot, env2, legacyAlreadyForced, onProgress);

  let latest = await probe();
  if (latest.code === 0)
    return;

  if (latest.code === LEGACY_CUDA_EXIT_CODE) {
    if (!legacyAlreadyForced) {
      await repairTorchRuntime(command, repoRoot, env2, true, onProgress);
      latest = await probe();
      if (latest.code === 0)
        return;
    }
    throw new Error(`Audio-generation PyTorch runtime remains incompatible after cu118 repair.\n${trimProcessText2(latest.stderr || latest.stdout)}`);
  }
  throw new Error(`Audio-generation PyTorch runtime could not be prepared.\n${trimProcessText2(latest.stderr || latest.stdout)}`);
}
252025
252390
  function formatAudioSetupFailure(backend, text) {
252026
252391
  const body = trimProcessText2(text);
252027
252392
  const lowered = text.toLowerCase();
@@ -252032,6 +252397,9 @@ function formatAudioSetupFailure(backend, text) {
252032
252397
  if (lowered.includes("cuda") && lowered.includes("not available")) {
252033
252398
  notes2.push("CUDA was not available to the selected Python environment; install a Torch build matching this machine's CUDA runtime or use CPU-compatible settings.");
252034
252399
  }
252400
+ if (lowered.includes("cudnn version") && lowered.includes("sm < 7.5")) {
252401
+ notes2.push("The installed PyTorch wheel uses cuDNN 9 on a legacy CUDA GPU. Omnius now repairs audio-generation venvs by reinstalling PyTorch 2.3.1 from the cu118 index for SM < 7.5 hardware.");
252402
+ }
252035
252403
  return [body, ...notes2.map((note) => `
252036
252404
  ${note}`)].filter(Boolean).join("");
252037
252405
  }
@@ -252060,9 +252428,13 @@ ${trimProcessText2(created.stderr || created.stdout)}`);
252060
252428
  }
252061
252429
  }
252062
252430
  if (await pythonCanImport2(command, backendImportCheck(backend), repoRoot, pythonEnv)) {
252063
- return { command, env: pythonEnv };
252431
+ await ensureCompatibleTorchRuntime(command, repoRoot, pythonEnv, onProgress);
252432
+ if (await pythonCanImport2(command, backendImportCheck(backend), repoRoot, pythonEnv)) {
252433
+ return { command, env: pythonEnv };
252434
+ }
252064
252435
  }
252065
252436
  const packages = backendPackages(backend);
252437
+ await ensureCompatibleTorchRuntime(command, repoRoot, pythonEnv, onProgress);
252066
252438
  onProgress?.({ stage: "setup", message: `Installing ${backend} audio-generation Python packages` });
252067
252439
  const pipArgs = [
252068
252440
  "-m",
@@ -252074,7 +252446,7 @@ ${trimProcessText2(created.stderr || created.stdout)}`);
252074
252446
  ...backend === "audiocraft" ? ["--only-binary", "av"] : [],
252075
252447
  "-U",
252076
252448
  "pip",
252077
- ...packages
252449
+ ...withoutTorchPackages(packages)
252078
252450
  ];
252079
252451
  const pip = await runProcess3(command, pipArgs, {
252080
252452
  cwd: repoRoot,
@@ -252091,6 +252463,12 @@ ${formatAudioSetupFailure(backend, pip.stderr || pip.stdout)}`);
252091
252463
  if (importCheck.code !== 0) {
252092
252464
  throw new Error(`Audio-generation Python environment at ${venvDir} was created, but required ${backend} imports still fail.
252093
252465
  ${formatAudioSetupFailure(backend, importCheck.stderr || importCheck.stdout)}`);
252466
+ }
252467
+ await ensureCompatibleTorchRuntime(command, repoRoot, pythonEnv, onProgress);
252468
+ if (!await pythonCanImport2(command, backendImportCheck(backend), repoRoot, pythonEnv)) {
252469
+ const retry = await pythonImportResult(command, backendImportCheck(backend), repoRoot, pythonEnv);
252470
+ throw new Error(`Audio-generation Python environment at ${venvDir} lost required ${backend} imports after PyTorch repair.
252471
+ ${formatAudioSetupFailure(backend, retry.stderr || retry.stdout)}`);
252094
252472
  }
252095
252473
  return { command, env: pythonEnv };
252096
252474
  }
@@ -252160,11 +252538,31 @@ function playbackRequested(args) {
252160
252538
  return false;
252161
252539
  return true;
252162
252540
  }
252541
/**
 * Interpret a loosely-typed argument as a boolean.
 *
 * Booleans are returned unchanged. Strings are trimmed and matched
 * case-insensitively: 1/true/yes/on give true, 0/false/no/off give false.
 * Everything else (including unrecognised strings) yields `fallback`.
 *
 * @param {*} value2 Raw argument value.
 * @param {*} fallback Value returned when `value2` cannot be interpreted.
 */
function booleanArg2(value2, fallback) {
  switch (typeof value2) {
    case "boolean":
      return value2;
    case "string": {
      const text = value2.trim();
      if (/^(1|true|yes|on)$/i.test(text)) {
        return true;
      }
      if (/^(0|false|no|off)$/i.test(text)) {
        return false;
      }
      return fallback;
    }
    default:
      return fallback;
  }
}
252552
/**
 * Decide whether the fallback ladder may be used for this tool call.
 *
 * A truthy strict flag (`strict_model`, `strictModel` or `strict`) disables
 * fallback outright. Otherwise the first present of `fallback`,
 * `allow_fallback`, `allowFallback` decides, defaulting to enabled.
 *
 * @param {object} args Tool-call arguments.
 * @returns {boolean} True when fallback candidates may be tried.
 */
function generationFallbackEnabled2(args) {
  const strictFlag = args["strict_model"] ?? args["strictModel"] ?? args["strict"];
  const strictRequested = booleanArg2(strictFlag, false);
  if (strictRequested) {
    return false;
  }
  const fallbackFlag = args["fallback"] ?? args["allow_fallback"] ?? args["allowFallback"];
  return booleanArg2(fallbackFlag, true);
}
252163
252557
  function getAudioGenerationPreset(model, kind) {
252164
252558
  if (!model)
252165
252559
  return void 0;
252166
252560
  return AUDIO_GENERATION_MODEL_PRESETS.find((preset) => preset.id === model && (!kind || preset.kind === kind)) ?? AUDIO_GENERATION_MODEL_PRESETS.find((preset) => preset.id === model);
252167
252561
  }
252562
/**
 * Resolve the quality ladder for a kind into preset objects.
 *
 * Uses MUSIC_GENERATION_QUALITY_LADDER for "music" and
 * SOUND_GENERATION_QUALITY_LADDER for anything else; ids with no matching
 * preset are dropped, and ladder order is preserved.
 *
 * @param {string} kind Generation kind.
 * @returns {object[]} Presets in ladder order.
 */
function audioGenerationQualityLadder(kind) {
  const ladderIds = kind === "music" ? MUSIC_GENERATION_QUALITY_LADDER : SOUND_GENERATION_QUALITY_LADDER;
  const presets = [];
  for (const id of ladderIds) {
    const preset = getAudioGenerationPreset(id, kind);
    if (preset) {
      presets.push(preset);
    }
  }
  return presets;
}
252168
252566
  function inferAudioGenerationBackend(model, requested) {
252169
252567
  if (requested && requested !== "auto") {
252170
252568
  if (requested === "diffusers" || requested === "transformers" || requested === "audiocraft" || requested === "stable-audio" || requested === "tangoflux" || requested === "project")
@@ -252188,6 +252586,41 @@ function inferAudioGenerationBackend(model, requested) {
252188
252586
  return "project";
252189
252587
  return "diffusers";
252190
252588
  }
252589
/**
 * Build a ladder candidate record for one model.
 *
 * The backend comes from inferAudioGenerationBackend; an "auto" result is
 * resolved to "transformers" for music and "diffusers" otherwise.
 *
 * @param {string} kind Generation kind.
 * @param {string} model Model id.
 * @param {string} [requestedBackend] Backend requested by the caller.
 * @returns {{kind: string, model: string, backend: string, preset: (object|undefined)}}
 */
function audioCandidateFor(kind, model, requestedBackend) {
  let backend = inferAudioGenerationBackend(model, requestedBackend);
  if (backend === "auto") {
    backend = kind === "music" ? "transformers" : "diffusers";
  }
  const preset = getAudioGenerationPreset(model, kind);
  return { kind, model, backend, preset };
}
252599
/**
 * Build the ordered, de-duplicated list of candidates to try.
 *
 * The primary candidate is, in priority order: the requested model; the
 * first ladder preset for an explicitly requested backend (or the default
 * model if the ladder has none); or, when fallback is disabled, the default
 * model. With fallback enabled, the ladder is appended starting at the
 * primary's position (the whole ladder when the primary is not on it).
 * Candidates are unique per kind/backend/model combination.
 *
 * @param {string} kind Generation kind.
 * @param {string} [requestedModel] Model id requested by the caller.
 * @param {string} [requestedBackend] Backend requested by the caller.
 * @param {boolean} [allowFallback=true] Whether to append ladder fallbacks.
 * @returns {object[]} Candidates in the order they should be attempted.
 */
function audioGenerationFallbackCandidates(kind, requestedModel, requestedBackend, allowFallback = true) {
  const ladder = audioGenerationQualityLadder(kind);
  const candidates = [];
  const seenKeys = new Set();
  const pushUnique = (candidate) => {
    const key = `${candidate.kind}:${candidate.backend}:${candidate.model}`;
    if (seenKeys.has(key)) {
      return;
    }
    seenKeys.add(key);
    candidates.push(candidate);
  };
  const defaultModel = () => (kind === "music" ? DEFAULT_MUSIC_MODEL : DEFAULT_SOUND_MODEL);
  const backendPinned = Boolean(requestedBackend) && requestedBackend !== "auto";
  const usesPinnedBackend = (preset) => preset.backend === requestedBackend;

  if (requestedModel) {
    pushUnique(audioCandidateFor(kind, requestedModel, requestedBackend));
  } else if (backendPinned) {
    const firstForBackend = ladder.find(usesPinnedBackend);
    pushUnique(audioCandidateFor(kind, firstForBackend?.id ?? defaultModel(), requestedBackend));
  } else if (!allowFallback) {
    pushUnique(audioCandidateFor(kind, defaultModel(), requestedBackend));
  }

  if (!allowFallback) {
    if (candidates.length) {
      return candidates;
    }
    return [audioCandidateFor(kind, defaultModel(), requestedBackend)];
  }

  let primaryIndex = 0;
  if (requestedModel) {
    primaryIndex = ladder.findIndex((preset) => preset.id === requestedModel);
  } else if (backendPinned) {
    primaryIndex = ladder.findIndex(usesPinnedBackend);
  }
  const fallbackTail = primaryIndex >= 0 ? ladder.slice(primaryIndex) : ladder;
  fallbackTail.forEach((preset) => {
    pushUnique(audioCandidateFor(kind, preset.id));
  });
  return candidates;
}
252191
252624
  function audioGenerationSetupPlan(kind, backend, repoRoot = ".", model) {
252192
252625
  const commandName = kind === "music" ? "music" : "sound";
252193
252626
  const fallback = kind === "music" ? DEFAULT_MUSIC_MODEL : DEFAULT_SOUND_MODEL;
@@ -252261,6 +252694,7 @@ function audioGenerationSetupPlan(kind, backend, repoRoot = ".", model) {
252261
252694
  ],
252262
252695
  notes: [
252263
252696
  "Use this path for Stable Audio Open 1.0, the serious stereo audio/music baseline.",
252697
+ "Omnius uses Diffusers StableAudioPipeline here; stable-audio-tools is intentionally not installed because it often pulls build-from-source dependencies.",
252264
252698
  "Expect larger model downloads and higher VRAM pressure than AudioLDM or MusicGen small."
252265
252699
  ]
252266
252700
  };
@@ -252296,7 +252730,34 @@ function audioGenerationSetupPlan(kind, backend, repoRoot = ".", model) {
252296
252730
  ]
252297
252731
  };
252298
252732
  }
252299
- var DEFAULT_SOUND_MODEL, DEFAULT_MUSIC_MODEL, DIFFUSERS_AUDIO_PACKAGES, TRANSFORMERS_AUDIO_PACKAGES, AUDIOCRAFT_PACKAGES, STABLE_AUDIO_PACKAGES, TANGOFLUX_PACKAGES, AUDIO_GENERATION_MODEL_PRESETS, DIFFUSERS_AUDIO_RUNNER, AUDIOCRAFT_RUNNER, TRANSFORMERS_AUDIO_RUNNER, TANGOFLUX_RUNNER, AudioGenerateTool;
252733
/**
 * Reduce a tool result to a single-line failure summary.
 *
 * Takes `result.error`, else `result.output`, else "unknown error", passes
 * it through trimProcessText2 with a limit of 700, then collapses every
 * whitespace run to one space and trims the ends.
 *
 * @param {object} result Tool result with optional `error` / `output`.
 * @returns {string} One-line summary.
 */
function summarizeToolResult2(result) {
  const rawText = String(result.error || result.output || "unknown error");
  const shortened = trimProcessText2(rawText, 700);
  const singleLine = shortened.replace(/\s+/g, " ");
  return singleLine.trim();
}
252736
/**
 * Render one failed attempt as a numbered line: "<n>. <model> [<backend>] - <reason>".
 *
 * @param {{model: string, backend: string}} candidate Attempted candidate.
 * @param {string} reason Why the attempt failed.
 * @param {number} index Zero-based position; displayed one-based.
 * @returns {string} The formatted line.
 */
function formatAudioAttempt(candidate, reason, index) {
  const ordinal = index + 1;
  const { model, backend } = candidate;
  return `${ordinal}. ${model} [${backend}] - ${reason}`;
}
252739
/**
 * Build the multi-line message reported when every ladder candidate failed.
 *
 * @param {string} kind Generation kind, interpolated into the headline.
 * @param {Array<{candidate: object, reason: string}>} failed Failed attempts in the order tried.
 * @returns {string} Headline, sub-heading and one line per attempt, joined by newlines.
 */
function formatAudioFallbackFailure(kind, failed) {
  const attemptLines = failed.map((attempt, index) => {
    const line = formatAudioAttempt(attempt.candidate, attempt.reason, index);
    return ` ${line}`;
  });
  const report = [
    `No ${kind} generation model in the fallback ladder completed successfully.`,
    "Attempted, highest quality to lowest:"
  ].concat(attemptLines);
  return report.join("\n");
}
252746
/**
 * Prefix a successful result with a note about earlier failed candidates.
 *
 * With no failed attempts the original result object is returned as-is.
 * Otherwise a shallow copy is returned whose `output` starts with the
 * winning model, the number of failures and one line per failed attempt.
 *
 * @param {object} result Successful tool result.
 * @param {Array<{candidate: object, reason: string}>} failed Attempts that failed before the winner.
 * @param {{model: string, backend: string}} winner Candidate that succeeded.
 * @returns {object} `result` itself, or an annotated copy.
 */
function annotateAudioFallbackSuccess(result, failed, winner) {
  if (failed.length === 0) {
    return result;
  }
  const headerLines = [
    `Fallback ladder succeeded with ${winner.model} [${winner.backend}] after ${failed.length} failed attempt(s).`,
    "Failed attempts:"
  ];
  const attemptLines = failed.map((attempt, index) => ` ${formatAudioAttempt(attempt.candidate, attempt.reason, index)}`);
  // The trailing empty entry makes the joined prefix end with a newline.
  const prefix = [...headerLines, ...attemptLines, ""].join("\n");
  return { ...result, output: prefix + result.output };
}
252760
+ var DEFAULT_SOUND_MODEL, DEFAULT_MUSIC_MODEL, DIFFUSERS_AUDIO_PACKAGES, TRANSFORMERS_AUDIO_PACKAGES, AUDIOCRAFT_PACKAGES, STABLE_AUDIO_PACKAGES, TANGOFLUX_PACKAGES, AUDIO_GENERATION_MODEL_PRESETS, SOUND_GENERATION_QUALITY_LADDER, MUSIC_GENERATION_QUALITY_LADDER, DIFFUSERS_AUDIO_RUNNER, AUDIOCRAFT_RUNNER, TRANSFORMERS_AUDIO_RUNNER, TANGOFLUX_RUNNER, AudioGenerateTool;
252300
252761
  var init_audio_generate = __esm({
252301
252762
  "packages/execution/dist/tools/audio-generate.js"() {
252302
252763
  "use strict";
@@ -252338,7 +252799,6 @@ var init_audio_generate = __esm({
252338
252799
  "accelerate",
252339
252800
  "scipy",
252340
252801
  "soundfile",
252341
- "stable-audio-tools",
252342
252802
  "einops"
252343
252803
  ];
252344
252804
  TANGOFLUX_PACKAGES = [
@@ -252644,6 +253104,21 @@ var init_audio_generate = __esm({
252644
253104
  note: "Legacy specialized music-generation path."
252645
253105
  }
252646
253106
  ];
253107
+ SOUND_GENERATION_QUALITY_LADDER = [
253108
+ "stabilityai/stable-audio-open-1.0",
253109
+ "cvssp/audioldm2-large",
253110
+ "cvssp/audioldm2",
253111
+ "facebook/audiogen-medium",
253112
+ "declare-lab/TangoFlux",
253113
+ DEFAULT_SOUND_MODEL
253114
+ ];
253115
+ MUSIC_GENERATION_QUALITY_LADDER = [
253116
+ "stabilityai/stable-audio-open-1.0",
253117
+ "facebook/musicgen-stereo-large",
253118
+ "facebook/musicgen-large",
253119
+ "facebook/musicgen-medium",
253120
+ DEFAULT_MUSIC_MODEL
253121
+ ];
252647
253122
  DIFFUSERS_AUDIO_RUNNER = String.raw`#!/usr/bin/env python3
252648
253123
  import argparse, json, sys, time
252649
253124
  from pathlib import Path
@@ -252685,6 +253160,10 @@ def _snapshot_model(repo_id):
252685
253160
  def _device():
252686
253161
  import torch
252687
253162
  if torch.cuda.is_available():
253163
+ cap = torch.cuda.get_device_capability(0)
253164
+ cudnn = torch.backends.cudnn.version() or 0
253165
+ if int(cudnn) >= 90000 and tuple(cap) < (7, 5):
253166
+ raise RuntimeError(f"PyTorch cuDNN {cudnn} is incompatible with CUDA device {torch.cuda.get_device_name(0)} SM {cap[0]}.{cap[1]}; recreate the audio venv or let Omnius repair it with a cu118-compatible Torch wheel")
252688
253167
  return "cuda"
252689
253168
  if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
252690
253169
  return "mps"
@@ -252877,6 +253356,10 @@ def _snapshot_model(repo_id):
252877
253356
  def _device():
252878
253357
  import torch
252879
253358
  if torch.cuda.is_available():
253359
+ cap = torch.cuda.get_device_capability(0)
253360
+ cudnn = torch.backends.cudnn.version() or 0
253361
+ if int(cudnn) >= 90000 and tuple(cap) < (7, 5):
253362
+ raise RuntimeError(f"PyTorch cuDNN {cudnn} is incompatible with CUDA device {torch.cuda.get_device_name(0)} SM {cap[0]}.{cap[1]}; recreate the audio venv or let Omnius repair it with a cu118-compatible Torch wheel")
252880
253363
  return "cuda"
252881
253364
  if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
252882
253365
  return "mps"
@@ -253030,7 +253513,7 @@ if __name__ == "__main__":
253030
253513
  `;
253031
253514
  AudioGenerateTool = class {
253032
253515
  name = "generate_audio";
253033
- description = "Generate a sound effect or music clip from a text prompt using local audio-generation backends. Supports Diffusers AudioLDM/AudioLDM2, Transformers MusicGen, AudioCraft AudioGen, Stable Audio Open deployment paths, and explicit research-project profiles. Saves WAV files under .omnius/audio and returns the file path.";
253516
+ description = "Generate a sound effect or music clip from a text prompt using local audio-generation backends. Supports Diffusers AudioLDM/AudioLDM2, Transformers MusicGen, AudioCraft AudioGen, Stable Audio Open deployment paths, and explicit research-project profiles. When fallback is enabled, auto generation tries ranked high-quality candidates first and gracefully falls back to smaller models if setup, download, or generation fails. Saves WAV files under .omnius/audio and returns the file path.";
253034
253517
  parameters = {
253035
253518
  type: "object",
253036
253519
  properties: {
@@ -253044,6 +253527,14 @@ if __name__ == "__main__":
253044
253527
  playback: {
253045
253528
  type: "boolean",
253046
253529
  description: "Whether the TUI should play generated audio after saving it. Defaults true; set false for silent generation."
253530
+ },
253531
+ fallback: {
253532
+ type: "boolean",
253533
+ description: "Whether to try the ranked quality ladder if the selected model/backend fails. Defaults true."
253534
+ },
253535
+ strict_model: {
253536
+ type: "boolean",
253537
+ description: "When true, use only the requested model/backend and do not fall back. Defaults false."
253047
253538
  }
253048
253539
  },
253049
253540
  required: ["prompt"]
@@ -253147,14 +253638,14 @@ if __name__ == "__main__":
253147
253638
  if (action === "list_models") {
253148
253639
  return {
253149
253640
  success: true,
253150
- output: AUDIO_GENERATION_MODEL_PRESETS.filter((preset2) => preset2.kind === kind).map((preset2) => `${preset2.id} [${preset2.backend}] - ${preset2.note}`).join("\n"),
253641
+ output: AUDIO_GENERATION_MODEL_PRESETS.filter((preset) => preset.kind === kind).map((preset) => `${preset.id} [${preset.backend}] - ${preset.note}`).join("\n"),
253151
253642
  durationMs: performance.now() - start2
253152
253643
  };
253153
253644
  }
253154
253645
  if (action === "setup") {
253155
253646
  const requested = String(args["backend"] ?? (kind === "music" ? this.defaults.musicBackend : this.defaults.soundBackend) ?? (kind === "music" ? "transformers" : "diffusers"));
253156
- const backend2 = inferAudioGenerationBackend(typeof args["model"] === "string" ? args["model"] : void 0, requested);
253157
- const resolvedBackend = backend2 === "auto" ? kind === "music" ? "transformers" : "diffusers" : backend2;
253647
+ const backend = inferAudioGenerationBackend(typeof args["model"] === "string" ? args["model"] : void 0, requested);
253648
+ const resolvedBackend = backend === "auto" ? kind === "music" ? "transformers" : "diffusers" : backend;
253158
253649
  const plan = audioGenerationSetupPlan(kind, resolvedBackend, this.cwd, typeof args["model"] === "string" ? args["model"] : void 0);
253159
253650
  return {
253160
253651
  success: true,
@@ -253173,37 +253664,9 @@ if __name__ == "__main__":
253173
253664
  const defaultBackend2 = kind === "music" ? this.defaults.musicBackend : this.defaults.soundBackend;
253174
253665
  const rawModel2 = args["model"] ? String(args["model"]) : defaultModel2;
253175
253666
  const requestedModel2 = rawModel2 === "auto" ? void 0 : rawModel2;
253176
- let backend2 = inferAudioGenerationBackend(requestedModel2, args["backend"] ? String(args["backend"]) : defaultBackend2);
253177
- if (backend2 === "auto")
253178
- backend2 = kind === "music" ? "transformers" : "diffusers";
253179
- const model2 = requestedModel2 ?? (kind === "music" ? DEFAULT_MUSIC_MODEL : DEFAULT_SOUND_MODEL);
253180
- const preset2 = getAudioGenerationPreset(model2, kind);
253181
- const duration2 = numberArg2(args["duration"], preset2?.defaultDurationSec ?? (kind === "music" ? 20 : 8));
253182
- if (backend2 === "project") {
253183
- const plan = audioGenerationSetupPlan(kind, "project", this.cwd, model2);
253184
- return {
253185
- success: false,
253186
- output: [
253187
- `${preset2?.label ?? model2} is a project deployment profile, not an automatic generic runner.`,
253188
- "",
253189
- "Setup path:",
253190
- ...plan.commands.map((cmd) => ` ${cmd}`),
253191
- "",
253192
- ...plan.notes.map((note) => `- ${note}`)
253193
- ].join("\n"),
253194
- durationMs: performance.now() - start2
253195
- };
253196
- }
253197
- this.emitProgress({ stage: "setup", message: `Preparing ${kind} model ${model2} (${backend2})` });
253198
- return await this.prewarmPythonBackend({
253199
- kind,
253200
- backend: backend2,
253201
- runnerBackend: backend2,
253202
- model: model2,
253203
- duration: duration2,
253204
- start: start2,
253205
- python: args["python"]
253206
- });
253667
+ const requestedBackend2 = args["backend"] ? String(args["backend"]) : defaultBackend2;
253668
+ const candidates2 = audioGenerationFallbackCandidates(kind, requestedModel2, requestedBackend2, generationFallbackEnabled2(args));
253669
+ return await this.prewarmCandidateLadder({ kind, candidates: candidates2, args, start: start2 });
253207
253670
  }
253208
253671
  const prompt = String(args["prompt"] ?? "").trim();
253209
253672
  if (!prompt) {
@@ -253213,45 +253676,12 @@ if __name__ == "__main__":
253213
253676
  const defaultBackend = kind === "music" ? this.defaults.musicBackend : this.defaults.soundBackend;
253214
253677
  const rawModel = args["model"] ? String(args["model"]) : defaultModel;
253215
253678
  const requestedModel = rawModel === "auto" ? void 0 : rawModel;
253216
- let backend = inferAudioGenerationBackend(requestedModel, args["backend"] ? String(args["backend"]) : defaultBackend);
253217
- if (backend === "auto")
253218
- backend = kind === "music" ? "transformers" : "diffusers";
253219
- const model = requestedModel ?? (kind === "music" ? DEFAULT_MUSIC_MODEL : DEFAULT_SOUND_MODEL);
253220
- const preset = getAudioGenerationPreset(model, kind);
253221
- const duration = numberArg2(args["duration"], preset?.defaultDurationSec ?? (kind === "music" ? 20 : 8));
253222
- const steps = optionalNumberArg2(args["steps"]) ?? preset?.defaultSteps;
253679
+ const requestedBackend = args["backend"] ? String(args["backend"]) : defaultBackend;
253680
+ const candidates = audioGenerationFallbackCandidates(kind, requestedModel, requestedBackend, generationFallbackEnabled2(args));
253223
253681
  const seed = optionalNumberArg2(args["seed"]);
253224
253682
  const playback = playbackRequested(args);
253225
253683
  try {
253226
- this.emitProgress({ stage: "setup", message: `Using ${kind} model ${model} (${backend})` });
253227
- if (backend === "project") {
253228
- const plan = audioGenerationSetupPlan(kind, "project", this.cwd, model);
253229
- return {
253230
- success: false,
253231
- output: [
253232
- `${preset?.label ?? model} is a project deployment profile, not an automatic generic runner.`,
253233
- "",
253234
- "Setup path:",
253235
- ...plan.commands.map((cmd) => ` ${cmd}`),
253236
- "",
253237
- ...plan.notes.map((note) => `- ${note}`)
253238
- ].join("\n"),
253239
- durationMs: performance.now() - start2
253240
- };
253241
- }
253242
- if (backend === "tangoflux") {
253243
- return await this.generateWithPythonBackend({ kind, backend, runnerBackend: "tangoflux", prompt, model, duration, steps, seed, playback, start: start2, python: args["python"] });
253244
- }
253245
- if (backend === "transformers") {
253246
- return await this.generateWithPythonBackend({ kind, backend, runnerBackend: "transformers", prompt, model, duration, steps, seed, playback, start: start2, python: args["python"] });
253247
- }
253248
- if (backend === "audiocraft") {
253249
- return await this.generateWithPythonBackend({ kind, backend, runnerBackend: "audiocraft", prompt, model, duration, steps, seed, playback, start: start2, python: args["python"] });
253250
- }
253251
- if (backend === "stable-audio") {
253252
- return await this.generateWithPythonBackend({ kind, backend, runnerBackend: "stable-audio", prompt, model, duration, steps, seed, playback, start: start2, python: args["python"] });
253253
- }
253254
- return await this.generateWithPythonBackend({ kind, backend: "diffusers", runnerBackend: "diffusers", prompt, model, duration, steps, seed, playback, start: start2, python: args["python"] });
253684
+ return await this.generateCandidateLadder({ kind, candidates, prompt, args, seed, playback, start: start2 });
253255
253685
  } catch (err) {
253256
253686
  return {
253257
253687
  success: false,
@@ -253260,6 +253690,96 @@ if __name__ == "__main__":
253260
253690
  };
253261
253691
  }
253262
253692
  }
253693
+ async prewarmCandidateLadder(args) {
253694
+ const failed = [];
253695
+ for (let index = 0; index < args.candidates.length; index++) {
253696
+ const candidate = args.candidates[index];
253697
+ const duration = numberArg2(args.args["duration"], candidate.preset?.defaultDurationSec ?? (args.kind === "music" ? 20 : 8));
253698
+ this.emitProgress({
253699
+ stage: "setup",
253700
+ message: `Preparing ${args.kind} model ${candidate.model} (${candidate.backend}) [${index + 1}/${args.candidates.length}]`
253701
+ });
253702
+ const result = candidate.backend === "project" ? this.projectProfileResult(args.kind, candidate, args.start) : await this.prewarmPythonBackend({
253703
+ kind: args.kind,
253704
+ backend: candidate.backend,
253705
+ runnerBackend: candidate.backend,
253706
+ model: candidate.model,
253707
+ duration,
253708
+ start: args.start,
253709
+ python: args.args["python"]
253710
+ });
253711
+ if (result.success)
253712
+ return annotateAudioFallbackSuccess(result, failed, candidate);
253713
+ failed.push({ candidate, reason: summarizeToolResult2(result) });
253714
+ if (index < args.candidates.length - 1) {
253715
+ this.emitProgress({
253716
+ stage: "setup",
253717
+ message: `${candidate.model} failed; trying ${args.candidates[index + 1].model}`
253718
+ });
253719
+ }
253720
+ }
253721
+ return {
253722
+ success: false,
253723
+ output: formatAudioFallbackFailure(args.kind, failed),
253724
+ error: formatAudioFallbackFailure(args.kind, failed),
253725
+ durationMs: performance.now() - args.start
253726
+ };
253727
+ }
253728
+ async generateCandidateLadder(args) {
253729
+ const failed = [];
253730
+ for (let index = 0; index < args.candidates.length; index++) {
253731
+ const candidate = args.candidates[index];
253732
+ const duration = numberArg2(args.args["duration"], candidate.preset?.defaultDurationSec ?? (args.kind === "music" ? 20 : 8));
253733
+ const steps = optionalNumberArg2(args.args["steps"]) ?? candidate.preset?.defaultSteps;
253734
+ this.emitProgress({
253735
+ stage: "setup",
253736
+ message: `Using ${args.kind} model ${candidate.model} (${candidate.backend}) [${index + 1}/${args.candidates.length}]`
253737
+ });
253738
+ const result = candidate.backend === "project" ? this.projectProfileResult(args.kind, candidate, args.start) : await this.generateWithPythonBackend({
253739
+ kind: args.kind,
253740
+ backend: candidate.backend,
253741
+ runnerBackend: candidate.backend,
253742
+ prompt: args.prompt,
253743
+ model: candidate.model,
253744
+ duration,
253745
+ steps,
253746
+ seed: args.seed,
253747
+ playback: args.playback,
253748
+ start: args.start,
253749
+ python: args.args["python"]
253750
+ });
253751
+ if (result.success)
253752
+ return annotateAudioFallbackSuccess(result, failed, candidate);
253753
+ failed.push({ candidate, reason: summarizeToolResult2(result) });
253754
+ if (index < args.candidates.length - 1) {
253755
+ this.emitProgress({
253756
+ stage: "setup",
253757
+ message: `${candidate.model} failed; falling back to ${args.candidates[index + 1].model}`
253758
+ });
253759
+ }
253760
+ }
253761
+ return {
253762
+ success: false,
253763
+ output: formatAudioFallbackFailure(args.kind, failed),
253764
+ error: formatAudioFallbackFailure(args.kind, failed),
253765
+ durationMs: performance.now() - args.start
253766
+ };
253767
+ }
253768
+ projectProfileResult(kind, candidate, start2) {
253769
+ const plan = audioGenerationSetupPlan(kind, "project", this.cwd, candidate.model);
253770
+ return {
253771
+ success: false,
253772
+ output: [
253773
+ `${candidate.preset?.label ?? candidate.model} is a project deployment profile, not an automatic generic runner.`,
253774
+ "",
253775
+ "Setup path:",
253776
+ ...plan.commands.map((cmd) => ` ${cmd}`),
253777
+ "",
253778
+ ...plan.notes.map((note) => `- ${note}`)
253779
+ ].join("\n"),
253780
+ durationMs: performance.now() - start2
253781
+ };
253782
+ }
253263
253783
  async generateWithPythonBackend(args) {
253264
253784
  const runner = await ensureAudioRunner(this.cwd, args.runnerBackend);
253265
253785
  await mkdir12(audioOutputDir(this.cwd), { recursive: true });
@@ -477005,7 +477525,7 @@ var require_path_browserify = __commonJS({
477005
477525
  return path11.slice(start2, end);
477006
477526
  }
477007
477527
  },
477008
- extname: function extname16(path11) {
477528
+ extname: function extname17(path11) {
477009
477529
  assertPath(path11);
477010
477530
  var startDot = -1;
477011
477531
  var startPart = 0;
@@ -507166,22 +507686,22 @@ Saved to: ${tempFile}`,
507166
507686
  });
507167
507687
 
507168
507688
  // packages/execution/dist/tools/audio-playback.js
507169
- import { execFileSync as execFileSync2, execSync as execSync29, spawn as spawn16 } from "node:child_process";
507689
+ import { execFileSync as execFileSync3, execSync as execSync29, spawn as spawn16 } from "node:child_process";
507170
507690
  import { copyFileSync as copyFileSync2, existsSync as existsSync40, statSync as statSync18, writeFileSync as writeFileSync16, mkdirSync as mkdirSync16, readdirSync as readdirSync14 } from "node:fs";
507171
507691
  import { basename as basename12, extname as extname10, isAbsolute, join as join58 } from "node:path";
507172
507692
  import { homedir as homedir14, tmpdir as tmpdir11 } from "node:os";
507173
507693
  function hasCommand3(command) {
507174
507694
  try {
507175
507695
  if (process.platform === "win32") {
507176
- execFileSync2("where", [command], { stdio: "ignore", timeout: 2e3 });
507696
+ execFileSync3("where", [command], { stdio: "ignore", timeout: 2e3 });
507177
507697
  } else {
507178
- execFileSync2("command", ["-v", command], { stdio: "ignore", timeout: 2e3 });
507698
+ execFileSync3("command", ["-v", command], { stdio: "ignore", timeout: 2e3 });
507179
507699
  }
507180
507700
  return true;
507181
507701
  } catch {
507182
507702
  if (process.platform !== "win32") {
507183
507703
  try {
507184
- execFileSync2("which", [command], { stdio: "ignore", timeout: 2e3 });
507704
+ execFileSync3("which", [command], { stdio: "ignore", timeout: 2e3 });
507185
507705
  return true;
507186
507706
  } catch {
507187
507707
  return false;
@@ -507236,7 +507756,7 @@ function playSoundFile(file, opts = {}) {
507236
507756
  };
507237
507757
  }
507238
507758
  try {
507239
- execFileSync2(command.command, command.args, { timeout: opts.timeoutMs ?? 3e5, stdio: "pipe" });
507759
+ execFileSync3(command.command, command.args, { timeout: opts.timeoutMs ?? 3e5, stdio: "pipe" });
507240
507760
  return { ok: true, player: command.label };
507241
507761
  } catch (err) {
507242
507762
  return { ok: false, error: `Playback via ${command.label} failed: ${err instanceof Error ? err.message.slice(0, 300) : String(err).slice(0, 300)}` };
@@ -507359,6 +507879,18 @@ function supertonicInferScript() {
507359
507879
  function mlxVenvPy() {
507360
507880
  return process.platform === "win32" ? join58(voiceDir(), "mlx-venv", "Scripts", "python.exe") : join58(voiceDir(), "mlx-venv", "bin", "python3");
507361
507881
  }
507882
+ function luxttsVenvDir() {
507883
+ return join58(voiceDir(), "luxtts-venv");
507884
+ }
507885
+ function luxttsVenvPy() {
507886
+ return process.platform === "win32" ? join58(luxttsVenvDir(), "Scripts", "python.exe") : join58(luxttsVenvDir(), "bin", "python3");
507887
+ }
507888
+ function luxttsRepoDir() {
507889
+ return join58(voiceDir(), "LuxTTS");
507890
+ }
507891
+ function luxttsInferScript() {
507892
+ return join58(voiceDir(), "luxtts-infer.py");
507893
+ }
507362
507894
  function piperVenvDir() {
507363
507895
  return join58(voiceDir(), "piper-venv");
507364
507896
  }
@@ -507371,13 +507903,13 @@ function ensureSupertonicInstalled() {
507371
507903
  const py = findPython32();
507372
507904
  if (!py)
507373
507905
  throw new Error("python3 is required to set up Supertonic TTS.");
507374
- execFileSync2(py, ["-m", "venv", join58(voiceDir(), "supertonic3-venv")], { stdio: "pipe", timeout: 18e4 });
507906
+ execFileSync3(py, ["-m", "venv", join58(voiceDir(), "supertonic3-venv")], { stdio: "pipe", timeout: 18e4 });
507375
507907
  }
507376
507908
  try {
507377
- execFileSync2(venvPy, ["-c", "import supertonic"], { stdio: "pipe", timeout: 1e4 });
507909
+ execFileSync3(venvPy, ["-c", "import supertonic"], { stdio: "pipe", timeout: 1e4 });
507378
507910
  } catch {
507379
- execFileSync2(venvPy, ["-m", "pip", "install", "--quiet", "--upgrade", "pip"], { stdio: "pipe", timeout: 12e4 });
507380
- execFileSync2(venvPy, ["-m", "pip", "install", "--quiet", "supertonic"], { stdio: "pipe", timeout: 6e5 });
507911
+ execFileSync3(venvPy, ["-m", "pip", "install", "--quiet", "--upgrade", "pip"], { stdio: "pipe", timeout: 12e4 });
507912
+ execFileSync3(venvPy, ["-m", "pip", "install", "--quiet", "supertonic"], { stdio: "pipe", timeout: 6e5 });
507381
507913
  }
507382
507914
  mkdirSync16(voiceDir(), { recursive: true });
507383
507915
  writeFileSync16(supertonicInferScript(), SUPERTONIC_INFER_PY, "utf-8");
@@ -507385,20 +507917,95 @@ function ensureSupertonicInstalled() {
507385
507917
  }
507386
507918
  function ensureMlxInstalled() {
507387
507919
  if (process.platform !== "darwin" || process.arch !== "arm64") {
507388
- throw new Error("MLX TTS requires macOS on Apple Silicon. Use luxtts, supertonic, onnx/piper, or espeak on this machine.");
507920
+ throw new Error("MLX TTS requires macOS on Apple Silicon. Use luxtts, supertonic, onnx/piper, or backend=auto on this machine.");
507389
507921
  }
507390
507922
  const venvPy = mlxVenvPy();
507391
507923
  if (!existsSync40(venvPy)) {
507392
507924
  const py = findPython32();
507393
507925
  if (!py)
507394
507926
  throw new Error("python3 is required to set up MLX Audio.");
507395
- execFileSync2(py, ["-m", "venv", join58(voiceDir(), "mlx-venv")], { stdio: "pipe", timeout: 18e4 });
507927
+ execFileSync3(py, ["-m", "venv", join58(voiceDir(), "mlx-venv")], { stdio: "pipe", timeout: 18e4 });
507928
+ }
507929
+ try {
507930
+ execFileSync3(venvPy, ["-c", "import mlx_audio"], { stdio: "pipe", timeout: 1e4 });
507931
+ } catch {
507932
+ execFileSync3(venvPy, ["-m", "pip", "install", "--quiet", "--upgrade", "pip"], { stdio: "pipe", timeout: 12e4 });
507933
+ execFileSync3(venvPy, ["-m", "pip", "install", "--quiet", "mlx-audio"], { stdio: "pipe", timeout: 6e5 });
507934
+ }
507935
+ return venvPy;
507936
+ }
507937
+ function pythonCanImportLuxTts(venvPy) {
507938
+ try {
507939
+ execFileSync3(venvPy, [
507940
+ "-c",
507941
+ "import sys, os; sys.path.insert(0, os.environ['LUXTTS_REPO_PATH']); from zipvoice.luxvoice import LuxTTS; print('ok')"
507942
+ ], {
507943
+ stdio: "pipe",
507944
+ timeout: 3e4,
507945
+ env: { ...process.env, LUXTTS_REPO_PATH: luxttsRepoDir() }
507946
+ });
507947
+ return true;
507948
+ } catch {
507949
+ return false;
507950
+ }
507951
+ }
507952
+ function pipInstall(venvPy, packages, timeout2 = 9e5) {
507953
+ execFileSync3(venvPy, ["-m", "pip", "install", "--prefer-binary", ...packages], {
507954
+ stdio: "pipe",
507955
+ timeout: timeout2,
507956
+ env: process.env
507957
+ });
507958
+ }
507959
+ function ensureLuxttsInstalled() {
507960
+ const venvPy = luxttsVenvPy();
507961
+ const repoDir = luxttsRepoDir();
507962
+ mkdirSync16(voiceDir(), { recursive: true });
507963
+ if (existsSync40(venvPy) && existsSync40(join58(repoDir, "zipvoice", "luxvoice.py")) && pythonCanImportLuxTts(venvPy)) {
507964
+ writeFileSync16(luxttsInferScript(), LUXTTS_DAEMON_PY, "utf-8");
507965
+ return venvPy;
507966
+ }
507967
+ const py = findPython32();
507968
+ if (!py)
507969
+ throw new Error("python3 is required to set up LuxTTS voice cloning.");
507970
+ if (!existsSync40(venvPy)) {
507971
+ execFileSync3(py, ["-m", "venv", luxttsVenvDir()], { stdio: "pipe", timeout: 18e4 });
507972
+ }
507973
+ execFileSync3(venvPy, ["-m", "pip", "install", "--upgrade", "pip", "wheel", "setuptools<81"], {
507974
+ stdio: "pipe",
507975
+ timeout: 3e5
507976
+ });
507977
+ pipInstall(venvPy, ["torch", "torchaudio"], 12e5);
507978
+ if (!existsSync40(join58(repoDir, "zipvoice", "luxvoice.py"))) {
507979
+ if (!hasCommand3("git"))
507980
+ throw new Error("git is required to set up LuxTTS voice cloning.");
507981
+ execFileSync3("git", ["clone", "--depth", "1", "https://github.com/ysharma3501/LuxTTS.git", repoDir], {
507982
+ stdio: "pipe",
507983
+ timeout: 3e5
507984
+ });
507396
507985
  }
507986
+ pipInstall(venvPy, [
507987
+ "lhotse",
507988
+ "huggingface_hub",
507989
+ "safetensors",
507990
+ "pydub",
507991
+ "onnxruntime",
507992
+ "librosa",
507993
+ "transformers<=4.57.6",
507994
+ "inflect",
507995
+ "numpy",
507996
+ "vocos",
507997
+ "jieba",
507998
+ "pypinyin",
507999
+ "cn2an"
508000
+ ], 12e5);
507397
508001
  try {
507398
- execFileSync2(venvPy, ["-c", "import mlx_audio"], { stdio: "pipe", timeout: 1e4 });
508002
+ pipInstall(venvPy, ["git+https://github.com/ysharma3501/LinaCodec.git"], 12e5);
507399
508003
  } catch {
507400
- execFileSync2(venvPy, ["-m", "pip", "install", "--quiet", "--upgrade", "pip"], { stdio: "pipe", timeout: 12e4 });
507401
- execFileSync2(venvPy, ["-m", "pip", "install", "--quiet", "mlx-audio"], { stdio: "pipe", timeout: 6e5 });
508004
+ }
508005
+ pipInstall(venvPy, ["-e", repoDir], 6e5);
508006
+ writeFileSync16(luxttsInferScript(), LUXTTS_DAEMON_PY, "utf-8");
508007
+ if (!pythonCanImportLuxTts(venvPy)) {
508008
+ throw new Error(`LuxTTS setup completed but import still fails in ${luxttsVenvDir()}.`);
507402
508009
  }
507403
508010
  return venvPy;
507404
508011
  }
@@ -507411,10 +508018,10 @@ function ensurePiperInstalled() {
507411
508018
  if (!py)
507412
508019
  throw new Error("python3 is required to set up Piper TTS.");
507413
508020
  mkdirSync16(voiceDir(), { recursive: true });
507414
- execFileSync2(py, ["-m", "venv", piperVenvDir()], { stdio: "pipe", timeout: 18e4 });
508021
+ execFileSync3(py, ["-m", "venv", piperVenvDir()], { stdio: "pipe", timeout: 18e4 });
507415
508022
  const venvPy = process.platform === "win32" ? join58(piperVenvDir(), "Scripts", "python.exe") : join58(piperVenvDir(), "bin", "python3");
507416
- execFileSync2(venvPy, ["-m", "pip", "install", "--quiet", "--upgrade", "pip"], { stdio: "pipe", timeout: 12e4 });
507417
- execFileSync2(venvPy, ["-m", "pip", "install", "--quiet", "piper-tts"], { stdio: "pipe", timeout: 6e5 });
508023
+ execFileSync3(venvPy, ["-m", "pip", "install", "--quiet", "--upgrade", "pip"], { stdio: "pipe", timeout: 12e4 });
508024
+ execFileSync3(venvPy, ["-m", "pip", "install", "--quiet", "piper-tts"], { stdio: "pipe", timeout: 6e5 });
507418
508025
  }
507419
508026
  if (!existsSync40(bin)) {
507420
508027
  throw new Error("Piper TTS installed but the piper executable was not found in the managed venv.");
@@ -507435,6 +508042,28 @@ function saveCloneRefFromSample(sample, cloneName) {
507435
508042
  copyFileSync2(source, dest);
507436
508043
  return dest;
507437
508044
  }
508045
+ function cloneSampleArg(args) {
508046
+ for (const key of ["sample", "source_audio", "voice_sample", "reference_audio", "ref_audio", "clone_sample"]) {
508047
+ const value2 = args[key];
508048
+ if (typeof value2 === "string" && value2.trim())
508049
+ return value2.trim();
508050
+ }
508051
+ return "";
508052
+ }
508053
+ function wantsVoiceClone(args) {
508054
+ if (cloneSampleArg(args))
508055
+ return true;
508056
+ if (typeof args["clone_ref"] === "string" && args["clone_ref"].trim())
508057
+ return true;
508058
+ const voice = typeof args["voice"] === "string" ? args["voice"].trim() : "";
508059
+ return /\.(wav|mp3|flac|ogg|m4a)$/i.test(voice) || voice.startsWith("/") || voice.startsWith("./") || voice.startsWith("../") || voice.startsWith("~/");
508060
+ }
508061
+ function cloneRefForSynthesis(args) {
508062
+ const sample = cloneSampleArg(args);
508063
+ if (sample)
508064
+ return saveCloneRefFromSample(sample, typeof args["clone_name"] === "string" ? args["clone_name"] : void 0);
508065
+ return resolveCloneRef(args["clone_ref"] ?? args["voice"]);
508066
+ }
507438
508067
  function ensureLuxttsDaemon() {
507439
508068
  if (_luxttsDaemon && !_luxttsDaemon.killed && _luxttsReady)
507440
508069
  return Promise.resolve(true);
@@ -507448,14 +508077,23 @@ function ensureLuxttsDaemon() {
507448
508077
  }
507449
508078
  if (_luxttsStarting)
507450
508079
  return Promise.resolve(false);
507451
- const venvPy = join58(homedir14(), ".omnius", "voice", "luxtts-venv", "bin", "python3");
507452
- const inferScript = join58(homedir14(), ".omnius", "voice", "luxtts-infer.py");
507453
- const repoDir = join58(homedir14(), ".omnius", "voice", "LuxTTS");
508080
+ const venvPy = luxttsVenvPy();
508081
+ const inferScript = luxttsInferScript();
508082
+ const repoDir = luxttsRepoDir();
507454
508083
  if (!existsSync40(venvPy) || !existsSync40(inferScript))
507455
508084
  return Promise.resolve(false);
507456
508085
  _luxttsStarting = true;
507457
508086
  return new Promise((resolve48) => {
507458
- const timeout2 = setTimeout(() => {
508087
+ let settled = false;
508088
+ let timeout2;
508089
+ const finish = (ready) => {
508090
+ if (settled)
508091
+ return;
508092
+ settled = true;
508093
+ clearTimeout(timeout2);
508094
+ resolve48(ready);
508095
+ };
508096
+ timeout2 = setTimeout(() => {
507459
508097
  _luxttsStarting = false;
507460
508098
  if (_luxttsDaemon && !_luxttsReady) {
507461
508099
  try {
@@ -507464,7 +508102,7 @@ function ensureLuxttsDaemon() {
507464
508102
  }
507465
508103
  _luxttsDaemon = null;
507466
508104
  }
507467
- resolve48(false);
508105
+ finish(false);
507468
508106
  }, 12e4);
507469
508107
  const daemon = spawn16(venvPy, [inferScript], {
507470
508108
  stdio: ["pipe", "pipe", "pipe"],
@@ -507486,8 +508124,7 @@ function ensureLuxttsDaemon() {
507486
508124
  if (msg.type === "ready") {
507487
508125
  _luxttsReady = true;
507488
508126
  _luxttsStarting = false;
507489
- clearTimeout(timeout2);
507490
- resolve48(true);
508127
+ finish(true);
507491
508128
  } else if (msg.type === "result" && msg.id) {
507492
508129
  const pending = _luxttsPending.get(msg.id);
507493
508130
  if (pending) {
@@ -507509,13 +508146,13 @@ function ensureLuxttsDaemon() {
507509
508146
  _luxttsDaemon = null;
507510
508147
  _luxttsReady = false;
507511
508148
  _luxttsStarting = false;
508149
+ finish(false);
507512
508150
  });
507513
508151
  daemon.on("error", () => {
507514
508152
  _luxttsDaemon = null;
507515
508153
  _luxttsReady = false;
507516
508154
  _luxttsStarting = false;
507517
- clearTimeout(timeout2);
507518
- resolve48(false);
508155
+ finish(false);
507519
508156
  });
507520
508157
  });
507521
508158
  }
@@ -507545,7 +508182,7 @@ function luxttsSynthesize(text, cloneRef, outputPath2, speed = 1) {
507545
508182
  _luxttsDaemon.stdin.write(req2 + "\n");
507546
508183
  });
507547
508184
  }
507548
- var _luxttsDaemon, _luxttsReady, _luxttsRequestId, _luxttsPending, _luxttsBuffer, _luxttsStarting, SUPERTONIC_INFER_PY, AudioPlaybackTool, TtsGenerateTool, SoundPlaybackTool;
508185
+ var _luxttsDaemon, _luxttsReady, _luxttsRequestId, _luxttsPending, _luxttsBuffer, _luxttsStarting, SUPERTONIC_INFER_PY, LUXTTS_DAEMON_PY, AudioPlaybackTool, TtsGenerateTool, SoundPlaybackTool;
507549
508186
  var init_audio_playback = __esm({
507550
508187
  "packages/execution/dist/tools/audio-playback.js"() {
507551
508188
  "use strict";
@@ -507585,10 +508222,45 @@ try:
507585
508222
  except Exception as exc:
507586
508223
  print(json.dumps({"ok": False, "error": str(exc), "trace": traceback.format_exc(limit=3)}))
507587
508224
  sys.exit(1)
508225
+ `;
508226
+ LUXTTS_DAEMON_PY = String.raw`
508227
+ import json, os, sys, traceback, wave
508228
+ import numpy as np
508229
+ import torch
508230
+ repo = os.environ.get("LUXTTS_REPO_PATH") or ""
508231
+ if repo:
508232
+ sys.path.insert(0, repo)
508233
+ from zipvoice.luxvoice import LuxTTS
508234
+ device = "cuda" if torch.cuda.is_available() else "cpu"
508235
+ tts = LuxTTS(model_path="YatharthS/LuxTTS", device=device, threads=4)
508236
+ print(json.dumps({"type": "ready", "device": device}), flush=True)
508237
+ for line in sys.stdin:
508238
+ if not line.strip():
508239
+ continue
508240
+ req = json.loads(line)
508241
+ if req.get("action") == "quit":
508242
+ break
508243
+ rid = req.get("id")
508244
+ try:
508245
+ text = str(req.get("text") or "").strip()
508246
+ clone_ref = str(req.get("clone_ref") or "")
508247
+ output = str(req.get("output_path") or "")
508248
+ speed = float(req.get("speed") or 1.0)
508249
+ enc = tts.encode_prompt(clone_ref, duration=5, rms=0.001)
508250
+ wav = tts.generate_speech(text, enc, num_steps=4, guidance_scale=3.0, t_shift=0.5, speed=speed)
508251
+ data = (np.clip(wav.cpu().numpy().squeeze(), -1, 1) * 32767).astype(np.int16)
508252
+ with wave.open(output, "wb") as f:
508253
+ f.setnchannels(1)
508254
+ f.setsampwidth(2)
508255
+ f.setframerate(48000)
508256
+ f.writeframes(data.tobytes())
508257
+ print(json.dumps({"type": "result", "id": rid, "path": output}), flush=True)
508258
+ except Exception as exc:
508259
+ print(json.dumps({"type": "error", "id": rid, "error": str(exc), "trace": traceback.format_exc(limit=3)}), flush=True)
507588
508260
  `;
507589
508261
  AudioPlaybackTool = class {
507590
508262
  name = "audio_playback";
507591
- description = "Play audio through speakers, synthesize text-to-speech, and manage TTS clone voices. Actions: 'play' to play an audio file (WAV/MP3/OGG — including recordings from memory episodes), 'speak' to synthesize and play text, 'synthesize' to save TTS to a WAV file, 'clone' to register a voice-clone sample, 'list_voices' to inspect available clone refs/backends, 'volume' to get or set system volume, 'list' to enumerate audio output devices. TTS backends are explicit: auto, luxtts, supertonic, mlx, onnx/piper, or espeak. Neural TTS backends self-provision into ~/.omnius/voice on first use where supported. Use generate_tts when the task is specifically to create a TTS file; do not use shell speech commands or generate_audio for spoken TTS.";
508263
+ description = "Play audio through speakers, synthesize text-to-speech, and manage TTS clone voices. Actions: 'play' to play an audio file (WAV/MP3/OGG — including recordings from memory episodes), 'speak' to synthesize and play text, 'synthesize' to save TTS to a WAV file, 'clone' to register a voice-clone source clip, 'list_voices' to inspect available clone refs/backends, 'volume' to get or set system volume, 'list' to enumerate audio output devices. TTS backends include auto, LuxTTS voice cloning, Supertonic, MLX, ONNX/Piper, and a local fallback. Neural TTS backends self-provision into ~/.omnius/voice on first use where supported. For cloned speech from a source clip, call generate_tts or audio_playback action=synthesize with sample/source_audio/voice_sample and backend=auto or luxtts. Use generate_tts when the task is specifically to create a TTS file; do not use shell speech commands or generate_audio for spoken TTS.";
507592
508264
  parameters = {
507593
508265
  type: "object",
507594
508266
  properties: {
@@ -507615,8 +508287,8 @@ except Exception as exc:
507615
508287
  },
507616
508288
  backend: {
507617
508289
  type: "string",
507618
- enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper", "espeak"],
507619
- description: "TTS backend. auto tries LuxTTS clone, Supertonic, MLX on Apple Silicon, Piper/ONNX, then espeak."
508290
+ enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper"],
508291
+ description: "TTS backend. auto tries LuxTTS clone, Supertonic, MLX on Apple Silicon, Piper/ONNX, then a local fallback."
507620
508292
  },
507621
508293
  output: {
507622
508294
  type: "string",
@@ -507632,11 +508304,31 @@ except Exception as exc:
507632
508304
  },
507633
508305
  sample: {
507634
508306
  type: "string",
507635
- description: "Audio sample path to register as a clone voice for action=clone."
508307
+ description: "Audio source clip path to register or use as a LuxTTS clone voice."
508308
+ },
508309
+ source_audio: {
508310
+ type: "string",
508311
+ description: "Alias for sample. Use this for cloned speech from a source voice clip."
508312
+ },
508313
+ voice_sample: {
508314
+ type: "string",
508315
+ description: "Alias for sample/source_audio."
508316
+ },
508317
+ reference_audio: {
508318
+ type: "string",
508319
+ description: "Alias for sample/source_audio."
508320
+ },
508321
+ ref_audio: {
508322
+ type: "string",
508323
+ description: "Alias for sample/source_audio."
508324
+ },
508325
+ clone_sample: {
508326
+ type: "string",
508327
+ description: "Alias for sample/source_audio."
507636
508328
  },
507637
508329
  clone_name: {
507638
508330
  type: "string",
507639
- description: "Friendly filename stem for action=clone."
508331
+ description: "Friendly filename stem for action=clone or for registering a source clip during synthesis."
507640
508332
  },
507641
508333
  model: {
507642
508334
  type: "string",
@@ -507652,11 +508344,11 @@ except Exception as exc:
507652
508344
  },
507653
508345
  speed: {
507654
508346
  type: "number",
507655
- description: "Speech speed. espeak uses words per minute; neural backends use a multiplier."
508347
+ description: "Speech speed. Neural backends use a multiplier; local fallback uses its backend-specific rate."
507656
508348
  },
507657
508349
  voice: {
507658
508350
  type: "string",
507659
- description: "Voice id/name. Examples: Supertonic voice M4, MLX voice af_heart, espeak voice en-us, or Piper/ONNX model path."
508351
+ description: "Voice id/name. Examples: Supertonic voice M4, MLX voice af_heart, a source audio path for cloning, or Piper/ONNX model path."
507660
508352
  },
507661
508353
  lang: {
507662
508354
  type: "string",
@@ -507720,9 +508412,9 @@ except Exception as exc:
507720
508412
  return await this.synthesizeText(args, start2, true);
507721
508413
  }
507722
508414
  cloneVoice(args, start2) {
507723
- const sample = typeof args["sample"] === "string" ? args["sample"] : typeof args["file"] === "string" ? args["file"] : "";
508415
+ const sample = cloneSampleArg(args) || (typeof args["file"] === "string" ? args["file"] : "");
507724
508416
  if (!sample.trim()) {
507725
- return { success: false, output: "", error: "Missing 'sample' parameter. Provide a local audio sample to register as a clone voice.", durationMs: performance.now() - start2 };
508417
+ return { success: false, output: "", error: "Missing source audio. Provide sample=<file> or source_audio=<file> to register as a clone voice.", durationMs: performance.now() - start2 };
507726
508418
  }
507727
508419
  const saved = saveCloneRefFromSample(sample, typeof args["clone_name"] === "string" ? args["clone_name"] : void 0);
507728
508420
  return {
@@ -507739,10 +508431,11 @@ except Exception as exc:
507739
508431
  const lines = [
507740
508432
  "TTS backends:",
507741
508433
  ` luxtts: ${existsSync40(join58(voiceDir(), "luxtts-venv", "bin", "python3")) ? "installed" : "not installed"}; clone refs: ${refs.length}`,
508434
+ " clone from source clip: generate_tts text=<words> source_audio=<wav/mp3/flac/ogg/m4a> backend=auto",
507742
508435
  ` supertonic: ${existsSync40(supertonicVenvPy()) ? "installed" : "not installed"}; voices include M1, M2, M3, M4 when package assets are available`,
507743
508436
  ` mlx: ${existsSync40(mlxVenvPy()) ? "installed" : "not installed"}; Apple Silicon only; default model mlx-community/Kokoro-82M-bf16`,
507744
508437
  ` piper/onnx: ${hasCommand3("piper") || existsSync40(piperVenvBin()) ? "available" : "not installed"}; first use installs piper-tts into ${piperVenvDir()}; pass model=<path.onnx> for raw ONNX voices`,
507745
- ` espeak: ${hasCommand3("espeak-ng") ? "available" : "not found"}`,
508438
+ ` local fallback: ${hasCommand3("espeak-ng") ? "available" : "not found"}`,
507746
508439
  "",
507747
508440
  "Registered clone refs:",
507748
508441
  ...refs.length ? refs.map((ref) => ` ${ref}`) : [" none"]
@@ -507756,11 +508449,20 @@ except Exception as exc:
507756
508449
  }
507757
508450
  const requestedBackend = normalizeTtsBackend(args["backend"]);
507758
508451
  const strictBackend = boolArg(args["strict_backend"] ?? args["strictBackend"], false);
508452
+ const cloneRequested = wantsVoiceClone(args);
508453
+ if (cloneRequested && requestedBackend !== "auto" && requestedBackend !== "luxtts") {
508454
+ return {
508455
+ success: false,
508456
+ output: "",
508457
+ error: "Voice cloning from a source clip requires backend=auto or backend=luxtts.",
508458
+ durationMs: performance.now() - start2
508459
+ };
508460
+ }
507759
508461
  const playback = playbackArg(args, speakDefault);
507760
508462
  const outputPath2 = ttsOutputPath(args, requestedBackend);
507761
508463
  const device = typeof args["device"] === "string" ? args["device"] : "default";
507762
508464
  const tried = [];
507763
- const autoCandidates = ["luxtts", "supertonic", ...process.platform === "darwin" && process.arch === "arm64" ? ["mlx"] : [], "piper", "espeak"];
508465
+ const autoCandidates = cloneRequested ? ["luxtts"] : ["luxtts", "supertonic", ...process.platform === "darwin" && process.arch === "arm64" ? ["mlx"] : [], "piper", "espeak"];
507764
508466
  const candidates = requestedBackend === "auto" ? autoCandidates : strictBackend ? [requestedBackend] : [requestedBackend, ...autoCandidates.filter((backend) => backend !== requestedBackend)];
507765
508467
  let usedBackend = "";
507766
508468
  let voiceSummary = "";
@@ -507823,21 +508525,19 @@ ${tried.map((line) => `- ${line}`).join("\n")}`,
507823
508525
  };
507824
508526
  }
507825
508527
  async synthesizeLuxtts(text, outputPath2, args) {
507826
- const cloneRef = resolveCloneRef(args["clone_ref"] ?? args["voice"] ?? args["sample"]);
508528
+ const cloneRef = cloneRefForSynthesis(args);
507827
508529
  if (!cloneRef)
507828
- throw new Error(`No LuxTTS clone reference found. Register one with audio_playback action=clone sample=<file>.`);
508530
+ throw new Error(`No LuxTTS clone source found. Provide source_audio=<voice clip> or clone_ref=<registered clip>.`);
507829
508531
  const speed = numberArg3(args["speed"], 1);
508532
+ ensureLuxttsInstalled();
507830
508533
  const daemonReady = await ensureLuxttsDaemon();
507831
508534
  if (daemonReady) {
507832
508535
  await luxttsSynthesize(text, cloneRef, outputPath2, speed);
507833
508536
  if (existsSync40(outputPath2))
507834
508537
  return `${basename12(cloneRef)} (LuxTTS daemon)`;
507835
508538
  }
507836
- const venvPy = join58(voiceDir(), "luxtts-venv", "bin", "python3");
507837
- const repoDir = join58(voiceDir(), "LuxTTS");
507838
- if (!existsSync40(venvPy) || !existsSync40(repoDir)) {
507839
- throw new Error("LuxTTS is not installed in the managed voice environment yet.");
507840
- }
508539
+ const venvPy = luxttsVenvPy();
508540
+ const repoDir = luxttsRepoDir();
507841
508541
  const pyScript = [
507842
508542
  "import json, sys, wave",
507843
508543
  "import numpy as np, torch",
@@ -507851,7 +508551,7 @@ ${tried.map((line) => `- ${line}`).join("\n")}`,
507851
508551
  "d=(np.clip(wav.cpu().numpy().squeeze(), -1, 1)*32767).astype(np.int16)",
507852
508552
  "f=wave.open(args['output'], 'wb'); f.setnchannels(1); f.setsampwidth(2); f.setframerate(48000); f.writeframes(d.tobytes()); f.close()"
507853
508553
  ].join("; ");
507854
- execFileSync2(venvPy, ["-c", pyScript, JSON.stringify({ text, output: outputPath2, clone_ref: cloneRef, repo: repoDir, speed })], {
508554
+ execFileSync3(venvPy, ["-c", pyScript, JSON.stringify({ text, output: outputPath2, clone_ref: cloneRef, repo: repoDir, speed })], {
507855
508555
  stdio: "pipe",
507856
508556
  timeout: 12e4,
507857
508557
  env: { ...process.env, LUXTTS_REPO_PATH: repoDir }
@@ -507864,7 +508564,7 @@ ${tried.map((line) => `- ${line}`).join("\n")}`,
507864
508564
  const lang = typeof args["lang"] === "string" ? args["lang"] : "en";
507865
508565
  const speed = numberArg3(args["speed"], 1.05);
507866
508566
  const totalStep = Math.round(numberArg3(args["total_step"], 8));
507867
- const stdout = execFileSync2(venvPy, [supertonicInferScript()], {
508567
+ const stdout = execFileSync3(venvPy, [supertonicInferScript()], {
507868
508568
  input: JSON.stringify({ text, output_path: outputPath2, voice_name: voice, lang, speed, total_step: totalStep }),
507869
508569
  encoding: "utf8",
507870
508570
  stdio: ["pipe", "pipe", "pipe"],
@@ -507887,7 +508587,7 @@ ${tried.map((line) => `- ${line}`).join("\n")}`,
507887
508587
  "args=json.loads(sys.argv[1])",
507888
508588
  "tts_gen.main(['--model', args['model'], '--text', args['text'], '--voice', args['voice'], '--lang_code', args['lang'], '--audio_path', args['output']])"
507889
508589
  ].join("; ");
507890
- execFileSync2(py, ["-c", pyScript, JSON.stringify({ text, model, voice, lang, output: outputPath2 })], {
508590
+ execFileSync3(py, ["-c", pyScript, JSON.stringify({ text, model, voice, lang, output: outputPath2 })], {
507891
508591
  stdio: "pipe",
507892
508592
  timeout: 18e4,
507893
508593
  cwd: tmpdir11()
@@ -507908,15 +508608,15 @@ ${tried.map((line) => `- ${line}`).join("\n")}`,
507908
508608
  } else {
507909
508609
  throw new Error(`${requireModel ? "Raw ONNX" : "Piper"} TTS requires model=<path.onnx> or voice=<path.onnx>.`);
507910
508610
  }
507911
- execFileSync2(piper, argv, { input: text, stdio: ["pipe", "pipe", "pipe"], timeout: 12e4 });
508611
+ execFileSync3(piper, argv, { input: text, stdio: ["pipe", "pipe", "pipe"], timeout: 12e4 });
507912
508612
  return summary;
507913
508613
  }
507914
508614
  synthesizeEspeak(text, outputPath2, args) {
507915
508615
  if (!hasCommand3("espeak-ng"))
507916
- throw new Error("espeak-ng command not found.");
508616
+ throw new Error("Local fallback TTS command not found.");
507917
508617
  const voice = typeof args["voice"] === "string" ? args["voice"] : "en";
507918
508618
  const speed = Math.round(numberArg3(args["speed"], 160));
507919
- execFileSync2("espeak-ng", ["-v", voice, "-s", String(speed), "-w", outputPath2, text], {
508619
+ execFileSync3("espeak-ng", ["-v", voice, "-s", String(speed), "-w", outputPath2, text], {
507920
508620
  stdio: "pipe",
507921
508621
  timeout: 6e4
507922
508622
  });
@@ -507995,20 +508695,27 @@ ${devices.join("\n")}`,
507995
508695
  };
507996
508696
  TtsGenerateTool = class {
507997
508697
  name = "generate_tts";
507998
- description = "Generate text-to-speech audio as a WAV file, optionally playing it after synthesis. Supports explicit backends: auto, luxtts voice cloning, supertonic, mlx, onnx/piper, and espeak. Neural TTS backends self-provision into ~/.omnius/voice on first use where supported. Use clone_ref to select a registered LuxTTS voice, sample+clone_name to register a clone sample via audio_playback action=clone, and playback=false for silent file generation. Use this tool for speech/TTS requests; do not use shell commands or generate_audio as a TTS fallback.";
508698
+ description = "Generate text-to-speech audio as a WAV file, optionally playing it after synthesis. Supports explicit backends: auto, LuxTTS voice cloning, Supertonic, MLX, ONNX/Piper, and local fallback. Neural TTS backends self-provision into ~/.omnius/voice on first use where supported. For voice cloning, pass source_audio/sample/voice_sample with the reference clip and backend=auto or luxtts; clone_name can register it for reuse. Use clone_ref to select a registered LuxTTS voice and playback=false for silent file generation. Use this tool for speech/TTS requests; do not use shell commands or generate_audio as a TTS fallback.";
507999
508699
  parameters = {
508000
508700
  type: "object",
508001
508701
  properties: {
508002
508702
  text: { type: "string", description: "Text to synthesize" },
508003
508703
  input: { type: "string", description: "Alias for text." },
508004
508704
  prompt: { type: "string", description: "Alias for text." },
508005
- backend: { type: "string", enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper", "espeak"] },
508705
+ backend: { type: "string", enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper"] },
508006
508706
  output: { type: "string", description: "Output WAV path. Defaults to ~/.omnius/voice/generated/tts-*.wav." },
508007
508707
  path: { type: "string", description: "Alias for output." },
508008
508708
  playback: { type: "boolean", description: "Whether to play after generating. Defaults false for generate_tts." },
508009
508709
  strict_backend: { type: "boolean", description: "When true, fail instead of falling back if the requested backend is unavailable. Defaults false." },
508010
508710
  voice: { type: "string", description: "Voice id/name, or raw Piper/ONNX path when backend=onnx/piper." },
508011
508711
  clone_ref: { type: "string", description: "LuxTTS clone reference path, filename, or registered clone name." },
508712
+ sample: { type: "string", description: "Voice source clip path for cloned speech. Alias: source_audio." },
508713
+ source_audio: { type: "string", description: "Voice source clip path for cloned speech." },
508714
+ voice_sample: { type: "string", description: "Alias for source_audio." },
508715
+ reference_audio: { type: "string", description: "Alias for source_audio." },
508716
+ ref_audio: { type: "string", description: "Alias for source_audio." },
508717
+ clone_sample: { type: "string", description: "Alias for source_audio." },
508718
+ clone_name: { type: "string", description: "Optional name to register the source clip for later reuse." },
508012
508719
  model: { type: "string", description: "Backend model id or raw ONNX/Piper model path." },
508013
508720
  lang: { type: "string", description: "Language code for Supertonic/MLX where supported." },
508014
508721
  speed: { type: "number", description: "Speech speed multiplier or backend-specific rate." },
@@ -575055,7 +575762,7 @@ __export(image_ascii_preview_exports, {
575055
575762
  extractSavedImagePath: () => extractSavedImagePath,
575056
575763
  formatImageAsciiContext: () => formatImageAsciiContext
575057
575764
  });
575058
- import { execFileSync as execFileSync3 } from "node:child_process";
575765
+ import { execFileSync as execFileSync4 } from "node:child_process";
575059
575766
  import { createRequire as createRequire5 } from "node:module";
575060
575767
  import { existsSync as existsSync94, readFileSync as readFileSync75, statSync as statSync32 } from "node:fs";
575061
575768
  import { resolve as resolve37 } from "node:path";
@@ -575192,7 +575899,7 @@ function convertWithFfmpeg(imagePath, width, height, timeoutMs) {
575192
575899
  `scale=${width}:${height}`,
575193
575900
  "format=gray"
575194
575901
  ].join(",");
575195
- const raw = execFileSync3(
575902
+ const raw = execFileSync4(
575196
575903
  "ffmpeg",
575197
575904
  [
575198
575905
  "-hide_banner",
@@ -575357,19 +576064,19 @@ function modelOnnxPath(id) {
575357
576064
  function modelConfigPath(id) {
575358
576065
  return join109(modelDir(id), "config.json");
575359
576066
  }
575360
- function luxttsVenvDir() {
576067
+ function luxttsVenvDir2() {
575361
576068
  return join109(voiceDir2(), "luxtts-venv");
575362
576069
  }
575363
- function luxttsVenvPy() {
575364
- return platform5() === "win32" ? join109(luxttsVenvDir(), "Scripts", "python.exe") : join109(luxttsVenvDir(), "bin", "python3");
576070
+ function luxttsVenvPy2() {
576071
+ return platform5() === "win32" ? join109(luxttsVenvDir2(), "Scripts", "python.exe") : join109(luxttsVenvDir2(), "bin", "python3");
575365
576072
  }
575366
- function luxttsRepoDir() {
576073
+ function luxttsRepoDir2() {
575367
576074
  return join109(voiceDir2(), "LuxTTS");
575368
576075
  }
575369
576076
  function luxttsCloneRefsDir() {
575370
576077
  return join109(voiceDir2(), "clone-refs");
575371
576078
  }
575372
- function luxttsInferScript() {
576079
+ function luxttsInferScript2() {
575373
576080
  return join109(voiceDir2(), "luxtts-infer.py");
575374
576081
  }
575375
576082
  function supertonicVenvDir() {
@@ -577936,12 +578643,12 @@ Error: ${err2 instanceof Error ? err2.message : String(err2)}`
577936
578643
  "python3 not found. LuxTTS requires Python 3.10+. Try: apt install python3 / brew install python3"
577937
578644
  );
577938
578645
  }
577939
- const venvDir = luxttsVenvDir();
577940
- const venvPy = luxttsVenvPy();
578646
+ const venvDir = luxttsVenvDir2();
578647
+ const venvPy = luxttsVenvPy2();
577941
578648
  if (existsSync95(venvPy)) {
577942
578649
  try {
577943
578650
  const quotedPy = `"${venvPy}"`;
577944
- const repoPath = luxttsRepoDir().replace(/\\/g, "/");
578651
+ const repoPath = luxttsRepoDir2().replace(/\\/g, "/");
577945
578652
  await this.asyncShell(
577946
578653
  `${quotedPy} -c "import sys; sys.path.insert(0, '${repoPath}'); from zipvoice.luxvoice import LuxTTS; print('ok')"`,
577947
578654
  3e4
@@ -578055,7 +578762,7 @@ Error: ${err2 instanceof Error ? err2.message : String(err2)}`
578055
578762
  }
578056
578763
  }
578057
578764
  }
578058
- const repoDir = luxttsRepoDir();
578765
+ const repoDir = luxttsRepoDir2();
578059
578766
  if (!existsSync95(join109(repoDir, "zipvoice", "luxvoice.py"))) {
578060
578767
  renderInfo(" Cloning LuxTTS repository...");
578061
578768
  try {
@@ -578479,18 +579186,18 @@ def main():
578479
579186
  if __name__ == '__main__':
578480
579187
  main()
578481
579188
  `;
578482
- const scriptPath2 = luxttsInferScript();
579189
+ const scriptPath2 = luxttsInferScript2();
578483
579190
  mkdirSync52(voiceDir2(), { recursive: true });
578484
579191
  writeFileSync49(scriptPath2, script);
578485
579192
  }
578486
579193
  /** Ensure the LuxTTS daemon is running, spawn if needed */
578487
579194
  async ensureLuxttsDaemon() {
578488
579195
  if (this._luxttsDaemon && !this._luxttsDaemon.killed) return true;
578489
- const venvPy = luxttsVenvPy();
579196
+ const venvPy = luxttsVenvPy2();
578490
579197
  if (!existsSync95(venvPy)) return false;
578491
579198
  return new Promise((resolve48) => {
578492
- const env2 = { ...process.env, LUXTTS_REPO_PATH: luxttsRepoDir() };
578493
- const daemon = nodeSpawn(venvPy, [luxttsInferScript()], {
579199
+ const env2 = { ...process.env, LUXTTS_REPO_PATH: luxttsRepoDir2() };
579200
+ const daemon = nodeSpawn(venvPy, [luxttsInferScript2()], {
578494
579201
  stdio: ["pipe", "pipe", "pipe"],
578495
579202
  cwd: tmpdir20(),
578496
579203
  env: env2
@@ -596377,6 +597084,17 @@ var init_tool_policy = __esm({
596377
597084
  "todo_write",
596378
597085
  "web_search",
596379
597086
  "web_fetch",
597087
+ "image_read",
597088
+ "ocr",
597089
+ "ocr_image_advanced",
597090
+ "ocr_pdf",
597091
+ "pdf_to_text",
597092
+ "vision",
597093
+ "transcribe_file",
597094
+ "video_understand",
597095
+ "audio_analyze",
597096
+ "explore_tools",
597097
+ "telegram_media_recent",
596380
597098
  "generate_image",
596381
597099
  "generate_audio",
596382
597100
  "generate_tts",
@@ -596393,6 +597111,17 @@ var init_tool_policy = __esm({
596393
597111
  "web_search",
596394
597112
  "web_fetch",
596395
597113
  "web_crawl",
597114
+ "image_read",
597115
+ "ocr",
597116
+ "ocr_image_advanced",
597117
+ "ocr_pdf",
597118
+ "pdf_to_text",
597119
+ "vision",
597120
+ "transcribe_file",
597121
+ "video_understand",
597122
+ "audio_analyze",
597123
+ "explore_tools",
597124
+ "telegram_media_recent",
596396
597125
  "generate_image",
596397
597126
  "generate_audio",
596398
597127
  "generate_tts",
@@ -596500,6 +597229,7 @@ function scopedTool(base3, root, mode) {
596500
597229
  async execute(args) {
596501
597230
  const next = { ...args };
596502
597231
  if (base3.name === "generate_image" || base3.name === "generate_audio" || base3.name === "generate_tts") {
597232
+ const cleanup = [];
596503
597233
  const localModel = typeof next["model_path"] === "string" ? String(next["model_path"]) : typeof next["model"] === "string" && looksLikeLocalPath(String(next["model"])) ? String(next["model"]) : "";
596504
597234
  if (localModel) {
596505
597235
  const guarded = guardPath(rootAbs, localModel);
@@ -596508,6 +597238,22 @@ function scopedTool(base3, root, mode) {
596508
597238
  else next["model"] = guarded.path.abs;
596509
597239
  }
596510
597240
  if (base3.name === "generate_tts") {
597241
+ for (const key of TTS_CLONE_SOURCE_KEYS) {
597242
+ const value2 = next[key];
597243
+ if (typeof value2 !== "string" || !value2.trim()) continue;
597244
+ const materialized = materializeTelegramCreativeArtifactForSend(rootAbs, value2.trim());
597245
+ if (!materialized.ok) return denied(materialized.error);
597246
+ next[key] = materialized.path;
597247
+ if (materialized.cleanup) cleanup.push(materialized.cleanup);
597248
+ }
597249
+ for (const key of ["clone_ref", "voice"]) {
597250
+ const value2 = next[key];
597251
+ if (typeof value2 !== "string" || !value2.trim() || !looksLikeAudioPath(value2.trim())) continue;
597252
+ const materialized = materializeTelegramCreativeArtifactForSend(rootAbs, value2.trim());
597253
+ if (!materialized.ok) return denied(materialized.error);
597254
+ next[key] = materialized.path;
597255
+ if (materialized.cleanup) cleanup.push(materialized.cleanup);
597256
+ }
596511
597257
  const rawOutput = typeof next["output"] === "string" && String(next["output"]).trim() ? String(next["output"]) : typeof next["output_path"] === "string" && String(next["output_path"]).trim() ? String(next["output_path"]) : `tts-${Date.now()}.wav`;
596512
597258
  const guardedOutput = guardPath(rootAbs, rawOutput);
596513
597259
  if (!guardedOutput.ok) return denied(guardedOutput.error);
@@ -596517,16 +597263,20 @@ function scopedTool(base3, root, mode) {
596517
597263
  next["output"] = guardedOutput.path.abs;
596518
597264
  next["playback"] = false;
596519
597265
  }
596520
- const result2 = await base3.execute(next);
596521
- if (result2.success) {
596522
- if (base3.name === "generate_tts" && typeof next["output"] === "string") {
596523
- rememberCreated(rootAbs, String(next["output"]));
596524
- }
596525
- for (const path11 of collectGeneratedArtifactPathsFromText(result2.output, rootAbs)) {
596526
- rememberCreated(rootAbs, path11);
597266
+ try {
597267
+ const result2 = await base3.execute(next);
597268
+ if (result2.success) {
597269
+ if (base3.name === "generate_tts" && typeof next["output"] === "string") {
597270
+ rememberCreated(rootAbs, String(next["output"]));
597271
+ }
597272
+ for (const path11 of collectGeneratedArtifactPathsFromText(result2.output, rootAbs)) {
597273
+ rememberCreated(rootAbs, path11);
597274
+ }
596527
597275
  }
597276
+ return result2;
597277
+ } finally {
597278
+ for (const fn of cleanup) fn();
596528
597279
  }
596529
- return result2;
596530
597280
  }
596531
597281
  const pathKey = PATH_KEYS.find((key) => typeof next[key] === "string" && String(next[key]).trim());
596532
597282
  if (pathKey) {
@@ -596591,6 +597341,9 @@ function isInside(root, path11) {
596591
597341
  function looksLikeLocalPath(value2) {
596592
597342
  return value2.startsWith("/") || value2.startsWith("./") || value2.startsWith("../");
596593
597343
  }
597344
+ function looksLikeAudioPath(value2) {
597345
+ return looksLikeLocalPath(value2) || value2.startsWith("~/") || /\.(wav|mp3|flac|ogg|m4a)$/i.test(value2);
597346
+ }
596594
597347
  function manifestPath(root) {
596595
597348
  return join119(root, MANIFEST_FILE);
596596
597349
  }
@@ -596753,7 +597506,7 @@ function denied(error) {
596753
597506
  mutatedFiles: []
596754
597507
  };
596755
597508
  }
596756
- var MANIFEST_FILE, OBJECTS_DIR, SEND_DIR, PATH_KEYS, MEDIA_PATH_RE, PUBLIC_EXECUTABLE_ARTIFACT_EXTENSIONS, CreativeAudioFileTool;
597509
+ var MANIFEST_FILE, OBJECTS_DIR, SEND_DIR, PATH_KEYS, TTS_CLONE_SOURCE_KEYS, MEDIA_PATH_RE, PUBLIC_EXECUTABLE_ARTIFACT_EXTENSIONS, CreativeAudioFileTool;
596757
597510
  var init_telegram_creative_tools = __esm({
596758
597511
  "packages/cli/src/tui/telegram-creative-tools.ts"() {
596759
597512
  "use strict";
@@ -596762,6 +597515,7 @@ var init_telegram_creative_tools = __esm({
596762
597515
  OBJECTS_DIR = ".objects";
596763
597516
  SEND_DIR = ".send";
596764
597517
  PATH_KEYS = ["path", "file", "file_path", "filename", "filepath", "filePath"];
597518
+ TTS_CLONE_SOURCE_KEYS = ["sample", "source_audio", "voice_sample", "reference_audio", "ref_audio", "clone_sample"];
596765
597519
  MEDIA_PATH_RE = /(?:^|[\s([])(\/[^\s<>"')\]]+\.[A-Za-z0-9]{1,12})(?:$|[\s),.\]])/g;
596766
597520
  PUBLIC_EXECUTABLE_ARTIFACT_EXTENSIONS = /* @__PURE__ */ new Set([
596767
597521
  ".sh",
@@ -596836,9 +597590,16 @@ var init_telegram_creative_tools = __esm({
596836
597590
  input: { type: "string", description: "Alias for text" },
596837
597591
  prompt: { type: "string", description: "Alias for text" },
596838
597592
  path: { type: "string", description: "Output .wav path inside the creative workspace" },
596839
- backend: { type: "string", enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper", "espeak"], description: "TTS backend. Defaults to auto." },
596840
- voice: { type: "string", description: "Voice id/name for the selected TTS backend" },
597593
+ backend: { type: "string", enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper"], description: "TTS backend. Defaults to auto." },
597594
+ voice: { type: "string", description: "Voice id/name for the selected TTS backend, or a scoped source audio path for cloning" },
596841
597595
  clone_ref: { type: "string", description: "Optional LuxTTS clone reference" },
597596
+ sample: { type: "string", description: "Voice source clip path inside the creative workspace" },
597597
+ source_audio: { type: "string", description: "Alias for sample" },
597598
+ voice_sample: { type: "string", description: "Alias for sample" },
597599
+ reference_audio: { type: "string", description: "Alias for sample" },
597600
+ ref_audio: { type: "string", description: "Alias for sample" },
597601
+ clone_sample: { type: "string", description: "Alias for sample" },
597602
+ clone_name: { type: "string", description: "Optional name to register the source clip for later reuse" },
596842
597603
  model: { type: "string", description: "Optional backend model id or raw Piper/ONNX path" },
596843
597604
  speed: { type: "number", description: "Speech speed multiplier or backend-specific rate" }
596844
597605
  },
@@ -596857,26 +597618,57 @@ var init_telegram_creative_tools = __esm({
596857
597618
  if (!guarded.path.abs.toLowerCase().endsWith(".wav")) {
596858
597619
  return denied("create_audio_file currently writes WAV files; use a .wav output path.");
596859
597620
  }
596860
- await mkdir17(dirname33(guarded.path.abs), { recursive: true });
596861
- const tts = new TtsGenerateTool();
596862
- const result = await tts.execute({
596863
- text,
596864
- output: guarded.path.abs,
596865
- playback: false,
596866
- backend: args["backend"],
596867
- voice: args["voice"],
596868
- clone_ref: args["clone_ref"],
596869
- model: args["model"],
596870
- speed: args["speed"]
596871
- });
596872
- if (!result.success || !existsSync104(guarded.path.abs)) {
596873
- return {
596874
- success: false,
596875
- output: "",
596876
- error: `Audio synthesis failed through generate_tts.
597621
+ const cloneArgs = {};
597622
+ const cleanup = [];
597623
+ for (const key of TTS_CLONE_SOURCE_KEYS) {
597624
+ const value2 = args[key];
597625
+ if (typeof value2 !== "string" || !value2.trim()) continue;
597626
+ const materialized = materializeTelegramCreativeArtifactForSend(this.root, value2.trim());
597627
+ if (!materialized.ok) return denied(materialized.error);
597628
+ cloneArgs[key] = materialized.path;
597629
+ if (materialized.cleanup) cleanup.push(materialized.cleanup);
597630
+ }
597631
+ for (const key of ["clone_ref", "voice"]) {
597632
+ const value2 = args[key];
597633
+ if (typeof value2 !== "string" || !value2.trim() || !looksLikeAudioPath(value2.trim())) continue;
597634
+ const materialized = materializeTelegramCreativeArtifactForSend(this.root, value2.trim());
597635
+ if (!materialized.ok) return denied(materialized.error);
597636
+ cloneArgs[key] = materialized.path;
597637
+ if (materialized.cleanup) cleanup.push(materialized.cleanup);
597638
+ }
597639
+ let result;
597640
+ try {
597641
+ await mkdir17(dirname33(guarded.path.abs), { recursive: true });
597642
+ const tts = new TtsGenerateTool();
597643
+ result = await tts.execute({
597644
+ text,
597645
+ output: guarded.path.abs,
597646
+ playback: false,
597647
+ backend: args["backend"],
597648
+ voice: cloneArgs["voice"] ?? args["voice"],
597649
+ clone_ref: cloneArgs["clone_ref"] ?? args["clone_ref"],
597650
+ ...cloneArgs,
597651
+ sample: cloneArgs["sample"],
597652
+ source_audio: cloneArgs["source_audio"],
597653
+ voice_sample: cloneArgs["voice_sample"],
597654
+ reference_audio: cloneArgs["reference_audio"],
597655
+ ref_audio: cloneArgs["ref_audio"],
597656
+ clone_sample: cloneArgs["clone_sample"],
597657
+ clone_name: args["clone_name"],
597658
+ model: args["model"],
597659
+ speed: args["speed"]
597660
+ });
597661
+ if (!result.success || !existsSync104(guarded.path.abs)) {
597662
+ return {
597663
+ success: false,
597664
+ output: "",
597665
+ error: `Audio synthesis failed through generate_tts.
596877
597666
  ${(result.error || result.output || "").slice(0, 1200)}`,
596878
- durationMs: performance.now() - start2
596879
- };
597667
+ durationMs: performance.now() - start2
597668
+ };
597669
+ }
597670
+ } finally {
597671
+ for (const fn of cleanup) fn();
596880
597672
  }
596881
597673
  rememberCreated(this.root, guarded.path.abs);
596882
597674
  const sizeKB = Math.round(statSync35(guarded.path.abs).size / 1024);
@@ -596904,12 +597696,12 @@ __export(vision_ingress_exports, {
596904
597696
  queryVisionModel: () => queryVisionModel,
596905
597697
  runVisionIngress: () => runVisionIngress
596906
597698
  });
596907
- import { execFileSync as execFileSync4 } from "node:child_process";
597699
+ import { execFileSync as execFileSync5 } from "node:child_process";
596908
597700
  import { existsSync as existsSync105, readFileSync as readFileSync86, unlinkSync as unlinkSync20 } from "node:fs";
596909
597701
  import { join as join120 } from "node:path";
596910
597702
  function isTesseractAvailable() {
596911
597703
  try {
596912
- execFileSync4("tesseract", ["--version"], { stdio: "ignore", timeout: 3e3 });
597704
+ execFileSync5("tesseract", ["--version"], { stdio: "ignore", timeout: 3e3 });
596913
597705
  return true;
596914
597706
  } catch {
596915
597707
  return false;
@@ -596950,7 +597742,7 @@ function advancedOcr(imagePath) {
596950
597742
  for (const psm of psmModes) {
596951
597743
  const outFile = `${tmpBase}_psm${psm}`;
596952
597744
  try {
596953
- execFileSync4("tesseract", [
597745
+ execFileSync5("tesseract", [
596954
597746
  imagePath,
596955
597747
  outFile,
596956
597748
  "--psm",
@@ -597049,7 +597841,7 @@ var init_vision_ingress = __esm({
597049
597841
 
597050
597842
  // packages/cli/src/tui/telegram-bridge.ts
597051
597843
  import { mkdirSync as mkdirSync60, existsSync as existsSync106, unlinkSync as unlinkSync21, readdirSync as readdirSync36, statSync as statSync36, readFileSync as readFileSync87, writeFileSync as writeFileSync57 } from "node:fs";
597052
- import { join as join121, resolve as resolve39, basename as basename23, relative as relative13, isAbsolute as isAbsolute7 } from "node:path";
597844
+ import { join as join121, resolve as resolve39, basename as basename23, relative as relative13, isAbsolute as isAbsolute7, extname as extname15 } from "node:path";
597053
597845
  import { writeFile as writeFileAsync } from "node:fs/promises";
597054
597846
  import { createHash as createHash19, randomInt } from "node:crypto";
597055
597847
  function parseTelegramInteractionDecision(text, forcedRoute, options2 = {}) {
@@ -597247,6 +598039,19 @@ function summarizeTelegramMessageAttachments(msg) {
597247
598039
  parts.push(`caption: ${truncateTelegramContextLine(msg.media.caption, 180)}`);
597248
598040
  }
597249
598041
  }
598042
+ if (msg.replyToMedia) {
598043
+ const details = [
598044
+ msg.replyToMedia.type,
598045
+ msg.replyToMedia.mimeType,
598046
+ msg.replyToMedia.fileName,
598047
+ msg.replyToMedia.duration ? `${msg.replyToMedia.duration}s` : "",
598048
+ msg.replyToMedia.fileSize ? `${msg.replyToMedia.fileSize} bytes` : ""
598049
+ ].filter(Boolean).join(", ");
598050
+ parts.push(`replied-to media: ${details}`);
598051
+ if (msg.replyToMedia.caption) {
598052
+ parts.push(`replied-to caption: ${truncateTelegramContextLine(msg.replyToMedia.caption, 180)}`);
598053
+ }
598054
+ }
597250
598055
  if (msg.poll) {
597251
598056
  parts.push(`poll: ${truncateTelegramContextLine(msg.poll.question, 180)}`);
597252
598057
  }
@@ -597620,6 +598425,25 @@ function telegramImageMime(media) {
597620
598425
  if (ext === ".tif" || ext === ".tiff") return "image/tiff";
597621
598426
  return "image/jpeg";
597622
598427
  }
598428
+ function telegramCachedMediaIsImage(entry) {
598429
+ if (entry.mediaType === "photo") return true;
598430
+ if (entry.mimeType?.toLowerCase().startsWith("image/")) return true;
598431
+ return TELEGRAM_IMAGE_EXTENSIONS.has(extname15(entry.localPath).toLowerCase());
598432
+ }
598433
+ function telegramCachedMediaIsPdf(entry) {
598434
+ if (entry.mimeType?.toLowerCase() === "application/pdf") return true;
598435
+ return extname15(entry.localPath).toLowerCase() === ".pdf";
598436
+ }
598437
+ function telegramCachedMediaIsAudio(entry) {
598438
+ if (entry.mediaType === "audio" || entry.mediaType === "voice") return true;
598439
+ if (entry.mimeType?.toLowerCase().startsWith("audio/")) return true;
598440
+ return [".wav", ".mp3", ".flac", ".aac", ".m4a", ".ogg", ".opus"].includes(extname15(entry.localPath).toLowerCase());
598441
+ }
598442
+ function telegramCachedMediaIsVideo(entry) {
598443
+ if (entry.mediaType === "video" || entry.mediaType === "video_note" || entry.mediaType === "live_photo") return true;
598444
+ if (entry.mimeType?.toLowerCase().startsWith("video/")) return true;
598445
+ return [".mp4", ".mkv", ".avi", ".mov", ".webm"].includes(extname15(entry.localPath).toLowerCase());
598446
+ }
597623
598447
  function isPathInside(root, path11) {
597624
598448
  const rel = relative13(resolve39(root), resolve39(path11));
597625
598449
  return rel === "" || Boolean(rel) && !rel.startsWith("..") && !isAbsolute7(rel);
@@ -597653,6 +598477,10 @@ function normalizeTelegramUpdate(update2) {
597653
598477
  const username = message2.from?.username ?? message2.sender_chat?.username ?? "";
597654
598478
  const chatType = message2.chat?.type ?? "private";
597655
598479
  const media = normalizeTelegramMedia(message2);
598480
+ const replyTo = message2.reply_to_message && typeof message2.reply_to_message === "object" ? message2.reply_to_message : void 0;
598481
+ const replyToMedia = replyTo ? normalizeTelegramMedia(replyTo) : void 0;
598482
+ const replyToPoll = replyTo ? normalizeTelegramPoll(replyTo.poll) : void 0;
598483
+ const replyToText = replyTo ? replyTo.text || replyTo.caption || (replyToPoll ? formatTelegramPollSummary(replyToPoll) : "") : "";
597656
598484
  const poll = normalizeTelegramPoll(message2.poll);
597657
598485
  const livePhoto = normalizeTelegramLivePhoto(message2.live_photo);
597658
598486
  const text = message2.text || message2.caption || (poll ? formatTelegramPollSummary(poll) : "");
@@ -597667,6 +598495,8 @@ function normalizeTelegramUpdate(update2) {
597667
598495
  chatType,
597668
598496
  chatTitle: message2.chat?.title,
597669
598497
  media,
598498
+ replyToMedia,
598499
+ replyToText: replyToText || void 0,
597670
598500
  poll,
597671
598501
  livePhoto,
597672
598502
  guestQueryId: typeof message2.guest_query_id === "string" ? message2.guest_query_id : void 0,
@@ -597675,9 +598505,9 @@ function normalizeTelegramUpdate(update2) {
597675
598505
  isGuestMessage: sourceUpdateType === "guest_message",
597676
598506
  isDirectMessages: Boolean(message2.chat?.is_direct_messages),
597677
598507
  parentChatId: message2.chat?.parent_chat?.id ?? message2.direct_messages_topic?.parent_topic?.id,
597678
- replyToMessageId: message2.reply_to_message?.message_id,
597679
- replyToUsername: message2.reply_to_message?.from?.username ?? message2.reply_to_message?.sender_chat?.username,
597680
- replyToBot: Boolean(message2.reply_to_message?.from?.is_bot),
598508
+ replyToMessageId: replyTo?.message_id,
598509
+ replyToUsername: replyTo?.from?.username ?? replyTo?.sender_chat?.username,
598510
+ replyToBot: Boolean(replyTo?.from?.is_bot),
597681
598511
  mentionedUsernames: extractTelegramMentionedUsernames(message2, text),
597682
598512
  sourceUpdateType
597683
598513
  };
@@ -597824,7 +598654,7 @@ function renderTelegramSubAgentError(username, error) {
597824
598654
  process.stdout.write(` ${c3.dim("⎿")} ${c3.red("✘")} @${username}: ${c3.dim(preview)}
597825
598655
  `);
597826
598656
  }
597827
- var TELEGRAM_SAFETY_PROMPT, ADMIN_DM_PROMPT, ADMIN_GROUP_PROMPT, TELEGRAM_PUBLIC_SOUL_PROFILE, TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT, TELEGRAM_PUBLIC_MEMORY_SCOPE_CONTRACT, GROUP_REPLY_DISCRETION_PROMPT, TELEGRAM_CHAT_MODE_PROMPT, ADMIN_CHAT_PROFILE_PROMPT, TELEGRAM_ACTION_RESPONSE_CONTRACT, TELEGRAM_CHAT_HISTORY_LIMIT, TELEGRAM_CONTEXT_RECENT_DEFAULT, TELEGRAM_CONTEXT_LINE_LIMIT, TELEGRAM_CONTEXT_SAMPLE_LIMIT, TELEGRAM_MEMORY_CARD_LIMIT, TELEGRAM_MEMORY_NOTE_LIMIT, TELEGRAM_MEMORY_STOPWORDS, TELEGRAM_PUBLIC_HELP_COMMANDS, MEDIA_CACHE_TTL_MS, TelegramBridge;
598657
+ var TELEGRAM_SAFETY_PROMPT, ADMIN_DM_PROMPT, ADMIN_GROUP_PROMPT, TELEGRAM_PUBLIC_SOUL_PROFILE, TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT, TELEGRAM_PUBLIC_MEMORY_SCOPE_CONTRACT, GROUP_REPLY_DISCRETION_PROMPT, TELEGRAM_CHAT_MODE_PROMPT, ADMIN_CHAT_PROFILE_PROMPT, TELEGRAM_ACTION_RESPONSE_CONTRACT, TELEGRAM_CHAT_HISTORY_LIMIT, TELEGRAM_CONTEXT_RECENT_DEFAULT, TELEGRAM_CONTEXT_LINE_LIMIT, TELEGRAM_CONTEXT_SAMPLE_LIMIT, TELEGRAM_MEMORY_CARD_LIMIT, TELEGRAM_MEMORY_NOTE_LIMIT, TELEGRAM_MEMORY_STOPWORDS, TELEGRAM_PUBLIC_HELP_COMMANDS, TELEGRAM_IMAGE_EXTENSIONS, MEDIA_CACHE_TTL_MS, TelegramBridge;
597828
598658
  var init_telegram_bridge = __esm({
597829
598659
  "packages/cli/src/tui/telegram-bridge.ts"() {
597830
598660
  "use strict";
@@ -598020,6 +598850,7 @@ Telegram response contract:
598020
598850
  "your"
598021
598851
  ]);
598022
598852
  TELEGRAM_PUBLIC_HELP_COMMANDS = /* @__PURE__ */ new Set(["help", "start", "auth", "call"]);
598853
+ TELEGRAM_IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif", ".svg"]);
598023
598854
  MEDIA_CACHE_TTL_MS = 30 * 60 * 1e3;
598024
598855
  TelegramBridge = class {
598025
598856
  constructor(botToken, onMessage, agentConfig, repoRoot, toolPolicyConfig) {
@@ -598431,6 +599262,80 @@ Telegram response contract:
598431
599262
  }
598432
599263
  }
598433
599264
  }
599265
+ updateLastTelegramUserMessageText(msg, text) {
599266
+ const sessionKey = this.sessionKeyForMessage(msg);
599267
+ const history = this.chatHistory.get(sessionKey);
599268
+ if (!history || !text.trim()) return;
599269
+ for (let i2 = history.length - 1; i2 >= 0; i2--) {
599270
+ const entry = history[i2];
599271
+ if (entry.role !== "user") continue;
599272
+ if (entry.messageId === msg.messageId || !entry.messageId && entry.text === msg.text) {
599273
+ entry.text = text.trim();
599274
+ entry.mediaSummary = summarizeTelegramMessageAttachments(msg) || entry.mediaSummary;
599275
+ this.updateTelegramMemoryCards(sessionKey, entry);
599276
+ this.saveTelegramConversationState(sessionKey);
599277
+ return;
599278
+ }
599279
+ }
599280
+ }
599281
+ recentTelegramMediaEntries(chatId, limit = 12) {
599282
+ const now = Date.now();
599283
+ return [...this.mediaCache.values()].filter((entry) => {
599284
+ if (chatId !== void 0 && String(entry.chatId) !== String(chatId)) return false;
599285
+ return now - entry.cachedAt <= MEDIA_CACHE_TTL_MS;
599286
+ }).sort((a2, b) => b.cachedAt - a2.cachedAt).slice(0, limit);
599287
+ }
599288
+ telegramMediaEntryMatchesKind(entry, kind) {
599289
+ if (kind === "image") return telegramCachedMediaIsImage(entry);
599290
+ if (kind === "pdf") return telegramCachedMediaIsPdf(entry);
599291
+ if (kind === "audio") return telegramCachedMediaIsAudio(entry);
599292
+ if (kind === "video") return telegramCachedMediaIsVideo(entry);
599293
+ if (kind === "transcribable") {
599294
+ return telegramCachedMediaIsAudio(entry) || telegramCachedMediaIsVideo(entry);
599295
+ }
599296
+ return true;
599297
+ }
599298
+ resolveTelegramScopedMediaPath(rawValue, chatId, currentMsg, kind) {
599299
+ const raw = String(rawValue ?? "").trim();
599300
+ const repoRoot = this.repoRoot || ".";
599301
+ const creativeRoot = telegramCreativeWorkspaceRoot(repoRoot, chatId);
599302
+ const mediaEntries = this.recentTelegramMediaEntries(chatId, 60).filter((entry) => this.telegramMediaEntryMatchesKind(entry, kind));
599303
+ const aliases = /* @__PURE__ */ new Set(["", "latest", "last", "current", "this", "that", "it", "reply", "replied", "replied-to", "replied_to"]);
599304
+ if (aliases.has(raw.toLowerCase())) {
599305
+ const replied = currentMsg?.replyToMessageId ? mediaEntries.find((entry2) => entry2.messageId === currentMsg.replyToMessageId) : void 0;
599306
+ const entry = replied ?? mediaEntries[0];
599307
+ if (!entry) {
599308
+ return { ok: false, error: `No recent ${kind} media is available in this Telegram chat scope.` };
599309
+ }
599310
+ return { ok: true, path: entry.localPath };
599311
+ }
599312
+ const matchingEntry = mediaEntries.find((entry) => {
599313
+ if (resolve39(entry.localPath) === resolve39(raw)) return true;
599314
+ if (basename23(entry.localPath) === raw) return true;
599315
+ if (entry.fileUniqueId === raw || entry.fileId === raw) return true;
599316
+ if (entry.messageId && String(entry.messageId) === raw) return true;
599317
+ return false;
599318
+ });
599319
+ if (matchingEntry) return { ok: true, path: matchingEntry.localPath };
599320
+ const creativeCandidate = isAbsolute7(raw) ? resolve39(raw) : resolve39(creativeRoot, raw);
599321
+ if (isPathInside(creativeRoot, creativeCandidate) && existsSync106(creativeCandidate)) {
599322
+ return { ok: true, path: creativeCandidate };
599323
+ }
599324
+ return {
599325
+ ok: false,
599326
+ error: `Path is outside this Telegram chat's media/workspace scope or does not exist: ${raw || "(empty)"}`
599327
+ };
599328
+ }
599329
+ resolveTelegramScopedOutputPath(rawValue, chatId, fallbackName) {
599330
+ const repoRoot = this.repoRoot || ".";
599331
+ const creativeRoot = telegramCreativeWorkspaceRoot(repoRoot, chatId);
599332
+ const raw = String(rawValue || fallbackName).trim() || fallbackName;
599333
+ const outputPath2 = isAbsolute7(raw) ? resolve39(raw) : resolve39(creativeRoot, raw);
599334
+ if (!isPathInside(creativeRoot, outputPath2)) {
599335
+ return { ok: false, error: `Output path must stay inside this Telegram chat's creative workspace: ${raw}` };
599336
+ }
599337
+ return { ok: true, path: outputPath2 };
599338
+ }
598434
599339
  updateTelegramParticipantProfile(sessionKey, msg, text) {
598435
599340
  const participantKey = String(msg.fromUserId || msg.username || msg.firstName || "unknown");
598436
599341
  const participants = this.chatParticipants.get(sessionKey) ?? /* @__PURE__ */ new Map();
@@ -598605,6 +599510,22 @@ ${notes2}`;
598605
599510
  sections.push(`### Zettelkasten Memory Recall
598606
599511
  ${cardLines.join("\n")}`);
598607
599512
  }
599513
+ const recentMedia = this.recentTelegramMediaEntries(msg.chatId, 10);
599514
+ if (recentMedia.length > 0) {
599515
+ const mediaLines = recentMedia.map((entry) => {
599516
+ const kind = telegramCachedMediaIsImage(entry) ? "image" : entry.mediaType;
599517
+ const replyMark = msg.replyToMessageId && entry.messageId === msg.replyToMessageId ? " replied-to" : "";
599518
+ const caption = entry.caption ? ` caption:${truncateTelegramContextLine(entry.caption, 120)}` : "";
599519
+ const extracted = entry.extractedContent ? `
599520
+ ${truncateTelegramContextLine(entry.extractedContent.replace(/\s+/g, " "), 220)}` : "";
599521
+ return `- message_id ${entry.messageId}${replyMark}: ${kind}; path ${entry.localPath}; file ${basename23(entry.localPath)}${caption}${extracted}`;
599522
+ });
599523
+ sections.push([
599524
+ "### Recent Chat Media",
599525
+ "Use these paths only as tool inputs when the user asks about media in this chat. Do not quote local paths in the visible Telegram reply.",
599526
+ mediaLines.join("\n")
599527
+ ].join("\n"));
599528
+ }
598608
599529
  if (olderCount > 0) {
598609
599530
  const older = history.slice(0, olderCount);
598610
599531
  const bySpeaker = /* @__PURE__ */ new Map();
@@ -599301,8 +600222,8 @@ Join: ${newUrl}`);
599301
600222
  }
599302
600223
  }
599303
600224
  let steeringText = msg.text;
599304
- if (msg.media) {
599305
- const mediaContext = await this.processMedia(msg);
600225
+ if (msg.media || msg.replyToMedia) {
600226
+ const mediaContext = await this.processMediaContextForMessage(msg);
599306
600227
  if (mediaContext) {
599307
600228
  steeringText += `
599308
600229
 
@@ -599376,8 +600297,8 @@ ${mediaContext}`;
599376
600297
  this.tuiWrite(() => renderTelegramSubAgentStart(msg.username, msg.text, isAdminDM));
599377
600298
  try {
599378
600299
  let mediaContext = "";
599379
- if (msg.media) {
599380
- mediaContext = await this.processMedia(msg);
600300
+ if (msg.media || msg.replyToMedia) {
600301
+ mediaContext = await this.processMediaContextForMessage(msg);
599381
600302
  }
599382
600303
  const result = await this.runSubAgent(msg, subAgent, mediaContext);
599383
600304
  if (subAgent.typingInterval) {
@@ -599479,8 +600400,8 @@ ${mediaContext}`;
599479
600400
  this.tuiWrite(() => renderTelegramSubAgentEvent(msg.username, `admin chat with full context/tools (${this.interactionMode})`));
599480
600401
  try {
599481
600402
  let mediaContext = "";
599482
- if (msg.media) {
599483
- mediaContext = await this.processMedia(msg);
600403
+ if (msg.media || msg.replyToMedia) {
600404
+ mediaContext = await this.processMediaContextForMessage(msg);
599484
600405
  }
599485
600406
  const result = await this.runSubAgent(msg, subAgent, mediaContext, "chat");
599486
600407
  if (subAgent.typingInterval) {
@@ -599563,7 +600484,7 @@ ${mediaContext}`;
599563
600484
  }
599564
600485
  this.tuiWrite(() => renderTelegramSubAgentEvent(msg.username, `live inference: chat reply (${this.interactionMode})`));
599565
600486
  try {
599566
- const mediaContext = msg.media || msg.livePhoto ? "Attachment received. Quick-chat mode does not inspect media; use action mode for media analysis." : "";
600487
+ const mediaContext = msg.media || msg.replyToMedia || msg.livePhoto ? await this.processMediaContextForMessage(msg) : "";
599567
600488
  const finalText = await this.runTelegramChatCompletion(
599568
600489
  msg,
599569
600490
  toolContext,
@@ -600056,6 +600977,128 @@ ${lines.join("\n\n")}` };
600056
600977
  }
600057
600978
  };
600058
600979
  }
600980
+ if (tool.name === "image_read") {
600981
+ return {
600982
+ ...tool,
600983
+ description: "Read only images from this Telegram chat's media cache or creative workspace. Use path='reply' for the replied-to image or path='latest' for the most recent chat image.",
600984
+ execute: async (args) => {
600985
+ const resolved = this.resolveTelegramScopedMediaPath(args["path"], chatId, currentMsg, "image");
600986
+ if (!resolved.ok) return { success: false, output: "", error: resolved.error };
600987
+ return tool.execute({ ...args, path: resolved.path });
600988
+ }
600989
+ };
600990
+ }
600991
+ if (tool.name === "ocr") {
600992
+ return {
600993
+ ...tool,
600994
+ description: "Extract text only from images in this Telegram chat's media cache or creative workspace. Use path='reply' or path='latest' for chat media references.",
600995
+ execute: async (args) => {
600996
+ const resolved = this.resolveTelegramScopedMediaPath(args["path"], chatId, currentMsg, "image");
600997
+ if (!resolved.ok) return { success: false, output: "", error: resolved.error };
600998
+ return tool.execute({ ...args, path: resolved.path });
600999
+ }
601000
+ };
601001
+ }
601002
+ if (tool.name === "vision") {
601003
+ return {
601004
+ ...tool,
601005
+ description: "Analyze only images from this Telegram chat's media cache or creative workspace. Use image='reply' for the replied-to image or image='latest' for the most recent chat image.",
601006
+ execute: async (args) => {
601007
+ const resolved = this.resolveTelegramScopedMediaPath(args["image"], chatId, currentMsg, "image");
601008
+ if (!resolved.ok) return { success: false, output: "", error: resolved.error };
601009
+ return tool.execute({ ...args, image: resolved.path });
601010
+ }
601011
+ };
601012
+ }
601013
+ if (tool.name === "ocr_image_advanced") {
601014
+ return {
601015
+ ...tool,
601016
+ description: "Advanced OCR only for images in this Telegram chat's media cache or creative workspace. Batch directory mode is disabled in public Telegram scope.",
601017
+ execute: async (args) => {
601018
+ if (args["batch"] === true) return { success: false, output: "", error: "Batch directory OCR is not available in public Telegram scope." };
601019
+ const resolved = this.resolveTelegramScopedMediaPath(args["image"], chatId, currentMsg, "image");
601020
+ if (!resolved.ok) return { success: false, output: "", error: resolved.error };
601021
+ const next = { ...args, image: resolved.path };
601022
+ if (typeof next["output_dir"] === "string" && next["output_dir"].trim()) {
601023
+ const output = this.resolveTelegramScopedOutputPath(next["output_dir"], chatId, "ocr-output");
601024
+ if (!output.ok) return { success: false, output: "", error: output.error };
601025
+ next["output_dir"] = output.path;
601026
+ }
601027
+ return tool.execute(next);
601028
+ }
601029
+ };
601030
+ }
601031
+ if (tool.name === "transcribe_file") {
601032
+ return {
601033
+ ...tool,
601034
+ description: "Transcribe only audio/video files from this Telegram chat's media cache or creative workspace. Use path='reply' or path='latest' for chat media references.",
601035
+ execute: async (args) => {
601036
+ const resolved = this.resolveTelegramScopedMediaPath(args["path"], chatId, currentMsg, "transcribable");
601037
+ if (!resolved.ok) return { success: false, output: "", error: resolved.error };
601038
+ return tool.execute({ ...args, path: resolved.path });
601039
+ }
601040
+ };
601041
+ }
601042
+ if (tool.name === "pdf_to_text") {
601043
+ return {
601044
+ ...tool,
601045
+ description: "Extract text only from PDFs in this Telegram chat's media cache or creative workspace. Use path='reply' or path='latest' for chat document references.",
601046
+ execute: async (args) => {
601047
+ const resolved = this.resolveTelegramScopedMediaPath(args["path"], chatId, currentMsg, "pdf");
601048
+ if (!resolved.ok) return { success: false, output: "", error: resolved.error };
601049
+ return tool.execute({ ...args, path: resolved.path });
601050
+ }
601051
+ };
601052
+ }
601053
+ if (tool.name === "ocr_pdf") {
601054
+ return {
601055
+ ...tool,
601056
+ description: "OCR only PDFs from this Telegram chat's media cache or creative workspace. Output, when requested, is forced into this chat's creative workspace.",
601057
+ execute: async (args) => {
601058
+ const input = this.resolveTelegramScopedMediaPath(args["input"], chatId, currentMsg, "pdf");
601059
+ if (!input.ok) return { success: false, output: "", error: input.error };
601060
+ const next = { ...args, input: input.path };
601061
+ if (typeof next["output"] === "string" && next["output"].trim()) {
601062
+ const output = this.resolveTelegramScopedOutputPath(next["output"], chatId, `ocr-${Date.now()}.pdf`);
601063
+ if (!output.ok) return { success: false, output: "", error: output.error };
601064
+ next["output"] = output.path;
601065
+ }
601066
+ return tool.execute(next);
601067
+ }
601068
+ };
601069
+ }
601070
+ if (tool.name === "video_understand") {
601071
+ return {
601072
+ ...tool,
601073
+ description: "Analyze only video files from this Telegram chat's media cache or creative workspace. URL download is disabled in public Telegram scope; use path='reply' or path='latest'.",
601074
+ execute: async (args) => {
601075
+ if (args["url"]) return { success: false, output: "", error: "URL video analysis is not available in public Telegram scope. Use a video posted in this chat." };
601076
+ const resolved = this.resolveTelegramScopedMediaPath(args["path"], chatId, currentMsg, "video");
601077
+ if (!resolved.ok) return { success: false, output: "", error: resolved.error };
601078
+ return tool.execute({ ...args, path: resolved.path });
601079
+ }
601080
+ };
601081
+ }
601082
+ if (tool.name === "audio_analyze") {
601083
+ return {
601084
+ ...tool,
601085
+ description: "Analyze only audio files from this Telegram chat's media cache or creative workspace. Microphone/listen mode is disabled in public Telegram scope.",
601086
+ execute: async (args) => {
601087
+ if (String(args["action"] || "").toLowerCase() === "listen") {
601088
+ return { success: false, output: "", error: "Continuous microphone listening is not available in Telegram public scope." };
601089
+ }
601090
+ const resolved = this.resolveTelegramScopedMediaPath(args["file"] ?? args["path"], chatId, currentMsg, "audio");
601091
+ if (!resolved.ok) return { success: false, output: "", error: resolved.error };
601092
+ return tool.execute({ ...args, file: resolved.path, path: resolved.path });
601093
+ }
601094
+ };
601095
+ }
601096
+ if (tool.name === "explore_tools") {
601097
+ return {
601098
+ ...tool,
601099
+ description: "List and explain the tools available in this Telegram public/group scope. Do not invent unavailable tool names."
601100
+ };
601101
+ }
600059
601102
  return tool;
600060
601103
  });
600061
601104
  }
@@ -600219,11 +601262,16 @@ Scoped workspace: ${scopedRoot}`,
600219
601262
  new ImageReadTool(repoRoot),
600220
601263
  new OCRTool(repoRoot),
600221
601264
  new VisionTool(repoRoot),
601265
+ new OcrImageAdvancedTool(repoRoot),
600222
601266
  new OcrPdfTool(repoRoot),
600223
601267
  new PdfToTextTool(repoRoot),
600224
601268
  // Transcription tools
600225
601269
  new TranscribeFileTool(repoRoot),
600226
- new TranscribeUrlTool(repoRoot)
601270
+ new TranscribeUrlTool(repoRoot),
601271
+ new VideoUnderstandTool(repoRoot),
601272
+ new AudioAnalyzeTool(),
601273
+ new ExploreToolsTool(),
601274
+ this.buildTelegramMediaRecentTool(chatId, msg)
600227
601275
  ];
600228
601276
  const adminTools = [
600229
601277
  new ShellTool(repoRoot),
@@ -600326,6 +601374,55 @@ Scoped workspace: ${scopedRoot}`,
600326
601374
  ]);
600327
601375
  return tools.filter((tool) => !blocked.has(tool.name));
600328
601376
  }
601377
+ buildTelegramMediaRecentTool(chatId, currentMsg) {
601378
+ const bridge = this;
601379
+ return {
601380
+ name: "telegram_media_recent",
601381
+ description: "List recent media files available in this Telegram chat scope, including safe aliases for image_read, ocr, vision, transcribe_file, pdf_to_text, video_understand, and audio_analyze.",
601382
+ parameters: {
601383
+ type: "object",
601384
+ properties: {
601385
+ kind: {
601386
+ type: "string",
601387
+ enum: ["media", "image", "audio", "video", "pdf", "transcribable"],
601388
+ description: "Filter by media kind. Defaults to all recent chat media."
601389
+ },
601390
+ limit: { type: "number", description: "Maximum entries to return, 1-20. Default: 10." }
601391
+ }
601392
+ },
601393
+ async execute(args) {
601394
+ const start2 = performance.now();
601395
+ const kind = String(args["kind"] || "media").toLowerCase();
601396
+ const limit = typeof args["limit"] === "number" && Number.isFinite(args["limit"]) ? Math.max(1, Math.min(20, Math.floor(args["limit"]))) : 10;
601397
+ const entries = bridge.recentTelegramMediaEntries(chatId, 60).filter((entry) => bridge.telegramMediaEntryMatchesKind(entry, kind)).slice(0, limit);
601398
+ if (entries.length === 0) {
601399
+ return { success: true, output: `No recent ${kind} media is available in this Telegram chat scope.`, durationMs: performance.now() - start2 };
601400
+ }
601401
+ const lines = entries.map((entry, index) => {
601402
+ const parts = [
601403
+ `${index + 1}. message_id ${entry.messageId || "unknown"}`,
601404
+ currentMsg?.replyToMessageId === entry.messageId ? "replied-to" : "",
601405
+ telegramCachedMediaIsImage(entry) ? "image" : telegramCachedMediaIsPdf(entry) ? "pdf" : telegramCachedMediaIsAudio(entry) ? "audio" : telegramCachedMediaIsVideo(entry) ? "video" : entry.mediaType,
601406
+ `file=${basename23(entry.localPath)}`,
601407
+ `path=${entry.localPath}`,
601408
+ entry.caption ? `caption=${truncateTelegramContextLine(entry.caption, 140)}` : ""
601409
+ ].filter(Boolean);
601410
+ const extracted = entry.extractedContent ? `
601411
+ context: ${truncateTelegramContextLine(entry.extractedContent.replace(/\s+/g, " "), 240)}` : "";
601412
+ return `${parts.join("; ")}${extracted}`;
601413
+ });
601414
+ return {
601415
+ success: true,
601416
+ output: [
601417
+ "Recent scoped Telegram media:",
601418
+ "Use path='reply' for replied-to media, path='latest' for the most recent matching item, or one of the listed paths.",
601419
+ lines.join("\n")
601420
+ ].join("\n"),
601421
+ durationMs: performance.now() - start2
601422
+ };
601423
+ }
601424
+ };
601425
+ }
600329
601426
  imageGenerationDefaultsForRepo(repoRoot) {
600330
601427
  const settings = resolveSettings(repoRoot);
600331
601428
  return {
@@ -600543,30 +601640,36 @@ ${knownList}` : "Private-user telegram_send_file target must be this DM or a kno
600543
601640
  * Downloads the file, runs it through the appropriate pipeline,
600544
601641
  * caches it, and returns a text description for the agent.
600545
601642
  */
600546
- async processMedia(msg) {
600547
- if (!msg.media) return "";
600548
- const { type, fileId, fileUniqueId, mimeType, caption } = msg.media;
600549
- const isImageMedia = telegramMediaIsImage(msg.media);
601643
+ async processMedia(msg, source = "message") {
601644
+ const media = source === "reply" ? msg.replyToMedia : msg.media;
601645
+ if (!media) return "";
601646
+ const { type, fileId, fileUniqueId, mimeType, caption } = media;
601647
+ const isImageMedia = telegramMediaIsImage(media);
601648
+ const sourceMessageId = source === "reply" ? msg.replyToMessageId : msg.messageId;
601649
+ const sourceLabel = source === "reply" ? "replied-to " : "";
600550
601650
  let ext = ".bin";
600551
- if (isImageMedia) ext = telegramImageExtension(msg.media);
601651
+ if (isImageMedia) ext = telegramImageExtension(media);
600552
601652
  else if (type === "audio" || type === "voice") ext = ".ogg";
600553
601653
  else if (type === "video" || type === "video_note" || type === "live_photo") ext = ".mp4";
600554
- else if (msg.media.fileName) {
600555
- const dotIdx = msg.media.fileName.lastIndexOf(".");
600556
- if (dotIdx >= 0) ext = msg.media.fileName.slice(dotIdx);
601654
+ else if (media.fileName) {
601655
+ const dotIdx = media.fileName.lastIndexOf(".");
601656
+ if (dotIdx >= 0) ext = media.fileName.slice(dotIdx);
600557
601657
  }
600558
601658
  const localPath = await this.downloadTelegramFile(fileId, ext);
600559
601659
  if (!localPath) return `[Media: ${type} — failed to download]`;
600560
601660
  const cacheEntry = {
600561
601661
  localPath,
600562
601662
  fileId,
601663
+ fileUniqueId,
600563
601664
  chatId: msg.chatId,
601665
+ messageId: sourceMessageId ?? 0,
600564
601666
  username: msg.username,
600565
601667
  mediaType: type,
600566
601668
  mimeType,
601669
+ caption,
600567
601670
  cachedAt: Date.now()
600568
601671
  };
600569
- this.mediaCache.set(fileUniqueId, cacheEntry);
601672
+ this.mediaCache.set(`${String(msg.chatId)}:${String(sourceMessageId ?? 0)}:${fileUniqueId}`, cacheEntry);
600570
601673
  const metadataKey = String(msg.chatId);
600571
601674
  if (!this.mediaMetadata.has(metadataKey)) {
600572
601675
  this.mediaMetadata.set(metadataKey, []);
@@ -600587,7 +601690,7 @@ ${knownList}` : "Private-user telegram_send_file target must be this DM or a kno
600587
601690
  {
600588
601691
  path: localPath,
600589
601692
  buffer: readFileSync87(localPath),
600590
- mime: telegramImageMime(msg.media)
601693
+ mime: telegramImageMime(media)
600591
601694
  },
600592
601695
  this.agentConfig?.model ?? ""
600593
601696
  );
@@ -600596,10 +601699,10 @@ ${knownList}` : "Private-user telegram_send_file target must be this DM or a kno
600596
601699
  } catch {
600597
601700
  }
600598
601701
  if (visionContext) {
600599
- description = `[Image received: ${localPath}${caption ? ` — caption: "${caption}"` : ""}
601702
+ description = `[${sourceLabel}image received: ${localPath}${caption ? ` — caption: "${caption}"` : ""}
600600
601703
  ${visionContext}]`;
600601
601704
  } else {
600602
- description = `[Image received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use image_read or vision tools to analyze it if available.]`;
601705
+ description = `[${sourceLabel}image received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use image_read, ocr, or vision tools to analyze it.]`;
600603
601706
  }
600604
601707
  try {
600605
601708
  await fetch("http://127.0.0.1:11435/v1/memory/ingest", {
@@ -600623,9 +601726,9 @@ ${visionContext}]`;
600623
601726
  } catch {
600624
601727
  }
600625
601728
  if (transcription) {
600626
- description = `[Voice message transcribed: "${transcription}"${caption ? ` — caption: "${caption}"` : ""}]`;
601729
+ description = `[${sourceLabel}voice message transcribed: "${transcription}"${caption ? ` — caption: "${caption}"` : ""}]`;
600627
601730
  } else {
600628
- description = `[Audio/voice message received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use transcribe_file to transcribe it if available.]`;
601731
+ description = `[${sourceLabel}audio/voice message received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use transcribe_file to transcribe it.]`;
600629
601732
  }
600630
601733
  try {
600631
601734
  await fetch("http://127.0.0.1:11435/v1/memory/ingest", {
@@ -600638,13 +601741,30 @@ ${visionContext}]`;
600638
601741
  }
600639
601742
  } else if (type === "video" || type === "video_note" || type === "live_photo") {
600640
601743
  const label = type === "live_photo" ? "Live photo" : "Video";
600641
- description = `[${label} received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}.]`;
601744
+ description = `[${sourceLabel}${label.toLowerCase()} received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use video_understand or transcribe_file to analyze it.]`;
600642
601745
  } else if (type === "document") {
600643
- description = `[Document received: ${msg.media.fileName || "unnamed"}${mimeType ? ` (${mimeType})` : ""}, saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}.]`;
601746
+ description = `[${sourceLabel}document received: ${media.fileName || "unnamed"}${mimeType ? ` (${mimeType})` : ""}, saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}.]`;
600644
601747
  }
600645
601748
  cacheEntry.extractedContent = description;
600646
601749
  return description;
600647
601750
  }
601751
+ async processMediaContextForMessage(msg) {
601752
+ const parts = [];
601753
+ if (msg.media) {
601754
+ const current = await this.processMedia(msg, "message");
601755
+ if (current) parts.push(current);
601756
+ }
601757
+ if (msg.replyToMedia) {
601758
+ const replied = await this.processMedia(msg, "reply");
601759
+ if (replied) parts.push(replied);
601760
+ }
601761
+ const text = parts.join("\n\n");
601762
+ if (text) this.updateLastTelegramUserMessageText(msg, `${msg.text}
601763
+
601764
+ [Media context]
601765
+ ${text}`.trim());
601766
+ return text;
601767
+ }
600648
601768
  /** Clean up expired media cache entries (older than 30 minutes) */
600649
601769
  cleanupMediaCache() {
600650
601770
  const now = Date.now();
@@ -625230,7 +626350,7 @@ var clipboard_media_exports = {};
625230
626350
  __export(clipboard_media_exports, {
625231
626351
  pasteClipboardImageToFile: () => pasteClipboardImageToFile
625232
626352
  });
625233
- import { execFileSync as execFileSync5, execSync as execSync58 } from "node:child_process";
626353
+ import { execFileSync as execFileSync6, execSync as execSync58 } from "node:child_process";
625234
626354
  import { mkdirSync as mkdirSync72, readFileSync as readFileSync99, rmSync as rmSync5, writeFileSync as writeFileSync67 } from "node:fs";
625235
626355
  import { join as join136 } from "node:path";
625236
626356
  function pasteClipboardImageToFile(repoRoot) {
@@ -625247,7 +626367,7 @@ function readClipboardImage() {
625247
626367
  try {
625248
626368
  execSync58("command -v pngpaste", { stdio: "ignore", timeout: 1e3 });
625249
626369
  const tmp = `/tmp/omnius-clipboard-${Date.now()}.png`;
625250
- execFileSync5("pngpaste", [tmp], { timeout: 3e3 });
626370
+ execFileSync6("pngpaste", [tmp], { timeout: 3e3 });
625251
626371
  const buffer2 = readFileSync99(tmp);
625252
626372
  try {
625253
626373
  rmSync5(tmp);
@@ -625267,7 +626387,7 @@ function readClipboardImage() {
625267
626387
  ];
625268
626388
  for (const attempt of attempts) {
625269
626389
  try {
625270
- const buffer2 = execFileSync5(attempt.cmd, attempt.args, { timeout: 3e3, maxBuffer: 25 * 1024 * 1024 });
626390
+ const buffer2 = execFileSync6(attempt.cmd, attempt.args, { timeout: 3e3, maxBuffer: 25 * 1024 * 1024 });
625271
626391
  if (buffer2.length > 0) return { buffer: buffer2, mime: attempt.mime, ext: attempt.ext };
625272
626392
  } catch {
625273
626393
  continue;
@@ -625284,7 +626404,7 @@ function readClipboardImage() {
625284
626404
  "$img.Save($ms,[Drawing.Imaging.ImageFormat]::Png);",
625285
626405
  "[Console]::OpenStandardOutput().Write($ms.ToArray(),0,$ms.Length)"
625286
626406
  ].join("");
625287
- const buffer2 = execFileSync5("powershell.exe", ["-NoProfile", "-Command", ps], {
626407
+ const buffer2 = execFileSync6("powershell.exe", ["-NoProfile", "-Command", ps], {
625288
626408
  timeout: 5e3,
625289
626409
  maxBuffer: 25 * 1024 * 1024
625290
626410
  });
@@ -625303,7 +626423,7 @@ var init_clipboard_media = __esm({
625303
626423
 
625304
626424
  // packages/cli/src/tui/interactive.ts
625305
626425
  import { cwd } from "node:process";
625306
- import { resolve as resolve44, join as join137, dirname as dirname38, extname as extname15, relative as relative14 } from "node:path";
626426
+ import { resolve as resolve44, join as join137, dirname as dirname38, extname as extname16, relative as relative14 } from "node:path";
625307
626427
  import { createRequire as createRequire8 } from "node:module";
625308
626428
  import { fileURLToPath as fileURLToPath18 } from "node:url";
625309
626429
  import {
@@ -632605,7 +633725,7 @@ Execute this skill now. Follow the behavioral guidance above.`;
632605
633725
  const imgPath = resolve44(repoRoot, cleanPath);
632606
633726
  const imgBuffer = readFileSync100(imgPath);
632607
633727
  const base642 = imgBuffer.toString("base64");
632608
- const ext = extname15(cleanPath).toLowerCase();
633728
+ const ext = extname16(cleanPath).toLowerCase();
632609
633729
  const mime = ext === ".png" ? "image/png" : ext === ".gif" ? "image/gif" : ext === ".webp" ? "image/webp" : "image/jpeg";
632610
633730
  const asciiContext = await renderAsciiPreviewForImage(
632611
633731
  imgPath,