omnius 1.0.20 → 1.0.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/dist/index.js +1377 -257
- package/npm-shrinkwrap.json +2 -2
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -1474,7 +1474,7 @@ var init_security_classifier = __esm({
|
|
|
1474
1474
|
// ── Network reads (safe)
|
|
1475
1475
|
{ match: /^(web_search|web_fetch)$/, info: NETWORK_READ },
|
|
1476
1476
|
// ── Network outbound (mutating or remote inference)
|
|
1477
|
-
{ match: /^(image_generate|generate_image|vision|video_understand)$/, info: NETWORK_OUTBOUND },
|
|
1477
|
+
{ match: /^(image_generate|generate_image|generate_audio|generate_tts|create_audio_file|vision|video_understand|telegram_send_file)$/, info: NETWORK_OUTBOUND },
|
|
1478
1478
|
{ match: /^(transcribe_file|transcribe_url|youtube_download)$/, info: NETWORK_OUTBOUND },
|
|
1479
1479
|
{ match: /^(fortemi_bridge)$/, info: NETWORK_OUTBOUND },
|
|
1480
1480
|
// ── Memory tools
|
|
@@ -1491,7 +1491,7 @@ var init_security_classifier = __esm({
|
|
|
1491
1491
|
{ match: /^(file_read|file_explore|list_directory|grep_search|glob_find|find_files)$/, info: LOCAL_READ },
|
|
1492
1492
|
{ match: /^(image_read|ocr|ocr_pdf|ocr_image_advanced|pdf_to_text|structured_read|read_structured_file)$/, info: LOCAL_READ },
|
|
1493
1493
|
{ match: /^(symbol_search|impact_analysis|code_neighbors|repo_map|codebase_map|semantic_map|import_graph)$/, info: LOCAL_READ },
|
|
1494
|
-
{ match: /^(diagnostic|git_info|environment_snapshot|process_health|todo_read|explore_tools)$/, info: LOCAL_READ },
|
|
1494
|
+
{ match: /^(diagnostic|git_info|environment_snapshot|process_health|todo_read|explore_tools|telegram_media_recent)$/, info: LOCAL_READ },
|
|
1495
1495
|
{ match: /^(log_explore|log_packet|change_log|phase_recall|code_graph)$/, info: LOCAL_READ },
|
|
1496
1496
|
{ match: /^skill_(list|execute|read)$/, info: LOCAL_READ },
|
|
1497
1497
|
// ── Task completion (neutral signal)
|
|
@@ -5733,13 +5733,20 @@ var init_explore_tools = __esm({
|
|
|
5733
5733
|
diagnostic: "Run project diagnostics (build, test, lint)",
|
|
5734
5734
|
image_read: "Read and describe image contents",
|
|
5735
5735
|
screenshot: "Capture a screenshot of the desktop",
|
|
5736
|
+
ocr: "Extract text from images via OCR",
|
|
5736
5737
|
ocr_image: "Extract text from images via OCR",
|
|
5738
|
+
ocr_image_advanced: "Advanced OCR for images with layout-aware extraction",
|
|
5737
5739
|
ocr_pdf: "Extract text from PDF pages via OCR",
|
|
5738
5740
|
pdf_to_text: "Convert PDF to plain text",
|
|
5739
5741
|
vision: "Describe what's on screen using Moondream",
|
|
5742
|
+
video_understand: "Analyze a video file with transcription and keyframe understanding",
|
|
5743
|
+
audio_analyze: "Classify sounds, detect speech, inspect spectrum, or analyze audio files",
|
|
5740
5744
|
desktop_click: "Click at coordinates on the desktop",
|
|
5741
5745
|
desktop_describe: "Describe a region of the desktop",
|
|
5742
5746
|
transcribe_file: "Transcribe audio/video files to text",
|
|
5747
|
+
telegram_media_recent: "List recent Telegram media available in the current chat scope",
|
|
5748
|
+
generate_audio: "Generate sound effects or music with local model backends",
|
|
5749
|
+
generate_tts: "Generate speech from text with configured voice/TTS backends",
|
|
5743
5750
|
create_tool: "Create a new custom tool from a workflow",
|
|
5744
5751
|
manage_tools: "List, inspect, or remove custom tools",
|
|
5745
5752
|
skill_list: "List available AIWG skills",
|
|
@@ -84452,7 +84459,7 @@ var require_mime_types = __commonJS({
|
|
|
84452
84459
|
"../node_modules/mime-types/index.js"(exports) {
|
|
84453
84460
|
"use strict";
|
|
84454
84461
|
var db = require_mime_db();
|
|
84455
|
-
var
|
|
84462
|
+
var extname17 = __require("path").extname;
|
|
84456
84463
|
var EXTRACT_TYPE_REGEXP = /^\s*([^;\s]*)(?:;|\s|$)/;
|
|
84457
84464
|
var TEXT_TYPE_REGEXP = /^text\//i;
|
|
84458
84465
|
exports.charset = charset;
|
|
@@ -84506,7 +84513,7 @@ var require_mime_types = __commonJS({
|
|
|
84506
84513
|
if (!path11 || typeof path11 !== "string") {
|
|
84507
84514
|
return false;
|
|
84508
84515
|
}
|
|
84509
|
-
var extension4 =
|
|
84516
|
+
var extension4 = extname17("x." + path11).toLowerCase().substr(1);
|
|
84510
84517
|
if (!extension4) {
|
|
84511
84518
|
return false;
|
|
84512
84519
|
}
|
|
@@ -250375,6 +250382,22 @@ function optionalNumberArg(value2) {
|
|
|
250375
250382
|
const n2 = Number(value2);
|
|
250376
250383
|
return Number.isFinite(n2) ? n2 : void 0;
|
|
250377
250384
|
}
|
|
250385
|
+
/**
 * Coerce a loosely-typed tool argument into a boolean.
 *
 * Recognised inputs:
 *   - real booleans (returned unchanged);
 *   - the numbers 1 and 0;
 *   - the case-insensitive strings "1"/"true"/"yes"/"on" and
 *     "0"/"false"/"no"/"off", ignoring surrounding whitespace.
 *
 * @param {unknown} value2 - Raw argument value.
 * @param {*} fallback - Returned unchanged when value2 is not recognised.
 * @returns {*} The parsed boolean, or `fallback`.
 */
function booleanArg(value2, fallback) {
  if (typeof value2 === "boolean")
    return value2;
  // Numeric 1/0 mirror the "1"/"0" strings accepted below; JSON tool
  // arguments frequently carry flags as numbers rather than strings.
  if (value2 === 1)
    return true;
  if (value2 === 0)
    return false;
  if (typeof value2 === "string") {
    const text = value2.trim();
    if (/^(1|true|yes|on)$/i.test(text))
      return true;
    if (/^(0|false|no|off)$/i.test(text))
      return false;
  }
  return fallback;
}
|
|
250396
|
+
/**
 * Decide whether the fallback ladder may be used for this call.
 *
 * A truthy strict flag (`strict_model`, `strictModel` or `strict`) always
 * disables fallback. Otherwise the first defined of `fallback`,
 * `allow_fallback`, `allowFallback` is consulted, defaulting to true.
 *
 * @param {object} args - Raw tool arguments.
 * @returns {boolean} Whether fallback candidates may be tried.
 */
function generationFallbackEnabled(args) {
  const strictFlag = args["strict_model"] ?? args["strictModel"] ?? args["strict"];
  const strictRequested = booleanArg(strictFlag, false);
  if (strictRequested) {
    return false;
  }
  const fallbackFlag = args["fallback"] ?? args["allow_fallback"] ?? args["allowFallback"];
  return booleanArg(fallbackFlag, true);
}
|
|
250378
250401
|
/**
 * Report whether a value is one of the recognised backend identifiers:
 * "auto", "ollama", "diffusers" or "sdcpp".
 *
 * @param {unknown} value2 - Candidate backend name.
 * @returns {boolean}
 */
function isBackend(value2) {
  const knownBackends = ["auto", "ollama", "diffusers", "sdcpp"];
  return knownBackends.includes(value2);
}
|
|
@@ -250383,6 +250406,14 @@ function getImageGenerationPreset(model) {
|
|
|
250383
250406
|
return void 0;
|
|
250384
250407
|
return IMAGE_GENERATION_MODEL_PRESETS.find((preset) => preset.id === model);
|
|
250385
250408
|
}
|
|
250409
|
+
/**
 * Resolve the ids in IMAGE_GENERATION_QUALITY_LADDER to their presets,
 * preserving ladder order and dropping ids for which
 * getImageGenerationPreset returns nothing.
 *
 * @returns {object[]} Presets in ladder order.
 */
function imageGenerationQualityLadder() {
  const resolved = [];
  for (const id of IMAGE_GENERATION_QUALITY_LADDER) {
    const preset = getImageGenerationPreset(id);
    if (preset) {
      resolved.push(preset);
    }
  }
  return resolved;
}
|
|
250412
|
+
/**
 * List the presets that declare themselves a fallback for `model`, i.e.
 * whose `fallbackFor` array contains it, in preset-table order.
 *
 * @param {string|undefined} model - Model id to find alternates for.
 * @returns {object[]} Matching presets; empty when `model` is falsy.
 */
function imageGenerationFallbackAlternates(model) {
  const alternates = [];
  if (!model) {
    return alternates;
  }
  for (const preset of IMAGE_GENERATION_MODEL_PRESETS) {
    if (preset.fallbackFor?.includes(model)) {
      alternates.push(preset);
    }
  }
  return alternates;
}
|
|
250386
250417
|
function inferImageGenerationBackend(model, requested) {
|
|
250387
250418
|
if (requested && isBackend(requested))
|
|
250388
250419
|
return requested;
|
|
@@ -250399,6 +250430,45 @@ function inferImageGenerationBackend(model, requested) {
|
|
|
250399
250430
|
return "sdcpp";
|
|
250400
250431
|
return "diffusers";
|
|
250401
250432
|
}
|
|
250433
|
+
/**
 * Build a ladder candidate record for a model.
 *
 * The backend comes from inferImageGenerationBackend; if that still
 * yields "auto", "diffusers" is used instead.
 *
 * @param {string} model - Model id or path.
 * @param {string} [requestedBackend] - Backend requested by the caller.
 * @returns {{model: string, backend: string, preset: (object|undefined)}}
 */
function imageCandidateFor(model, requestedBackend) {
  const inferred = inferImageGenerationBackend(model, requestedBackend);
  const backend = inferred === "auto" ? "diffusers" : inferred;
  const preset = getImageGenerationPreset(model);
  return { model, backend, preset };
}
|
|
250443
|
+
/**
 * Build the ordered, de-duplicated list of candidates to attempt.
 *
 * Order:
 *   1. The explicitly requested model (with the requested backend), or,
 *      when only a concrete backend was requested, the first ladder preset
 *      for that backend (falling back to the backend's default model id).
 *   2. Only when fallback is allowed: the requested model's declared
 *      alternates (presets listing it in `fallbackFor`).
 *   3. Only when fallback is allowed: the quality ladder from the primary
 *      entry downwards (the whole ladder if the primary is not on it),
 *      each rung followed by its own alternates.
 *
 * When `allowFallback` is false exactly one candidate is returned, so a
 * strict request never silently substitutes another model.
 *
 * @param {string|undefined} requestedModel - Model id/path, or undefined for auto.
 * @param {string|undefined} requestedBackend - Requested backend, possibly "auto".
 * @param {boolean} [allowFallback=true] - Whether substitutes may be added.
 * @returns {Array<{model: string, backend: string, preset: (object|undefined)}>}
 */
function imageGenerationFallbackCandidates(requestedModel, requestedBackend, allowFallback = true) {
  const ladder = imageGenerationQualityLadder();
  const candidates = [];
  // Candidates are unique per backend+model pair.
  const seenKeys = new Set();
  const add2 = (candidate) => {
    const key = `${candidate.backend}:${candidate.model}`;
    if (seenKeys.has(key))
      return;
    seenKeys.add(key);
    candidates.push(candidate);
  };
  const addAlternatesOf = (modelId) => {
    for (const alternate of imageGenerationFallbackAlternates(modelId))
      add2(imageCandidateFor(alternate.id));
  };
  const hasConcreteBackend = Boolean(requestedBackend) && requestedBackend !== "auto";
  if (requestedModel) {
    add2(imageCandidateFor(requestedModel, requestedBackend));
    // Alternates are substitutes for the requested model, so they are only
    // permitted when the caller allows fallback.
    if (allowFallback)
      addAlternatesOf(requestedModel);
  } else if (hasConcreteBackend) {
    const firstForBackend = ladder.find((preset) => preset.backend === requestedBackend);
    const backendDefault = requestedBackend === "ollama" ? DEFAULT_OLLAMA_IMAGE_MODEL : DEFAULT_DIFFUSERS_IMAGE_MODEL;
    add2(imageCandidateFor(firstForBackend?.id ?? backendDefault, requestedBackend));
  }
  if (!allowFallback) {
    return candidates.length ? candidates : [imageCandidateFor(DEFAULT_DIFFUSERS_IMAGE_MODEL, requestedBackend)];
  }
  let primaryIndex = 0;
  if (requestedModel) {
    primaryIndex = ladder.findIndex((preset) => preset.id === requestedModel);
  } else if (hasConcreteBackend) {
    primaryIndex = ladder.findIndex((preset) => preset.backend === requestedBackend);
  }
  // A primary that is not on the ladder falls back through the whole ladder.
  const fallbackTail = primaryIndex >= 0 ? ladder.slice(primaryIndex) : ladder;
  for (const preset of fallbackTail) {
    add2(imageCandidateFor(preset.id));
    addAlternatesOf(preset.id);
  }
  return candidates;
}
|
|
250402
250472
|
/**
 * Path of the image-generation working directory under a repository root.
 *
 * @param {string} [repoRoot="."] - Repository root directory.
 * @returns {string} `<repoRoot>/.omnius/image-gen`, joined with the path module.
 */
function imageGenerationDir(repoRoot = ".") {
  const segments = [repoRoot, ".omnius", "image-gen"];
  return join36(...segments);
}
|
|
@@ -250653,6 +250723,33 @@ function formatSuccessOutput(args) {
|
|
|
250653
250723
|
` Prompt: "${prompt}"`
|
|
250654
250724
|
].filter(Boolean).join("\n");
|
|
250655
250725
|
}
|
|
250726
|
+
/**
 * Produce a short single-line reason string from a tool result.
 *
 * Uses `result.error`, else `result.output`, else "unknown error"; passes
 * the text through trimProcessText with a limit of 700, then collapses
 * every whitespace run to one space and trims the ends.
 *
 * @param {{error?: unknown, output?: unknown}} result - Tool result.
 * @returns {string}
 */
function summarizeToolResult(result) {
  const rawText = String(result.error || result.output || "unknown error");
  const clipped = trimProcessText(rawText, 700);
  const singleLine = clipped.replace(/\s+/g, " ");
  return singleLine.trim();
}
|
|
250729
|
+
/**
 * Format one ladder attempt as a numbered line:
 * `<index + 1>. <model> [<backend>] - <reason>`.
 *
 * @param {{model: string, backend: string}} candidate - Attempted candidate.
 * @param {string} reason - Why the attempt failed.
 * @param {number} index - Zero-based position of the attempt.
 * @returns {string}
 */
function formatImageAttempt(candidate, reason, index) {
  const ordinal = index + 1;
  const label = `${candidate.model} [${candidate.backend}]`;
  return `${ordinal}. ${label} - ${reason}`;
}
|
|
250732
|
+
/**
 * Build the multi-line message reported when every ladder candidate failed:
 * two header lines followed by one formatImageAttempt line per failure.
 *
 * @param {Array<{candidate: object, reason: string}>} failed - Failed attempts, in order.
 * @returns {string} Lines joined with "\n".
 */
function formatImageFallbackFailure(failed) {
  const lines = [
    "No image generation model in the fallback ladder completed successfully.",
    "Attempted, highest quality to lowest:"
  ];
  failed.forEach((attempt, index) => {
    lines.push(` ${formatImageAttempt(attempt.candidate, attempt.reason, index)}`);
  });
  return lines.join("\n");
}
|
|
250739
|
+
/**
 * Prefix a successful result with a report of the attempts that failed
 * before it.
 *
 * When nothing failed the original result object is returned untouched.
 * Otherwise a shallow copy is returned whose `output` starts with a summary
 * line, a "Failed attempts:" list and a newline, followed by the original
 * output.
 *
 * @param {object} result - Successful tool result.
 * @param {Array<{candidate: object, reason: string}>} failed - Earlier failed attempts.
 * @param {{model: string, backend: string}} winner - Candidate that succeeded.
 * @returns {object}
 */
function annotateImageFallbackSuccess(result, failed, winner) {
  if (failed.length === 0)
    return result;
  const prefix = [
    `Fallback ladder succeeded with ${winner.model} [${winner.backend}] after ${failed.length} failed attempt(s).`,
    "Failed attempts:",
    ...failed.map((attempt, index) => ` ${formatImageAttempt(attempt.candidate, attempt.reason, index)}`),
    ""
  ].join("\n");
  return {
    ...result,
    // A result without output must not append the literal text "undefined".
    output: prefix + (result.output ?? "")
  };
}
|
|
250656
250753
|
function parseRunnerJson(stdout) {
|
|
250657
250754
|
const lines = stdout.trim().split(/\r?\n/).reverse();
|
|
250658
250755
|
for (const line of lines) {
|
|
@@ -250665,7 +250762,7 @@ function parseRunnerJson(stdout) {
|
|
|
250665
250762
|
}
|
|
250666
250763
|
return null;
|
|
250667
250764
|
}
|
|
250668
|
-
var DEFAULT_DIFFUSERS_IMAGE_MODEL, DEFAULT_OLLAMA_IMAGE_MODEL, DIFFUSERS_PYTHON_PACKAGES, SDCPP_PYTHON_PACKAGES, IMAGE_GENERATION_MODEL_PRESETS, OLLAMA_IMAGE_MODELS, DIFFUSERS_RUNNER, SDCPP_RUNNER, ImageGenerateTool;
|
|
250765
|
+
var DEFAULT_DIFFUSERS_IMAGE_MODEL, DEFAULT_OLLAMA_IMAGE_MODEL, DIFFUSERS_PYTHON_PACKAGES, SDCPP_PYTHON_PACKAGES, IMAGE_GENERATION_MODEL_PRESETS, IMAGE_GENERATION_QUALITY_LADDER, OLLAMA_IMAGE_MODELS, DIFFUSERS_RUNNER, SDCPP_RUNNER, ImageGenerateTool;
|
|
250669
250766
|
var init_image_generate = __esm({
|
|
250670
250767
|
"packages/execution/dist/tools/image-generate.js"() {
|
|
250671
250768
|
"use strict";
|
|
@@ -250737,6 +250834,78 @@ var init_image_generate = __esm({
|
|
|
250737
250834
|
height: 1024,
|
|
250738
250835
|
note: "Primary serious-generation baseline for maximum photorealism."
|
|
250739
250836
|
},
|
|
250837
|
+
{
|
|
250838
|
+
id: "black-forest-labs/FLUX.1-dev-FP8",
|
|
250839
|
+
label: "FLUX.1 dev FP8",
|
|
250840
|
+
backend: "diffusers",
|
|
250841
|
+
install: 'python3 .omnius/image-gen/diffusers_text2image.py --model black-forest-labs/FLUX.1-dev-FP8 --steps 28 --guidance 3.5 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
|
|
250842
|
+
category: "Official FLUX fallback",
|
|
250843
|
+
sizeClass: "12B FLUX.1 dev FP8",
|
|
250844
|
+
quality: "Official lower-precision FLUX.1 dev route; best first fallback when full FLUX.1 dev is unavailable or too heavy.",
|
|
250845
|
+
minVramGB: 16,
|
|
250846
|
+
recommendedVramGB: 24,
|
|
250847
|
+
deployment: "Prefer this before third-party mirrors when loader support is available.",
|
|
250848
|
+
steps: 28,
|
|
250849
|
+
guidance: 3.5,
|
|
250850
|
+
width: 1024,
|
|
250851
|
+
height: 1024,
|
|
250852
|
+
fallbackFor: ["black-forest-labs/FLUX.1-dev"],
|
|
250853
|
+
note: "Official BFL FP8 fallback for FLUX.1 dev."
|
|
250854
|
+
},
|
|
250855
|
+
{
|
|
250856
|
+
id: "black-forest-labs/FLUX.1-Krea-dev",
|
|
250857
|
+
label: "FLUX.1 Krea dev",
|
|
250858
|
+
backend: "diffusers",
|
|
250859
|
+
install: 'python3 .omnius/image-gen/diffusers_text2image.py --model black-forest-labs/FLUX.1-Krea-dev --steps 28 --guidance 3.5 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
|
|
250860
|
+
category: "Official FLUX fallback",
|
|
250861
|
+
sizeClass: "12B FLUX.1 dev-family",
|
|
250862
|
+
quality: "Official FLUX.1 dev-family aesthetic variant; useful when the base dev repo is unavailable and the requested task tolerates an opinionated realism bias.",
|
|
250863
|
+
minVramGB: 24,
|
|
250864
|
+
recommendedVramGB: 48,
|
|
250865
|
+
deployment: "Heavy Diffusers/ComfyUI route with FLUX.1 dev-family license considerations.",
|
|
250866
|
+
steps: 28,
|
|
250867
|
+
guidance: 3.5,
|
|
250868
|
+
width: 1024,
|
|
250869
|
+
height: 1024,
|
|
250870
|
+
fallbackFor: ["black-forest-labs/FLUX.1-dev"],
|
|
250871
|
+
note: "Official aesthetic FLUX.1 fallback."
|
|
250872
|
+
},
|
|
250873
|
+
{
|
|
250874
|
+
id: "lllyasviel/flux1-dev-bnb-nf4",
|
|
250875
|
+
label: "FLUX.1 dev BNB NF4",
|
|
250876
|
+
backend: "diffusers",
|
|
250877
|
+
install: 'python3 .omnius/image-gen/diffusers_text2image.py --model lllyasviel/flux1-dev-bnb-nf4 --steps 28 --guidance 3.5 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
|
|
250878
|
+
category: "Traceable FLUX fallback",
|
|
250879
|
+
sizeClass: "12B FLUX.1 dev NF4",
|
|
250880
|
+
quality: "Lower-memory community quantization; useful after official BFL sources, with some possible quality loss and loader brittleness.",
|
|
250881
|
+
minVramGB: 12,
|
|
250882
|
+
recommendedVramGB: 16,
|
|
250883
|
+
deployment: "Best with BNB-aware Diffusers/Forge-style runtimes. Falls through cleanly if the current runner cannot load it.",
|
|
250884
|
+
steps: 28,
|
|
250885
|
+
guidance: 3.5,
|
|
250886
|
+
width: 1024,
|
|
250887
|
+
height: 1024,
|
|
250888
|
+
fallbackFor: ["black-forest-labs/FLUX.1-dev", "black-forest-labs/FLUX.1-dev-FP8"],
|
|
250889
|
+
note: "Traceable low-VRAM NF4 fallback for FLUX.1 dev."
|
|
250890
|
+
},
|
|
250891
|
+
{
|
|
250892
|
+
id: "ChuckMcSneed/FLUX.1-dev",
|
|
250893
|
+
label: "FLUX.1 dev mirror",
|
|
250894
|
+
backend: "diffusers",
|
|
250895
|
+
install: 'python3 .omnius/image-gen/diffusers_text2image.py --model ChuckMcSneed/FLUX.1-dev --steps 28 --guidance 3.5 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
|
|
250896
|
+
category: "Traceable FLUX fallback",
|
|
250897
|
+
sizeClass: "12B FLUX.1 dev mirror",
|
|
250898
|
+
quality: "Lower-priority mirror fallback for FLUX.1 dev. Use only after official and reputable quantized options fail.",
|
|
250899
|
+
minVramGB: 24,
|
|
250900
|
+
recommendedVramGB: 48,
|
|
250901
|
+
deployment: "Treat as lower-trust than official BFL and well-known quantized conversions; verify provenance and license before relying on it.",
|
|
250902
|
+
steps: 28,
|
|
250903
|
+
guidance: 3.5,
|
|
250904
|
+
width: 1024,
|
|
250905
|
+
height: 1024,
|
|
250906
|
+
fallbackFor: ["black-forest-labs/FLUX.1-dev", "black-forest-labs/FLUX.1-dev-FP8"],
|
|
250907
|
+
note: "Traceable mirror fallback for FLUX.1 dev."
|
|
250908
|
+
},
|
|
250740
250909
|
{
|
|
250741
250910
|
id: "stabilityai/stable-diffusion-3.5-large",
|
|
250742
250911
|
label: "Stable Diffusion 3.5 Large",
|
|
@@ -250837,6 +251006,40 @@ var init_image_generate = __esm({
|
|
|
250837
251006
|
height: 1024,
|
|
250838
251007
|
note: "More deployable compact FLUX-family model."
|
|
250839
251008
|
},
|
|
251009
|
+
{
|
|
251010
|
+
id: "black-forest-labs/FLUX.2-klein-4b-fp8",
|
|
251011
|
+
label: "FLUX.2 Klein 4B FP8",
|
|
251012
|
+
backend: "diffusers",
|
|
251013
|
+
install: 'python3 .omnius/image-gen/diffusers_text2image.py --model black-forest-labs/FLUX.2-klein-4b-fp8 --steps 8 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
|
|
251014
|
+
category: "Official FLUX fallback",
|
|
251015
|
+
sizeClass: "4B compact FLUX-family FP8",
|
|
251016
|
+
quality: "Official lower-precision FLUX.2 Klein route with better deployment fit than full-precision 4B.",
|
|
251017
|
+
minVramGB: 8,
|
|
251018
|
+
recommendedVramGB: 12,
|
|
251019
|
+
deployment: "Preferred lower-memory official FLUX.2 fallback when compatible with the current loader.",
|
|
251020
|
+
steps: 8,
|
|
251021
|
+
width: 1024,
|
|
251022
|
+
height: 1024,
|
|
251023
|
+
fallbackFor: ["black-forest-labs/FLUX.2-klein-4B", "x/flux2-klein"],
|
|
251024
|
+
note: "Official FP8 fallback for FLUX.2 Klein."
|
|
251025
|
+
},
|
|
251026
|
+
{
|
|
251027
|
+
id: "black-forest-labs/FLUX.2-klein-4b-nvfp4",
|
|
251028
|
+
label: "FLUX.2 Klein 4B NVFP4",
|
|
251029
|
+
backend: "diffusers",
|
|
251030
|
+
install: 'python3 .omnius/image-gen/diffusers_text2image.py --model black-forest-labs/FLUX.2-klein-4b-nvfp4 --steps 8 --width 1024 --height 1024 --prompt "..." --output .omnius/images/out.png',
|
|
251031
|
+
category: "Official FLUX fallback",
|
|
251032
|
+
sizeClass: "4B compact FLUX-family NVFP4",
|
|
251033
|
+
quality: "Official NVIDIA-oriented low-precision FLUX.2 Klein fallback.",
|
|
251034
|
+
minVramGB: 8,
|
|
251035
|
+
recommendedVramGB: 12,
|
|
251036
|
+
deployment: "Use when the runtime/GPU supports the NVFP4 path; otherwise the fallback ladder continues.",
|
|
251037
|
+
steps: 8,
|
|
251038
|
+
width: 1024,
|
|
251039
|
+
height: 1024,
|
|
251040
|
+
fallbackFor: ["black-forest-labs/FLUX.2-klein-4B", "x/flux2-klein", "black-forest-labs/FLUX.2-klein-4b-fp8"],
|
|
251041
|
+
note: "Official NVFP4 fallback for FLUX.2 Klein."
|
|
251042
|
+
},
|
|
250840
251043
|
{
|
|
250841
251044
|
id: "deepseek-ai/Janus-Pro-7B",
|
|
250842
251045
|
label: "Janus-Pro-7B",
|
|
@@ -250989,6 +251192,21 @@ var init_image_generate = __esm({
|
|
|
250989
251192
|
note: "CPU/GGUF/checkpoint route; requires a local model path."
|
|
250990
251193
|
}
|
|
250991
251194
|
];
|
|
251195
|
+
IMAGE_GENERATION_QUALITY_LADDER = [
|
|
251196
|
+
"black-forest-labs/FLUX.1-dev",
|
|
251197
|
+
"stabilityai/stable-diffusion-3.5-large",
|
|
251198
|
+
DEFAULT_OLLAMA_IMAGE_MODEL,
|
|
251199
|
+
"black-forest-labs/FLUX.1-schnell",
|
|
251200
|
+
"stabilityai/stable-diffusion-3.5-large-turbo",
|
|
251201
|
+
"Tongyi-MAI/Z-Image-Turbo",
|
|
251202
|
+
"black-forest-labs/FLUX.2-klein-4B",
|
|
251203
|
+
DEFAULT_DIFFUSERS_IMAGE_MODEL,
|
|
251204
|
+
"Efficient-Large-Model/Sana_Sprint_0.6B_1024px_diffusers",
|
|
251205
|
+
"SimianLuo/LCM_Dreamshaper_v7",
|
|
251206
|
+
"stabilityai/sd-turbo",
|
|
251207
|
+
"segmind/tiny-sd",
|
|
251208
|
+
"nota-ai/bk-sdm-tiny-2m"
|
|
251209
|
+
];
|
|
250992
251210
|
OLLAMA_IMAGE_MODELS = IMAGE_GENERATION_MODEL_PRESETS.filter((preset) => preset.backend === "ollama").map((preset) => preset.id);
|
|
250993
251211
|
DIFFUSERS_RUNNER = String.raw`#!/usr/bin/env python3
|
|
250994
251212
|
import argparse
|
|
@@ -251170,7 +251388,7 @@ if __name__ == "__main__":
|
|
|
251170
251388
|
`;
|
|
251171
251389
|
ImageGenerateTool = class {
|
|
251172
251390
|
name = "generate_image";
|
|
251173
|
-
description = "Generate an image from a text prompt using a local image-generation backend. Supports Ollama image models (x/z-image-turbo, x/flux2-klein), Python Diffusers models (SDXL Turbo default, FLUX.1 dev, SD3.5 Large, Tiny-SD, LCM, Sana Sprint), and stable-diffusion.cpp local checkpoints/GGUF. Saves a PNG under .omnius/images and returns the file path.";
|
|
251391
|
+
description = "Generate an image from a text prompt using a local image-generation backend. Supports Ollama image models (x/z-image-turbo, x/flux2-klein), Python Diffusers models (SDXL Turbo default, FLUX.1 dev, SD3.5 Large, Tiny-SD, LCM, Sana Sprint), and stable-diffusion.cpp local checkpoints/GGUF. When fallback is enabled, auto generation tries ranked high-quality candidates first, including official/traceable FLUX fallbacks for Black Forest Labs models, and then falls back to smaller models if setup, download, or generation fails. Saves a PNG under .omnius/images and returns the file path.";
|
|
251174
251392
|
parameters = {
|
|
251175
251393
|
type: "object",
|
|
251176
251394
|
properties: {
|
|
@@ -251215,6 +251433,14 @@ if __name__ == "__main__":
|
|
|
251215
251433
|
type: "string",
|
|
251216
251434
|
enum: ["generate", "list_models", "setup"],
|
|
251217
251435
|
description: "Optional utility action. Default is generate."
|
|
251436
|
+
},
|
|
251437
|
+
fallback: {
|
|
251438
|
+
type: "boolean",
|
|
251439
|
+
description: "Whether to try the ranked quality ladder if the selected model/backend fails. Defaults true."
|
|
251440
|
+
},
|
|
251441
|
+
strict_model: {
|
|
251442
|
+
type: "boolean",
|
|
251443
|
+
description: "When true, use only the requested model/backend and do not fall back. Defaults false."
|
|
251218
251444
|
}
|
|
251219
251445
|
},
|
|
251220
251446
|
required: ["prompt"]
|
|
@@ -251257,7 +251483,7 @@ if __name__ == "__main__":
|
|
|
251257
251483
|
if (action === "list_models") {
|
|
251258
251484
|
return {
|
|
251259
251485
|
success: true,
|
|
251260
|
-
output: IMAGE_GENERATION_MODEL_PRESETS.map((
|
|
251486
|
+
output: IMAGE_GENERATION_MODEL_PRESETS.map((preset) => `${preset.id} [${preset.backend}] - ${preset.note}`).join("\n"),
|
|
251261
251487
|
durationMs: performance.now() - start2
|
|
251262
251488
|
};
|
|
251263
251489
|
}
|
|
@@ -251281,19 +251507,8 @@ if __name__ == "__main__":
|
|
|
251281
251507
|
const rawModel2 = args["model_path"] ? String(args["model_path"]) : args["model"] ? String(args["model"]) : this.defaultModel;
|
|
251282
251508
|
const requestedModel2 = rawModel2 === "auto" ? void 0 : rawModel2;
|
|
251283
251509
|
const requestedBackend2 = args["backend"] ? String(args["backend"]) : this.defaultBackend;
|
|
251284
|
-
|
|
251285
|
-
|
|
251286
|
-
backend = inferImageGenerationBackend(requestedModel2, void 0);
|
|
251287
|
-
if (backend === "auto")
|
|
251288
|
-
backend = "diffusers";
|
|
251289
|
-
}
|
|
251290
|
-
const model = requestedModel2 ?? (backend === "diffusers" ? DEFAULT_DIFFUSERS_IMAGE_MODEL : DEFAULT_OLLAMA_IMAGE_MODEL);
|
|
251291
|
-
this.emitProgress({ stage: "setup", message: `Preparing image model ${model} (${backend})` });
|
|
251292
|
-
if (backend === "ollama")
|
|
251293
|
-
return await this.prewarmOllama({ model, start: start2 });
|
|
251294
|
-
if (backend === "sdcpp")
|
|
251295
|
-
return await this.prewarmSdCpp({ model, start: start2, python: args["python"] });
|
|
251296
|
-
return await this.prewarmDiffusers({ model, start: start2, python: args["python"] });
|
|
251510
|
+
const candidates2 = imageGenerationFallbackCandidates(requestedModel2, requestedBackend2, generationFallbackEnabled(args));
|
|
251511
|
+
return await this.prewarmCandidateLadder({ candidates: candidates2, args, start: start2 });
|
|
251297
251512
|
}
|
|
251298
251513
|
const prompt = String(args["prompt"] ?? "").trim();
|
|
251299
251514
|
if (!prompt) {
|
|
@@ -251302,31 +251517,10 @@ if __name__ == "__main__":
|
|
|
251302
251517
|
const rawModel = args["model_path"] ? String(args["model_path"]) : args["model"] ? String(args["model"]) : this.defaultModel;
|
|
251303
251518
|
const requestedModel = rawModel === "auto" ? void 0 : rawModel;
|
|
251304
251519
|
const requestedBackend = args["backend"] ? String(args["backend"]) : this.defaultBackend;
|
|
251305
|
-
const preset = getImageGenerationPreset(requestedModel);
|
|
251306
|
-
const width = numberArg(args["width"], preset?.width ?? 1024);
|
|
251307
|
-
const height = numberArg(args["height"], preset?.height ?? 1024);
|
|
251308
|
-
const steps = optionalNumberArg(args["steps"]) ?? preset?.steps;
|
|
251309
|
-
const guidance = optionalNumberArg(args["guidance"]) ?? preset?.guidance;
|
|
251310
251520
|
const seed = optionalNumberArg(args["seed"]);
|
|
251521
|
+
const candidates = imageGenerationFallbackCandidates(requestedModel, requestedBackend, generationFallbackEnabled(args));
|
|
251311
251522
|
try {
|
|
251312
|
-
|
|
251313
|
-
let model = requestedModel;
|
|
251314
|
-
if (backend === "auto") {
|
|
251315
|
-
backend = inferImageGenerationBackend(model, void 0);
|
|
251316
|
-
if (backend === "auto")
|
|
251317
|
-
backend = "diffusers";
|
|
251318
|
-
}
|
|
251319
|
-
if (!model) {
|
|
251320
|
-
model = backend === "diffusers" ? DEFAULT_DIFFUSERS_IMAGE_MODEL : DEFAULT_OLLAMA_IMAGE_MODEL;
|
|
251321
|
-
}
|
|
251322
|
-
this.emitProgress({ stage: "setup", message: `Using image model ${model} (${backend})` });
|
|
251323
|
-
if (backend === "ollama") {
|
|
251324
|
-
return await this.generateWithOllama({ prompt, model, width, height, steps, start: start2 });
|
|
251325
|
-
}
|
|
251326
|
-
if (backend === "sdcpp") {
|
|
251327
|
-
return await this.generateWithSdCpp({ prompt, model, width, height, steps, seed, start: start2, python: args["python"] });
|
|
251328
|
-
}
|
|
251329
|
-
return await this.generateWithDiffusers({ prompt, model, width, height, steps, guidance, seed, start: start2, python: args["python"] });
|
|
251523
|
+
return await this.generateCandidateLadder({ candidates, prompt, args, seed, start: start2 });
|
|
251330
251524
|
} catch (err) {
|
|
251331
251525
|
return {
|
|
251332
251526
|
success: false,
|
|
@@ -251335,6 +251529,64 @@ if __name__ == "__main__":
|
|
|
251335
251529
|
};
|
|
251336
251530
|
}
|
|
251337
251531
|
}
|
|
251532
|
+
async prewarmCandidateLadder(args) {
|
|
251533
|
+
const failed = [];
|
|
251534
|
+
for (let index = 0; index < args.candidates.length; index++) {
|
|
251535
|
+
const candidate = args.candidates[index];
|
|
251536
|
+
this.emitProgress({
|
|
251537
|
+
stage: "setup",
|
|
251538
|
+
message: `Preparing image model ${candidate.model} (${candidate.backend}) [${index + 1}/${args.candidates.length}]`
|
|
251539
|
+
});
|
|
251540
|
+
const result = candidate.backend === "ollama" ? await this.prewarmOllama({ model: candidate.model, start: args.start }) : candidate.backend === "sdcpp" ? await this.prewarmSdCpp({ model: candidate.model, start: args.start, python: args.args["python"] }) : await this.prewarmDiffusers({ model: candidate.model, start: args.start, python: args.args["python"] });
|
|
251541
|
+
if (result.success)
|
|
251542
|
+
return annotateImageFallbackSuccess(result, failed, candidate);
|
|
251543
|
+
failed.push({ candidate, reason: summarizeToolResult(result) });
|
|
251544
|
+
if (index < args.candidates.length - 1) {
|
|
251545
|
+
this.emitProgress({
|
|
251546
|
+
stage: "setup",
|
|
251547
|
+
message: `${candidate.model} failed; trying ${args.candidates[index + 1].model}`
|
|
251548
|
+
});
|
|
251549
|
+
}
|
|
251550
|
+
}
|
|
251551
|
+
const output = formatImageFallbackFailure(failed);
|
|
251552
|
+
return {
|
|
251553
|
+
success: false,
|
|
251554
|
+
output,
|
|
251555
|
+
error: output,
|
|
251556
|
+
durationMs: performance.now() - args.start
|
|
251557
|
+
};
|
|
251558
|
+
}
|
|
251559
|
+
async generateCandidateLadder(args) {
|
|
251560
|
+
const failed = [];
|
|
251561
|
+
for (let index = 0; index < args.candidates.length; index++) {
|
|
251562
|
+
const candidate = args.candidates[index];
|
|
251563
|
+
const width = numberArg(args.args["width"], candidate.preset?.width ?? 1024);
|
|
251564
|
+
const height = numberArg(args.args["height"], candidate.preset?.height ?? 1024);
|
|
251565
|
+
const steps = optionalNumberArg(args.args["steps"]) ?? candidate.preset?.steps;
|
|
251566
|
+
const guidance = optionalNumberArg(args.args["guidance"]) ?? candidate.preset?.guidance;
|
|
251567
|
+
this.emitProgress({
|
|
251568
|
+
stage: "setup",
|
|
251569
|
+
message: `Using image model ${candidate.model} (${candidate.backend}) [${index + 1}/${args.candidates.length}]`
|
|
251570
|
+
});
|
|
251571
|
+
const result = candidate.backend === "ollama" ? await this.generateWithOllama({ prompt: args.prompt, model: candidate.model, width, height, steps, start: args.start }) : candidate.backend === "sdcpp" ? await this.generateWithSdCpp({ prompt: args.prompt, model: candidate.model, width, height, steps, seed: args.seed, start: args.start, python: args.args["python"] }) : await this.generateWithDiffusers({ prompt: args.prompt, model: candidate.model, width, height, steps, guidance, seed: args.seed, start: args.start, python: args.args["python"] });
|
|
251572
|
+
if (result.success)
|
|
251573
|
+
return annotateImageFallbackSuccess(result, failed, candidate);
|
|
251574
|
+
failed.push({ candidate, reason: summarizeToolResult(result) });
|
|
251575
|
+
if (index < args.candidates.length - 1) {
|
|
251576
|
+
this.emitProgress({
|
|
251577
|
+
stage: "setup",
|
|
251578
|
+
message: `${candidate.model} failed; falling back to ${args.candidates[index + 1].model}`
|
|
251579
|
+
});
|
|
251580
|
+
}
|
|
251581
|
+
}
|
|
251582
|
+
const output = formatImageFallbackFailure(failed);
|
|
251583
|
+
return {
|
|
251584
|
+
success: false,
|
|
251585
|
+
output,
|
|
251586
|
+
error: output,
|
|
251587
|
+
durationMs: performance.now() - args.start
|
|
251588
|
+
};
|
|
251589
|
+
}
|
|
251338
251590
|
async prewarmOllama(args) {
|
|
251339
251591
|
const model = args.model || DEFAULT_OLLAMA_IMAGE_MODEL;
|
|
251340
251592
|
if (await this.ollamaHasModel(model)) {
|
|
@@ -251800,7 +252052,7 @@ ${errText.slice(0, 800)}`,
|
|
|
251800
252052
|
});
|
|
251801
252053
|
|
|
251802
252054
|
// packages/execution/dist/tools/audio-generate.js
|
|
251803
|
-
import { spawn as spawn10 } from "node:child_process";
|
|
252055
|
+
import { execFileSync as execFileSync2, spawn as spawn10 } from "node:child_process";
|
|
251804
252056
|
import { existsSync as existsSync24, readdirSync as readdirSync10, statSync as statSync9 } from "node:fs";
|
|
251805
252057
|
import { chmod as chmod4, mkdir as mkdir12, writeFile as writeFile17 } from "node:fs/promises";
|
|
251806
252058
|
import { join as join37 } from "node:path";
|
|
@@ -251824,13 +252076,63 @@ function backendPackages(backend) {
|
|
|
251824
252076
|
return TANGOFLUX_PACKAGES;
|
|
251825
252077
|
return DIFFUSERS_AUDIO_PACKAGES;
|
|
251826
252078
|
}
|
|
252079
|
+
/**
 * Query `nvidia-smi` for the compute capability and name of the first GPU
 * it lists.
 *
 * Despite the name, this does not itself decide whether the GPU is legacy;
 * it only reports what was found.
 *
 * @returns {{major: number, minor: number, name: (string|undefined)}|null}
 *   Parsed capability, or null when nvidia-smi cannot be run, times out
 *   (5 s), or prints nothing in the expected `<major>.<minor>, <name>` form.
 */
function detectLegacyCudaComputeCapability() {
  const query = ["--query-gpu=compute_cap,name", "--format=csv,noheader,nounits"];
  const options = {
    encoding: "utf8",
    timeout: 5e3,
    stdio: ["ignore", "pipe", "ignore"]
  };
  try {
    const report = execFileSync2("nvidia-smi", query, options).trim();
    // Only the first non-blank row is considered.
    let firstRow;
    for (const row of report.split(/\r?\n/)) {
      const trimmedRow = row.trim();
      if (trimmedRow) {
        firstRow = trimmedRow;
        break;
      }
    }
    const parsed = firstRow?.match(/^(\d+)\.(\d+)\s*,?\s*(.*)$/);
    if (!parsed) {
      return null;
    }
    const [, majorText, minorText, gpuName] = parsed;
    const major = Number(majorText);
    const minor = Number(minorText);
    if (!(Number.isFinite(major) && Number.isFinite(minor))) {
      return null;
    }
    return { major, minor, name: gpuName?.trim() || void 0 };
  } catch {
    // Best-effort probe: any failure is reported as "nothing detected".
    return null;
  }
}
|
|
252099
|
+
/**
 * Tell whether a CUDA compute capability is older than SM 7.5.
 *
 * @param {number} major - Major part of the compute capability.
 * @param {number} minor - Minor part of the compute capability.
 * @returns {boolean} `true` for anything strictly below 7.5.
 */
function isLegacyCudaCapability(major, minor) {
  if (major < 7) {
    return true;
  }
  return major === 7 && minor < 5;
}
|
|
252102
|
+
/**
 * Decide which pip requirement arguments to use when (re)installing PyTorch.
 *
 * Precedence:
 *   1. `OMNIUS_AUDIO_TORCH_INDEX_URL`, when set to a non-blank value.
 *   2. `forceLegacyCuda`, which pins PyTorch 2.3.1 from the cu118 index.
 *   3. On linux/x64, the same pin when `nvidia-smi` reports a GPU below SM 7.5.
 *   4. Otherwise the default `torch` / `torchaudio` wheels.
 *
 * The override is trimmed first, so a whitespace-only value counts as unset
 * and stray padding or a trailing newline never reaches pip's `--index-url`.
 *
 * @param {boolean} [forceLegacyCuda=false] - Skip GPU detection and use the cu118 pin.
 * @returns {{ args: string[], description: string }} pip requirement arguments
 *   plus a human-readable reason for the choice.
 */
function torchInstallPlan(forceLegacyCuda = false) {
  const legacyReason = "using PyTorch 2.3.1 cu118 to avoid cuDNN 9 incompatibility";
  // A fresh array per call so callers can never mutate a shared plan.
  const legacyArgs = () => ["torch==2.3.1", "torchaudio==2.3.1", "--index-url", "https://download.pytorch.org/whl/cu118"];

  const indexOverride = process.env["OMNIUS_AUDIO_TORCH_INDEX_URL"]?.trim();
  if (indexOverride) {
    return {
      args: ["torch", "torchaudio", "--index-url", indexOverride],
      description: `env override ${indexOverride}`
    };
  }
  if (forceLegacyCuda) {
    return {
      args: legacyArgs(),
      description: `runtime-detected legacy CUDA GPU; ${legacyReason}`
    };
  }
  if (process.platform === "linux" && process.arch === "x64") {
    const gpu = detectLegacyCudaComputeCapability();
    if (gpu && isLegacyCudaCapability(gpu.major, gpu.minor)) {
      return {
        args: legacyArgs(),
        description: `CUDA legacy GPU SM ${gpu.major}.${gpu.minor}${gpu.name ? ` ${gpu.name}` : ""}; ${legacyReason}`
      };
    }
  }
  return { args: ["torch", "torchaudio"], description: "default PyTorch wheel selection" };
}
|
|
252126
|
+
/**
 * Return a copy of a pip requirement list without the bare `torch` and
 * `torchaudio` entries. Only exact names are removed; every other entry,
 * including pinned or differently spelled ones, is kept.
 *
 * @param {string[]} packages - Requirement strings for a backend.
 * @returns {string[]} New array without the exact `torch` / `torchaudio` entries.
 */
function withoutTorchPackages(packages) {
  const torchOwned = new Set(["torch", "torchaudio"]);
  return packages.filter((requirement) => !torchOwned.has(requirement));
}
|
|
251827
252129
|
function backendImportCheck(backend) {
|
|
251828
252130
|
if (backend === "transformers")
|
|
251829
252131
|
return "import torch, torchaudio, transformers, scipy\nfrom transformers import AutoProcessor, MusicgenForConditionalGeneration\n";
|
|
251830
252132
|
if (backend === "audiocraft")
|
|
251831
252133
|
return "import torch, torchaudio, audiocraft\nfrom audiocraft.models import MusicGen, AudioGen\n";
|
|
251832
252134
|
if (backend === "stable-audio")
|
|
251833
|
-
return "import torch, torchaudio,
|
|
252135
|
+
return "import torch, torchaudio, diffusers, scipy\nfrom diffusers import StableAudioPipeline\n";
|
|
251834
252136
|
if (backend === "tangoflux")
|
|
251835
252137
|
return "import torch, torchaudio\nfrom tangoflux import TangoFluxInference\n";
|
|
251836
252138
|
return "import torch, diffusers, scipy\nfrom diffusers import AudioLDMPipeline\n";
|
|
@@ -252022,6 +252324,69 @@ async function pythonCanImport2(command, code8, repoRoot, env2) {
|
|
|
252022
252324
|
/**
 * Run a Python snippet with `-c` and hand back the raw process result.
 *
 * @param {string} command - Python executable to invoke.
 * @param {string} code8 - Python source passed to `-c`.
 * @param {string} repoRoot - Working directory for the child process.
 * @param {object} env2 - Environment passed through to the process runner.
 * @returns {Promise<object>} Whatever `runProcess3` resolves with.
 */
async function pythonImportResult(command, code8, repoRoot, env2) {
  const argv = ["-c", code8];
  const runOptions = { cwd: repoRoot, timeoutMs: 6e4, env: env2 };
  const outcome = await runProcess3(command, argv, runOptions);
  return outcome;
}
|
|
252327
|
+
/**
 * Probe the installed PyTorch runtime from inside the given Python interpreter.
 *
 * The probe prints a JSON payload (torch version, CUDA availability and, when
 * CUDA is available, device capability / cuDNN version / device name) and
 * exits with:
 *   - 0  when torch imports and no incompatibility is detected,
 *   - 42 when cuDNN >= 9 is paired with a GPU below SM 7.5,
 *   - any other non-zero code when the snippet itself fails (e.g. torch missing).
 *
 * @param {string} command - Python executable to invoke.
 * @param {string} repoRoot - Working directory for the child process.
 * @param {object} env2 - Environment passed through to the process runner.
 * @returns {Promise<object>} Whatever `runProcess3` resolves with.
 */
async function torchRuntimeCompatibilityResult(command, repoRoot, env2) {
  // Python is indentation-sensitive: the body of the nested `if` must sit
  // deeper than the `if` itself, so each level is spelled out explicitly.
  const code8 = [
    "import json, sys",
    "import torch",
    "payload={'torch': getattr(torch, '__version__', '?'), 'cuda_available': bool(torch.cuda.is_available())}",
    "if torch.cuda.is_available():",
    "    cap=torch.cuda.get_device_capability(0)",
    "    cudnn=torch.backends.cudnn.version() or 0",
    "    payload.update({'capability': list(cap), 'cudnn': int(cudnn), 'device': torch.cuda.get_device_name(0)})",
    "    if int(cudnn) >= 90000 and tuple(cap) < (7, 5):",
    "        print(json.dumps(payload))",
    "        raise SystemExit(42)",
    "print(json.dumps(payload))"
  ].join("\n");
  return await runProcess3(command, ["-c", code8], { cwd: repoRoot, timeoutMs: 6e4, env: env2 });
}
|
|
252343
|
+
/**
 * Force-reinstall PyTorch into the given interpreter using the plan chosen by
 * `torchInstallPlan`.
 *
 * @param {string} command - Python executable whose environment is repaired.
 * @param {string} repoRoot - Working directory for pip.
 * @param {object} env2 - Environment passed through to the process runner.
 * @param {boolean} [forceLegacyCuda=false] - Forwarded to `torchInstallPlan`.
 * @param {Function} [onProgress] - Optional progress callback; receives a
 *   `{ stage, message }` event before pip starts and is also handed to the runner.
 * @returns {Promise<void>}
 * @throws {Error} When pip exits with a non-zero code.
 */
async function repairTorchRuntime(command, repoRoot, env2, forceLegacyCuda = false, onProgress) {
  const plan = torchInstallPlan(forceLegacyCuda);
  if (onProgress !== undefined && onProgress !== null) {
    onProgress({ stage: "setup", message: `Installing PyTorch runtime: ${plan.description}` });
  }
  const pipFlags = ["--progress-bar", "on", "--prefer-binary", "--force-reinstall"];
  const pipArgv = ["-m", "pip", "install", ...pipFlags, ...plan.args];
  const pipOptions = {
    cwd: repoRoot,
    timeoutMs: 18e5, // 30 minutes
    env: env2,
    progressLabel: `Installing PyTorch runtime (${plan.description})`,
    onProgress
  };
  const result = await runProcess3(command, pipArgv, pipOptions);
  if (result.code === 0) {
    return;
  }
  const details = trimProcessText2(result.stderr || result.stdout);
  throw new Error(`Failed to install compatible PyTorch runtime (${plan.description}).\n${details}`);
}
|
|
252367
|
+
/**
 * Make sure the interpreter has a PyTorch runtime that passes the
 * compatibility probe, repairing it when necessary.
 *
 * Flow:
 *   1. Probe. Exit code 0 means nothing to do.
 *   2. Repair once: with the forced cu118 plan when the probe reported the
 *      cuDNN/legacy-GPU conflict (exit code 42), otherwise with the default plan.
 *   3. Probe again. If the conflict shows up only now, run the forced cu118
 *      repair and probe one last time.
 *
 * The forced cu118 repair runs at most once per call. Repeating the identical
 * `--force-reinstall` would reinstall the same wheels and reach the same probe
 * result, at the cost of another long pip run.
 *
 * @param {string} command - Python executable to check and repair.
 * @param {string} repoRoot - Working directory for child processes.
 * @param {object} env2 - Environment passed through to the process runner.
 * @param {Function} [onProgress] - Optional progress callback forwarded to the repair step.
 * @returns {Promise<void>} Resolves once a probe exits with code 0.
 * @throws {Error} When the runtime is still unusable after the repair attempts,
 *   or when a repair step itself throws.
 */
async function ensureCompatibleTorchRuntime(command, repoRoot, env2, onProgress) {
  const LEGACY_CONFLICT_EXIT = 42;
  const probe = () => torchRuntimeCompatibilityResult(command, repoRoot, env2);

  const existing = await probe();
  if (existing.code === 0)
    return;

  const legacyRepairDone = existing.code === LEGACY_CONFLICT_EXIT;
  await repairTorchRuntime(command, repoRoot, env2, legacyRepairDone, onProgress);

  let latest = await probe();
  if (latest.code === 0)
    return;
  if (latest.code !== LEGACY_CONFLICT_EXIT) {
    throw new Error(`Audio-generation PyTorch runtime could not be prepared.
${trimProcessText2(latest.stderr || latest.stdout)}`);
  }
  if (!legacyRepairDone) {
    // The default wheels turned out to carry the conflict; switch to the cu118 pin.
    await repairTorchRuntime(command, repoRoot, env2, true, onProgress);
    latest = await probe();
    if (latest.code === 0)
      return;
  }
  throw new Error(`Audio-generation PyTorch runtime remains incompatible after cu118 repair.
${trimProcessText2(latest.stderr || latest.stdout)}`);
}
|
|
252025
252390
|
function formatAudioSetupFailure(backend, text) {
|
|
252026
252391
|
const body = trimProcessText2(text);
|
|
252027
252392
|
const lowered = text.toLowerCase();
|
|
@@ -252032,6 +252397,9 @@ function formatAudioSetupFailure(backend, text) {
|
|
|
252032
252397
|
if (lowered.includes("cuda") && lowered.includes("not available")) {
|
|
252033
252398
|
notes2.push("CUDA was not available to the selected Python environment; install a Torch build matching this machine's CUDA runtime or use CPU-compatible settings.");
|
|
252034
252399
|
}
|
|
252400
|
+
if (lowered.includes("cudnn version") && lowered.includes("sm < 7.5")) {
|
|
252401
|
+
notes2.push("The installed PyTorch wheel uses cuDNN 9 on a legacy CUDA GPU. Omnius now repairs audio-generation venvs by reinstalling PyTorch 2.3.1 from the cu118 index for SM < 7.5 hardware.");
|
|
252402
|
+
}
|
|
252035
252403
|
return [body, ...notes2.map((note) => `
|
|
252036
252404
|
${note}`)].filter(Boolean).join("");
|
|
252037
252405
|
}
|
|
@@ -252060,9 +252428,13 @@ ${trimProcessText2(created.stderr || created.stdout)}`);
|
|
|
252060
252428
|
}
|
|
252061
252429
|
}
|
|
252062
252430
|
if (await pythonCanImport2(command, backendImportCheck(backend), repoRoot, pythonEnv)) {
|
|
252063
|
-
|
|
252431
|
+
await ensureCompatibleTorchRuntime(command, repoRoot, pythonEnv, onProgress);
|
|
252432
|
+
if (await pythonCanImport2(command, backendImportCheck(backend), repoRoot, pythonEnv)) {
|
|
252433
|
+
return { command, env: pythonEnv };
|
|
252434
|
+
}
|
|
252064
252435
|
}
|
|
252065
252436
|
const packages = backendPackages(backend);
|
|
252437
|
+
await ensureCompatibleTorchRuntime(command, repoRoot, pythonEnv, onProgress);
|
|
252066
252438
|
onProgress?.({ stage: "setup", message: `Installing ${backend} audio-generation Python packages` });
|
|
252067
252439
|
const pipArgs = [
|
|
252068
252440
|
"-m",
|
|
@@ -252074,7 +252446,7 @@ ${trimProcessText2(created.stderr || created.stdout)}`);
|
|
|
252074
252446
|
...backend === "audiocraft" ? ["--only-binary", "av"] : [],
|
|
252075
252447
|
"-U",
|
|
252076
252448
|
"pip",
|
|
252077
|
-
...packages
|
|
252449
|
+
...withoutTorchPackages(packages)
|
|
252078
252450
|
];
|
|
252079
252451
|
const pip = await runProcess3(command, pipArgs, {
|
|
252080
252452
|
cwd: repoRoot,
|
|
@@ -252091,6 +252463,12 @@ ${formatAudioSetupFailure(backend, pip.stderr || pip.stdout)}`);
|
|
|
252091
252463
|
if (importCheck.code !== 0) {
|
|
252092
252464
|
throw new Error(`Audio-generation Python environment at ${venvDir} was created, but required ${backend} imports still fail.
|
|
252093
252465
|
${formatAudioSetupFailure(backend, importCheck.stderr || importCheck.stdout)}`);
|
|
252466
|
+
}
|
|
252467
|
+
await ensureCompatibleTorchRuntime(command, repoRoot, pythonEnv, onProgress);
|
|
252468
|
+
if (!await pythonCanImport2(command, backendImportCheck(backend), repoRoot, pythonEnv)) {
|
|
252469
|
+
const retry = await pythonImportResult(command, backendImportCheck(backend), repoRoot, pythonEnv);
|
|
252470
|
+
throw new Error(`Audio-generation Python environment at ${venvDir} lost required ${backend} imports after PyTorch repair.
|
|
252471
|
+
${formatAudioSetupFailure(backend, retry.stderr || retry.stdout)}`);
|
|
252094
252472
|
}
|
|
252095
252473
|
return { command, env: pythonEnv };
|
|
252096
252474
|
}
|
|
@@ -252160,11 +252538,31 @@ function playbackRequested(args) {
|
|
|
252160
252538
|
return false;
|
|
252161
252539
|
return true;
|
|
252162
252540
|
}
|
|
252541
|
+
/**
 * Coerce a loosely typed tool argument into a boolean.
 *
 * Booleans pass through. Strings are trimmed and matched case-insensitively
 * against `1/true/yes/on` and `0/false/no/off`. Everything else, including
 * numbers and unrecognised strings, yields `fallback`.
 *
 * @param {*} value2 - Raw argument value.
 * @param {boolean} fallback - Result when the value cannot be interpreted.
 * @returns {boolean}
 */
function booleanArg2(value2, fallback) {
  switch (typeof value2) {
    case "boolean":
      return value2;
    case "string": {
      const token = value2.trim();
      if (/^(1|true|yes|on)$/i.test(token)) {
        return true;
      }
      if (/^(0|false|no|off)$/i.test(token)) {
        return false;
      }
      return fallback;
    }
    default:
      return fallback;
  }
}
|
|
252552
|
+
/**
 * Decide whether the model fallback ladder may be used for this call.
 *
 * A truthy strict flag (`strict_model`, `strictModel` or `strict`) always
 * disables fallback. Otherwise the first present fallback flag (`fallback`,
 * `allow_fallback` or `allowFallback`) decides, defaulting to enabled.
 *
 * @param {object} args - Raw tool arguments.
 * @returns {boolean} `true` when fallback is allowed.
 */
function generationFallbackEnabled2(args) {
  const strictFlag = args["strict_model"] ?? args["strictModel"] ?? args["strict"];
  const strictRequested = booleanArg2(strictFlag, false);
  if (strictRequested) {
    return false;
  }
  const fallbackFlag = args["fallback"] ?? args["allow_fallback"] ?? args["allowFallback"];
  return booleanArg2(fallbackFlag, true);
}
|
|
252163
252557
|
/**
 * Look up the preset for a model id.
 *
 * When `kind` is given, a preset with that id and that kind wins; if none
 * exists, the first preset with the id is returned regardless of kind.
 *
 * @param {string|undefined} model - Model id to look up.
 * @param {string} [kind] - Preferred preset kind.
 * @returns {object|undefined} Matching preset, or `undefined` when the model
 *   is falsy or unknown.
 */
function getAudioGenerationPreset(model, kind) {
  if (!model) {
    return void 0;
  }
  const sameId = AUDIO_GENERATION_MODEL_PRESETS.filter((entry) => entry.id === model);
  const preferred = sameId.find((entry) => !kind || entry.kind === kind);
  return preferred ?? sameId[0];
}
|
|
252562
|
+
/**
 * Resolve the ranked model-id ladder for a kind into preset objects.
 *
 * `"music"` uses the music ladder and every other kind uses the sound ladder.
 * Ids without a known preset are dropped; ladder order is kept.
 *
 * @param {string} kind - Generation kind.
 * @returns {object[]} Presets in ladder order.
 */
function audioGenerationQualityLadder(kind) {
  const rankedIds = kind === "music" ? MUSIC_GENERATION_QUALITY_LADDER : SOUND_GENERATION_QUALITY_LADDER;
  const presets = [];
  for (const modelId of rankedIds) {
    const preset = getAudioGenerationPreset(modelId, kind);
    if (preset) {
      presets.push(preset);
    }
  }
  return presets;
}
|
|
252168
252566
|
function inferAudioGenerationBackend(model, requested) {
|
|
252169
252567
|
if (requested && requested !== "auto") {
|
|
252170
252568
|
if (requested === "diffusers" || requested === "transformers" || requested === "audiocraft" || requested === "stable-audio" || requested === "tangoflux" || requested === "project")
|
|
@@ -252188,6 +252586,41 @@ function inferAudioGenerationBackend(model, requested) {
|
|
|
252188
252586
|
return "project";
|
|
252189
252587
|
return "diffusers";
|
|
252190
252588
|
}
|
|
252589
|
+
/**
 * Build one ladder candidate for a model.
 *
 * The backend comes from `inferAudioGenerationBackend`. If that still answers
 * `"auto"`, it is pinned to `"transformers"` for music and `"diffusers"` for
 * everything else.
 *
 * @param {string} kind - Generation kind.
 * @param {string} model - Model id.
 * @param {string} [requestedBackend] - Backend explicitly requested by the caller.
 * @returns {{ kind: string, model: string, backend: string, preset: (object|undefined) }}
 */
function audioCandidateFor(kind, model, requestedBackend) {
  let backend = inferAudioGenerationBackend(model, requestedBackend);
  if (backend === "auto") {
    backend = kind === "music" ? "transformers" : "diffusers";
  }
  const preset = getAudioGenerationPreset(model, kind);
  return { kind, model, backend, preset };
}
|
|
252599
|
+
/**
 * Build the ordered, de-duplicated list of candidates to try for one request.
 *
 * The primary candidate is chosen first:
 *   - an explicit model, paired with the requested backend, or
 *   - the first ladder entry for an explicitly requested backend (falling back
 *     to the kind's default model when the ladder has none), or
 *   - the kind's default model, only when fallback is disabled.
 *
 * With fallback disabled that single candidate is the whole result. Otherwise
 * the ladder is appended from the primary's position downwards, or in full when
 * the primary is not on it. Ladder entries always use their inferred backend.
 * Candidates are unique per `kind:backend:model`.
 *
 * @param {string} kind - Generation kind.
 * @param {string} [requestedModel] - Explicit model id, if any.
 * @param {string} [requestedBackend] - Explicit backend, `"auto"`, or nothing.
 * @param {boolean} [allowFallback=true] - Whether ladder entries may be appended.
 * @returns {object[]} Candidates in the order they should be attempted.
 */
function audioGenerationFallbackCandidates(kind, requestedModel, requestedBackend, allowFallback = true) {
  const ladder = audioGenerationQualityLadder(kind);
  const defaultModel = () => (kind === "music" ? DEFAULT_MUSIC_MODEL : DEFAULT_SOUND_MODEL);
  const backendPinned = Boolean(requestedBackend) && requestedBackend !== "auto";

  const seenKeys = new Set();
  const ordered = [];
  const enqueue = (candidate) => {
    const key = `${candidate.kind}:${candidate.backend}:${candidate.model}`;
    if (seenKeys.has(key)) {
      return;
    }
    seenKeys.add(key);
    ordered.push(candidate);
  };

  // Position in the ladder from which fallbacks are appended; -1 means "not on the ladder".
  let ladderStart = 0;
  if (requestedModel) {
    enqueue(audioCandidateFor(kind, requestedModel, requestedBackend));
    ladderStart = ladder.findIndex((preset) => preset.id === requestedModel);
  } else if (backendPinned) {
    ladderStart = ladder.findIndex((preset) => preset.backend === requestedBackend);
    enqueue(audioCandidateFor(kind, ladder[ladderStart]?.id ?? defaultModel(), requestedBackend));
  } else if (!allowFallback) {
    enqueue(audioCandidateFor(kind, defaultModel(), requestedBackend));
  }

  if (!allowFallback) {
    // Exactly one branch above ran and each of them enqueues, so the list is never empty here.
    return ordered;
  }

  const fallbackTail = ladderStart >= 0 ? ladder.slice(ladderStart) : ladder;
  fallbackTail.forEach((preset) => enqueue(audioCandidateFor(kind, preset.id)));
  return ordered;
}
|
|
252191
252624
|
function audioGenerationSetupPlan(kind, backend, repoRoot = ".", model) {
|
|
252192
252625
|
const commandName = kind === "music" ? "music" : "sound";
|
|
252193
252626
|
const fallback = kind === "music" ? DEFAULT_MUSIC_MODEL : DEFAULT_SOUND_MODEL;
|
|
@@ -252261,6 +252694,7 @@ function audioGenerationSetupPlan(kind, backend, repoRoot = ".", model) {
|
|
|
252261
252694
|
],
|
|
252262
252695
|
notes: [
|
|
252263
252696
|
"Use this path for Stable Audio Open 1.0, the serious stereo audio/music baseline.",
|
|
252697
|
+
"Omnius uses Diffusers StableAudioPipeline here; stable-audio-tools is intentionally not installed because it often pulls build-from-source dependencies.",
|
|
252264
252698
|
"Expect larger model downloads and higher VRAM pressure than AudioLDM or MusicGen small."
|
|
252265
252699
|
]
|
|
252266
252700
|
};
|
|
@@ -252296,7 +252730,34 @@ function audioGenerationSetupPlan(kind, backend, repoRoot = ".", model) {
|
|
|
252296
252730
|
]
|
|
252297
252731
|
};
|
|
252298
252732
|
}
|
|
252299
|
-
|
|
252733
|
+
/**
 * Condense a tool result into a short single-line failure reason.
 *
 * Uses `result.error`, else `result.output`, else `"unknown error"`; passes it
 * through `trimProcessText2` with a limit of 700, then collapses every run of
 * whitespace into one space.
 *
 * @param {{ error?: *, output?: * }} result - Tool result to summarise.
 * @returns {string} One-line summary.
 */
function summarizeToolResult2(result) {
  const rawReason = String(result.error || result.output || "unknown error");
  const clipped = trimProcessText2(rawReason, 700);
  const singleLine = clipped.replace(/\s+/g, " ");
  return singleLine.trim();
}
|
|
252736
|
+
/**
 * Render one failed attempt as a numbered report line.
 *
 * @param {{ model: string, backend: string }} candidate - Candidate that was tried.
 * @param {string} reason - Why it failed.
 * @param {number} index - Zero-based attempt position; shown one-based.
 * @returns {string} Line of the form `N. model [backend] - reason`.
 */
function formatAudioAttempt(candidate, reason, index) {
  const ordinal = index + 1;
  const { model, backend } = candidate;
  return `${ordinal}. ${model} [${backend}] - ${reason}`;
}
|
|
252739
|
+
/**
 * Build the multi-line report used when every candidate in the ladder failed.
 *
 * @param {string} kind - Generation kind, interpolated into the headline.
 * @param {{ candidate: object, reason: string }[]} failed - Failed attempts in
 *   the order they were tried.
 * @returns {string} Headline, a caption line, then one indented line per attempt.
 */
function formatAudioFallbackFailure(kind, failed) {
  const report = [
    `No ${kind} generation model in the fallback ladder completed successfully.`,
    "Attempted, highest quality to lowest:"
  ];
  failed.forEach((attempt, position) => {
    report.push(` ${formatAudioAttempt(attempt.candidate, attempt.reason, position)}`);
  });
  return report.join("\n");
}
|
|
252746
|
+
/**
 * Prefix a successful result with a summary of the attempts that failed first.
 *
 * When nothing failed, the very same `result` object is returned. Otherwise a
 * shallow copy is returned whose `output` starts with the summary. A missing
 * `output` is treated as empty text, so the summary is never followed by the
 * literal word "undefined".
 *
 * @param {object} result - Successful tool result.
 * @param {{ candidate: object, reason: string }[]} failed - Earlier failed attempts.
 * @param {{ model: string, backend: string }} winner - Candidate that succeeded.
 * @returns {object} `result` itself, or an annotated copy.
 */
function annotateAudioFallbackSuccess(result, failed, winner) {
  if (failed.length === 0)
    return result;
  const prefix = [
    `Fallback ladder succeeded with ${winner.model} [${winner.backend}] after ${failed.length} failed attempt(s).`,
    "Failed attempts:",
    ...failed.map((attempt, index) => ` ${formatAudioAttempt(attempt.candidate, attempt.reason, index)}`),
    // Trailing empty entry makes the joined prefix end with a newline.
    ""
  ].join("\n");
  return {
    ...result,
    output: `${prefix}${result.output ?? ""}`
  };
}
|
|
252760
|
+
var DEFAULT_SOUND_MODEL, DEFAULT_MUSIC_MODEL, DIFFUSERS_AUDIO_PACKAGES, TRANSFORMERS_AUDIO_PACKAGES, AUDIOCRAFT_PACKAGES, STABLE_AUDIO_PACKAGES, TANGOFLUX_PACKAGES, AUDIO_GENERATION_MODEL_PRESETS, SOUND_GENERATION_QUALITY_LADDER, MUSIC_GENERATION_QUALITY_LADDER, DIFFUSERS_AUDIO_RUNNER, AUDIOCRAFT_RUNNER, TRANSFORMERS_AUDIO_RUNNER, TANGOFLUX_RUNNER, AudioGenerateTool;
|
|
252300
252761
|
var init_audio_generate = __esm({
|
|
252301
252762
|
"packages/execution/dist/tools/audio-generate.js"() {
|
|
252302
252763
|
"use strict";
|
|
@@ -252338,7 +252799,6 @@ var init_audio_generate = __esm({
|
|
|
252338
252799
|
"accelerate",
|
|
252339
252800
|
"scipy",
|
|
252340
252801
|
"soundfile",
|
|
252341
|
-
"stable-audio-tools",
|
|
252342
252802
|
"einops"
|
|
252343
252803
|
];
|
|
252344
252804
|
TANGOFLUX_PACKAGES = [
|
|
@@ -252644,6 +253104,21 @@ var init_audio_generate = __esm({
|
|
|
252644
253104
|
note: "Legacy specialized music-generation path."
|
|
252645
253105
|
}
|
|
252646
253106
|
];
|
|
253107
|
+
SOUND_GENERATION_QUALITY_LADDER = [
|
|
253108
|
+
"stabilityai/stable-audio-open-1.0",
|
|
253109
|
+
"cvssp/audioldm2-large",
|
|
253110
|
+
"cvssp/audioldm2",
|
|
253111
|
+
"facebook/audiogen-medium",
|
|
253112
|
+
"declare-lab/TangoFlux",
|
|
253113
|
+
DEFAULT_SOUND_MODEL
|
|
253114
|
+
];
|
|
253115
|
+
MUSIC_GENERATION_QUALITY_LADDER = [
|
|
253116
|
+
"stabilityai/stable-audio-open-1.0",
|
|
253117
|
+
"facebook/musicgen-stereo-large",
|
|
253118
|
+
"facebook/musicgen-large",
|
|
253119
|
+
"facebook/musicgen-medium",
|
|
253120
|
+
DEFAULT_MUSIC_MODEL
|
|
253121
|
+
];
|
|
252647
253122
|
DIFFUSERS_AUDIO_RUNNER = String.raw`#!/usr/bin/env python3
|
|
252648
253123
|
import argparse, json, sys, time
|
|
252649
253124
|
from pathlib import Path
|
|
@@ -252685,6 +253160,10 @@ def _snapshot_model(repo_id):
|
|
|
252685
253160
|
def _device():
|
|
252686
253161
|
import torch
|
|
252687
253162
|
if torch.cuda.is_available():
|
|
253163
|
+
cap = torch.cuda.get_device_capability(0)
|
|
253164
|
+
cudnn = torch.backends.cudnn.version() or 0
|
|
253165
|
+
if int(cudnn) >= 90000 and tuple(cap) < (7, 5):
|
|
253166
|
+
raise RuntimeError(f"PyTorch cuDNN {cudnn} is incompatible with CUDA device {torch.cuda.get_device_name(0)} SM {cap[0]}.{cap[1]}; recreate the audio venv or let Omnius repair it with a cu118-compatible Torch wheel")
|
|
252688
253167
|
return "cuda"
|
|
252689
253168
|
if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
|
|
252690
253169
|
return "mps"
|
|
@@ -252877,6 +253356,10 @@ def _snapshot_model(repo_id):
|
|
|
252877
253356
|
def _device():
|
|
252878
253357
|
import torch
|
|
252879
253358
|
if torch.cuda.is_available():
|
|
253359
|
+
cap = torch.cuda.get_device_capability(0)
|
|
253360
|
+
cudnn = torch.backends.cudnn.version() or 0
|
|
253361
|
+
if int(cudnn) >= 90000 and tuple(cap) < (7, 5):
|
|
253362
|
+
raise RuntimeError(f"PyTorch cuDNN {cudnn} is incompatible with CUDA device {torch.cuda.get_device_name(0)} SM {cap[0]}.{cap[1]}; recreate the audio venv or let Omnius repair it with a cu118-compatible Torch wheel")
|
|
252880
253363
|
return "cuda"
|
|
252881
253364
|
if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
|
|
252882
253365
|
return "mps"
|
|
@@ -253030,7 +253513,7 @@ if __name__ == "__main__":
|
|
|
253030
253513
|
`;
|
|
253031
253514
|
AudioGenerateTool = class {
|
|
253032
253515
|
name = "generate_audio";
|
|
253033
|
-
description = "Generate a sound effect or music clip from a text prompt using local audio-generation backends. Supports Diffusers AudioLDM/AudioLDM2, Transformers MusicGen, AudioCraft AudioGen, Stable Audio Open deployment paths, and explicit research-project profiles. Saves WAV files under .omnius/audio and returns the file path.";
|
|
253516
|
+
description = "Generate a sound effect or music clip from a text prompt using local audio-generation backends. Supports Diffusers AudioLDM/AudioLDM2, Transformers MusicGen, AudioCraft AudioGen, Stable Audio Open deployment paths, and explicit research-project profiles. When fallback is enabled, auto generation tries ranked high-quality candidates first and gracefully falls back to smaller models if setup, download, or generation fails. Saves WAV files under .omnius/audio and returns the file path.";
|
|
253034
253517
|
parameters = {
|
|
253035
253518
|
type: "object",
|
|
253036
253519
|
properties: {
|
|
@@ -253044,6 +253527,14 @@ if __name__ == "__main__":
|
|
|
253044
253527
|
playback: {
|
|
253045
253528
|
type: "boolean",
|
|
253046
253529
|
description: "Whether the TUI should play generated audio after saving it. Defaults true; set false for silent generation."
|
|
253530
|
+
},
|
|
253531
|
+
fallback: {
|
|
253532
|
+
type: "boolean",
|
|
253533
|
+
description: "Whether to try the ranked quality ladder if the selected model/backend fails. Defaults true."
|
|
253534
|
+
},
|
|
253535
|
+
strict_model: {
|
|
253536
|
+
type: "boolean",
|
|
253537
|
+
description: "When true, use only the requested model/backend and do not fall back. Defaults false."
|
|
253047
253538
|
}
|
|
253048
253539
|
},
|
|
253049
253540
|
required: ["prompt"]
|
|
@@ -253147,14 +253638,14 @@ if __name__ == "__main__":
|
|
|
253147
253638
|
if (action === "list_models") {
|
|
253148
253639
|
return {
|
|
253149
253640
|
success: true,
|
|
253150
|
-
output: AUDIO_GENERATION_MODEL_PRESETS.filter((
|
|
253641
|
+
output: AUDIO_GENERATION_MODEL_PRESETS.filter((preset) => preset.kind === kind).map((preset) => `${preset.id} [${preset.backend}] - ${preset.note}`).join("\n"),
|
|
253151
253642
|
durationMs: performance.now() - start2
|
|
253152
253643
|
};
|
|
253153
253644
|
}
|
|
253154
253645
|
if (action === "setup") {
|
|
253155
253646
|
const requested = String(args["backend"] ?? (kind === "music" ? this.defaults.musicBackend : this.defaults.soundBackend) ?? (kind === "music" ? "transformers" : "diffusers"));
|
|
253156
|
-
const
|
|
253157
|
-
const resolvedBackend =
|
|
253647
|
+
const backend = inferAudioGenerationBackend(typeof args["model"] === "string" ? args["model"] : void 0, requested);
|
|
253648
|
+
const resolvedBackend = backend === "auto" ? kind === "music" ? "transformers" : "diffusers" : backend;
|
|
253158
253649
|
const plan = audioGenerationSetupPlan(kind, resolvedBackend, this.cwd, typeof args["model"] === "string" ? args["model"] : void 0);
|
|
253159
253650
|
return {
|
|
253160
253651
|
success: true,
|
|
@@ -253173,37 +253664,9 @@ if __name__ == "__main__":
|
|
|
253173
253664
|
const defaultBackend2 = kind === "music" ? this.defaults.musicBackend : this.defaults.soundBackend;
|
|
253174
253665
|
const rawModel2 = args["model"] ? String(args["model"]) : defaultModel2;
|
|
253175
253666
|
const requestedModel2 = rawModel2 === "auto" ? void 0 : rawModel2;
|
|
253176
|
-
|
|
253177
|
-
|
|
253178
|
-
|
|
253179
|
-
const model2 = requestedModel2 ?? (kind === "music" ? DEFAULT_MUSIC_MODEL : DEFAULT_SOUND_MODEL);
|
|
253180
|
-
const preset2 = getAudioGenerationPreset(model2, kind);
|
|
253181
|
-
const duration2 = numberArg2(args["duration"], preset2?.defaultDurationSec ?? (kind === "music" ? 20 : 8));
|
|
253182
|
-
if (backend2 === "project") {
|
|
253183
|
-
const plan = audioGenerationSetupPlan(kind, "project", this.cwd, model2);
|
|
253184
|
-
return {
|
|
253185
|
-
success: false,
|
|
253186
|
-
output: [
|
|
253187
|
-
`${preset2?.label ?? model2} is a project deployment profile, not an automatic generic runner.`,
|
|
253188
|
-
"",
|
|
253189
|
-
"Setup path:",
|
|
253190
|
-
...plan.commands.map((cmd) => ` ${cmd}`),
|
|
253191
|
-
"",
|
|
253192
|
-
...plan.notes.map((note) => `- ${note}`)
|
|
253193
|
-
].join("\n"),
|
|
253194
|
-
durationMs: performance.now() - start2
|
|
253195
|
-
};
|
|
253196
|
-
}
|
|
253197
|
-
this.emitProgress({ stage: "setup", message: `Preparing ${kind} model ${model2} (${backend2})` });
|
|
253198
|
-
return await this.prewarmPythonBackend({
|
|
253199
|
-
kind,
|
|
253200
|
-
backend: backend2,
|
|
253201
|
-
runnerBackend: backend2,
|
|
253202
|
-
model: model2,
|
|
253203
|
-
duration: duration2,
|
|
253204
|
-
start: start2,
|
|
253205
|
-
python: args["python"]
|
|
253206
|
-
});
|
|
253667
|
+
const requestedBackend2 = args["backend"] ? String(args["backend"]) : defaultBackend2;
|
|
253668
|
+
const candidates2 = audioGenerationFallbackCandidates(kind, requestedModel2, requestedBackend2, generationFallbackEnabled2(args));
|
|
253669
|
+
return await this.prewarmCandidateLadder({ kind, candidates: candidates2, args, start: start2 });
|
|
253207
253670
|
}
|
|
253208
253671
|
const prompt = String(args["prompt"] ?? "").trim();
|
|
253209
253672
|
if (!prompt) {
|
|
@@ -253213,45 +253676,12 @@ if __name__ == "__main__":
|
|
|
253213
253676
|
const defaultBackend = kind === "music" ? this.defaults.musicBackend : this.defaults.soundBackend;
|
|
253214
253677
|
const rawModel = args["model"] ? String(args["model"]) : defaultModel;
|
|
253215
253678
|
const requestedModel = rawModel === "auto" ? void 0 : rawModel;
|
|
253216
|
-
|
|
253217
|
-
|
|
253218
|
-
backend = kind === "music" ? "transformers" : "diffusers";
|
|
253219
|
-
const model = requestedModel ?? (kind === "music" ? DEFAULT_MUSIC_MODEL : DEFAULT_SOUND_MODEL);
|
|
253220
|
-
const preset = getAudioGenerationPreset(model, kind);
|
|
253221
|
-
const duration = numberArg2(args["duration"], preset?.defaultDurationSec ?? (kind === "music" ? 20 : 8));
|
|
253222
|
-
const steps = optionalNumberArg2(args["steps"]) ?? preset?.defaultSteps;
|
|
253679
|
+
const requestedBackend = args["backend"] ? String(args["backend"]) : defaultBackend;
|
|
253680
|
+
const candidates = audioGenerationFallbackCandidates(kind, requestedModel, requestedBackend, generationFallbackEnabled2(args));
|
|
253223
253681
|
const seed = optionalNumberArg2(args["seed"]);
|
|
253224
253682
|
const playback = playbackRequested(args);
|
|
253225
253683
|
try {
|
|
253226
|
-
this.
|
|
253227
|
-
if (backend === "project") {
|
|
253228
|
-
const plan = audioGenerationSetupPlan(kind, "project", this.cwd, model);
|
|
253229
|
-
return {
|
|
253230
|
-
success: false,
|
|
253231
|
-
output: [
|
|
253232
|
-
`${preset?.label ?? model} is a project deployment profile, not an automatic generic runner.`,
|
|
253233
|
-
"",
|
|
253234
|
-
"Setup path:",
|
|
253235
|
-
...plan.commands.map((cmd) => ` ${cmd}`),
|
|
253236
|
-
"",
|
|
253237
|
-
...plan.notes.map((note) => `- ${note}`)
|
|
253238
|
-
].join("\n"),
|
|
253239
|
-
durationMs: performance.now() - start2
|
|
253240
|
-
};
|
|
253241
|
-
}
|
|
253242
|
-
if (backend === "tangoflux") {
|
|
253243
|
-
return await this.generateWithPythonBackend({ kind, backend, runnerBackend: "tangoflux", prompt, model, duration, steps, seed, playback, start: start2, python: args["python"] });
|
|
253244
|
-
}
|
|
253245
|
-
if (backend === "transformers") {
|
|
253246
|
-
return await this.generateWithPythonBackend({ kind, backend, runnerBackend: "transformers", prompt, model, duration, steps, seed, playback, start: start2, python: args["python"] });
|
|
253247
|
-
}
|
|
253248
|
-
if (backend === "audiocraft") {
|
|
253249
|
-
return await this.generateWithPythonBackend({ kind, backend, runnerBackend: "audiocraft", prompt, model, duration, steps, seed, playback, start: start2, python: args["python"] });
|
|
253250
|
-
}
|
|
253251
|
-
if (backend === "stable-audio") {
|
|
253252
|
-
return await this.generateWithPythonBackend({ kind, backend, runnerBackend: "stable-audio", prompt, model, duration, steps, seed, playback, start: start2, python: args["python"] });
|
|
253253
|
-
}
|
|
253254
|
-
return await this.generateWithPythonBackend({ kind, backend: "diffusers", runnerBackend: "diffusers", prompt, model, duration, steps, seed, playback, start: start2, python: args["python"] });
|
|
253684
|
+
return await this.generateCandidateLadder({ kind, candidates, prompt, args, seed, playback, start: start2 });
|
|
253255
253685
|
} catch (err) {
|
|
253256
253686
|
return {
|
|
253257
253687
|
success: false,
|
|
@@ -253260,6 +253690,96 @@ if __name__ == "__main__":
|
|
|
253260
253690
|
};
|
|
253261
253691
|
}
|
|
253262
253692
|
}
|
|
253693
|
+
async prewarmCandidateLadder(args) {
|
|
253694
|
+
const failed = [];
|
|
253695
|
+
for (let index = 0; index < args.candidates.length; index++) {
|
|
253696
|
+
const candidate = args.candidates[index];
|
|
253697
|
+
const duration = numberArg2(args.args["duration"], candidate.preset?.defaultDurationSec ?? (args.kind === "music" ? 20 : 8));
|
|
253698
|
+
this.emitProgress({
|
|
253699
|
+
stage: "setup",
|
|
253700
|
+
message: `Preparing ${args.kind} model ${candidate.model} (${candidate.backend}) [${index + 1}/${args.candidates.length}]`
|
|
253701
|
+
});
|
|
253702
|
+
const result = candidate.backend === "project" ? this.projectProfileResult(args.kind, candidate, args.start) : await this.prewarmPythonBackend({
|
|
253703
|
+
kind: args.kind,
|
|
253704
|
+
backend: candidate.backend,
|
|
253705
|
+
runnerBackend: candidate.backend,
|
|
253706
|
+
model: candidate.model,
|
|
253707
|
+
duration,
|
|
253708
|
+
start: args.start,
|
|
253709
|
+
python: args.args["python"]
|
|
253710
|
+
});
|
|
253711
|
+
if (result.success)
|
|
253712
|
+
return annotateAudioFallbackSuccess(result, failed, candidate);
|
|
253713
|
+
failed.push({ candidate, reason: summarizeToolResult2(result) });
|
|
253714
|
+
if (index < args.candidates.length - 1) {
|
|
253715
|
+
this.emitProgress({
|
|
253716
|
+
stage: "setup",
|
|
253717
|
+
message: `${candidate.model} failed; trying ${args.candidates[index + 1].model}`
|
|
253718
|
+
});
|
|
253719
|
+
}
|
|
253720
|
+
}
|
|
253721
|
+
return {
|
|
253722
|
+
success: false,
|
|
253723
|
+
output: formatAudioFallbackFailure(args.kind, failed),
|
|
253724
|
+
error: formatAudioFallbackFailure(args.kind, failed),
|
|
253725
|
+
durationMs: performance.now() - args.start
|
|
253726
|
+
};
|
|
253727
|
+
}
|
|
253728
|
+
async generateCandidateLadder(args) {
|
|
253729
|
+
const failed = [];
|
|
253730
|
+
for (let index = 0; index < args.candidates.length; index++) {
|
|
253731
|
+
const candidate = args.candidates[index];
|
|
253732
|
+
const duration = numberArg2(args.args["duration"], candidate.preset?.defaultDurationSec ?? (args.kind === "music" ? 20 : 8));
|
|
253733
|
+
const steps = optionalNumberArg2(args.args["steps"]) ?? candidate.preset?.defaultSteps;
|
|
253734
|
+
this.emitProgress({
|
|
253735
|
+
stage: "setup",
|
|
253736
|
+
message: `Using ${args.kind} model ${candidate.model} (${candidate.backend}) [${index + 1}/${args.candidates.length}]`
|
|
253737
|
+
});
|
|
253738
|
+
const result = candidate.backend === "project" ? this.projectProfileResult(args.kind, candidate, args.start) : await this.generateWithPythonBackend({
|
|
253739
|
+
kind: args.kind,
|
|
253740
|
+
backend: candidate.backend,
|
|
253741
|
+
runnerBackend: candidate.backend,
|
|
253742
|
+
prompt: args.prompt,
|
|
253743
|
+
model: candidate.model,
|
|
253744
|
+
duration,
|
|
253745
|
+
steps,
|
|
253746
|
+
seed: args.seed,
|
|
253747
|
+
playback: args.playback,
|
|
253748
|
+
start: args.start,
|
|
253749
|
+
python: args.args["python"]
|
|
253750
|
+
});
|
|
253751
|
+
if (result.success)
|
|
253752
|
+
return annotateAudioFallbackSuccess(result, failed, candidate);
|
|
253753
|
+
failed.push({ candidate, reason: summarizeToolResult2(result) });
|
|
253754
|
+
if (index < args.candidates.length - 1) {
|
|
253755
|
+
this.emitProgress({
|
|
253756
|
+
stage: "setup",
|
|
253757
|
+
message: `${candidate.model} failed; falling back to ${args.candidates[index + 1].model}`
|
|
253758
|
+
});
|
|
253759
|
+
}
|
|
253760
|
+
}
|
|
253761
|
+
return {
|
|
253762
|
+
success: false,
|
|
253763
|
+
output: formatAudioFallbackFailure(args.kind, failed),
|
|
253764
|
+
error: formatAudioFallbackFailure(args.kind, failed),
|
|
253765
|
+
durationMs: performance.now() - args.start
|
|
253766
|
+
};
|
|
253767
|
+
}
|
|
253768
|
+
projectProfileResult(kind, candidate, start2) {
|
|
253769
|
+
const plan = audioGenerationSetupPlan(kind, "project", this.cwd, candidate.model);
|
|
253770
|
+
return {
|
|
253771
|
+
success: false,
|
|
253772
|
+
output: [
|
|
253773
|
+
`${candidate.preset?.label ?? candidate.model} is a project deployment profile, not an automatic generic runner.`,
|
|
253774
|
+
"",
|
|
253775
|
+
"Setup path:",
|
|
253776
|
+
...plan.commands.map((cmd) => ` ${cmd}`),
|
|
253777
|
+
"",
|
|
253778
|
+
...plan.notes.map((note) => `- ${note}`)
|
|
253779
|
+
].join("\n"),
|
|
253780
|
+
durationMs: performance.now() - start2
|
|
253781
|
+
};
|
|
253782
|
+
}
|
|
253263
253783
|
async generateWithPythonBackend(args) {
|
|
253264
253784
|
const runner = await ensureAudioRunner(this.cwd, args.runnerBackend);
|
|
253265
253785
|
await mkdir12(audioOutputDir(this.cwd), { recursive: true });
|
|
@@ -477005,7 +477525,7 @@ var require_path_browserify = __commonJS({
|
|
|
477005
477525
|
return path11.slice(start2, end);
|
|
477006
477526
|
}
|
|
477007
477527
|
},
|
|
477008
|
-
extname: function
|
|
477528
|
+
extname: function extname17(path11) {
|
|
477009
477529
|
assertPath(path11);
|
|
477010
477530
|
var startDot = -1;
|
|
477011
477531
|
var startPart = 0;
|
|
@@ -507166,22 +507686,22 @@ Saved to: ${tempFile}`,
|
|
|
507166
507686
|
});
|
|
507167
507687
|
|
|
507168
507688
|
// packages/execution/dist/tools/audio-playback.js
|
|
507169
|
-
import { execFileSync as
|
|
507689
|
+
import { execFileSync as execFileSync3, execSync as execSync29, spawn as spawn16 } from "node:child_process";
|
|
507170
507690
|
import { copyFileSync as copyFileSync2, existsSync as existsSync40, statSync as statSync18, writeFileSync as writeFileSync16, mkdirSync as mkdirSync16, readdirSync as readdirSync14 } from "node:fs";
|
|
507171
507691
|
import { basename as basename12, extname as extname10, isAbsolute, join as join58 } from "node:path";
|
|
507172
507692
|
import { homedir as homedir14, tmpdir as tmpdir11 } from "node:os";
|
|
507173
507693
|
function hasCommand3(command) {
|
|
507174
507694
|
try {
|
|
507175
507695
|
if (process.platform === "win32") {
|
|
507176
|
-
|
|
507696
|
+
execFileSync3("where", [command], { stdio: "ignore", timeout: 2e3 });
|
|
507177
507697
|
} else {
|
|
507178
|
-
|
|
507698
|
+
execFileSync3("command", ["-v", command], { stdio: "ignore", timeout: 2e3 });
|
|
507179
507699
|
}
|
|
507180
507700
|
return true;
|
|
507181
507701
|
} catch {
|
|
507182
507702
|
if (process.platform !== "win32") {
|
|
507183
507703
|
try {
|
|
507184
|
-
|
|
507704
|
+
execFileSync3("which", [command], { stdio: "ignore", timeout: 2e3 });
|
|
507185
507705
|
return true;
|
|
507186
507706
|
} catch {
|
|
507187
507707
|
return false;
|
|
@@ -507236,7 +507756,7 @@ function playSoundFile(file, opts = {}) {
|
|
|
507236
507756
|
};
|
|
507237
507757
|
}
|
|
507238
507758
|
try {
|
|
507239
|
-
|
|
507759
|
+
execFileSync3(command.command, command.args, { timeout: opts.timeoutMs ?? 3e5, stdio: "pipe" });
|
|
507240
507760
|
return { ok: true, player: command.label };
|
|
507241
507761
|
} catch (err) {
|
|
507242
507762
|
return { ok: false, error: `Playback via ${command.label} failed: ${err instanceof Error ? err.message.slice(0, 300) : String(err).slice(0, 300)}` };
|
|
@@ -507359,6 +507879,18 @@ function supertonicInferScript() {
|
|
|
507359
507879
|
/**
 * Path of the Python interpreter inside the managed "mlx-venv" virtualenv.
 * Uses the `Scripts/python.exe` layout on win32 and `bin/python3` elsewhere.
 * @returns {string}
 */
function mlxVenvPy() {
  if (process.platform === "win32") {
    return join58(voiceDir(), "mlx-venv", "Scripts", "python.exe");
  }
  return join58(voiceDir(), "mlx-venv", "bin", "python3");
}
|
|
507882
|
+
/**
 * Directory of the managed LuxTTS virtualenv ("luxtts-venv" under voiceDir()).
 * @returns {string}
 */
function luxttsVenvDir() {
  const base = voiceDir();
  return join58(base, "luxtts-venv");
}
|
|
507885
|
+
/**
 * Path of the Python interpreter inside the LuxTTS virtualenv.
 * Uses the `Scripts/python.exe` layout on win32 and `bin/python3` elsewhere.
 * @returns {string}
 */
function luxttsVenvPy() {
  if (process.platform === "win32") {
    return join58(luxttsVenvDir(), "Scripts", "python.exe");
  }
  return join58(luxttsVenvDir(), "bin", "python3");
}
|
|
507888
|
+
/**
 * Directory of the LuxTTS source checkout ("LuxTTS" under voiceDir()).
 * @returns {string}
 */
function luxttsRepoDir() {
  const base = voiceDir();
  return join58(base, "LuxTTS");
}
|
|
507891
|
+
/**
 * Path of the LuxTTS inference script ("luxtts-infer.py" under voiceDir()).
 * @returns {string}
 */
function luxttsInferScript() {
  const base = voiceDir();
  return join58(base, "luxtts-infer.py");
}
|
|
507362
507894
|
/**
 * Directory of the managed Piper virtualenv ("piper-venv" under voiceDir()).
 * @returns {string}
 */
function piperVenvDir() {
  const base = voiceDir();
  return join58(base, "piper-venv");
}
|
|
@@ -507371,13 +507903,13 @@ function ensureSupertonicInstalled() {
|
|
|
507371
507903
|
const py = findPython32();
|
|
507372
507904
|
if (!py)
|
|
507373
507905
|
throw new Error("python3 is required to set up Supertonic TTS.");
|
|
507374
|
-
|
|
507906
|
+
execFileSync3(py, ["-m", "venv", join58(voiceDir(), "supertonic3-venv")], { stdio: "pipe", timeout: 18e4 });
|
|
507375
507907
|
}
|
|
507376
507908
|
try {
|
|
507377
|
-
|
|
507909
|
+
execFileSync3(venvPy, ["-c", "import supertonic"], { stdio: "pipe", timeout: 1e4 });
|
|
507378
507910
|
} catch {
|
|
507379
|
-
|
|
507380
|
-
|
|
507911
|
+
execFileSync3(venvPy, ["-m", "pip", "install", "--quiet", "--upgrade", "pip"], { stdio: "pipe", timeout: 12e4 });
|
|
507912
|
+
execFileSync3(venvPy, ["-m", "pip", "install", "--quiet", "supertonic"], { stdio: "pipe", timeout: 6e5 });
|
|
507381
507913
|
}
|
|
507382
507914
|
mkdirSync16(voiceDir(), { recursive: true });
|
|
507383
507915
|
writeFileSync16(supertonicInferScript(), SUPERTONIC_INFER_PY, "utf-8");
|
|
@@ -507385,20 +507917,95 @@ function ensureSupertonicInstalled() {
|
|
|
507385
507917
|
}
|
|
507386
507918
|
/**
 * Make sure the managed MLX Audio virtualenv exists and can import `mlx_audio`.
 *
 * Creates the "mlx-venv" virtualenv when its interpreter is missing, then
 * installs `mlx-audio` with pip if the import probe fails.
 *
 * @returns {string} Path of the virtualenv's Python interpreter.
 * @throws {Error} When not running on darwin/arm64, or when no python3 is
 *   found to create the virtualenv. Errors from the venv/pip subprocesses
 *   propagate unchanged.
 */
function ensureMlxInstalled() {
  const onAppleSilicon = process.platform === "darwin" && process.arch === "arm64";
  if (!onAppleSilicon) {
    throw new Error("MLX TTS requires macOS on Apple Silicon. Use luxtts, supertonic, onnx/piper, or backend=auto on this machine.");
  }

  const venvPy = mlxVenvPy();
  const venvMissing = !existsSync40(venvPy);
  if (venvMissing) {
    const py = findPython32();
    if (!py) {
      throw new Error("python3 is required to set up MLX Audio.");
    }
    const venvTarget = join58(voiceDir(), "mlx-venv");
    execFileSync3(py, ["-m", "venv", venvTarget], { stdio: "pipe", timeout: 18e4 });
  }

  try {
    // Import probe: succeeds only when mlx_audio is already installed.
    execFileSync3(venvPy, ["-c", "import mlx_audio"], { stdio: "pipe", timeout: 1e4 });
  } catch {
    const upgradePip = ["-m", "pip", "install", "--quiet", "--upgrade", "pip"];
    const installMlxAudio = ["-m", "pip", "install", "--quiet", "mlx-audio"];
    execFileSync3(venvPy, upgradePip, { stdio: "pipe", timeout: 12e4 });
    execFileSync3(venvPy, installMlxAudio, { stdio: "pipe", timeout: 6e5 });
  }
  return venvPy;
}
|
|
507937
|
+
/**
 * Probe whether the given interpreter can import `LuxTTS` from the checkout.
 *
 * Runs a short `-c` script with LUXTTS_REPO_PATH set to `luxttsRepoDir()`;
 * any subprocess failure (missing interpreter, import error, timeout) is
 * reported as `false`.
 *
 * @param {string} venvPy - Python interpreter to probe.
 * @returns {boolean} `true` when the probe process exits successfully.
 */
function pythonCanImportLuxTts(venvPy) {
  const probe = "import sys, os; sys.path.insert(0, os.environ['LUXTTS_REPO_PATH']); from zipvoice.luxvoice import LuxTTS; print('ok')";
  let importable = true;
  try {
    const options = {
      stdio: "pipe",
      timeout: 3e4,
      env: { ...process.env, LUXTTS_REPO_PATH: luxttsRepoDir() }
    };
    execFileSync3(venvPy, ["-c", probe], options);
  } catch {
    importable = false;
  }
  return importable;
}
|
|
507952
|
+
/**
 * Run `pip install --prefer-binary <packages...>` with the given interpreter.
 *
 * @param {string} venvPy - Python interpreter whose pip is used.
 * @param {string[]} packages - Arguments appended after `--prefer-binary`.
 * @param {number} [timeout2=9e5] - Subprocess timeout in milliseconds.
 * @throws Propagates the error raised by the subprocess call.
 */
function pipInstall(venvPy, packages, timeout2 = 9e5) {
  const pipArgs = ["-m", "pip", "install", "--prefer-binary"].concat(packages);
  const options = { stdio: "pipe", timeout: timeout2, env: process.env };
  execFileSync3(venvPy, pipArgs, options);
}
|
|
507959
|
+
/**
 * Provision the managed LuxTTS environment and return its interpreter path.
 *
 * Fast path: when the virtualenv interpreter exists, the checkout contains
 * `zipvoice/luxvoice.py`, and the import probe succeeds, only the daemon
 * script is rewritten. Otherwise the virtualenv is created if missing, pip
 * tooling and dependencies are installed, the LuxTTS repository is cloned if
 * missing, the checkout is installed in editable mode, and the import probe is
 * run again.
 *
 * @returns {string} Path of the LuxTTS virtualenv's Python interpreter.
 * @throws {Error} When python3 or git is unavailable, when the final import
 *   probe still fails, or when a venv/pip/git subprocess fails.
 */
function ensureLuxttsInstalled() {
  const venvPy = luxttsVenvPy();
  const repoDir = luxttsRepoDir();
  const luxvoiceModule = join58(repoDir, "zipvoice", "luxvoice.py");
  const longTimeout = 12e5;
  mkdirSync16(voiceDir(), { recursive: true });

  const alreadyUsable = existsSync40(venvPy) && existsSync40(luxvoiceModule) && pythonCanImportLuxTts(venvPy);
  if (alreadyUsable) {
    writeFileSync16(luxttsInferScript(), LUXTTS_DAEMON_PY, "utf-8");
    return venvPy;
  }

  const py = findPython32();
  if (!py) {
    throw new Error("python3 is required to set up LuxTTS voice cloning.");
  }
  if (!existsSync40(venvPy)) {
    execFileSync3(py, ["-m", "venv", luxttsVenvDir()], { stdio: "pipe", timeout: 18e4 });
  }

  const toolingArgs = ["-m", "pip", "install", "--upgrade", "pip", "wheel", "setuptools<81"];
  execFileSync3(venvPy, toolingArgs, { stdio: "pipe", timeout: 3e5 });
  pipInstall(venvPy, ["torch", "torchaudio"], longTimeout);

  if (!existsSync40(luxvoiceModule)) {
    if (!hasCommand3("git")) {
      throw new Error("git is required to set up LuxTTS voice cloning.");
    }
    // NOTE(review): if repoDir exists without luxvoice.py (e.g. an interrupted
    // clone), this clone presumably fails on the non-empty directory - confirm.
    const cloneArgs = ["clone", "--depth", "1", "https://github.com/ysharma3501/LuxTTS.git", repoDir];
    execFileSync3("git", cloneArgs, { stdio: "pipe", timeout: 3e5 });
  }

  const runtimeDeps = [
    "lhotse",
    "huggingface_hub",
    "safetensors",
    "pydub",
    "onnxruntime",
    "librosa",
    "transformers<=4.57.6",
    "inflect",
    "numpy",
    "vocos",
    "jieba",
    "pypinyin",
    "cn2an"
  ];
  pipInstall(venvPy, runtimeDeps, longTimeout);

  try {
    pipInstall(venvPy, ["git+https://github.com/ysharma3501/LinaCodec.git"], longTimeout);
  } catch {
    // Failure of this install is ignored; setup continues and the final
    // import probe below decides whether the environment is usable.
  }

  pipInstall(venvPy, ["-e", repoDir], 6e5);
  writeFileSync16(luxttsInferScript(), LUXTTS_DAEMON_PY, "utf-8");

  if (!pythonCanImportLuxTts(venvPy)) {
    throw new Error(`LuxTTS setup completed but import still fails in ${luxttsVenvDir()}.`);
  }
  return venvPy;
}
|
|
@@ -507411,10 +508018,10 @@ function ensurePiperInstalled() {
|
|
|
507411
508018
|
if (!py)
|
|
507412
508019
|
throw new Error("python3 is required to set up Piper TTS.");
|
|
507413
508020
|
mkdirSync16(voiceDir(), { recursive: true });
|
|
507414
|
-
|
|
508021
|
+
execFileSync3(py, ["-m", "venv", piperVenvDir()], { stdio: "pipe", timeout: 18e4 });
|
|
507415
508022
|
const venvPy = process.platform === "win32" ? join58(piperVenvDir(), "Scripts", "python.exe") : join58(piperVenvDir(), "bin", "python3");
|
|
507416
|
-
|
|
507417
|
-
|
|
508023
|
+
execFileSync3(venvPy, ["-m", "pip", "install", "--quiet", "--upgrade", "pip"], { stdio: "pipe", timeout: 12e4 });
|
|
508024
|
+
execFileSync3(venvPy, ["-m", "pip", "install", "--quiet", "piper-tts"], { stdio: "pipe", timeout: 6e5 });
|
|
507418
508025
|
}
|
|
507419
508026
|
if (!existsSync40(bin)) {
|
|
507420
508027
|
throw new Error("Piper TTS installed but the piper executable was not found in the managed venv.");
|
|
@@ -507435,6 +508042,28 @@ function saveCloneRefFromSample(sample, cloneName) {
|
|
|
507435
508042
|
copyFileSync2(source, dest);
|
|
507436
508043
|
return dest;
|
|
507437
508044
|
}
|
|
508045
|
+
/**
 * Return the first non-blank source-clip argument, trimmed.
 *
 * Checks the alias keys in priority order: sample, source_audio, voice_sample,
 * reference_audio, ref_audio, clone_sample. Non-string and whitespace-only
 * values are skipped.
 *
 * @param {object} args - Raw tool arguments.
 * @returns {string} The trimmed clip path, or "" when none is provided.
 */
function cloneSampleArg(args) {
  const aliasKeys = ["sample", "source_audio", "voice_sample", "reference_audio", "ref_audio", "clone_sample"];
  for (let i = 0; i < aliasKeys.length; i += 1) {
    const candidate = args[aliasKeys[i]];
    if (typeof candidate !== "string") {
      continue;
    }
    const trimmed = candidate.trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return "";
}
|
|
508053
|
+
/**
 * Decide whether the arguments ask for voice cloning from a source clip.
 *
 * A request counts as cloning when a source-clip alias is set (see
 * cloneSampleArg), when `clone_ref` is a non-blank string, or when `voice`
 * looks like an audio file: a wav/mp3/flac/ogg/m4a extension, or a path
 * starting with "/", "./", "../" or "~/".
 *
 * A `voice` ending in ".onnx" is never treated as a clone source on its own:
 * the Piper/ONNX backends accept `voice=<path.onnx>` as a model path, and
 * classifying it as cloning would make those backends reject the request.
 *
 * @param {object} args - Raw tool arguments.
 * @returns {boolean} `true` when the request should use a cloning backend.
 */
function wantsVoiceClone(args) {
  if (cloneSampleArg(args)) {
    return true;
  }
  if (typeof args["clone_ref"] === "string" && args["clone_ref"].trim()) {
    return true;
  }
  const voice = typeof args["voice"] === "string" ? args["voice"].trim() : "";
  // An ONNX model path selects a Piper/ONNX voice, not a clone reference clip.
  if (/\.onnx$/i.test(voice)) {
    return false;
  }
  return /\.(wav|mp3|flac|ogg|m4a)$/i.test(voice) || voice.startsWith("/") || voice.startsWith("./") || voice.startsWith("../") || voice.startsWith("~/");
}
|
|
508061
|
+
/**
 * Resolve the clone reference to use for a synthesis request.
 *
 * When a source clip is supplied it is registered through
 * saveCloneRefFromSample (named by `clone_name` if that is a string);
 * otherwise `clone_ref`, falling back to `voice`, is passed to resolveCloneRef.
 *
 * @param {object} args - Raw tool arguments.
 * @returns {*} Whatever saveCloneRefFromSample / resolveCloneRef returns.
 */
function cloneRefForSynthesis(args) {
  const sample = cloneSampleArg(args);
  if (!sample) {
    return resolveCloneRef(args["clone_ref"] ?? args["voice"]);
  }
  const cloneName = typeof args["clone_name"] === "string" ? args["clone_name"] : void 0;
  return saveCloneRefFromSample(sample, cloneName);
}
|
|
507438
508067
|
function ensureLuxttsDaemon() {
|
|
507439
508068
|
if (_luxttsDaemon && !_luxttsDaemon.killed && _luxttsReady)
|
|
507440
508069
|
return Promise.resolve(true);
|
|
@@ -507448,14 +508077,23 @@ function ensureLuxttsDaemon() {
|
|
|
507448
508077
|
}
|
|
507449
508078
|
if (_luxttsStarting)
|
|
507450
508079
|
return Promise.resolve(false);
|
|
507451
|
-
const venvPy =
|
|
507452
|
-
const inferScript =
|
|
507453
|
-
const repoDir =
|
|
508080
|
+
const venvPy = luxttsVenvPy();
|
|
508081
|
+
const inferScript = luxttsInferScript();
|
|
508082
|
+
const repoDir = luxttsRepoDir();
|
|
507454
508083
|
if (!existsSync40(venvPy) || !existsSync40(inferScript))
|
|
507455
508084
|
return Promise.resolve(false);
|
|
507456
508085
|
_luxttsStarting = true;
|
|
507457
508086
|
return new Promise((resolve48) => {
|
|
507458
|
-
|
|
508087
|
+
let settled = false;
|
|
508088
|
+
let timeout2;
|
|
508089
|
+
const finish = (ready) => {
|
|
508090
|
+
if (settled)
|
|
508091
|
+
return;
|
|
508092
|
+
settled = true;
|
|
508093
|
+
clearTimeout(timeout2);
|
|
508094
|
+
resolve48(ready);
|
|
508095
|
+
};
|
|
508096
|
+
timeout2 = setTimeout(() => {
|
|
507459
508097
|
_luxttsStarting = false;
|
|
507460
508098
|
if (_luxttsDaemon && !_luxttsReady) {
|
|
507461
508099
|
try {
|
|
@@ -507464,7 +508102,7 @@ function ensureLuxttsDaemon() {
|
|
|
507464
508102
|
}
|
|
507465
508103
|
_luxttsDaemon = null;
|
|
507466
508104
|
}
|
|
507467
|
-
|
|
508105
|
+
finish(false);
|
|
507468
508106
|
}, 12e4);
|
|
507469
508107
|
const daemon = spawn16(venvPy, [inferScript], {
|
|
507470
508108
|
stdio: ["pipe", "pipe", "pipe"],
|
|
@@ -507486,8 +508124,7 @@ function ensureLuxttsDaemon() {
|
|
|
507486
508124
|
if (msg.type === "ready") {
|
|
507487
508125
|
_luxttsReady = true;
|
|
507488
508126
|
_luxttsStarting = false;
|
|
507489
|
-
|
|
507490
|
-
resolve48(true);
|
|
508127
|
+
finish(true);
|
|
507491
508128
|
} else if (msg.type === "result" && msg.id) {
|
|
507492
508129
|
const pending = _luxttsPending.get(msg.id);
|
|
507493
508130
|
if (pending) {
|
|
@@ -507509,13 +508146,13 @@ function ensureLuxttsDaemon() {
|
|
|
507509
508146
|
_luxttsDaemon = null;
|
|
507510
508147
|
_luxttsReady = false;
|
|
507511
508148
|
_luxttsStarting = false;
|
|
508149
|
+
finish(false);
|
|
507512
508150
|
});
|
|
507513
508151
|
daemon.on("error", () => {
|
|
507514
508152
|
_luxttsDaemon = null;
|
|
507515
508153
|
_luxttsReady = false;
|
|
507516
508154
|
_luxttsStarting = false;
|
|
507517
|
-
|
|
507518
|
-
resolve48(false);
|
|
508155
|
+
finish(false);
|
|
507519
508156
|
});
|
|
507520
508157
|
});
|
|
507521
508158
|
}
|
|
@@ -507545,7 +508182,7 @@ function luxttsSynthesize(text, cloneRef, outputPath2, speed = 1) {
|
|
|
507545
508182
|
_luxttsDaemon.stdin.write(req2 + "\n");
|
|
507546
508183
|
});
|
|
507547
508184
|
}
|
|
507548
|
-
var _luxttsDaemon, _luxttsReady, _luxttsRequestId, _luxttsPending, _luxttsBuffer, _luxttsStarting, SUPERTONIC_INFER_PY, AudioPlaybackTool, TtsGenerateTool, SoundPlaybackTool;
|
|
508185
|
+
var _luxttsDaemon, _luxttsReady, _luxttsRequestId, _luxttsPending, _luxttsBuffer, _luxttsStarting, SUPERTONIC_INFER_PY, LUXTTS_DAEMON_PY, AudioPlaybackTool, TtsGenerateTool, SoundPlaybackTool;
|
|
507549
508186
|
var init_audio_playback = __esm({
|
|
507550
508187
|
"packages/execution/dist/tools/audio-playback.js"() {
|
|
507551
508188
|
"use strict";
|
|
@@ -507585,10 +508222,45 @@ try:
|
|
|
507585
508222
|
except Exception as exc:
|
|
507586
508223
|
print(json.dumps({"ok": False, "error": str(exc), "trace": traceback.format_exc(limit=3)}))
|
|
507587
508224
|
sys.exit(1)
|
|
508225
|
+
`;
|
|
508226
|
+
LUXTTS_DAEMON_PY = String.raw`
|
|
508227
|
+
import json, os, sys, traceback, wave
|
|
508228
|
+
import numpy as np
|
|
508229
|
+
import torch
|
|
508230
|
+
repo = os.environ.get("LUXTTS_REPO_PATH") or ""
|
|
508231
|
+
if repo:
|
|
508232
|
+
sys.path.insert(0, repo)
|
|
508233
|
+
from zipvoice.luxvoice import LuxTTS
|
|
508234
|
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
508235
|
+
tts = LuxTTS(model_path="YatharthS/LuxTTS", device=device, threads=4)
|
|
508236
|
+
print(json.dumps({"type": "ready", "device": device}), flush=True)
|
|
508237
|
+
for line in sys.stdin:
|
|
508238
|
+
if not line.strip():
|
|
508239
|
+
continue
|
|
508240
|
+
req = json.loads(line)
|
|
508241
|
+
if req.get("action") == "quit":
|
|
508242
|
+
break
|
|
508243
|
+
rid = req.get("id")
|
|
508244
|
+
try:
|
|
508245
|
+
text = str(req.get("text") or "").strip()
|
|
508246
|
+
clone_ref = str(req.get("clone_ref") or "")
|
|
508247
|
+
output = str(req.get("output_path") or "")
|
|
508248
|
+
speed = float(req.get("speed") or 1.0)
|
|
508249
|
+
enc = tts.encode_prompt(clone_ref, duration=5, rms=0.001)
|
|
508250
|
+
wav = tts.generate_speech(text, enc, num_steps=4, guidance_scale=3.0, t_shift=0.5, speed=speed)
|
|
508251
|
+
data = (np.clip(wav.cpu().numpy().squeeze(), -1, 1) * 32767).astype(np.int16)
|
|
508252
|
+
with wave.open(output, "wb") as f:
|
|
508253
|
+
f.setnchannels(1)
|
|
508254
|
+
f.setsampwidth(2)
|
|
508255
|
+
f.setframerate(48000)
|
|
508256
|
+
f.writeframes(data.tobytes())
|
|
508257
|
+
print(json.dumps({"type": "result", "id": rid, "path": output}), flush=True)
|
|
508258
|
+
except Exception as exc:
|
|
508259
|
+
print(json.dumps({"type": "error", "id": rid, "error": str(exc), "trace": traceback.format_exc(limit=3)}), flush=True)
|
|
507588
508260
|
`;
|
|
507589
508261
|
AudioPlaybackTool = class {
|
|
507590
508262
|
name = "audio_playback";
|
|
507591
|
-
description = "Play audio through speakers, synthesize text-to-speech, and manage TTS clone voices. Actions: 'play' to play an audio file (WAV/MP3/OGG — including recordings from memory episodes), 'speak' to synthesize and play text, 'synthesize' to save TTS to a WAV file, 'clone' to register a voice-clone
|
|
508263
|
+
description = "Play audio through speakers, synthesize text-to-speech, and manage TTS clone voices. Actions: 'play' to play an audio file (WAV/MP3/OGG — including recordings from memory episodes), 'speak' to synthesize and play text, 'synthesize' to save TTS to a WAV file, 'clone' to register a voice-clone source clip, 'list_voices' to inspect available clone refs/backends, 'volume' to get or set system volume, 'list' to enumerate audio output devices. TTS backends include auto, LuxTTS voice cloning, Supertonic, MLX, ONNX/Piper, and a local fallback. Neural TTS backends self-provision into ~/.omnius/voice on first use where supported. For cloned speech from a source clip, call generate_tts or audio_playback action=synthesize with sample/source_audio/voice_sample and backend=auto or luxtts. Use generate_tts when the task is specifically to create a TTS file; do not use shell speech commands or generate_audio for spoken TTS.";
|
|
507592
508264
|
parameters = {
|
|
507593
508265
|
type: "object",
|
|
507594
508266
|
properties: {
|
|
@@ -507615,8 +508287,8 @@ except Exception as exc:
|
|
|
507615
508287
|
},
|
|
507616
508288
|
backend: {
|
|
507617
508289
|
type: "string",
|
|
507618
|
-
enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper"
|
|
507619
|
-
description: "TTS backend. auto tries LuxTTS clone, Supertonic, MLX on Apple Silicon, Piper/ONNX, then
|
|
508290
|
+
enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper"],
|
|
508291
|
+
description: "TTS backend. auto tries LuxTTS clone, Supertonic, MLX on Apple Silicon, Piper/ONNX, then a local fallback."
|
|
507620
508292
|
},
|
|
507621
508293
|
output: {
|
|
507622
508294
|
type: "string",
|
|
@@ -507632,11 +508304,31 @@ except Exception as exc:
|
|
|
507632
508304
|
},
|
|
507633
508305
|
sample: {
|
|
507634
508306
|
type: "string",
|
|
507635
|
-
description: "Audio
|
|
508307
|
+
description: "Audio source clip path to register or use as a LuxTTS clone voice."
|
|
508308
|
+
},
|
|
508309
|
+
source_audio: {
|
|
508310
|
+
type: "string",
|
|
508311
|
+
description: "Alias for sample. Use this for cloned speech from a source voice clip."
|
|
508312
|
+
},
|
|
508313
|
+
voice_sample: {
|
|
508314
|
+
type: "string",
|
|
508315
|
+
description: "Alias for sample/source_audio."
|
|
508316
|
+
},
|
|
508317
|
+
reference_audio: {
|
|
508318
|
+
type: "string",
|
|
508319
|
+
description: "Alias for sample/source_audio."
|
|
508320
|
+
},
|
|
508321
|
+
ref_audio: {
|
|
508322
|
+
type: "string",
|
|
508323
|
+
description: "Alias for sample/source_audio."
|
|
508324
|
+
},
|
|
508325
|
+
clone_sample: {
|
|
508326
|
+
type: "string",
|
|
508327
|
+
description: "Alias for sample/source_audio."
|
|
507636
508328
|
},
|
|
507637
508329
|
clone_name: {
|
|
507638
508330
|
type: "string",
|
|
507639
|
-
description: "Friendly filename stem for action=clone."
|
|
508331
|
+
description: "Friendly filename stem for action=clone or for registering a source clip during synthesis."
|
|
507640
508332
|
},
|
|
507641
508333
|
model: {
|
|
507642
508334
|
type: "string",
|
|
@@ -507652,11 +508344,11 @@ except Exception as exc:
|
|
|
507652
508344
|
},
|
|
507653
508345
|
speed: {
|
|
507654
508346
|
type: "number",
|
|
507655
|
-
description: "Speech speed.
|
|
508347
|
+
description: "Speech speed. Neural backends use a multiplier; local fallback uses its backend-specific rate."
|
|
507656
508348
|
},
|
|
507657
508349
|
voice: {
|
|
507658
508350
|
type: "string",
|
|
507659
|
-
description: "Voice id/name. Examples: Supertonic voice M4, MLX voice af_heart,
|
|
508351
|
+
description: "Voice id/name. Examples: Supertonic voice M4, MLX voice af_heart, a source audio path for cloning, or Piper/ONNX model path."
|
|
507660
508352
|
},
|
|
507661
508353
|
lang: {
|
|
507662
508354
|
type: "string",
|
|
@@ -507720,9 +508412,9 @@ except Exception as exc:
|
|
|
507720
508412
|
return await this.synthesizeText(args, start2, true);
|
|
507721
508413
|
}
|
|
507722
508414
|
cloneVoice(args, start2) {
|
|
507723
|
-
const sample =
|
|
508415
|
+
const sample = cloneSampleArg(args) || (typeof args["file"] === "string" ? args["file"] : "");
|
|
507724
508416
|
if (!sample.trim()) {
|
|
507725
|
-
return { success: false, output: "", error: "Missing
|
|
508417
|
+
return { success: false, output: "", error: "Missing source audio. Provide sample=<file> or source_audio=<file> to register as a clone voice.", durationMs: performance.now() - start2 };
|
|
507726
508418
|
}
|
|
507727
508419
|
const saved = saveCloneRefFromSample(sample, typeof args["clone_name"] === "string" ? args["clone_name"] : void 0);
|
|
507728
508420
|
return {
|
|
@@ -507739,10 +508431,11 @@ except Exception as exc:
|
|
|
507739
508431
|
const lines = [
|
|
507740
508432
|
"TTS backends:",
|
|
507741
508433
|
` luxtts: ${existsSync40(join58(voiceDir(), "luxtts-venv", "bin", "python3")) ? "installed" : "not installed"}; clone refs: ${refs.length}`,
|
|
508434
|
+
" clone from source clip: generate_tts text=<words> source_audio=<wav/mp3/flac/ogg/m4a> backend=auto",
|
|
507742
508435
|
` supertonic: ${existsSync40(supertonicVenvPy()) ? "installed" : "not installed"}; voices include M1, M2, M3, M4 when package assets are available`,
|
|
507743
508436
|
` mlx: ${existsSync40(mlxVenvPy()) ? "installed" : "not installed"}; Apple Silicon only; default model mlx-community/Kokoro-82M-bf16`,
|
|
507744
508437
|
` piper/onnx: ${hasCommand3("piper") || existsSync40(piperVenvBin()) ? "available" : "not installed"}; first use installs piper-tts into ${piperVenvDir()}; pass model=<path.onnx> for raw ONNX voices`,
|
|
507745
|
-
`
|
|
508438
|
+
` local fallback: ${hasCommand3("espeak-ng") ? "available" : "not found"}`,
|
|
507746
508439
|
"",
|
|
507747
508440
|
"Registered clone refs:",
|
|
507748
508441
|
...refs.length ? refs.map((ref) => ` ${ref}`) : [" none"]
|
|
@@ -507756,11 +508449,20 @@ except Exception as exc:
|
|
|
507756
508449
|
}
|
|
507757
508450
|
const requestedBackend = normalizeTtsBackend(args["backend"]);
|
|
507758
508451
|
const strictBackend = boolArg(args["strict_backend"] ?? args["strictBackend"], false);
|
|
508452
|
+
const cloneRequested = wantsVoiceClone(args);
|
|
508453
|
+
if (cloneRequested && requestedBackend !== "auto" && requestedBackend !== "luxtts") {
|
|
508454
|
+
return {
|
|
508455
|
+
success: false,
|
|
508456
|
+
output: "",
|
|
508457
|
+
error: "Voice cloning from a source clip requires backend=auto or backend=luxtts.",
|
|
508458
|
+
durationMs: performance.now() - start2
|
|
508459
|
+
};
|
|
508460
|
+
}
|
|
507759
508461
|
const playback = playbackArg(args, speakDefault);
|
|
507760
508462
|
const outputPath2 = ttsOutputPath(args, requestedBackend);
|
|
507761
508463
|
const device = typeof args["device"] === "string" ? args["device"] : "default";
|
|
507762
508464
|
const tried = [];
|
|
507763
|
-
const autoCandidates = ["luxtts", "supertonic", ...process.platform === "darwin" && process.arch === "arm64" ? ["mlx"] : [], "piper", "espeak"];
|
|
508465
|
+
const autoCandidates = cloneRequested ? ["luxtts"] : ["luxtts", "supertonic", ...process.platform === "darwin" && process.arch === "arm64" ? ["mlx"] : [], "piper", "espeak"];
|
|
507764
508466
|
const candidates = requestedBackend === "auto" ? autoCandidates : strictBackend ? [requestedBackend] : [requestedBackend, ...autoCandidates.filter((backend) => backend !== requestedBackend)];
|
|
507765
508467
|
let usedBackend = "";
|
|
507766
508468
|
let voiceSummary = "";
|
|
@@ -507823,21 +508525,19 @@ ${tried.map((line) => `- ${line}`).join("\n")}`,
|
|
|
507823
508525
|
};
|
|
507824
508526
|
}
|
|
507825
508527
|
async synthesizeLuxtts(text, outputPath2, args) {
|
|
507826
|
-
const cloneRef =
|
|
508528
|
+
const cloneRef = cloneRefForSynthesis(args);
|
|
507827
508529
|
if (!cloneRef)
|
|
507828
|
-
throw new Error(`No LuxTTS clone
|
|
508530
|
+
throw new Error(`No LuxTTS clone source found. Provide source_audio=<voice clip> or clone_ref=<registered clip>.`);
|
|
507829
508531
|
const speed = numberArg3(args["speed"], 1);
|
|
508532
|
+
ensureLuxttsInstalled();
|
|
507830
508533
|
const daemonReady = await ensureLuxttsDaemon();
|
|
507831
508534
|
if (daemonReady) {
|
|
507832
508535
|
await luxttsSynthesize(text, cloneRef, outputPath2, speed);
|
|
507833
508536
|
if (existsSync40(outputPath2))
|
|
507834
508537
|
return `${basename12(cloneRef)} (LuxTTS daemon)`;
|
|
507835
508538
|
}
|
|
507836
|
-
const venvPy =
|
|
507837
|
-
const repoDir =
|
|
507838
|
-
if (!existsSync40(venvPy) || !existsSync40(repoDir)) {
|
|
507839
|
-
throw new Error("LuxTTS is not installed in the managed voice environment yet.");
|
|
507840
|
-
}
|
|
508539
|
+
const venvPy = luxttsVenvPy();
|
|
508540
|
+
const repoDir = luxttsRepoDir();
|
|
507841
508541
|
const pyScript = [
|
|
507842
508542
|
"import json, sys, wave",
|
|
507843
508543
|
"import numpy as np, torch",
|
|
@@ -507851,7 +508551,7 @@ ${tried.map((line) => `- ${line}`).join("\n")}`,
|
|
|
507851
508551
|
"d=(np.clip(wav.cpu().numpy().squeeze(), -1, 1)*32767).astype(np.int16)",
|
|
507852
508552
|
"f=wave.open(args['output'], 'wb'); f.setnchannels(1); f.setsampwidth(2); f.setframerate(48000); f.writeframes(d.tobytes()); f.close()"
|
|
507853
508553
|
].join("; ");
|
|
507854
|
-
|
|
508554
|
+
execFileSync3(venvPy, ["-c", pyScript, JSON.stringify({ text, output: outputPath2, clone_ref: cloneRef, repo: repoDir, speed })], {
|
|
507855
508555
|
stdio: "pipe",
|
|
507856
508556
|
timeout: 12e4,
|
|
507857
508557
|
env: { ...process.env, LUXTTS_REPO_PATH: repoDir }
|
|
@@ -507864,7 +508564,7 @@ ${tried.map((line) => `- ${line}`).join("\n")}`,
|
|
|
507864
508564
|
const lang = typeof args["lang"] === "string" ? args["lang"] : "en";
|
|
507865
508565
|
const speed = numberArg3(args["speed"], 1.05);
|
|
507866
508566
|
const totalStep = Math.round(numberArg3(args["total_step"], 8));
|
|
507867
|
-
const stdout =
|
|
508567
|
+
const stdout = execFileSync3(venvPy, [supertonicInferScript()], {
|
|
507868
508568
|
input: JSON.stringify({ text, output_path: outputPath2, voice_name: voice, lang, speed, total_step: totalStep }),
|
|
507869
508569
|
encoding: "utf8",
|
|
507870
508570
|
stdio: ["pipe", "pipe", "pipe"],
|
|
@@ -507887,7 +508587,7 @@ ${tried.map((line) => `- ${line}`).join("\n")}`,
|
|
|
507887
508587
|
"args=json.loads(sys.argv[1])",
|
|
507888
508588
|
"tts_gen.main(['--model', args['model'], '--text', args['text'], '--voice', args['voice'], '--lang_code', args['lang'], '--audio_path', args['output']])"
|
|
507889
508589
|
].join("; ");
|
|
507890
|
-
|
|
508590
|
+
execFileSync3(py, ["-c", pyScript, JSON.stringify({ text, model, voice, lang, output: outputPath2 })], {
|
|
507891
508591
|
stdio: "pipe",
|
|
507892
508592
|
timeout: 18e4,
|
|
507893
508593
|
cwd: tmpdir11()
|
|
@@ -507908,15 +508608,15 @@ ${tried.map((line) => `- ${line}`).join("\n")}`,
|
|
|
507908
508608
|
} else {
|
|
507909
508609
|
throw new Error(`${requireModel ? "Raw ONNX" : "Piper"} TTS requires model=<path.onnx> or voice=<path.onnx>.`);
|
|
507910
508610
|
}
|
|
507911
|
-
|
|
508611
|
+
execFileSync3(piper, argv, { input: text, stdio: ["pipe", "pipe", "pipe"], timeout: 12e4 });
|
|
507912
508612
|
return summary;
|
|
507913
508613
|
}
|
|
507914
508614
|
synthesizeEspeak(text, outputPath2, args) {
|
|
507915
508615
|
if (!hasCommand3("espeak-ng"))
|
|
507916
|
-
throw new Error("
|
|
508616
|
+
throw new Error("Local fallback TTS command not found.");
|
|
507917
508617
|
const voice = typeof args["voice"] === "string" ? args["voice"] : "en";
|
|
507918
508618
|
const speed = Math.round(numberArg3(args["speed"], 160));
|
|
507919
|
-
|
|
508619
|
+
execFileSync3("espeak-ng", ["-v", voice, "-s", String(speed), "-w", outputPath2, text], {
|
|
507920
508620
|
stdio: "pipe",
|
|
507921
508621
|
timeout: 6e4
|
|
507922
508622
|
});
|
|
@@ -507995,20 +508695,27 @@ ${devices.join("\n")}`,
|
|
|
507995
508695
|
};
|
|
507996
508696
|
TtsGenerateTool = class {
|
|
507997
508697
|
name = "generate_tts";
|
|
507998
|
-
description = "Generate text-to-speech audio as a WAV file, optionally playing it after synthesis. Supports explicit backends: auto,
|
|
508698
|
+
description = "Generate text-to-speech audio as a WAV file, optionally playing it after synthesis. Supports explicit backends: auto, LuxTTS voice cloning, Supertonic, MLX, ONNX/Piper, and local fallback. Neural TTS backends self-provision into ~/.omnius/voice on first use where supported. For voice cloning, pass source_audio/sample/voice_sample with the reference clip and backend=auto or luxtts; clone_name can register it for reuse. Use clone_ref to select a registered LuxTTS voice and playback=false for silent file generation. Use this tool for speech/TTS requests; do not use shell commands or generate_audio as a TTS fallback.";
|
|
507999
508699
|
parameters = {
|
|
508000
508700
|
type: "object",
|
|
508001
508701
|
properties: {
|
|
508002
508702
|
text: { type: "string", description: "Text to synthesize" },
|
|
508003
508703
|
input: { type: "string", description: "Alias for text." },
|
|
508004
508704
|
prompt: { type: "string", description: "Alias for text." },
|
|
508005
|
-
backend: { type: "string", enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper"
|
|
508705
|
+
backend: { type: "string", enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper"] },
|
|
508006
508706
|
output: { type: "string", description: "Output WAV path. Defaults to ~/.omnius/voice/generated/tts-*.wav." },
|
|
508007
508707
|
path: { type: "string", description: "Alias for output." },
|
|
508008
508708
|
playback: { type: "boolean", description: "Whether to play after generating. Defaults false for generate_tts." },
|
|
508009
508709
|
strict_backend: { type: "boolean", description: "When true, fail instead of falling back if the requested backend is unavailable. Defaults false." },
|
|
508010
508710
|
voice: { type: "string", description: "Voice id/name, or raw Piper/ONNX path when backend=onnx/piper." },
|
|
508011
508711
|
clone_ref: { type: "string", description: "LuxTTS clone reference path, filename, or registered clone name." },
|
|
508712
|
+
sample: { type: "string", description: "Voice source clip path for cloned speech. Alias: source_audio." },
|
|
508713
|
+
source_audio: { type: "string", description: "Voice source clip path for cloned speech." },
|
|
508714
|
+
voice_sample: { type: "string", description: "Alias for source_audio." },
|
|
508715
|
+
reference_audio: { type: "string", description: "Alias for source_audio." },
|
|
508716
|
+
ref_audio: { type: "string", description: "Alias for source_audio." },
|
|
508717
|
+
clone_sample: { type: "string", description: "Alias for source_audio." },
|
|
508718
|
+
clone_name: { type: "string", description: "Optional name to register the source clip for later reuse." },
|
|
508012
508719
|
model: { type: "string", description: "Backend model id or raw ONNX/Piper model path." },
|
|
508013
508720
|
lang: { type: "string", description: "Language code for Supertonic/MLX where supported." },
|
|
508014
508721
|
speed: { type: "number", description: "Speech speed multiplier or backend-specific rate." },
|
|
@@ -575055,7 +575762,7 @@ __export(image_ascii_preview_exports, {
|
|
|
575055
575762
|
extractSavedImagePath: () => extractSavedImagePath,
|
|
575056
575763
|
formatImageAsciiContext: () => formatImageAsciiContext
|
|
575057
575764
|
});
|
|
575058
|
-
import { execFileSync as
|
|
575765
|
+
import { execFileSync as execFileSync4 } from "node:child_process";
|
|
575059
575766
|
import { createRequire as createRequire5 } from "node:module";
|
|
575060
575767
|
import { existsSync as existsSync94, readFileSync as readFileSync75, statSync as statSync32 } from "node:fs";
|
|
575061
575768
|
import { resolve as resolve37 } from "node:path";
|
|
@@ -575192,7 +575899,7 @@ function convertWithFfmpeg(imagePath, width, height, timeoutMs) {
|
|
|
575192
575899
|
`scale=${width}:${height}`,
|
|
575193
575900
|
"format=gray"
|
|
575194
575901
|
].join(",");
|
|
575195
|
-
const raw =
|
|
575902
|
+
const raw = execFileSync4(
|
|
575196
575903
|
"ffmpeg",
|
|
575197
575904
|
[
|
|
575198
575905
|
"-hide_banner",
|
|
@@ -575357,19 +576064,19 @@ function modelOnnxPath(id) {
|
|
|
575357
576064
|
function modelConfigPath(id) {
|
|
575358
576065
|
return join109(modelDir(id), "config.json");
|
|
575359
576066
|
}
|
|
575360
|
-
function
|
|
576067
|
+
function luxttsVenvDir2() {
|
|
575361
576068
|
return join109(voiceDir2(), "luxtts-venv");
|
|
575362
576069
|
}
|
|
575363
|
-
function
|
|
575364
|
-
return platform5() === "win32" ? join109(
|
|
576070
|
+
function luxttsVenvPy2() {
|
|
576071
|
+
return platform5() === "win32" ? join109(luxttsVenvDir2(), "Scripts", "python.exe") : join109(luxttsVenvDir2(), "bin", "python3");
|
|
575365
576072
|
}
|
|
575366
|
-
function
|
|
576073
|
+
function luxttsRepoDir2() {
|
|
575367
576074
|
return join109(voiceDir2(), "LuxTTS");
|
|
575368
576075
|
}
|
|
575369
576076
|
function luxttsCloneRefsDir() {
|
|
575370
576077
|
return join109(voiceDir2(), "clone-refs");
|
|
575371
576078
|
}
|
|
575372
|
-
function
|
|
576079
|
+
function luxttsInferScript2() {
|
|
575373
576080
|
return join109(voiceDir2(), "luxtts-infer.py");
|
|
575374
576081
|
}
|
|
575375
576082
|
function supertonicVenvDir() {
|
|
@@ -577936,12 +578643,12 @@ Error: ${err2 instanceof Error ? err2.message : String(err2)}`
|
|
|
577936
578643
|
"python3 not found. LuxTTS requires Python 3.10+. Try: apt install python3 / brew install python3"
|
|
577937
578644
|
);
|
|
577938
578645
|
}
|
|
577939
|
-
const venvDir =
|
|
577940
|
-
const venvPy =
|
|
578646
|
+
const venvDir = luxttsVenvDir2();
|
|
578647
|
+
const venvPy = luxttsVenvPy2();
|
|
577941
578648
|
if (existsSync95(venvPy)) {
|
|
577942
578649
|
try {
|
|
577943
578650
|
const quotedPy = `"${venvPy}"`;
|
|
577944
|
-
const repoPath =
|
|
578651
|
+
const repoPath = luxttsRepoDir2().replace(/\\/g, "/");
|
|
577945
578652
|
await this.asyncShell(
|
|
577946
578653
|
`${quotedPy} -c "import sys; sys.path.insert(0, '${repoPath}'); from zipvoice.luxvoice import LuxTTS; print('ok')"`,
|
|
577947
578654
|
3e4
|
|
@@ -578055,7 +578762,7 @@ Error: ${err2 instanceof Error ? err2.message : String(err2)}`
|
|
|
578055
578762
|
}
|
|
578056
578763
|
}
|
|
578057
578764
|
}
|
|
578058
|
-
const repoDir =
|
|
578765
|
+
const repoDir = luxttsRepoDir2();
|
|
578059
578766
|
if (!existsSync95(join109(repoDir, "zipvoice", "luxvoice.py"))) {
|
|
578060
578767
|
renderInfo(" Cloning LuxTTS repository...");
|
|
578061
578768
|
try {
|
|
@@ -578479,18 +579186,18 @@ def main():
|
|
|
578479
579186
|
if __name__ == '__main__':
|
|
578480
579187
|
main()
|
|
578481
579188
|
`;
|
|
578482
|
-
const scriptPath2 =
|
|
579189
|
+
const scriptPath2 = luxttsInferScript2();
|
|
578483
579190
|
mkdirSync52(voiceDir2(), { recursive: true });
|
|
578484
579191
|
writeFileSync49(scriptPath2, script);
|
|
578485
579192
|
}
|
|
578486
579193
|
/** Ensure the LuxTTS daemon is running, spawn if needed */
|
|
578487
579194
|
async ensureLuxttsDaemon() {
|
|
578488
579195
|
if (this._luxttsDaemon && !this._luxttsDaemon.killed) return true;
|
|
578489
|
-
const venvPy =
|
|
579196
|
+
const venvPy = luxttsVenvPy2();
|
|
578490
579197
|
if (!existsSync95(venvPy)) return false;
|
|
578491
579198
|
return new Promise((resolve48) => {
|
|
578492
|
-
const env2 = { ...process.env, LUXTTS_REPO_PATH:
|
|
578493
|
-
const daemon = nodeSpawn(venvPy, [
|
|
579199
|
+
const env2 = { ...process.env, LUXTTS_REPO_PATH: luxttsRepoDir2() };
|
|
579200
|
+
const daemon = nodeSpawn(venvPy, [luxttsInferScript2()], {
|
|
578494
579201
|
stdio: ["pipe", "pipe", "pipe"],
|
|
578495
579202
|
cwd: tmpdir20(),
|
|
578496
579203
|
env: env2
|
|
@@ -596377,6 +597084,17 @@ var init_tool_policy = __esm({
|
|
|
596377
597084
|
"todo_write",
|
|
596378
597085
|
"web_search",
|
|
596379
597086
|
"web_fetch",
|
|
597087
|
+
"image_read",
|
|
597088
|
+
"ocr",
|
|
597089
|
+
"ocr_image_advanced",
|
|
597090
|
+
"ocr_pdf",
|
|
597091
|
+
"pdf_to_text",
|
|
597092
|
+
"vision",
|
|
597093
|
+
"transcribe_file",
|
|
597094
|
+
"video_understand",
|
|
597095
|
+
"audio_analyze",
|
|
597096
|
+
"explore_tools",
|
|
597097
|
+
"telegram_media_recent",
|
|
596380
597098
|
"generate_image",
|
|
596381
597099
|
"generate_audio",
|
|
596382
597100
|
"generate_tts",
|
|
@@ -596393,6 +597111,17 @@ var init_tool_policy = __esm({
|
|
|
596393
597111
|
"web_search",
|
|
596394
597112
|
"web_fetch",
|
|
596395
597113
|
"web_crawl",
|
|
597114
|
+
"image_read",
|
|
597115
|
+
"ocr",
|
|
597116
|
+
"ocr_image_advanced",
|
|
597117
|
+
"ocr_pdf",
|
|
597118
|
+
"pdf_to_text",
|
|
597119
|
+
"vision",
|
|
597120
|
+
"transcribe_file",
|
|
597121
|
+
"video_understand",
|
|
597122
|
+
"audio_analyze",
|
|
597123
|
+
"explore_tools",
|
|
597124
|
+
"telegram_media_recent",
|
|
596396
597125
|
"generate_image",
|
|
596397
597126
|
"generate_audio",
|
|
596398
597127
|
"generate_tts",
|
|
@@ -596500,6 +597229,7 @@ function scopedTool(base3, root, mode) {
|
|
|
596500
597229
|
async execute(args) {
|
|
596501
597230
|
const next = { ...args };
|
|
596502
597231
|
if (base3.name === "generate_image" || base3.name === "generate_audio" || base3.name === "generate_tts") {
|
|
597232
|
+
const cleanup = [];
|
|
596503
597233
|
const localModel = typeof next["model_path"] === "string" ? String(next["model_path"]) : typeof next["model"] === "string" && looksLikeLocalPath(String(next["model"])) ? String(next["model"]) : "";
|
|
596504
597234
|
if (localModel) {
|
|
596505
597235
|
const guarded = guardPath(rootAbs, localModel);
|
|
@@ -596508,6 +597238,22 @@ function scopedTool(base3, root, mode) {
|
|
|
596508
597238
|
else next["model"] = guarded.path.abs;
|
|
596509
597239
|
}
|
|
596510
597240
|
if (base3.name === "generate_tts") {
|
|
597241
|
+
for (const key of TTS_CLONE_SOURCE_KEYS) {
|
|
597242
|
+
const value2 = next[key];
|
|
597243
|
+
if (typeof value2 !== "string" || !value2.trim()) continue;
|
|
597244
|
+
const materialized = materializeTelegramCreativeArtifactForSend(rootAbs, value2.trim());
|
|
597245
|
+
if (!materialized.ok) return denied(materialized.error);
|
|
597246
|
+
next[key] = materialized.path;
|
|
597247
|
+
if (materialized.cleanup) cleanup.push(materialized.cleanup);
|
|
597248
|
+
}
|
|
597249
|
+
for (const key of ["clone_ref", "voice"]) {
|
|
597250
|
+
const value2 = next[key];
|
|
597251
|
+
if (typeof value2 !== "string" || !value2.trim() || !looksLikeAudioPath(value2.trim())) continue;
|
|
597252
|
+
const materialized = materializeTelegramCreativeArtifactForSend(rootAbs, value2.trim());
|
|
597253
|
+
if (!materialized.ok) return denied(materialized.error);
|
|
597254
|
+
next[key] = materialized.path;
|
|
597255
|
+
if (materialized.cleanup) cleanup.push(materialized.cleanup);
|
|
597256
|
+
}
|
|
596511
597257
|
const rawOutput = typeof next["output"] === "string" && String(next["output"]).trim() ? String(next["output"]) : typeof next["output_path"] === "string" && String(next["output_path"]).trim() ? String(next["output_path"]) : `tts-${Date.now()}.wav`;
|
|
596512
597258
|
const guardedOutput = guardPath(rootAbs, rawOutput);
|
|
596513
597259
|
if (!guardedOutput.ok) return denied(guardedOutput.error);
|
|
@@ -596517,16 +597263,20 @@ function scopedTool(base3, root, mode) {
|
|
|
596517
597263
|
next["output"] = guardedOutput.path.abs;
|
|
596518
597264
|
next["playback"] = false;
|
|
596519
597265
|
}
|
|
596520
|
-
|
|
596521
|
-
|
|
596522
|
-
if (
|
|
596523
|
-
|
|
596524
|
-
|
|
596525
|
-
|
|
596526
|
-
|
|
597266
|
+
try {
|
|
597267
|
+
const result2 = await base3.execute(next);
|
|
597268
|
+
if (result2.success) {
|
|
597269
|
+
if (base3.name === "generate_tts" && typeof next["output"] === "string") {
|
|
597270
|
+
rememberCreated(rootAbs, String(next["output"]));
|
|
597271
|
+
}
|
|
597272
|
+
for (const path11 of collectGeneratedArtifactPathsFromText(result2.output, rootAbs)) {
|
|
597273
|
+
rememberCreated(rootAbs, path11);
|
|
597274
|
+
}
|
|
596527
597275
|
}
|
|
597276
|
+
return result2;
|
|
597277
|
+
} finally {
|
|
597278
|
+
for (const fn of cleanup) fn();
|
|
596528
597279
|
}
|
|
596529
|
-
return result2;
|
|
596530
597280
|
}
|
|
596531
597281
|
const pathKey = PATH_KEYS.find((key) => typeof next[key] === "string" && String(next[key]).trim());
|
|
596532
597282
|
if (pathKey) {
|
|
@@ -596591,6 +597341,9 @@ function isInside(root, path11) {
|
|
|
596591
597341
|
function looksLikeLocalPath(value2) {
|
|
596592
597342
|
return value2.startsWith("/") || value2.startsWith("./") || value2.startsWith("../");
|
|
596593
597343
|
}
|
|
597344
|
+
function looksLikeAudioPath(value2) {
|
|
597345
|
+
return looksLikeLocalPath(value2) || value2.startsWith("~/") || /\.(wav|mp3|flac|ogg|m4a)$/i.test(value2);
|
|
597346
|
+
}
|
|
596594
597347
|
function manifestPath(root) {
|
|
596595
597348
|
return join119(root, MANIFEST_FILE);
|
|
596596
597349
|
}
|
|
@@ -596753,7 +597506,7 @@ function denied(error) {
|
|
|
596753
597506
|
mutatedFiles: []
|
|
596754
597507
|
};
|
|
596755
597508
|
}
|
|
596756
|
-
var MANIFEST_FILE, OBJECTS_DIR, SEND_DIR, PATH_KEYS, MEDIA_PATH_RE, PUBLIC_EXECUTABLE_ARTIFACT_EXTENSIONS, CreativeAudioFileTool;
|
|
597509
|
+
var MANIFEST_FILE, OBJECTS_DIR, SEND_DIR, PATH_KEYS, TTS_CLONE_SOURCE_KEYS, MEDIA_PATH_RE, PUBLIC_EXECUTABLE_ARTIFACT_EXTENSIONS, CreativeAudioFileTool;
|
|
596757
597510
|
var init_telegram_creative_tools = __esm({
|
|
596758
597511
|
"packages/cli/src/tui/telegram-creative-tools.ts"() {
|
|
596759
597512
|
"use strict";
|
|
@@ -596762,6 +597515,7 @@ var init_telegram_creative_tools = __esm({
|
|
|
596762
597515
|
OBJECTS_DIR = ".objects";
|
|
596763
597516
|
SEND_DIR = ".send";
|
|
596764
597517
|
PATH_KEYS = ["path", "file", "file_path", "filename", "filepath", "filePath"];
|
|
597518
|
+
TTS_CLONE_SOURCE_KEYS = ["sample", "source_audio", "voice_sample", "reference_audio", "ref_audio", "clone_sample"];
|
|
596765
597519
|
MEDIA_PATH_RE = /(?:^|[\s([])(\/[^\s<>"')\]]+\.[A-Za-z0-9]{1,12})(?:$|[\s),.\]])/g;
|
|
596766
597520
|
PUBLIC_EXECUTABLE_ARTIFACT_EXTENSIONS = /* @__PURE__ */ new Set([
|
|
596767
597521
|
".sh",
|
|
@@ -596836,9 +597590,16 @@ var init_telegram_creative_tools = __esm({
|
|
|
596836
597590
|
input: { type: "string", description: "Alias for text" },
|
|
596837
597591
|
prompt: { type: "string", description: "Alias for text" },
|
|
596838
597592
|
path: { type: "string", description: "Output .wav path inside the creative workspace" },
|
|
596839
|
-
backend: { type: "string", enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper"
|
|
596840
|
-
voice: { type: "string", description: "Voice id/name for the selected TTS backend" },
|
|
597593
|
+
backend: { type: "string", enum: ["auto", "luxtts", "supertonic", "mlx", "onnx", "piper"], description: "TTS backend. Defaults to auto." },
|
|
597594
|
+
voice: { type: "string", description: "Voice id/name for the selected TTS backend, or a scoped source audio path for cloning" },
|
|
596841
597595
|
clone_ref: { type: "string", description: "Optional LuxTTS clone reference" },
|
|
597596
|
+
sample: { type: "string", description: "Voice source clip path inside the creative workspace" },
|
|
597597
|
+
source_audio: { type: "string", description: "Alias for sample" },
|
|
597598
|
+
voice_sample: { type: "string", description: "Alias for sample" },
|
|
597599
|
+
reference_audio: { type: "string", description: "Alias for sample" },
|
|
597600
|
+
ref_audio: { type: "string", description: "Alias for sample" },
|
|
597601
|
+
clone_sample: { type: "string", description: "Alias for sample" },
|
|
597602
|
+
clone_name: { type: "string", description: "Optional name to register the source clip for later reuse" },
|
|
596842
597603
|
model: { type: "string", description: "Optional backend model id or raw Piper/ONNX path" },
|
|
596843
597604
|
speed: { type: "number", description: "Speech speed multiplier or backend-specific rate" }
|
|
596844
597605
|
},
|
|
@@ -596857,26 +597618,57 @@ var init_telegram_creative_tools = __esm({
|
|
|
596857
597618
|
if (!guarded.path.abs.toLowerCase().endsWith(".wav")) {
|
|
596858
597619
|
return denied("create_audio_file currently writes WAV files; use a .wav output path.");
|
|
596859
597620
|
}
|
|
596860
|
-
|
|
596861
|
-
const
|
|
596862
|
-
const
|
|
596863
|
-
|
|
596864
|
-
|
|
596865
|
-
|
|
596866
|
-
|
|
596867
|
-
|
|
596868
|
-
|
|
596869
|
-
|
|
596870
|
-
|
|
596871
|
-
|
|
596872
|
-
|
|
596873
|
-
|
|
596874
|
-
|
|
596875
|
-
|
|
596876
|
-
|
|
597621
|
+
const cloneArgs = {};
|
|
597622
|
+
const cleanup = [];
|
|
597623
|
+
for (const key of TTS_CLONE_SOURCE_KEYS) {
|
|
597624
|
+
const value2 = args[key];
|
|
597625
|
+
if (typeof value2 !== "string" || !value2.trim()) continue;
|
|
597626
|
+
const materialized = materializeTelegramCreativeArtifactForSend(this.root, value2.trim());
|
|
597627
|
+
if (!materialized.ok) return denied(materialized.error);
|
|
597628
|
+
cloneArgs[key] = materialized.path;
|
|
597629
|
+
if (materialized.cleanup) cleanup.push(materialized.cleanup);
|
|
597630
|
+
}
|
|
597631
|
+
for (const key of ["clone_ref", "voice"]) {
|
|
597632
|
+
const value2 = args[key];
|
|
597633
|
+
if (typeof value2 !== "string" || !value2.trim() || !looksLikeAudioPath(value2.trim())) continue;
|
|
597634
|
+
const materialized = materializeTelegramCreativeArtifactForSend(this.root, value2.trim());
|
|
597635
|
+
if (!materialized.ok) return denied(materialized.error);
|
|
597636
|
+
cloneArgs[key] = materialized.path;
|
|
597637
|
+
if (materialized.cleanup) cleanup.push(materialized.cleanup);
|
|
597638
|
+
}
|
|
597639
|
+
let result;
|
|
597640
|
+
try {
|
|
597641
|
+
await mkdir17(dirname33(guarded.path.abs), { recursive: true });
|
|
597642
|
+
const tts = new TtsGenerateTool();
|
|
597643
|
+
result = await tts.execute({
|
|
597644
|
+
text,
|
|
597645
|
+
output: guarded.path.abs,
|
|
597646
|
+
playback: false,
|
|
597647
|
+
backend: args["backend"],
|
|
597648
|
+
voice: cloneArgs["voice"] ?? args["voice"],
|
|
597649
|
+
clone_ref: cloneArgs["clone_ref"] ?? args["clone_ref"],
|
|
597650
|
+
...cloneArgs,
|
|
597651
|
+
sample: cloneArgs["sample"],
|
|
597652
|
+
source_audio: cloneArgs["source_audio"],
|
|
597653
|
+
voice_sample: cloneArgs["voice_sample"],
|
|
597654
|
+
reference_audio: cloneArgs["reference_audio"],
|
|
597655
|
+
ref_audio: cloneArgs["ref_audio"],
|
|
597656
|
+
clone_sample: cloneArgs["clone_sample"],
|
|
597657
|
+
clone_name: args["clone_name"],
|
|
597658
|
+
model: args["model"],
|
|
597659
|
+
speed: args["speed"]
|
|
597660
|
+
});
|
|
597661
|
+
if (!result.success || !existsSync104(guarded.path.abs)) {
|
|
597662
|
+
return {
|
|
597663
|
+
success: false,
|
|
597664
|
+
output: "",
|
|
597665
|
+
error: `Audio synthesis failed through generate_tts.
|
|
596877
597666
|
${(result.error || result.output || "").slice(0, 1200)}`,
|
|
596878
|
-
|
|
596879
|
-
|
|
597667
|
+
durationMs: performance.now() - start2
|
|
597668
|
+
};
|
|
597669
|
+
}
|
|
597670
|
+
} finally {
|
|
597671
|
+
for (const fn of cleanup) fn();
|
|
596880
597672
|
}
|
|
596881
597673
|
rememberCreated(this.root, guarded.path.abs);
|
|
596882
597674
|
const sizeKB = Math.round(statSync35(guarded.path.abs).size / 1024);
|
|
@@ -596904,12 +597696,12 @@ __export(vision_ingress_exports, {
|
|
|
596904
597696
|
queryVisionModel: () => queryVisionModel,
|
|
596905
597697
|
runVisionIngress: () => runVisionIngress
|
|
596906
597698
|
});
|
|
596907
|
-
import { execFileSync as
|
|
597699
|
+
import { execFileSync as execFileSync5 } from "node:child_process";
|
|
596908
597700
|
import { existsSync as existsSync105, readFileSync as readFileSync86, unlinkSync as unlinkSync20 } from "node:fs";
|
|
596909
597701
|
import { join as join120 } from "node:path";
|
|
596910
597702
|
function isTesseractAvailable() {
|
|
596911
597703
|
try {
|
|
596912
|
-
|
|
597704
|
+
execFileSync5("tesseract", ["--version"], { stdio: "ignore", timeout: 3e3 });
|
|
596913
597705
|
return true;
|
|
596914
597706
|
} catch {
|
|
596915
597707
|
return false;
|
|
@@ -596950,7 +597742,7 @@ function advancedOcr(imagePath) {
|
|
|
596950
597742
|
for (const psm of psmModes) {
|
|
596951
597743
|
const outFile = `${tmpBase}_psm${psm}`;
|
|
596952
597744
|
try {
|
|
596953
|
-
|
|
597745
|
+
execFileSync5("tesseract", [
|
|
596954
597746
|
imagePath,
|
|
596955
597747
|
outFile,
|
|
596956
597748
|
"--psm",
|
|
@@ -597049,7 +597841,7 @@ var init_vision_ingress = __esm({
|
|
|
597049
597841
|
|
|
597050
597842
|
// packages/cli/src/tui/telegram-bridge.ts
|
|
597051
597843
|
import { mkdirSync as mkdirSync60, existsSync as existsSync106, unlinkSync as unlinkSync21, readdirSync as readdirSync36, statSync as statSync36, readFileSync as readFileSync87, writeFileSync as writeFileSync57 } from "node:fs";
|
|
597052
|
-
import { join as join121, resolve as resolve39, basename as basename23, relative as relative13, isAbsolute as isAbsolute7 } from "node:path";
|
|
597844
|
+
import { join as join121, resolve as resolve39, basename as basename23, relative as relative13, isAbsolute as isAbsolute7, extname as extname15 } from "node:path";
|
|
597053
597845
|
import { writeFile as writeFileAsync } from "node:fs/promises";
|
|
597054
597846
|
import { createHash as createHash19, randomInt } from "node:crypto";
|
|
597055
597847
|
function parseTelegramInteractionDecision(text, forcedRoute, options2 = {}) {
|
|
@@ -597247,6 +598039,19 @@ function summarizeTelegramMessageAttachments(msg) {
|
|
|
597247
598039
|
parts.push(`caption: ${truncateTelegramContextLine(msg.media.caption, 180)}`);
|
|
597248
598040
|
}
|
|
597249
598041
|
}
|
|
598042
|
+
if (msg.replyToMedia) {
|
|
598043
|
+
const details = [
|
|
598044
|
+
msg.replyToMedia.type,
|
|
598045
|
+
msg.replyToMedia.mimeType,
|
|
598046
|
+
msg.replyToMedia.fileName,
|
|
598047
|
+
msg.replyToMedia.duration ? `${msg.replyToMedia.duration}s` : "",
|
|
598048
|
+
msg.replyToMedia.fileSize ? `${msg.replyToMedia.fileSize} bytes` : ""
|
|
598049
|
+
].filter(Boolean).join(", ");
|
|
598050
|
+
parts.push(`replied-to media: ${details}`);
|
|
598051
|
+
if (msg.replyToMedia.caption) {
|
|
598052
|
+
parts.push(`replied-to caption: ${truncateTelegramContextLine(msg.replyToMedia.caption, 180)}`);
|
|
598053
|
+
}
|
|
598054
|
+
}
|
|
597250
598055
|
if (msg.poll) {
|
|
597251
598056
|
parts.push(`poll: ${truncateTelegramContextLine(msg.poll.question, 180)}`);
|
|
597252
598057
|
}
|
|
@@ -597620,6 +598425,25 @@ function telegramImageMime(media) {
|
|
|
597620
598425
|
if (ext === ".tif" || ext === ".tiff") return "image/tiff";
|
|
597621
598426
|
return "image/jpeg";
|
|
597622
598427
|
}
|
|
598428
|
+
function telegramCachedMediaIsImage(entry) {
|
|
598429
|
+
if (entry.mediaType === "photo") return true;
|
|
598430
|
+
if (entry.mimeType?.toLowerCase().startsWith("image/")) return true;
|
|
598431
|
+
return TELEGRAM_IMAGE_EXTENSIONS.has(extname15(entry.localPath).toLowerCase());
|
|
598432
|
+
}
|
|
598433
|
+
function telegramCachedMediaIsPdf(entry) {
|
|
598434
|
+
if (entry.mimeType?.toLowerCase() === "application/pdf") return true;
|
|
598435
|
+
return extname15(entry.localPath).toLowerCase() === ".pdf";
|
|
598436
|
+
}
|
|
598437
|
+
function telegramCachedMediaIsAudio(entry) {
|
|
598438
|
+
if (entry.mediaType === "audio" || entry.mediaType === "voice") return true;
|
|
598439
|
+
if (entry.mimeType?.toLowerCase().startsWith("audio/")) return true;
|
|
598440
|
+
return [".wav", ".mp3", ".flac", ".aac", ".m4a", ".ogg", ".opus"].includes(extname15(entry.localPath).toLowerCase());
|
|
598441
|
+
}
|
|
598442
|
+
function telegramCachedMediaIsVideo(entry) {
|
|
598443
|
+
if (entry.mediaType === "video" || entry.mediaType === "video_note" || entry.mediaType === "live_photo") return true;
|
|
598444
|
+
if (entry.mimeType?.toLowerCase().startsWith("video/")) return true;
|
|
598445
|
+
return [".mp4", ".mkv", ".avi", ".mov", ".webm"].includes(extname15(entry.localPath).toLowerCase());
|
|
598446
|
+
}
|
|
597623
598447
|
function isPathInside(root, path11) {
|
|
597624
598448
|
const rel = relative13(resolve39(root), resolve39(path11));
|
|
597625
598449
|
return rel === "" || Boolean(rel) && !rel.startsWith("..") && !isAbsolute7(rel);
|
|
@@ -597653,6 +598477,10 @@ function normalizeTelegramUpdate(update2) {
|
|
|
597653
598477
|
const username = message2.from?.username ?? message2.sender_chat?.username ?? "";
|
|
597654
598478
|
const chatType = message2.chat?.type ?? "private";
|
|
597655
598479
|
const media = normalizeTelegramMedia(message2);
|
|
598480
|
+
const replyTo = message2.reply_to_message && typeof message2.reply_to_message === "object" ? message2.reply_to_message : void 0;
|
|
598481
|
+
const replyToMedia = replyTo ? normalizeTelegramMedia(replyTo) : void 0;
|
|
598482
|
+
const replyToPoll = replyTo ? normalizeTelegramPoll(replyTo.poll) : void 0;
|
|
598483
|
+
const replyToText = replyTo ? replyTo.text || replyTo.caption || (replyToPoll ? formatTelegramPollSummary(replyToPoll) : "") : "";
|
|
597656
598484
|
const poll = normalizeTelegramPoll(message2.poll);
|
|
597657
598485
|
const livePhoto = normalizeTelegramLivePhoto(message2.live_photo);
|
|
597658
598486
|
const text = message2.text || message2.caption || (poll ? formatTelegramPollSummary(poll) : "");
|
|
@@ -597667,6 +598495,8 @@ function normalizeTelegramUpdate(update2) {
|
|
|
597667
598495
|
chatType,
|
|
597668
598496
|
chatTitle: message2.chat?.title,
|
|
597669
598497
|
media,
|
|
598498
|
+
replyToMedia,
|
|
598499
|
+
replyToText: replyToText || void 0,
|
|
597670
598500
|
poll,
|
|
597671
598501
|
livePhoto,
|
|
597672
598502
|
guestQueryId: typeof message2.guest_query_id === "string" ? message2.guest_query_id : void 0,
|
|
@@ -597675,9 +598505,9 @@ function normalizeTelegramUpdate(update2) {
|
|
|
597675
598505
|
isGuestMessage: sourceUpdateType === "guest_message",
|
|
597676
598506
|
isDirectMessages: Boolean(message2.chat?.is_direct_messages),
|
|
597677
598507
|
parentChatId: message2.chat?.parent_chat?.id ?? message2.direct_messages_topic?.parent_topic?.id,
|
|
597678
|
-
replyToMessageId:
|
|
597679
|
-
replyToUsername:
|
|
597680
|
-
replyToBot: Boolean(
|
|
598508
|
+
replyToMessageId: replyTo?.message_id,
|
|
598509
|
+
replyToUsername: replyTo?.from?.username ?? replyTo?.sender_chat?.username,
|
|
598510
|
+
replyToBot: Boolean(replyTo?.from?.is_bot),
|
|
597681
598511
|
mentionedUsernames: extractTelegramMentionedUsernames(message2, text),
|
|
597682
598512
|
sourceUpdateType
|
|
597683
598513
|
};
|
|
@@ -597824,7 +598654,7 @@ function renderTelegramSubAgentError(username, error) {
|
|
|
597824
598654
|
process.stdout.write(` ${c3.dim("⎿")} ${c3.red("✘")} @${username}: ${c3.dim(preview)}
|
|
597825
598655
|
`);
|
|
597826
598656
|
}
|
|
597827
|
-
var TELEGRAM_SAFETY_PROMPT, ADMIN_DM_PROMPT, ADMIN_GROUP_PROMPT, TELEGRAM_PUBLIC_SOUL_PROFILE, TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT, TELEGRAM_PUBLIC_MEMORY_SCOPE_CONTRACT, GROUP_REPLY_DISCRETION_PROMPT, TELEGRAM_CHAT_MODE_PROMPT, ADMIN_CHAT_PROFILE_PROMPT, TELEGRAM_ACTION_RESPONSE_CONTRACT, TELEGRAM_CHAT_HISTORY_LIMIT, TELEGRAM_CONTEXT_RECENT_DEFAULT, TELEGRAM_CONTEXT_LINE_LIMIT, TELEGRAM_CONTEXT_SAMPLE_LIMIT, TELEGRAM_MEMORY_CARD_LIMIT, TELEGRAM_MEMORY_NOTE_LIMIT, TELEGRAM_MEMORY_STOPWORDS, TELEGRAM_PUBLIC_HELP_COMMANDS, MEDIA_CACHE_TTL_MS, TelegramBridge;
|
|
598657
|
+
var TELEGRAM_SAFETY_PROMPT, ADMIN_DM_PROMPT, ADMIN_GROUP_PROMPT, TELEGRAM_PUBLIC_SOUL_PROFILE, TELEGRAM_PUBLIC_ORCHESTRATOR_CONTRACT, TELEGRAM_PUBLIC_MEMORY_SCOPE_CONTRACT, GROUP_REPLY_DISCRETION_PROMPT, TELEGRAM_CHAT_MODE_PROMPT, ADMIN_CHAT_PROFILE_PROMPT, TELEGRAM_ACTION_RESPONSE_CONTRACT, TELEGRAM_CHAT_HISTORY_LIMIT, TELEGRAM_CONTEXT_RECENT_DEFAULT, TELEGRAM_CONTEXT_LINE_LIMIT, TELEGRAM_CONTEXT_SAMPLE_LIMIT, TELEGRAM_MEMORY_CARD_LIMIT, TELEGRAM_MEMORY_NOTE_LIMIT, TELEGRAM_MEMORY_STOPWORDS, TELEGRAM_PUBLIC_HELP_COMMANDS, TELEGRAM_IMAGE_EXTENSIONS, MEDIA_CACHE_TTL_MS, TelegramBridge;
|
|
597828
598658
|
var init_telegram_bridge = __esm({
|
|
597829
598659
|
"packages/cli/src/tui/telegram-bridge.ts"() {
|
|
597830
598660
|
"use strict";
|
|
@@ -598020,6 +598850,7 @@ Telegram response contract:
|
|
|
598020
598850
|
"your"
|
|
598021
598851
|
]);
|
|
598022
598852
|
TELEGRAM_PUBLIC_HELP_COMMANDS = /* @__PURE__ */ new Set(["help", "start", "auth", "call"]);
|
|
598853
|
+
TELEGRAM_IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif", ".svg"]);
|
|
598023
598854
|
MEDIA_CACHE_TTL_MS = 30 * 60 * 1e3;
|
|
598024
598855
|
TelegramBridge = class {
|
|
598025
598856
|
constructor(botToken, onMessage, agentConfig, repoRoot, toolPolicyConfig) {
|
|
@@ -598431,6 +599262,80 @@ Telegram response contract:
|
|
|
598431
599262
|
}
|
|
598432
599263
|
}
|
|
598433
599264
|
}
|
|
599265
|
+
updateLastTelegramUserMessageText(msg, text) {
|
|
599266
|
+
const sessionKey = this.sessionKeyForMessage(msg);
|
|
599267
|
+
const history = this.chatHistory.get(sessionKey);
|
|
599268
|
+
if (!history || !text.trim()) return;
|
|
599269
|
+
for (let i2 = history.length - 1; i2 >= 0; i2--) {
|
|
599270
|
+
const entry = history[i2];
|
|
599271
|
+
if (entry.role !== "user") continue;
|
|
599272
|
+
if (entry.messageId === msg.messageId || !entry.messageId && entry.text === msg.text) {
|
|
599273
|
+
entry.text = text.trim();
|
|
599274
|
+
entry.mediaSummary = summarizeTelegramMessageAttachments(msg) || entry.mediaSummary;
|
|
599275
|
+
this.updateTelegramMemoryCards(sessionKey, entry);
|
|
599276
|
+
this.saveTelegramConversationState(sessionKey);
|
|
599277
|
+
return;
|
|
599278
|
+
}
|
|
599279
|
+
}
|
|
599280
|
+
}
|
|
599281
|
+
recentTelegramMediaEntries(chatId, limit = 12) {
|
|
599282
|
+
const now = Date.now();
|
|
599283
|
+
return [...this.mediaCache.values()].filter((entry) => {
|
|
599284
|
+
if (chatId !== void 0 && String(entry.chatId) !== String(chatId)) return false;
|
|
599285
|
+
return now - entry.cachedAt <= MEDIA_CACHE_TTL_MS;
|
|
599286
|
+
}).sort((a2, b) => b.cachedAt - a2.cachedAt).slice(0, limit);
|
|
599287
|
+
}
|
|
599288
|
+
telegramMediaEntryMatchesKind(entry, kind) {
|
|
599289
|
+
if (kind === "image") return telegramCachedMediaIsImage(entry);
|
|
599290
|
+
if (kind === "pdf") return telegramCachedMediaIsPdf(entry);
|
|
599291
|
+
if (kind === "audio") return telegramCachedMediaIsAudio(entry);
|
|
599292
|
+
if (kind === "video") return telegramCachedMediaIsVideo(entry);
|
|
599293
|
+
if (kind === "transcribable") {
|
|
599294
|
+
return telegramCachedMediaIsAudio(entry) || telegramCachedMediaIsVideo(entry);
|
|
599295
|
+
}
|
|
599296
|
+
return true;
|
|
599297
|
+
}
|
|
599298
|
+
resolveTelegramScopedMediaPath(rawValue, chatId, currentMsg, kind) {
|
|
599299
|
+
const raw = String(rawValue ?? "").trim();
|
|
599300
|
+
const repoRoot = this.repoRoot || ".";
|
|
599301
|
+
const creativeRoot = telegramCreativeWorkspaceRoot(repoRoot, chatId);
|
|
599302
|
+
const mediaEntries = this.recentTelegramMediaEntries(chatId, 60).filter((entry) => this.telegramMediaEntryMatchesKind(entry, kind));
|
|
599303
|
+
const aliases = /* @__PURE__ */ new Set(["", "latest", "last", "current", "this", "that", "it", "reply", "replied", "replied-to", "replied_to"]);
|
|
599304
|
+
if (aliases.has(raw.toLowerCase())) {
|
|
599305
|
+
const replied = currentMsg?.replyToMessageId ? mediaEntries.find((entry2) => entry2.messageId === currentMsg.replyToMessageId) : void 0;
|
|
599306
|
+
const entry = replied ?? mediaEntries[0];
|
|
599307
|
+
if (!entry) {
|
|
599308
|
+
return { ok: false, error: `No recent ${kind} media is available in this Telegram chat scope.` };
|
|
599309
|
+
}
|
|
599310
|
+
return { ok: true, path: entry.localPath };
|
|
599311
|
+
}
|
|
599312
|
+
const matchingEntry = mediaEntries.find((entry) => {
|
|
599313
|
+
if (resolve39(entry.localPath) === resolve39(raw)) return true;
|
|
599314
|
+
if (basename23(entry.localPath) === raw) return true;
|
|
599315
|
+
if (entry.fileUniqueId === raw || entry.fileId === raw) return true;
|
|
599316
|
+
if (entry.messageId && String(entry.messageId) === raw) return true;
|
|
599317
|
+
return false;
|
|
599318
|
+
});
|
|
599319
|
+
if (matchingEntry) return { ok: true, path: matchingEntry.localPath };
|
|
599320
|
+
const creativeCandidate = isAbsolute7(raw) ? resolve39(raw) : resolve39(creativeRoot, raw);
|
|
599321
|
+
if (isPathInside(creativeRoot, creativeCandidate) && existsSync106(creativeCandidate)) {
|
|
599322
|
+
return { ok: true, path: creativeCandidate };
|
|
599323
|
+
}
|
|
599324
|
+
return {
|
|
599325
|
+
ok: false,
|
|
599326
|
+
error: `Path is outside this Telegram chat's media/workspace scope or does not exist: ${raw || "(empty)"}`
|
|
599327
|
+
};
|
|
599328
|
+
}
|
|
599329
|
+
resolveTelegramScopedOutputPath(rawValue, chatId, fallbackName) {
|
|
599330
|
+
const repoRoot = this.repoRoot || ".";
|
|
599331
|
+
const creativeRoot = telegramCreativeWorkspaceRoot(repoRoot, chatId);
|
|
599332
|
+
const raw = String(rawValue || fallbackName).trim() || fallbackName;
|
|
599333
|
+
const outputPath2 = isAbsolute7(raw) ? resolve39(raw) : resolve39(creativeRoot, raw);
|
|
599334
|
+
if (!isPathInside(creativeRoot, outputPath2)) {
|
|
599335
|
+
return { ok: false, error: `Output path must stay inside this Telegram chat's creative workspace: ${raw}` };
|
|
599336
|
+
}
|
|
599337
|
+
return { ok: true, path: outputPath2 };
|
|
599338
|
+
}
|
|
598434
599339
|
updateTelegramParticipantProfile(sessionKey, msg, text) {
|
|
598435
599340
|
const participantKey = String(msg.fromUserId || msg.username || msg.firstName || "unknown");
|
|
598436
599341
|
const participants = this.chatParticipants.get(sessionKey) ?? /* @__PURE__ */ new Map();
|
|
@@ -598605,6 +599510,22 @@ ${notes2}`;
|
|
|
598605
599510
|
sections.push(`### Zettelkasten Memory Recall
|
|
598606
599511
|
${cardLines.join("\n")}`);
|
|
598607
599512
|
}
|
|
599513
|
+
const recentMedia = this.recentTelegramMediaEntries(msg.chatId, 10);
|
|
599514
|
+
if (recentMedia.length > 0) {
|
|
599515
|
+
const mediaLines = recentMedia.map((entry) => {
|
|
599516
|
+
const kind = telegramCachedMediaIsImage(entry) ? "image" : entry.mediaType;
|
|
599517
|
+
const replyMark = msg.replyToMessageId && entry.messageId === msg.replyToMessageId ? " replied-to" : "";
|
|
599518
|
+
const caption = entry.caption ? ` caption:${truncateTelegramContextLine(entry.caption, 120)}` : "";
|
|
599519
|
+
const extracted = entry.extractedContent ? `
|
|
599520
|
+
${truncateTelegramContextLine(entry.extractedContent.replace(/\s+/g, " "), 220)}` : "";
|
|
599521
|
+
return `- message_id ${entry.messageId}${replyMark}: ${kind}; path ${entry.localPath}; file ${basename23(entry.localPath)}${caption}${extracted}`;
|
|
599522
|
+
});
|
|
599523
|
+
sections.push([
|
|
599524
|
+
"### Recent Chat Media",
|
|
599525
|
+
"Use these paths only as tool inputs when the user asks about media in this chat. Do not quote local paths in the visible Telegram reply.",
|
|
599526
|
+
mediaLines.join("\n")
|
|
599527
|
+
].join("\n"));
|
|
599528
|
+
}
|
|
598608
599529
|
if (olderCount > 0) {
|
|
598609
599530
|
const older = history.slice(0, olderCount);
|
|
598610
599531
|
const bySpeaker = /* @__PURE__ */ new Map();
|
|
@@ -599301,8 +600222,8 @@ Join: ${newUrl}`);
|
|
|
599301
600222
|
}
|
|
599302
600223
|
}
|
|
599303
600224
|
let steeringText = msg.text;
|
|
599304
|
-
if (msg.media) {
|
|
599305
|
-
const mediaContext = await this.
|
|
600225
|
+
if (msg.media || msg.replyToMedia) {
|
|
600226
|
+
const mediaContext = await this.processMediaContextForMessage(msg);
|
|
599306
600227
|
if (mediaContext) {
|
|
599307
600228
|
steeringText += `
|
|
599308
600229
|
|
|
@@ -599376,8 +600297,8 @@ ${mediaContext}`;
|
|
|
599376
600297
|
this.tuiWrite(() => renderTelegramSubAgentStart(msg.username, msg.text, isAdminDM));
|
|
599377
600298
|
try {
|
|
599378
600299
|
let mediaContext = "";
|
|
599379
|
-
if (msg.media) {
|
|
599380
|
-
mediaContext = await this.
|
|
600300
|
+
if (msg.media || msg.replyToMedia) {
|
|
600301
|
+
mediaContext = await this.processMediaContextForMessage(msg);
|
|
599381
600302
|
}
|
|
599382
600303
|
const result = await this.runSubAgent(msg, subAgent, mediaContext);
|
|
599383
600304
|
if (subAgent.typingInterval) {
|
|
@@ -599479,8 +600400,8 @@ ${mediaContext}`;
|
|
|
599479
600400
|
this.tuiWrite(() => renderTelegramSubAgentEvent(msg.username, `admin chat with full context/tools (${this.interactionMode})`));
|
|
599480
600401
|
try {
|
|
599481
600402
|
let mediaContext = "";
|
|
599482
|
-
if (msg.media) {
|
|
599483
|
-
mediaContext = await this.
|
|
600403
|
+
if (msg.media || msg.replyToMedia) {
|
|
600404
|
+
mediaContext = await this.processMediaContextForMessage(msg);
|
|
599484
600405
|
}
|
|
599485
600406
|
const result = await this.runSubAgent(msg, subAgent, mediaContext, "chat");
|
|
599486
600407
|
if (subAgent.typingInterval) {
|
|
@@ -599563,7 +600484,7 @@ ${mediaContext}`;
|
|
|
599563
600484
|
}
|
|
599564
600485
|
this.tuiWrite(() => renderTelegramSubAgentEvent(msg.username, `live inference: chat reply (${this.interactionMode})`));
|
|
599565
600486
|
try {
|
|
599566
|
-
const mediaContext = msg.media || msg.
|
|
600487
|
+
const mediaContext = msg.media || msg.replyToMedia || msg.livePhoto ? await this.processMediaContextForMessage(msg) : "";
|
|
599567
600488
|
const finalText = await this.runTelegramChatCompletion(
|
|
599568
600489
|
msg,
|
|
599569
600490
|
toolContext,
|
|
@@ -600056,6 +600977,128 @@ ${lines.join("\n\n")}` };
|
|
|
600056
600977
|
}
|
|
600057
600978
|
};
|
|
600058
600979
|
}
|
|
600980
|
+
if (tool.name === "image_read") {
|
|
600981
|
+
return {
|
|
600982
|
+
...tool,
|
|
600983
|
+
description: "Read only images from this Telegram chat's media cache or creative workspace. Use path='reply' for the replied-to image or path='latest' for the most recent chat image.",
|
|
600984
|
+
execute: async (args) => {
|
|
600985
|
+
const resolved = this.resolveTelegramScopedMediaPath(args["path"], chatId, currentMsg, "image");
|
|
600986
|
+
if (!resolved.ok) return { success: false, output: "", error: resolved.error };
|
|
600987
|
+
return tool.execute({ ...args, path: resolved.path });
|
|
600988
|
+
}
|
|
600989
|
+
};
|
|
600990
|
+
}
|
|
600991
|
+
if (tool.name === "ocr") {
|
|
600992
|
+
return {
|
|
600993
|
+
...tool,
|
|
600994
|
+
description: "Extract text only from images in this Telegram chat's media cache or creative workspace. Use path='reply' or path='latest' for chat media references.",
|
|
600995
|
+
execute: async (args) => {
|
|
600996
|
+
const resolved = this.resolveTelegramScopedMediaPath(args["path"], chatId, currentMsg, "image");
|
|
600997
|
+
if (!resolved.ok) return { success: false, output: "", error: resolved.error };
|
|
600998
|
+
return tool.execute({ ...args, path: resolved.path });
|
|
600999
|
+
}
|
|
601000
|
+
};
|
|
601001
|
+
}
|
|
601002
|
+
if (tool.name === "vision") {
|
|
601003
|
+
return {
|
|
601004
|
+
...tool,
|
|
601005
|
+
description: "Analyze only images from this Telegram chat's media cache or creative workspace. Use image='reply' for the replied-to image or image='latest' for the most recent chat image.",
|
|
601006
|
+
execute: async (args) => {
|
|
601007
|
+
const resolved = this.resolveTelegramScopedMediaPath(args["image"], chatId, currentMsg, "image");
|
|
601008
|
+
if (!resolved.ok) return { success: false, output: "", error: resolved.error };
|
|
601009
|
+
return tool.execute({ ...args, image: resolved.path });
|
|
601010
|
+
}
|
|
601011
|
+
};
|
|
601012
|
+
}
|
|
601013
|
+
if (tool.name === "ocr_image_advanced") {
|
|
601014
|
+
return {
|
|
601015
|
+
...tool,
|
|
601016
|
+
description: "Advanced OCR only for images in this Telegram chat's media cache or creative workspace. Batch directory mode is disabled in public Telegram scope.",
|
|
601017
|
+
execute: async (args) => {
|
|
601018
|
+
if (args["batch"] === true) return { success: false, output: "", error: "Batch directory OCR is not available in public Telegram scope." };
|
|
601019
|
+
const resolved = this.resolveTelegramScopedMediaPath(args["image"], chatId, currentMsg, "image");
|
|
601020
|
+
if (!resolved.ok) return { success: false, output: "", error: resolved.error };
|
|
601021
|
+
const next = { ...args, image: resolved.path };
|
|
601022
|
+
if (typeof next["output_dir"] === "string" && next["output_dir"].trim()) {
|
|
601023
|
+
const output = this.resolveTelegramScopedOutputPath(next["output_dir"], chatId, "ocr-output");
|
|
601024
|
+
if (!output.ok) return { success: false, output: "", error: output.error };
|
|
601025
|
+
next["output_dir"] = output.path;
|
|
601026
|
+
}
|
|
601027
|
+
return tool.execute(next);
|
|
601028
|
+
}
|
|
601029
|
+
};
|
|
601030
|
+
}
|
|
601031
|
+
if (tool.name === "transcribe_file") {
|
|
601032
|
+
return {
|
|
601033
|
+
...tool,
|
|
601034
|
+
description: "Transcribe only audio/video files from this Telegram chat's media cache or creative workspace. Use path='reply' or path='latest' for chat media references.",
|
|
601035
|
+
execute: async (args) => {
|
|
601036
|
+
const resolved = this.resolveTelegramScopedMediaPath(args["path"], chatId, currentMsg, "transcribable");
|
|
601037
|
+
if (!resolved.ok) return { success: false, output: "", error: resolved.error };
|
|
601038
|
+
return tool.execute({ ...args, path: resolved.path });
|
|
601039
|
+
}
|
|
601040
|
+
};
|
|
601041
|
+
}
|
|
601042
|
+
if (tool.name === "pdf_to_text") {
|
|
601043
|
+
return {
|
|
601044
|
+
...tool,
|
|
601045
|
+
description: "Extract text only from PDFs in this Telegram chat's media cache or creative workspace. Use path='reply' or path='latest' for chat document references.",
|
|
601046
|
+
execute: async (args) => {
|
|
601047
|
+
const resolved = this.resolveTelegramScopedMediaPath(args["path"], chatId, currentMsg, "pdf");
|
|
601048
|
+
if (!resolved.ok) return { success: false, output: "", error: resolved.error };
|
|
601049
|
+
return tool.execute({ ...args, path: resolved.path });
|
|
601050
|
+
}
|
|
601051
|
+
};
|
|
601052
|
+
}
|
|
601053
|
+
if (tool.name === "ocr_pdf") {
|
|
601054
|
+
return {
|
|
601055
|
+
...tool,
|
|
601056
|
+
description: "OCR only PDFs from this Telegram chat's media cache or creative workspace. Output, when requested, is forced into this chat's creative workspace.",
|
|
601057
|
+
execute: async (args) => {
|
|
601058
|
+
const input = this.resolveTelegramScopedMediaPath(args["input"], chatId, currentMsg, "pdf");
|
|
601059
|
+
if (!input.ok) return { success: false, output: "", error: input.error };
|
|
601060
|
+
const next = { ...args, input: input.path };
|
|
601061
|
+
if (typeof next["output"] === "string" && next["output"].trim()) {
|
|
601062
|
+
const output = this.resolveTelegramScopedOutputPath(next["output"], chatId, `ocr-${Date.now()}.pdf`);
|
|
601063
|
+
if (!output.ok) return { success: false, output: "", error: output.error };
|
|
601064
|
+
next["output"] = output.path;
|
|
601065
|
+
}
|
|
601066
|
+
return tool.execute(next);
|
|
601067
|
+
}
|
|
601068
|
+
};
|
|
601069
|
+
}
|
|
601070
|
+
if (tool.name === "video_understand") {
|
|
601071
|
+
return {
|
|
601072
|
+
...tool,
|
|
601073
|
+
description: "Analyze only video files from this Telegram chat's media cache or creative workspace. URL download is disabled in public Telegram scope; use path='reply' or path='latest'.",
|
|
601074
|
+
execute: async (args) => {
|
|
601075
|
+
if (args["url"]) return { success: false, output: "", error: "URL video analysis is not available in public Telegram scope. Use a video posted in this chat." };
|
|
601076
|
+
const resolved = this.resolveTelegramScopedMediaPath(args["path"], chatId, currentMsg, "video");
|
|
601077
|
+
if (!resolved.ok) return { success: false, output: "", error: resolved.error };
|
|
601078
|
+
return tool.execute({ ...args, path: resolved.path });
|
|
601079
|
+
}
|
|
601080
|
+
};
|
|
601081
|
+
}
|
|
601082
|
+
if (tool.name === "audio_analyze") {
|
|
601083
|
+
return {
|
|
601084
|
+
...tool,
|
|
601085
|
+
description: "Analyze only audio files from this Telegram chat's media cache or creative workspace. Microphone/listen mode is disabled in public Telegram scope.",
|
|
601086
|
+
execute: async (args) => {
|
|
601087
|
+
if (String(args["action"] || "").toLowerCase() === "listen") {
|
|
601088
|
+
return { success: false, output: "", error: "Continuous microphone listening is not available in Telegram public scope." };
|
|
601089
|
+
}
|
|
601090
|
+
const resolved = this.resolveTelegramScopedMediaPath(args["file"] ?? args["path"], chatId, currentMsg, "audio");
|
|
601091
|
+
if (!resolved.ok) return { success: false, output: "", error: resolved.error };
|
|
601092
|
+
return tool.execute({ ...args, file: resolved.path, path: resolved.path });
|
|
601093
|
+
}
|
|
601094
|
+
};
|
|
601095
|
+
}
|
|
601096
|
+
if (tool.name === "explore_tools") {
|
|
601097
|
+
return {
|
|
601098
|
+
...tool,
|
|
601099
|
+
description: "List and explain the tools available in this Telegram public/group scope. Do not invent unavailable tool names."
|
|
601100
|
+
};
|
|
601101
|
+
}
|
|
600059
601102
|
return tool;
|
|
600060
601103
|
});
|
|
600061
601104
|
}
|
|
@@ -600219,11 +601262,16 @@ Scoped workspace: ${scopedRoot}`,
|
|
|
600219
601262
|
new ImageReadTool(repoRoot),
|
|
600220
601263
|
new OCRTool(repoRoot),
|
|
600221
601264
|
new VisionTool(repoRoot),
|
|
601265
|
+
new OcrImageAdvancedTool(repoRoot),
|
|
600222
601266
|
new OcrPdfTool(repoRoot),
|
|
600223
601267
|
new PdfToTextTool(repoRoot),
|
|
600224
601268
|
// Transcription tools
|
|
600225
601269
|
new TranscribeFileTool(repoRoot),
|
|
600226
|
-
new TranscribeUrlTool(repoRoot)
|
|
601270
|
+
new TranscribeUrlTool(repoRoot),
|
|
601271
|
+
new VideoUnderstandTool(repoRoot),
|
|
601272
|
+
new AudioAnalyzeTool(),
|
|
601273
|
+
new ExploreToolsTool(),
|
|
601274
|
+
this.buildTelegramMediaRecentTool(chatId, msg)
|
|
600227
601275
|
];
|
|
600228
601276
|
const adminTools = [
|
|
600229
601277
|
new ShellTool(repoRoot),
|
|
@@ -600326,6 +601374,55 @@ Scoped workspace: ${scopedRoot}`,
|
|
|
600326
601374
|
]);
|
|
600327
601375
|
return tools.filter((tool) => !blocked.has(tool.name));
|
|
600328
601376
|
}
|
|
601377
|
+
buildTelegramMediaRecentTool(chatId, currentMsg) {
|
|
601378
|
+
const bridge = this;
|
|
601379
|
+
return {
|
|
601380
|
+
name: "telegram_media_recent",
|
|
601381
|
+
description: "List recent media files available in this Telegram chat scope, including safe aliases for image_read, ocr, vision, transcribe_file, pdf_to_text, video_understand, and audio_analyze.",
|
|
601382
|
+
parameters: {
|
|
601383
|
+
type: "object",
|
|
601384
|
+
properties: {
|
|
601385
|
+
kind: {
|
|
601386
|
+
type: "string",
|
|
601387
|
+
enum: ["media", "image", "audio", "video", "pdf", "transcribable"],
|
|
601388
|
+
description: "Filter by media kind. Defaults to all recent chat media."
|
|
601389
|
+
},
|
|
601390
|
+
limit: { type: "number", description: "Maximum entries to return, 1-20. Default: 10." }
|
|
601391
|
+
}
|
|
601392
|
+
},
|
|
601393
|
+
async execute(args) {
|
|
601394
|
+
const start2 = performance.now();
|
|
601395
|
+
const kind = String(args["kind"] || "media").toLowerCase();
|
|
601396
|
+
const limit = typeof args["limit"] === "number" && Number.isFinite(args["limit"]) ? Math.max(1, Math.min(20, Math.floor(args["limit"]))) : 10;
|
|
601397
|
+
const entries = bridge.recentTelegramMediaEntries(chatId, 60).filter((entry) => bridge.telegramMediaEntryMatchesKind(entry, kind)).slice(0, limit);
|
|
601398
|
+
if (entries.length === 0) {
|
|
601399
|
+
return { success: true, output: `No recent ${kind} media is available in this Telegram chat scope.`, durationMs: performance.now() - start2 };
|
|
601400
|
+
}
|
|
601401
|
+
const lines = entries.map((entry, index) => {
|
|
601402
|
+
const parts = [
|
|
601403
|
+
`${index + 1}. message_id ${entry.messageId || "unknown"}`,
|
|
601404
|
+
currentMsg?.replyToMessageId === entry.messageId ? "replied-to" : "",
|
|
601405
|
+
telegramCachedMediaIsImage(entry) ? "image" : telegramCachedMediaIsPdf(entry) ? "pdf" : telegramCachedMediaIsAudio(entry) ? "audio" : telegramCachedMediaIsVideo(entry) ? "video" : entry.mediaType,
|
|
601406
|
+
`file=${basename23(entry.localPath)}`,
|
|
601407
|
+
`path=${entry.localPath}`,
|
|
601408
|
+
entry.caption ? `caption=${truncateTelegramContextLine(entry.caption, 140)}` : ""
|
|
601409
|
+
].filter(Boolean);
|
|
601410
|
+
const extracted = entry.extractedContent ? `
|
|
601411
|
+
context: ${truncateTelegramContextLine(entry.extractedContent.replace(/\s+/g, " "), 240)}` : "";
|
|
601412
|
+
return `${parts.join("; ")}${extracted}`;
|
|
601413
|
+
});
|
|
601414
|
+
return {
|
|
601415
|
+
success: true,
|
|
601416
|
+
output: [
|
|
601417
|
+
"Recent scoped Telegram media:",
|
|
601418
|
+
"Use path='reply' for replied-to media, path='latest' for the most recent matching item, or one of the listed paths.",
|
|
601419
|
+
lines.join("\n")
|
|
601420
|
+
].join("\n"),
|
|
601421
|
+
durationMs: performance.now() - start2
|
|
601422
|
+
};
|
|
601423
|
+
}
|
|
601424
|
+
};
|
|
601425
|
+
}
|
|
600329
601426
|
imageGenerationDefaultsForRepo(repoRoot) {
|
|
600330
601427
|
const settings = resolveSettings(repoRoot);
|
|
600331
601428
|
return {
|
|
@@ -600543,30 +601640,36 @@ ${knownList}` : "Private-user telegram_send_file target must be this DM or a kno
|
|
|
600543
601640
|
* Downloads the file, runs it through the appropriate pipeline,
|
|
600544
601641
|
* caches it, and returns a text description for the agent.
|
|
600545
601642
|
*/
|
|
600546
|
-
async processMedia(msg) {
|
|
600547
|
-
|
|
600548
|
-
|
|
600549
|
-
const
|
|
601643
|
+
async processMedia(msg, source = "message") {
|
|
601644
|
+
const media = source === "reply" ? msg.replyToMedia : msg.media;
|
|
601645
|
+
if (!media) return "";
|
|
601646
|
+
const { type, fileId, fileUniqueId, mimeType, caption } = media;
|
|
601647
|
+
const isImageMedia = telegramMediaIsImage(media);
|
|
601648
|
+
const sourceMessageId = source === "reply" ? msg.replyToMessageId : msg.messageId;
|
|
601649
|
+
const sourceLabel = source === "reply" ? "replied-to " : "";
|
|
600550
601650
|
let ext = ".bin";
|
|
600551
|
-
if (isImageMedia) ext = telegramImageExtension(
|
|
601651
|
+
if (isImageMedia) ext = telegramImageExtension(media);
|
|
600552
601652
|
else if (type === "audio" || type === "voice") ext = ".ogg";
|
|
600553
601653
|
else if (type === "video" || type === "video_note" || type === "live_photo") ext = ".mp4";
|
|
600554
|
-
else if (
|
|
600555
|
-
const dotIdx =
|
|
600556
|
-
if (dotIdx >= 0) ext =
|
|
601654
|
+
else if (media.fileName) {
|
|
601655
|
+
const dotIdx = media.fileName.lastIndexOf(".");
|
|
601656
|
+
if (dotIdx >= 0) ext = media.fileName.slice(dotIdx);
|
|
600557
601657
|
}
|
|
600558
601658
|
const localPath = await this.downloadTelegramFile(fileId, ext);
|
|
600559
601659
|
if (!localPath) return `[Media: ${type} — failed to download]`;
|
|
600560
601660
|
const cacheEntry = {
|
|
600561
601661
|
localPath,
|
|
600562
601662
|
fileId,
|
|
601663
|
+
fileUniqueId,
|
|
600563
601664
|
chatId: msg.chatId,
|
|
601665
|
+
messageId: sourceMessageId ?? 0,
|
|
600564
601666
|
username: msg.username,
|
|
600565
601667
|
mediaType: type,
|
|
600566
601668
|
mimeType,
|
|
601669
|
+
caption,
|
|
600567
601670
|
cachedAt: Date.now()
|
|
600568
601671
|
};
|
|
600569
|
-
this.mediaCache.set(fileUniqueId
|
|
601672
|
+
this.mediaCache.set(`${String(msg.chatId)}:${String(sourceMessageId ?? 0)}:${fileUniqueId}`, cacheEntry);
|
|
600570
601673
|
const metadataKey = String(msg.chatId);
|
|
600571
601674
|
if (!this.mediaMetadata.has(metadataKey)) {
|
|
600572
601675
|
this.mediaMetadata.set(metadataKey, []);
|
|
@@ -600587,7 +601690,7 @@ ${knownList}` : "Private-user telegram_send_file target must be this DM or a kno
|
|
|
600587
601690
|
{
|
|
600588
601691
|
path: localPath,
|
|
600589
601692
|
buffer: readFileSync87(localPath),
|
|
600590
|
-
mime: telegramImageMime(
|
|
601693
|
+
mime: telegramImageMime(media)
|
|
600591
601694
|
},
|
|
600592
601695
|
this.agentConfig?.model ?? ""
|
|
600593
601696
|
);
|
|
@@ -600596,10 +601699,10 @@ ${knownList}` : "Private-user telegram_send_file target must be this DM or a kno
|
|
|
600596
601699
|
} catch {
|
|
600597
601700
|
}
|
|
600598
601701
|
if (visionContext) {
|
|
600599
|
-
description = `[
|
|
601702
|
+
description = `[${sourceLabel}image received: ${localPath}${caption ? ` — caption: "${caption}"` : ""}
|
|
600600
601703
|
${visionContext}]`;
|
|
600601
601704
|
} else {
|
|
600602
|
-
description = `[
|
|
601705
|
+
description = `[${sourceLabel}image received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use image_read, ocr, or vision tools to analyze it.]`;
|
|
600603
601706
|
}
|
|
600604
601707
|
try {
|
|
600605
601708
|
await fetch("http://127.0.0.1:11435/v1/memory/ingest", {
|
|
@@ -600623,9 +601726,9 @@ ${visionContext}]`;
|
|
|
600623
601726
|
} catch {
|
|
600624
601727
|
}
|
|
600625
601728
|
if (transcription) {
|
|
600626
|
-
description = `[
|
|
601729
|
+
description = `[${sourceLabel}voice message transcribed: "${transcription}"${caption ? ` — caption: "${caption}"` : ""}]`;
|
|
600627
601730
|
} else {
|
|
600628
|
-
description = `[
|
|
601731
|
+
description = `[${sourceLabel}audio/voice message received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use transcribe_file to transcribe it.]`;
|
|
600629
601732
|
}
|
|
600630
601733
|
try {
|
|
600631
601734
|
await fetch("http://127.0.0.1:11435/v1/memory/ingest", {
|
|
@@ -600638,13 +601741,30 @@ ${visionContext}]`;
|
|
|
600638
601741
|
}
|
|
600639
601742
|
} else if (type === "video" || type === "video_note" || type === "live_photo") {
|
|
600640
601743
|
const label = type === "live_photo" ? "Live photo" : "Video";
|
|
600641
|
-
description = `[${label} received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}.]`;
|
|
601744
|
+
description = `[${sourceLabel}${label.toLowerCase()} received and saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}. You can use video_understand or transcribe_file to analyze it.]`;
|
|
600642
601745
|
} else if (type === "document") {
|
|
600643
|
-
description = `[
|
|
601746
|
+
description = `[${sourceLabel}document received: ${media.fileName || "unnamed"}${mimeType ? ` (${mimeType})` : ""}, saved to ${localPath}${caption ? ` — caption: "${caption}"` : ""}.]`;
|
|
600644
601747
|
}
|
|
600645
601748
|
cacheEntry.extractedContent = description;
|
|
600646
601749
|
return description;
|
|
600647
601750
|
}
|
|
601751
|
+
async processMediaContextForMessage(msg) {
|
|
601752
|
+
const parts = [];
|
|
601753
|
+
if (msg.media) {
|
|
601754
|
+
const current = await this.processMedia(msg, "message");
|
|
601755
|
+
if (current) parts.push(current);
|
|
601756
|
+
}
|
|
601757
|
+
if (msg.replyToMedia) {
|
|
601758
|
+
const replied = await this.processMedia(msg, "reply");
|
|
601759
|
+
if (replied) parts.push(replied);
|
|
601760
|
+
}
|
|
601761
|
+
const text = parts.join("\n\n");
|
|
601762
|
+
if (text) this.updateLastTelegramUserMessageText(msg, `${msg.text}
|
|
601763
|
+
|
|
601764
|
+
[Media context]
|
|
601765
|
+
${text}`.trim());
|
|
601766
|
+
return text;
|
|
601767
|
+
}
|
|
600648
601768
|
/** Clean up expired media cache entries (older than 30 minutes) */
|
|
600649
601769
|
cleanupMediaCache() {
|
|
600650
601770
|
const now = Date.now();
|
|
@@ -625230,7 +626350,7 @@ var clipboard_media_exports = {};
|
|
|
625230
626350
|
__export(clipboard_media_exports, {
|
|
625231
626351
|
pasteClipboardImageToFile: () => pasteClipboardImageToFile
|
|
625232
626352
|
});
|
|
625233
|
-
import { execFileSync as
|
|
626353
|
+
import { execFileSync as execFileSync6, execSync as execSync58 } from "node:child_process";
|
|
625234
626354
|
import { mkdirSync as mkdirSync72, readFileSync as readFileSync99, rmSync as rmSync5, writeFileSync as writeFileSync67 } from "node:fs";
|
|
625235
626355
|
import { join as join136 } from "node:path";
|
|
625236
626356
|
function pasteClipboardImageToFile(repoRoot) {
|
|
@@ -625247,7 +626367,7 @@ function readClipboardImage() {
|
|
|
625247
626367
|
try {
|
|
625248
626368
|
execSync58("command -v pngpaste", { stdio: "ignore", timeout: 1e3 });
|
|
625249
626369
|
const tmp = `/tmp/omnius-clipboard-${Date.now()}.png`;
|
|
625250
|
-
|
|
626370
|
+
execFileSync6("pngpaste", [tmp], { timeout: 3e3 });
|
|
625251
626371
|
const buffer2 = readFileSync99(tmp);
|
|
625252
626372
|
try {
|
|
625253
626373
|
rmSync5(tmp);
|
|
@@ -625267,7 +626387,7 @@ function readClipboardImage() {
|
|
|
625267
626387
|
];
|
|
625268
626388
|
for (const attempt of attempts) {
|
|
625269
626389
|
try {
|
|
625270
|
-
const buffer2 =
|
|
626390
|
+
const buffer2 = execFileSync6(attempt.cmd, attempt.args, { timeout: 3e3, maxBuffer: 25 * 1024 * 1024 });
|
|
625271
626391
|
if (buffer2.length > 0) return { buffer: buffer2, mime: attempt.mime, ext: attempt.ext };
|
|
625272
626392
|
} catch {
|
|
625273
626393
|
continue;
|
|
@@ -625284,7 +626404,7 @@ function readClipboardImage() {
|
|
|
625284
626404
|
"$img.Save($ms,[Drawing.Imaging.ImageFormat]::Png);",
|
|
625285
626405
|
"[Console]::OpenStandardOutput().Write($ms.ToArray(),0,$ms.Length)"
|
|
625286
626406
|
].join("");
|
|
625287
|
-
const buffer2 =
|
|
626407
|
+
const buffer2 = execFileSync6("powershell.exe", ["-NoProfile", "-Command", ps], {
|
|
625288
626408
|
timeout: 5e3,
|
|
625289
626409
|
maxBuffer: 25 * 1024 * 1024
|
|
625290
626410
|
});
|
|
@@ -625303,7 +626423,7 @@ var init_clipboard_media = __esm({
|
|
|
625303
626423
|
|
|
625304
626424
|
// packages/cli/src/tui/interactive.ts
|
|
625305
626425
|
import { cwd } from "node:process";
|
|
625306
|
-
import { resolve as resolve44, join as join137, dirname as dirname38, extname as
|
|
626426
|
+
import { resolve as resolve44, join as join137, dirname as dirname38, extname as extname16, relative as relative14 } from "node:path";
|
|
625307
626427
|
import { createRequire as createRequire8 } from "node:module";
|
|
625308
626428
|
import { fileURLToPath as fileURLToPath18 } from "node:url";
|
|
625309
626429
|
import {
|
|
@@ -632605,7 +633725,7 @@ Execute this skill now. Follow the behavioral guidance above.`;
|
|
|
632605
633725
|
const imgPath = resolve44(repoRoot, cleanPath);
|
|
632606
633726
|
const imgBuffer = readFileSync100(imgPath);
|
|
632607
633727
|
const base642 = imgBuffer.toString("base64");
|
|
632608
|
-
const ext =
|
|
633728
|
+
const ext = extname16(cleanPath).toLowerCase();
|
|
632609
633729
|
const mime = ext === ".png" ? "image/png" : ext === ".gif" ? "image/gif" : ext === ".webp" ? "image/webp" : "image/jpeg";
|
|
632610
633730
|
const asciiContext = await renderAsciiPreviewForImage(
|
|
632611
633731
|
imgPath,
|