npm - @vibeframe/mcp-server - Versions diffs - 0.53.0 → 0.54.0 - Mend

@vibeframe/mcp-server 0.53.0 → 0.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2) hide show

package/dist/index.js +324 -42
package/package.json +3 -3

package/dist/index.js CHANGED Viewed

@@ -428532,7 +428532,7 @@ var init_WhisperProvider = __esm({
       isConfigured() {
         return !!this.apiKey;
       }
-      async transcribe(audio, language) {
+      async transcribe(audio, language, options) {
         if (!this.apiKey) {
           return {
             id: "",
@@ -428540,14 +428540,21 @@ var init_WhisperProvider = __esm({
             error: "Whisper API key not configured"
           };
         }
+        const granularity = options?.granularity ?? "segment";
         try {
           const formData = new FormData();
           formData.append("file", audio, "audio.webm");
           formData.append("model", "whisper-1");
           formData.append("response_format", "verbose_json");
-          formData.append("timestamp_granularities[]", "segment");
-          if (language) {
-            formData.append("language", language);
+          if (granularity === "segment" || granularity === "both") {
+            formData.append("timestamp_granularities[]", "segment");
+          }
+          if (granularity === "word" || granularity === "both") {
+            formData.append("timestamp_granularities[]", "word");
+          }
+          const lang = language ?? options?.language;
+          if (lang) {
+            formData.append("language", lang);
           }
           const response = await fetch(`${this.baseUrl}/audio/transcriptions`, {
             method: "POST",
@@ -428565,20 +428572,30 @@ var init_WhisperProvider = __esm({
             };
           }
           const data = await response.json();
-          return {
+          const result = {
             id: crypto.randomUUID(),
             status: "completed",
             fullText: data.text,
-            detectedLanguage: data.language,
-            segments: data.segments?.map((seg, index) => ({
+            detectedLanguage: data.language
+          };
+          if (granularity === "segment" || granularity === "both") {
+            result.segments = data.segments?.map((seg, index) => ({
               id: `segment-${index}`,
               startTime: seg.start,
               endTime: seg.end,
               text: seg.text.trim(),
               confidence: 1
-              // Whisper doesn't provide confidence per segment
-            }))
-          };
+              // Whisper doesn't provide per-segment confidence
+            }));
+          }
+          if (granularity === "word" || granularity === "both") {
+            result.words = data.words?.map((w) => ({
+              text: w.word,
+              start: w.start,
+              end: w.end
+            }));
+          }
+          return result;
         } catch (error) {
           return {
             id: "",
@@ -432768,6 +432785,101 @@ var init_elevenlabs = __esm({
   }
 });
+// ../ai-providers/dist/kokoro/KokoroProvider.js
+async function loadKokoroFactory() { // Resolves the factory object on which loadModel() calls from_pretrained().
+  if (factoryOverride) // a non-null module-level override takes precedence over the package import
+    return factoryOverride;
+  const mod = await import("kokoro-js"); // dynamic import: the package is only loaded when this line runs
+  return mod.KokoroTTS;
+}
+function loadModel(progress) { // Returns the shared model promise, starting the load on the first call. `progress` is an optional callback receiving normalised events.
+  if (modelPromise) // a load is already cached (pending or settled) — reuse it; note this call's `progress` is then never registered
+    return modelPromise;
+  modelPromise = (async () => {
+    const factory = await loadKokoroFactory();
+    return factory.from_pretrained(KOKORO_MODEL_ID, {
+      dtype: "q8",
+      device: "cpu",
+      progress_callback: progress ? (raw2) => progress(normaliseEvent(raw2)) : void 0 // raw events pass through normaliseEvent before reaching the caller
+    });
+  })().catch((err) => {
+    modelPromise = null; // drop the cached promise so a later call starts a fresh load after a failure
+    throw err; // still reject for whoever awaited this attempt
+  });
+  return modelPromise;
+}
+function normaliseEvent(raw2) { // Builds a fixed-shape event object from raw2, keeping each field only when it has the expected primitive type.
+  const r = raw2 ?? {}; // null/undefined input is treated as an empty event
+  return {
+    status: typeof r.status === "string" ? r.status : "unknown", // the only field with a non-undefined fallback
+    file: typeof r.file === "string" ? r.file : void 0,
+    progress: typeof r.progress === "number" ? r.progress : void 0, // numeric fields of any other type become undefined
+    loaded: typeof r.loaded === "number" ? r.loaded : void 0,
+    total: typeof r.total === "number" ? r.total : void 0
+  };
+}
+var KOKORO_DEFAULT_VOICE, KOKORO_MODEL_ID, modelPromise, factoryOverride, KokoroProvider, kokoroProvider; // module-level bindings; all are assigned inside the initializer below
+var init_KokoroProvider = __esm({ // NOTE(review): __esm is defined outside this view — presumably the bundler's lazy module-init wrapper; confirm
+  "../ai-providers/dist/kokoro/KokoroProvider.js"() {
+    "use strict";
+    KOKORO_DEFAULT_VOICE = "af_heart"; // voice used by textToSpeech when options.voice is null/undefined
+    KOKORO_MODEL_ID = "onnx-community/Kokoro-82M-v1.0-ONNX"; // model id passed to from_pretrained in loadModel()
+    modelPromise = null; // cache slot read and written by loadModel()
+    factoryOverride = null; // when set to a non-null value, loadKokoroFactory() returns it instead of importing kokoro-js
+    KokoroProvider = class { // Text-to-speech provider backed by the model obtained from loadModel(); takes no API key.
+      constructor() {
+        this.id = "kokoro";
+        this.name = "Kokoro (local)";
+        this.description = "Local text-to-speech via Kokoro-82M (Apache 2.0)";
+        this.capabilities = ["text-to-speech"];
+        this.iconUrl = "/icons/kokoro.svg";
+        this.isAvailable = true;
+      }
+      async initialize(_config) { // intentionally empty: the config argument is ignored
+      }
+      isConfigured() { // always reports configured; there is no key to check
+        return true;
+      }
+      /**
+       * Synthesise speech from text. Returns a WAV buffer matching
+       * `ElevenLabsProvider.textToSpeech`'s `TTSResult` shape.
+       */
+      async textToSpeech(text, options = {}) { // options read here: voice, speed, onProgress
+        if (!text || !text.trim()) { // empty or whitespace-only text yields a failure result instead of loading the model
+          return { success: false, error: "Empty text" };
+        }
+        try {
+          const model = await loadModel(options.onProgress); // shared across calls via the module-level modelPromise
+          const audio = await model.generate(text, {
+            voice: options.voice ?? KOKORO_DEFAULT_VOICE,
+            speed: options.speed ?? 1
+          });
+          const buffer = Buffer.from(audio.toWav()); // wrap the toWav() output in a Node Buffer
+          return {
+            success: true,
+            audioBuffer: buffer,
+            characterCount: text.length // String#length, i.e. UTF-16 code units of the untrimmed input
+          };
+        } catch (error) { // load/generation errors are reported in the result object rather than thrown
+          return {
+            success: false,
+            error: error instanceof Error ? error.message : "Unknown error"
+          };
+        }
+      }
+    };
+    kokoroProvider = new KokoroProvider(); // shared instance created once when this initializer runs
+  }
+});
+// ../ai-providers/dist/kokoro/index.js
+var init_kokoro = __esm({ // Initializer for the kokoro index module; its only action is to run the KokoroProvider initializer.
+  "../ai-providers/dist/kokoro/index.js"() {
+    "use strict";
+    init_KokoroProvider(); // NOTE(review): assumes the __esm wrapper (defined outside this view) makes repeated calls safe — confirm
+  }
+});
 // ../ai-providers/dist/openai-image/OpenAIImageProvider.js
 var DEFAULT_MODEL, OpenAIImageProvider, openaiImageProvider;
 var init_OpenAIImageProvider = __esm({
@@ -434730,7 +434842,10 @@ __export(dist_exports2, {
   GeminiProvider: () => GeminiProvider,
   GrokProvider: () => GrokProvider,
   KNOWN_VOICES: () => KNOWN_VOICES,
+  KOKORO_DEFAULT_VOICE: () => KOKORO_DEFAULT_VOICE,
+  KOKORO_MODEL_ID: () => KOKORO_MODEL_ID,
   KlingProvider: () => KlingProvider,
+  KokoroProvider: () => KokoroProvider,
   OllamaProvider: () => OllamaProvider,
   OpenAIImageProvider: () => OpenAIImageProvider,
   OpenAIProvider: () => OpenAIProvider,
@@ -434743,6 +434858,7 @@ __export(dist_exports2, {
   getBestProviderForCapability: () => getBestProviderForCapability,
   grokProvider: () => grokProvider,
   klingProvider: () => klingProvider,
+  kokoroProvider: () => kokoroProvider,
   ollamaProvider: () => ollamaProvider,
   openaiImageProvider: () => openaiImageProvider,
   openaiProvider: () => openaiProvider,
@@ -434763,6 +434879,7 @@ var init_dist2 = __esm({
     init_claude();
     init_ollama();
     init_elevenlabs();
+    init_kokoro();
     init_openai_image();
     init_runway();
     init_kling();
@@ -442748,8 +442865,8 @@ async function extendVideoNaturally(videoPath, targetDuration, outputPath) {
   const videoDuration = await getVideoDuration(videoPath);
   const ratio = targetDuration / videoDuration;
   if (ratio <= 1) {
-    const { copyFile: copyFile4 } = await import("node:fs/promises");
-    await copyFile4(videoPath, outputPath);
+    const { copyFile: copyFile5 } = await import("node:fs/promises");
+    await copyFile5(videoPath, outputPath);
     return;
   }
   if (ratio <= 1.15) {
@@ -446295,6 +446412,16 @@ function slugifySceneName(name) {
   const slug = normalised.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
   return slug || "scene";
 }
+function renderTranscriptSpans(transcript) { // Returns markup with one <span class="word" data-i="N"> per transcript entry (N = array index), joined by single spaces.
+  return transcript.map((w, i) => `<span class="word" data-i="${i}">${esc(w.text)}</span>`).join(" "); // w.text goes through esc(), which is defined outside this view (presumably HTML escaping)
+}
+function buildTranscriptTweens(transcript, targetSelector) { // Returns source text with one tl.fromTo(...) statement per transcript entry, positioned at that entry's start time.
+  return transcript.map((w, i) => {
+    const start = Math.max(0, Number(w.start.toFixed(3))); // round to 3 decimals and clamp to >= 0; w.start must be a number or toFixed throws
+    const sel = `${targetSelector}[data-i="${i}"]`; // matches the data-i attribute emitted by renderTranscriptSpans
+    return `tl.fromTo('${sel}', { opacity: 0, y: 10 }, { opacity: 1, y: 0, duration: 0.18, ease: 'power2.out' }, ${start});`; // sel is wrapped in single quotes, so targetSelector must not contain one
+  }).join("\n      "); // newline plus six spaces between the generated statements
+}
 function buildPreset(input3) {
   const id = input3.id;
   const scope = `[data-composition-id="${id}"]`;
@@ -446325,7 +446452,15 @@ function buildPreset(input3) {
   const backdropMarkup = `<div class="backdrop"></div>`;
   switch (input3.preset) {
     case "simple": {
-      const caption = subhead || headline;
+      const transcript = input3.transcript;
+      const useWordSync = !!(transcript && transcript.length > 0);
+      const captionText = subhead || headline;
+      const captionInner = useWordSync ? renderTranscriptSpans(transcript) : esc(captionText);
+      const wordCss = useWordSync ? `
+      ${scope} .caption .word { display: inline-block; opacity: 0; }` : "";
+      const timeline = useWordSync ? `${buildTranscriptTweens(transcript, `${scope} .caption .word`)}
+      tl.to('${scope} .caption', { opacity: 0, duration: 0.4, ease: 'power2.in' }, ${(dur - 0.4).toFixed(2)});` : `tl.from('${scope} .caption', { opacity: 0, y: 28, duration: 0.6, ease: 'power2.out' }, 0.1);
+      tl.to('${scope} .caption', { opacity: 0, duration: 0.4, ease: 'power2.in' }, ${(dur - 0.4).toFixed(2)});`;
       return {
         css: `${scope} {
         position: absolute; inset: 0; width: 100%; height: 100%;
@@ -446341,11 +446476,10 @@ function buildPreset(input3) {
         font-weight: 700;
         line-height: 1.2;
         text-shadow: 0 4px 20px rgba(0,0,0,0.65);
-      }`,
+      }${wordCss}`,
         body: `${backdropMarkup}
-    <div class="caption" id="caption">${esc(caption)}</div>`,
-        timeline: `tl.from('${scope} .caption', { opacity: 0, y: 28, duration: 0.6, ease: 'power2.out' }, 0.1);
-      tl.to('${scope} .caption', { opacity: 0, duration: 0.4, ease: 'power2.in' }, ${(dur - 0.4).toFixed(2)});`
+    <div class="caption" id="caption">${captionInner}</div>`,
+        timeline
       };
     }
     case "announcement": {
@@ -446381,6 +446515,12 @@ function buildPreset(input3) {
     case "explainer": {
       const k = kicker || humanise(id).toUpperCase();
       const sub = subhead || "";
+      const transcript = input3.transcript;
+      const useWordSync = !!(transcript && transcript.length > 0 && sub);
+      const subtitleInner = useWordSync ? renderTranscriptSpans(transcript) : esc(sub);
+      const wordCss = useWordSync ? `
+      ${scope} #subtitle .word { display: inline-block; opacity: 0; }` : "";
+      const subtitleTween = useWordSync ? buildTranscriptTweens(transcript, `${scope} #subtitle .word`) : sub ? `tl.from('${scope} #subtitle', { opacity: 0, y: 30, duration: 0.55, ease: 'power3.out' }, 0.55);` : "";
       return {
         css: `${scope} {
         position: absolute; inset: 0; width: 100%; height: 100%;
@@ -446403,23 +446543,28 @@ function buildPreset(input3) {
       }
       ${scope} .subtitle {
         font-size: 38px; font-weight: 300; color: #c0c0d0; max-width: 80%;
-      }`,
+      }${wordCss}`,
         body: `${backdropMarkup}
     <div class="stage">
       <div class="kicker" id="kicker">${esc(k)}</div>
       <h1 class="title" id="title">${esc(headline)}</h1>${sub ? `
-      <div class="subtitle" id="subtitle">${esc(sub)}</div>` : ""}
+      <div class="subtitle" id="subtitle">${subtitleInner}</div>` : ""}
     </div>`,
         timeline: `tl.from('${scope} #kicker', { opacity: 0, y: 16, duration: 0.4, ease: 'power2.out' }, 0.1);
       tl.from('${scope} #title', { opacity: 0, y: 60, duration: 0.7, ease: 'power3.out' }, 0.25);
-      ${sub ? `tl.from('${scope} #subtitle', { opacity: 0, y: 30, duration: 0.55, ease: 'power3.out' }, 0.55);` : ""}`
+      ${subtitleTween}`
       };
     }
     case "kinetic-type": {
-      const words = headline.split(/\s+/).filter(Boolean);
-      const wordSpans = words.map((w, i) => `<span class="word" id="w-${i}">${esc(w)}</span>`).join(" ");
+      const transcript = input3.transcript;
+      const useWordSync = !!(transcript && transcript.length > 0);
+      const words = useWordSync ? transcript.map((w) => w.text) : headline.split(/\s+/).filter(Boolean);
+      const wordSpans = words.map((w, i) => `<span class="word" data-i="${i}" id="w-${i}">${esc(w)}</span>`).join(" ");
       const stagger = Math.max(0.08, Math.min(0.3, (dur - 0.6) / Math.max(words.length, 1)));
-      const tweens = words.map((_, i) => {
+      const tweens = useWordSync ? transcript.map((w, i) => {
+        const start = Math.max(0, Number(w.start.toFixed(3)));
+        return `tl.from('${scope} #w-${i}', { opacity: 0, y: 80, scale: 0.8, duration: 0.35, ease: 'back.out(1.8)' }, ${start});`;
+      }).join("\n      ") : words.map((_, i) => {
         const start = (0.05 + i * stagger).toFixed(2);
         return `tl.from('${scope} #w-${i}', { opacity: 0, y: 80, scale: 0.8, duration: 0.45, ease: 'back.out(1.8)' }, ${start});`;
       }).join("\n      ");
@@ -450197,12 +450342,12 @@ function resolveProvider(category) {
   if (!candidates) return null;
   if (configDefaults?.[category]) {
     const preferred = candidates.find((c) => c.name === configDefaults[category]);
-    if (preferred && hasApiKey(preferred.envVar)) {
+    if (preferred && (preferred.envVar === null || hasApiKey(preferred.envVar))) {
       return { name: preferred.name, label: preferred.label };
     }
   }
   for (const candidate of candidates) {
-    if (hasApiKey(candidate.envVar)) {
+    if (candidate.envVar === null || hasApiKey(candidate.envVar)) {
       return { name: candidate.name, label: candidate.label };
     }
   }
@@ -450225,7 +450370,8 @@ var init_provider_resolver = __esm({
       { name: "runway", envVar: "RUNWAY_API_SECRET", label: "Runway" }
     ];
     SPEECH_PROVIDERS = [
-      { name: "elevenlabs", envVar: "ELEVENLABS_API_KEY", label: "ElevenLabs" }
+      { name: "elevenlabs", envVar: "ELEVENLABS_API_KEY", label: "ElevenLabs" },
+      { name: "kokoro", envVar: null, label: "Kokoro (local)" }
     ];
     PROVIDER_MAP = {
       image: IMAGE_PROVIDERS,
@@ -459473,12 +459619,67 @@ init_source();
 init_ora();
 var import_yaml6 = __toESM(require_dist14(), 1);
 init_dist2();
+import { basename as basename17, resolve as resolve38, relative as relative7, dirname as dirname24 } from "node:path";
+import { mkdir as mkdir19, readFile as readFile22, writeFile as writeFile25, access as access5, copyFile as copyFile4 } from "node:fs/promises";
+import { existsSync as existsSync37 } from "node:fs";
+// ../cli/src/commands/_shared/tts-resolve.ts
+init_dist2();
+init_api_key();
+init_api_key();
+async function resolveTtsProvider(preferred = "auto") { // Picks a TTS backend and returns the { provider, audioExtension, call } object built for it.
+  const choice = preferred === "auto" ? hasApiKey("ELEVENLABS_API_KEY") ? "elevenlabs" : "kokoro" : preferred; // "auto": ElevenLabs when hasApiKey reports the key, else Kokoro; any other value is used as given
+  if (choice === "elevenlabs") {
+    return buildElevenLabs(); // may reject with TtsKeyMissingError
+  }
+  return buildKokoro(); // every choice other than "elevenlabs" lands here, not only "kokoro"
+}
+async function buildElevenLabs() { // Builds the ElevenLabs resolution object; throws TtsKeyMissingError when no API key is obtained.
+  const key2 = await getApiKey("ELEVENLABS_API_KEY", "ElevenLabs"); // getApiKey is defined outside this view
+  if (!key2) {
+    throw new TtsKeyMissingError("elevenlabs");
+  }
+  const provider = new ElevenLabsProvider();
+  await provider.initialize({ apiKey: key2 });
+  const call = async (text, opts) => provider.textToSpeech(text, { // adapter: exposes the same (text, opts) shape as the Kokoro call
+    voiceId: opts?.voice, // the generic `voice` option is forwarded as `voiceId`
+    speed: opts?.speed
+  });
+  return { provider: "elevenlabs", audioExtension: "mp3", call }; // audioExtension is used by the caller to name the narration file
+}
+async function buildKokoro() { // Builds the Kokoro resolution object; no API key lookup is performed.
+  const provider = new KokoroProvider();
+  await provider.initialize({}); // KokoroProvider.initialize has an empty body, so this has no effect beyond awaiting
+  const call = async (text, opts) => provider.textToSpeech(text, { // adapter with the same (text, opts) shape as the ElevenLabs call
+    voice: opts?.voice,
+    speed: opts?.speed,
+    onProgress: opts?.onProgress // forwarded to loadModel() as the model-load progress callback
+  });
+  return { provider: "kokoro", audioExtension: "wav", call }; // audioExtension is used by the caller to name the narration file
+}
+var TtsKeyMissingError = class extends Error { // Error thrown by buildElevenLabs() when no API key is available; carries the provider name.
+  constructor(provider) {
+    super( // the "elevenlabs" message includes remediation hints; any other provider gets a generic message
+      provider === "elevenlabs" ? "ElevenLabs API key required (ELEVENLABS_API_KEY). Run 'vibe setup', set ELEVENLABS_API_KEY in .env, or pass --tts kokoro for local synthesis." : `Provider ${provider} is unavailable.`
+    );
+    this.provider = provider; // kept on the instance for callers that inspect which provider failed
+    this.name = "TtsKeyMissingError";
+  }
+};
+function parseTtsProviderName(value) { // Validates the --tts option value; returns "auto", "elevenlabs" or "kokoro", or throws Error on anything else.
+  if (!value) return "auto"; // undefined, null and "" all default to "auto"
+  if (value === "auto" || value === "elevenlabs" || value === "kokoro") { // strict equality: the match is case-sensitive
+    return value;
+  }
+  throw new Error( // plain Error; the visible caller in the "add" action catches it and reports a usage error
+    `Invalid --tts: ${value}. Valid: auto, elevenlabs, kokoro.`
+  );
+}
+// ../cli/src/commands/scene.ts
 init_scene_project();
 init_scene_html_emit();
 init_scene_lint();
-import { basename as basename17, resolve as resolve38, relative as relative7, dirname as dirname24 } from "node:path";
-import { mkdir as mkdir19, readFile as readFile22, writeFile as writeFile25, access as access5 } from "node:fs/promises";
-import { existsSync as existsSync37 } from "node:fs";
 init_output();
 init_api_key();
 init_audio();
@@ -459567,9 +459768,15 @@ sceneCommand.command("init").description("Scaffold a new scene project (or safel
     exitWithError(generalError(`Failed to scaffold: ${msg}`));
   }
 });
-sceneCommand.command("add").description("Add a new scene to a project: AI narration + image + per-scene HTML").argument("<name>", "Scene name (slugified into the composition id)").option("--style <preset>", `Style preset: ${SCENE_PRESETS.join(", ")}`, "simple").option("--narration <text>", "Narration text (or path to a .txt file). Drives TTS + scene duration.").option("-d, --duration <sec>", "Explicit scene duration in seconds (overrides narration audio)").option("--visuals <prompt>", "Image prompt \u2014 generates assets/scene-<id>.png via the configured image provider").option("--headline <text>", "Visible headline (defaults to the humanised scene name)").option("--kicker <text>", "Small label above the headline (explainer / product-shot)").option("--insert-into <path>", "Root composition file to update", "index.html").option("--project <dir>", "Project directory", ".").option("--image-provider <name>", "Image provider: gemini, openai", "gemini").option("--voice <id>", "ElevenLabs voice id or name").option("--no-audio", "Skip TTS even when --narration is provided (useful for tests/agent dry runs)").option("--no-image", "Skip image generation even when --visuals is provided").option("--force", "Overwrite an existing compositions/scene-<id>.html").option("--dry-run", "Preview parameters without writing files or calling APIs").action(async (name, options) => {
+sceneCommand.command("add").description("Add a new scene to a project: AI narration + image + per-scene HTML").argument("<name>", "Scene name (slugified into the composition id)").option("--style <preset>", `Style preset: ${SCENE_PRESETS.join(", ")}`, "simple").option("--narration <text>", "Narration text (or path to a .txt file). Drives TTS + scene duration.").option("--narration-file <path>", "Existing narration audio file (.wav/.mp3). Skips TTS \u2014 useful with hyperframes tts, Mac say, or other external tools.").option("-d, --duration <sec>", "Explicit scene duration in seconds (overrides narration audio)").option("--visuals <prompt>", "Image prompt \u2014 generates assets/scene-<id>.png via the configured image provider").option("--headline <text>", "Visible headline (defaults to the humanised scene name)").option("--kicker <text>", "Small label above the headline (explainer / product-shot)").option("--insert-into <path>", "Root composition file to update", "index.html").option("--project <dir>", "Project directory", ".").option("--image-provider <name>", "Image provider: gemini, openai", "gemini").option("--tts <provider>", "TTS provider: auto, elevenlabs, kokoro (default auto \u2014 picks ElevenLabs when key set, else Kokoro local)", "auto").option("--voice <id>", "Voice id (ElevenLabs name/id, or Kokoro id like af_heart, am_michael)").option("--no-audio", "Skip TTS even when --narration is provided (useful for tests/agent dry runs)").option("--no-image", "Skip image generation even when --visuals is provided").option("--no-transcribe", "Skip Whisper word-level transcribe step (no transcript-<id>.json emitted)").option("--transcribe-language <code>", "BCP-47 language code passed to Whisper (e.g. en, ko)").option("--force", "Overwrite an existing compositions/scene-<id>.html").option("--dry-run", "Preview parameters without writing files or calling APIs").action(async (name, options) => {
   if (options.style) options.style = validatePreset(options.style);
   if (options.duration !== void 0) options.duration = validateDuration(options.duration);
+  let tts;
+  try {
+    tts = parseTtsProviderName(options.tts);
+  } catch (error) {
+    exitWithError(usageError(error instanceof Error ? error.message : String(error)));
+  }
   if (options.dryRun) {
     const id = slugifySceneName(name);
     outputResult({
@@ -459587,6 +459794,7 @@ sceneCommand.command("add").description("Add a new scene to a project: AI narrat
         project: options.project,
         insertInto: options.insertInto,
         imageProvider: options.imageProvider,
+        tts,
         audio: options.audio,
         // commander sets `audio: false` when --no-audio is passed
         image: options.image
@@ -459600,6 +459808,7 @@ sceneCommand.command("add").description("Add a new scene to a project: AI narrat
       name,
       preset: options.style,
       narration: options.narration,
+      narrationFile: options.narrationFile,
       duration: options.duration,
       visuals: options.visuals,
       headline: options.headline,
@@ -459607,9 +459816,12 @@ sceneCommand.command("add").description("Add a new scene to a project: AI narrat
       projectDir: options.project,
       insertInto: options.insertInto,
       imageProvider: options.imageProvider,
+      tts,
       voice: options.voice,
       skipAudio: options.audio === false,
       skipImage: options.image === false,
+      skipTranscribe: options.transcribe === false,
+      transcribeLanguage: options.transcribeLanguage,
       force: !!options.force,
       onProgress: (msg) => {
         if (spinner2) spinner2.text = msg;
@@ -459633,6 +459845,7 @@ sceneCommand.command("add").description("Add a new scene to a project: AI narrat
     console.log(source_default.green("  +"), result.scenePath);
     if (result.audioPath) console.log(source_default.green("  +"), result.audioPath);
     if (result.imagePath) console.log(source_default.green("  +"), result.imagePath);
+    if (result.transcriptPath) console.log(source_default.green("  +"), result.transcriptPath);
     console.log(source_default.yellow("  ~"), result.rootPath, source_default.dim("(updated)"));
     console.log();
     console.log(source_default.bold.cyan("Composition"));
@@ -459721,19 +459934,49 @@ async function executeSceneAdd(opts) {
   let audioRelPath;
   let audioAbsPath;
   let narrationDuration;
-  if (narrationText && !opts.skipAudio) {
-    const elevenlabsKey = await getApiKey("ELEVENLABS_API_KEY", "ElevenLabs");
-    if (!elevenlabsKey) {
-      return errResult("ElevenLabs API key required for --narration. Set ELEVENLABS_API_KEY, run 'vibe setup', or pass --no-audio.");
+  if (opts.narrationFile && !opts.skipAudio) {
+    const sourceAbs = resolve38(opts.narrationFile);
+    if (!await pathExists2(sourceAbs)) {
+      return errResult(`Narration file not found: ${sourceAbs}`);
     }
-    opts.onProgress?.("Generating narration with ElevenLabs...");
-    const elevenlabs = new ElevenLabsProvider();
-    await elevenlabs.initialize({ apiKey: elevenlabsKey });
-    const tts = await elevenlabs.textToSpeech(narrationText, { voiceId: opts.voice });
+    const ext = (sourceAbs.match(/\.([a-z0-9]+)$/i)?.[1] ?? "wav").toLowerCase();
+    if (ext !== "wav" && ext !== "mp3") {
+      return errResult(`Unsupported narration file extension: .${ext}. Use .wav or .mp3.`);
+    }
+    audioRelPath = `assets/narration-${id}.${ext}`;
+    audioAbsPath = resolve38(projectDir, audioRelPath);
+    await mkdir19(dirname24(audioAbsPath), { recursive: true });
+    await copyFile4(sourceAbs, audioAbsPath);
+    try {
+      narrationDuration = await getAudioDuration(audioAbsPath);
+    } catch {
+      narrationDuration = void 0;
+    }
+  } else if (narrationText && !opts.skipAudio) {
+    let resolution;
+    try {
+      resolution = await resolveTtsProvider(opts.tts ?? "auto");
+    } catch (error) {
+      if (error instanceof TtsKeyMissingError) {
+        return errResult(error.message);
+      }
+      throw error;
+    }
+    opts.onProgress?.(
+      resolution.provider === "kokoro" ? "Generating narration with Kokoro (local \u2014 first run downloads ~330MB)..." : "Generating narration with ElevenLabs..."
+    );
+    const tts = await resolution.call(narrationText, {
+      voice: opts.voice,
+      onProgress: (event) => {
+        if (event.status === "progress" && typeof event.progress === "number") {
+          opts.onProgress?.(`Kokoro model: ${event.file ?? ""} ${Math.round(event.progress)}%`);
+        }
+      }
+    });
     if (!tts.success || !tts.audioBuffer) {
-      return errResult(`ElevenLabs TTS failed: ${tts.error ?? "unknown error"}`);
+      return errResult(`${resolution.provider} TTS failed: ${tts.error ?? "unknown error"}`);
     }
-    audioRelPath = `assets/narration-${id}.mp3`;
+    audioRelPath = `assets/narration-${id}.${resolution.audioExtension}`;
     audioAbsPath = resolve38(projectDir, audioRelPath);
     await mkdir19(dirname24(audioAbsPath), { recursive: true });
     await writeFile25(audioAbsPath, tts.audioBuffer);
@@ -459743,6 +459986,41 @@ async function executeSceneAdd(opts) {
       narrationDuration = void 0;
     }
   }
+  let transcriptRelPath;
+  let transcriptWordCount;
+  let transcriptWords;
+  if (audioAbsPath && !opts.skipTranscribe) {
+    const whisperKey = await getApiKey("OPENAI_API_KEY", "OpenAI");
+    if (!whisperKey) {
+      opts.onProgress?.(
+        "Skipping transcribe (OPENAI_API_KEY not set \u2014 narration plays but word-sync unavailable)"
+      );
+    } else {
+      opts.onProgress?.("Transcribing narration (Whisper word-level)...");
+      try {
+        const whisper = new WhisperProvider();
+        await whisper.initialize({ apiKey: whisperKey });
+        const audioBytes = await readFile22(audioAbsPath);
+        const audioBlob = new Blob([new Uint8Array(audioBytes)]);
+        const transcript = await whisper.transcribe(audioBlob, void 0, {
+          granularity: "word",
+          language: opts.transcribeLanguage
+        });
+        if (transcript.status === "completed" && transcript.words?.length) {
+          transcriptRelPath = `assets/transcript-${id}.json`;
+          const transcriptAbs = resolve38(projectDir, transcriptRelPath);
+          await writeFile25(transcriptAbs, JSON.stringify(transcript.words, null, 2), "utf-8");
+          transcriptWordCount = transcript.words.length;
+          transcriptWords = transcript.words.map((w) => ({ text: w.text, start: w.start, end: w.end }));
+        } else if (transcript.status === "failed") {
+          opts.onProgress?.(`Transcribe failed: ${transcript.error ?? "unknown error"}`);
+        }
+      } catch (error) {
+        const msg = error instanceof Error ? error.message : String(error);
+        opts.onProgress?.(`Transcribe failed: ${msg}`);
+      }
+    }
+  }
   let imageRelPath;
   let imageAbsPath;
   if (opts.visuals && !opts.skipImage) {
@@ -459812,7 +460090,8 @@ async function executeSceneAdd(opts) {
     subhead: narrationText,
     kicker: opts.kicker,
     imagePath: imageRelPath,
-    audioPath: audioRelPath
+    audioPath: audioRelPath,
+    transcript: transcriptWords
   });
   await mkdir19(dirname24(scenePath), { recursive: true });
   await writeFile25(scenePath, sceneHtml, "utf-8");
@@ -459820,6 +460099,7 @@ async function executeSceneAdd(opts) {
   const start = nextSceneStart(rootHtmlBefore);
   const updated = insertClipIntoRoot(rootHtmlBefore, { id, start, duration });
   await writeFile25(rootPath, updated, "utf-8");
+  const transcriptAbsPath = transcriptRelPath ? resolve38(projectDir, transcriptRelPath) : void 0;
   return {
     success: true,
     id,
@@ -459829,7 +460109,9 @@ async function executeSceneAdd(opts) {
     scenePath: relative7(process.cwd(), scenePath) || scenePath,
     rootPath: relative7(process.cwd(), rootPath) || rootPath,
     audioPath: audioAbsPath ? relative7(process.cwd(), audioAbsPath) || audioAbsPath : void 0,
-    imagePath: imageAbsPath ? relative7(process.cwd(), imageAbsPath) || imageAbsPath : void 0
+    imagePath: imageAbsPath ? relative7(process.cwd(), imageAbsPath) || imageAbsPath : void 0,
+    transcriptPath: transcriptAbsPath ? relative7(process.cwd(), transcriptAbsPath) || transcriptAbsPath : void 0,
+    transcriptWordCount
   };
 }
 sceneCommand.command("lint").description("Validate scene HTML against Hyperframes rules (in-process, no Chrome required)").argument("[root]", "Root composition file relative to --project", "index.html").option("--project <dir>", "Project directory", ".").option("--fix", 'Apply mechanical auto-fixes (currently: missing class="clip")').action(async (root2, options) => {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@vibeframe/mcp-server",
-  "version": "0.53.0",
+  "version": "0.54.0",
   "description": "VibeFrame MCP Server - AI-native video editing via Model Context Protocol",
   "type": "module",
   "bin": {
@@ -57,8 +57,8 @@
     "tsx": "^4.21.0",
     "typescript": "^5.3.3",
     "vitest": "^1.2.2",
-    "@vibeframe/cli": "0.53.0",
-    "@vibeframe/core": "0.53.0"
+    "@vibeframe/cli": "0.54.0",
+    "@vibeframe/core": "0.54.0"
   },
   "engines": {
     "node": ">=20"