@vibeframe/mcp-server 0.53.0 → 0.54.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/dist/index.js +324 -42
  2. package/package.json +3 -3
package/dist/index.js CHANGED
@@ -428532,7 +428532,7 @@ var init_WhisperProvider = __esm({
428532
428532
  isConfigured() {
428533
428533
  return !!this.apiKey;
428534
428534
  }
428535
- async transcribe(audio, language) {
428535
+ async transcribe(audio, language, options) {
428536
428536
  if (!this.apiKey) {
428537
428537
  return {
428538
428538
  id: "",
@@ -428540,14 +428540,21 @@ var init_WhisperProvider = __esm({
428540
428540
  error: "Whisper API key not configured"
428541
428541
  };
428542
428542
  }
428543
+ const granularity = options?.granularity ?? "segment";
428543
428544
  try {
428544
428545
  const formData = new FormData();
428545
428546
  formData.append("file", audio, "audio.webm");
428546
428547
  formData.append("model", "whisper-1");
428547
428548
  formData.append("response_format", "verbose_json");
428548
- formData.append("timestamp_granularities[]", "segment");
428549
- if (language) {
428550
- formData.append("language", language);
428549
+ if (granularity === "segment" || granularity === "both") {
428550
+ formData.append("timestamp_granularities[]", "segment");
428551
+ }
428552
+ if (granularity === "word" || granularity === "both") {
428553
+ formData.append("timestamp_granularities[]", "word");
428554
+ }
428555
+ const lang = language ?? options?.language;
428556
+ if (lang) {
428557
+ formData.append("language", lang);
428551
428558
  }
428552
428559
  const response = await fetch(`${this.baseUrl}/audio/transcriptions`, {
428553
428560
  method: "POST",
@@ -428565,20 +428572,30 @@ var init_WhisperProvider = __esm({
428565
428572
  };
428566
428573
  }
428567
428574
  const data = await response.json();
428568
- return {
428575
+ const result = {
428569
428576
  id: crypto.randomUUID(),
428570
428577
  status: "completed",
428571
428578
  fullText: data.text,
428572
- detectedLanguage: data.language,
428573
- segments: data.segments?.map((seg, index) => ({
428579
+ detectedLanguage: data.language
428580
+ };
428581
+ if (granularity === "segment" || granularity === "both") {
428582
+ result.segments = data.segments?.map((seg, index) => ({
428574
428583
  id: `segment-${index}`,
428575
428584
  startTime: seg.start,
428576
428585
  endTime: seg.end,
428577
428586
  text: seg.text.trim(),
428578
428587
  confidence: 1
428579
- // Whisper doesn't provide confidence per segment
428580
- }))
428581
- };
428588
+ // Whisper doesn't provide per-segment confidence
428589
+ }));
428590
+ }
428591
+ if (granularity === "word" || granularity === "both") {
428592
+ result.words = data.words?.map((w) => ({
428593
+ text: w.word,
428594
+ start: w.start,
428595
+ end: w.end
428596
+ }));
428597
+ }
428598
+ return result;
428582
428599
  } catch (error) {
428583
428600
  return {
428584
428601
  id: "",
@@ -432768,6 +432785,101 @@ var init_elevenlabs = __esm({
432768
432785
  }
432769
432786
  });
432770
432787
 
432788
+ // ../ai-providers/dist/kokoro/KokoroProvider.js
432789
/**
 * Resolve the factory object used to construct the Kokoro speech model.
 *
 * A truthy module-level `factoryOverride` wins; otherwise the `kokoro-js`
 * package is imported on demand and its `KokoroTTS` export is returned.
 *
 * @returns {Promise<object>} the override, or `KokoroTTS` from `kokoro-js`.
 */
async function loadKokoroFactory() {
  if (!factoryOverride) {
    const { KokoroTTS } = await import("kokoro-js");
    return KokoroTTS;
  }
  return factoryOverride;
}
432795
/**
 * Return the shared promise for the Kokoro model, starting the load on first use.
 *
 * The promise is cached in the module-level `modelPromise`, so later callers
 * get the same promise back. When the load rejects, the cache is cleared so a
 * subsequent call starts a fresh load, and the rejection is re-thrown.
 *
 * @param {Function} [progress] - optional callback; it receives each loader
 *   event after normalisation by `normaliseEvent`. Only the callback supplied
 *   to the call that actually starts the load is wired in.
 * @returns {Promise<object>} promise for whatever `from_pretrained` resolves to.
 */
function loadModel(progress) {
  if (!modelPromise) {
    const startLoad = async () => {
      const factory = await loadKokoroFactory();
      const settings = {
        dtype: "q8",
        device: "cpu",
        progress_callback: progress ? (rawEvent) => progress(normaliseEvent(rawEvent)) : void 0
      };
      return factory.from_pretrained(KOKORO_MODEL_ID, settings);
    };
    modelPromise = startLoad().catch((loadError) => {
      // Forget the failed attempt so the next caller can retry.
      modelPromise = null;
      throw loadError;
    });
  }
  return modelPromise;
}
432811
/**
 * Convert a raw loader progress event into a predictable shape.
 *
 * Fields of the wrong type are dropped rather than passed through, so callers
 * can rely on `status` always being a string and the numeric fields being
 * either a finite number or `undefined`. Non-finite numbers (NaN, ±Infinity)
 * are treated as missing so they never reach percentage formatting.
 *
 * @param {object|null|undefined} raw2 - event as emitted by the model loader.
 * @returns {{status: string, file: (string|undefined), progress: (number|undefined),
 *   loaded: (number|undefined), total: (number|undefined)}} normalised event.
 */
function normaliseEvent(raw2) {
  const event = raw2 ?? {};
  // Number.isFinite is false for non-numbers as well as NaN/Infinity.
  const finiteOrUndefined = (value) => (Number.isFinite(value) ? value : void 0);
  return {
    status: typeof event.status === "string" ? event.status : "unknown",
    file: typeof event.file === "string" ? event.file : void 0,
    progress: finiteOrUndefined(event.progress),
    loaded: finiteOrUndefined(event.loaded),
    total: finiteOrUndefined(event.total)
  };
}
432821
// Module-level bindings for the Kokoro provider; they are assigned when
// init_KokoroProvider() runs.
var KOKORO_DEFAULT_VOICE, KOKORO_MODEL_ID, modelPromise, factoryOverride, KokoroProvider, kokoroProvider;
var init_KokoroProvider = __esm({
  "../ai-providers/dist/kokoro/KokoroProvider.js"() {
    "use strict";
    KOKORO_DEFAULT_VOICE = "af_heart";
    KOKORO_MODEL_ID = "onnx-community/Kokoro-82M-v1.0-ONNX";
    // No model load in flight and no factory override until something sets them.
    modelPromise = null;
    factoryOverride = null;
    KokoroProvider = class {
      constructor() {
        Object.assign(this, {
          id: "kokoro",
          name: "Kokoro (local)",
          description: "Local text-to-speech via Kokoro-82M (Apache 2.0)",
          capabilities: ["text-to-speech"],
          iconUrl: "/icons/kokoro.svg",
          isAvailable: true
        });
      }
      /** No-op: this provider takes no configuration. */
      async initialize(_config) {
      }
      /** Always reports configured; no API key is checked. */
      isConfigured() {
        return true;
      }
      /**
       * Turn text into speech with the shared Kokoro model.
       *
       * Never rejects for synthesis problems: failures are reported as
       * `{ success: false, error }`. On success the result carries the bytes
       * of `audio.toWav()` as `audioBuffer` plus the input's `characterCount`.
       *
       * @param {string} text - text to speak; empty/whitespace-only is refused.
       * @param {object} [options] - `voice` (defaults to KOKORO_DEFAULT_VOICE),
       *   `speed` (defaults to 1) and `onProgress` (forwarded to `loadModel`).
       */
      async textToSpeech(text, options = {}) {
        const isBlank = !text || !text.trim();
        if (isBlank) {
          return { success: false, error: "Empty text" };
        }
        try {
          const model = await loadModel(options.onProgress);
          const voice = options.voice ?? KOKORO_DEFAULT_VOICE;
          const speed = options.speed ?? 1;
          const audio = await model.generate(text, { voice, speed });
          return {
            success: true,
            audioBuffer: Buffer.from(audio.toWav()),
            characterCount: text.length
          };
        } catch (error) {
          const message = error instanceof Error ? error.message : "Unknown error";
          return { success: false, error: message };
        }
      }
    };
    kokoroProvider = new KokoroProvider();
  }
});
432874
+
432875
+ // ../ai-providers/dist/kokoro/index.js
432876
// Initialiser for the kokoro package entry point: its body only triggers the
// KokoroProvider module initialiser. (`__esm` is the bundler's wrapper,
// defined elsewhere in this bundle.)
var init_kokoro = __esm({
  "../ai-providers/dist/kokoro/index.js"() {
    "use strict";
    init_KokoroProvider();
  }
});
432882
+
432771
432883
  // ../ai-providers/dist/openai-image/OpenAIImageProvider.js
432772
432884
  var DEFAULT_MODEL, OpenAIImageProvider, openaiImageProvider;
432773
432885
  var init_OpenAIImageProvider = __esm({
@@ -434730,7 +434842,10 @@ __export(dist_exports2, {
434730
434842
  GeminiProvider: () => GeminiProvider,
434731
434843
  GrokProvider: () => GrokProvider,
434732
434844
  KNOWN_VOICES: () => KNOWN_VOICES,
434845
+ KOKORO_DEFAULT_VOICE: () => KOKORO_DEFAULT_VOICE,
434846
+ KOKORO_MODEL_ID: () => KOKORO_MODEL_ID,
434733
434847
  KlingProvider: () => KlingProvider,
434848
+ KokoroProvider: () => KokoroProvider,
434734
434849
  OllamaProvider: () => OllamaProvider,
434735
434850
  OpenAIImageProvider: () => OpenAIImageProvider,
434736
434851
  OpenAIProvider: () => OpenAIProvider,
@@ -434743,6 +434858,7 @@ __export(dist_exports2, {
434743
434858
  getBestProviderForCapability: () => getBestProviderForCapability,
434744
434859
  grokProvider: () => grokProvider,
434745
434860
  klingProvider: () => klingProvider,
434861
+ kokoroProvider: () => kokoroProvider,
434746
434862
  ollamaProvider: () => ollamaProvider,
434747
434863
  openaiImageProvider: () => openaiImageProvider,
434748
434864
  openaiProvider: () => openaiProvider,
@@ -434763,6 +434879,7 @@ var init_dist2 = __esm({
434763
434879
  init_claude();
434764
434880
  init_ollama();
434765
434881
  init_elevenlabs();
434882
+ init_kokoro();
434766
434883
  init_openai_image();
434767
434884
  init_runway();
434768
434885
  init_kling();
@@ -442748,8 +442865,8 @@ async function extendVideoNaturally(videoPath, targetDuration, outputPath) {
442748
442865
  const videoDuration = await getVideoDuration(videoPath);
442749
442866
  const ratio = targetDuration / videoDuration;
442750
442867
  if (ratio <= 1) {
442751
- const { copyFile: copyFile4 } = await import("node:fs/promises");
442752
- await copyFile4(videoPath, outputPath);
442868
+ const { copyFile: copyFile5 } = await import("node:fs/promises");
442869
+ await copyFile5(videoPath, outputPath);
442753
442870
  return;
442754
442871
  }
442755
442872
  if (ratio <= 1.15) {
@@ -446295,6 +446412,16 @@ function slugifySceneName(name) {
446295
446412
  const slug = normalised.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "");
446296
446413
  return slug || "scene";
446297
446414
  }
446415
/**
 * Render a word-level transcript as space-separated `<span class="word">`
 * elements. Each span carries its position in `data-i` and its text passed
 * through `esc`.
 *
 * @param {Array<{text: string}>} transcript - words in playback order.
 * @returns {string} HTML fragment.
 */
function renderTranscriptSpans(transcript) {
  const spans = transcript.map((word, index) => {
    const safeText = esc(word.text);
    return `<span class="word" data-i="${index}">${safeText}</span>`;
  });
  return spans.join(" ");
}
446418
/**
 * Build one `tl.fromTo(...)` statement per transcript word, positioned on the
 * timeline at the word's start time (rounded to milliseconds, never negative).
 *
 * The start time is coerced to a number first; a missing or non-finite value
 * falls back to 0 instead of throwing or writing `NaN` into the generated
 * script.
 *
 * @param {Array<{start: number}>} transcript - words in playback order.
 * @param {string} targetSelector - selector matching every word element; the
 *   word index is appended as a `[data-i="…"]` attribute selector.
 * @returns {string} statements joined with a newline and a space.
 */
function buildTranscriptTweens(transcript, targetSelector) {
  return transcript.map((word, index) => {
    const rawStart = Number(word.start);
    const start = Number.isFinite(rawStart) ? Math.max(0, Number(rawStart.toFixed(3))) : 0;
    const sel = `${targetSelector}[data-i="${index}"]`;
    return `tl.fromTo('${sel}', { opacity: 0, y: 10 }, { opacity: 1, y: 0, duration: 0.18, ease: 'power2.out' }, ${start});`;
  }).join("\n ");
}
446298
446425
  function buildPreset(input3) {
446299
446426
  const id = input3.id;
446300
446427
  const scope = `[data-composition-id="${id}"]`;
@@ -446325,7 +446452,15 @@ function buildPreset(input3) {
446325
446452
  const backdropMarkup = `<div class="backdrop"></div>`;
446326
446453
  switch (input3.preset) {
446327
446454
  case "simple": {
446328
- const caption = subhead || headline;
446455
+ const transcript = input3.transcript;
446456
+ const useWordSync = !!(transcript && transcript.length > 0);
446457
+ const captionText = subhead || headline;
446458
+ const captionInner = useWordSync ? renderTranscriptSpans(transcript) : esc(captionText);
446459
+ const wordCss = useWordSync ? `
446460
+ ${scope} .caption .word { display: inline-block; opacity: 0; }` : "";
446461
+ const timeline = useWordSync ? `${buildTranscriptTweens(transcript, `${scope} .caption .word`)}
446462
+ tl.to('${scope} .caption', { opacity: 0, duration: 0.4, ease: 'power2.in' }, ${(dur - 0.4).toFixed(2)});` : `tl.from('${scope} .caption', { opacity: 0, y: 28, duration: 0.6, ease: 'power2.out' }, 0.1);
446463
+ tl.to('${scope} .caption', { opacity: 0, duration: 0.4, ease: 'power2.in' }, ${(dur - 0.4).toFixed(2)});`;
446329
446464
  return {
446330
446465
  css: `${scope} {
446331
446466
  position: absolute; inset: 0; width: 100%; height: 100%;
@@ -446341,11 +446476,10 @@ function buildPreset(input3) {
446341
446476
  font-weight: 700;
446342
446477
  line-height: 1.2;
446343
446478
  text-shadow: 0 4px 20px rgba(0,0,0,0.65);
446344
- }`,
446479
+ }${wordCss}`,
446345
446480
  body: `${backdropMarkup}
446346
- <div class="caption" id="caption">${esc(caption)}</div>`,
446347
- timeline: `tl.from('${scope} .caption', { opacity: 0, y: 28, duration: 0.6, ease: 'power2.out' }, 0.1);
446348
- tl.to('${scope} .caption', { opacity: 0, duration: 0.4, ease: 'power2.in' }, ${(dur - 0.4).toFixed(2)});`
446481
+ <div class="caption" id="caption">${captionInner}</div>`,
446482
+ timeline
446349
446483
  };
446350
446484
  }
446351
446485
  case "announcement": {
@@ -446381,6 +446515,12 @@ function buildPreset(input3) {
446381
446515
  case "explainer": {
446382
446516
  const k = kicker || humanise(id).toUpperCase();
446383
446517
  const sub = subhead || "";
446518
+ const transcript = input3.transcript;
446519
+ const useWordSync = !!(transcript && transcript.length > 0 && sub);
446520
+ const subtitleInner = useWordSync ? renderTranscriptSpans(transcript) : esc(sub);
446521
+ const wordCss = useWordSync ? `
446522
+ ${scope} #subtitle .word { display: inline-block; opacity: 0; }` : "";
446523
+ const subtitleTween = useWordSync ? buildTranscriptTweens(transcript, `${scope} #subtitle .word`) : sub ? `tl.from('${scope} #subtitle', { opacity: 0, y: 30, duration: 0.55, ease: 'power3.out' }, 0.55);` : "";
446384
446524
  return {
446385
446525
  css: `${scope} {
446386
446526
  position: absolute; inset: 0; width: 100%; height: 100%;
@@ -446403,23 +446543,28 @@ function buildPreset(input3) {
446403
446543
  }
446404
446544
  ${scope} .subtitle {
446405
446545
  font-size: 38px; font-weight: 300; color: #c0c0d0; max-width: 80%;
446406
- }`,
446546
+ }${wordCss}`,
446407
446547
  body: `${backdropMarkup}
446408
446548
  <div class="stage">
446409
446549
  <div class="kicker" id="kicker">${esc(k)}</div>
446410
446550
  <h1 class="title" id="title">${esc(headline)}</h1>${sub ? `
446411
- <div class="subtitle" id="subtitle">${esc(sub)}</div>` : ""}
446551
+ <div class="subtitle" id="subtitle">${subtitleInner}</div>` : ""}
446412
446552
  </div>`,
446413
446553
  timeline: `tl.from('${scope} #kicker', { opacity: 0, y: 16, duration: 0.4, ease: 'power2.out' }, 0.1);
446414
446554
  tl.from('${scope} #title', { opacity: 0, y: 60, duration: 0.7, ease: 'power3.out' }, 0.25);
446415
- ${sub ? `tl.from('${scope} #subtitle', { opacity: 0, y: 30, duration: 0.55, ease: 'power3.out' }, 0.55);` : ""}`
446555
+ ${subtitleTween}`
446416
446556
  };
446417
446557
  }
446418
446558
  case "kinetic-type": {
446419
- const words = headline.split(/\s+/).filter(Boolean);
446420
- const wordSpans = words.map((w, i) => `<span class="word" id="w-${i}">${esc(w)}</span>`).join(" ");
446559
+ const transcript = input3.transcript;
446560
+ const useWordSync = !!(transcript && transcript.length > 0);
446561
+ const words = useWordSync ? transcript.map((w) => w.text) : headline.split(/\s+/).filter(Boolean);
446562
+ const wordSpans = words.map((w, i) => `<span class="word" data-i="${i}" id="w-${i}">${esc(w)}</span>`).join(" ");
446421
446563
  const stagger = Math.max(0.08, Math.min(0.3, (dur - 0.6) / Math.max(words.length, 1)));
446422
- const tweens = words.map((_, i) => {
446564
+ const tweens = useWordSync ? transcript.map((w, i) => {
446565
+ const start = Math.max(0, Number(w.start.toFixed(3)));
446566
+ return `tl.from('${scope} #w-${i}', { opacity: 0, y: 80, scale: 0.8, duration: 0.35, ease: 'back.out(1.8)' }, ${start});`;
446567
+ }).join("\n ") : words.map((_, i) => {
446423
446568
  const start = (0.05 + i * stagger).toFixed(2);
446424
446569
  return `tl.from('${scope} #w-${i}', { opacity: 0, y: 80, scale: 0.8, duration: 0.45, ease: 'back.out(1.8)' }, ${start});`;
446425
446570
  }).join("\n ");
@@ -450197,12 +450342,12 @@ function resolveProvider(category) {
450197
450342
  if (!candidates) return null;
450198
450343
  if (configDefaults?.[category]) {
450199
450344
  const preferred = candidates.find((c) => c.name === configDefaults[category]);
450200
- if (preferred && hasApiKey(preferred.envVar)) {
450345
+ if (preferred && (preferred.envVar === null || hasApiKey(preferred.envVar))) {
450201
450346
  return { name: preferred.name, label: preferred.label };
450202
450347
  }
450203
450348
  }
450204
450349
  for (const candidate of candidates) {
450205
- if (hasApiKey(candidate.envVar)) {
450350
+ if (candidate.envVar === null || hasApiKey(candidate.envVar)) {
450206
450351
  return { name: candidate.name, label: candidate.label };
450207
450352
  }
450208
450353
  }
@@ -450225,7 +450370,8 @@ var init_provider_resolver = __esm({
450225
450370
  { name: "runway", envVar: "RUNWAY_API_SECRET", label: "Runway" }
450226
450371
  ];
450227
450372
  SPEECH_PROVIDERS = [
450228
- { name: "elevenlabs", envVar: "ELEVENLABS_API_KEY", label: "ElevenLabs" }
450373
+ { name: "elevenlabs", envVar: "ELEVENLABS_API_KEY", label: "ElevenLabs" },
450374
+ { name: "kokoro", envVar: null, label: "Kokoro (local)" }
450229
450375
  ];
450230
450376
  PROVIDER_MAP = {
450231
450377
  image: IMAGE_PROVIDERS,
@@ -459473,12 +459619,67 @@ init_source();
459473
459619
  init_ora();
459474
459620
  var import_yaml6 = __toESM(require_dist14(), 1);
459475
459621
  init_dist2();
459622
+ import { basename as basename17, resolve as resolve38, relative as relative7, dirname as dirname24 } from "node:path";
459623
+ import { mkdir as mkdir19, readFile as readFile22, writeFile as writeFile25, access as access5, copyFile as copyFile4 } from "node:fs/promises";
459624
+ import { existsSync as existsSync37 } from "node:fs";
459625
+
459626
+ // ../cli/src/commands/_shared/tts-resolve.ts
459627
+ init_dist2();
459628
+ init_api_key();
459629
+ init_api_key();
459630
/**
 * Pick and construct the TTS backend for a narration request.
 *
 * With "auto", ElevenLabs is chosen when `hasApiKey("ELEVENLABS_API_KEY")`
 * is truthy and Kokoro otherwise. Any value other than "elevenlabs" ends up
 * on the Kokoro path.
 *
 * @param {string} [preferred="auto"] - "auto", "elevenlabs" or "kokoro".
 * @returns {Promise<object>} result of `buildElevenLabs()` or `buildKokoro()`.
 */
async function resolveTtsProvider(preferred = "auto") {
  let choice = preferred;
  if (preferred === "auto") {
    choice = hasApiKey("ELEVENLABS_API_KEY") ? "elevenlabs" : "kokoro";
  }
  return choice === "elevenlabs" ? buildElevenLabs() : buildKokoro();
}
459637
/**
 * Build the ElevenLabs TTS resolution.
 *
 * Fetches the key via `getApiKey`, initialises an `ElevenLabsProvider` with
 * it, and returns a `call` wrapper that maps the generic `voice` option onto
 * the provider's `voiceId`.
 *
 * @returns {Promise<{provider: string, audioExtension: string, call: Function}>}
 * @throws {TtsKeyMissingError} when no key is obtained.
 */
async function buildElevenLabs() {
  const apiKey = await getApiKey("ELEVENLABS_API_KEY", "ElevenLabs");
  if (!apiKey) {
    throw new TtsKeyMissingError("elevenlabs");
  }
  const provider = new ElevenLabsProvider();
  await provider.initialize({ apiKey });
  const call = async (text, opts) => {
    const request = { voiceId: opts?.voice, speed: opts?.speed };
    return provider.textToSpeech(text, request);
  };
  return { provider: "elevenlabs", audioExtension: "mp3", call };
}
459650
/**
 * Build the Kokoro TTS resolution.
 *
 * Creates and initialises a `KokoroProvider` (no key lookup is performed) and
 * returns a `call` wrapper that forwards `voice`, `speed` and `onProgress`.
 *
 * @returns {Promise<{provider: string, audioExtension: string, call: Function}>}
 */
async function buildKokoro() {
  const provider = new KokoroProvider();
  await provider.initialize({});
  const call = async (text, opts) => {
    const request = {
      voice: opts?.voice,
      speed: opts?.speed,
      onProgress: opts?.onProgress
    };
    return provider.textToSpeech(text, request);
  };
  return { provider: "kokoro", audioExtension: "wav", call };
}
459660
/**
 * Raised when a TTS provider cannot be used. For "elevenlabs" the message
 * explains how to supply the key or switch to local synthesis; any other
 * provider gets a generic "unavailable" message. The offending provider name
 * is kept on `provider`.
 */
var TtsKeyMissingError = class extends Error {
  constructor(provider) {
    const message = provider === "elevenlabs"
      ? "ElevenLabs API key required (ELEVENLABS_API_KEY). Run 'vibe setup', set ELEVENLABS_API_KEY in .env, or pass --tts kokoro for local synthesis."
      : `Provider ${provider} is unavailable.`;
    super(message);
    this.provider = provider;
    this.name = "TtsKeyMissingError";
  }
};
459669
/**
 * Validate the value of the `--tts` option.
 *
 * @param {string} [value] - raw option value; any falsy value means "auto".
 * @returns {string} "auto", "elevenlabs" or "kokoro".
 * @throws {Error} when the value is not one of the accepted names
 *   (matching is exact and case-sensitive).
 */
function parseTtsProviderName(value) {
  if (!value) {
    return "auto";
  }
  const accepted = ["auto", "elevenlabs", "kokoro"];
  if (accepted.includes(value)) {
    return value;
  }
  throw new Error(
    `Invalid --tts: ${value}. Valid: auto, elevenlabs, kokoro.`
  );
}
459678
+
459679
+ // ../cli/src/commands/scene.ts
459476
459680
  init_scene_project();
459477
459681
  init_scene_html_emit();
459478
459682
  init_scene_lint();
459479
- import { basename as basename17, resolve as resolve38, relative as relative7, dirname as dirname24 } from "node:path";
459480
- import { mkdir as mkdir19, readFile as readFile22, writeFile as writeFile25, access as access5 } from "node:fs/promises";
459481
- import { existsSync as existsSync37 } from "node:fs";
459482
459683
  init_output();
459483
459684
  init_api_key();
459484
459685
  init_audio();
@@ -459567,9 +459768,15 @@ sceneCommand.command("init").description("Scaffold a new scene project (or safel
459567
459768
  exitWithError(generalError(`Failed to scaffold: ${msg}`));
459568
459769
  }
459569
459770
  });
459570
- sceneCommand.command("add").description("Add a new scene to a project: AI narration + image + per-scene HTML").argument("<name>", "Scene name (slugified into the composition id)").option("--style <preset>", `Style preset: ${SCENE_PRESETS.join(", ")}`, "simple").option("--narration <text>", "Narration text (or path to a .txt file). Drives TTS + scene duration.").option("-d, --duration <sec>", "Explicit scene duration in seconds (overrides narration audio)").option("--visuals <prompt>", "Image prompt \u2014 generates assets/scene-<id>.png via the configured image provider").option("--headline <text>", "Visible headline (defaults to the humanised scene name)").option("--kicker <text>", "Small label above the headline (explainer / product-shot)").option("--insert-into <path>", "Root composition file to update", "index.html").option("--project <dir>", "Project directory", ".").option("--image-provider <name>", "Image provider: gemini, openai", "gemini").option("--voice <id>", "ElevenLabs voice id or name").option("--no-audio", "Skip TTS even when --narration is provided (useful for tests/agent dry runs)").option("--no-image", "Skip image generation even when --visuals is provided").option("--force", "Overwrite an existing compositions/scene-<id>.html").option("--dry-run", "Preview parameters without writing files or calling APIs").action(async (name, options) => {
459771
+ sceneCommand.command("add").description("Add a new scene to a project: AI narration + image + per-scene HTML").argument("<name>", "Scene name (slugified into the composition id)").option("--style <preset>", `Style preset: ${SCENE_PRESETS.join(", ")}`, "simple").option("--narration <text>", "Narration text (or path to a .txt file). Drives TTS + scene duration.").option("--narration-file <path>", "Existing narration audio file (.wav/.mp3). Skips TTS \u2014 useful with hyperframes tts, Mac say, or other external tools.").option("-d, --duration <sec>", "Explicit scene duration in seconds (overrides narration audio)").option("--visuals <prompt>", "Image prompt \u2014 generates assets/scene-<id>.png via the configured image provider").option("--headline <text>", "Visible headline (defaults to the humanised scene name)").option("--kicker <text>", "Small label above the headline (explainer / product-shot)").option("--insert-into <path>", "Root composition file to update", "index.html").option("--project <dir>", "Project directory", ".").option("--image-provider <name>", "Image provider: gemini, openai", "gemini").option("--tts <provider>", "TTS provider: auto, elevenlabs, kokoro (default auto \u2014 picks ElevenLabs when key set, else Kokoro local)", "auto").option("--voice <id>", "Voice id (ElevenLabs name/id, or Kokoro id like af_heart, am_michael)").option("--no-audio", "Skip TTS even when --narration is provided (useful for tests/agent dry runs)").option("--no-image", "Skip image generation even when --visuals is provided").option("--no-transcribe", "Skip Whisper word-level transcribe step (no transcript-<id>.json emitted)").option("--transcribe-language <code>", "BCP-47 language code passed to Whisper (e.g. en, ko)").option("--force", "Overwrite an existing compositions/scene-<id>.html").option("--dry-run", "Preview parameters without writing files or calling APIs").action(async (name, options) => {
459571
459772
  if (options.style) options.style = validatePreset(options.style);
459572
459773
  if (options.duration !== void 0) options.duration = validateDuration(options.duration);
459774
+ let tts;
459775
+ try {
459776
+ tts = parseTtsProviderName(options.tts);
459777
+ } catch (error) {
459778
+ exitWithError(usageError(error instanceof Error ? error.message : String(error)));
459779
+ }
459573
459780
  if (options.dryRun) {
459574
459781
  const id = slugifySceneName(name);
459575
459782
  outputResult({
@@ -459587,6 +459794,7 @@ sceneCommand.command("add").description("Add a new scene to a project: AI narrat
459587
459794
  project: options.project,
459588
459795
  insertInto: options.insertInto,
459589
459796
  imageProvider: options.imageProvider,
459797
+ tts,
459590
459798
  audio: options.audio,
459591
459799
  // commander sets `audio: false` when --no-audio is passed
459592
459800
  image: options.image
@@ -459600,6 +459808,7 @@ sceneCommand.command("add").description("Add a new scene to a project: AI narrat
459600
459808
  name,
459601
459809
  preset: options.style,
459602
459810
  narration: options.narration,
459811
+ narrationFile: options.narrationFile,
459603
459812
  duration: options.duration,
459604
459813
  visuals: options.visuals,
459605
459814
  headline: options.headline,
@@ -459607,9 +459816,12 @@ sceneCommand.command("add").description("Add a new scene to a project: AI narrat
459607
459816
  projectDir: options.project,
459608
459817
  insertInto: options.insertInto,
459609
459818
  imageProvider: options.imageProvider,
459819
+ tts,
459610
459820
  voice: options.voice,
459611
459821
  skipAudio: options.audio === false,
459612
459822
  skipImage: options.image === false,
459823
+ skipTranscribe: options.transcribe === false,
459824
+ transcribeLanguage: options.transcribeLanguage,
459613
459825
  force: !!options.force,
459614
459826
  onProgress: (msg) => {
459615
459827
  if (spinner2) spinner2.text = msg;
@@ -459633,6 +459845,7 @@ sceneCommand.command("add").description("Add a new scene to a project: AI narrat
459633
459845
  console.log(source_default.green(" +"), result.scenePath);
459634
459846
  if (result.audioPath) console.log(source_default.green(" +"), result.audioPath);
459635
459847
  if (result.imagePath) console.log(source_default.green(" +"), result.imagePath);
459848
+ if (result.transcriptPath) console.log(source_default.green(" +"), result.transcriptPath);
459636
459849
  console.log(source_default.yellow(" ~"), result.rootPath, source_default.dim("(updated)"));
459637
459850
  console.log();
459638
459851
  console.log(source_default.bold.cyan("Composition"));
@@ -459721,19 +459934,49 @@ async function executeSceneAdd(opts) {
459721
459934
  let audioRelPath;
459722
459935
  let audioAbsPath;
459723
459936
  let narrationDuration;
459724
- if (narrationText && !opts.skipAudio) {
459725
- const elevenlabsKey = await getApiKey("ELEVENLABS_API_KEY", "ElevenLabs");
459726
- if (!elevenlabsKey) {
459727
- return errResult("ElevenLabs API key required for --narration. Set ELEVENLABS_API_KEY, run 'vibe setup', or pass --no-audio.");
459937
+ if (opts.narrationFile && !opts.skipAudio) {
459938
+ const sourceAbs = resolve38(opts.narrationFile);
459939
+ if (!await pathExists2(sourceAbs)) {
459940
+ return errResult(`Narration file not found: ${sourceAbs}`);
459728
459941
  }
459729
- opts.onProgress?.("Generating narration with ElevenLabs...");
459730
- const elevenlabs = new ElevenLabsProvider();
459731
- await elevenlabs.initialize({ apiKey: elevenlabsKey });
459732
- const tts = await elevenlabs.textToSpeech(narrationText, { voiceId: opts.voice });
459942
+ const ext = (sourceAbs.match(/\.([a-z0-9]+)$/i)?.[1] ?? "wav").toLowerCase();
459943
+ if (ext !== "wav" && ext !== "mp3") {
459944
+ return errResult(`Unsupported narration file extension: .${ext}. Use .wav or .mp3.`);
459945
+ }
459946
+ audioRelPath = `assets/narration-${id}.${ext}`;
459947
+ audioAbsPath = resolve38(projectDir, audioRelPath);
459948
+ await mkdir19(dirname24(audioAbsPath), { recursive: true });
459949
+ await copyFile4(sourceAbs, audioAbsPath);
459950
+ try {
459951
+ narrationDuration = await getAudioDuration(audioAbsPath);
459952
+ } catch {
459953
+ narrationDuration = void 0;
459954
+ }
459955
+ } else if (narrationText && !opts.skipAudio) {
459956
+ let resolution;
459957
+ try {
459958
+ resolution = await resolveTtsProvider(opts.tts ?? "auto");
459959
+ } catch (error) {
459960
+ if (error instanceof TtsKeyMissingError) {
459961
+ return errResult(error.message);
459962
+ }
459963
+ throw error;
459964
+ }
459965
+ opts.onProgress?.(
459966
+ resolution.provider === "kokoro" ? "Generating narration with Kokoro (local \u2014 first run downloads ~330MB)..." : "Generating narration with ElevenLabs..."
459967
+ );
459968
+ const tts = await resolution.call(narrationText, {
459969
+ voice: opts.voice,
459970
+ onProgress: (event) => {
459971
+ if (event.status === "progress" && typeof event.progress === "number") {
459972
+ opts.onProgress?.(`Kokoro model: ${event.file ?? ""} ${Math.round(event.progress)}%`);
459973
+ }
459974
+ }
459975
+ });
459733
459976
  if (!tts.success || !tts.audioBuffer) {
459734
- return errResult(`ElevenLabs TTS failed: ${tts.error ?? "unknown error"}`);
459977
+ return errResult(`${resolution.provider} TTS failed: ${tts.error ?? "unknown error"}`);
459735
459978
  }
459736
- audioRelPath = `assets/narration-${id}.mp3`;
459979
+ audioRelPath = `assets/narration-${id}.${resolution.audioExtension}`;
459737
459980
  audioAbsPath = resolve38(projectDir, audioRelPath);
459738
459981
  await mkdir19(dirname24(audioAbsPath), { recursive: true });
459739
459982
  await writeFile25(audioAbsPath, tts.audioBuffer);
@@ -459743,6 +459986,41 @@ async function executeSceneAdd(opts) {
459743
459986
  narrationDuration = void 0;
459744
459987
  }
459745
459988
  }
459989
+ let transcriptRelPath;
459990
+ let transcriptWordCount;
459991
+ let transcriptWords;
459992
+ if (audioAbsPath && !opts.skipTranscribe) {
459993
+ const whisperKey = await getApiKey("OPENAI_API_KEY", "OpenAI");
459994
+ if (!whisperKey) {
459995
+ opts.onProgress?.(
459996
+ "Skipping transcribe (OPENAI_API_KEY not set \u2014 narration plays but word-sync unavailable)"
459997
+ );
459998
+ } else {
459999
+ opts.onProgress?.("Transcribing narration (Whisper word-level)...");
460000
+ try {
460001
+ const whisper = new WhisperProvider();
460002
+ await whisper.initialize({ apiKey: whisperKey });
460003
+ const audioBytes = await readFile22(audioAbsPath);
460004
+ const audioBlob = new Blob([new Uint8Array(audioBytes)]);
460005
+ const transcript = await whisper.transcribe(audioBlob, void 0, {
460006
+ granularity: "word",
460007
+ language: opts.transcribeLanguage
460008
+ });
460009
+ if (transcript.status === "completed" && transcript.words?.length) {
460010
+ transcriptRelPath = `assets/transcript-${id}.json`;
460011
+ const transcriptAbs = resolve38(projectDir, transcriptRelPath);
460012
+ await writeFile25(transcriptAbs, JSON.stringify(transcript.words, null, 2), "utf-8");
460013
+ transcriptWordCount = transcript.words.length;
460014
+ transcriptWords = transcript.words.map((w) => ({ text: w.text, start: w.start, end: w.end }));
460015
+ } else if (transcript.status === "failed") {
460016
+ opts.onProgress?.(`Transcribe failed: ${transcript.error ?? "unknown error"}`);
460017
+ }
460018
+ } catch (error) {
460019
+ const msg = error instanceof Error ? error.message : String(error);
460020
+ opts.onProgress?.(`Transcribe failed: ${msg}`);
460021
+ }
460022
+ }
460023
+ }
459746
460024
  let imageRelPath;
459747
460025
  let imageAbsPath;
459748
460026
  if (opts.visuals && !opts.skipImage) {
@@ -459812,7 +460090,8 @@ async function executeSceneAdd(opts) {
459812
460090
  subhead: narrationText,
459813
460091
  kicker: opts.kicker,
459814
460092
  imagePath: imageRelPath,
459815
- audioPath: audioRelPath
460093
+ audioPath: audioRelPath,
460094
+ transcript: transcriptWords
459816
460095
  });
459817
460096
  await mkdir19(dirname24(scenePath), { recursive: true });
459818
460097
  await writeFile25(scenePath, sceneHtml, "utf-8");
@@ -459820,6 +460099,7 @@ async function executeSceneAdd(opts) {
459820
460099
  const start = nextSceneStart(rootHtmlBefore);
459821
460100
  const updated = insertClipIntoRoot(rootHtmlBefore, { id, start, duration });
459822
460101
  await writeFile25(rootPath, updated, "utf-8");
460102
+ const transcriptAbsPath = transcriptRelPath ? resolve38(projectDir, transcriptRelPath) : void 0;
459823
460103
  return {
459824
460104
  success: true,
459825
460105
  id,
@@ -459829,7 +460109,9 @@ async function executeSceneAdd(opts) {
459829
460109
  scenePath: relative7(process.cwd(), scenePath) || scenePath,
459830
460110
  rootPath: relative7(process.cwd(), rootPath) || rootPath,
459831
460111
  audioPath: audioAbsPath ? relative7(process.cwd(), audioAbsPath) || audioAbsPath : void 0,
459832
- imagePath: imageAbsPath ? relative7(process.cwd(), imageAbsPath) || imageAbsPath : void 0
460112
+ imagePath: imageAbsPath ? relative7(process.cwd(), imageAbsPath) || imageAbsPath : void 0,
460113
+ transcriptPath: transcriptAbsPath ? relative7(process.cwd(), transcriptAbsPath) || transcriptAbsPath : void 0,
460114
+ transcriptWordCount
459833
460115
  };
459834
460116
  }
459835
460117
  sceneCommand.command("lint").description("Validate scene HTML against Hyperframes rules (in-process, no Chrome required)").argument("[root]", "Root composition file relative to --project", "index.html").option("--project <dir>", "Project directory", ".").option("--fix", 'Apply mechanical auto-fixes (currently: missing class="clip")').action(async (root2, options) => {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vibeframe/mcp-server",
3
- "version": "0.53.0",
3
+ "version": "0.54.0",
4
4
  "description": "VibeFrame MCP Server - AI-native video editing via Model Context Protocol",
5
5
  "type": "module",
6
6
  "bin": {
@@ -57,8 +57,8 @@
57
57
  "tsx": "^4.21.0",
58
58
  "typescript": "^5.3.3",
59
59
  "vitest": "^1.2.2",
60
- "@vibeframe/cli": "0.53.0",
61
- "@vibeframe/core": "0.53.0"
60
+ "@vibeframe/cli": "0.54.0",
61
+ "@vibeframe/core": "0.54.0"
62
62
  },
63
63
  "engines": {
64
64
  "node": ">=20"