@vibeframe/mcp-server 0.104.2 → 0.105.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/index.js +303 -46
  2. package/package.json +3 -3
package/dist/index.js CHANGED
@@ -7613,7 +7613,7 @@ YAML cues that drive narration, backdrop generation, and timing.
7613
7613
 
7614
7614
  \`\`\`yaml
7615
7615
  narration: "Introduce the promise in one crisp sentence."
7616
- backdrop: "Cinematic abstract technology backdrop, precise light, premium editorial feel"
7616
+ backdrop: "Topic-aligned editorial background plate, abstract visual system, no readable text, no logos, no consumer products, clean negative space for HTML overlays"
7617
7617
  duration: 4
7618
7618
  \`\`\`
7619
7619
 
@@ -7624,7 +7624,7 @@ screen and one spoken breath.
7624
7624
 
7625
7625
  \`\`\`yaml
7626
7626
  narration: "Show the mechanism or proof point that makes the promise believable."
7627
- backdrop: "Layered interface details, subtle motion trails, high-contrast product storytelling"
7627
+ backdrop: "Topic-aligned analytical background plate, abstract dashboard structure, no readable text, no product photos, no shoes, no unrelated objects"
7628
7628
  duration: 4
7629
7629
  \`\`\`
7630
7630
 
@@ -7635,7 +7635,7 @@ before/after.
7635
7635
 
7636
7636
  \`\`\`yaml
7637
7637
  narration: "Close with the action the viewer should remember."
7638
- backdrop: "Resolved hero frame, confident final composition, clean negative space"
7638
+ backdrop: "Resolved editorial background plate, confident final composition, clean negative space, no readable text, no logos, no unrelated products"
7639
7639
  duration: 4
7640
7640
  \`\`\`
7641
7641
 
@@ -7688,6 +7688,33 @@ consult this file \u2014 run the generate command directly.
7688
7688
  Browse named styles: \`vibe scene list-styles\`. Re-seed from one with
7689
7689
  \`vibe scene init . --visual-style "Swiss Pulse"\` (idempotent).
7690
7690
 
7691
+ ## Brief and local media
7692
+
7693
+ \`brief.md\` is raw intent, not a strict schema. It may contain messy notes,
7694
+ pasted research, links, product requirements, or a one-line idea. \`vibe init
7695
+ --from brief.md\` uses it only to seed \`STORYBOARD.md\` and \`DESIGN.md\`;
7696
+ after init, those two files are the working source of truth.
7697
+
7698
+ Use \`media/\` for user-provided source files: product photos, screenshots,
7699
+ logos, B-roll, recorded narration, or reference clips. Keep those inputs
7700
+ inside this project so build references stay project-relative. Do not put user
7701
+ media in \`references/\`; that directory is reserved for local composition
7702
+ rules installed by VibeFrame.
7703
+
7704
+ When a beat should reuse a local file, reference it from \`STORYBOARD.md\`
7705
+ with a project-relative path:
7706
+
7707
+ \`\`\`yaml
7708
+ backdrop: "media/product-shot.png" # existing still image
7709
+ video: "media/broll.mp4" # existing video/B-roll
7710
+ narration: "media/voice.wav" # existing recorded narration
7711
+ asset: "media/logo.png" # generic local asset reference
7712
+ \`\`\`
7713
+
7714
+ Use text cues when you want VibeFrame to generate an asset. Use path cues
7715
+ when you want VibeFrame to reuse a local file. Avoid absolute paths or parent
7716
+ directory references; copy files into \`media/\` first.
7717
+
7691
7718
  ## Provider keys and project scope
7692
7719
 
7693
7720
  Use VibeFrame CLI generation for project assets:
@@ -7735,9 +7762,11 @@ the framework-level minimum, not the cinematic craft layer.
7735
7762
 
7736
7763
  - \`DESIGN.md\` \u2014 visual identity contract (palette, type, motion, transitions)
7737
7764
  - \`STORYBOARD.md\` \u2014 per-beat narration/backdrop/duration cues for \`vibe build\`
7765
+ - \`media/\` \u2014 user-provided source files (photos, logos, clips, voice recordings)
7738
7766
  - \`index.html\` \u2014 root composition (timeline)
7739
7767
  - \`compositions/scene-*.html\` \u2014 per-scene HTML authored by you or the agent
7740
- - \`assets/\` \u2014 shared media (narration audio, images, video)
7768
+ - \`assets/\` \u2014 generated/canonical build media (narration audio, images, video)
7769
+ - \`references/\` \u2014 composition rule docs installed by VibeFrame, not user media
7741
7770
  - \`transcript.json\` \u2014 Whisper word-level transcript (if narration exists)
7742
7771
  - \`hyperframes.json\` \u2014 HF registry config (speak to both toolchains)
7743
7772
  - \`vibe.config.json\` \u2014 canonical VibeFrame config (providers, budget)
@@ -7768,7 +7797,11 @@ npx hyperframes render
7768
7797
  \`\`\`
7769
7798
  4. Videos use \`muted\` with a separate \`<audio>\` element for the audio track.
7770
7799
  5. Sub-compositions use \`data-composition-src="compositions/file.html"\`.
7771
- 6. Only deterministic logic \u2014 no \`Date.now()\`, \`Math.random()\`, or network fetches.
7800
+ 6. For render-stable text, do not apply continuous \`scale\`, \`x\`, \`y\`, or
7801
+ \`filter\` tweens to \`.scene-content\` or any ancestor containing live text.
7802
+ Animate background/media layers instead; text/cards should enter briefly and
7803
+ then hold still at their final CSS positions.
7804
+ 7. Only deterministic logic \u2014 no \`Date.now()\`, \`Math.random()\`, or network fetches.
7772
7805
 
7773
7806
  ## Linting \u2014 run after changes
7774
7807
 
@@ -7878,7 +7911,11 @@ async function scaffoldSceneProject(opts) {
7878
7911
  if (await pathExists(metaPath)) {
7879
7912
  skipped2.push(metaPath);
7880
7913
  } else {
7881
- await writeFile(metaPath, JSON.stringify(buildHyperframesMeta(name, now), null, 2) + "\n", "utf-8");
7914
+ await writeFile(
7915
+ metaPath,
7916
+ JSON.stringify(buildHyperframesMeta(name, now), null, 2) + "\n",
7917
+ "utf-8"
7918
+ );
7882
7919
  created.push(metaPath);
7883
7920
  }
7884
7921
  const rootPath = resolve2(dir, "index.html");
@@ -7893,11 +7930,7 @@ async function scaffoldSceneProject(opts) {
7893
7930
  if (await pathExists(vibeConfigJsonPath)) {
7894
7931
  skipped2.push(vibeConfigJsonPath);
7895
7932
  } else {
7896
- await writeFile(
7897
- vibeConfigJsonPath,
7898
- projectConfigJson({ name, aspect }),
7899
- "utf-8"
7900
- );
7933
+ await writeFile(vibeConfigJsonPath, projectConfigJson({ name, aspect }), "utf-8");
7901
7934
  created.push(vibeConfigJsonPath);
7902
7935
  }
7903
7936
  const vibePath = resolve2(dir, "vibe.project.yaml");
@@ -7936,11 +7969,7 @@ ${existing}`, "utf-8");
7936
7969
  if (await pathExists(designPath)) {
7937
7970
  skipped2.push(designPath);
7938
7971
  } else {
7939
- await writeFile(
7940
- designPath,
7941
- buildDesignMd({ name, style: opts.visualStyle }),
7942
- "utf-8"
7943
- );
7972
+ await writeFile(designPath, buildDesignMd({ name, style: opts.visualStyle }), "utf-8");
7944
7973
  created.push(designPath);
7945
7974
  }
7946
7975
  const storyboardPath = resolve2(dir, "STORYBOARD.md");
@@ -24322,6 +24351,53 @@ function pickReferenceImageUrl(input3) {
24322
24351
  return input3;
24323
24352
  return void 0;
24324
24353
  }
24354
/**
 * Reports whether a reference value is an `http://` URL, an `https://` URL,
 * or a `data:` URI.
 *
 * @param {unknown} value - Candidate reference URL.
 * @returns {boolean} `true` only for strings with one of the accepted prefixes.
 */
function isFalFileInput(value) {
  // A reference with a missing or non-string `url` is simply unusable; report
  // that instead of throwing a TypeError from `startsWith`.
  if (typeof value !== "string")
    return false;
  return value.startsWith("http://") || value.startsWith("https://") || value.startsWith("data:");
}
24357
/**
 * Prepares reference entries for a request.
 *
 * Entries whose `url` fails `isFalFileInput` are dropped. Entries whose `url`
 * is a `data:` URI are uploaded through `uploadDataUri` and get the upload
 * result as their new `url`; all other accepted entries keep their `url`.
 * Every returned entry is a shallow copy of its input.
 *
 * @param {Iterable<{url: string}>|null|undefined} input3 - Reference entries.
 * @param {object} client - Client handed to `uploadDataUri` for uploads.
 * @returns {Promise<object[]>} Accepted entries, in their original order.
 */
async function normaliseReferences(input3, client) {
  if (!input3)
    return [];
  const accepted = [...input3].filter((ref) => isFalFileInput(ref.url));
  // The uploads do not depend on each other, so start them together.
  // Promise.all preserves the input order in its result.
  return Promise.all(
    accepted.map(async (ref) => ({
      ...ref,
      url: ref.url.startsWith("data:") ? await uploadDataUri(client, ref.url) : ref.url
    }))
  );
}
24371
/**
 * Converts a `data:` URI to a Blob and uploads it via `client.storage.upload`,
 * requesting a one-hour lifecycle.
 *
 * @param {object} client - Object exposing `storage.upload(blob, options)`.
 * @param {string} dataUri - The `data:` URI to upload.
 * @returns {Promise<*>} Whatever `client.storage.upload` resolves to.
 */
async function uploadDataUri(client, dataUri) {
  const payload = dataUriToBlob(dataUri);
  const uploadOptions = { lifecycle: { expiresIn: "1h" } };
  return client.storage.upload(payload, uploadOptions);
}
24377
/**
 * Decodes a `data:` URI into a Blob.
 *
 * The header between `data:` and the first comma is split on `;`. The first
 * segment is the media type (defaulting to `application/octet-stream`), and a
 * trailing `base64` segment selects base64 decoding. Other parameters such as
 * `charset=utf-8` or `codecs=opus` are accepted and ignored, so data URIs with
 * media-type parameters are no longer rejected.
 *
 * @param {string} dataUri - A `data:` URI.
 * @returns {Blob} Blob holding the decoded bytes, typed with the media type.
 * @throws {Error} "Invalid data URI reference." when the input is not a
 *   `data:` URI containing a comma separator.
 */
function dataUriToBlob(dataUri) {
  const prefix = "data:";
  const isDataUri = typeof dataUri === "string" && dataUri.startsWith(prefix);
  const commaIndex = isDataUri ? dataUri.indexOf(",") : -1;
  if (commaIndex === -1)
    throw new Error("Invalid data URI reference.");
  const header = dataUri.slice(prefix.length, commaIndex);
  const payload = dataUri.slice(commaIndex + 1);
  const [mediaType, ...params] = header.split(";");
  const mimeType = mediaType.trim() || "application/octet-stream";
  // Per RFC 2397 the base64 marker is the last header segment.
  const lastParam = params.length > 0 ? params[params.length - 1].trim().toLowerCase() : "";
  const isBase64 = lastParam === "base64";
  const buffer = isBase64 ? Buffer.from(payload, "base64") : Buffer.from(decodeURIComponent(payload), "utf-8");
  return new Blob([new Uint8Array(buffer)], { type: mimeType });
}
24387
/**
 * Buckets reference URLs by their `kind`.
 *
 * @param {Iterable<{kind: string, url: string}>} references - Entries to sort.
 * @returns {{image_urls: string[], video_urls: string[], audio_urls: string[]}}
 *   URLs per kind in encounter order; entries of any other kind are ignored.
 */
function groupReferences(references) {
  const grouped = { image_urls: [], video_urls: [], audio_urls: [] };
  for (const ref of references) {
    switch (ref.kind) {
      case "image":
        grouped.image_urls.push(ref.url);
        break;
      case "video":
        grouped.video_urls.push(ref.url);
        break;
      case "audio":
        grouped.audio_urls.push(ref.url);
        break;
      default:
        break;
    }
  }
  return grouped;
}
24325
24401
  function normaliseAspect(value) {
24326
24402
  if (!value)
24327
24403
  return "auto";
@@ -24344,7 +24420,7 @@ function normaliseDuration(value) {
24344
24420
  return "auto";
24345
24421
  return Math.max(4, Math.min(15, Math.round(value)));
24346
24422
  }
24347
- var import_client, ENDPOINT_TEXT_TO_VIDEO, ENDPOINT_IMAGE_TO_VIDEO, DEFAULT_VARIANT, VALID_RESOLUTIONS, VALID_ASPECTS, FalProvider, falProvider;
24423
+ var import_client, ENDPOINT_TEXT_TO_VIDEO, ENDPOINT_IMAGE_TO_VIDEO, ENDPOINT_REFERENCE_TO_VIDEO, DEFAULT_VARIANT, VALID_RESOLUTIONS, VALID_ASPECTS, FalProvider, falProvider;
24348
24424
  var init_FalProvider = __esm({
24349
24425
  "../ai-providers/dist/fal/FalProvider.js"() {
24350
24426
  "use strict";
@@ -24357,6 +24433,10 @@ var init_FalProvider = __esm({
24357
24433
  "seedance-2.0": "bytedance/seedance-2.0/image-to-video",
24358
24434
  "seedance-2.0-fast": "bytedance/seedance-2.0/fast/image-to-video"
24359
24435
  };
24436
+ ENDPOINT_REFERENCE_TO_VIDEO = {
24437
+ "seedance-2.0": "bytedance/seedance-2.0/reference-to-video",
24438
+ "seedance-2.0-fast": "bytedance/seedance-2.0/fast/reference-to-video"
24439
+ };
24360
24440
  DEFAULT_VARIANT = "seedance-2.0";
24361
24441
  VALID_RESOLUTIONS = ["480p", "720p", "1080p"];
24362
24442
  VALID_ASPECTS = ["21:9", "16:9", "4:3", "1:1", "3:4", "9:16", "auto"];
@@ -24365,7 +24445,7 @@ var init_FalProvider = __esm({
24365
24445
  this.id = "seedance";
24366
24446
  this.name = "fal.ai (Seedance 2.0)";
24367
24447
  this.description = "fal.ai hosting ByteDance Seedance 2.0 \u2014 Artificial Analysis #2 on both text-to-video and image-to-video leaderboards";
24368
- this.capabilities = ["text-to-video", "image-to-video"];
24448
+ this.capabilities = ["text-to-video", "image-to-video", "reference-to-video"];
24369
24449
  this.iconUrl = "/icons/fal.svg";
24370
24450
  this.isAvailable = true;
24371
24451
  }
@@ -24404,9 +24484,11 @@ var init_FalProvider = __esm({
24404
24484
  error: `Unknown Seedance variant: ${variant}. Valid: ${Object.keys(ENDPOINT_TEXT_TO_VIDEO).join(", ")}.`
24405
24485
  };
24406
24486
  }
24407
- const referenceImage = pickReferenceImageUrl(options?.referenceImage);
24487
+ const references = await normaliseReferences(options?.references, this.client);
24488
+ const hasReferences = references.length > 0;
24489
+ const referenceImage = hasReferences ? void 0 : pickReferenceImageUrl(options?.referenceImage);
24408
24490
  const isImageToVideo = !!referenceImage;
24409
- const endpointId = isImageToVideo ? ENDPOINT_IMAGE_TO_VIDEO[variant] : ENDPOINT_TEXT_TO_VIDEO[variant];
24491
+ const endpointId = hasReferences ? ENDPOINT_REFERENCE_TO_VIDEO[variant] : isImageToVideo ? ENDPOINT_IMAGE_TO_VIDEO[variant] : ENDPOINT_TEXT_TO_VIDEO[variant];
24410
24492
  const aspect = normaliseAspect(options?.aspectRatio);
24411
24493
  const resolution = normaliseResolution(options?.resolution);
24412
24494
  const duration = normaliseDuration(options?.duration);
@@ -24416,13 +24498,26 @@ var init_FalProvider = __esm({
24416
24498
  resolution,
24417
24499
  duration
24418
24500
  };
24419
- if (referenceImage)
24501
+ if (hasReferences) {
24502
+ const grouped = groupReferences(references);
24503
+ if (grouped.image_urls.length > 0)
24504
+ input3.image_urls = grouped.image_urls;
24505
+ if (grouped.video_urls.length > 0)
24506
+ input3.video_urls = grouped.video_urls;
24507
+ if (grouped.audio_urls.length > 0)
24508
+ input3.audio_urls = grouped.audio_urls;
24509
+ } else if (referenceImage) {
24420
24510
  input3.image_url = referenceImage;
24511
+ }
24421
24512
  if (options?.negativePrompt)
24422
24513
  input3.negative_prompt = options.negativePrompt;
24423
24514
  if (typeof options?.seed === "number")
24424
24515
  input3.seed = options.seed;
24425
- if (options?.lastFrame)
24516
+ if (typeof options?.generateAudio === "boolean")
24517
+ input3.generate_audio = options.generateAudio;
24518
+ if (options?.endUserId)
24519
+ input3.end_user_id = options.endUserId;
24520
+ if (!hasReferences && options?.lastFrame)
24426
24521
  input3.end_image_url = options.lastFrame;
24427
24522
  try {
24428
24523
  const out = await this.client.subscribe(endpointId, { input: input3, logs: false });
@@ -24469,14 +24564,15 @@ var init_fal = __esm({
24469
24564
  // Review this alias at the 1.0 cut.
24470
24565
  aliases: ["fal"],
24471
24566
  models: ["seedance-2.0", "seedance-2.0-fast"],
24472
- capabilities: ["text-to-video", "image-to-video", "native-audio"],
24567
+ capabilities: ["text-to-video", "image-to-video", "reference-to-video", "native-audio"],
24473
24568
  apiKey: "fal",
24474
24569
  kinds: ["video"],
24475
24570
  resolverPriority: { video: 1 },
24476
24571
  commandsUnlocked: [
24477
24572
  "generate video -p seedance (Seedance 2.0 via fal.ai \u2014 default since v0.57)",
24478
24573
  "generate video -p seedance --seedance-model fast (lower-latency variant)",
24479
- "generate video -p seedance -i <image> (image-to-video)"
24574
+ "generate video -p seedance -i <image> (image-to-video)",
24575
+ "generate video -p seedance --ref-images <images...> (reference-to-video)"
24480
24576
  ]
24481
24577
  });
24482
24578
  }
@@ -284700,7 +284796,7 @@ ${newComment.split("\n").map((c) => ` * ${c}`).join("\n")}
284700
284796
  /*mapfn*/
284701
284797
  (name) => ts_FindAllReferences_exports.getReferenceEntriesForNode(-1, name, program2, program2.getSourceFiles(), cancellationToken)
284702
284798
  );
284703
- const groupedReferences = groupReferences(references);
284799
+ const groupedReferences = groupReferences2(references);
284704
284800
  if (!every(
284705
284801
  groupedReferences.declarations,
284706
284802
  /*callback*/
@@ -284709,7 +284805,7 @@ ${newComment.split("\n").map((c) => ` * ${c}`).join("\n")}
284709
284805
  groupedReferences.valid = false;
284710
284806
  }
284711
284807
  return groupedReferences;
284712
- function groupReferences(referenceEntries) {
284808
+ function groupReferences2(referenceEntries) {
284713
284809
  const classReferences = { accessExpressions: [], typeUsages: [] };
284714
284810
  const groupedReferences2 = { functionCalls: [], declarations: [], classReferences, valid: true };
284715
284811
  const functionSymbols = map3(functionNames, getSymbolTargetAtLocation);
@@ -449179,13 +449275,18 @@ Requirements (non-negotiable):
449179
449275
  producer's seek lands past the timeline's natural end and visibility state
449180
449276
  goes stale \u2014 the hold phase renders BLACK. Anchor the timeline to the full
449181
449277
  beat duration via either:
449182
- 1. A subtle idle motion spanning 0\u2192duration on a parent element, e.g.
449183
- \`tl.fromTo(".scene-content", { scale: 1.0 }, { scale: 1.015, duration: <beat>, ease: "none" }, 0);\`
449278
+ 1. A subtle idle motion spanning 0\u2192duration on a background/media layer,
449279
+ e.g. \`tl.fromTo(".backdrop", { scale: 1.0 }, { scale: 1.015, duration: <beat>, ease: "none" }, 0);\`
449184
449280
  (Ken-Burns, breathing opacity, gradient drift \u2014 should be barely
449185
449281
  perceptible so it doesn't compete with entry/exit beats).
449186
449282
  2. OR an explicit \`tl.set(target, { ...natural state... }, <beat - 0.001>)\`
449187
449283
  anchor at the end.
449188
449284
  This is the #2 source of "text disappears mid-beat" bugs after \`.clip\` sizing.
449285
+ - Do not apply continuous \`scale\`, \`x\`, \`y\`, \`filter\`, or other transform
449286
+ tweens to \`.scene-content\` or any ancestor that contains live text/cards.
449287
+ Animate the backdrop/media plane instead; let text enter briefly, then hold
449288
+ still at its final CSS position. Continuous transforms on text ancestors can
449289
+ create subpixel shimmer in screenshot-captured renders.
449189
449290
  - Timed children inside the composition have \`class="clip"\` plus
449190
449291
  \`data-start\`, \`data-duration\`, \`data-track-index\`.
449191
449292
  - If \`assets/backdrop-${ctx.beat.id}.png\` exists, use that local file as the
@@ -449255,7 +449356,9 @@ Reference shape (verbatim \u2014 match this skeleton exactly, no DOCTYPE / html
449255
449356
  const tl = gsap.timeline({ paused: true });
449256
449357
  // Idle motion spanning full beat duration \u2014 required to keep timeline
449257
449358
  // length aligned with data-duration (otherwise hold phase goes black).
449258
- tl.fromTo(".scene-content", { scale: 1.0 }, { scale: 1.015, duration: <sec>, ease: "none" }, 0);
449359
+ // Keep continuous motion on the background/media layer so live text does
449360
+ // not shimmer from subpixel resampling.
449361
+ tl.fromTo(".backdrop", { scale: 1.0 }, { scale: 1.015, duration: <sec>, ease: "none" }, 0);
449259
449362
  // entry tweens
449260
449363
  window.__timelines["${compositionId}"] = tl;
449261
449364
  </script>
@@ -451197,6 +451300,32 @@ var init_build_asset_metadata = __esm({
451197
451300
  }
451198
451301
  });
451199
451302
 
451303
+ // ../cli/src/commands/_shared/build-backdrop-prompt.ts
451304
/**
 * Wraps a storyboard backdrop cue in fixed guidance for image generation.
 *
 * The cue is trimmed and lower-cased, then checked for two things: whether it
 * mentions text or brand marks, and whether it forbids them ("no", "without"
 * or "avoid" followed by such a term). Only a cue that mentions them without
 * forbidding them gets the permissive wording; every other cue gets the
 * strict "background only / no readable text" wording.
 *
 * @param {string} cue - Raw backdrop cue.
 * @returns {string} The assembled prompt: six sentences joined by single spaces.
 */
function augmentBackdropPrompt(cue) {
  const sceneCue = cue.trim();
  const normalised = sceneCue.toLowerCase();
  const mentionsPattern = /\b(text|typography|title|headline|label|caption|logo|logos|wordmark|brand mark|brand marks)\b/;
  const prohibitionPattern = /\b(no|without|avoid)\s+(readable\s+)?(text|typography|titles?|headlines?|labels?|captions?|brand\s+logos?|logos?|wordmarks?|brand\s+marks?)\b/;
  const mentionsTextOrMarks = mentionsPattern.test(normalised);
  const prohibitsTextOrMarks = prohibitionPattern.test(normalised);

  // Strict wording is the default; it is relaxed only for an explicit,
  // un-negated request for text or brand marks.
  let overlayContract = "The image is a background only; HTML overlays will provide all final text, charts, logos, and UI labels.";
  let textRule = "No readable text, labels, UI copy, logos, brand marks, watermarks, or invented typography.";
  if (mentionsTextOrMarks && !prohibitsTextOrMarks) {
    overlayContract = "The image is a video background or end-card plate; do not add any text, logos, charts, or UI beyond what the scene cue explicitly requests.";
    textRule = "If text, logos, or brand marks are explicitly requested, keep them minimal, legible, and do not invent extras.";
  }

  const sentences = [];
  sentences.push("Create a 16:9 video background plate for a HyperFrames scene.");
  sentences.push(overlayContract);
  sentences.push(`Scene cue: ${sceneCue}`);
  sentences.push(textRule);
  sentences.push("Avoid unrelated consumer product photography, shoes, packaging, food, people, celebrity faces, advertisements, and random objects unless explicitly requested by the scene cue.");
  sentences.push("Leave generous negative space for overlay text and cards. Keep the result topic-aligned, editorial, cinematic, and non-distracting.");
  return sentences.join(" ");
}
451323
// Module initializer for ../cli/src/commands/_shared/build-backdrop-prompt.ts.
// The wrapped module body holds only the "use strict" directive, so running it
// performs no module-level work of its own. It is invoked from the
// init_build_plan and init_scene_build initializers.
// NOTE(review): `__esm` is defined elsewhere in the bundle; it presumably makes
// the wrapped body run at most once — confirm against its definition.
var init_build_backdrop_prompt = __esm({
  "../cli/src/commands/_shared/build-backdrop-prompt.ts"() {
    "use strict";
  }
});
451328
+
451200
451329
  // ../cli/src/commands/_shared/storyboard-edit.ts
451201
451330
  function validateStoryboardMarkdown(markdown) {
451202
451331
  const parsed = parseStoryboard(markdown);
@@ -451481,6 +451610,7 @@ async function createBuildPlan(opts) {
451481
451610
  const voice = stringOrUndefined3(cue.voice) ?? resolved.voice;
451482
451611
  const narrationText = stringOrUndefined3(cue.narration);
451483
451612
  const backdropPrompt = stringOrUndefined3(cue.backdrop);
451613
+ const augmentedBackdropPrompt = backdropPrompt ? augmentBackdropPrompt(backdropPrompt) : null;
451484
451614
  const videoPrompt = stringOrUndefined3(cue.video);
451485
451615
  const musicPrompt = stringOrUndefined3(cue.music);
451486
451616
  const genericReference = resolveGenericAssetReference(projectDir, cue.asset);
@@ -451500,9 +451630,9 @@ async function createBuildPlan(opts) {
451500
451630
  voice,
451501
451631
  ext: resolved.narration.resolved === "elevenlabs" ? "mp3" : "wav"
451502
451632
  }) : null;
451503
- const backdropCache = backdropPrompt && !backdropReference ? backdropCacheDescriptor({
451633
+ const backdropCache = augmentedBackdropPrompt && !backdropReference ? backdropCacheDescriptor({
451504
451634
  beatId: beat.id,
451505
- cue: backdropPrompt,
451635
+ cue: augmentedBackdropPrompt,
451506
451636
  provider: resolved.image.resolved,
451507
451637
  quality: imageQuality,
451508
451638
  size: imageSize2,
@@ -452032,6 +452162,7 @@ var init_build_plan = __esm({
452032
452162
  init_build_asset_reference();
452033
452163
  init_build_cache();
452034
452164
  init_build_asset_metadata();
452165
+ init_build_backdrop_prompt();
452035
452166
  init_composer_resolve();
452036
452167
  init_storyboard_parse();
452037
452168
  init_project_config();
@@ -452432,6 +452563,9 @@ async function executeVideoGenerate(options) {
452432
452563
  prompt: prompt3,
452433
452564
  provider = "kling",
452434
452565
  image,
452566
+ refImages,
452567
+ refVideos,
452568
+ refAudio,
452435
452569
  duration = 5,
452436
452570
  ratio = "16:9",
452437
452571
  seed,
@@ -452440,6 +452574,7 @@ async function executeVideoGenerate(options) {
452440
452574
  resolution,
452441
452575
  veoModel = "3.1-fast",
452442
452576
  seedanceModel = "quality",
452577
+ generateAudio,
452443
452578
  output: output3,
452444
452579
  wait = true,
452445
452580
  apiKey
@@ -452478,8 +452613,13 @@ async function executeVideoGenerate(options) {
452478
452613
  if (provider === "seedance" || provider === "fal") {
452479
452614
  const fal = new FalProvider();
452480
452615
  await fal.initialize({ apiKey: key2 });
452616
+ const references = await prepareSeedanceReferences({
452617
+ refImages,
452618
+ refVideos,
452619
+ refAudio
452620
+ });
452481
452621
  let falImage = referenceImage;
452482
- if (falImage && falImage.startsWith("data:")) {
452622
+ if (falImage && falImage.startsWith("data:") && references.length === 0) {
452483
452623
  const uploadHost = await resolveUploadHost();
452484
452624
  const upload = await uploadHost.uploadImage(referenceImageBuffer, {
452485
452625
  filename: image,
@@ -452490,11 +452630,14 @@ async function executeVideoGenerate(options) {
452490
452630
  const model = seedanceModel === "fast" || seedanceModel === "seedance-2.0-fast" ? "seedance-2.0-fast" : "seedance-2.0";
452491
452631
  const result = await fal.generateVideo(prompt3, {
452492
452632
  prompt: prompt3,
452493
- referenceImage: falImage,
452633
+ referenceImage: references.length > 0 ? void 0 : falImage,
452634
+ references: references.length > 0 ? references : void 0,
452494
452635
  duration,
452495
452636
  aspectRatio: ratio,
452496
452637
  negativePrompt: negative,
452497
- model
452638
+ model,
452639
+ resolution,
452640
+ generateAudio
452498
452641
  });
452499
452642
  if (result.status === "failed")
452500
452643
  return { success: false, error: result.error || "Seedance generation failed" };
@@ -452671,6 +452814,54 @@ async function executeVideoGenerate(options) {
452671
452814
  };
452672
452815
  }
452673
452816
  }
452817
/**
 * Builds the reference list from the CLI reference options.
 *
 * Each entry of `refImages`, `refVideos` and `refAudio` (in that order) is
 * resolved through `fileInputToUrlOrDataUri`, using `image/png`, `video/mp4`
 * and `audio/mpeg` respectively as the fallback MIME type. Inputs are resolved
 * one at a time, in order.
 *
 * @param {{refImages?: string[], refVideos?: string[], refAudio?: string[]}} opts
 * @returns {Promise<Array<{kind: string, url: string, sourcePath: string}>>}
 */
async function prepareSeedanceReferences(opts) {
  const groups = [
    ["image", opts.refImages, "image/png"],
    ["video", opts.refVideos, "video/mp4"],
    ["audio", opts.refAudio, "audio/mpeg"]
  ];
  const collected = [];
  for (const [kind, sources, fallbackMimeType] of groups) {
    for (const sourcePath of sources ?? []) {
      const url = await fileInputToUrlOrDataUri(sourcePath, fallbackMimeType);
      collected.push({ kind, url, sourcePath });
    }
  }
  return collected;
}
452842
/**
 * Turns a reference input into something that can travel in a request.
 *
 * Strings starting with `http://`, `https://` or `data:` are returned
 * unchanged. Anything else is treated as a file path relative to the current
 * working directory, read from disk and returned as a base64 `data:` URI whose
 * MIME type comes from `mimeTypeForPath`.
 *
 * @param {string} input3 - URL, data URI, or file path.
 * @param {string} fallbackMimeType - MIME type passed to `mimeTypeForPath` as its fallback.
 * @returns {Promise<string>} The original URL/data URI, or a new data URI.
 */
async function fileInputToUrlOrDataUri(input3, fallbackMimeType) {
  const passthroughPrefixes = ["http://", "https://", "data:"];
  if (passthroughPrefixes.some((prefix) => input3.startsWith(prefix)))
    return input3;
  const fileBytes = await readFile14(resolve28(process.cwd(), input3));
  const mimeType = mimeTypeForPath(input3, fallbackMimeType);
  return `data:${mimeType};base64,${fileBytes.toString("base64")}`;
}
452850
/**
 * Maps a file path to a MIME type by its lower-cased final extension.
 *
 * @param {string} path14 - File path or name.
 * @param {string} fallback2 - Value returned when the extension is unknown.
 * @returns {string} The matching MIME type, or `fallback2`.
 */
function mimeTypeForPath(path14, fallback2) {
  const ext = path14.toLowerCase().split(".").pop();
  const mimeTypes = {
    jpg: "image/jpeg",
    jpeg: "image/jpeg",
    png: "image/png",
    gif: "image/gif",
    webp: "image/webp",
    mp4: "video/mp4",
    mov: "video/quicktime",
    mp3: "audio/mpeg",
    wav: "audio/wav"
  };
  // Own-property check: a bare `mimeTypes[ext]` would return inherited members
  // for extensions such as "constructor" or "__proto__" instead of the fallback.
  return Object.hasOwn(mimeTypes, ext || "") ? mimeTypes[ext] : fallback2;
}
452674
452865
  async function executeVideoStatus(options) {
452675
452866
  const {
452676
452867
  taskId,
@@ -454786,8 +454977,9 @@ async function dispatchNarration(beat, ctx) {
454786
454977
  async function dispatchBackdrop(beat, ctx) {
454787
454978
  const reference = assetReferenceForBeat(ctx.projectDir, "backdrop", beat);
454788
454979
  if (reference) return referencePrimitiveOutcome("backdrop", beat, ctx, reference);
454789
- const prompt3 = stringOrUndefined4(beat.cues?.backdrop);
454790
- if (!prompt3) return { status: "no-cue" };
454980
+ const cue = stringOrUndefined4(beat.cues?.backdrop);
454981
+ if (!cue) return { status: "no-cue" };
454982
+ const prompt3 = augmentBackdropPrompt(cue);
454791
454983
  const rel = `assets/backdrop-${beat.id}.png`;
454792
454984
  const abs = join33(ctx.projectDir, rel);
454793
454985
  const size = ctx.imageSize ?? "1536x1024";
@@ -455765,6 +455957,7 @@ var init_scene_build = __esm({
455765
455957
  init_root_sync();
455766
455958
  init_build_cache();
455767
455959
  init_build_asset_metadata();
455960
+ init_build_backdrop_prompt();
455768
455961
  init_ai_video();
455769
455962
  init_music();
455770
455963
  init_status_jobs();
@@ -463884,10 +464077,10 @@ var init_provider_resolver = __esm({
463884
464077
 
463885
464078
  // ../cli/src/commands/_shared/openai-image.ts
463886
464079
  function resolveOpenAIImageModel(modelAlias) {
463887
- const isGptImage2 = modelAlias === "2" || modelAlias === "gpt-image-2";
464080
+ const isGptImage15 = modelAlias === "1.5" || modelAlias === "gpt-image-1.5";
463888
464081
  return {
463889
- openaiModel: isGptImage2 ? "gpt-image-2" : void 0,
463890
- modelLabel: isGptImage2 ? "GPT Image 2" : "GPT Image 1.5"
464082
+ openaiModel: isGptImage15 ? "gpt-image-1.5" : "gpt-image-2",
464083
+ modelLabel: isGptImage15 ? "GPT Image 1.5" : "GPT Image 2"
463891
464084
  };
463892
464085
  }
463893
464086
  async function executeOpenAIImageGenerate(prompt3, options, ctx) {
@@ -463924,7 +464117,7 @@ function registerImageCommand(parent) {
463924
464117
  "1:1"
463925
464118
  ).option("--quality <quality>", "Quality: standard, hd (openai only)", "standard").option("--style <style>", "Style: vivid, natural (openai only)", "vivid").option("--count <n>", "Number of images to generate", "1").option(
463926
464119
  "-m, --model <model>",
463927
- "Model. Gemini: flash, 3.1-flash, latest, pro. OpenAI: 1.5 (default), 2 (gpt-image-2)"
464120
+ "Model. Gemini: flash, 3.1-flash, latest, pro. OpenAI: 2 (default), 1.5"
463928
464121
  ).option("--dry-run", "Preview parameters without executing").addHelpText(
463929
464122
  "after",
463930
464123
  `
@@ -465358,8 +465551,8 @@ function registerVideoCommand(parent) {
465358
465551
  "quality"
465359
465552
  ).option("--negative <prompt>", "Negative prompt - what to avoid (Kling/Veo)").option("--resolution <res>", "Video resolution: 720p, 1080p, 4k (Veo only)").option("--last-frame <path>", "Last frame image for frame interpolation (Veo only)").option(
465360
465553
  "--ref-images <paths...>",
465361
- "Reference images for character consistency (Veo 3.1 only, max 3)"
465362
- ).option("--person <mode>", "Person generation: allow_all, allow_adult (Veo only)").option("--veo-model <model>", "Veo model: 3.0, 3.1, 3.1-fast (default: 3.1-fast)", "3.1-fast").option(
465554
+ "Reference images for Seedance reference-to-video or Veo character consistency"
465555
+ ).option("--ref-videos <paths...>", "Reference videos for Seedance reference-to-video").option("--ref-audio <paths...>", "Reference audio for Seedance reference-to-video").option("--no-generate-audio", "Disable native audio when the provider supports it").option("--person <mode>", "Person generation: allow_all, allow_adult (Veo only)").option("--veo-model <model>", "Veo model: 3.0, 3.1, 3.1-fast (default: 3.1-fast)", "3.1-fast").option(
465363
465556
  "--runway-model <model>",
465364
465557
  "Runway model: gen4.5 (default, text+image-to-video), gen4_turbo (image-to-video only)",
465365
465558
  "gen4.5"
@@ -465502,7 +465695,11 @@ Examples:
465502
465695
  negative: options.negative,
465503
465696
  resolution: options.resolution,
465504
465697
  veoModel: options.veoModel,
465505
- seedanceModel: options.seedanceModel
465698
+ seedanceModel: options.seedanceModel,
465699
+ refImages: options.refImages,
465700
+ refVideos: options.refVideos,
465701
+ refAudio: options.refAudio,
465702
+ generateAudio: options.generateAudio
465506
465703
  }
465507
465704
  }
465508
465705
  });
@@ -465789,8 +465986,13 @@ Examples:
465789
465986
  } else if (provider === "seedance") {
465790
465987
  const fal = new FalProvider();
465791
465988
  await fal.initialize({ apiKey });
465989
+ const seedanceReferences = await prepareSeedanceReferences2({
465990
+ refImages: options.refImages,
465991
+ refVideos: options.refVideos,
465992
+ refAudio: options.refAudio
465993
+ });
465792
465994
  let falImage = referenceImage;
465793
- if (falImage && falImage.startsWith("data:")) {
465995
+ if (falImage && falImage.startsWith("data:") && seedanceReferences.length === 0) {
465794
465996
  try {
465795
465997
  const uploadHost = await resolveUploadHost();
465796
465998
  spinner2.text = `Uploading image via ${uploadHost.provider} for Seedance...`;
@@ -465813,11 +466015,14 @@ Examples:
465813
466015
  const falModel = seedanceModel === "fast" || seedanceModel === "seedance-2.0-fast" ? "seedance-2.0-fast" : "seedance-2.0";
465814
466016
  result = await fal.generateVideo(prompt3, {
465815
466017
  prompt: prompt3,
465816
- referenceImage: falImage,
466018
+ referenceImage: seedanceReferences.length > 0 ? void 0 : falImage,
466019
+ references: seedanceReferences.length > 0 ? seedanceReferences : void 0,
465817
466020
  duration: options.duration ? parseInt(options.duration) : void 0,
465818
466021
  aspectRatio: options.ratio,
465819
466022
  negativePrompt: options.negative,
465820
- model: falModel
466023
+ model: falModel,
466024
+ resolution: options.resolution,
466025
+ generateAudio: options.generateAudio
465821
466026
  });
465822
466027
  finalResult = result;
465823
466028
  }
@@ -465872,6 +466077,54 @@ Examples:
465872
466077
  }
465873
466078
  });
465874
466079
  }
466080
+ async function prepareSeedanceReferences2(opts) {
466081
+ const references = [];
466082
+ for (const sourcePath of opts.refImages ?? []) {
466083
+ references.push({
466084
+ kind: "image",
466085
+ url: await fileInputToUrlOrDataUri2(sourcePath, "image/png"),
466086
+ sourcePath
466087
+ });
466088
+ }
466089
+ for (const sourcePath of opts.refVideos ?? []) {
466090
+ references.push({
466091
+ kind: "video",
466092
+ url: await fileInputToUrlOrDataUri2(sourcePath, "video/mp4"),
466093
+ sourcePath
466094
+ });
466095
+ }
466096
+ for (const sourcePath of opts.refAudio ?? []) {
466097
+ references.push({
466098
+ kind: "audio",
466099
+ url: await fileInputToUrlOrDataUri2(sourcePath, "audio/mpeg"),
466100
+ sourcePath
466101
+ });
466102
+ }
466103
+ return references;
466104
+ }
466105
+ async function fileInputToUrlOrDataUri2(input3, fallbackMimeType) {
466106
+ if (input3.startsWith("http://") || input3.startsWith("https://") || input3.startsWith("data:")) {
466107
+ return input3;
466108
+ }
466109
+ const absPath = resolve63(process.cwd(), input3);
466110
+ const buffer = await readFile32(absPath);
466111
+ return `data:${mimeTypeForPath2(input3, fallbackMimeType)};base64,${buffer.toString("base64")}`;
466112
+ }
466113
+ function mimeTypeForPath2(path14, fallback2) {
466114
+ const ext = path14.toLowerCase().split(".").pop();
466115
+ const mimeTypes = {
466116
+ jpg: "image/jpeg",
466117
+ jpeg: "image/jpeg",
466118
+ png: "image/png",
466119
+ gif: "image/gif",
466120
+ webp: "image/webp",
466121
+ mp4: "video/mp4",
466122
+ mov: "video/quicktime",
466123
+ mp3: "audio/mpeg",
466124
+ wav: "audio/wav"
466125
+ };
466126
+ return mimeTypes[ext || ""] || fallback2;
466127
+ }
465875
466128
  async function recordVideoNoWaitJob(opts) {
465876
466129
  return createAndWriteJobRecord({
465877
466130
  jobType: "generate-video",
@@ -472211,6 +472464,9 @@ var generateVideoTool = defineTool({
472211
472464
  "Video provider (default: seedance when FAL_API_KEY is configured, otherwise first configured provider)"
472212
472465
  ),
472213
472466
  image: z5.string().optional().describe("Reference image path for image-to-video"),
472467
+ refImages: z5.array(z5.string()).optional().describe("Reference images for Seedance reference-to-video"),
472468
+ refVideos: z5.array(z5.string()).optional().describe("Reference videos for Seedance reference-to-video"),
472469
+ refAudio: z5.array(z5.string()).optional().describe("Reference audio files for Seedance reference-to-video"),
472214
472470
  duration: z5.number().optional().describe("Duration in seconds (default: 5; Seedance accepts 4-15)"),
472215
472471
  ratio: z5.string().optional().describe("Aspect ratio: 16:9, 9:16, 1:1 (default: 16:9)"),
472216
472472
  mode: z5.string().optional().describe("Kling mode: std or pro"),
@@ -472219,6 +472475,7 @@ var generateVideoTool = defineTool({
472219
472475
  veoModel: z5.string().optional().describe("Veo model: 3.0, 3.1, 3.1-fast"),
472220
472476
  runwayModel: z5.string().optional().describe("Runway model: gen4.5, gen4_turbo"),
472221
472477
  seedanceModel: z5.string().optional().describe("Seedance variant: quality or fast (fal.ai only)"),
472478
+ generateAudio: z5.boolean().optional().describe("Generate native synchronized audio when supported"),
472222
472479
  output: z5.string().optional().describe("Output file path (downloads video)"),
472223
472480
  wait: z5.boolean().optional().describe("Wait for completion (default: true)")
472224
472481
  }),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vibeframe/mcp-server",
3
- "version": "0.104.2",
3
+ "version": "0.105.1",
4
4
  "description": "VibeFrame MCP Server - AI-native video editing via Model Context Protocol",
5
5
  "type": "module",
6
6
  "bin": {
@@ -57,8 +57,8 @@
57
57
  "tsx": "^4.21.0",
58
58
  "typescript": "^5.3.3",
59
59
  "vitest": "^1.2.2",
60
- "@vibeframe/core": "0.104.2",
61
- "@vibeframe/cli": "0.104.2"
60
+ "@vibeframe/core": "0.105.1",
61
+ "@vibeframe/cli": "0.105.1"
62
62
  },
63
63
  "engines": {
64
64
  "node": ">=20"