vargai 0.4.0-alpha102 → 0.4.0-alpha105

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -104,7 +104,7 @@
   "license": "Apache-2.0",
   "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
   "sideEffects": false,
-  "version": "0.4.0-alpha102",
+  "version": "0.4.0-alpha105",
   "exports": {
     ".": "./src/index.ts",
     "./ai": "./src/ai-sdk/index.ts",
@@ -89,7 +89,7 @@ class ElevenLabsMusicModel implements MusicModelV3 {
     const elevenLabsOptions = providerOptions?.elevenlabs ?? {};
     const audio = await this.client.music.compose({
       prompt,
-      musicLengthMs: duration ? duration * 1000 : undefined,
+      musicLengthMs: duration ? Math.round(duration * 1000) : undefined,
       modelId: this.modelId,
       ...elevenLabsOptions,
     } as Parameters<typeof this.client.music.compose>[0]);
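
Both musicLengthMs call sites in this release gain the same rounding (see the ElevenLabsProvider hunk further down). The reason is floating-point arithmetic: multiplying a fractional duration by 1000 can yield a non-integer, which an API validating music_length_ms as an integer would reject (the validation behavior is an assumption about the upstream ElevenLabs endpoint). A minimal sketch:

    // Exact IEEE 754 results in JavaScript/TypeScript:
    const duration = 1.1; // seconds
    duration * 1000;             // 1100.0000000000002 (not an integer)
    Math.round(duration * 1000); // 1100 (safe to pass as musicLengthMs)
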
@@ -174,6 +174,7 @@ const MOTION_CONTROL_MODELS: Record<string, string> = {
 const LIPSYNC_MODELS: Record<string, string> = {
   "sync-v2": "fal-ai/sync-lipsync",
   "sync-v2-pro": "fal-ai/sync-lipsync/v2",
+  "sync-v3": "fal-ai/sync-lipsync/v3",
   lipsync: "fal-ai/sync-lipsync",
   "omnihuman-v1.5": "fal-ai/bytedance/omnihuman/v1.5",
   "veed-fabric-1.0": "veed/fabric-1.0",
@@ -195,7 +196,7 @@ const IMAGE_MODELS: Record<string, string> = {
   "recraft-v3": "fal-ai/recraft/v3/text-to-image",
   "nano-banana-pro": "fal-ai/nano-banana-pro",
   "nano-banana-pro/edit": "fal-ai/nano-banana-pro/edit",
-  "nano-banana-2": "fal-ai/nano-banana-2/edit",
+  "nano-banana-2": "fal-ai/nano-banana-2",
   "nano-banana-2/edit": "fal-ai/nano-banana-2/edit",
   "seedream-v4.5/edit": "fal-ai/bytedance/seedream/v4.5/edit",
   // Qwen Image 2 - text-to-image and image-to-image editing (standard + pro)
@@ -923,13 +924,21 @@ class FalImageModel implements ImageModelV3 {
     }

     const hasFiles = files && files.length > 0;
-    const finalEndpoint = this.resolveEndpoint();

     let stableKey: string | undefined;
     if (hasFiles && files) {
       const fileHashes = await computeFileHashes(files);
+      const imageUrls = await pMap(files, fileToUrl, { concurrency: 2 });
+      // Reve uses singular image_url instead of image_urls array
+      if (SINGULAR_IMAGE_URL_MODELS.has(this.modelId)) {
+        input.image_url = imageUrls[0];
+      } else {
+        input.image_urls = imageUrls;
+      }
+      // Compute stable key after files are resolved
+      const finalEndpointForKey = this.resolveEndpoint(hasFiles);
       stableKey = JSON.stringify({
-        endpoint: finalEndpoint,
+        endpoint: finalEndpointForKey,
         prompt,
         n,
         size,
@@ -939,13 +948,6 @@ class FalImageModel implements ImageModelV3 {
         modelId: this.modelId,
         fileHashes,
       });
-      const imageUrls = await pMap(files, fileToUrl, { concurrency: 2 });
-      // Reve uses singular image_url instead of image_urls array
-      if (SINGULAR_IMAGE_URL_MODELS.has(this.modelId)) {
-        input.image_url = imageUrls[0];
-      } else {
-        input.image_urls = imageUrls;
-      }
     }

     if (isQwenAngles && !input.image_urls) {
@@ -961,6 +963,10 @@ class FalImageModel implements ImageModelV3 {
       }
     }

+    // Resolve endpoint after file processing so dual-endpoint models
+    // (e.g. nano-banana-2 vs nano-banana-2/edit) route correctly
+    const finalEndpoint = this.resolveEndpoint(hasFiles);
+
     const result = await executeWithQueueRecovery<{ data: unknown }>(
       finalEndpoint,
       input,
@@ -997,11 +1003,16 @@ class FalImageModel implements ImageModelV3 {
     };
   }

-  private resolveEndpoint(): string {
+  private resolveEndpoint(hasFiles?: boolean): string {
     if (this.modelId.startsWith("raw:")) {
       return this.modelId.slice(4);
     }

+    // Nano Banana 2: route to /edit when images are provided, base endpoint for t2i
+    if (this.modelId === "nano-banana-2" && hasFiles) {
+      return "fal-ai/nano-banana-2/edit";
+    }
+
     return IMAGE_MODELS[this.modelId] ?? this.modelId;
   }
 }
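
Together with the IMAGE_MODELS change above (and the matching FalProvider change further down), nano-banana-2 becomes a dual-endpoint model. Illustrative routing, with endpoint ids taken from this diff:

    // Given modelId "nano-banana-2":
    //   resolveEndpoint(false) → "fal-ai/nano-banana-2"       (text-to-image)
    //   resolveEndpoint(true)  → "fal-ai/nano-banana-2/edit"  (input images attached)
    // "nano-banana-2/edit" remains a static mapping to "fal-ai/nano-banana-2/edit".
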
@@ -1,6 +1,6 @@
 /**
- * Nano Banana 2 image editing model (Google's next-gen image generation/editing)
- * Edit-only model requiring image_urls input
+ * Nano Banana 2 image model (Google's next-gen image generation/editing)
+ * Supports both text-to-image (no images) and image editing (with image_urls)
  */

 import { z } from "zod";
@@ -35,8 +35,9 @@ const nanoBanana2InputSchema = z.object({
   prompt: z.string().describe("Text description for image editing"),
   image_urls: z
     .array(z.string().url())
+    .optional()
     .describe(
-      "Input image URLs for image-to-image editing. Required for this model.",
+      "Input image URLs for image editing. When provided, routes to the /edit endpoint. Omit for text-to-image generation.",
     ),
   resolution: nanoBanana2ResolutionSchema
     .default("1K")
@@ -103,11 +104,11 @@ export const definition: ModelDefinition<typeof schema> = {
   type: "model",
   name: "nano-banana-2",
   description:
-    "Google Nano Banana 2 - next-gen image editing model. Requires image_urls for all operations.",
+    "Google Nano Banana 2 - next-gen image generation and editing model. Supports text-to-image and image editing (with image_urls).",
   providers: ["fal"],
   defaultProvider: "fal",
   providerModels: {
-    fal: "fal-ai/nano-banana-2/edit",
+    fal: "fal-ai/nano-banana-2",
   },
   schema,
 };
@@ -117,7 +117,8 @@ export class ElevenLabsProvider extends BaseProvider {

     const audio = await this.client.music.compose({
       prompt,
-      musicLengthMs,
+      musicLengthMs:
+        musicLengthMs != null ? Math.round(musicLengthMs) : undefined,
       modelId: "music_v1",
     });

@@ -54,9 +54,13 @@ export class FalProvider extends BaseProvider {
         return "fal-ai/nano-banana-pro/edit";
       }
     }
-    // Nano Banana 2: always route to /edit endpoint (edit-only model)
+    // Nano Banana 2: route to /edit when image_urls are provided, otherwise use base t2i endpoint
     if (model === "fal-ai/nano-banana-2") {
-      return "fal-ai/nano-banana-2/edit";
+      const imageUrls = inputs.image_urls as string[] | undefined;
+      if (imageUrls && imageUrls.length > 0) {
+        return "fal-ai/nano-banana-2/edit";
+      }
+      return "fal-ai/nano-banana-2";
     }
     // Qwen Image 2: route to /edit endpoint when image_urls are provided
     if (model === "fal-ai/qwen-image-2/text-to-image") {
@@ -156,12 +156,17 @@ function parseSrt(content: string): SrtEntry[] {
   return entries;
 }

+/**
+ * Format seconds to ASS timestamp `H:MM:SS.CC`.
+ * Computes from total centiseconds to avoid overflow when rounding
+ * lands on 100 cs (e.g. 1.999s would otherwise produce `0:00:01.100`).
+ */
 function formatAssTime(seconds: number): string {
-  const h = Math.floor(seconds / 3600);
-  const m = Math.floor((seconds % 3600) / 60);
-  const s = Math.floor(seconds % 60);
-  const cs = Math.floor((seconds % 1) * 100);
-
+  const totalCs = Math.max(0, Math.round(seconds * 100));
+  const h = Math.floor(totalCs / 360000);
+  const m = Math.floor((totalCs % 360000) / 6000);
+  const s = Math.floor((totalCs % 6000) / 100);
+  const cs = totalCs % 100;
   return `${h}:${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")}.${String(cs).padStart(2, "0")}`;
 }

@@ -190,9 +195,15 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text

   const entries = parseSrt(srtContent);
   const assDialogues = entries
-    .map((entry) => {
+    .map((entry, i) => {
       const start = formatAssTime(entry.start);
-      const end = formatAssTime(entry.end);
+      // Clamp end to next entry's start to prevent overlapping subtitles
+      // (transcription engines often produce overlapping word timestamps)
+      const nextStart =
+        i < entries.length - 1 ? entries[i + 1]!.start : undefined;
+      const clampedEnd =
+        nextStart !== undefined ? Math.min(entry.end, nextStart) : entry.end;
+      const end = formatAssTime(clampedEnd);
       const text = entry.text.replace(/\n/g, "\\N");
       return `Dialogue: 0,${start},${end},Default,,0,0,0,,${text}`;
     })
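
A worked example of the formatAssTime arithmetic introduced above (values computed from the code):

    formatAssTime(1.999);    // totalCs = Math.round(199.9)    = 200    → "0:00:02.00"
    formatAssTime(3599.999); // totalCs = Math.round(359999.9) = 360000 → "1:00:00.00"
    // Rounding each field separately would instead give cs = Math.round(99.9) = 100,
    // producing the malformed timestamp "0:00:01.100" the new doc comment warns about.
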
@@ -201,6 +212,93 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
   return assHeader + assDialogues;
 }

+/**
+ * Generates ASS subtitle content with grouped words and active-word highlighting.
+ *
+ * Groups words into chunks of `wordsPerLine`. For each group, generates one
+ * Dialogue event per word timing where the currently-spoken word is colored
+ * with `activeColor` and the rest use the base `primaryColor`.
+ *
+ * Example output for group ["Varg", "AI", "is"] with activeColor orange:
+ *   t=0.5-0.8: {\c&H428CFF&}Varg{\c&HFFFFFF&} AI is
+ *   t=0.8-1.0: Varg {\c&H428CFF&}AI{\c&HFFFFFF&} is
+ *   t=1.0-1.3: Varg AI {\c&H428CFF&}is{\c&HFFFFFF&}
+ */
+function convertSrtToAssGrouped(
+  srtContent: string,
+  style: SubtitleStyle,
+  width: number,
+  height: number,
+  wordsPerLine: number,
+  activeColor?: string,
+): string {
+  const assHeader = `[Script Info]
+Title: Generated Subtitles
+ScriptType: v4.00+
+PlayResX: ${width}
+PlayResY: ${height}
+WrapStyle: 0
+ScaledBorderAndShadow: yes
+YCbCr Matrix: TV.601
+
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+Style: Default,${style.fontName},${style.fontSize},${style.primaryColor},&H000000FF,${style.outlineColor},${style.backColor},${style.bold ? -1 : 0},0,0,0,100,100,0,0,1,${style.outline},${style.shadow},${style.alignment},10,10,${style.marginV},1
+
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+`;
+
+  const entries = parseSrt(srtContent);
+  const dialogues: string[] = [];
+  const baseColor = style.primaryColor;
+  const highlightColor = activeColor ?? baseColor;
+
+  // Group entries into chunks of wordsPerLine
+  for (let gi = 0; gi < entries.length; gi += wordsPerLine) {
+    const group = entries.slice(gi, gi + wordsPerLine);
+    const groupStart = group[0]!.start;
+    // Cap group end at next group's start to prevent two groups showing simultaneously
+    const nextGroupStart =
+      gi + wordsPerLine < entries.length
+        ? entries[gi + wordsPerLine]!.start
+        : undefined;
+    const groupEnd = nextGroupStart ?? group[group.length - 1]!.end;
+
+    if (!activeColor) {
+      // No highlight — show entire group as one event
+      const text = group.map((e) => e.text.replace(/\n/g, " ")).join(" ");
+      dialogues.push(
+        `Dialogue: 0,${formatAssTime(groupStart)},${formatAssTime(groupEnd)},Default,,0,0,0,,${text}`,
+      );
+    } else {
+      // Karaoke highlight — one dialogue event per word, shifting the highlight
+      for (let wi = 0; wi < group.length; wi++) {
+        const wordEntry = group[wi]!;
+        const wordStart = wordEntry.start;
+        // Word ends at next word's start (within group), or at group end
+        const wordEnd = wi < group.length - 1 ? group[wi + 1]!.start : groupEnd;
+
+        // Build the text line with ASS color overrides
+        const parts = group.map((entry, idx) => {
+          const word = entry.text.replace(/\n/g, " ").trim();
+          if (idx === wi) {
+            // Active word — use highlight color
+            return `{\\c${highlightColor}}${word}{\\c${baseColor}}`;
+          }
+          return word;
+        });
+
+        dialogues.push(
+          `Dialogue: 0,${formatAssTime(wordStart)},${formatAssTime(wordEnd)},Default,,0,0,0,,${parts.join(" ")}`,
+        );
+      }
+    }
+  }
+
+  return assHeader + dialogues.join("\n");
+}
+
 const POSITION_ALIGNMENT: Record<string, number> = {
   top: 8,
   center: 5,
@@ -363,7 +461,20 @@ export async function renderCaptions(
     marginV: props.position === "center" ? 0 : baseStyle.marginV,
   };

-  const assContent = convertSrtToAss(srtContent, style, ctx.width, ctx.height);
+  const activeColorAss = props.activeColor
+    ? colorToAss(props.activeColor)
+    : undefined;
+
+  const assContent = props.wordsPerLine
+    ? convertSrtToAssGrouped(
+        srtContent,
+        style,
+        ctx.width,
+        ctx.height,
+        props.wordsPerLine,
+        activeColorAss,
+      )
+    : convertSrtToAss(srtContent, style, ctx.width, ctx.height);
   const assPath = `/tmp/varg-captions-${Date.now()}.ass`;
   writeFileSync(assPath, assContent);
   ctx.tempFiles.push(assPath);
@@ -1,6 +1,7 @@
-import type { generateImage } from "ai";
+import type { experimental_generateSpeech, generateImage } from "ai";
 import type { CacheStorage } from "../../ai-sdk/cache";
 import type { File } from "../../ai-sdk/file";
+import type { generateMusic } from "../../ai-sdk/generate-music";
 import type { generateVideo } from "../../ai-sdk/generate-video";
 import type { FFmpegBackend } from "../../ai-sdk/providers/editly/backends";
 import type { StorageProvider } from "../../ai-sdk/storage/types";
@@ -15,6 +16,8 @@ export interface RenderContext {
   storage?: StorageProvider;
   generateImage: typeof generateImage;
   generateVideo: typeof generateVideo;
+  generateSpeech: typeof experimental_generateSpeech;
+  generateMusic: typeof generateMusic;
   tempFiles: string[];
   progress?: ProgressTracker;
   pendingFiles: Map<string, Promise<File>>;
@@ -37,9 +37,9 @@ async function resolvePrompt(
   if (typeof prompt === "string") {
     return prompt;
   }
-  const resolvedImages = await Promise.all(
-    prompt.images.map((img) => resolveImageInput(img, ctx)),
-  );
+  const resolvedImages = prompt.images
+    ? await Promise.all(prompt.images.map((img) => resolveImageInput(img, ctx)))
+    : [];
   return { text: prompt.text, images: resolvedImages };
 }

@@ -1,9 +1,10 @@
 import { File } from "../../ai-sdk/file";
-import { generateMusic } from "../../ai-sdk/generate-music";
+import type { generateMusic } from "../../ai-sdk/generate-music";
 import { ResolvedElement } from "../resolved-element";
 import type { MusicProps, VargElement } from "../types";
 import type { RenderContext } from "./context";
 import { addTask, completeTask, startTask } from "./progress";
+import { computeCacheKey } from "./utils";

 export async function renderMusic(
   element: VargElement<"music">,
@@ -23,73 +24,50 @@ export async function renderMusic(
     throw new Error("Music requires prompt and model (or set defaults.music)");
   }

-  const cacheKey = JSON.stringify({
-    type: "music",
-    prompt,
-    model: model.modelId,
-    duration: props.duration,
-  });
+  const cacheKey = computeCacheKey(element);
+  const cacheKeyStr = JSON.stringify(cacheKey);

-  const modelId = model.modelId ?? "music";
-  const taskId = ctx.progress ? addTask(ctx.progress, "music", modelId) : null;
+  // Deduplicate concurrent renders of the same music element
+  const pendingRender = ctx.pendingFiles.get(cacheKeyStr);
+  if (pendingRender) {
+    return pendingRender;
+  }
+
+  const renderPromise = (async () => {
+    const modelId = model.modelId ?? "music";
+    const taskId = ctx.progress
+      ? addTask(ctx.progress, "music", modelId)
+      : null;
+    if (taskId && ctx.progress) startTask(ctx.progress, taskId);

-  const generateFn = async () => {
-    const result = await generateMusic({
+    const { audio } = await ctx.generateMusic({
       model,
       prompt,
       duration: props.duration,
-    });
-    return result.audio;
-  };
-
-  let audio: { uint8Array: Uint8Array; url?: string; mediaType?: string };
+      cacheKey,
+    } as Parameters<typeof generateMusic>[0]);

-  if (ctx.cache) {
-    const cached = await ctx.cache.get(cacheKey);
-    if (cached) {
-      const cachedAudio = cached as {
-        uint8Array: Uint8Array;
-        url?: string;
-        mediaType?: string;
-      };
-      audio = {
-        uint8Array: cachedAudio.uint8Array,
-        url: cachedAudio.url,
-        mediaType: cachedAudio.mediaType,
-      };
-      if (taskId && ctx.progress) {
-        startTask(ctx.progress, taskId);
-        completeTask(ctx.progress, taskId);
-      }
-    } else {
-      if (taskId && ctx.progress) startTask(ctx.progress, taskId);
-      audio = await generateFn();
-      if (taskId && ctx.progress) completeTask(ctx.progress, taskId);
-      await ctx.cache.set(cacheKey, {
-        uint8Array: audio.uint8Array,
-        url: audio.url,
-        mediaType: audio.mediaType,
-      });
-    }
-  } else {
-    if (taskId && ctx.progress) startTask(ctx.progress, taskId);
-    audio = await generateFn();
     if (taskId && ctx.progress) completeTask(ctx.progress, taskId);
-  }

-  const mediaType = audio.mediaType ?? "audio/mpeg";
+    const mediaType =
+      (audio as { mediaType?: string }).mediaType ?? "audio/mpeg";
+
+    const file = File.fromGenerated({
+      uint8Array: audio.uint8Array,
+      mediaType,
+      url: (audio as { url?: string }).url,
+    }).withMetadata({
+      type: "music",
+      model: modelId,
+      prompt,
+    });
+
+    ctx.generatedFiles.push(file);

-  const file = File.fromGenerated({
-    uint8Array: audio.uint8Array,
-    mediaType,
-    url: audio.url,
-  }).withMetadata({
-    type: "music",
-    model: modelId,
-    prompt,
-  });
+    return file;
+  })();

-  ctx.generatedFiles.push(file);
+  ctx.pendingFiles.set(cacheKeyStr, renderPromise);

-  return file;
+  return renderPromise;
 }
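
renderMusic and renderSpeech (below) now share the same in-flight deduplication pattern: the first render stores its promise in ctx.pendingFiles under the stringified cache key, and concurrent renders of an identical element await that promise instead of spawning a second generation. A standalone sketch of the pattern, with hypothetical names (not the library's API):

    const pending = new Map<string, Promise<unknown>>();

    function renderOnce<T>(key: string, work: () => Promise<T>): Promise<T> {
      const inFlight = pending.get(key) as Promise<T> | undefined;
      if (inFlight) return inFlight; // a second caller reuses the first caller's promise
      const promise = work();
      pending.set(key, promise);
      return promise;
    }

Note that, as in the diffed code, a rejected promise stays in the map, so a failed generation is not retried within the same render session.
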
@@ -1,9 +1,14 @@
 import type { ImageModelV3 } from "@ai-sdk/provider";
-import { generateImage, wrapImageModel } from "ai";
+import {
+  generateImage,
+  experimental_generateSpeech as generateSpeech,
+  wrapImageModel,
+} from "ai";
 import pMap from "p-map";
 import { type CacheStorage, withCache } from "../../ai-sdk/cache";
 import type { File, File as VargFile } from "../../ai-sdk/file";
 import { fileCache } from "../../ai-sdk/file-cache";
+import { generateMusic } from "../../ai-sdk/generate-music";
 import { generateVideo } from "../../ai-sdk/generate-video";
 import {
   imagePlaceholderFallbackMiddleware,
@@ -109,6 +114,14 @@ export async function renderRoot(
     ? withCache(generateVideo, { storage: cacheStorage })
     : generateVideo;

+  const cachedGenerateSpeech = cacheStorage
+    ? withCache(generateSpeech, { storage: cacheStorage })
+    : generateSpeech;
+
+  const cachedGenerateMusic = cacheStorage
+    ? withCache(generateMusic, { storage: cacheStorage })
+    : generateMusic;
+
   const wrapGenerateImage: typeof generateImage = async (opts) => {
     if (mode === "preview") {
       trackPlaceholder("image");
@@ -158,6 +171,8 @@
     storage: options.storage,
     generateImage: wrapGenerateImage,
     generateVideo: wrapGenerateVideo,
+    generateSpeech: cachedGenerateSpeech,
+    generateMusic: cachedGenerateMusic,
     tempFiles,
     progress,
     pendingFiles: new Map<string, Promise<File>>(),
@@ -1,4 +1,4 @@
-import { experimental_generateSpeech as generateSpeech } from "ai";
+import type { experimental_generateSpeech } from "ai";
 import { File } from "../../ai-sdk/file";
 import { ResolvedElement } from "../resolved-element";
 import type { SpeechProps, VargElement } from "../types";
@@ -29,33 +29,49 @@ export async function renderSpeech(
   }

   const cacheKey = computeCacheKey(element);
+  const cacheKeyStr = JSON.stringify(cacheKey);

-  const modelId = typeof model === "string" ? model : model.modelId;
-  const taskId = ctx.progress ? addTask(ctx.progress, "speech", modelId) : null;
-  if (taskId && ctx.progress) startTask(ctx.progress, taskId);
+  // Deduplicate concurrent renders of the same speech element
+  const pendingRender = ctx.pendingFiles.get(cacheKeyStr);
+  if (pendingRender) {
+    return pendingRender;
+  }
+
+  const renderPromise = (async () => {
+    const modelId = typeof model === "string" ? model : model.modelId;
+    const taskId = ctx.progress
+      ? addTask(ctx.progress, "speech", modelId)
+      : null;
+    if (taskId && ctx.progress) startTask(ctx.progress, taskId);
+
+    const { audio } = await ctx.generateSpeech({
+      model,
+      text,
+      voice: props.voice ?? "rachel",
+      cacheKey,
+    } as Parameters<typeof experimental_generateSpeech>[0]);
+
+    if (taskId && ctx.progress) completeTask(ctx.progress, taskId);

-  const { audio } = await generateSpeech({
-    model,
-    text,
-    voice: props.voice ?? "rachel",
-    cacheKey,
-  } as Parameters<typeof generateSpeech>[0]);
+    const mediaType =
+      (audio as { mediaType?: string }).mediaType ?? "audio/mpeg";

-  if (taskId && ctx.progress) completeTask(ctx.progress, taskId);
+    const file = File.fromGenerated({
+      uint8Array: audio.uint8Array,
+      mediaType,
+      url: (audio as { url?: string }).url,
+    }).withMetadata({
+      type: "speech",
+      model: modelId,
+      prompt: text,
+    });

-  const mediaType = (audio as { mediaType?: string }).mediaType ?? "audio/mpeg";
+    ctx.generatedFiles.push(file);

-  const file = File.fromGenerated({
-    uint8Array: audio.uint8Array,
-    mediaType,
-    url: (audio as { url?: string }).url,
-  }).withMetadata({
-    type: "speech",
-    model: modelId,
-    prompt: text,
-  });
+    return file;
+  })();

-  ctx.generatedFiles.push(file);
+  ctx.pendingFiles.set(cacheKeyStr, renderPromise);

-  return file;
+  return renderPromise;
 }
@@ -14,6 +14,7 @@ import { renderVideo } from "./video";
 const VIDEO_ONLY_LIPSYNC_MODELS = new Set([
   "sync-v2",
   "sync-v2-pro",
+  "sync-v3",
   "lipsync",
 ]);

@@ -93,13 +93,14 @@ function serializeValue(v: unknown): string {
   }
   // ResolvedElement (e.g. a speech segment used as Video audio input):
   // serialize by content identity (type + text + duration), not binary data.
+  // Deliberately excludes file.url — upload URLs contain Date.now() + Math.random()
+  // and would make downstream cache keys (e.g. VEED video) non-deterministic.
   if (v instanceof ResolvedElement) {
     const parts = [v.type];
     for (const child of v.children) {
       if (typeof child === "string") parts.push(child);
     }
     if (v.meta.duration) parts.push(String(v.meta.duration));
-    if (v.meta.file?.url) parts.push(v.meta.file.url);
    return `resolved(${parts.join(",")})`;
   }
   if (isVargElement(v)) {
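
The non-determinism this removes, in miniature (the upload URL shapes below are hypothetical; the resolved(...) format follows the code above):

    // Two renders of the same speech segment previously serialized differently:
    //   resolved(speech,hello world,2.4,https://storage.example/1718000000000-0.83.mp3)
    //   resolved(speech,hello world,2.4,https://storage.example/1718000412000-0.19.mp3)
    // Different strings → different downstream cache keys → a cache miss on every render.
    // Without file.url, only content identity remains: resolved(speech,hello world,2.4)
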
@@ -15,7 +15,7 @@ import {
   experimental_generateSpeech as generateSpeechAI,
 } from "ai";
 import { $ } from "bun";
-import { type CacheStorage, withCache } from "../ai-sdk/cache";
+import { type CacheStorage, depsToKey, withCache } from "../ai-sdk/cache";
 import { File } from "../ai-sdk/file";
 import { fileCache } from "../ai-sdk/file-cache";
 import { generateMusic as generateMusicRaw } from "../ai-sdk/generate-music";
@@ -116,6 +116,12 @@ function getCachedGenerateMusic() {
   return withCache(generateMusicRaw, { storage });
 }

+/** Get a cached generateSpeech wrapper using the active cache storage. */
+function getCachedGenerateSpeech() {
+  const storage = getActiveCache();
+  return withCache(generateSpeechAI, { storage });
+}
+
 // ---------------------------------------------------------------------------
 // Speech
 // ---------------------------------------------------------------------------
@@ -297,6 +303,77 @@ async function sliceAudio(
   return new Uint8Array(sliced);
 }

+// ---------------------------------------------------------------------------
+// Speech resolve-level cache: serialization helpers
+// ---------------------------------------------------------------------------
+
+/** Serializable representation of a speech segment for caching. */
+interface CachedSegment {
+  text: string;
+  start: number;
+  end: number;
+  duration: number;
+  props: Record<string, unknown>;
+  children: string[];
+  file: { uint8Array: Uint8Array; mediaType: string };
+  words?: WordTiming[];
+}
+
+/** Serializable representation of a full resolved speech for caching. */
+interface CachedSpeechResult {
+  file: { uint8Array: Uint8Array; mediaType: string };
+  duration: number;
+  words?: WordTiming[];
+  segments?: CachedSegment[];
+}
+
+/** Reconstruct a Segment (ResolvedElement<"speech"> + timing props) from cached data. */
+function reconstructSegment(
+  cached: CachedSegment,
+  storage?: import("../ai-sdk/storage/types").StorageProvider,
+): Segment {
+  const segmentFile = File.fromBuffer(
+    cached.file.uint8Array,
+    cached.file.mediaType,
+  );
+  const resolved = new ResolvedElement<"speech">(
+    { type: "speech", props: cached.props, children: cached.children },
+    {
+      file: segmentFile,
+      duration: cached.duration,
+      segments: [],
+      words: cached.words,
+    },
+  );
+  Object.defineProperties(resolved, {
+    text: { value: cached.text, enumerable: true },
+    start: { value: cached.start, enumerable: true },
+    end: { value: cached.end, enumerable: true },
+  });
+  return resolved as Segment;
+}
+
+/** Serialize a Segment into a cacheable plain object. */
+function serializeSegment(seg: Segment): CachedSegment {
+  return {
+    text: seg.text,
+    start: seg.start,
+    end: seg.end,
+    duration: seg.duration,
+    props: { ...seg.props },
+    children: seg.children.filter((c): c is string => typeof c === "string"),
+    file: {
+      uint8Array: (seg.meta.file as any)._data as Uint8Array,
+      mediaType: "audio/mpeg",
+    },
+    words: seg.meta.words,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// resolveSpeechElement — cached at the full-result level
+// ---------------------------------------------------------------------------
+
 /** Generate speech audio via the AI SDK and return a ResolvedElement with duration metadata. */
 export async function resolveSpeechElement(
   element: VargElement<"speech">,
@@ -324,12 +401,52 @@

   const cacheKey = computeCacheKey(element);

-  const { audio, ...rest } = await generateSpeechAI({
+  // ---- Check full-result cache (includes segments, words, duration) ----
+  const cache = getActiveCache();
+  const resolveKey = depsToKey("resolveSpeech", cacheKey);
+  const cached = (await cache.get(resolveKey)) as
+    | CachedSpeechResult
+    | undefined;
+
+  if (cached) {
+    const ctx = getResolveContext();
+    const file = File.fromGenerated({
+      uint8Array: cached.file.uint8Array,
+      mediaType: cached.file.mediaType,
+    }).withMetadata({
+      type: "speech",
+      model: typeof model === "string" ? model : model.modelId,
+      prompt: text,
+    });
+
+    // Upload reconstructed segment files to storage so downstream cache keys
+    // get stable URLs (instead of no URL at all).
+    const segments = cached.segments?.map((s) =>
+      reconstructSegment(s, ctx?.storage),
+    );
+    if (segments && ctx?.storage) {
+      await Promise.all(
+        segments.map((seg) => seg.meta.file.upload(ctx.storage!)),
+      );
+    }
+
+    return new ResolvedElement(element, {
+      file,
+      duration: cached.duration,
+      words: cached.words,
+      segments,
+    });
+  }
+
+  // ---- Cache miss: generate, probe, slice, then cache ----
+
+  const generateSpeech = getCachedGenerateSpeech();
+  const { audio, ...rest } = await generateSpeech({
     model,
     text,
     voice: props.voice ?? "rachel",
     cacheKey,
-  } as Parameters<typeof generateSpeechAI>[0]);
+  });

   const mediaType = (audio as { mediaType?: string }).mediaType ?? "audio/mpeg";
@@ -377,6 +494,15 @@
     }
   }

+  // ---- Write full result to cache ----
+  const toCache: CachedSpeechResult = {
+    file: { uint8Array: audio.uint8Array, mediaType },
+    duration,
+    words,
+    segments: segments?.map(serializeSegment),
+  };
+  await cache.set(resolveKey, toCache);
+
   return new ResolvedElement(element, {
     file,
     duration,
@@ -451,9 +577,11 @@ async function resolveImagePrompt(
   prompt: ImagePrompt,
 ): Promise<string | { text?: string; images: Uint8Array[] }> {
   if (typeof prompt === "string") return prompt;
-  const resolvedImages = await Promise.all(
-    prompt.images.map((img) => resolveImageInputForStandalone(img)),
-  );
+  const resolvedImages = prompt.images
+    ? await Promise.all(
+        prompt.images.map((img) => resolveImageInputForStandalone(img)),
+      )
+    : [];
   return { text: prompt.text, images: resolvedImages };
 }

@@ -129,7 +129,7 @@ export interface OverlayProps extends BaseProps, PositionProps, AudioProps {
 }

 export type ImageInput = Uint8Array | string | VargElement<"image">;
-export type ImagePrompt = string | { text?: string; images: ImageInput[] };
+export type ImagePrompt = string | { text?: string; images?: ImageInput[] };

 export interface ImageProps extends BaseProps, PositionProps {
   prompt?: ImagePrompt;
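
With images now optional, a text-only object prompt type-checks, and both resolvePrompt and resolveImagePrompt above fall back to an empty image list:

    // Both forms are valid ImagePrompt values after this change:
    const p1: ImagePrompt = "a red fox in the snow";
    const p2: ImagePrompt = { text: "a red fox in the snow" }; // images omitted, resolved as []
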
@@ -253,6 +253,8 @@ export interface CaptionsProps extends BaseProps {
   color?: string;
   activeColor?: string;
   fontSize?: number;
+  /** Number of words to display per subtitle line. When set with activeColor, enables karaoke-style highlighting where the active word is colored differently. */
+  wordsPerLine?: number;
   /** When src is a Speech element, include its audio track in the video. Defaults to false. */
   withAudio?: boolean;
 }
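
Combined with the renderCaptions change above, these props enable karaoke-style captions. A hedged usage sketch (only the fields shown in this diff are confirmed; required fields such as src are elided, hence Partial):

    // Three words per subtitle line; the word currently being spoken renders in orange.
    // renderCaptions() routes these values through convertSrtToAssGrouped().
    const karaoke: Partial<CaptionsProps> = {
      wordsPerLine: 3,
      activeColor: "#FF8C42", // converted via colorToAss() before ASS generation
      fontSize: 48,
    };
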
@@ -1,6 +1,10 @@
-import { generateImage } from "ai";
+import {
+  generateImage,
+  experimental_generateSpeech as generateSpeech,
+} from "ai";
 import { type CacheStorage, withCache } from "../ai-sdk/cache";
 import { fileCache } from "../ai-sdk/file-cache";
+import { generateMusic } from "../ai-sdk/generate-music";
 import { generateVideo } from "../ai-sdk/generate-video";
 import { localBackend } from "../ai-sdk/providers/editly";
 import type { RenderContext } from "../react/renderers/context";
@@ -49,6 +53,12 @@ export function createStepSession(
     generateVideo: cacheStorage
       ? withCache(generateVideo, { storage: cacheStorage })
       : generateVideo,
+    generateSpeech: cacheStorage
+      ? withCache(generateSpeech, { storage: cacheStorage })
+      : generateSpeech,
+    generateMusic: cacheStorage
+      ? withCache(generateMusic, { storage: cacheStorage })
+      : generateMusic,
     tempFiles: [],
     progress: createProgressTracker(false),
     pendingFiles: new Map(),