vargai 0.4.0-alpha102 → 0.4.0-alpha105

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -104,7 +104,7 @@
   "license": "Apache-2.0",
   "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
   "sideEffects": false,
-  "version": "0.4.0-alpha102",
+  "version": "0.4.0-alpha105",
   "exports": {
     ".": "./src/index.ts",
     "./ai": "./src/ai-sdk/index.ts",
@@ -89,7 +89,7 @@ class ElevenLabsMusicModel implements MusicModelV3 {
     const elevenLabsOptions = providerOptions?.elevenlabs ?? {};
     const audio = await this.client.music.compose({
       prompt,
-      musicLengthMs: duration ? duration * 1000 : undefined,
+      musicLengthMs: duration ? Math.round(duration * 1000) : undefined,
       modelId: this.modelId,
       ...elevenLabsOptions,
     } as Parameters<typeof this.client.music.compose>[0]);
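
Both musicLengthMs call sites in this release gain the same rounding (see the ElevenLabsProvider hunk further down). The reason is floating-point arithmetic: multiplying a fractional duration by 1000 can yield a non-integer, which an API validating music_length_ms as an integer would reject (the validation behavior is an assumption about the upstream ElevenLabs endpoint). A minimal sketch:

    // Exact IEEE 754 results in JavaScript/TypeScript:
    const duration = 1.1; // seconds
    duration * 1000;             // 1100.0000000000002 (not an integer)
    Math.round(duration * 1000); // 1100 (safe to pass as musicLengthMs)
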
@@ -174,6 +174,7 @@ const MOTION_CONTROL_MODELS: Record<string, string> = {
 const LIPSYNC_MODELS: Record<string, string> = {
   "sync-v2": "fal-ai/sync-lipsync",
   "sync-v2-pro": "fal-ai/sync-lipsync/v2",
+  "sync-v3": "fal-ai/sync-lipsync/v3",
   lipsync: "fal-ai/sync-lipsync",
   "omnihuman-v1.5": "fal-ai/bytedance/omnihuman/v1.5",
   "veed-fabric-1.0": "veed/fabric-1.0",
@@ -195,7 +196,7 @@ const IMAGE_MODELS: Record<string, string> = {
   "recraft-v3": "fal-ai/recraft/v3/text-to-image",
   "nano-banana-pro": "fal-ai/nano-banana-pro",
   "nano-banana-pro/edit": "fal-ai/nano-banana-pro/edit",
-  "nano-banana-2": "fal-ai/nano-banana-2/edit",
+  "nano-banana-2": "fal-ai/nano-banana-2",
   "nano-banana-2/edit": "fal-ai/nano-banana-2/edit",
   "seedream-v4.5/edit": "fal-ai/bytedance/seedream/v4.5/edit",
   // Qwen Image 2 - text-to-image and image-to-image editing (standard + pro)
@@ -923,13 +924,21 @@ class FalImageModel implements ImageModelV3 {
     }

     const hasFiles = files && files.length > 0;
-    const finalEndpoint = this.resolveEndpoint();

     let stableKey: string | undefined;
     if (hasFiles && files) {
       const fileHashes = await computeFileHashes(files);
+      const imageUrls = await pMap(files, fileToUrl, { concurrency: 2 });
+      // Reve uses singular image_url instead of image_urls array
+      if (SINGULAR_IMAGE_URL_MODELS.has(this.modelId)) {
+        input.image_url = imageUrls[0];
+      } else {
+        input.image_urls = imageUrls;
+      }
+      // Compute stable key after files are resolved
+      const finalEndpointForKey = this.resolveEndpoint(hasFiles);
       stableKey = JSON.stringify({
-        endpoint: finalEndpoint,
+        endpoint: finalEndpointForKey,
         prompt,
         n,
         size,
@@ -939,13 +948,6 @@ class FalImageModel implements ImageModelV3 {
         modelId: this.modelId,
         fileHashes,
       });
-      const imageUrls = await pMap(files, fileToUrl, { concurrency: 2 });
-      // Reve uses singular image_url instead of image_urls array
-      if (SINGULAR_IMAGE_URL_MODELS.has(this.modelId)) {
-        input.image_url = imageUrls[0];
-      } else {
-        input.image_urls = imageUrls;
-      }
     }

     if (isQwenAngles && !input.image_urls) {
@@ -961,6 +963,10 @@ class FalImageModel implements ImageModelV3 {
       }
     }

+    // Resolve endpoint after file processing so dual-endpoint models
+    // (e.g. nano-banana-2 vs nano-banana-2/edit) route correctly
+    const finalEndpoint = this.resolveEndpoint(hasFiles);
+
     const result = await executeWithQueueRecovery<{ data: unknown }>(
       finalEndpoint,
       input,
@@ -997,11 +1003,16 @@ class FalImageModel implements ImageModelV3 {
     };
   }

-  private resolveEndpoint(): string {
+  private resolveEndpoint(hasFiles?: boolean): string {
     if (this.modelId.startsWith("raw:")) {
       return this.modelId.slice(4);
     }

+    // Nano Banana 2: route to /edit when images are provided, base endpoint for t2i
+    if (this.modelId === "nano-banana-2" && hasFiles) {
+      return "fal-ai/nano-banana-2/edit";
+    }
+
     return IMAGE_MODELS[this.modelId] ?? this.modelId;
   }
 }
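
Together with the IMAGE_MODELS change above (and the matching FalProvider change further down), nano-banana-2 becomes a dual-endpoint model. Illustrative routing, with endpoint ids taken from this diff:

    // Given modelId "nano-banana-2":
    //   resolveEndpoint(false) → "fal-ai/nano-banana-2"       (text-to-image)
    //   resolveEndpoint(true)  → "fal-ai/nano-banana-2/edit"  (input images attached)
    // "nano-banana-2/edit" remains a static mapping to "fal-ai/nano-banana-2/edit".
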
@@ -1,6 +1,6 @@
 /**
- * Nano Banana 2 image editing model (Google's next-gen image generation/editing)
- * Edit-only model requiring image_urls input
+ * Nano Banana 2 image model (Google's next-gen image generation/editing)
+ * Supports both text-to-image (no images) and image editing (with image_urls)
  */

 import { z } from "zod";
@@ -35,8 +35,9 @@ const nanoBanana2InputSchema = z.object({
   prompt: z.string().describe("Text description for image editing"),
   image_urls: z
     .array(z.string().url())
+    .optional()
     .describe(
-      "Input image URLs for image-to-image editing. Required for this model.",
+      "Input image URLs for image editing. When provided, routes to the /edit endpoint. Omit for text-to-image generation.",
     ),
   resolution: nanoBanana2ResolutionSchema
     .default("1K")
@@ -103,11 +104,11 @@ export const definition: ModelDefinition<typeof schema> = {
   type: "model",
   name: "nano-banana-2",
   description:
-    "Google Nano Banana 2 - next-gen image editing model. Requires image_urls for all operations.",
+    "Google Nano Banana 2 - next-gen image generation and editing model. Supports text-to-image and image editing (with image_urls).",
   providers: ["fal"],
   defaultProvider: "fal",
   providerModels: {
-    fal: "fal-ai/nano-banana-2/edit",
+    fal: "fal-ai/nano-banana-2",
   },
   schema,
 };
@@ -117,7 +117,8 @@ export class ElevenLabsProvider extends BaseProvider {

     const audio = await this.client.music.compose({
       prompt,
-      musicLengthMs,
+      musicLengthMs:
+        musicLengthMs != null ? Math.round(musicLengthMs) : undefined,
       modelId: "music_v1",
     });

@@ -54,9 +54,13 @@ export class FalProvider extends BaseProvider {
         return "fal-ai/nano-banana-pro/edit";
       }
     }
-    // Nano Banana 2: always route to /edit endpoint (edit-only model)
+    // Nano Banana 2: route to /edit when image_urls are provided, otherwise use base t2i endpoint
     if (model === "fal-ai/nano-banana-2") {
-      return "fal-ai/nano-banana-2/edit";
+      const imageUrls = inputs.image_urls as string[] | undefined;
+      if (imageUrls && imageUrls.length > 0) {
+        return "fal-ai/nano-banana-2/edit";
+      }
+      return "fal-ai/nano-banana-2";
     }
     // Qwen Image 2: route to /edit endpoint when image_urls are provided
     if (model === "fal-ai/qwen-image-2/text-to-image") {
@@ -156,12 +156,17 @@ function parseSrt(content: string): SrtEntry[] {
   return entries;
 }

+/**
+ * Format seconds to ASS timestamp `H:MM:SS.CC`.
+ * Computes from total centiseconds to avoid overflow when rounding
+ * lands on 100 cs (e.g. 1.999s would otherwise produce `0:00:01.100`).
+ */
 function formatAssTime(seconds: number): string {
-  const h = Math.floor(seconds / 3600);
-  const m = Math.floor((seconds % 3600) / 60);
-  const s = Math.floor(seconds % 60);
-  const cs = Math.floor((seconds % 1) * 100);
-
+  const totalCs = Math.max(0, Math.round(seconds * 100));
+  const h = Math.floor(totalCs / 360000);
+  const m = Math.floor((totalCs % 360000) / 6000);
+  const s = Math.floor((totalCs % 6000) / 100);
+  const cs = totalCs % 100;
   return `${h}:${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")}.${String(cs).padStart(2, "0")}`;
 }

@@ -190,9 +195,15 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text

   const entries = parseSrt(srtContent);
   const assDialogues = entries
-    .map((entry) => {
+    .map((entry, i) => {
       const start = formatAssTime(entry.start);
-      const end = formatAssTime(entry.end);
+      // Clamp end to next entry's start to prevent overlapping subtitles
+      // (transcription engines often produce overlapping word timestamps)
+      const nextStart =
+        i < entries.length - 1 ? entries[i + 1]!.start : undefined;
+      const clampedEnd =
+        nextStart !== undefined ? Math.min(entry.end, nextStart) : entry.end;
+      const end = formatAssTime(clampedEnd);
       const text = entry.text.replace(/\n/g, "\\N");
       return `Dialogue: 0,${start},${end},Default,,0,0,0,,${text}`;
     })
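
A worked example of the formatAssTime arithmetic introduced above (values computed from the code):

    formatAssTime(1.999);    // totalCs = Math.round(199.9)    = 200    → "0:00:02.00"
    formatAssTime(3599.999); // totalCs = Math.round(359999.9) = 360000 → "1:00:00.00"
    // Rounding each field separately would instead give cs = Math.round(99.9) = 100,
    // producing the malformed timestamp "0:00:01.100" the new doc comment warns about.
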
@@ -201,6 +212,93 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
   return assHeader + assDialogues;
 }

+/**
+ * Generates ASS subtitle content with grouped words and active-word highlighting.
+ *
+ * Groups words into chunks of `wordsPerLine`. For each group, generates one
+ * Dialogue event per word timing where the currently-spoken word is colored
+ * with `activeColor` and the rest use the base `primaryColor`.
+ *
+ * Example output for group ["Varg", "AI", "is"] with activeColor orange:
+ *   t=0.5-0.8: {\c&H428CFF&}Varg{\c&HFFFFFF&} AI is
+ *   t=0.8-1.0: Varg {\c&H428CFF&}AI{\c&HFFFFFF&} is
+ *   t=1.0-1.3: Varg AI {\c&H428CFF&}is{\c&HFFFFFF&}
+ */
+function convertSrtToAssGrouped(
+  srtContent: string,
+  style: SubtitleStyle,
+  width: number,
+  height: number,
+  wordsPerLine: number,
+  activeColor?: string,
+): string {
+  const assHeader = `[Script Info]
+Title: Generated Subtitles
+ScriptType: v4.00+
+PlayResX: ${width}
+PlayResY: ${height}
+WrapStyle: 0
+ScaledBorderAndShadow: yes
+YCbCr Matrix: TV.601
+
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+Style: Default,${style.fontName},${style.fontSize},${style.primaryColor},&H000000FF,${style.outlineColor},${style.backColor},${style.bold ? -1 : 0},0,0,0,100,100,0,0,1,${style.outline},${style.shadow},${style.alignment},10,10,${style.marginV},1
+
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+`;
+
+  const entries = parseSrt(srtContent);
+  const dialogues: string[] = [];
+  const baseColor = style.primaryColor;
+  const highlightColor = activeColor ?? baseColor;
+
+  // Group entries into chunks of wordsPerLine
+  for (let gi = 0; gi < entries.length; gi += wordsPerLine) {
+    const group = entries.slice(gi, gi + wordsPerLine);
+    const groupStart = group[0]!.start;
+    // Cap group end at next group's start to prevent two groups showing simultaneously
+    const nextGroupStart =
+      gi + wordsPerLine < entries.length
+        ? entries[gi + wordsPerLine]!.start
+        : undefined;
+    const groupEnd = nextGroupStart ?? group[group.length - 1]!.end;
+
+    if (!activeColor) {
+      // No highlight — show entire group as one event
+      const text = group.map((e) => e.text.replace(/\n/g, " ")).join(" ");
+      dialogues.push(
+        `Dialogue: 0,${formatAssTime(groupStart)},${formatAssTime(groupEnd)},Default,,0,0,0,,${text}`,
+      );
+    } else {
+      // Karaoke highlight — one dialogue event per word, shifting the highlight
+      for (let wi = 0; wi < group.length; wi++) {
+        const wordEntry = group[wi]!;
+        const wordStart = wordEntry.start;
+        // Word ends at next word's start (within group), or at group end
+        const wordEnd = wi < group.length - 1 ? group[wi + 1]!.start : groupEnd;
+
+        // Build the text line with ASS color overrides
+        const parts = group.map((entry, idx) => {
+          const word = entry.text.replace(/\n/g, " ").trim();
+          if (idx === wi) {
+            // Active word — use highlight color
+            return `{\\c${highlightColor}}${word}{\\c${baseColor}}`;
+          }
+          return word;
+        });
+
+        dialogues.push(
+          `Dialogue: 0,${formatAssTime(wordStart)},${formatAssTime(wordEnd)},Default,,0,0,0,,${parts.join(" ")}`,
+        );
+      }
+    }
+  }
+
+  return assHeader + dialogues.join("\n");
+}
+
 const POSITION_ALIGNMENT: Record<string, number> = {
   top: 8,
   center: 5,
@@ -363,7 +461,20 @@ export async function renderCaptions(
     marginV: props.position === "center" ? 0 : baseStyle.marginV,
   };

-  const assContent = convertSrtToAss(srtContent, style, ctx.width, ctx.height);
+  const activeColorAss = props.activeColor
+    ? colorToAss(props.activeColor)
+    : undefined;
+
+  const assContent = props.wordsPerLine
+    ? convertSrtToAssGrouped(
+        srtContent,
+        style,
+        ctx.width,
+        ctx.height,
+        props.wordsPerLine,
+        activeColorAss,
+      )
+    : convertSrtToAss(srtContent, style, ctx.width, ctx.height);
   const assPath = `/tmp/varg-captions-${Date.now()}.ass`;
   writeFileSync(assPath, assContent);
   ctx.tempFiles.push(assPath);
@@ -1,6 +1,7 @@
-import type { generateImage } from "ai";
+import type { experimental_generateSpeech, generateImage } from "ai";
 import type { CacheStorage } from "../../ai-sdk/cache";
 import type { File } from "../../ai-sdk/file";
+import type { generateMusic } from "../../ai-sdk/generate-music";
 import type { generateVideo } from "../../ai-sdk/generate-video";
 import type { FFmpegBackend } from "../../ai-sdk/providers/editly/backends";
 import type { StorageProvider } from "../../ai-sdk/storage/types";
@@ -15,6 +16,8 @@ export interface RenderContext {
   storage?: StorageProvider;
   generateImage: typeof generateImage;
   generateVideo: typeof generateVideo;
+  generateSpeech: typeof experimental_generateSpeech;
+  generateMusic: typeof generateMusic;
   tempFiles: string[];
   progress?: ProgressTracker;
   pendingFiles: Map<string, Promise<File>>;
@@ -37,9 +37,9 @@ async function resolvePrompt(
   if (typeof prompt === "string") {
     return prompt;
   }
-  const resolvedImages = await Promise.all(
-    prompt.images.map((img) => resolveImageInput(img, ctx)),
-  );
+  const resolvedImages = prompt.images
+    ? await Promise.all(prompt.images.map((img) => resolveImageInput(img, ctx)))
+    : [];
   return { text: prompt.text, images: resolvedImages };
 }

@@ -1,9 +1,10 @@
 import { File } from "../../ai-sdk/file";
-import { generateMusic } from "../../ai-sdk/generate-music";
+import type { generateMusic } from "../../ai-sdk/generate-music";
 import { ResolvedElement } from "../resolved-element";
 import type { MusicProps, VargElement } from "../types";
 import type { RenderContext } from "./context";
 import { addTask, completeTask, startTask } from "./progress";
+import { computeCacheKey } from "./utils";

 export async function renderMusic(
   element: VargElement<"music">,
@@ -23,73 +24,50 @@ export async function renderMusic(
     throw new Error("Music requires prompt and model (or set defaults.music)");
   }

-  const cacheKey = JSON.stringify({
-    type: "music",
-    prompt,
-    model: model.modelId,
-    duration: props.duration,
-  });
+  const cacheKey = computeCacheKey(element);
+  const cacheKeyStr = JSON.stringify(cacheKey);

-  const modelId = model.modelId ?? "music";
-  const taskId = ctx.progress ? addTask(ctx.progress, "music", modelId) : null;
+  // Deduplicate concurrent renders of the same music element
+  const pendingRender = ctx.pendingFiles.get(cacheKeyStr);
+  if (pendingRender) {
+    return pendingRender;
+  }
+
+  const renderPromise = (async () => {
+    const modelId = model.modelId ?? "music";
+    const taskId = ctx.progress
+      ? addTask(ctx.progress, "music", modelId)
+      : null;
+    if (taskId && ctx.progress) startTask(ctx.progress, taskId);

-  const generateFn = async () => {
-    const result = await generateMusic({
+    const { audio } = await ctx.generateMusic({
       model,
       prompt,
       duration: props.duration,
-    });
-    return result.audio;
-  };
-
-  let audio: { uint8Array: Uint8Array; url?: string; mediaType?: string };
+      cacheKey,
+    } as Parameters<typeof generateMusic>[0]);

-  if (ctx.cache) {
-    const cached = await ctx.cache.get(cacheKey);
-    if (cached) {
-      const cachedAudio = cached as {
-        uint8Array: Uint8Array;
-        url?: string;
-        mediaType?: string;
-      };
-      audio = {
-        uint8Array: cachedAudio.uint8Array,
-        url: cachedAudio.url,
-        mediaType: cachedAudio.mediaType,
-      };
-      if (taskId && ctx.progress) {
-        startTask(ctx.progress, taskId);
-        completeTask(ctx.progress, taskId);
-      }
-    } else {
-      if (taskId && ctx.progress) startTask(ctx.progress, taskId);
-      audio = await generateFn();
-      if (taskId && ctx.progress) completeTask(ctx.progress, taskId);
-      await ctx.cache.set(cacheKey, {
-        uint8Array: audio.uint8Array,
-        url: audio.url,
-        mediaType: audio.mediaType,
-      });
-    }
-  } else {
-    if (taskId && ctx.progress) startTask(ctx.progress, taskId);
-    audio = await generateFn();
     if (taskId && ctx.progress) completeTask(ctx.progress, taskId);
-  }

-  const mediaType = audio.mediaType ?? "audio/mpeg";
+    const mediaType =
+      (audio as { mediaType?: string }).mediaType ?? "audio/mpeg";
+
+    const file = File.fromGenerated({
+      uint8Array: audio.uint8Array,
+      mediaType,
+      url: (audio as { url?: string }).url,
+    }).withMetadata({
+      type: "music",
+      model: modelId,
+      prompt,
+    });
+
+    ctx.generatedFiles.push(file);

-  const file = File.fromGenerated({
-    uint8Array: audio.uint8Array,
-    mediaType,
-    url: audio.url,
-  }).withMetadata({
-    type: "music",
-    model: modelId,
-    prompt,
-  });
+    return file;
+  })();

-  ctx.generatedFiles.push(file);
+  ctx.pendingFiles.set(cacheKeyStr, renderPromise);

-  return file;
+  return renderPromise;
 }
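
renderMusic and renderSpeech (below) now share the same in-flight deduplication pattern: the first render stores its promise in ctx.pendingFiles under the stringified cache key, and concurrent renders of an identical element await that promise instead of spawning a second generation. A standalone sketch of the pattern, with hypothetical names (not the library's API):

    const pending = new Map<string, Promise<unknown>>();

    function renderOnce<T>(key: string, work: () => Promise<T>): Promise<T> {
      const inFlight = pending.get(key) as Promise<T> | undefined;
      if (inFlight) return inFlight; // a second caller reuses the first caller's promise
      const promise = work();
      pending.set(key, promise);
      return promise;
    }

Note that, as in the diffed code, a rejected promise stays in the map, so a failed generation is not retried within the same render session.
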
@@ -1,9 +1,14 @@
 import type { ImageModelV3 } from "@ai-sdk/provider";
-import { generateImage, wrapImageModel } from "ai";
+import {
+  generateImage,
+  experimental_generateSpeech as generateSpeech,
+  wrapImageModel,
+} from "ai";
 import pMap from "p-map";
 import { type CacheStorage, withCache } from "../../ai-sdk/cache";
 import type { File, File as VargFile } from "../../ai-sdk/file";
 import { fileCache } from "../../ai-sdk/file-cache";
+import { generateMusic } from "../../ai-sdk/generate-music";
 import { generateVideo } from "../../ai-sdk/generate-video";
 import {
   imagePlaceholderFallbackMiddleware,
@@ -109,6 +114,14 @@ export async function renderRoot(
     ? withCache(generateVideo, { storage: cacheStorage })
     : generateVideo;

+  const cachedGenerateSpeech = cacheStorage
+    ? withCache(generateSpeech, { storage: cacheStorage })
+    : generateSpeech;
+
+  const cachedGenerateMusic = cacheStorage
+    ? withCache(generateMusic, { storage: cacheStorage })
+    : generateMusic;
+
   const wrapGenerateImage: typeof generateImage = async (opts) => {
     if (mode === "preview") {
       trackPlaceholder("image");
@@ -158,6 +171,8 @@
     storage: options.storage,
     generateImage: wrapGenerateImage,
     generateVideo: wrapGenerateVideo,
+    generateSpeech: cachedGenerateSpeech,
+    generateMusic: cachedGenerateMusic,
     tempFiles,
     progress,
     pendingFiles: new Map<string, Promise<File>>(),
@@ -1,4 +1,4 @@
-import { experimental_generateSpeech as generateSpeech } from "ai";
+import type { experimental_generateSpeech } from "ai";
 import { File } from "../../ai-sdk/file";
 import { ResolvedElement } from "../resolved-element";
 import type { SpeechProps, VargElement } from "../types";
@@ -29,33 +29,49 @@ export async function renderSpeech(
   }

   const cacheKey = computeCacheKey(element);
+  const cacheKeyStr = JSON.stringify(cacheKey);

-  const modelId = typeof model === "string" ? model : model.modelId;
-  const taskId = ctx.progress ? addTask(ctx.progress, "speech", modelId) : null;
-  if (taskId && ctx.progress) startTask(ctx.progress, taskId);
+  // Deduplicate concurrent renders of the same speech element
+  const pendingRender = ctx.pendingFiles.get(cacheKeyStr);
+  if (pendingRender) {
+    return pendingRender;
+  }
+
+  const renderPromise = (async () => {
+    const modelId = typeof model === "string" ? model : model.modelId;
+    const taskId = ctx.progress
+      ? addTask(ctx.progress, "speech", modelId)
+      : null;
+    if (taskId && ctx.progress) startTask(ctx.progress, taskId);
+
+    const { audio } = await ctx.generateSpeech({
+      model,
+      text,
+      voice: props.voice ?? "rachel",
+      cacheKey,
+    } as Parameters<typeof experimental_generateSpeech>[0]);
+
+    if (taskId && ctx.progress) completeTask(ctx.progress, taskId);

-  const { audio } = await generateSpeech({
-    model,
-    text,
-    voice: props.voice ?? "rachel",
-    cacheKey,
-  } as Parameters<typeof generateSpeech>[0]);
+    const mediaType =
+      (audio as { mediaType?: string }).mediaType ?? "audio/mpeg";

-  if (taskId && ctx.progress) completeTask(ctx.progress, taskId);
+    const file = File.fromGenerated({
+      uint8Array: audio.uint8Array,
+      mediaType,
+      url: (audio as { url?: string }).url,
+    }).withMetadata({
+      type: "speech",
+      model: modelId,
+      prompt: text,
+    });

-  const mediaType = (audio as { mediaType?: string }).mediaType ?? "audio/mpeg";
+    ctx.generatedFiles.push(file);

-  const file = File.fromGenerated({
-    uint8Array: audio.uint8Array,
-    mediaType,
-    url: (audio as { url?: string }).url,
-  }).withMetadata({
-    type: "speech",
-    model: modelId,
-    prompt: text,
-  });
+    return file;
+  })();

-  ctx.generatedFiles.push(file);
+  ctx.pendingFiles.set(cacheKeyStr, renderPromise);

-  return file;
+  return renderPromise;
 }
@@ -14,6 +14,7 @@ import { renderVideo } from "./video";
 const VIDEO_ONLY_LIPSYNC_MODELS = new Set([
   "sync-v2",
   "sync-v2-pro",
+  "sync-v3",
   "lipsync",
 ]);

@@ -93,13 +93,14 @@ function serializeValue(v: unknown): string {
   }
   // ResolvedElement (e.g. a speech segment used as Video audio input):
   // serialize by content identity (type + text + duration), not binary data.
+  // Deliberately excludes file.url — upload URLs contain Date.now() + Math.random()
+  // and would make downstream cache keys (e.g. VEED video) non-deterministic.
   if (v instanceof ResolvedElement) {
     const parts = [v.type];
     for (const child of v.children) {
       if (typeof child === "string") parts.push(child);
     }
     if (v.meta.duration) parts.push(String(v.meta.duration));
-    if (v.meta.file?.url) parts.push(v.meta.file.url);
    return `resolved(${parts.join(",")})`;
   }
   if (isVargElement(v)) {
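
The non-determinism this removes, in miniature (the upload URL shapes below are hypothetical; the resolved(...) format follows the code above):

    // Two renders of the same speech segment previously serialized differently:
    //   resolved(speech,hello world,2.4,https://storage.example/1718000000000-0.83.mp3)
    //   resolved(speech,hello world,2.4,https://storage.example/1718000412000-0.19.mp3)
    // Different strings → different downstream cache keys → a cache miss on every render.
    // Without file.url, only content identity remains: resolved(speech,hello world,2.4)
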
@@ -15,7 +15,7 @@ import {
   experimental_generateSpeech as generateSpeechAI,
 } from "ai";
 import { $ } from "bun";
-import { type CacheStorage, withCache } from "../ai-sdk/cache";
+import { type CacheStorage, depsToKey, withCache } from "../ai-sdk/cache";
 import { File } from "../ai-sdk/file";
 import { fileCache } from "../ai-sdk/file-cache";
 import { generateMusic as generateMusicRaw } from "../ai-sdk/generate-music";
@@ -116,6 +116,12 @@ function getCachedGenerateMusic() {
   return withCache(generateMusicRaw, { storage });
 }

+/** Get a cached generateSpeech wrapper using the active cache storage. */
+function getCachedGenerateSpeech() {
+  const storage = getActiveCache();
+  return withCache(generateSpeechAI, { storage });
+}
+
 // ---------------------------------------------------------------------------
 // Speech
 // ---------------------------------------------------------------------------
@@ -297,6 +303,77 @@ async function sliceAudio(
   return new Uint8Array(sliced);
 }

+// ---------------------------------------------------------------------------
+// Speech resolve-level cache: serialization helpers
+// ---------------------------------------------------------------------------
+
+/** Serializable representation of a speech segment for caching. */
+interface CachedSegment {
+  text: string;
+  start: number;
+  end: number;
+  duration: number;
+  props: Record<string, unknown>;
+  children: string[];
+  file: { uint8Array: Uint8Array; mediaType: string };
+  words?: WordTiming[];
+}
+
+/** Serializable representation of a full resolved speech for caching. */
+interface CachedSpeechResult {
+  file: { uint8Array: Uint8Array; mediaType: string };
+  duration: number;
+  words?: WordTiming[];
+  segments?: CachedSegment[];
+}
+
+/** Reconstruct a Segment (ResolvedElement<"speech"> + timing props) from cached data. */
+function reconstructSegment(
+  cached: CachedSegment,
+  storage?: import("../ai-sdk/storage/types").StorageProvider,
+): Segment {
+  const segmentFile = File.fromBuffer(
+    cached.file.uint8Array,
+    cached.file.mediaType,
+  );
+  const resolved = new ResolvedElement<"speech">(
+    { type: "speech", props: cached.props, children: cached.children },
+    {
+      file: segmentFile,
+      duration: cached.duration,
+      segments: [],
+      words: cached.words,
+    },
+  );
+  Object.defineProperties(resolved, {
+    text: { value: cached.text, enumerable: true },
+    start: { value: cached.start, enumerable: true },
+    end: { value: cached.end, enumerable: true },
+  });
+  return resolved as Segment;
+}
+
+/** Serialize a Segment into a cacheable plain object. */
+function serializeSegment(seg: Segment): CachedSegment {
+  return {
+    text: seg.text,
+    start: seg.start,
+    end: seg.end,
+    duration: seg.duration,
+    props: { ...seg.props },
+    children: seg.children.filter((c): c is string => typeof c === "string"),
+    file: {
+      uint8Array: (seg.meta.file as any)._data as Uint8Array,
+      mediaType: "audio/mpeg",
+    },
+    words: seg.meta.words,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// resolveSpeechElement — cached at the full-result level
+// ---------------------------------------------------------------------------
+
 /** Generate speech audio via the AI SDK and return a ResolvedElement with duration metadata. */
 export async function resolveSpeechElement(
   element: VargElement<"speech">,
@@ -324,12 +401,52 @@

   const cacheKey = computeCacheKey(element);

-  const { audio, ...rest } = await generateSpeechAI({
+  // ---- Check full-result cache (includes segments, words, duration) ----
+  const cache = getActiveCache();
+  const resolveKey = depsToKey("resolveSpeech", cacheKey);
+  const cached = (await cache.get(resolveKey)) as
+    | CachedSpeechResult
+    | undefined;
+
+  if (cached) {
+    const ctx = getResolveContext();
+    const file = File.fromGenerated({
+      uint8Array: cached.file.uint8Array,
+      mediaType: cached.file.mediaType,
+    }).withMetadata({
+      type: "speech",
+      model: typeof model === "string" ? model : model.modelId,
+      prompt: text,
+    });
+
+    // Upload reconstructed segment files to storage so downstream cache keys
+    // get stable URLs (instead of no URL at all).
+    const segments = cached.segments?.map((s) =>
+      reconstructSegment(s, ctx?.storage),
+    );
+    if (segments && ctx?.storage) {
+      await Promise.all(
+        segments.map((seg) => seg.meta.file.upload(ctx.storage!)),
+      );
+    }
+
+    return new ResolvedElement(element, {
+      file,
+      duration: cached.duration,
+      words: cached.words,
+      segments,
+    });
+  }
+
+  // ---- Cache miss: generate, probe, slice, then cache ----
+
+  const generateSpeech = getCachedGenerateSpeech();
+  const { audio, ...rest } = await generateSpeech({
     model,
     text,
     voice: props.voice ?? "rachel",
     cacheKey,
-  } as Parameters<typeof generateSpeechAI>[0]);
+  });

   const mediaType = (audio as { mediaType?: string }).mediaType ?? "audio/mpeg";
@@ -377,6 +494,15 @@
     }
   }

+  // ---- Write full result to cache ----
+  const toCache: CachedSpeechResult = {
+    file: { uint8Array: audio.uint8Array, mediaType },
+    duration,
+    words,
+    segments: segments?.map(serializeSegment),
+  };
+  await cache.set(resolveKey, toCache);
+
   return new ResolvedElement(element, {
     file,
     duration,
@@ -451,9 +577,11 @@ async function resolveImagePrompt(
   prompt: ImagePrompt,
 ): Promise<string | { text?: string; images: Uint8Array[] }> {
   if (typeof prompt === "string") return prompt;
-  const resolvedImages = await Promise.all(
-    prompt.images.map((img) => resolveImageInputForStandalone(img)),
-  );
+  const resolvedImages = prompt.images
+    ? await Promise.all(
+        prompt.images.map((img) => resolveImageInputForStandalone(img)),
+      )
+    : [];
   return { text: prompt.text, images: resolvedImages };
 }

@@ -129,7 +129,7 @@ export interface OverlayProps extends BaseProps, PositionProps, AudioProps {
 }

 export type ImageInput = Uint8Array | string | VargElement<"image">;
-export type ImagePrompt = string | { text?: string; images: ImageInput[] };
+export type ImagePrompt = string | { text?: string; images?: ImageInput[] };

 export interface ImageProps extends BaseProps, PositionProps {
   prompt?: ImagePrompt;
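
With images now optional, a text-only object prompt type-checks, and both resolvePrompt and resolveImagePrompt above fall back to an empty image list:

    // Both forms are valid ImagePrompt values after this change:
    const p1: ImagePrompt = "a red fox in the snow";
    const p2: ImagePrompt = { text: "a red fox in the snow" }; // images omitted, resolved as []
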
@@ -253,6 +253,8 @@ export interface CaptionsProps extends BaseProps {
   color?: string;
   activeColor?: string;
   fontSize?: number;
+  /** Number of words to display per subtitle line. When set with activeColor, enables karaoke-style highlighting where the active word is colored differently. */
+  wordsPerLine?: number;
   /** When src is a Speech element, include its audio track in the video. Defaults to false. */
   withAudio?: boolean;
 }
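
Combined with the renderCaptions change above, these props enable karaoke-style captions. A hedged usage sketch (only the fields shown in this diff are confirmed; required fields such as src are elided, hence Partial):

    // Three words per subtitle line; the word currently being spoken renders in orange.
    // renderCaptions() routes these values through convertSrtToAssGrouped().
    const karaoke: Partial<CaptionsProps> = {
      wordsPerLine: 3,
      activeColor: "#FF8C42", // converted via colorToAss() before ASS generation
      fontSize: 48,
    };
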
@@ -1,6 +1,10 @@
-import { generateImage } from "ai";
+import {
+  generateImage,
+  experimental_generateSpeech as generateSpeech,
+} from "ai";
 import { type CacheStorage, withCache } from "../ai-sdk/cache";
 import { fileCache } from "../ai-sdk/file-cache";
+import { generateMusic } from "../ai-sdk/generate-music";
 import { generateVideo } from "../ai-sdk/generate-video";
 import { localBackend } from "../ai-sdk/providers/editly";
 import type { RenderContext } from "../react/renderers/context";
@@ -49,6 +53,12 @@ export function createStepSession(
     generateVideo: cacheStorage
       ? withCache(generateVideo, { storage: cacheStorage })
       : generateVideo,
+    generateSpeech: cacheStorage
+      ? withCache(generateSpeech, { storage: cacheStorage })
+      : generateSpeech,
+    generateMusic: cacheStorage
+      ? withCache(generateMusic, { storage: cacheStorage })
+      : generateMusic,
     tempFiles: [],
     progress: createProgressTracker(false),
     pendingFiles: new Map(),