vargai 0.4.0-alpha102 → 0.4.0-alpha105
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/ai-sdk/providers/elevenlabs.ts +1 -1
- package/src/ai-sdk/providers/fal.ts +22 -11
- package/src/definitions/models/nano-banana-2.ts +6 -5
- package/src/providers/elevenlabs.ts +2 -1
- package/src/providers/fal.ts +6 -2
- package/src/react/renderers/captions.ts +119 -8
- package/src/react/renderers/context.ts +4 -1
- package/src/react/renderers/image.ts +3 -3
- package/src/react/renderers/music.ts +37 -59
- package/src/react/renderers/render.ts +16 -1
- package/src/react/renderers/speech.ts +39 -23
- package/src/react/renderers/talking-head.ts +1 -0
- package/src/react/renderers/utils.ts +2 -1
- package/src/react/resolve.ts +134 -6
- package/src/react/types.ts +3 -1
- package/src/studio/step-renderer.ts +11 -1
package/package.json
CHANGED

@@ -104,7 +104,7 @@
   "license": "Apache-2.0",
   "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
   "sideEffects": false,
-  "version": "0.4.0-alpha102",
+  "version": "0.4.0-alpha105",
   "exports": {
     ".": "./src/index.ts",
     "./ai": "./src/ai-sdk/index.ts",
package/src/ai-sdk/providers/elevenlabs.ts
CHANGED

@@ -89,7 +89,7 @@ class ElevenLabsMusicModel implements MusicModelV3 {
     const elevenLabsOptions = providerOptions?.elevenlabs ?? {};
     const audio = await this.client.music.compose({
       prompt,
-      musicLengthMs: duration ? duration * 1000 : undefined,
+      musicLengthMs: duration ? Math.round(duration * 1000) : undefined,
       modelId: this.modelId,
       ...elevenLabsOptions,
     } as Parameters<typeof this.client.music.compose>[0]);
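The rounding matters because duration is expressed in fractional seconds (it may come from probing an audio track), while musicLengthMs is evidently expected as an integer. A minimal illustration, using a hypothetical duration value:

    // duration in seconds, e.g. probed from an audio track (hypothetical value)
    const duration = 7.333;
    duration * 1000;             // ~7333, but as a float it need not be an exact integer
    Math.round(duration * 1000); // 7333, always a whole number of milliseconds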
package/src/ai-sdk/providers/fal.ts
CHANGED

@@ -174,6 +174,7 @@ const MOTION_CONTROL_MODELS: Record<string, string> = {
 const LIPSYNC_MODELS: Record<string, string> = {
   "sync-v2": "fal-ai/sync-lipsync",
   "sync-v2-pro": "fal-ai/sync-lipsync/v2",
+  "sync-v3": "fal-ai/sync-lipsync/v3",
   lipsync: "fal-ai/sync-lipsync",
   "omnihuman-v1.5": "fal-ai/bytedance/omnihuman/v1.5",
   "veed-fabric-1.0": "veed/fabric-1.0",

@@ -195,7 +196,7 @@ const IMAGE_MODELS: Record<string, string> = {
   "recraft-v3": "fal-ai/recraft/v3/text-to-image",
   "nano-banana-pro": "fal-ai/nano-banana-pro",
   "nano-banana-pro/edit": "fal-ai/nano-banana-pro/edit",
-  "nano-banana-2": "fal-ai/nano-banana-2
+  "nano-banana-2": "fal-ai/nano-banana-2",
   "nano-banana-2/edit": "fal-ai/nano-banana-2/edit",
   "seedream-v4.5/edit": "fal-ai/bytedance/seedream/v4.5/edit",
   // Qwen Image 2 - text-to-image and image-to-image editing (standard + pro)

@@ -923,13 +924,21 @@ class FalImageModel implements ImageModelV3 {
     }

     const hasFiles = files && files.length > 0;
-    const finalEndpoint = this.resolveEndpoint();

     let stableKey: string | undefined;
     if (hasFiles && files) {
       const fileHashes = await computeFileHashes(files);
+      const imageUrls = await pMap(files, fileToUrl, { concurrency: 2 });
+      // Reve uses singular image_url instead of image_urls array
+      if (SINGULAR_IMAGE_URL_MODELS.has(this.modelId)) {
+        input.image_url = imageUrls[0];
+      } else {
+        input.image_urls = imageUrls;
+      }
+      // Compute stable key after files are resolved
+      const finalEndpointForKey = this.resolveEndpoint(hasFiles);
       stableKey = JSON.stringify({
-        endpoint: finalEndpoint,
+        endpoint: finalEndpointForKey,
         prompt,
         n,
         size,

@@ -939,13 +948,6 @@ class FalImageModel implements ImageModelV3 {
         modelId: this.modelId,
         fileHashes,
       });
-      const imageUrls = await pMap(files, fileToUrl, { concurrency: 2 });
-      // Reve uses singular image_url instead of image_urls array
-      if (SINGULAR_IMAGE_URL_MODELS.has(this.modelId)) {
-        input.image_url = imageUrls[0];
-      } else {
-        input.image_urls = imageUrls;
-      }
     }

     if (isQwenAngles && !input.image_urls) {

@@ -961,6 +963,10 @@ class FalImageModel implements ImageModelV3 {
       }
     }

+    // Resolve endpoint after file processing so dual-endpoint models
+    // (e.g. nano-banana-2 vs nano-banana-2/edit) route correctly
+    const finalEndpoint = this.resolveEndpoint(hasFiles);
+
     const result = await executeWithQueueRecovery<{ data: unknown }>(
       finalEndpoint,
       input,

@@ -997,11 +1003,16 @@ class FalImageModel implements ImageModelV3 {
     };
   }

-  private resolveEndpoint(): string {
+  private resolveEndpoint(hasFiles?: boolean): string {
     if (this.modelId.startsWith("raw:")) {
       return this.modelId.slice(4);
     }

+    // Nano Banana 2: route to /edit when images are provided, base endpoint for t2i
+    if (this.modelId === "nano-banana-2" && hasFiles) {
+      return "fal-ai/nano-banana-2/edit";
+    }
+
     return IMAGE_MODELS[this.modelId] ?? this.modelId;
   }
 }
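The thread running through these hunks is that endpoint resolution now depends on whether input images are present, so it is deferred until after file processing, both for the request itself and for the stable cache key. Condensed to its essentials, the routing introduced here looks like this (a simplified sketch of the diff's logic, not the full class):

    // Simplified sketch of the dual-endpoint routing added above.
    const IMAGE_MODELS: Record<string, string> = {
      "nano-banana-2": "fal-ai/nano-banana-2",
      "nano-banana-2/edit": "fal-ai/nano-banana-2/edit",
    };

    function resolveEndpoint(modelId: string, hasFiles?: boolean): string {
      if (modelId.startsWith("raw:")) return modelId.slice(4); // raw endpoint escape hatch
      // One alias, two endpoints: the presence of input files decides the route.
      if (modelId === "nano-banana-2" && hasFiles) {
        return "fal-ai/nano-banana-2/edit";
      }
      return IMAGE_MODELS[modelId] ?? modelId;
    }

    resolveEndpoint("nano-banana-2");       // "fal-ai/nano-banana-2" (text-to-image)
    resolveEndpoint("nano-banana-2", true); // "fal-ai/nano-banana-2/edit"

Keying the stable cache key on the post-routing endpoint also means a text-to-image result and an edit result for the same prompt can no longer collide in the cache.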
package/src/definitions/models/nano-banana-2.ts
CHANGED

@@ -1,6 +1,6 @@
 /**
- * Nano Banana 2 image
- *
+ * Nano Banana 2 image model (Google's next-gen image generation/editing)
+ * Supports both text-to-image (no images) and image editing (with image_urls)
  */

 import { z } from "zod";

@@ -35,8 +35,9 @@ const nanoBanana2InputSchema = z.object({
   prompt: z.string().describe("Text description for image editing"),
   image_urls: z
     .array(z.string().url())
+    .optional()
     .describe(
-      "Input image URLs for image
+      "Input image URLs for image editing. When provided, routes to the /edit endpoint. Omit for text-to-image generation.",
     ),
   resolution: nanoBanana2ResolutionSchema
     .default("1K")

@@ -103,11 +104,11 @@ export const definition: ModelDefinition<typeof schema> = {
   type: "model",
   name: "nano-banana-2",
   description:
-    "Google Nano Banana 2 - next-gen image editing model.
+    "Google Nano Banana 2 - next-gen image generation and editing model. Supports text-to-image and image editing (with image_urls).",
   providers: ["fal"],
   defaultProvider: "fal",
   providerModels: {
-    fal: "fal-ai/nano-banana-2
+    fal: "fal-ai/nano-banana-2",
   },
   schema,
 };
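Making image_urls optional is what lets one input schema serve both modes; validation then accepts either shape. A minimal sketch of the behavior with zod:

    import { z } from "zod";

    const input = z.object({
      prompt: z.string(),
      image_urls: z.array(z.string().url()).optional(),
    });

    input.parse({ prompt: "a banana wearing a hat" }); // ok: text-to-image, no images
    input.parse({
      prompt: "add sunglasses",
      image_urls: ["https://example.com/banana.png"], // ok: edit mode (hypothetical URL)
    });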
package/src/providers/elevenlabs.ts
CHANGED

@@ -117,7 +117,8 @@ export class ElevenLabsProvider extends BaseProvider {

     const audio = await this.client.music.compose({
       prompt,
-      musicLengthMs,
+      musicLengthMs:
+        musicLengthMs != null ? Math.round(musicLengthMs) : undefined,
       modelId: "music_v1",
     });

package/src/providers/fal.ts
CHANGED

@@ -54,9 +54,13 @@ export class FalProvider extends BaseProvider {
         return "fal-ai/nano-banana-pro/edit";
       }
     }
-    // Nano Banana 2:
+    // Nano Banana 2: route to /edit when image_urls are provided, otherwise use base t2i endpoint
     if (model === "fal-ai/nano-banana-2") {
-
+      const imageUrls = inputs.image_urls as string[] | undefined;
+      if (imageUrls && imageUrls.length > 0) {
+        return "fal-ai/nano-banana-2/edit";
+      }
+      return "fal-ai/nano-banana-2";
     }
     // Qwen Image 2: route to /edit endpoint when image_urls are provided
     if (model === "fal-ai/qwen-image-2/text-to-image") {
package/src/react/renderers/captions.ts
CHANGED

@@ -156,12 +156,17 @@ function parseSrt(content: string): SrtEntry[] {
   return entries;
 }

+/**
+ * Format seconds to ASS timestamp `H:MM:SS.CC`.
+ * Computes from total centiseconds to avoid overflow when rounding
+ * lands on 100 cs (e.g. 1.999s would otherwise produce `0:00:01.100`).
+ */
 function formatAssTime(seconds: number): string {
-  const
-  const
-  const
-  const
-
+  const totalCs = Math.max(0, Math.round(seconds * 100));
+  const h = Math.floor(totalCs / 360000);
+  const m = Math.floor((totalCs % 360000) / 6000);
+  const s = Math.floor((totalCs % 6000) / 100);
+  const cs = totalCs % 100;
   return `${h}:${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")}.${String(cs).padStart(2, "0")}`;
 }

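The edge case named in the doc comment is worth spelling out. Rounding the fields independently, an input just below a whole second pushes the centisecond field out of range; rounding one centisecond total first cannot overflow:

    // seconds = 1.999
    // Per-field rounding: s = Math.floor(1.999) = 1, cs = Math.round(0.999 * 100) = 100
    //   → "0:00:01.100", an invalid timestamp (cs must stay in 0–99).
    // Total-centisecond approach from the diff:
    const totalCs = Math.round(1.999 * 100);      // 200
    const s = Math.floor((totalCs % 6000) / 100); // 2
    const cs = totalCs % 100;                     // 0  → "0:00:02.00"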
@@ -190,9 +195,15 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text

   const entries = parseSrt(srtContent);
   const assDialogues = entries
-    .map((entry) => {
+    .map((entry, i) => {
       const start = formatAssTime(entry.start);
-
+      // Clamp end to next entry's start to prevent overlapping subtitles
+      // (transcription engines often produce overlapping word timestamps)
+      const nextStart =
+        i < entries.length - 1 ? entries[i + 1]!.start : undefined;
+      const clampedEnd =
+        nextStart !== undefined ? Math.min(entry.end, nextStart) : entry.end;
+      const end = formatAssTime(clampedEnd);
       const text = entry.text.replace(/\n/g, "\\N");
       return `Dialogue: 0,${start},${end},Default,,0,0,0,,${text}`;
     })
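A quick illustration of why the clamp is needed (timestamps hypothetical):

    // Two consecutive SRT entries with overlapping word timings:
    //   { text: "Hello", start: 0.5, end: 1.2 }
    //   { text: "world", start: 1.0, end: 1.5 }
    // Unclamped, both words are on screen from 1.0s to 1.2s.
    // Clamped: "Hello" ends at Math.min(1.2, 1.0) = 1.0, exactly when "world" appears.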
@@ -201,6 +212,93 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
   return assHeader + assDialogues;
 }

+/**
+ * Generates ASS subtitle content with grouped words and active-word highlighting.
+ *
+ * Groups words into chunks of `wordsPerLine`. For each group, generates one
+ * Dialogue event per word timing where the currently-spoken word is colored
+ * with `activeColor` and the rest use the base `primaryColor`.
+ *
+ * Example output for group ["Varg", "AI", "is"] with activeColor orange:
+ *   t=0.5-0.8: {\c&H428CFF&}Varg{\c&HFFFFFF&} AI is
+ *   t=0.8-1.0: Varg {\c&H428CFF&}AI{\c&HFFFFFF&} is
+ *   t=1.0-1.3: Varg AI {\c&H428CFF&}is{\c&HFFFFFF&}
+ */
+function convertSrtToAssGrouped(
+  srtContent: string,
+  style: SubtitleStyle,
+  width: number,
+  height: number,
+  wordsPerLine: number,
+  activeColor?: string,
+): string {
+  const assHeader = `[Script Info]
+Title: Generated Subtitles
+ScriptType: v4.00+
+PlayResX: ${width}
+PlayResY: ${height}
+WrapStyle: 0
+ScaledBorderAndShadow: yes
+YCbCr Matrix: TV.601
+
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+Style: Default,${style.fontName},${style.fontSize},${style.primaryColor},&H000000FF,${style.outlineColor},${style.backColor},${style.bold ? -1 : 0},0,0,0,100,100,0,0,1,${style.outline},${style.shadow},${style.alignment},10,10,${style.marginV},1
+
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+`;
+
+  const entries = parseSrt(srtContent);
+  const dialogues: string[] = [];
+  const baseColor = style.primaryColor;
+  const highlightColor = activeColor ?? baseColor;
+
+  // Group entries into chunks of wordsPerLine
+  for (let gi = 0; gi < entries.length; gi += wordsPerLine) {
+    const group = entries.slice(gi, gi + wordsPerLine);
+    const groupStart = group[0]!.start;
+    // Cap group end at next group's start to prevent two groups showing simultaneously
+    const nextGroupStart =
+      gi + wordsPerLine < entries.length
+        ? entries[gi + wordsPerLine]!.start
+        : undefined;
+    const groupEnd = nextGroupStart ?? group[group.length - 1]!.end;
+
+    if (!activeColor) {
+      // No highlight — show entire group as one event
+      const text = group.map((e) => e.text.replace(/\n/g, " ")).join(" ");
+      dialogues.push(
+        `Dialogue: 0,${formatAssTime(groupStart)},${formatAssTime(groupEnd)},Default,,0,0,0,,${text}`,
+      );
+    } else {
+      // Karaoke highlight — one dialogue event per word, shifting the highlight
+      for (let wi = 0; wi < group.length; wi++) {
+        const wordEntry = group[wi]!;
+        const wordStart = wordEntry.start;
+        // Word ends at next word's start (within group), or at group end
+        const wordEnd = wi < group.length - 1 ? group[wi + 1]!.start : groupEnd;
+
+        // Build the text line with ASS color overrides
+        const parts = group.map((entry, idx) => {
+          const word = entry.text.replace(/\n/g, " ").trim();
+          if (idx === wi) {
+            // Active word — use highlight color
+            return `{\\c${highlightColor}}${word}{\\c${baseColor}}`;
+          }
+          return word;
+        });
+
+        dialogues.push(
+          `Dialogue: 0,${formatAssTime(wordStart)},${formatAssTime(wordEnd)},Default,,0,0,0,,${parts.join(" ")}`,
+        );
+      }
+    }
+  }
+
+  return assHeader + dialogues.join("\n");
+}
+
 const POSITION_ALIGNMENT: Record<string, number> = {
   top: 8,
   center: 5,
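One detail from the doc comment above: ASS \c overrides take colors in &HBBGGRR& (blue-green-red) byte order, which is why orange #FF8C42 appears as &H428CFF&. The colorToAss helper used by renderCaptions below presumably performs this reversal; a minimal sketch of the conversion, assuming #RRGGBB input:

    // Hypothetical sketch; the package's real colorToAss may differ (e.g. alpha handling).
    function hexToAssColor(hex: string): string {
      const r = hex.slice(1, 3);
      const g = hex.slice(3, 5);
      const b = hex.slice(5, 7);
      return `&H${b}${g}${r}&`.toUpperCase(); // ASS stores BGR, not RGB
    }

    hexToAssColor("#FF8C42"); // "&H428CFF&", the orange used in the example above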
@@ -363,7 +461,20 @@ export async function renderCaptions(
     marginV: props.position === "center" ? 0 : baseStyle.marginV,
   };

-  const
+  const activeColorAss = props.activeColor
+    ? colorToAss(props.activeColor)
+    : undefined;
+
+  const assContent = props.wordsPerLine
+    ? convertSrtToAssGrouped(
+        srtContent,
+        style,
+        ctx.width,
+        ctx.height,
+        props.wordsPerLine,
+        activeColorAss,
+      )
+    : convertSrtToAss(srtContent, style, ctx.width, ctx.height);
   const assPath = `/tmp/varg-captions-${Date.now()}.ass`;
   writeFileSync(assPath, assContent);
   ctx.tempFiles.push(assPath);
package/src/react/renderers/context.ts
CHANGED

@@ -1,6 +1,7 @@
-import type { generateImage } from "ai";
+import type { experimental_generateSpeech, generateImage } from "ai";
 import type { CacheStorage } from "../../ai-sdk/cache";
 import type { File } from "../../ai-sdk/file";
+import type { generateMusic } from "../../ai-sdk/generate-music";
 import type { generateVideo } from "../../ai-sdk/generate-video";
 import type { FFmpegBackend } from "../../ai-sdk/providers/editly/backends";
 import type { StorageProvider } from "../../ai-sdk/storage/types";

@@ -15,6 +16,8 @@ export interface RenderContext {
   storage?: StorageProvider;
   generateImage: typeof generateImage;
   generateVideo: typeof generateVideo;
+  generateSpeech: typeof experimental_generateSpeech;
+  generateMusic: typeof generateMusic;
   tempFiles: string[];
   progress?: ProgressTracker;
   pendingFiles: Map<string, Promise<File>>;
package/src/react/renderers/image.ts
CHANGED

@@ -37,9 +37,9 @@ async function resolvePrompt(
   if (typeof prompt === "string") {
     return prompt;
   }
-  const resolvedImages =
-    prompt.images.map((img) => resolveImageInput(img, ctx))
-
+  const resolvedImages = prompt.images
+    ? await Promise.all(prompt.images.map((img) => resolveImageInput(img, ctx)))
+    : [];
   return { text: prompt.text, images: resolvedImages };
 }
package/src/react/renderers/music.ts
CHANGED

@@ -1,9 +1,10 @@
 import { File } from "../../ai-sdk/file";
-import { generateMusic } from "../../ai-sdk/generate-music";
+import type { generateMusic } from "../../ai-sdk/generate-music";
 import { ResolvedElement } from "../resolved-element";
 import type { MusicProps, VargElement } from "../types";
 import type { RenderContext } from "./context";
 import { addTask, completeTask, startTask } from "./progress";
+import { computeCacheKey } from "./utils";

 export async function renderMusic(
   element: VargElement<"music">,

@@ -23,73 +24,50 @@ export async function renderMusic(
     throw new Error("Music requires prompt and model (or set defaults.music)");
   }

-  const cacheKey =
-
-    prompt,
-    model: model.modelId,
-    duration: props.duration,
-  });
+  const cacheKey = computeCacheKey(element);
+  const cacheKeyStr = JSON.stringify(cacheKey);

-
-  const
+  // Deduplicate concurrent renders of the same music element
+  const pendingRender = ctx.pendingFiles.get(cacheKeyStr);
+  if (pendingRender) {
+    return pendingRender;
+  }
+
+  const renderPromise = (async () => {
+    const modelId = model.modelId ?? "music";
+    const taskId = ctx.progress
+      ? addTask(ctx.progress, "music", modelId)
+      : null;
+    if (taskId && ctx.progress) startTask(ctx.progress, taskId);

-
-  const result = await generateMusic({
+    const { audio } = await ctx.generateMusic({
      model,
      prompt,
      duration: props.duration,
-
-
-  };
-
-  let audio: { uint8Array: Uint8Array; url?: string; mediaType?: string };
+      cacheKey,
+    } as Parameters<typeof generateMusic>[0]);

-  if (ctx.cache) {
-    const cached = await ctx.cache.get(cacheKey);
-    if (cached) {
-      const cachedAudio = cached as {
-        uint8Array: Uint8Array;
-        url?: string;
-        mediaType?: string;
-      };
-      audio = {
-        uint8Array: cachedAudio.uint8Array,
-        url: cachedAudio.url,
-        mediaType: cachedAudio.mediaType,
-      };
-      if (taskId && ctx.progress) {
-        startTask(ctx.progress, taskId);
-        completeTask(ctx.progress, taskId);
-      }
-    } else {
-      if (taskId && ctx.progress) startTask(ctx.progress, taskId);
-      audio = await generateFn();
-      if (taskId && ctx.progress) completeTask(ctx.progress, taskId);
-      await ctx.cache.set(cacheKey, {
-        uint8Array: audio.uint8Array,
-        url: audio.url,
-        mediaType: audio.mediaType,
-      });
-    }
-  } else {
-    if (taskId && ctx.progress) startTask(ctx.progress, taskId);
-    audio = await generateFn();
     if (taskId && ctx.progress) completeTask(ctx.progress, taskId);
-  }

-
+    const mediaType =
+      (audio as { mediaType?: string }).mediaType ?? "audio/mpeg";
+
+    const file = File.fromGenerated({
+      uint8Array: audio.uint8Array,
+      mediaType,
+      url: (audio as { url?: string }).url,
+    }).withMetadata({
+      type: "music",
+      model: modelId,
+      prompt,
+    });
+
+    ctx.generatedFiles.push(file);

-
-
-    mediaType,
-    url: audio.url,
-  }).withMetadata({
-    type: "music",
-    model: modelId,
-    prompt,
-  });
+    return file;
+  })();

-  ctx.
+  ctx.pendingFiles.set(cacheKeyStr, renderPromise);

-  return
+  return renderPromise;
 }
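Both this renderer and the speech renderer below now share the same in-flight deduplication pattern: the render promise is stored in ctx.pendingFiles under the stringified cache key, so concurrent renders of an identical element await a single generation. The pattern in isolation (a generic sketch, not the package's API):

    // Promise memoization: at most one in-flight computation per key.
    const pending = new Map<string, Promise<unknown>>();

    function dedupe<T>(key: string, compute: () => Promise<T>): Promise<T> {
      const existing = pending.get(key);
      if (existing) return existing as Promise<T>;
      const promise = compute(); // started exactly once per key
      pending.set(key, promise);
      return promise;
    }

    // Concurrent callers with the same key share one promise:
    const generate = async () => new Uint8Array(); // stand-in for a real generation call
    const a = dedupe("music:epic-intro", generate);
    const b = dedupe("music:epic-intro", generate); // b === a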
package/src/react/renderers/render.ts
CHANGED

@@ -1,9 +1,14 @@
 import type { ImageModelV3 } from "@ai-sdk/provider";
-import {
+import {
+  generateImage,
+  experimental_generateSpeech as generateSpeech,
+  wrapImageModel,
+} from "ai";
 import pMap from "p-map";
 import { type CacheStorage, withCache } from "../../ai-sdk/cache";
 import type { File, File as VargFile } from "../../ai-sdk/file";
 import { fileCache } from "../../ai-sdk/file-cache";
+import { generateMusic } from "../../ai-sdk/generate-music";
 import { generateVideo } from "../../ai-sdk/generate-video";
 import {
   imagePlaceholderFallbackMiddleware,

@@ -109,6 +114,14 @@ export async function renderRoot(
     ? withCache(generateVideo, { storage: cacheStorage })
     : generateVideo;

+  const cachedGenerateSpeech = cacheStorage
+    ? withCache(generateSpeech, { storage: cacheStorage })
+    : generateSpeech;
+
+  const cachedGenerateMusic = cacheStorage
+    ? withCache(generateMusic, { storage: cacheStorage })
+    : generateMusic;
+
   const wrapGenerateImage: typeof generateImage = async (opts) => {
     if (mode === "preview") {
       trackPlaceholder("image");

@@ -158,6 +171,8 @@ export async function renderRoot(
     storage: options.storage,
     generateImage: wrapGenerateImage,
     generateVideo: wrapGenerateVideo,
+    generateSpeech: cachedGenerateSpeech,
+    generateMusic: cachedGenerateMusic,
     tempFiles,
     progress,
     pendingFiles: new Map<string, Promise<File>>(),
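The wiring above decorates the raw generate functions once, at context construction, so every renderer that pulls generateSpeech or generateMusic off the RenderContext gets caching for free. What a withCache-style decorator plausibly does (a sketch; the package's real implementation in ai-sdk/cache may differ):

    // Sketch of a read-through cache decorator over an async generate function.
    type GenerateFn<O, R> = (opts: O & { cacheKey?: unknown }) => Promise<R>;

    function withCacheSketch<O, R>(
      fn: GenerateFn<O, R>,
      { storage }: { storage: Map<string, R> },
    ): GenerateFn<O, R> {
      return async (opts) => {
        const key = JSON.stringify(opts.cacheKey ?? opts);
        const hit = storage.get(key);
        if (hit !== undefined) return hit; // cache hit: skip generation entirely
        const result = await fn(opts);
        storage.set(key, result);
        return result;
      };
    }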
package/src/react/renderers/speech.ts
CHANGED

@@ -1,4 +1,4 @@
-import { experimental_generateSpeech
+import type { experimental_generateSpeech } from "ai";
 import { File } from "../../ai-sdk/file";
 import { ResolvedElement } from "../resolved-element";
 import type { SpeechProps, VargElement } from "../types";

@@ -29,33 +29,49 @@ export async function renderSpeech(
   }

   const cacheKey = computeCacheKey(element);
+  const cacheKeyStr = JSON.stringify(cacheKey);

-
-  const
-  if (
+  // Deduplicate concurrent renders of the same speech element
+  const pendingRender = ctx.pendingFiles.get(cacheKeyStr);
+  if (pendingRender) {
+    return pendingRender;
+  }
+
+  const renderPromise = (async () => {
+    const modelId = typeof model === "string" ? model : model.modelId;
+    const taskId = ctx.progress
+      ? addTask(ctx.progress, "speech", modelId)
+      : null;
+    if (taskId && ctx.progress) startTask(ctx.progress, taskId);
+
+    const { audio } = await ctx.generateSpeech({
+      model,
+      text,
+      voice: props.voice ?? "rachel",
+      cacheKey,
+    } as Parameters<typeof experimental_generateSpeech>[0]);
+
+    if (taskId && ctx.progress) completeTask(ctx.progress, taskId);

-
-
-    text,
-    voice: props.voice ?? "rachel",
-    cacheKey,
-  } as Parameters<typeof generateSpeech>[0]);
+    const mediaType =
+      (audio as { mediaType?: string }).mediaType ?? "audio/mpeg";

-
+    const file = File.fromGenerated({
+      uint8Array: audio.uint8Array,
+      mediaType,
+      url: (audio as { url?: string }).url,
+    }).withMetadata({
+      type: "speech",
+      model: modelId,
+      prompt: text,
+    });

-
+    ctx.generatedFiles.push(file);

-
-
-    mediaType,
-    url: (audio as { url?: string }).url,
-  }).withMetadata({
-    type: "speech",
-    model: modelId,
-    prompt: text,
-  });
+    return file;
+  })();

-  ctx.
+  ctx.pendingFiles.set(cacheKeyStr, renderPromise);

-  return
+  return renderPromise;
 }
package/src/react/renderers/utils.ts
CHANGED

@@ -93,13 +93,14 @@ function serializeValue(v: unknown): string {
   }
   // ResolvedElement (e.g. a speech segment used as Video audio input):
   // serialize by content identity (type + text + duration), not binary data.
+  // Deliberately excludes file.url — upload URLs contain Date.now() + Math.random()
+  // and would make downstream cache keys (e.g. VEED video) non-deterministic.
   if (v instanceof ResolvedElement) {
     const parts = [v.type];
     for (const child of v.children) {
       if (typeof child === "string") parts.push(child);
     }
     if (v.meta.duration) parts.push(String(v.meta.duration));
-    if (v.meta.file?.url) parts.push(v.meta.file.url);
     return `resolved(${parts.join(",")})`;
   }
   if (isVargElement(v)) {
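The removed line is the actual fix: upload URLs embed Date.now() and Math.random(), so two renders of byte-identical audio serialized to different keys and every downstream cache lookup missed. An illustration (the URL shape is hypothetical; any per-upload randomness behaves the same):

    // Two uploads of the same audio get different URLs:
    const urlA = `https://storage.example/speech-${Date.now()}-${Math.random()}.mp3`;
    const urlB = `https://storage.example/speech-${Date.now()}-${Math.random()}.mp3`;

    // Keyed on the URL, identical content produces different cache keys:
    `resolved(speech,hello,2.1,${urlA})` === `resolved(speech,hello,2.1,${urlB})`; // false

    // Keyed on content identity only, the key is stable across renders:
    "resolved(speech,hello,2.1)"; // same every time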
package/src/react/resolve.ts
CHANGED

@@ -15,7 +15,7 @@ import {
   experimental_generateSpeech as generateSpeechAI,
 } from "ai";
 import { $ } from "bun";
-import { type CacheStorage, withCache } from "../ai-sdk/cache";
+import { type CacheStorage, depsToKey, withCache } from "../ai-sdk/cache";
 import { File } from "../ai-sdk/file";
 import { fileCache } from "../ai-sdk/file-cache";
 import { generateMusic as generateMusicRaw } from "../ai-sdk/generate-music";

@@ -116,6 +116,12 @@ function getCachedGenerateMusic() {
   return withCache(generateMusicRaw, { storage });
 }

+/** Get a cached generateSpeech wrapper using the active cache storage. */
+function getCachedGenerateSpeech() {
+  const storage = getActiveCache();
+  return withCache(generateSpeechAI, { storage });
+}
+
 // ---------------------------------------------------------------------------
 // Speech
 // ---------------------------------------------------------------------------

@@ -297,6 +303,77 @@ async function sliceAudio(
   return new Uint8Array(sliced);
 }

+// ---------------------------------------------------------------------------
+// Speech resolve-level cache: serialization helpers
+// ---------------------------------------------------------------------------
+
+/** Serializable representation of a speech segment for caching. */
+interface CachedSegment {
+  text: string;
+  start: number;
+  end: number;
+  duration: number;
+  props: Record<string, unknown>;
+  children: string[];
+  file: { uint8Array: Uint8Array; mediaType: string };
+  words?: WordTiming[];
+}
+
+/** Serializable representation of a full resolved speech for caching. */
+interface CachedSpeechResult {
+  file: { uint8Array: Uint8Array; mediaType: string };
+  duration: number;
+  words?: WordTiming[];
+  segments?: CachedSegment[];
+}
+
+/** Reconstruct a Segment (ResolvedElement<"speech"> + timing props) from cached data. */
+function reconstructSegment(
+  cached: CachedSegment,
+  storage?: import("../ai-sdk/storage/types").StorageProvider,
+): Segment {
+  const segmentFile = File.fromBuffer(
+    cached.file.uint8Array,
+    cached.file.mediaType,
+  );
+  const resolved = new ResolvedElement<"speech">(
+    { type: "speech", props: cached.props, children: cached.children },
+    {
+      file: segmentFile,
+      duration: cached.duration,
+      segments: [],
+      words: cached.words,
+    },
+  );
+  Object.defineProperties(resolved, {
+    text: { value: cached.text, enumerable: true },
+    start: { value: cached.start, enumerable: true },
+    end: { value: cached.end, enumerable: true },
+  });
+  return resolved as Segment;
+}
+
+/** Serialize a Segment into a cacheable plain object. */
+function serializeSegment(seg: Segment): CachedSegment {
+  return {
+    text: seg.text,
+    start: seg.start,
+    end: seg.end,
+    duration: seg.duration,
+    props: { ...seg.props },
+    children: seg.children.filter((c): c is string => typeof c === "string"),
+    file: {
+      uint8Array: (seg.meta.file as any)._data as Uint8Array,
+      mediaType: "audio/mpeg",
+    },
+    words: seg.meta.words,
+  };
+}
+
+// ---------------------------------------------------------------------------
+// resolveSpeechElement — cached at the full-result level
+// ---------------------------------------------------------------------------
+
 /** Generate speech audio via the AI SDK and return a ResolvedElement with duration metadata. */
 export async function resolveSpeechElement(
   element: VargElement<"speech">,

@@ -324,12 +401,52 @@ export async function resolveSpeechElement(

   const cacheKey = computeCacheKey(element);

-
+  // ---- Check full-result cache (includes segments, words, duration) ----
+  const cache = getActiveCache();
+  const resolveKey = depsToKey("resolveSpeech", cacheKey);
+  const cached = (await cache.get(resolveKey)) as
+    | CachedSpeechResult
+    | undefined;
+
+  if (cached) {
+    const ctx = getResolveContext();
+    const file = File.fromGenerated({
+      uint8Array: cached.file.uint8Array,
+      mediaType: cached.file.mediaType,
+    }).withMetadata({
+      type: "speech",
+      model: typeof model === "string" ? model : model.modelId,
+      prompt: text,
+    });
+
+    // Upload reconstructed segment files to storage so downstream cache keys
+    // get stable URLs (instead of no URL at all).
+    const segments = cached.segments?.map((s) =>
+      reconstructSegment(s, ctx?.storage),
+    );
+    if (segments && ctx?.storage) {
+      await Promise.all(
+        segments.map((seg) => seg.meta.file.upload(ctx.storage!)),
+      );
+    }
+
+    return new ResolvedElement(element, {
+      file,
+      duration: cached.duration,
+      words: cached.words,
+      segments,
+    });
+  }
+
+  // ---- Cache miss: generate, probe, slice, then cache ----
+
+  const generateSpeech = getCachedGenerateSpeech();
+  const { audio, ...rest } = await generateSpeech({
     model,
     text,
     voice: props.voice ?? "rachel",
     cacheKey,
-  }
+  });

   const mediaType = (audio as { mediaType?: string }).mediaType ?? "audio/mpeg";

@@ -377,6 +494,15 @@ export async function resolveSpeechElement(
     }
   }

+  // ---- Write full result to cache ----
+  const toCache: CachedSpeechResult = {
+    file: { uint8Array: audio.uint8Array, mediaType },
+    duration,
+    words,
+    segments: segments?.map(serializeSegment),
+  };
+  await cache.set(resolveKey, toCache);
+
   return new ResolvedElement(element, {
     file,
     duration,

@@ -451,9 +577,11 @@ async function resolveImagePrompt(
   prompt: ImagePrompt,
 ): Promise<string | { text?: string; images: Uint8Array[] }> {
   if (typeof prompt === "string") return prompt;
-  const resolvedImages =
-
-
+  const resolvedImages = prompt.images
+    ? await Promise.all(
+        prompt.images.map((img) => resolveImageInputForStandalone(img)),
+      )
+    : [];
   return { text: prompt.text, images: resolvedImages };
 }
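Stripped of the speech-specific serialization, the new control flow in resolveSpeechElement is a read-through cache: derive a deterministic key, return the reconstructed bundle on a hit, otherwise do the expensive work and persist the whole result. The skeleton (schematic; only depsToKey and getActiveCache are the package's real helpers):

    // Schematic shape of the full-result caching added above.
    async function getOrCompute<T>(
      cache: { get(k: string): Promise<unknown>; set(k: string, v: T): Promise<void> },
      key: string,
      compute: () => Promise<T>, // generate + probe duration + slice segments
    ): Promise<T> {
      const hit = (await cache.get(key)) as T | undefined;
      if (hit) return hit;            // reconstruct and return the cached bundle
      const result = await compute(); // expensive path, runs once per key
      await cache.set(key, result);   // persist the complete result, not just audio
      return result;
    }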
package/src/react/types.ts
CHANGED

@@ -129,7 +129,7 @@ export interface OverlayProps extends BaseProps, PositionProps, AudioProps {
 }

 export type ImageInput = Uint8Array | string | VargElement<"image">;
-export type ImagePrompt = string | { text?: string; images
+export type ImagePrompt = string | { text?: string; images?: ImageInput[] };

 export interface ImageProps extends BaseProps, PositionProps {
   prompt?: ImagePrompt;

@@ -253,6 +253,8 @@ export interface CaptionsProps extends BaseProps {
   color?: string;
   activeColor?: string;
   fontSize?: number;
+  /** Number of words to display per subtitle line. When set with activeColor, enables karaoke-style highlighting where the active word is colored differently. */
+  wordsPerLine?: number;
   /** When src is a Speech element, include its audio track in the video. Defaults to false. */
   withAudio?: boolean;
 }
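Hypothetical usage of the new prop (the element and surrounding props are illustrative of this package's JSX API): with wordsPerLine set, captions render in word groups, and adding activeColor turns on the karaoke-style highlight implemented in captions.ts above.

    // Hypothetical example: three words per line, spoken word highlighted in orange.
    <Captions
      src={speech}
      wordsPerLine={3}
      activeColor="#FF8C42"
      color="#FFFFFF"
    />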
package/src/studio/step-renderer.ts
CHANGED

@@ -1,6 +1,10 @@
-import {
+import {
+  generateImage,
+  experimental_generateSpeech as generateSpeech,
+} from "ai";
 import { type CacheStorage, withCache } from "../ai-sdk/cache";
 import { fileCache } from "../ai-sdk/file-cache";
+import { generateMusic } from "../ai-sdk/generate-music";
 import { generateVideo } from "../ai-sdk/generate-video";
 import { localBackend } from "../ai-sdk/providers/editly";
 import type { RenderContext } from "../react/renderers/context";

@@ -49,6 +53,12 @@ export function createStepSession(
     generateVideo: cacheStorage
       ? withCache(generateVideo, { storage: cacheStorage })
       : generateVideo,
+    generateSpeech: cacheStorage
+      ? withCache(generateSpeech, { storage: cacheStorage })
+      : generateSpeech,
+    generateMusic: cacheStorage
+      ? withCache(generateMusic, { storage: cacheStorage })
+      : generateMusic,
     tempFiles: [],
     progress: createProgressTracker(false),
     pendingFiles: new Map(),