vargai 0.4.0-alpha73 → 0.4.0-alpha75
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -3
- package/docs/react.md +3 -0
- package/launch-videos/07-ugc-weight-loss.tsx +6 -1
- package/launch-videos/08-talking-head-varg.tsx +1 -1
- package/package.json +1 -1
- package/skills/varg-video-generation/SKILL.md +1 -1
- package/src/ai-sdk/providers/elevenlabs.ts +84 -25
- package/src/cli/commands/init.tsx +3 -3
- package/src/cli/commands/storyboard.tsx +1 -0
- package/src/core/schema/shared.ts +24 -1
- package/src/definitions/actions/voice.ts +7 -4
- package/src/providers/elevenlabs.ts +5 -1
- package/src/react/async-elements.test.ts +3 -0
- package/src/react/examples/async/example_we_want_to_test.tsx +1 -1
- package/src/react/examples/async/simple.tsx +1 -1
- package/src/react/examples/async/speech-segments-voiceover.tsx +116 -0
- package/src/react/examples/async/speech-segments.tsx +109 -0
- package/src/react/examples/async/talking-head.tsx +2 -2
- package/src/react/examples/captions-demo.tsx +1 -1
- package/src/react/index.ts +1 -0
- package/src/react/renderers/captions.ts +51 -33
- package/src/react/renderers/resolve-lazy.ts +8 -0
- package/src/react/resolve.ts +182 -2
- package/src/react/resolved-element.ts +36 -9
- package/src/react/types.ts +36 -1
- package/src/speech/index.ts +9 -0
- package/src/speech/map-segments.test.ts +178 -0
- package/src/speech/map-segments.ts +155 -0
- package/src/speech/parse-alignment.test.ts +122 -0
- package/src/speech/parse-alignment.ts +78 -0
- package/src/speech/types.ts +76 -0
- package/test-nano-banana.ts +4 -1
package/README.md
CHANGED
|
@@ -199,7 +199,7 @@ await render(
|
|
|
199
199
|
| `<Music>` | background music | `prompt`, `src`, `model`, `volume`, `loop`, `ducking` |
|
|
200
200
|
| `<Title>` | text overlay | `position`, `color`, `start`, `end` |
|
|
201
201
|
| `<Subtitle>` | subtitle text | `backgroundColor` |
|
|
202
|
-
| `<Captions>` | auto-generated subs | `src`, `srt`, `style`, `color`, `activeColor` |
|
|
202
|
+
| `<Captions>` | auto-generated subs | `src`, `srt`, `style`, `color`, `activeColor`, `withAudio` |
|
|
203
203
|
| `<Overlay>` | positioned layer | `left`, `top`, `width`, `height`, `keepAudio` |
|
|
204
204
|
| `<Split>` | side-by-side | `direction` |
|
|
205
205
|
| `<Slider>` | before/after reveal | `direction` |
|
|
@@ -282,7 +282,7 @@ await render(
|
|
|
282
282
|
/>
|
|
283
283
|
</Clip>
|
|
284
284
|
|
|
285
|
-
<Captions src={voiceover} style="tiktok" color="#ffffff" />
|
|
285
|
+
<Captions src={voiceover} style="tiktok" color="#ffffff" withAudio />
|
|
286
286
|
</Render>,
|
|
287
287
|
{ output: "output/talking-head.mp4" }
|
|
288
288
|
);
|
|
@@ -337,7 +337,7 @@ await render(
|
|
|
337
337
|
<Title position="top" color="#ffffff">My 3-Month Transformation</Title>
|
|
338
338
|
</Clip>
|
|
339
339
|
|
|
340
|
-
<Captions src={voiceover} style="tiktok" color="#ffffff" />
|
|
340
|
+
<Captions src={voiceover} style="tiktok" color="#ffffff" withAudio />
|
|
341
341
|
</Render>,
|
|
342
342
|
{ output: "output/transformation.mp4" }
|
|
343
343
|
);
|
package/docs/react.md
CHANGED
|
@@ -328,10 +328,13 @@ or feed it a speech element directly:
|
|
|
328
328
|
<Captions
|
|
329
329
|
src={ralph-speech}
|
|
330
330
|
style="tiktok"
|
|
331
|
+
withAudio
|
|
331
332
|
/>
|
|
332
333
|
</Clip>
|
|
333
334
|
```
|
|
334
335
|
|
|
336
|
+
> by default, `<Captions src={speech} />` renders captions only (no audio). add `withAudio` to include the speech audio track in the video.
|
|
337
|
+
|
|
335
338
|
### caption styles
|
|
336
339
|
|
|
337
340
|
```tsx
|
|
@@ -123,6 +123,11 @@ export default (
|
|
|
123
123
|
</Clip>
|
|
124
124
|
|
|
125
125
|
{/* TikTok-style captions with voiceover */}
|
|
126
|
-
<Captions
|
|
126
|
+
<Captions
|
|
127
|
+
src={voiceover}
|
|
128
|
+
style={CAPTIONS_STYLE}
|
|
129
|
+
color={CAPTIONS_COLOR}
|
|
130
|
+
withAudio
|
|
131
|
+
/>
|
|
127
132
|
</Render>
|
|
128
133
|
);
|
package/package.json
CHANGED
|
@@ -9,8 +9,17 @@ import {
|
|
|
9
9
|
type SpeechModelV3CallOptions,
|
|
10
10
|
} from "@ai-sdk/provider";
|
|
11
11
|
import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
|
|
12
|
+
import type { ElevenLabsCharacterAlignment } from "../../speech/types";
|
|
12
13
|
import type { MusicModelV3, MusicModelV3CallOptions } from "../music-model";
|
|
13
14
|
|
|
15
|
+
/**
|
|
16
|
+
* Curated name → voice_id mapping for backward-compatible friendly names.
|
|
17
|
+
* These are convenience aliases only — any valid ElevenLabs voice_id can be
|
|
18
|
+
* passed directly as the `voice` parameter and it will be forwarded as-is.
|
|
19
|
+
*
|
|
20
|
+
* For the full catalog of 600+ voices, use voice_id strings directly or
|
|
21
|
+
* call the gateway's GET /v1/voices endpoint to browse/search.
|
|
22
|
+
*/
|
|
14
23
|
const VOICES: Record<string, string> = {
|
|
15
24
|
rachel: "21m00Tcm4TlvDq8ikWAM",
|
|
16
25
|
domi: "AZnzlk1XvdvUeBnXmlld",
|
|
@@ -105,11 +114,11 @@ class ElevenLabsSpeechModel implements SpeechModelV3 {
|
|
|
105
114
|
readonly provider = "elevenlabs";
|
|
106
115
|
readonly modelId: string;
|
|
107
116
|
|
|
108
|
-
private
|
|
117
|
+
private apiKey: string;
|
|
109
118
|
|
|
110
|
-
constructor(modelId: string,
|
|
119
|
+
constructor(modelId: string, apiKey: string) {
|
|
111
120
|
this.modelId = modelId;
|
|
112
|
-
this.
|
|
121
|
+
this.apiKey = apiKey;
|
|
113
122
|
}
|
|
114
123
|
|
|
115
124
|
async doGenerate(options: SpeechModelV3CallOptions) {
|
|
@@ -127,31 +136,80 @@ class ElevenLabsSpeechModel implements SpeechModelV3 {
|
|
|
127
136
|
});
|
|
128
137
|
}
|
|
129
138
|
|
|
130
|
-
const elevenLabsOptions = providerOptions?.elevenlabs ?? {}
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
const
|
|
139
|
-
const
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
139
|
+
const elevenLabsOptions = (providerOptions?.elevenlabs ?? {}) as Record<
|
|
140
|
+
string,
|
|
141
|
+
unknown
|
|
142
|
+
>;
|
|
143
|
+
|
|
144
|
+
// Call the /with-timestamps endpoint via raw fetch.
|
|
145
|
+
// Returns JSON with base64 audio + character-level alignment.
|
|
146
|
+
const controller = new AbortController();
|
|
147
|
+
const timeoutMs = 120_000; // 2 minutes — generous for long-form TTS
|
|
148
|
+
const timer = setTimeout(() => controller.abort(), timeoutMs);
|
|
149
|
+
|
|
150
|
+
let response: Response;
|
|
151
|
+
try {
|
|
152
|
+
response = await fetch(
|
|
153
|
+
`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps?output_format=mp3_44100_128`,
|
|
154
|
+
{
|
|
155
|
+
method: "POST",
|
|
156
|
+
headers: {
|
|
157
|
+
"xi-api-key": this.apiKey,
|
|
158
|
+
"Content-Type": "application/json",
|
|
159
|
+
},
|
|
160
|
+
body: JSON.stringify({
|
|
161
|
+
text,
|
|
162
|
+
model_id: model,
|
|
163
|
+
...elevenLabsOptions,
|
|
164
|
+
}),
|
|
165
|
+
signal: controller.signal,
|
|
166
|
+
},
|
|
167
|
+
);
|
|
168
|
+
} catch (error) {
|
|
169
|
+
clearTimeout(timer);
|
|
170
|
+
if (error instanceof DOMException && error.name === "AbortError") {
|
|
171
|
+
throw new Error(
|
|
172
|
+
`ElevenLabs speech timed out after ${timeoutMs / 1000}s for voice ${voiceId}`,
|
|
173
|
+
);
|
|
174
|
+
}
|
|
175
|
+
throw error;
|
|
145
176
|
}
|
|
177
|
+
clearTimeout(timer);
|
|
146
178
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
offset += chunk.length;
|
|
179
|
+
if (!response.ok) {
|
|
180
|
+
const errorText = await response.text();
|
|
181
|
+
throw new Error(
|
|
182
|
+
`ElevenLabs speech with timestamps failed (${response.status}): ${errorText}`,
|
|
183
|
+
);
|
|
153
184
|
}
|
|
154
185
|
|
|
186
|
+
const json = (await response.json()) as {
|
|
187
|
+
audio_base64: string;
|
|
188
|
+
alignment?: ElevenLabsCharacterAlignment;
|
|
189
|
+
normalized_alignment?: ElevenLabsCharacterAlignment;
|
|
190
|
+
};
|
|
191
|
+
|
|
192
|
+
// Decode base64 audio to binary
|
|
193
|
+
const audioBytes = Buffer.from(json.audio_base64, "base64");
|
|
194
|
+
const result = new Uint8Array(
|
|
195
|
+
audioBytes.buffer,
|
|
196
|
+
audioBytes.byteOffset,
|
|
197
|
+
audioBytes.byteLength,
|
|
198
|
+
);
|
|
199
|
+
|
|
200
|
+
// Pack alignment data into providerMetadata so the AI SDK passes it through.
|
|
201
|
+
// biome-ignore lint/suspicious/noExplicitAny: JSON.parse returns any, matching JSONObject
|
|
202
|
+
const providerMetadata: Record<string, any> | undefined = json.alignment
|
|
203
|
+
? JSON.parse(
|
|
204
|
+
JSON.stringify({
|
|
205
|
+
elevenlabs: {
|
|
206
|
+
alignment: json.alignment,
|
|
207
|
+
normalizedAlignment: json.normalized_alignment,
|
|
208
|
+
},
|
|
209
|
+
}),
|
|
210
|
+
)
|
|
211
|
+
: undefined;
|
|
212
|
+
|
|
155
213
|
return {
|
|
156
214
|
audio: result,
|
|
157
215
|
warnings,
|
|
@@ -160,6 +218,7 @@ class ElevenLabsSpeechModel implements SpeechModelV3 {
|
|
|
160
218
|
modelId: this.modelId,
|
|
161
219
|
headers: undefined,
|
|
162
220
|
},
|
|
221
|
+
providerMetadata,
|
|
163
222
|
};
|
|
164
223
|
}
|
|
165
224
|
}
|
|
@@ -191,7 +250,7 @@ export function createElevenLabs(
|
|
|
191
250
|
return {
|
|
192
251
|
specificationVersion: "v3",
|
|
193
252
|
speechModel(modelId = ELEVENLABS_DEFAULTS.speechModel) {
|
|
194
|
-
return new ElevenLabsSpeechModel(modelId,
|
|
253
|
+
return new ElevenLabsSpeechModel(modelId, apiKey);
|
|
195
254
|
},
|
|
196
255
|
musicModel(modelId = ELEVENLABS_DEFAULTS.musicModel) {
|
|
197
256
|
return new ElevenLabsMusicModel(modelId, client);
|
|
@@ -226,13 +226,13 @@ export default (
|
|
|
226
226
|
<Clip duration={21}>
|
|
227
227
|
<Image src={character} />
|
|
228
228
|
</Clip>
|
|
229
|
-
<Captions src={voiceover} style="tiktok" color="#ffffff" activeColor="#FFD700" />
|
|
229
|
+
<Captions src={voiceover} style="tiktok" color="#ffffff" activeColor="#FFD700" withAudio />
|
|
230
230
|
</Render>
|
|
231
231
|
);
|
|
232
232
|
\`\`\`
|
|
233
233
|
This file can be both rendered directly (\`bunx vargai render file.tsx\`) and imported by other files (\`import { character } from "./file.tsx"\`).
|
|
234
234
|
|
|
235
|
-
2. **Captions
|
|
235
|
+
2. **Captions and audio** - \`<Captions src={voiceover} />\` renders captions only (no audio). Add \`withAudio\` to also play the speech audio: \`<Captions src={voiceover} withAudio />\`.
|
|
236
236
|
|
|
237
237
|
3. **Clip duration** - Omit \`duration\` to auto-fit content. Set explicit \`duration={N}\` to lock length. If duration is shorter than content, you get black screen while audio continues.
|
|
238
238
|
|
|
@@ -305,7 +305,7 @@ export default (
|
|
|
305
305
|
aspectRatio="9:16"
|
|
306
306
|
/>
|
|
307
307
|
</Clip>
|
|
308
|
-
<Captions src={voiceover} style="tiktok" color="#ffffff" activeColor="#FFD700" />
|
|
308
|
+
<Captions src={voiceover} style="tiktok" color="#ffffff" activeColor="#FFD700" withAudio />
|
|
309
309
|
</Render>
|
|
310
310
|
);
|
|
311
311
|
\`\`\`
|
|
@@ -21,7 +21,15 @@ export type VideoDurationString = z.infer<typeof videoDurationStringSchema>;
|
|
|
21
21
|
export const resolutionSchema = z.enum(["480p", "720p", "1080p"]);
|
|
22
22
|
export type Resolution = z.infer<typeof resolutionSchema>;
|
|
23
23
|
|
|
24
|
-
// ElevenLabs
|
|
24
|
+
// Voice parameter: accepts any voice name or ElevenLabs voice_id string.
|
|
25
|
+
// ElevenLabs has 1000+ voices — pass a voice_id directly for full catalog access.
|
|
26
|
+
// Common names ("rachel", "adam", etc.) are resolved to voice_ids automatically.
|
|
27
|
+
export const voiceSchema = z
|
|
28
|
+
.string()
|
|
29
|
+
.min(1, "Voice name or voice_id cannot be empty");
|
|
30
|
+
export type Voice = z.infer<typeof voiceSchema>;
|
|
31
|
+
|
|
32
|
+
/** @deprecated Use voiceSchema instead. Kept for backward compatibility. */
|
|
25
33
|
export const voiceNameSchema = z.enum([
|
|
26
34
|
"rachel",
|
|
27
35
|
"domi",
|
|
@@ -36,6 +44,21 @@ export const voiceNameSchema = z.enum([
|
|
|
36
44
|
]);
|
|
37
45
|
export type VoiceName = z.infer<typeof voiceNameSchema>;
|
|
38
46
|
|
|
47
|
+
// Well-known voice names for quick reference in skills/prompts.
|
|
48
|
+
// These are convenience aliases — any valid ElevenLabs voice_id also works.
|
|
49
|
+
export const WELL_KNOWN_VOICE_NAMES = [
|
|
50
|
+
"rachel",
|
|
51
|
+
"domi",
|
|
52
|
+
"sarah",
|
|
53
|
+
"bella",
|
|
54
|
+
"antoni",
|
|
55
|
+
"elli",
|
|
56
|
+
"josh",
|
|
57
|
+
"arnold",
|
|
58
|
+
"adam",
|
|
59
|
+
"sam",
|
|
60
|
+
] as const;
|
|
61
|
+
|
|
39
62
|
// Simplified voice set (commonly used in skills)
|
|
40
63
|
export const simpleVoiceSchema = z.enum(["rachel", "sam", "adam", "josh"]);
|
|
41
64
|
export type SimpleVoice = z.infer<typeof simpleVoiceSchema>;
|
|
@@ -4,15 +4,17 @@
|
|
|
4
4
|
*/
|
|
5
5
|
|
|
6
6
|
import { z } from "zod";
|
|
7
|
-
import { filePathSchema,
|
|
7
|
+
import { filePathSchema, voiceSchema } from "../../core/schema/shared";
|
|
8
8
|
import type { ActionDefinition, ZodSchema } from "../../core/schema/types";
|
|
9
9
|
import { elevenlabsProvider, VOICES } from "../../providers/elevenlabs";
|
|
10
10
|
import { storageProvider } from "../../providers/storage";
|
|
11
11
|
|
|
12
|
-
// Input schema with Zod
|
|
12
|
+
// Input schema with Zod — accepts any voice name or ElevenLabs voice_id
|
|
13
13
|
const voiceInputSchema = z.object({
|
|
14
14
|
text: z.string().describe("Text to convert to speech"),
|
|
15
|
-
voice:
|
|
15
|
+
voice: voiceSchema
|
|
16
|
+
.default("rachel")
|
|
17
|
+
.describe("Voice name or ElevenLabs voice_id"),
|
|
16
18
|
output: filePathSchema.optional().describe("Output file path"),
|
|
17
19
|
});
|
|
18
20
|
|
|
@@ -58,10 +60,11 @@ export interface VoiceResult {
|
|
|
58
60
|
uploadUrl?: string;
|
|
59
61
|
}
|
|
60
62
|
|
|
61
|
-
// Voice name to ID mapping
|
|
63
|
+
// Voice name to ID mapping. Unknown names pass through as voice_ids.
|
|
62
64
|
const VOICE_MAP: Record<string, string> = {
|
|
63
65
|
rachel: VOICES.RACHEL,
|
|
64
66
|
domi: VOICES.DOMI,
|
|
67
|
+
sarah: VOICES.SARAH,
|
|
65
68
|
bella: VOICES.BELLA,
|
|
66
69
|
antoni: VOICES.ANTONI,
|
|
67
70
|
elli: VOICES.ELLI,
|
|
@@ -186,7 +186,11 @@ export class ElevenLabsProvider extends BaseProvider {
|
|
|
186
186
|
}
|
|
187
187
|
}
|
|
188
188
|
|
|
189
|
-
|
|
189
|
+
/**
|
|
190
|
+
* Curated voice_id constants for common ElevenLabs voices.
|
|
191
|
+
* For the full catalog of 600+ voices, use voice_ids directly or
|
|
192
|
+
* call the gateway's GET /v1/voices endpoint to browse/search.
|
|
193
|
+
*/
|
|
190
194
|
export const VOICES = {
|
|
191
195
|
RACHEL: "21m00Tcm4TlvDq8ikWAM",
|
|
192
196
|
DOMI: "AZnzlk1XvdvUeBnXmlld",
|
|
@@ -190,6 +190,7 @@ describe("ResolvedElement in composition tree", () => {
|
|
|
190
190
|
const captions = Captions({
|
|
191
191
|
src: audio as unknown as VargElement<"speech">,
|
|
192
192
|
style: "tiktok",
|
|
193
|
+
withAudio: true,
|
|
193
194
|
});
|
|
194
195
|
|
|
195
196
|
expect(captions.type).toBe("captions");
|
|
@@ -458,6 +459,7 @@ describe("nested clips (container clip pattern)", () => {
|
|
|
458
459
|
Captions({
|
|
459
460
|
src: audio as unknown as VargElement<"speech">,
|
|
460
461
|
style: "tiktok",
|
|
462
|
+
withAudio: true,
|
|
461
463
|
}),
|
|
462
464
|
],
|
|
463
465
|
}),
|
|
@@ -585,6 +587,7 @@ describe("nested clips (container clip pattern)", () => {
|
|
|
585
587
|
Captions({
|
|
586
588
|
src: audio as unknown as VargElement<"speech">,
|
|
587
589
|
style: "tiktok",
|
|
590
|
+
withAudio: true,
|
|
588
591
|
}),
|
|
589
592
|
],
|
|
590
593
|
});
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Speech segments demo — single continuous voiceover.
|
|
3
|
+
*
|
|
4
|
+
* Scene 1: VEED lipsync talking head (segment 0 for lipsync, audio muted)
|
|
5
|
+
* Scene 2: image b-roll (no per-clip audio)
|
|
6
|
+
* Scene 3: VEED lipsync talking head (segment 2 for lipsync, audio muted)
|
|
7
|
+
*
|
|
8
|
+
* One full voiceover plays at the Render level — smooth, continuous audio
|
|
9
|
+
* with no splicing artifacts. VEED videos use keepAudio: false so the
|
|
10
|
+
* baked-in lipsync audio doesn't double up with the voiceover.
|
|
11
|
+
*
|
|
12
|
+
* Segments are only used for:
|
|
13
|
+
* - Feeding audio to VEED for lipsync generation
|
|
14
|
+
* - Setting clip durations from segment timing
|
|
15
|
+
*
|
|
16
|
+
* Run: bun run src/react/examples/async/speech-segments-voiceover.tsx
|
|
17
|
+
* Output: output/speech-segments-voiceover.mp4
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { elevenlabs } from "../../../ai-sdk/providers/elevenlabs";
|
|
21
|
+
import { fal } from "../../../ai-sdk/providers/fal";
|
|
22
|
+
import { Captions, Clip, Image, Render, render, Speech, Video } from "../..";
|
|
23
|
+
|
|
24
|
+
// --- One speech call, three segments ---
|
|
25
|
+
const { audio, segments } = await Speech({
|
|
26
|
+
model: elevenlabs.speechModel("eleven_v3"),
|
|
27
|
+
voice: "adam",
|
|
28
|
+
children: [
|
|
29
|
+
"Scientists always lied to you about bananas.",
|
|
30
|
+
"Bananas are normally dangerous, they can kill your gut health.",
|
|
31
|
+
'The actual issue is Banana bacteria called "alupios manurale" causing food poisoning symptoms.',
|
|
32
|
+
],
|
|
33
|
+
});
|
|
34
|
+
|
|
35
|
+
console.log(`Total duration: ${audio.duration.toFixed(2)}s`);
|
|
36
|
+
console.log(`Segments: ${segments.length}`);
|
|
37
|
+
for (const [i, seg] of segments.entries()) {
|
|
38
|
+
console.log(
|
|
39
|
+
` [${i}] ${seg.start.toFixed(2)}s -> ${seg.end.toFixed(2)}s (${seg.duration.toFixed(2)}s) "${seg.text}"`,
|
|
40
|
+
);
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// --- Portrait for the talking head ---
|
|
44
|
+
const portrait = Image({
|
|
45
|
+
prompt:
|
|
46
|
+
"Ultra-realistic studio portrait of a serious male scientist in his 40s, lab coat, glasses, concerned expression, dramatic lighting, dark background, documentary style",
|
|
47
|
+
model: fal.imageModel("nano-banana-pro"),
|
|
48
|
+
aspectRatio: "9:16",
|
|
49
|
+
});
|
|
50
|
+
|
|
51
|
+
// --- Scene 1: lipsync (segment audio for VEED, but muted in final video) ---
|
|
52
|
+
const talking1 = Video({
|
|
53
|
+
model: fal.videoModel("veed-fabric-1.0"),
|
|
54
|
+
keepAudio: false, // muted — full voiceover handles audio
|
|
55
|
+
prompt: { images: [portrait], audio: segments[0] },
|
|
56
|
+
providerOptions: { fal: { resolution: "720p" } },
|
|
57
|
+
});
|
|
58
|
+
|
|
59
|
+
// --- Scene 3: lipsync (segment audio for VEED, but muted in final video) ---
|
|
60
|
+
const talking3 = Video({
|
|
61
|
+
model: fal.videoModel("veed-fabric-1.0"),
|
|
62
|
+
keepAudio: false, // muted — full voiceover handles audio
|
|
63
|
+
prompt: { images: [portrait], audio: segments[2] },
|
|
64
|
+
providerOptions: { fal: { resolution: "720p" } },
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
const demo = (
|
|
68
|
+
<Render width={1080} height={1920}>
|
|
69
|
+
{/* Scene 1: talking head */}
|
|
70
|
+
<Clip duration={segments[0]!.duration}>{talking1}</Clip>
|
|
71
|
+
|
|
72
|
+
{/* Scene 2: banana b-roll (no per-clip audio — voiceover covers it) */}
|
|
73
|
+
<Clip duration={segments[1]!.duration}>
|
|
74
|
+
<Image
|
|
75
|
+
prompt="macro shot of a dangerous banana with dramatic dark lighting, bacteria visualization, medical documentary style, gut health danger concept"
|
|
76
|
+
model={fal.imageModel("nano-banana-pro")}
|
|
77
|
+
aspectRatio="9:16"
|
|
78
|
+
zoom="in"
|
|
79
|
+
/>
|
|
80
|
+
</Clip>
|
|
81
|
+
|
|
82
|
+
{/* Scene 3: talking head */}
|
|
83
|
+
<Clip duration={segments[2]!.duration}>{talking3}</Clip>
|
|
84
|
+
|
|
85
|
+
{/* Full continuous voiceover — smooth, no splicing */}
|
|
86
|
+
{audio}
|
|
87
|
+
|
|
88
|
+
{/* Captions from the voiceover — no withAudio since audio is already included above */}
|
|
89
|
+
<Captions src={audio} style="tiktok" />
|
|
90
|
+
</Render>
|
|
91
|
+
);
|
|
92
|
+
|
|
93
|
+
async function main() {
|
|
94
|
+
if (!process.env.FAL_API_KEY && !process.env.FAL_KEY) {
|
|
95
|
+
console.error("ERROR: FAL_API_KEY/FAL_KEY not found in environment");
|
|
96
|
+
process.exit(1);
|
|
97
|
+
}
|
|
98
|
+
if (!process.env.ELEVENLABS_API_KEY) {
|
|
99
|
+
console.error("ERROR: ELEVENLABS_API_KEY not found in environment");
|
|
100
|
+
process.exit(1);
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
const result = await render(demo, {
|
|
104
|
+
output: "output/speech-segments-voiceover.mp4",
|
|
105
|
+
cache: ".cache/ai-speech-segments-voiceover",
|
|
106
|
+
});
|
|
107
|
+
|
|
108
|
+
console.log(
|
|
109
|
+
`Done: output/speech-segments-voiceover.mp4 (${(result.video.byteLength / 1024 / 1024).toFixed(2)} MB)`,
|
|
110
|
+
);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
main().catch((err) => {
|
|
114
|
+
console.error(err);
|
|
115
|
+
process.exit(1);
|
|
116
|
+
});
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Speech segments demo — per-clip audio, hard cuts between scenes.
|
|
3
|
+
*
|
|
4
|
+
* Scene 1: VEED lipsync talking head (segment 0, keepAudio)
|
|
5
|
+
* Scene 2: image b-roll + voiceover (segment 1 as clip child)
|
|
6
|
+
* Scene 3: VEED lipsync talking head (segment 2, keepAudio)
|
|
7
|
+
*
|
|
8
|
+
* Each clip carries its own segment audio. Hard cuts between scenes —
|
|
9
|
+
* no crossfade transitions, so audio from adjacent clips never overlaps.
|
|
10
|
+
* This is the cleanest approach for per-clip audio.
|
|
11
|
+
*
|
|
12
|
+
* For smooth audio with crossfade transitions, use the single-voiceover
|
|
13
|
+
* pattern instead (see speech-segments-voiceover.tsx).
|
|
14
|
+
*
|
|
15
|
+
* Run: bun run src/react/examples/async/speech-segments.tsx
|
|
16
|
+
* Output: output/speech-segments.mp4
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import { elevenlabs } from "../../../ai-sdk/providers/elevenlabs";
|
|
20
|
+
import { fal } from "../../../ai-sdk/providers/fal";
|
|
21
|
+
import { Clip, Image, Render, render, Speech, Video } from "../..";
|
|
22
|
+
|
|
23
|
+
// --- One speech call, three segments ---
|
|
24
|
+
const { segments } = await Speech({
|
|
25
|
+
model: elevenlabs.speechModel("eleven_v3"),
|
|
26
|
+
voice: "adam",
|
|
27
|
+
children: [
|
|
28
|
+
"Scientists always lied to you about bananas.",
|
|
29
|
+
"Bananas are normally dangerous, they can kill your gut health.",
|
|
30
|
+
'The actual issue is Banana bacteria called "alupios manurale" causing food poisoning symptoms.',
|
|
31
|
+
],
|
|
32
|
+
});
|
|
33
|
+
|
|
34
|
+
console.log(`Segments: ${segments.length}`);
|
|
35
|
+
for (const [i, seg] of segments.entries()) {
|
|
36
|
+
console.log(
|
|
37
|
+
` [${i}] ${seg.start.toFixed(2)}s -> ${seg.end.toFixed(2)}s (${seg.duration.toFixed(2)}s) "${seg.text}"`,
|
|
38
|
+
);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// --- Portrait for the talking head ---
|
|
42
|
+
const portrait = Image({
|
|
43
|
+
prompt:
|
|
44
|
+
"Ultra-realistic studio portrait of a serious male scientist in his 40s, lab coat, glasses, concerned expression, dramatic lighting, dark background, documentary style",
|
|
45
|
+
model: fal.imageModel("nano-banana-pro"),
|
|
46
|
+
aspectRatio: "9:16",
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
// --- Scene 1: lipsync talking head ---
|
|
50
|
+
const talking1 = Video({
|
|
51
|
+
model: fal.videoModel("veed-fabric-1.0"),
|
|
52
|
+
keepAudio: true,
|
|
53
|
+
prompt: { images: [portrait], audio: segments[0] },
|
|
54
|
+
providerOptions: { fal: { resolution: "720p" } },
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
// --- Scene 3: lipsync talking head ---
|
|
58
|
+
const talking3 = Video({
|
|
59
|
+
model: fal.videoModel("veed-fabric-1.0"),
|
|
60
|
+
keepAudio: true,
|
|
61
|
+
prompt: { images: [portrait], audio: segments[2] },
|
|
62
|
+
providerOptions: { fal: { resolution: "720p" } },
|
|
63
|
+
});
|
|
64
|
+
|
|
65
|
+
const demo = (
|
|
66
|
+
<Render width={1080} height={1920}>
|
|
67
|
+
{/* Scene 1: talking head */}
|
|
68
|
+
<Clip duration={segments[0]!.duration}>{talking1}</Clip>
|
|
69
|
+
|
|
70
|
+
{/* Scene 2: b-roll + segment voiceover */}
|
|
71
|
+
<Clip duration={segments[1]!.duration}>
|
|
72
|
+
<Image
|
|
73
|
+
prompt="macro shot of a dangerous banana with dramatic dark lighting, bacteria visualization, medical documentary style, gut health danger concept"
|
|
74
|
+
model={fal.imageModel("nano-banana-pro")}
|
|
75
|
+
aspectRatio="9:16"
|
|
76
|
+
zoom="in"
|
|
77
|
+
/>
|
|
78
|
+
{segments[1]}
|
|
79
|
+
</Clip>
|
|
80
|
+
|
|
81
|
+
{/* Scene 3: talking head */}
|
|
82
|
+
<Clip duration={segments[2]!.duration}>{talking3}</Clip>
|
|
83
|
+
</Render>
|
|
84
|
+
);
|
|
85
|
+
|
|
86
|
+
async function main() {
|
|
87
|
+
if (!process.env.FAL_API_KEY && !process.env.FAL_KEY) {
|
|
88
|
+
console.error("ERROR: FAL_API_KEY/FAL_KEY not found in environment");
|
|
89
|
+
process.exit(1);
|
|
90
|
+
}
|
|
91
|
+
if (!process.env.ELEVENLABS_API_KEY) {
|
|
92
|
+
console.error("ERROR: ELEVENLABS_API_KEY not found in environment");
|
|
93
|
+
process.exit(1);
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
const result = await render(demo, {
|
|
97
|
+
output: "output/speech-segments.mp4",
|
|
98
|
+
cache: ".cache/ai-speech-segments",
|
|
99
|
+
});
|
|
100
|
+
|
|
101
|
+
console.log(
|
|
102
|
+
`Done: output/speech-segments.mp4 (${(result.video.byteLength / 1024 / 1024).toFixed(2)} MB)`,
|
|
103
|
+
);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
main().catch((err) => {
|
|
107
|
+
console.error(err);
|
|
108
|
+
process.exit(1);
|
|
109
|
+
});
|
|
@@ -37,7 +37,7 @@ export default (
|
|
|
37
37
|
{/* Scene 1: talking head — lipsync via VEED, audio baked in */}
|
|
38
38
|
<Clip duration={audio1.duration}>
|
|
39
39
|
{talkingHead}
|
|
40
|
-
<Captions src={audio1} style="tiktok" />
|
|
40
|
+
<Captions src={audio1} style="tiktok" withAudio />
|
|
41
41
|
</Clip>
|
|
42
42
|
|
|
43
43
|
{/* Scene 2: science b-roll — image + voiceover via captions */}
|
|
@@ -48,7 +48,7 @@ export default (
|
|
|
48
48
|
aspectRatio="9:16"
|
|
49
49
|
zoom="out"
|
|
50
50
|
/>
|
|
51
|
-
<Captions src={audio2} style="tiktok" />
|
|
51
|
+
<Captions src={audio2} style="tiktok" withAudio />
|
|
52
52
|
</Clip>
|
|
53
53
|
</Render>
|
|
54
54
|
);
|
package/src/react/index.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
export type { CacheStorage } from "../ai-sdk/cache";
|
|
2
2
|
export { File } from "../ai-sdk/file";
|
|
3
3
|
export type { SizeValue } from "../ai-sdk/providers/editly/types";
|
|
4
|
+
export type { Segment, WordTiming } from "../speech/types";
|
|
4
5
|
export { assets } from "./assets";
|
|
5
6
|
export {
|
|
6
7
|
Captions,
|