vargai 0.4.0-alpha73 → 0.4.0-alpha74

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -71,7 +71,7 @@
71
71
  "zod": "^4.2.1"
72
72
  },
73
73
  "sideEffects": false,
74
- "version": "0.4.0-alpha73",
74
+ "version": "0.4.0-alpha74",
75
75
  "exports": {
76
76
  ".": "./src/index.ts",
77
77
  "./ai": "./src/ai-sdk/index.ts",
@@ -9,8 +9,17 @@ import {
9
9
  type SpeechModelV3CallOptions,
10
10
  } from "@ai-sdk/provider";
11
11
  import { ElevenLabsClient } from "@elevenlabs/elevenlabs-js";
12
+ import type { ElevenLabsCharacterAlignment } from "../../speech/types";
12
13
  import type { MusicModelV3, MusicModelV3CallOptions } from "../music-model";
13
14
 
15
+ /**
16
+ * Curated name → voice_id mapping for backward-compatible friendly names.
17
+ * These are convenience aliases only — any valid ElevenLabs voice_id can be
18
+ * passed directly as the `voice` parameter and it will be forwarded as-is.
19
+ *
20
+ * For the full catalog of 600+ voices, use voice_id strings directly or
21
+ * call the gateway's GET /v1/voices endpoint to browse/search.
22
+ */
14
23
  const VOICES: Record<string, string> = {
15
24
  rachel: "21m00Tcm4TlvDq8ikWAM",
16
25
  domi: "AZnzlk1XvdvUeBnXmlld",
@@ -105,11 +114,11 @@ class ElevenLabsSpeechModel implements SpeechModelV3 {
105
114
  readonly provider = "elevenlabs";
106
115
  readonly modelId: string;
107
116
 
108
- private client: ElevenLabsClient;
117
+ private apiKey: string;
109
118
 
110
- constructor(modelId: string, client: ElevenLabsClient) {
119
+ constructor(modelId: string, apiKey: string) {
111
120
  this.modelId = modelId;
112
- this.client = client;
121
+ this.apiKey = apiKey;
113
122
  }
114
123
 
115
124
  async doGenerate(options: SpeechModelV3CallOptions) {
@@ -127,31 +136,80 @@ class ElevenLabsSpeechModel implements SpeechModelV3 {
127
136
  });
128
137
  }
129
138
 
130
- const elevenLabsOptions = providerOptions?.elevenlabs ?? {};
131
- const audio = await this.client.textToSpeech.convert(voiceId, {
132
- text,
133
- modelId: model,
134
- outputFormat: "mp3_44100_128",
135
- ...elevenLabsOptions,
136
- } as Parameters<typeof this.client.textToSpeech.convert>[1]);
137
-
138
- const reader = audio.getReader();
139
- const chunks: Uint8Array[] = [];
140
-
141
- while (true) {
142
- const { done, value } = await reader.read();
143
- if (done) break;
144
- chunks.push(value);
139
+ const elevenLabsOptions = (providerOptions?.elevenlabs ?? {}) as Record<
140
+ string,
141
+ unknown
142
+ >;
143
+
144
+ // Call the /with-timestamps endpoint via raw fetch.
145
+ // Returns JSON with base64 audio + character-level alignment.
146
+ const controller = new AbortController();
147
+ const timeoutMs = 120_000; // 2 minutes — generous for long-form TTS
148
+ const timer = setTimeout(() => controller.abort(), timeoutMs);
149
+
150
+ let response: Response;
151
+ try {
152
+ response = await fetch(
153
+ `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps?output_format=mp3_44100_128`,
154
+ {
155
+ method: "POST",
156
+ headers: {
157
+ "xi-api-key": this.apiKey,
158
+ "Content-Type": "application/json",
159
+ },
160
+ body: JSON.stringify({
161
+ text,
162
+ model_id: model,
163
+ ...elevenLabsOptions,
164
+ }),
165
+ signal: controller.signal,
166
+ },
167
+ );
168
+ } catch (error) {
169
+ clearTimeout(timer);
170
+ if (error instanceof DOMException && error.name === "AbortError") {
171
+ throw new Error(
172
+ `ElevenLabs speech timed out after ${timeoutMs / 1000}s for voice ${voiceId}`,
173
+ );
174
+ }
175
+ throw error;
145
176
  }
177
+ clearTimeout(timer);
146
178
 
147
- const totalLength = chunks.reduce((acc, c) => acc + c.length, 0);
148
- const result = new Uint8Array(totalLength);
149
- let offset = 0;
150
- for (const chunk of chunks) {
151
- result.set(chunk, offset);
152
- offset += chunk.length;
179
+ if (!response.ok) {
180
+ const errorText = await response.text();
181
+ throw new Error(
182
+ `ElevenLabs speech with timestamps failed (${response.status}): ${errorText}`,
183
+ );
153
184
  }
154
185
 
186
+ const json = (await response.json()) as {
187
+ audio_base64: string;
188
+ alignment?: ElevenLabsCharacterAlignment;
189
+ normalized_alignment?: ElevenLabsCharacterAlignment;
190
+ };
191
+
192
+ // Decode base64 audio to binary
193
+ const audioBytes = Buffer.from(json.audio_base64, "base64");
194
+ const result = new Uint8Array(
195
+ audioBytes.buffer,
196
+ audioBytes.byteOffset,
197
+ audioBytes.byteLength,
198
+ );
199
+
200
+ // Pack alignment data into providerMetadata so the AI SDK passes it through.
201
+ // biome-ignore lint/suspicious/noExplicitAny: JSON.parse returns any, matching JSONObject
202
+ const providerMetadata: Record<string, any> | undefined = json.alignment
203
+ ? JSON.parse(
204
+ JSON.stringify({
205
+ elevenlabs: {
206
+ alignment: json.alignment,
207
+ normalizedAlignment: json.normalized_alignment,
208
+ },
209
+ }),
210
+ )
211
+ : undefined;
212
+
155
213
  return {
156
214
  audio: result,
157
215
  warnings,
@@ -160,6 +218,7 @@ class ElevenLabsSpeechModel implements SpeechModelV3 {
160
218
  modelId: this.modelId,
161
219
  headers: undefined,
162
220
  },
221
+ providerMetadata,
163
222
  };
164
223
  }
165
224
  }
@@ -191,7 +250,7 @@ export function createElevenLabs(
191
250
  return {
192
251
  specificationVersion: "v3",
193
252
  speechModel(modelId = ELEVENLABS_DEFAULTS.speechModel) {
194
- return new ElevenLabsSpeechModel(modelId, client);
253
+ return new ElevenLabsSpeechModel(modelId, apiKey);
195
254
  },
196
255
  musicModel(modelId = ELEVENLABS_DEFAULTS.musicModel) {
197
256
  return new ElevenLabsMusicModel(modelId, client);
@@ -21,7 +21,15 @@ export type VideoDurationString = z.infer<typeof videoDurationStringSchema>;
21
21
  export const resolutionSchema = z.enum(["480p", "720p", "1080p"]);
22
22
  export type Resolution = z.infer<typeof resolutionSchema>;
23
23
 
24
- // ElevenLabs preset voices
24
+ // Voice parameter: accepts any voice name or ElevenLabs voice_id string.
25
+ // ElevenLabs has 1000+ voices — pass a voice_id directly for full catalog access.
26
+ // Common names ("rachel", "adam", etc.) are resolved to voice_ids automatically.
27
+ export const voiceSchema = z
28
+ .string()
29
+ .min(1, "Voice name or voice_id cannot be empty");
30
+ export type Voice = z.infer<typeof voiceSchema>;
31
+
32
+ /** @deprecated Use voiceSchema instead. Kept for backward compatibility. */
25
33
  export const voiceNameSchema = z.enum([
26
34
  "rachel",
27
35
  "domi",
@@ -36,6 +44,21 @@ export const voiceNameSchema = z.enum([
36
44
  ]);
37
45
  export type VoiceName = z.infer<typeof voiceNameSchema>;
38
46
 
47
+ // Well-known voice names for quick reference in skills/prompts.
48
+ // These are convenience aliases — any valid ElevenLabs voice_id also works.
49
+ export const WELL_KNOWN_VOICE_NAMES = [
50
+ "rachel",
51
+ "domi",
52
+ "sarah",
53
+ "bella",
54
+ "antoni",
55
+ "elli",
56
+ "josh",
57
+ "arnold",
58
+ "adam",
59
+ "sam",
60
+ ] as const;
61
+
39
62
  // Simplified voice set (commonly used in skills)
40
63
  export const simpleVoiceSchema = z.enum(["rachel", "sam", "adam", "josh"]);
41
64
  export type SimpleVoice = z.infer<typeof simpleVoiceSchema>;
@@ -4,15 +4,17 @@
4
4
  */
5
5
 
6
6
  import { z } from "zod";
7
- import { filePathSchema, voiceNameSchema } from "../../core/schema/shared";
7
+ import { filePathSchema, voiceSchema } from "../../core/schema/shared";
8
8
  import type { ActionDefinition, ZodSchema } from "../../core/schema/types";
9
9
  import { elevenlabsProvider, VOICES } from "../../providers/elevenlabs";
10
10
  import { storageProvider } from "../../providers/storage";
11
11
 
12
- // Input schema with Zod
12
+ // Input schema with Zod — accepts any voice name or ElevenLabs voice_id
13
13
  const voiceInputSchema = z.object({
14
14
  text: z.string().describe("Text to convert to speech"),
15
- voice: voiceNameSchema.default("rachel").describe("Voice to use"),
15
+ voice: voiceSchema
16
+ .default("rachel")
17
+ .describe("Voice name or ElevenLabs voice_id"),
16
18
  output: filePathSchema.optional().describe("Output file path"),
17
19
  });
18
20
 
@@ -58,10 +60,11 @@ export interface VoiceResult {
58
60
  uploadUrl?: string;
59
61
  }
60
62
 
61
- // Voice name to ID mapping
63
+ // Voice name to ID mapping. Unknown names pass through as voice_ids.
62
64
  const VOICE_MAP: Record<string, string> = {
63
65
  rachel: VOICES.RACHEL,
64
66
  domi: VOICES.DOMI,
67
+ sarah: VOICES.SARAH,
65
68
  bella: VOICES.BELLA,
66
69
  antoni: VOICES.ANTONI,
67
70
  elli: VOICES.ELLI,
@@ -186,7 +186,11 @@ export class ElevenLabsProvider extends BaseProvider {
186
186
  }
187
187
  }
188
188
 
189
- // Popular voices
189
+ /**
190
+ * Curated voice_id constants for common ElevenLabs voices.
191
+ * For the full catalog of 600+ voices, use voice_ids directly or
192
+ * call the gateway's GET /v1/voices endpoint to browse/search.
193
+ */
190
194
  export const VOICES = {
191
195
  RACHEL: "21m00Tcm4TlvDq8ikWAM",
192
196
  DOMI: "AZnzlk1XvdvUeBnXmlld",
@@ -0,0 +1,113 @@
1
+ /**
2
+ * Speech segments demo — single continuous voiceover.
3
+ *
4
+ * Scene 1: VEED lipsync talking head (segment 0 for lipsync, audio muted)
5
+ * Scene 2: image b-roll (no per-clip audio)
6
+ * Scene 3: VEED lipsync talking head (segment 2 for lipsync, audio muted)
7
+ *
8
+ * One full voiceover plays at the Render level — smooth, continuous audio
9
+ * with no splicing artifacts. VEED videos use keepAudio: false so the
10
+ * baked-in lipsync audio doesn't double up with the voiceover.
11
+ *
12
+ * Segments are only used for:
13
+ * - Feeding audio to VEED for lipsync generation
14
+ * - Setting clip durations from segment timing
15
+ *
16
+ * Run: bun run src/react/examples/async/speech-segments-voiceover.tsx
17
+ * Output: output/speech-segments-voiceover.mp4
18
+ */
19
+
20
+ import { elevenlabs } from "../../../ai-sdk/providers/elevenlabs";
21
+ import { fal } from "../../../ai-sdk/providers/fal";
22
+ import { Clip, Image, Render, render, Speech, Video } from "../..";
23
+
24
+ // --- One speech call, three segments ---
25
+ const { audio, segments } = await Speech({
26
+ model: elevenlabs.speechModel("eleven_v3"),
27
+ voice: "adam",
28
+ children: [
29
+ "Scientists always lied to you about bananas.",
30
+ "Bananas are normally dangerous, they can kill your gut health.",
31
+ 'The actual issue is Banana bacteria called "alupios manurale" causing food poisoning symptoms.',
32
+ ],
33
+ });
34
+
35
+ console.log(`Total duration: ${audio.duration.toFixed(2)}s`);
36
+ console.log(`Segments: ${segments.length}`);
37
+ for (const [i, seg] of segments.entries()) {
38
+ console.log(
39
+ ` [${i}] ${seg.start.toFixed(2)}s -> ${seg.end.toFixed(2)}s (${seg.duration.toFixed(2)}s) "${seg.text}"`,
40
+ );
41
+ }
42
+
43
+ // --- Portrait for the talking head ---
44
+ const portrait = Image({
45
+ prompt:
46
+ "Ultra-realistic studio portrait of a serious male scientist in his 40s, lab coat, glasses, concerned expression, dramatic lighting, dark background, documentary style",
47
+ model: fal.imageModel("nano-banana-pro"),
48
+ aspectRatio: "9:16",
49
+ });
50
+
51
+ // --- Scene 1: lipsync (segment audio for VEED, but muted in final video) ---
52
+ const talking1 = Video({
53
+ model: fal.videoModel("veed-fabric-1.0"),
54
+ keepAudio: false, // muted — full voiceover handles audio
55
+ prompt: { images: [portrait], audio: segments[0] },
56
+ providerOptions: { fal: { resolution: "720p" } },
57
+ });
58
+
59
+ // --- Scene 3: lipsync (segment audio for VEED, but muted in final video) ---
60
+ const talking3 = Video({
61
+ model: fal.videoModel("veed-fabric-1.0"),
62
+ keepAudio: false, // muted — full voiceover handles audio
63
+ prompt: { images: [portrait], audio: segments[2] },
64
+ providerOptions: { fal: { resolution: "720p" } },
65
+ });
66
+
67
+ const demo = (
68
+ <Render width={1080} height={1920}>
69
+ {/* Scene 1: talking head */}
70
+ <Clip duration={segments[0]!.duration}>{talking1}</Clip>
71
+
72
+ {/* Scene 2: banana b-roll (no per-clip audio — voiceover covers it) */}
73
+ <Clip duration={segments[1]!.duration}>
74
+ <Image
75
+ prompt="macro shot of a dangerous banana with dramatic dark lighting, bacteria visualization, medical documentary style, gut health danger concept"
76
+ model={fal.imageModel("nano-banana-pro")}
77
+ aspectRatio="9:16"
78
+ zoom="in"
79
+ />
80
+ </Clip>
81
+
82
+ {/* Scene 3: talking head */}
83
+ <Clip duration={segments[2]!.duration}>{talking3}</Clip>
84
+
85
+ {/* Full continuous voiceover — smooth, no splicing */}
86
+ {audio}
87
+ </Render>
88
+ );
89
+
90
+ async function main() {
91
+ if (!process.env.FAL_API_KEY && !process.env.FAL_KEY) {
92
+ console.error("ERROR: FAL_API_KEY/FAL_KEY not found in environment");
93
+ process.exit(1);
94
+ }
95
+ if (!process.env.ELEVENLABS_API_KEY) {
96
+ console.error("ERROR: ELEVENLABS_API_KEY not found in environment");
97
+ process.exit(1);
98
+ }
99
+
100
+ const result = await render(demo, {
101
+ output: "output/speech-segments-voiceover.mp4",
102
+ cache: ".cache/ai-speech-segments-voiceover",
103
+ });
104
+
105
+ console.log(
106
+ `Done: output/speech-segments-voiceover.mp4 (${(result.video.byteLength / 1024 / 1024).toFixed(2)} MB)`,
107
+ );
108
+ }
109
+
110
+ main().catch((err) => {
111
+ console.error(err);
112
+ process.exit(1);
113
+ });
@@ -0,0 +1,109 @@
1
+ /**
2
+ * Speech segments demo — per-clip audio, hard cuts between scenes.
3
+ *
4
+ * Scene 1: VEED lipsync talking head (segment 0, keepAudio)
5
+ * Scene 2: image b-roll + voiceover (segment 1 as clip child)
6
+ * Scene 3: VEED lipsync talking head (segment 2, keepAudio)
7
+ *
8
+ * Each clip carries its own segment audio. Hard cuts between scenes —
9
+ * no crossfade transitions, so audio from adjacent clips never overlaps.
10
+ * This is the cleanest approach for per-clip audio.
11
+ *
12
+ * For smooth audio with crossfade transitions, use the single-voiceover
13
+ * pattern instead (see speech-segments-voiceover.tsx).
14
+ *
15
+ * Run: bun run src/react/examples/async/speech-segments.tsx
16
+ * Output: output/speech-segments.mp4
17
+ */
18
+
19
+ import { elevenlabs } from "../../../ai-sdk/providers/elevenlabs";
20
+ import { fal } from "../../../ai-sdk/providers/fal";
21
+ import { Clip, Image, Render, render, Speech, Video } from "../..";
22
+
23
+ // --- One speech call, three segments ---
24
+ const { segments } = await Speech({
25
+ model: elevenlabs.speechModel("eleven_v3"),
26
+ voice: "adam",
27
+ children: [
28
+ "Scientists always lied to you about bananas.",
29
+ "Bananas are normally dangerous, they can kill your gut health.",
30
+ 'The actual issue is Banana bacteria called "alupios manurale" causing food poisoning symptoms.',
31
+ ],
32
+ });
33
+
34
+ console.log(`Segments: ${segments.length}`);
35
+ for (const [i, seg] of segments.entries()) {
36
+ console.log(
37
+ ` [${i}] ${seg.start.toFixed(2)}s -> ${seg.end.toFixed(2)}s (${seg.duration.toFixed(2)}s) "${seg.text}"`,
38
+ );
39
+ }
40
+
41
+ // --- Portrait for the talking head ---
42
+ const portrait = Image({
43
+ prompt:
44
+ "Ultra-realistic studio portrait of a serious male scientist in his 40s, lab coat, glasses, concerned expression, dramatic lighting, dark background, documentary style",
45
+ model: fal.imageModel("nano-banana-pro"),
46
+ aspectRatio: "9:16",
47
+ });
48
+
49
+ // --- Scene 1: lipsync talking head ---
50
+ const talking1 = Video({
51
+ model: fal.videoModel("veed-fabric-1.0"),
52
+ keepAudio: true,
53
+ prompt: { images: [portrait], audio: segments[0] },
54
+ providerOptions: { fal: { resolution: "720p" } },
55
+ });
56
+
57
+ // --- Scene 3: lipsync talking head ---
58
+ const talking3 = Video({
59
+ model: fal.videoModel("veed-fabric-1.0"),
60
+ keepAudio: true,
61
+ prompt: { images: [portrait], audio: segments[2] },
62
+ providerOptions: { fal: { resolution: "720p" } },
63
+ });
64
+
65
+ const demo = (
66
+ <Render width={1080} height={1920}>
67
+ {/* Scene 1: talking head */}
68
+ <Clip duration={segments[0]!.duration}>{talking1}</Clip>
69
+
70
+ {/* Scene 2: b-roll + segment voiceover */}
71
+ <Clip duration={segments[1]!.duration}>
72
+ <Image
73
+ prompt="macro shot of a dangerous banana with dramatic dark lighting, bacteria visualization, medical documentary style, gut health danger concept"
74
+ model={fal.imageModel("nano-banana-pro")}
75
+ aspectRatio="9:16"
76
+ zoom="in"
77
+ />
78
+ {segments[1]}
79
+ </Clip>
80
+
81
+ {/* Scene 3: talking head */}
82
+ <Clip duration={segments[2]!.duration}>{talking3}</Clip>
83
+ </Render>
84
+ );
85
+
86
+ async function main() {
87
+ if (!process.env.FAL_API_KEY && !process.env.FAL_KEY) {
88
+ console.error("ERROR: FAL_API_KEY/FAL_KEY not found in environment");
89
+ process.exit(1);
90
+ }
91
+ if (!process.env.ELEVENLABS_API_KEY) {
92
+ console.error("ERROR: ELEVENLABS_API_KEY not found in environment");
93
+ process.exit(1);
94
+ }
95
+
96
+ const result = await render(demo, {
97
+ output: "output/speech-segments.mp4",
98
+ cache: ".cache/ai-speech-segments",
99
+ });
100
+
101
+ console.log(
102
+ `Done: output/speech-segments.mp4 (${(result.video.byteLength / 1024 / 1024).toFixed(2)} MB)`,
103
+ );
104
+ }
105
+
106
+ main().catch((err) => {
107
+ console.error(err);
108
+ process.exit(1);
109
+ });
@@ -1,6 +1,7 @@
1
1
  export type { CacheStorage } from "../ai-sdk/cache";
2
2
  export { File } from "../ai-sdk/file";
3
3
  export type { SizeValue } from "../ai-sdk/providers/editly/types";
4
+ export type { Segment, WordTiming } from "../speech/types";
4
5
  export { assets } from "./assets";
5
6
  export {
6
7
  Captions,
@@ -251,39 +251,53 @@ export async function renderCaptions(
251
251
  : await renderSpeech(props.src, ctx);
252
252
  audioPath = await ctx.backend.resolvePath(speechFile);
253
253
 
254
- const transcribeTaskId = ctx.progress
255
- ? addTask(ctx.progress, "transcribe", "groq-whisper")
256
- : null;
257
- if (transcribeTaskId && ctx.progress)
258
- startTask(ctx.progress, transcribeTaskId);
259
-
260
- const audioData =
261
- audioPath.startsWith("http://") || audioPath.startsWith("https://")
262
- ? await fetch(audioPath).then((res) => res.arrayBuffer())
263
- : await Bun.file(audioPath).arrayBuffer();
264
-
265
- const result = await transcribe({
266
- model: groq.transcription("whisper-large-v3"),
267
- audio: new Uint8Array(audioData),
268
- providerOptions: {
269
- groq: {
270
- responseFormat: "verbose_json",
271
- timestampGranularities: ["word"],
272
- },
273
- },
274
- });
275
-
276
- if (transcribeTaskId && ctx.progress)
277
- completeTask(ctx.progress, transcribeTaskId);
278
-
279
- const rawBody = (result.responses[0] as { body?: unknown })?.body;
280
- const parsed = groqResponseSchema.safeParse(rawBody);
281
- const words = parsed.success ? parsed.data.words : undefined;
282
-
283
- if (!words || words.length === 0) {
284
- srtContent = `1\n00:00:00,000 --> 00:00:05,000\n${result.text}\n`;
254
+ // Check if the speech element already has word-level timing from ElevenLabs.
255
+ // If so, skip the Whisper transcription step entirely (saves time and cost).
256
+ const nativeWords =
257
+ props.src instanceof ResolvedElement ? props.src.meta.words : undefined;
258
+
259
+ if (nativeWords && nativeWords.length > 0) {
260
+ // Use native ElevenLabs word timing — same shape as GroqWord
261
+ srtContent = convertToSRT(nativeWords);
285
262
  } else {
286
- srtContent = convertToSRT(words);
263
+ // Fallback: transcribe with Groq Whisper to get word-level timestamps
264
+ const transcribeTaskId = ctx.progress
265
+ ? addTask(ctx.progress, "transcribe", "groq-whisper")
266
+ : null;
267
+ if (transcribeTaskId && ctx.progress)
268
+ startTask(ctx.progress, transcribeTaskId);
269
+
270
+ let result: Awaited<ReturnType<typeof transcribe>>;
271
+ try {
272
+ const audioData =
273
+ audioPath.startsWith("http://") || audioPath.startsWith("https://")
274
+ ? await fetch(audioPath).then((res) => res.arrayBuffer())
275
+ : await Bun.file(audioPath).arrayBuffer();
276
+
277
+ result = await transcribe({
278
+ model: groq.transcription("whisper-large-v3"),
279
+ audio: new Uint8Array(audioData),
280
+ providerOptions: {
281
+ groq: {
282
+ responseFormat: "verbose_json",
283
+ timestampGranularities: ["word"],
284
+ },
285
+ },
286
+ });
287
+ } finally {
288
+ if (transcribeTaskId && ctx.progress)
289
+ completeTask(ctx.progress, transcribeTaskId);
290
+ }
291
+
292
+ const rawBody = (result.responses[0] as { body?: unknown })?.body;
293
+ const parsed = groqResponseSchema.safeParse(rawBody);
294
+ const words = parsed.success ? parsed.data.words : undefined;
295
+
296
+ if (!words || words.length === 0) {
297
+ srtContent = `1\n00:00:00,000 --> 00:00:05,000\n${result.text}\n`;
298
+ } else {
299
+ srtContent = convertToSRT(words);
300
+ }
287
301
  }
288
302
 
289
303
  srtPath = `/tmp/varg-captions-${Date.now()}.srt`;
@@ -11,6 +11,7 @@
11
11
  * Must be called BEFORE renderRoot() so the render pipeline sees a fully
12
12
  * static VargElement tree.
13
13
  */
14
+ import { ResolvedElement } from "../resolved-element";
14
15
  import type { VargElement, VargNode } from "../types";
15
16
 
16
17
  export async function resolveLazy(node: VargNode): Promise<VargNode> {
@@ -53,6 +54,13 @@ export async function resolveLazy(node: VargNode): Promise<VargNode> {
53
54
  }
54
55
  }
55
56
 
57
+ // Preserve ResolvedElement instances (segments, pre-resolved speech/images).
58
+ // Spreading would destroy the prototype and break `instanceof` checks
59
+ // used by the clip renderer to detect pre-resolved elements.
60
+ if (element instanceof ResolvedElement) {
61
+ return element;
62
+ }
63
+
56
64
  // Return a new element with resolved children (don't mutate the original).
57
65
  // IMPORTANT: strip .then from the spread to prevent Promise.all from
58
66
  // treating the result as a thenable (Image/Speech/etc. elements are thenable).