vargai 0.4.0-alpha65 → 0.4.0-alpha67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -70,7 +70,7 @@
70
70
  "zod": "^4.2.1"
71
71
  },
72
72
  "sideEffects": false,
73
- "version": "0.4.0-alpha65",
73
+ "version": "0.4.0-alpha67",
74
74
  "exports": {
75
75
  ".": "./src/index.ts",
76
76
  "./ai": "./src/ai-sdk/index.ts",
@@ -64,6 +64,7 @@ export {
64
64
  } from "./providers/editly/rendi";
65
65
  export {
66
66
  createElevenLabs,
67
+ ELEVENLABS_DEFAULTS,
67
68
  type ElevenLabsProvider,
68
69
  elevenlabs,
69
70
  VOICES,
@@ -168,6 +168,12 @@ export interface ElevenLabsProviderSettings {
168
168
  apiKey?: string;
169
169
  }
170
170
 
171
+ /** Default model IDs used when callers omit the modelId argument. */
172
+ export const ELEVENLABS_DEFAULTS = {
173
+ speechModel: "eleven_turbo_v2",
174
+ musicModel: "music_v1",
175
+ } as const;
176
+
171
177
  export interface ElevenLabsProvider extends ProviderV3 {
172
178
  speechModel(modelId?: string): SpeechModelV3;
173
179
  musicModel(modelId?: string): MusicModelV3;
@@ -184,10 +190,10 @@ export function createElevenLabs(
184
190
 
185
191
  return {
186
192
  specificationVersion: "v3",
187
- speechModel(modelId = "eleven_turbo_v2") {
193
+ speechModel(modelId = ELEVENLABS_DEFAULTS.speechModel) {
188
194
  return new ElevenLabsSpeechModel(modelId, client);
189
195
  },
190
- musicModel(modelId = "music_v1") {
196
+ musicModel(modelId = ELEVENLABS_DEFAULTS.musicModel) {
191
197
  return new ElevenLabsMusicModel(modelId, client);
192
198
  },
193
199
  languageModel(modelId: string): LanguageModelV3 {
@@ -0,0 +1,136 @@
1
+ import { readFileSync, writeFileSync } from "node:fs";
2
+
3
/** One caption file to merge, together with its position on the output timeline. */
export interface AssSegment {
  /** Path of the ASS file to read. */
  assPath: string;
  /** Seconds added to the start and end time of every Dialogue line in this file. */
  timeOffset: number;
  /** Text appended to every style name of this file (e.g. `_0`). Defaults to no suffix. */
  styleSuffix?: string;
}

/** Per-process sequence number that keeps temp paths distinct within one millisecond. */
let tempFileCounter = 0;

/**
 * Build a fresh `/tmp/varg-<label>-captions-….ass` path.
 *
 * `Date.now()` alone repeats for calls made in the same millisecond, which
 * would make a later call overwrite an earlier output. The counter separates
 * calls inside this process; the random token separates concurrent processes.
 */
function createTempAssPath(label: string): string {
  tempFileCounter += 1;
  const token = Math.random().toString(36).slice(2, 10);
  return `/tmp/varg-${label}-captions-${Date.now()}-${tempFileCounter}-${token}.ass`;
}

/**
 * Parse an ASS timestamp (`H:MM:SS.CC`) to seconds.
 *
 * Surrounding whitespace is ignored and the fractional part may have one to
 * three digits, so `0:00:01.5` and `0:00:01.500` are both read as 1.5 s.
 *
 * @returns The time in seconds, or `0` when `ts` is not a recognisable timestamp.
 */
function parseAssTime(ts: string): number {
  const match = /^(\d+):(\d{2}):(\d{2})\.(\d{1,3})$/.exec(ts.trim());
  if (!match) return 0;
  const [, hours = "0", minutes = "0", secs = "0", fraction = "0"] = match;
  return (
    Number.parseInt(hours, 10) * 3600 +
    Number.parseInt(minutes, 10) * 60 +
    Number.parseInt(secs, 10) +
    Number(`0.${fraction}`)
  );
}

/**
 * Format seconds as an ASS timestamp `H:MM:SS.CC`.
 *
 * Every field is derived from the total number of centiseconds, so a value
 * that rounds up to a full second carries over (1.999 s -> `0:00:02.00`)
 * instead of producing a three-digit centisecond field.
 * Negative and non-finite inputs are rendered as `0:00:00.00`.
 */
function formatAssTime(seconds: number): string {
  const rounded = Math.round(seconds * 100);
  const totalCs = Number.isFinite(rounded) ? Math.max(0, rounded) : 0;
  const h = Math.floor(totalCs / 360000);
  const m = Math.floor((totalCs % 360000) / 6000);
  const s = Math.floor((totalCs % 6000) / 100);
  const cs = totalCs % 100;
  const pad = (value: number): string => String(value).padStart(2, "0");
  return `${h}:${pad(m)}:${pad(s)}.${pad(cs)}`;
}

/**
 * Shift the start and end time of every Dialogue line in an ASS file by
 * `offset` seconds. All other content is copied through unchanged.
 *
 * @param assPath Path of the ASS file to read.
 * @param offset Seconds to add; results below zero are clamped to `0:00:00.00`.
 * @returns Path of the newly written file. The caller owns it and is
 *   responsible for deleting it.
 */
export function shiftAssTimestamps(assPath: string, offset: number): string {
  const content = readFileSync(assPath, "utf-8");
  const shifted = content.replace(
    /^(Dialogue:\s*\d+,)(\d+:\d{2}:\d{2}\.\d{1,3}),(\d+:\d{2}:\d{2}\.\d{1,3})/gm,
    (_match, prefix: string, startTs: string, endTs: string) => {
      const newStart = formatAssTime(parseAssTime(startTs) + offset);
      const newEnd = formatAssTime(parseAssTime(endTs) + offset);
      return `${prefix}${newStart},${newEnd}`;
    },
  );
  const outPath = createTempAssPath("shifted");
  writeFileSync(outPath, shifted);
  return outPath;
}

/**
 * Merge several ASS files into one file.
 *
 * For each segment, every `Style:` line is copied with the segment's
 * `styleSuffix` appended to the style name, and every `Dialogue:` line is
 * copied with its times shifted by `timeOffset` and its style reference
 * renamed the same way, so identically named styles from different segments
 * do not collide. Dialogue lines with fewer than ten comma-separated fields
 * are dropped. Nothing else from the input files is carried over; the
 * `[Script Info]` block and both `Format:` lines are written fresh.
 *
 * NOTE(review): the emitted `Format:` lines are fixed, so the inputs are
 * assumed to use that same field order — confirm against the caption writer.
 *
 * @param segments Files to merge, emitted in the given order.
 * @param width Value written as `PlayResX`.
 * @param height Value written as `PlayResY`.
 * @returns Path of the newly written merged file. The caller owns it and is
 *   responsible for deleting it.
 */
export function mergeAssFiles(
  segments: AssSegment[],
  width: number,
  height: number,
): string {
  const allStyles: string[] = [];
  const allDialogues: string[] = [];

  for (const segment of segments) {
    const suffix = segment.styleSuffix ?? "";
    // Split on LF or CRLF so Windows-style files do not leave a stray "\r"
    // at the end of every copied line.
    const lines = readFileSync(segment.assPath, "utf-8").split(/\r?\n/);

    for (const line of lines) {
      if (line.startsWith("Style:")) {
        // "Style: Default,..." -> "Style: Default_0,...". The name is
        // everything up to the first comma, so names with spaces work.
        allStyles.push(
          line.replace(
            /^Style:\s*([^,]+),/,
            (_m, name: string) => `Style: ${name.trim()}${suffix},`,
          ),
        );
        continue;
      }

      if (!line.startsWith("Dialogue:")) continue;

      // Layer,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text
      // Text may itself contain commas; those pieces land in `rest` and are
      // restored verbatim by the join below.
      const parts = line.split(",");
      if (parts.length < 10) continue;

      const [layer = "", startTs = "", endTs = "", style = "", ...rest] = parts;
      allDialogues.push(
        [
          layer,
          formatAssTime(parseAssTime(startTs) + segment.timeOffset),
          formatAssTime(parseAssTime(endTs) + segment.timeOffset),
          `${style.trim()}${suffix}`,
          ...rest,
        ].join(","),
      );
    }
  }

  const merged = [
    "[Script Info]",
    "Title: Merged Subtitles",
    "ScriptType: v4.00+",
    `PlayResX: ${width}`,
    `PlayResY: ${height}`,
    "WrapStyle: 0",
    "ScaledBorderAndShadow: yes",
    "YCbCr Matrix: TV.601",
    "",
    "[V4+ Styles]",
    "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding",
    allStyles.join("\n"),
    "",
    "[Events]",
    "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
    allDialogues.join("\n"),
    "",
  ].join("\n");

  const outPath = createTempAssPath("merged");
  writeFileSync(outPath, merged);
  return outPath;
}
@@ -29,10 +29,11 @@ import type {
29
29
  VargElement,
30
30
  } from "../types";
31
31
  import { burnCaptions } from "./burn-captions";
32
- import { renderCaptions } from "./captions";
32
+ import { type CaptionsResult, renderCaptions } from "./captions";
33
33
  import { renderClip } from "./clip";
34
34
  import type { RenderContext } from "./context";
35
35
  import { renderImage } from "./image";
36
+ import { mergeAssFiles, shiftAssTimestamps } from "./merge-ass";
36
37
  import { renderMusic } from "./music";
37
38
  import {
38
39
  addTask,
@@ -172,32 +173,56 @@ export async function renderRoot(
172
173
 
173
174
  // Hoist <Captions> out of <Clip> elements — the AI often places them inside
174
175
  // clips, but captions must be processed at the <Render> level to work.
175
- const hoistedCaptions: VargElement<"captions">[] = [];
176
+ // Track which clip each caption came from so we can apply the correct
177
+ // timeline offset when stitching audio and ASS files.
178
+ // We shallow-clone clip elements to avoid mutating the caller's tree.
179
+ interface HoistedCaption {
180
+ element: VargElement<"captions">;
181
+ clipIndex: number;
182
+ }
183
+ const hoistedCaptions: HoistedCaption[] = [];
184
+ const processedChildren: VargElement[] = [];
185
+ let clipIndexCounter = 0;
176
186
  for (const child of element.children) {
177
- if (!child || typeof child !== "object" || !("type" in child)) continue;
187
+ if (!child || typeof child !== "object" || !("type" in child)) {
188
+ continue;
189
+ }
178
190
  const childElement = child as VargElement;
179
- if (childElement.type === "clip" && childElement.children) {
180
- const kept: typeof childElement.children = [];
181
- for (const clipChild of childElement.children) {
182
- if (
183
- clipChild &&
184
- typeof clipChild === "object" &&
185
- "type" in clipChild &&
186
- (clipChild as VargElement).type === "captions"
187
- ) {
188
- hoistedCaptions.push(clipChild as VargElement<"captions">);
189
- } else {
190
- kept.push(clipChild);
191
+ if (childElement.type === "clip") {
192
+ const currentClipIndex = clipIndexCounter++;
193
+ if (childElement.children) {
194
+ const kept: typeof childElement.children = [];
195
+ for (const clipChild of childElement.children) {
196
+ if (
197
+ clipChild &&
198
+ typeof clipChild === "object" &&
199
+ "type" in clipChild &&
200
+ (clipChild as VargElement).type === "captions"
201
+ ) {
202
+ hoistedCaptions.push({
203
+ element: clipChild as VargElement<"captions">,
204
+ clipIndex: currentClipIndex,
205
+ });
206
+ } else {
207
+ kept.push(clipChild);
208
+ }
191
209
  }
210
+ // Shallow-clone the clip with captions removed, leaving the
211
+ // original element tree untouched for potential re-renders.
212
+ processedChildren.push({
213
+ ...childElement,
214
+ children: kept,
215
+ } as VargElement);
216
+ } else {
217
+ processedChildren.push(childElement);
192
218
  }
193
- childElement.children = kept;
219
+ } else {
220
+ processedChildren.push(childElement);
194
221
  }
195
222
  }
196
223
 
197
- for (const child of element.children) {
198
- if (!child || typeof child !== "object" || !("type" in child)) continue;
199
-
200
- const childElement = child as VargElement;
224
+ for (const child of processedChildren) {
225
+ const childElement = child;
201
226
 
202
227
  if (childElement.type === "clip") {
203
228
  clipElements.push(childElement as VargElement<"clip">);
@@ -230,17 +255,9 @@ export async function renderRoot(
230
255
  }
231
256
  }
232
257
 
233
- // Process any <Captions> that were hoisted from inside <Clip> elements
234
- if (!captionsResult && hoistedCaptions.length > 0) {
235
- const captionsElement = hoistedCaptions[0]!;
236
- captionsResult = await renderCaptions(captionsElement, ctx);
237
- if (captionsResult.audioPath) {
238
- audioTracks.push({
239
- path: captionsResult.audioPath,
240
- mixVolume: 1,
241
- });
242
- }
243
- }
258
+ // Hoisted captions are processed AFTER clip timeline offsets are computed
259
+ // (see below) so that each caption's audio can be delayed to the correct
260
+ // clip start time and ASS timestamps can be shifted accordingly.
244
261
 
245
262
  const renderedOverlays: RenderedOverlay[] = [];
246
263
  for (const overlay of overlayElements) {
@@ -335,6 +352,7 @@ export async function renderRoot(
335
352
  });
336
353
 
337
354
  const clips: Clip[] = [];
355
+ const clipStartOffsets: number[] = [];
338
356
  let currentTime = 0;
339
357
 
340
358
  for (let i = 0; i < clipElements.length; i++) {
@@ -347,6 +365,8 @@ export async function renderRoot(
347
365
  const clipDuration =
348
366
  typeof clipProps.duration === "number" ? clipProps.duration : 3;
349
367
 
368
+ clipStartOffsets.push(currentTime);
369
+
350
370
  for (const overlay of renderedOverlays) {
351
371
  const overlayLayer: VideoLayer = {
352
372
  type: "video",
@@ -371,6 +391,54 @@ export async function renderRoot(
371
391
 
372
392
  const totalDuration = currentTime;
373
393
 
394
+ // Process any <Captions> that were hoisted from inside <Clip> elements.
395
+ // Now that we know clipStartOffsets, each caption's audio can be delayed
396
+ // and ASS timestamps shifted to the correct position in the timeline.
397
+ const hoistedCaptionsResults: CaptionsResult[] = [];
398
+ let mergedAssPath: string | undefined;
399
+
400
+ if (captionsResult && hoistedCaptions.length > 0) {
401
+ console.warn(
402
+ `\x1b[33m⚠ Found both a Render-level <Captions> and ${hoistedCaptions.length} clip-level <Captions>. ` +
403
+ "Clip-level captions will be ignored — move all <Captions> inside clips, " +
404
+ "or use a single <Captions> at the Render level.\x1b[0m",
405
+ );
406
+ }
407
+
408
+ if (!captionsResult && hoistedCaptions.length > 0) {
409
+ for (const { element: captionsElement, clipIndex } of hoistedCaptions) {
410
+ const result = await renderCaptions(captionsElement, ctx);
411
+ hoistedCaptionsResults.push(result);
412
+
413
+ if (result.audioPath) {
414
+ audioTracks.push({
415
+ path: result.audioPath,
416
+ start: clipStartOffsets[clipIndex] ?? 0,
417
+ mixVolume: 1,
418
+ });
419
+ }
420
+ }
421
+
422
+ // Merge ASS files: shift timestamps by each clip's start offset
423
+ if (hoistedCaptionsResults.length === 1) {
424
+ const offset = clipStartOffsets[hoistedCaptions[0]!.clipIndex] ?? 0;
425
+ const assPath = hoistedCaptionsResults[0]!.assPath;
426
+ mergedAssPath =
427
+ offset > 0 ? shiftAssTimestamps(assPath, offset) : assPath;
428
+ if (mergedAssPath !== assPath) {
429
+ ctx.tempFiles.push(mergedAssPath);
430
+ }
431
+ } else if (hoistedCaptionsResults.length > 1) {
432
+ const segments = hoistedCaptionsResults.map((result, i) => ({
433
+ assPath: result.assPath,
434
+ timeOffset: clipStartOffsets[hoistedCaptions[i]!.clipIndex] ?? 0,
435
+ styleSuffix: `_${i}`,
436
+ }));
437
+ mergedAssPath = mergeAssFiles(segments, ctx.width, ctx.height);
438
+ ctx.tempFiles.push(mergedAssPath);
439
+ }
440
+ }
441
+
374
442
  // process music after clips so we know total duration for auto-trim
375
443
  for (const musicElement of musicElements) {
376
444
  const musicProps = musicElement.props as MusicProps;
@@ -400,7 +468,10 @@ export async function renderRoot(
400
468
  });
401
469
  }
402
470
 
403
- const hasCaptions = captionsResult !== undefined;
471
+ // Determine the ASS path to burn: Render-level captions take priority,
472
+ // then merged/shifted hoisted captions from clips.
473
+ const finalAssPath = captionsResult?.assPath ?? mergedAssPath;
474
+ const hasCaptions = finalAssPath !== undefined;
404
475
 
405
476
  const tempOutPath = hasCaptions
406
477
  ? `/tmp/varg-pre-captions-${Date.now()}.mp4`
@@ -426,13 +497,13 @@ export async function renderRoot(
426
497
 
427
498
  let output = editlyResult.output;
428
499
 
429
- if (hasCaptions && captionsResult) {
500
+ if (hasCaptions && finalAssPath) {
430
501
  const captionsTaskId = addTask(progress, "captions", "ffmpeg");
431
502
  startTask(progress, captionsTaskId);
432
503
 
433
504
  output = await burnCaptions({
434
505
  video: output,
435
- assPath: captionsResult.assPath,
506
+ assPath: finalAssPath,
436
507
  outputPath: finalOutPath,
437
508
  backend: options.backend,
438
509
  verbose: options.verbose,