vargai 0.4.0-alpha65 → 0.4.0-alpha67
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
package/src/ai-sdk/index.ts CHANGED
@@ -168,6 +168,12 @@ export interface ElevenLabsProviderSettings {
   apiKey?: string;
 }
 
+/** Default model IDs used when callers omit the modelId argument. */
+export const ELEVENLABS_DEFAULTS = {
+  speechModel: "eleven_turbo_v2",
+  musicModel: "music_v1",
+} as const;
+
 export interface ElevenLabsProvider extends ProviderV3 {
   speechModel(modelId?: string): SpeechModelV3;
   musicModel(modelId?: string): MusicModelV3;
@@ -184,10 +190,10 @@ export function createElevenLabs(
 
   return {
     specificationVersion: "v3",
-    speechModel(modelId = "eleven_turbo_v2") {
+    speechModel(modelId = ELEVENLABS_DEFAULTS.speechModel) {
       return new ElevenLabsSpeechModel(modelId, client);
     },
-    musicModel(modelId = "music_v1") {
+    musicModel(modelId = ELEVENLABS_DEFAULTS.musicModel) {
       return new ElevenLabsMusicModel(modelId, client);
     },
     languageModel(modelId: string): LanguageModelV3 {
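A quick usage sketch of the refactor above. Only `createElevenLabs` and `ELEVENLABS_DEFAULTS` are confirmed by this diff; the import specifier and the settings object are illustrative assumptions:

import { createElevenLabs, ELEVENLABS_DEFAULTS } from "vargai/ai-sdk"; // subpath is an assumption

const elevenlabs = createElevenLabs({ apiKey: process.env.ELEVENLABS_API_KEY });

// Omitting modelId now falls back to the exported constant, so these two
// calls construct the same speech model:
const a = elevenlabs.speechModel();
const b = elevenlabs.speechModel(ELEVENLABS_DEFAULTS.speechModel); // "eleven_turbo_v2"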
@@ -0,0 +1,136 @@
+import { readFileSync, writeFileSync } from "node:fs";
+
+export interface AssSegment {
+  assPath: string;
+  timeOffset: number;
+  styleSuffix?: string;
+}
+
+/**
+ * Parse ASS timestamp `H:MM:SS.CC` to seconds.
+ */
+function parseAssTime(ts: string): number {
+  const match = ts.match(/^(\d+):(\d{2}):(\d{2})\.(\d{2})$/);
+  if (!match) return 0;
+  const [, h, m, s, cs] = match;
+  return (
+    Number.parseInt(h!, 10) * 3600 +
+    Number.parseInt(m!, 10) * 60 +
+    Number.parseInt(s!, 10) +
+    Number.parseInt(cs!, 10) / 100
+  );
+}
+
+/**
+ * Format seconds to ASS timestamp `H:MM:SS.CC`.
+ * Computes from total centiseconds to avoid overflow when rounding
+ * lands on 100 cs (e.g. 1.999s would otherwise produce `0:00:01.100`).
+ */
+function formatAssTime(seconds: number): string {
+  const totalCs = Math.max(0, Math.round(seconds * 100));
+  const h = Math.floor(totalCs / 360000);
+  const m = Math.floor((totalCs % 360000) / 6000);
+  const s = Math.floor((totalCs % 6000) / 100);
+  const cs = totalCs % 100;
+  return `${h}:${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")}.${String(cs).padStart(2, "0")}`;
+}
+
+/**
+ * Shift all Dialogue timestamps in an ASS file by `offset` seconds.
+ * Returns path to a new temp file.
+ */
+export function shiftAssTimestamps(assPath: string, offset: number): string {
+  const content = readFileSync(assPath, "utf-8");
+  const shifted = content.replace(
+    /^(Dialogue:\s*\d+,)(\d+:\d{2}:\d{2}\.\d{2}),(\d+:\d{2}:\d{2}\.\d{2})/gm,
+    (_match, prefix: string, startTs: string, endTs: string) => {
+      const newStart = formatAssTime(parseAssTime(startTs) + offset);
+      const newEnd = formatAssTime(parseAssTime(endTs) + offset);
+      return `${prefix}${newStart},${newEnd}`;
+    },
+  );
+  const outPath = `/tmp/varg-shifted-captions-${Date.now()}.ass`;
+  writeFileSync(outPath, shifted);
+  return outPath;
+}
+
+/**
+ * Merge multiple ASS files into one, shifting timestamps and renaming styles
+ * to avoid collisions between segments.
+ *
+ * Each segment's `Default` style is renamed to `Default_N` (using styleSuffix)
+ * and all its Dialogue lines reference the renamed style.
+ */
+export function mergeAssFiles(
+  segments: AssSegment[],
+  width: number,
+  height: number,
+): string {
+  const allStyles: string[] = [];
+  const allDialogues: string[] = [];
+
+  for (const segment of segments) {
+    const content = readFileSync(segment.assPath, "utf-8");
+    const suffix = segment.styleSuffix ?? "";
+
+    // Extract Style lines from [V4+ Styles] section
+    const styleLines = content
+      .split("\n")
+      .filter((line) => line.startsWith("Style:"));
+
+    for (const styleLine of styleLines) {
+      // Rename style: "Style: Default,..." -> "Style: Default_0,..."
+      // Use [^,]+ to handle style names that may contain spaces.
+      const renamed = styleLine.replace(
+        /^Style:\s*([^,]+),/,
+        (_m, name: string) => `Style: ${name.trim()}${suffix},`,
+      );
+      allStyles.push(renamed);
+    }
+
+    // Extract Dialogue lines from [Events] section
+    const dialogueLines = content
+      .split("\n")
+      .filter((line) => line.startsWith("Dialogue:"));
+
+    for (const dialogueLine of dialogueLines) {
+      // Parse: Dialogue: Layer,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text
+      const parts = dialogueLine.split(",");
+      if (parts.length < 10) continue;
+
+      // Shift Start (index 1) and End (index 2)
+      const startTs = parts[1]!.trim();
+      const endTs = parts[2]!.trim();
+      parts[1] = formatAssTime(parseAssTime(startTs) + segment.timeOffset);
+      parts[2] = formatAssTime(parseAssTime(endTs) + segment.timeOffset);
+
+      // Rename style reference (index 3)
+      const styleName = parts[3]!.trim();
+      parts[3] = `${styleName}${suffix}`;
+
+      allDialogues.push(parts.join(","));
+    }
+  }
+
+  const header = `[Script Info]
+Title: Merged Subtitles
+ScriptType: v4.00+
+PlayResX: ${width}
+PlayResY: ${height}
+WrapStyle: 0
+ScaledBorderAndShadow: yes
+YCbCr Matrix: TV.601
+
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+${allStyles.join("\n")}
+
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+${allDialogues.join("\n")}
+`;
+
+  const outPath = `/tmp/varg-merged-captions-${Date.now()}.ass`;
+  writeFileSync(outPath, header);
+  return outPath;
+}
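To make the new module concrete, a self-contained sketch of shiftAssTimestamps (the demo file and its path are hypothetical; the module added above is imported as "./merge-ass" by the next hunk):

import { readFileSync, writeFileSync } from "node:fs";
import { shiftAssTimestamps } from "./merge-ass";

// Hypothetical one-line caption file.
const src = "/tmp/demo-captions.ass";
writeFileSync(src, "Dialogue: 0,0:00:01.00,0:00:03.50,Default,,0,0,0,,Hello\n");

// Shift every Dialogue line by 2.5 s. Because formatAssTime rounds total
// centiseconds rather than each field, an input such as 1.999 s formats as
// 0:00:02.00 instead of the malformed 0:00:01.100.
const out = shiftAssTimestamps(src, 2.5);
console.log(readFileSync(out, "utf-8"));
// -> Dialogue: 0,0:00:03.50,0:00:06.00,Default,,0,0,0,,Hello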
@@ -29,10 +29,11 @@ import type {
   VargElement,
 } from "../types";
 import { burnCaptions } from "./burn-captions";
-import { renderCaptions } from "./captions";
+import { type CaptionsResult, renderCaptions } from "./captions";
 import { renderClip } from "./clip";
 import type { RenderContext } from "./context";
 import { renderImage } from "./image";
+import { mergeAssFiles, shiftAssTimestamps } from "./merge-ass";
 import { renderMusic } from "./music";
 import {
   addTask,
@@ -172,32 +173,56 @@ export async function renderRoot(
 
   // Hoist <Captions> out of <Clip> elements — the AI often places them inside
   // clips, but captions must be processed at the <Render> level to work.
-
+  // Track which clip each caption came from so we can apply the correct
+  // timeline offset when stitching audio and ASS files.
+  // We shallow-clone clip elements to avoid mutating the caller's tree.
+  interface HoistedCaption {
+    element: VargElement<"captions">;
+    clipIndex: number;
+  }
+  const hoistedCaptions: HoistedCaption[] = [];
+  const processedChildren: VargElement[] = [];
+  let clipIndexCounter = 0;
   for (const child of element.children) {
-    if (!child || typeof child !== "object" || !("type" in child)) continue;
+    if (!child || typeof child !== "object" || !("type" in child)) {
+      continue;
+    }
     const childElement = child as VargElement;
-    if (childElement.type === "clip"
-    const
-
-
-
-
-
-
-
-
-
+    if (childElement.type === "clip") {
+      const currentClipIndex = clipIndexCounter++;
+      if (childElement.children) {
+        const kept: typeof childElement.children = [];
+        for (const clipChild of childElement.children) {
+          if (
+            clipChild &&
+            typeof clipChild === "object" &&
+            "type" in clipChild &&
+            (clipChild as VargElement).type === "captions"
+          ) {
+            hoistedCaptions.push({
+              element: clipChild as VargElement<"captions">,
+              clipIndex: currentClipIndex,
+            });
+          } else {
+            kept.push(clipChild);
+          }
         }
+        // Shallow-clone the clip with captions removed, leaving the
+        // original element tree untouched for potential re-renders.
+        processedChildren.push({
+          ...childElement,
+          children: kept,
+        } as VargElement);
+      } else {
+        processedChildren.push(childElement);
       }
-
+    } else {
+      processedChildren.push(childElement);
     }
   }
 
-  for (const child of
-
-
-    const childElement = child as VargElement;
+  for (const child of processedChildren) {
+    const childElement = child;
 
     if (childElement.type === "clip") {
       clipElements.push(childElement as VargElement<"clip">);
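The shape of that transformation, sketched with a simplified element type (`El` is an illustrative stand-in for VargElement; real props are omitted):

// Illustrative stand-in for VargElement.
type El = { type: string; children?: El[] };

const renderEl: El = {
  type: "render",
  children: [
    { type: "clip", children: [{ type: "captions" }, { type: "image" }] }, // clipIndex 0
    { type: "clip", children: [{ type: "video" }] }, // clipIndex 1
  ],
};

// After the pass above:
//   hoistedCaptions   = [{ element: { type: "captions" }, clipIndex: 0 }]
//   processedChildren = [
//     { type: "clip", children: [{ type: "image" }] }, // shallow clone, captions removed
//     { type: "clip", children: [{ type: "video" }] }, // also shallow-cloned (children present)
//   ]
// renderEl.children itself is never mutated.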
@@ -230,17 +255,9 @@ export async function renderRoot(
     }
   }
 
-  //
-
-
-    captionsResult = await renderCaptions(captionsElement, ctx);
-    if (captionsResult.audioPath) {
-      audioTracks.push({
-        path: captionsResult.audioPath,
-        mixVolume: 1,
-      });
-    }
-  }
+  // Hoisted captions are processed AFTER clip timeline offsets are computed
+  // (see below) so that each caption's audio can be delayed to the correct
+  // clip start time and ASS timestamps can be shifted accordingly.
 
   const renderedOverlays: RenderedOverlay[] = [];
   for (const overlay of overlayElements) {
@@ -335,6 +352,7 @@ export async function renderRoot(
   });
 
   const clips: Clip[] = [];
+  const clipStartOffsets: number[] = [];
   let currentTime = 0;
 
   for (let i = 0; i < clipElements.length; i++) {
@@ -347,6 +365,8 @@ export async function renderRoot(
     const clipDuration =
       typeof clipProps.duration === "number" ? clipProps.duration : 3;
 
+    clipStartOffsets.push(currentTime);
+
     for (const overlay of renderedOverlays) {
       const overlayLayer: VideoLayer = {
         type: "video",
@@ -371,6 +391,54 @@
 
   const totalDuration = currentTime;
 
+  // Process any <Captions> that were hoisted from inside <Clip> elements.
+  // Now that we know clipStartOffsets, each caption's audio can be delayed
+  // and ASS timestamps shifted to the correct position in the timeline.
+  const hoistedCaptionsResults: CaptionsResult[] = [];
+  let mergedAssPath: string | undefined;
+
+  if (captionsResult && hoistedCaptions.length > 0) {
+    console.warn(
+      `\x1b[33m⚠ Found both a Render-level <Captions> and ${hoistedCaptions.length} clip-level <Captions>. ` +
+        "Clip-level captions will be ignored — move all <Captions> inside clips, " +
+        "or use a single <Captions> at the Render level.\x1b[0m",
+    );
+  }
+
+  if (!captionsResult && hoistedCaptions.length > 0) {
+    for (const { element: captionsElement, clipIndex } of hoistedCaptions) {
+      const result = await renderCaptions(captionsElement, ctx);
+      hoistedCaptionsResults.push(result);
+
+      if (result.audioPath) {
+        audioTracks.push({
+          path: result.audioPath,
+          start: clipStartOffsets[clipIndex] ?? 0,
+          mixVolume: 1,
+        });
+      }
+    }
+
+    // Merge ASS files: shift timestamps by each clip's start offset
+    if (hoistedCaptionsResults.length === 1) {
+      const offset = clipStartOffsets[hoistedCaptions[0]!.clipIndex] ?? 0;
+      const assPath = hoistedCaptionsResults[0]!.assPath;
+      mergedAssPath =
+        offset > 0 ? shiftAssTimestamps(assPath, offset) : assPath;
+      if (mergedAssPath !== assPath) {
+        ctx.tempFiles.push(mergedAssPath);
+      }
+    } else if (hoistedCaptionsResults.length > 1) {
+      const segments = hoistedCaptionsResults.map((result, i) => ({
+        assPath: result.assPath,
+        timeOffset: clipStartOffsets[hoistedCaptions[i]!.clipIndex] ?? 0,
+        styleSuffix: `_${i}`,
+      }));
+      mergedAssPath = mergeAssFiles(segments, ctx.width, ctx.height);
+      ctx.tempFiles.push(mergedAssPath);
+    }
+  }
+
   // process music after clips so we know total duration for auto-trim
   for (const musicElement of musicElements) {
     const musicProps = musicElement.props as MusicProps;
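A worked example of the offset bookkeeping, assuming two hypothetical clips of 3 s and 4 s that each carry a clip-level <Captions>:

// Mirrors the clip loop above: each offset is recorded before its
// clip's duration accrues onto currentTime.
const clipDurations = [3, 4];
const clipStartOffsets: number[] = [];
let currentTime = 0;
for (const d of clipDurations) {
  clipStartOffsets.push(currentTime); // ends up as [0, 3]
  currentTime += d; // totalDuration = 7
}

// Caption audio for clip 1 is mixed in with start: 3, and with two
// CaptionsResults mergeAssFiles shifts clip 1's Dialogue lines by 3 s and
// renames styles Default -> Default_0 / Default_1 to avoid collisions.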
@@ -400,7 +468,10 @@ export async function renderRoot(
     });
   }
 
-
+  // Determine the ASS path to burn: Render-level captions take priority,
+  // then merged/shifted hoisted captions from clips.
+  const finalAssPath = captionsResult?.assPath ?? mergedAssPath;
+  const hasCaptions = finalAssPath !== undefined;
 
   const tempOutPath = hasCaptions
     ? `/tmp/varg-pre-captions-${Date.now()}.mp4`
@@ -426,13 +497,13 @@
 
   let output = editlyResult.output;
 
-  if (hasCaptions &&
+  if (hasCaptions && finalAssPath) {
     const captionsTaskId = addTask(progress, "captions", "ffmpeg");
     startTask(progress, captionsTaskId);
 
     output = await burnCaptions({
       video: output,
-      assPath:
+      assPath: finalAssPath,
       outputPath: finalOutPath,
       backend: options.backend,
       verbose: options.verbose,