npm - vargai - Versions diffs - 0.4.0-alpha65 → 0.4.0-alpha67 - Mend

vargai 0.4.0-alpha65 → 0.4.0-alpha67

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/package.json +1 -1
package/src/ai-sdk/index.ts +1 -0
package/src/ai-sdk/providers/elevenlabs.ts +8 -2
package/src/react/renderers/merge-ass.ts +136 -0
package/src/react/renderers/render.ts +105 -34

package/package.json CHANGED Viewed

@@ -70,7 +70,7 @@
     "zod": "^4.2.1"
   },
   "sideEffects": false,
-  "version": "0.4.0-alpha65",
+  "version": "0.4.0-alpha67",
   "exports": {
     ".": "./src/index.ts",
     "./ai": "./src/ai-sdk/index.ts",

package/src/ai-sdk/index.ts CHANGED Viewed

@@ -64,6 +64,7 @@ export {
 } from "./providers/editly/rendi";
 export {
   createElevenLabs,
+  ELEVENLABS_DEFAULTS,
   type ElevenLabsProvider,
   elevenlabs,
   VOICES,

package/src/ai-sdk/providers/elevenlabs.ts CHANGED Viewed

@@ -168,6 +168,12 @@ export interface ElevenLabsProviderSettings {
   apiKey?: string;
 }
+/**
+ * Default model IDs used when callers omit the modelId argument.
+ *
+ * `speechModel` is the default parameter value of the provider's
+ * `speechModel()` factory and `musicModel` that of `musicModel()`.
+ * Declared `as const`, so both values keep their string-literal types.
+ */
+export const ELEVENLABS_DEFAULTS = {
+  speechModel: "eleven_turbo_v2",
+  musicModel: "music_v1",
+} as const;
 export interface ElevenLabsProvider extends ProviderV3 {
   speechModel(modelId?: string): SpeechModelV3;
   musicModel(modelId?: string): MusicModelV3;
@@ -184,10 +190,10 @@ export function createElevenLabs(
   return {
     specificationVersion: "v3",
-    speechModel(modelId = "eleven_turbo_v2") {
+    speechModel(modelId = ELEVENLABS_DEFAULTS.speechModel) {
       return new ElevenLabsSpeechModel(modelId, client);
     },
-    musicModel(modelId = "music_v1") {
+    musicModel(modelId = ELEVENLABS_DEFAULTS.musicModel) {
       return new ElevenLabsMusicModel(modelId, client);
     },
     languageModel(modelId: string): LanguageModelV3 {

package/src/react/renderers/merge-ass.ts ADDED Viewed

@@ -0,0 +1,136 @@
+import { readFileSync, writeFileSync } from "node:fs";
+/**
+ * One input subtitle file for `mergeAssFiles`, together with how it should
+ * be placed on the merged timeline.
+ */
+export interface AssSegment {
+  /** Path of the ASS file to merge; it is read as UTF-8 text. */
+  assPath: string;
+  /** Seconds added to the Start and End time of every Dialogue line. */
+  timeOffset: number;
+  /**
+   * Text appended to each style name and to each Dialogue line's style
+   * reference. Treated as an empty string when omitted.
+   */
+  styleSuffix?: string;
+}
+/**
+ * Convert an ASS timestamp of the form `H:MM:SS.CC` (hours, minutes,
+ * seconds, centiseconds) into a number of seconds.
+ *
+ * Any string that does not match that exact shape yields `0`.
+ */
+function parseAssTime(ts: string): number {
+  const parsed = /^(\d+):(\d{2}):(\d{2})\.(\d{2})$/.exec(ts);
+  if (parsed === null) {
+    return 0;
+  }
+  // Capture groups 1-4 are hours, minutes, seconds and centiseconds.
+  const [hours = 0, minutes = 0, secs = 0, centis = 0] = parsed
+    .slice(1)
+    .map((digits) => Number.parseInt(digits, 10));
+  return hours * 3600 + minutes * 60 + secs + centis / 100;
+}
+/**
+ * Render a duration in seconds as an ASS timestamp `H:MM:SS.CC`.
+ *
+ * The value is first rounded to a whole number of centiseconds (negative
+ * values are clamped to zero) and every field is derived from that single
+ * integer. Rounding therefore carries into the seconds field instead of
+ * yielding a three-digit centisecond part (1.999s becomes `0:00:02.00`).
+ */
+function formatAssTime(seconds: number): string {
+  const twoDigits = (value: number): string => String(value).padStart(2, "0");
+  const totalCs = Math.max(0, Math.round(seconds * 100));
+  const hours = Math.floor(totalCs / 360000);
+  const csWithinHour = totalCs % 360000;
+  const minutes = Math.floor(csWithinHour / 6000);
+  const csWithinMinute = totalCs % 6000;
+  const wholeSeconds = Math.floor(csWithinMinute / 100);
+  const centis = totalCs % 100;
+  return `${hours}:${twoDigits(minutes)}:${twoDigits(wholeSeconds)}.${twoDigits(centis)}`;
+}
+/**
+ * Write a copy of an ASS file in which the Start and End time of every
+ * `Dialogue:` line has been moved by `offset` seconds.
+ *
+ * Only lines that begin with `Dialogue:`, a numeric layer and two
+ * `H:MM:SS.CC` timestamps are rewritten; all other text is copied
+ * unchanged. The input file itself is not modified.
+ *
+ * @param assPath Path of the ASS file to read (UTF-8).
+ * @param offset Seconds to add to each timestamp.
+ * @returns Path of the newly written file under `/tmp`; this function
+ *   does not delete it.
+ */
+export function shiftAssTimestamps(assPath: string, offset: number): string {
+  const moved = (ts: string): string =>
+    formatAssTime(parseAssTime(ts) + offset);
+  const dialogueTimes =
+    /^(Dialogue:\s*\d+,)(\d+:\d{2}:\d{2}\.\d{2}),(\d+:\d{2}:\d{2}\.\d{2})/gm;
+  const original = readFileSync(assPath, "utf-8");
+  const rewritten = original.replace(
+    dialogueTimes,
+    (_whole, lead: string, begin: string, finish: string) =>
+      [lead + moved(begin), moved(finish)].join(","),
+  );
+  // The file name is made distinct with the current millisecond timestamp.
+  const outPath = `/tmp/varg-shifted-captions-${Date.now()}.ass`;
+  writeFileSync(outPath, rewritten);
+  return outPath;
+}
+/**
+ * Merge multiple ASS files into one, shifting timestamps and renaming styles
+ * to avoid collisions between segments.
+ *
+ * For every segment, each `Style:` line has the segment's `styleSuffix`
+ * appended to its style name (e.g. `Default` -> `Default_0`), and each
+ * `Dialogue:` line has its Start/End shifted by `timeOffset` seconds and its
+ * style reference renamed the same way. Dialogue lines with fewer than ten
+ * comma-separated fields are dropped. Commas inside the Text field are
+ * preserved.
+ *
+ * Input files may use LF or CRLF line endings; the merged file is written
+ * with LF only.
+ *
+ * NOTE: Style lines are copied as-is apart from the name, so they are
+ * assumed to use the same field order as the `Format:` line written below.
+ * Style names referenced from inside dialogue text are not renamed.
+ *
+ * @param segments Files to merge, in output order.
+ * @param width Value written as `PlayResX` in the merged header.
+ * @param height Value written as `PlayResY` in the merged header.
+ * @returns Path of the merged file written under `/tmp`; this function does
+ *   not delete it.
+ */
+export function mergeAssFiles(
+  segments: AssSegment[],
+  width: number,
+  height: number,
+): string {
+  const allStyles: string[] = [];
+  const allDialogues: string[] = [];
+  for (const segment of segments) {
+    const suffix = segment.styleSuffix ?? "";
+    // Split on LF or CRLF so files with Windows line endings do not leave a
+    // stray "\r" on every Style/Dialogue line. Read and split only once.
+    const lines = readFileSync(segment.assPath, "utf-8").split(/\r?\n/);
+    for (const line of lines) {
+      if (line.startsWith("Style:")) {
+        // Rename style: "Style: Default,..." -> "Style: Default_0,..."
+        // Use [^,]+ to handle style names that may contain spaces.
+        allStyles.push(
+          line.replace(
+            /^Style:\s*([^,]+),/,
+            (_m, name: string) => `Style: ${name.trim()}${suffix},`,
+          ),
+        );
+      } else if (line.startsWith("Dialogue:")) {
+        // Dialogue: Layer,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text
+        const fields = line.split(",");
+        if (fields.length < 10) continue;
+        // Everything after Style is kept verbatim; Text may itself contain
+        // commas, which the final join(",") restores.
+        const [layer = "", start = "", end = "", style = "", ...rest] = fields;
+        allDialogues.push(
+          [
+            layer,
+            formatAssTime(parseAssTime(start.trim()) + segment.timeOffset),
+            formatAssTime(parseAssTime(end.trim()) + segment.timeOffset),
+            `${style.trim()}${suffix}`,
+            ...rest,
+          ].join(","),
+        );
+      }
+    }
+  }
+  const header = `[Script Info]
+Title: Merged Subtitles
+ScriptType: v4.00+
+PlayResX: ${width}
+PlayResY: ${height}
+WrapStyle: 0
+ScaledBorderAndShadow: yes
+YCbCr Matrix: TV.601
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+${allStyles.join("\n")}
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+${allDialogues.join("\n")}
+`;
+  // The file name is made distinct with the current millisecond timestamp.
+  const outPath = `/tmp/varg-merged-captions-${Date.now()}.ass`;
+  writeFileSync(outPath, header);
+  return outPath;
+}

package/src/react/renderers/render.ts CHANGED Viewed

@@ -29,10 +29,11 @@ import type {
   VargElement,
 } from "../types";
 import { burnCaptions } from "./burn-captions";
-import { renderCaptions } from "./captions";
+import { type CaptionsResult, renderCaptions } from "./captions";
 import { renderClip } from "./clip";
 import type { RenderContext } from "./context";
 import { renderImage } from "./image";
+import { mergeAssFiles, shiftAssTimestamps } from "./merge-ass";
 import { renderMusic } from "./music";
 import {
   addTask,
@@ -172,32 +173,56 @@ export async function renderRoot(
   // Hoist <Captions> out of <Clip> elements — the AI often places them inside
   // clips, but captions must be processed at the <Render> level to work.
-  const hoistedCaptions: VargElement<"captions">[] = [];
+  // Track which clip each caption came from so we can apply the correct
+  // timeline offset when stitching audio and ASS files.
+  // We shallow-clone clip elements to avoid mutating the caller's tree.
+  interface HoistedCaption {
+    element: VargElement<"captions">;
+    clipIndex: number;
+  }
+  const hoistedCaptions: HoistedCaption[] = [];
+  const processedChildren: VargElement[] = [];
+  let clipIndexCounter = 0;
   for (const child of element.children) {
-    if (!child || typeof child !== "object" || !("type" in child)) continue;
+    if (!child || typeof child !== "object" || !("type" in child)) {
+      continue;
+    }
     const childElement = child as VargElement;
-    if (childElement.type === "clip" && childElement.children) {
-      const kept: typeof childElement.children = [];
-      for (const clipChild of childElement.children) {
-        if (
-          clipChild &&
-          typeof clipChild === "object" &&
-          "type" in clipChild &&
-          (clipChild as VargElement).type === "captions"
-        ) {
-          hoistedCaptions.push(clipChild as VargElement<"captions">);
-        } else {
-          kept.push(clipChild);
+    if (childElement.type === "clip") {
+      const currentClipIndex = clipIndexCounter++;
+      if (childElement.children) {
+        const kept: typeof childElement.children = [];
+        for (const clipChild of childElement.children) {
+          if (
+            clipChild &&
+            typeof clipChild === "object" &&
+            "type" in clipChild &&
+            (clipChild as VargElement).type === "captions"
+          ) {
+            hoistedCaptions.push({
+              element: clipChild as VargElement<"captions">,
+              clipIndex: currentClipIndex,
+            });
+          } else {
+            kept.push(clipChild);
+          }
         }
+        // Shallow-clone the clip with captions removed, leaving the
+        // original element tree untouched for potential re-renders.
+        processedChildren.push({
+          ...childElement,
+          children: kept,
+        } as VargElement);
+      } else {
+        processedChildren.push(childElement);
       }
-      childElement.children = kept;
+    } else {
+      processedChildren.push(childElement);
     }
   }
-  for (const child of element.children) {
-    if (!child || typeof child !== "object" || !("type" in child)) continue;
-    const childElement = child as VargElement;
+  for (const child of processedChildren) {
+    const childElement = child;
     if (childElement.type === "clip") {
       clipElements.push(childElement as VargElement<"clip">);
@@ -230,17 +255,9 @@ export async function renderRoot(
     }
   }
-  // Process any <Captions> that were hoisted from inside <Clip> elements
-  if (!captionsResult && hoistedCaptions.length > 0) {
-    const captionsElement = hoistedCaptions[0]!;
-    captionsResult = await renderCaptions(captionsElement, ctx);
-    if (captionsResult.audioPath) {
-      audioTracks.push({
-        path: captionsResult.audioPath,
-        mixVolume: 1,
-      });
-    }
-  }
+  // Hoisted captions are processed AFTER clip timeline offsets are computed
+  // (see below) so that each caption's audio can be delayed to the correct
+  // clip start time and ASS timestamps can be shifted accordingly.
   const renderedOverlays: RenderedOverlay[] = [];
   for (const overlay of overlayElements) {
@@ -335,6 +352,7 @@ export async function renderRoot(
   });
   const clips: Clip[] = [];
+  const clipStartOffsets: number[] = [];
   let currentTime = 0;
   for (let i = 0; i < clipElements.length; i++) {
@@ -347,6 +365,8 @@ export async function renderRoot(
     const clipDuration =
       typeof clipProps.duration === "number" ? clipProps.duration : 3;
+    clipStartOffsets.push(currentTime);
     for (const overlay of renderedOverlays) {
       const overlayLayer: VideoLayer = {
         type: "video",
@@ -371,6 +391,54 @@ export async function renderRoot(
   const totalDuration = currentTime;
+  // Process any <Captions> that were hoisted from inside <Clip> elements.
+  // Now that we know clipStartOffsets, each caption's audio can be delayed
+  // and ASS timestamps shifted to the correct position in the timeline.
+  const hoistedCaptionsResults: CaptionsResult[] = [];
+  let mergedAssPath: string | undefined;
+  if (captionsResult && hoistedCaptions.length > 0) {
+    console.warn(
+      `\x1b[33m⚠ Found both a Render-level <Captions> and ${hoistedCaptions.length} clip-level <Captions>. ` +
+        "Clip-level captions will be ignored — move all <Captions> inside clips, " +
+        "or use a single <Captions> at the Render level.\x1b[0m",
+    );
+  }
+  if (!captionsResult && hoistedCaptions.length > 0) {
+    for (const { element: captionsElement, clipIndex } of hoistedCaptions) {
+      const result = await renderCaptions(captionsElement, ctx);
+      hoistedCaptionsResults.push(result);
+      if (result.audioPath) {
+        audioTracks.push({
+          path: result.audioPath,
+          start: clipStartOffsets[clipIndex] ?? 0,
+          mixVolume: 1,
+        });
+      }
+    }
+    // Merge ASS files: shift timestamps by each clip's start offset
+    if (hoistedCaptionsResults.length === 1) {
+      const offset = clipStartOffsets[hoistedCaptions[0]!.clipIndex] ?? 0;
+      const assPath = hoistedCaptionsResults[0]!.assPath;
+      mergedAssPath =
+        offset > 0 ? shiftAssTimestamps(assPath, offset) : assPath;
+      if (mergedAssPath !== assPath) {
+        ctx.tempFiles.push(mergedAssPath);
+      }
+    } else if (hoistedCaptionsResults.length > 1) {
+      const segments = hoistedCaptionsResults.map((result, i) => ({
+        assPath: result.assPath,
+        timeOffset: clipStartOffsets[hoistedCaptions[i]!.clipIndex] ?? 0,
+        styleSuffix: `_${i}`,
+      }));
+      mergedAssPath = mergeAssFiles(segments, ctx.width, ctx.height);
+      ctx.tempFiles.push(mergedAssPath);
+    }
+  }
   // process music after clips so we know total duration for auto-trim
   for (const musicElement of musicElements) {
     const musicProps = musicElement.props as MusicProps;
@@ -400,7 +468,10 @@ export async function renderRoot(
     });
   }
-  const hasCaptions = captionsResult !== undefined;
+  // Determine the ASS path to burn: Render-level captions take priority,
+  // then merged/shifted hoisted captions from clips.
+  const finalAssPath = captionsResult?.assPath ?? mergedAssPath;
+  const hasCaptions = finalAssPath !== undefined;
   const tempOutPath = hasCaptions
     ? `/tmp/varg-pre-captions-${Date.now()}.mp4`
@@ -426,13 +497,13 @@ export async function renderRoot(
   let output = editlyResult.output;
-  if (hasCaptions && captionsResult) {
+  if (hasCaptions && finalAssPath) {
     const captionsTaskId = addTask(progress, "captions", "ffmpeg");
     startTask(progress, captionsTaskId);
     output = await burnCaptions({
       video: output,
-      assPath: captionsResult.assPath,
+      assPath: finalAssPath,
       outputPath: finalOutPath,
       backend: options.backend,
       verbose: options.verbose,