npm - vargai - Versions diffs - 0.4.0-alpha106 → 0.4.0-alpha107 - Mend

vargai 0.4.0-alpha106 → 0.4.0-alpha107

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/package.json +1 -1
package/src/react/renderers/captions.ts +33 -16
package/src/speech/map-segments.ts +2 -1
package/src/speech/parse-alignment.ts +111 -6
package/src/speech/word-segmenter.ts +172 -0

package/package.json CHANGED Viewed

@@ -107,7 +107,7 @@
   "license": "Apache-2.0",
   "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
   "sideEffects": false,
-  "version": "0.4.0-alpha106",
+  "version": "0.4.0-alpha107",
   "exports": {
     ".": "./src/index.ts",
     "./ai": "./src/ai-sdk/index.ts",

package/src/react/renderers/captions.ts CHANGED Viewed

@@ -2,20 +2,21 @@ import { writeFileSync } from "node:fs";
 import { groq } from "@ai-sdk/groq";
 import { experimental_transcribe as transcribe } from "ai";
 import { z } from "zod";
+import { smartJoin } from "../../speech/word-segmenter";
 import { ResolvedElement } from "../resolved-element";
 import type { CaptionsProps, VargElement } from "../types";
+import { ensureLocalFonts } from "./burn-captions";
 import type { RenderContext } from "./context";
 import {
-  type EmojiInstance,
-  type EmojiOverlay,
   calculateEmojiSize,
   calculateEmojiY,
+  type EmojiInstance,
+  type EmojiOverlay,
   extractEmoji,
   hasEmoji,
   stripEmoji,
 } from "./emoji";
 import { type FontResolution, getDefaultFontId, resolveFonts } from "./fonts";
-import { ensureLocalFonts } from "./burn-captions";
 import { addTask, completeTask, startTask } from "./progress";
 import { renderSpeech } from "./speech";
 import {
@@ -319,7 +320,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
     if (!activeColor) {
       // No highlight — show entire group as one event
-      let rawText = group.map((e) => e.text.replace(/\n/g, " ")).join(" ");
+      let rawText = smartJoin(group.map((e) => e.text.replace(/\n/g, " ")));
       // Strip emoji from the grouped text line
       let groupEmojiInstances: EmojiInstance[] | undefined;
@@ -350,7 +351,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
       for (const entry of group) {
         allGroupWords.push(entry.text.replace(/\n/g, " ").trim());
       }
-      let fullLineRaw = allGroupWords.join(" ");
+      const fullLineRaw = smartJoin(allGroupWords);
       let lineEmojiInstances: EmojiInstance[] | undefined;
       let strippedFullLine: string | undefined;
@@ -361,16 +362,13 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
       // Build per-word stripped words for highlight assembly
       const strippedWords = lineEmojiInstances
-        ? allGroupWords.map((w) =>
-            hasEmoji(w) ? stripEmoji(w, nSpaces) : w,
-          )
+        ? allGroupWords.map((w) => (hasEmoji(w) ? stripEmoji(w, nSpaces) : w))
         : allGroupWords;
       for (let wi = 0; wi < group.length; wi++) {
         const wordEntry = group[wi]!;
         const wordStart = wordEntry.start;
-        const wordEnd =
-          wi < group.length - 1 ? group[wi + 1]!.start : groupEnd;
+        const wordEnd = wi < group.length - 1 ? group[wi + 1]!.start : groupEnd;
         const parts: string[] = [];
         for (let idx = 0; idx < group.length; idx++) {
@@ -383,7 +381,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
           }
         }
-        const lineText = parts.join(" ");
+        const lineText = smartJoin(parts);
         // Collect emoji data once for the first word's dialogue
         // (all words in the group show the same line text, just with different highlight)
@@ -604,7 +602,10 @@ export async function renderCaptions(
   if (srtHasEmoji) {
     // Download fonts locally for measurement
     const localFontsDir = await ensureLocalFonts(
-      fontResolution.fontFiles.map((f) => ({ url: f.url, fileName: f.fileName })),
+      fontResolution.fontFiles.map((f) => ({
+        url: f.url,
+        fileName: f.fileName,
+      })),
     );
     // Build font name → local path mapping
@@ -617,7 +618,11 @@ export async function renderCaptions(
     const primaryFontPath = fontPathMap.get(fontResolution.primary.fontName);
     if (primaryFontPath) {
       const metrics = getFontMetrics(primaryFontPath, style.fontSize);
-      const emojiSize = calculateEmojiSize(metrics.winAscent, ctx.height, ctx.height);
+      const emojiSize = calculateEmojiSize(
+        metrics.winAscent,
+        ctx.height,
+        ctx.height,
+      );
       const spaceWidth = getSpaceWidth(primaryFontPath, style.fontSize);
       // +1 buffer space for visual breathing room between emoji and adjacent text
       spacesPerEmoji = Math.max(1, Math.ceil(emojiSize / spaceWidth) + 1);
@@ -655,8 +660,17 @@ export async function renderCaptions(
     const primaryFontPath = fontPathMap.get(style.fontName);
     const metrics = primaryFontPath
       ? getFontMetrics(primaryFontPath, style.fontSize)
-      : { ppem: style.fontSize * 0.64, capHeight: style.fontSize * 0.45, winAscent: style.fontSize * 0.7, winDescent: style.fontSize * 0.3 };
-    const emojiSize = calculateEmojiSize(metrics.winAscent, ctx.height, ctx.height);
+      : {
+          ppem: style.fontSize * 0.64,
+          capHeight: style.fontSize * 0.45,
+          winAscent: style.fontSize * 0.7,
+          winDescent: style.fontSize * 0.3,
+        };
+    const emojiSize = calculateEmojiSize(
+      metrics.winAscent,
+      ctx.height,
+      ctx.height,
+    );
     const nSpaces = spacesPerEmoji ?? 1;
     const spaceW = primaryFontPath
       ? getSpaceWidth(primaryFontPath, style.fontSize)
@@ -713,7 +727,10 @@ export async function renderCaptions(
   // When emoji are overlaid as color PNGs, exclude Noto Emoji from font files
   // (emoji chars are spaces in the ASS text, so the monochrome font is unused)
   const fontFiles = fontResolution.fontFiles
-    .filter((f) => !(emojiOverlays && emojiOverlays.length > 0 && f.id === "noto-emoji"))
+    .filter(
+      (f) =>
+        !(emojiOverlays && emojiOverlays.length > 0 && f.id === "noto-emoji"),
+    )
     .map((f) => ({ url: f.url, fileName: f.fileName }));
   return {

package/src/speech/map-segments.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import type { SegmentDescriptor, WordTiming } from "./types";
+import { countWords } from "./word-segmenter";
 /**
  * Map word-level timings back to the original string array to produce segments.
@@ -67,7 +68,7 @@ export function mapWordsToSegments(
   let wordIndex = 0;
   for (const text of children) {
-    const segmentWordCount = text.trim().split(/\s+/).filter(Boolean).length;
+    const segmentWordCount = countWords(text);
     if (segmentWordCount === 0) {
       const pos =

package/src/speech/parse-alignment.ts CHANGED Viewed

@@ -1,21 +1,33 @@
 import type { ElevenLabsCharacterAlignment, WordTiming } from "./types";
+import { hasSpacelessChars, segmentWords } from "./word-segmenter";
 /**
  * Convert ElevenLabs character-level alignment to word-level timing.
  *
  * ElevenLabs returns arrays of individual characters with start/end times.
- * This function groups consecutive non-whitespace characters into words
- * and computes each word's start (first char start) and end (last char end).
+ * This function groups characters into words and computes each word's
+ * start (first char start) and end (last char end).
+ *
+ * For languages that use spaces (English, Arabic, Korean, etc.), words are
+ * split at whitespace boundaries — same as before.
+ *
+ * For spaceless-script languages (Japanese, Chinese, Thai, etc.), we use
+ * `Intl.Segmenter` (ICU-backed) to find linguistically correct word
+ * boundaries, then map each word back to the character-level timing data.
  *
  * @example
  * ```ts
+ * // English
  * const alignment = {
  *   characters: ["H","e","l","l","o"," ","w","o","r","l","d"],
  *   character_start_times_seconds: [0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55],
  *   character_end_times_seconds:   [0.05, 0.1, 0.15, 0.2, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.65],
  * };
- * const words = parseElevenLabsAlignment(alignment);
+ * parseElevenLabsAlignment(alignment);
  * // [{word: "Hello", start: 0, end: 0.3}, {word: "world", start: 0.35, end: 0.65}]
+ *
+ * // Japanese — "これはテストです" → ["これ", "は", "テスト", "です"]
+ * // Each word gets precise timing from the character-level data.
  * ```
  */
 export function parseElevenLabsAlignment(
@@ -35,13 +47,40 @@ export function parseElevenLabsAlignment(
     return [];
   }
+  // Reconstruct the full text and check if it contains spaceless scripts
+  const fullText = characters.join("");
+  if (hasSpacelessChars(fullText)) {
+    return parseWithSegmenter(
+      fullText,
+      characters,
+      character_start_times_seconds,
+      character_end_times_seconds,
+    );
+  }
+  return parseByWhitespace(
+    characters,
+    character_start_times_seconds,
+    character_end_times_seconds,
+  );
+}
+/**
+ * Original whitespace-based parsing for space-delimited languages.
+ */
+function parseByWhitespace(
+  characters: string[],
+  startTimes: number[],
+  endTimes: number[],
+): WordTiming[] {
   const words: WordTiming[] = [];
   let wordChars = "";
   let wordStart = 0;
   for (let i = 0; i < characters.length; i++) {
     const char = characters[i]!;
-    const startTime = character_start_times_seconds[i]!;
+    const startTime = startTimes[i]!;
     const isWhitespace =
       char === " " || char === "\n" || char === "\t" || char === "\r";
@@ -51,7 +90,7 @@ export function parseElevenLabsAlignment(
         words.push({
           word: wordChars,
           start: wordStart,
-          end: character_end_times_seconds[i - 1] ?? wordStart,
+          end: endTimes[i - 1] ?? wordStart,
         });
         wordChars = "";
       }
@@ -66,7 +105,7 @@ export function parseElevenLabsAlignment(
   // Flush final word
   if (wordChars) {
-    const lastEnd = character_end_times_seconds[characters.length - 1];
+    const lastEnd = endTimes[characters.length - 1];
     words.push({
       word: wordChars,
       start: wordStart,
@@ -76,3 +115,69 @@ export function parseElevenLabsAlignment(
   return words;
 }
+/**
+ * Intl.Segmenter-based parsing for text containing spaceless scripts.
+ *
+ * Steps:
+ * 1. Record, for every entry of `characters`, the code-unit offset in
+ *    `fullText` at which that entry starts. Only start offsets are stored,
+ *    so an entry longer than one code unit still yields a single map key.
+ * 2. Call `segmentWords()` (backed by `Intl.Segmenter`) to find word
+ *    boundaries in the full text.
+ * 3. For each returned segment, take the start time of the first character
+ *    entry that begins inside it and the end time of the last one.
+ *
+ * Segments in which no character entry begins are skipped. A missing start
+ * time falls back to `0`; a missing end time falls back to the word's start.
+ * The emitted `word` is the segment text, not a re-join of `characters`.
+ *
+ * NOTE(review): the inner loop scans the whole offset map once per segment,
+ * so the work grows with segments × characters. Offsets are inserted in
+ * ascending order and segments appear to arrive in text order, so a single
+ * forward sweep looks possible — confirm before changing.
+ *
+ * @param fullText - Text the segments are computed from. The visible caller
+ *   passes `characters.join("")`, which the offset bookkeeping relies on.
+ * @param characters - Alignment characters, in order.
+ * @param startTimes - Start time for each entry of `characters`.
+ * @param endTimes - End time for each entry of `characters`.
+ * @returns One `WordTiming` per segment that could be mapped to at least one
+ *   character entry, in segment order.
+ */
+function parseWithSegmenter(
+  fullText: string,
+  characters: string[],
+  startTimes: number[],
+  endTimes: number[],
+): WordTiming[] {
+  // Build a mapping: start offset (code units) in fullText → character array index.
+  // Entries of `characters` may be longer than one code unit (e.g., emoji), so the
+  // running offset advances by each entry's own `.length` rather than by one.
+  const offsetToCharIndex = new Map<number, number>();
+  let offset = 0;
+  for (let ci = 0; ci < characters.length; ci++) {
+    offsetToCharIndex.set(offset, ci);
+    offset += characters[ci]!.length;
+  }
+  // Segment the full text into word-like tokens (see `segmentWords`)
+  const segments = segmentWords(fullText);
+  const words: WordTiming[] = [];
+  for (const seg of segments) {
+    // Half-open code-unit range [segStart, segEnd) covered by this segment
+    const segStart = seg.index;
+    const segEnd = seg.index + seg.length;
+    // Lowest / highest character-array index whose start offset lies in the range
+    let firstCharIdx: number | undefined;
+    let lastCharIdx: number | undefined;
+    // Full scan of the offset map; min/max are tracked explicitly per segment
+    for (const [off, ci] of offsetToCharIndex) {
+      if (off >= segStart && off < segEnd) {
+        if (firstCharIdx === undefined || ci < firstCharIdx) {
+          firstCharIdx = ci;
+        }
+        if (lastCharIdx === undefined || ci > lastCharIdx) {
+          lastCharIdx = ci;
+        }
+      }
+    }
+    if (firstCharIdx === undefined || lastCharIdx === undefined) continue;
+    const wordStart = startTimes[firstCharIdx] ?? 0;
+    const wordEnd = endTimes[lastCharIdx] ?? wordStart;
+    words.push({
+      word: seg.word,
+      start: wordStart,
+      end: wordEnd,
+    });
+  }
+  return words;
+}

package/src/speech/word-segmenter.ts ADDED Viewed

@@ -0,0 +1,172 @@
+/**
+ * Language-aware word segmentation utilities.
+ *
+ * Uses `Intl.Segmenter` (ICU-backed, zero dependencies) to handle languages
+ * that don't use spaces between words: Japanese, Chinese, Thai, Khmer, etc.
+ *
+ * For Latin/Cyrillic/Arabic/Korean and other space-delimited scripts, simple
+ * whitespace splitting is kept as the fast path: `hasSpacelessChars()` gates
+ * whether a caller such as `countWords()` goes through `Intl.Segmenter`.
+ */
+// ---------------------------------------------------------------------------
+// Script detection helpers
+// ---------------------------------------------------------------------------
+/**
+ * Check if a code point belongs to a script that doesn't use spaces between words.
+ *
+ * Covers: CJK ideographs, Hiragana, Katakana, Thai, Lao, Myanmar, Khmer, Tibetan,
+ * plus the CJK radical ranges U+2E80–U+2EFF and U+2F00–U+2FDF.
+ * Does NOT include Korean (Hangul) — Korean uses spaces between words.
+ *
+ * The check is a fixed list of inclusive code-point ranges; anything outside
+ * them returns `false`. Supplementary-plane coverage ends at U+2B81F (the
+ * range labelled Extension D below), so higher code points are not matched.
+ *
+ * @param cp - Unicode code point to classify (e.g. from `codePointAt`).
+ * @returns `true` when `cp` falls in one of the listed ranges.
+ */
+export function isSpacelessScript(cp: number): boolean {
+  // Hiragana
+  if (cp >= 0x3040 && cp <= 0x309f) return true;
+  // Katakana
+  if (cp >= 0x30a0 && cp <= 0x30ff) return true;
+  if (cp >= 0x31f0 && cp <= 0x31ff) return true; // Katakana Phonetic Extensions
+  if (cp >= 0xff65 && cp <= 0xff9f) return true; // Halfwidth Katakana
+  // CJK Unified Ideographs
+  if (cp >= 0x4e00 && cp <= 0x9fff) return true;
+  if (cp >= 0x3400 && cp <= 0x4dbf) return true; // Extension A
+  if (cp >= 0xf900 && cp <= 0xfaff) return true; // Compatibility
+  if (cp >= 0x20000 && cp <= 0x2a6df) return true; // Extension B
+  if (cp >= 0x2a700 && cp <= 0x2b73f) return true; // Extension C
+  if (cp >= 0x2b740 && cp <= 0x2b81f) return true; // Extension D
+  // CJK Radicals
+  if (cp >= 0x2e80 && cp <= 0x2eff) return true;
+  if (cp >= 0x2f00 && cp <= 0x2fdf) return true;
+  // Thai
+  if (cp >= 0x0e00 && cp <= 0x0e7f) return true;
+  // Lao
+  if (cp >= 0x0e80 && cp <= 0x0eff) return true;
+  // Myanmar
+  if (cp >= 0x1000 && cp <= 0x109f) return true;
+  // Khmer
+  if (cp >= 0x1780 && cp <= 0x17ff) return true;
+  // Tibetan
+  if (cp >= 0x0f00 && cp <= 0x0fff) return true;
+  return false;
+}
+/**
+ * Check if a string contains any characters from spaceless scripts.
+ * Used as a fast gate to decide whether we need `Intl.Segmenter`.
+ *
+ * The string is walked with `for...of`, and each element is classified by
+ * its first code point via `isSpacelessScript()`. The scan stops at the
+ * first match.
+ *
+ * @param text - Text to inspect; an empty string yields `false`.
+ * @returns `true` as soon as one matching character is found, else `false`.
+ */
+export function hasSpacelessChars(text: string): boolean {
+  for (const char of text) {
+    const cp = char.codePointAt(0);
+    if (cp !== undefined && isSpacelessScript(cp)) return true;
+  }
+  return false;
+}
+// ---------------------------------------------------------------------------
+// Intl.Segmenter-based word segmentation
+// ---------------------------------------------------------------------------
+interface WordSegment { // One word-like token plus its location in the source string.
+  /** The word text, exactly as returned by the segmenter. */
+  word: string;
+  /** Offset of the word's first code unit in the original string. */
+  index: number;
+  /** Length of `word` in UTF-16 code units (its `.length`). */
+  length: number;
+}
+/**
+ * Cached segmenter instance (default locale, word granularity).
+ *
+ * Starts out `undefined` and is filled in by the first `getSegmenter()`
+ * call; later calls return the same object instead of constructing another.
+ */
+let _segmenter: Intl.Segmenter | undefined;
+function getSegmenter(): Intl.Segmenter {
+  if (!_segmenter) {
+    _segmenter = new Intl.Segmenter(undefined, { granularity: "word" });
+  }
+  return _segmenter;
+}
+/**
+ * Segment text into word-like tokens using `Intl.Segmenter`.
+ *
+ * Returns only segments the segmenter flags as `isWordLike`, so
+ * punctuation-only and whitespace segments are dropped. Intended for all
+ * languages including Japanese, Chinese, Thai, Korean, Arabic, Hindi, etc.;
+ * the actual boundaries are whatever the runtime's `Intl.Segmenter` reports.
+ *
+ * @param text - Text to segment.
+ * @returns Word-like segments in iteration order. `index` is the segment's
+ *   code-unit offset in `text` and `length` is the segment string's
+ *   `.length`. The array is empty when no segment is word-like.
+ */
+export function segmentWords(text: string): WordSegment[] {
+  const segmenter = getSegmenter();
+  const result: WordSegment[] = [];
+  for (const seg of segmenter.segment(text)) {
+    if (seg.isWordLike) {
+      result.push({
+        word: seg.segment,
+        index: seg.index,
+        length: seg.segment.length,
+      });
+    }
+  }
+  return result;
+}
+/**
+ * Count the number of word-like tokens in a string.
+ *
+ * Uses `Intl.Segmenter` for spaceless scripts, whitespace splitting for others.
+ * The choice is made from `text` alone: if it contains any spaceless-script
+ * character the result is `segmentWords(text).length`, otherwise it is the
+ * number of non-empty whitespace-separated chunks.
+ *
+ * NOTE(review): this is meant to match the word count produced by
+ * `parseElevenLabsAlignment()`, but that function applies the same gate to
+ * the whole reconstructed text rather than to one piece of it. When only
+ * some pieces contain spaceless characters, the two may pick different
+ * strategies and disagree (e.g. on punctuated or hyphenated Latin words) —
+ * verify against callers.
+ *
+ * @param text - Text whose words are counted.
+ * @returns The word count; `0` for empty or whitespace-only input on the
+ *   whitespace path.
+ */
+export function countWords(text: string): number {
+  if (hasSpacelessChars(text)) {
+    return segmentWords(text).length;
+  }
+  // Fast path for space-delimited scripts
+  return text.trim().split(/\s+/).filter(Boolean).length;
+}
+// ---------------------------------------------------------------------------
+// Smart join — language-aware word concatenation
+// ---------------------------------------------------------------------------
+/**
+ * Join words without inserting incorrect spaces between CJK/Thai tokens.
+ *
+ * Rules:
+ * - Between two spaceless-script tokens → no space (日本語 + テスト → 日本語テスト)
+ * - Between a spaceless-script token and a Latin token → no space (Varg + は → Vargは)
+ * - Between two Latin/Cyrillic/etc. tokens → space (Hello + world → Hello world)
+ *
+ * This matches how the original text would look — CJK/Thai don't use spaces.
+ *
+ * Only two code points decide each junction: the last one of the previous
+ * array element and the first one of the current element. An empty element
+ * contributes code point `0`, which is not a spaceless-script character, so
+ * a space is still inserted next to it unless its neighbour is spaceless.
+ *
+ * NOTE(review): an element that starts or ends with something other than
+ * its script characters (punctuation, markup wrapped around a word) is
+ * judged by that edge character — verify against callers that decorate
+ * words before joining.
+ *
+ * @param words - Tokens to concatenate, in order.
+ * @returns `""` for an empty array, the sole element for a one-element
+ *   array, otherwise the tokens joined with a space only where neither
+ *   side of the junction is a spaceless-script character.
+ */
+export function smartJoin(words: string[]): string {
+  if (words.length === 0) return "";
+  if (words.length === 1) return words[0]!;
+  let result = words[0]!;
+  for (let i = 1; i < words.length; i++) {
+    const prev = words[i - 1]!;
+    const curr = words[i]!;
+    // Compare the last code point of prev with the first code point of curr
+    const prevLastCp = lastCodePoint(prev);
+    const currFirstCp = curr.codePointAt(0) ?? 0;
+    // No space if either side is a spaceless script character
+    const needsSpace =
+      !isSpacelessScript(prevLastCp) && !isSpacelessScript(currFirstCp);
+    result += needsSpace ? ` ${curr}` : curr;
+  }
+  return result;
+}
+/**
+ * Get the last code point of a string.
+ *
+ * Reads the final code unit first. If that unit is in the low-surrogate
+ * range (U+DC00–U+DFFF) and the string has at least two units, the read is
+ * repeated one position earlier so a trailing surrogate pair is returned as
+ * one code point.
+ *
+ * NOTE(review): the unit before a trailing low surrogate is not checked to
+ * be a high surrogate; for such malformed input the earlier unit's own
+ * value would be returned — confirm this is acceptable.
+ *
+ * @param s - String to inspect.
+ * @returns The last code point, or `0` when `s` is empty.
+ */
+function lastCodePoint(s: string): number {
+  if (s.length === 0) return 0;
+  // Handle surrogate pairs
+  const last = s.codePointAt(s.length - 1);
+  if (last !== undefined && last >= 0xdc00 && last <= 0xdfff && s.length >= 2) {
+    // Low surrogate — the actual code point starts one position earlier
+    return s.codePointAt(s.length - 2) ?? 0;
+  }
+  return last ?? 0;
+}