vargai 0.4.0-alpha106 → 0.4.0-alpha107

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -107,7 +107,7 @@
107
107
  "license": "Apache-2.0",
108
108
  "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
109
109
  "sideEffects": false,
110
- "version": "0.4.0-alpha106",
110
+ "version": "0.4.0-alpha107",
111
111
  "exports": {
112
112
  ".": "./src/index.ts",
113
113
  "./ai": "./src/ai-sdk/index.ts",
@@ -2,20 +2,21 @@ import { writeFileSync } from "node:fs";
2
2
  import { groq } from "@ai-sdk/groq";
3
3
  import { experimental_transcribe as transcribe } from "ai";
4
4
  import { z } from "zod";
5
+ import { smartJoin } from "../../speech/word-segmenter";
5
6
  import { ResolvedElement } from "../resolved-element";
6
7
  import type { CaptionsProps, VargElement } from "../types";
8
+ import { ensureLocalFonts } from "./burn-captions";
7
9
  import type { RenderContext } from "./context";
8
10
  import {
9
- type EmojiInstance,
10
- type EmojiOverlay,
11
11
  calculateEmojiSize,
12
12
  calculateEmojiY,
13
+ type EmojiInstance,
14
+ type EmojiOverlay,
13
15
  extractEmoji,
14
16
  hasEmoji,
15
17
  stripEmoji,
16
18
  } from "./emoji";
17
19
  import { type FontResolution, getDefaultFontId, resolveFonts } from "./fonts";
18
- import { ensureLocalFonts } from "./burn-captions";
19
20
  import { addTask, completeTask, startTask } from "./progress";
20
21
  import { renderSpeech } from "./speech";
21
22
  import {
@@ -319,7 +320,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
319
320
 
320
321
  if (!activeColor) {
321
322
  // No highlight — show entire group as one event
322
- let rawText = group.map((e) => e.text.replace(/\n/g, " ")).join(" ");
323
+ let rawText = smartJoin(group.map((e) => e.text.replace(/\n/g, " ")));
323
324
 
324
325
  // Strip emoji from the grouped text line
325
326
  let groupEmojiInstances: EmojiInstance[] | undefined;
@@ -350,7 +351,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
350
351
  for (const entry of group) {
351
352
  allGroupWords.push(entry.text.replace(/\n/g, " ").trim());
352
353
  }
353
- let fullLineRaw = allGroupWords.join(" ");
354
+ const fullLineRaw = smartJoin(allGroupWords);
354
355
 
355
356
  let lineEmojiInstances: EmojiInstance[] | undefined;
356
357
  let strippedFullLine: string | undefined;
@@ -361,16 +362,13 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
361
362
 
362
363
  // Build per-word stripped words for highlight assembly
363
364
  const strippedWords = lineEmojiInstances
364
- ? allGroupWords.map((w) =>
365
- hasEmoji(w) ? stripEmoji(w, nSpaces) : w,
366
- )
365
+ ? allGroupWords.map((w) => (hasEmoji(w) ? stripEmoji(w, nSpaces) : w))
367
366
  : allGroupWords;
368
367
 
369
368
  for (let wi = 0; wi < group.length; wi++) {
370
369
  const wordEntry = group[wi]!;
371
370
  const wordStart = wordEntry.start;
372
- const wordEnd =
373
- wi < group.length - 1 ? group[wi + 1]!.start : groupEnd;
371
+ const wordEnd = wi < group.length - 1 ? group[wi + 1]!.start : groupEnd;
374
372
 
375
373
  const parts: string[] = [];
376
374
  for (let idx = 0; idx < group.length; idx++) {
@@ -383,7 +381,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
383
381
  }
384
382
  }
385
383
 
386
- const lineText = parts.join(" ");
384
+ const lineText = smartJoin(parts);
387
385
 
388
386
  // Collect emoji data once for the first word's dialogue
389
387
  // (all words in the group show the same line text, just with different highlight)
@@ -604,7 +602,10 @@ export async function renderCaptions(
604
602
  if (srtHasEmoji) {
605
603
  // Download fonts locally for measurement
606
604
  const localFontsDir = await ensureLocalFonts(
607
- fontResolution.fontFiles.map((f) => ({ url: f.url, fileName: f.fileName })),
605
+ fontResolution.fontFiles.map((f) => ({
606
+ url: f.url,
607
+ fileName: f.fileName,
608
+ })),
608
609
  );
609
610
 
610
611
  // Build font name → local path mapping
@@ -617,7 +618,11 @@ export async function renderCaptions(
617
618
  const primaryFontPath = fontPathMap.get(fontResolution.primary.fontName);
618
619
  if (primaryFontPath) {
619
620
  const metrics = getFontMetrics(primaryFontPath, style.fontSize);
620
- const emojiSize = calculateEmojiSize(metrics.winAscent, ctx.height, ctx.height);
621
+ const emojiSize = calculateEmojiSize(
622
+ metrics.winAscent,
623
+ ctx.height,
624
+ ctx.height,
625
+ );
621
626
  const spaceWidth = getSpaceWidth(primaryFontPath, style.fontSize);
622
627
  // +1 buffer space for visual breathing room between emoji and adjacent text
623
628
  spacesPerEmoji = Math.max(1, Math.ceil(emojiSize / spaceWidth) + 1);
@@ -655,8 +660,17 @@ export async function renderCaptions(
655
660
  const primaryFontPath = fontPathMap.get(style.fontName);
656
661
  const metrics = primaryFontPath
657
662
  ? getFontMetrics(primaryFontPath, style.fontSize)
658
- : { ppem: style.fontSize * 0.64, capHeight: style.fontSize * 0.45, winAscent: style.fontSize * 0.7, winDescent: style.fontSize * 0.3 };
659
- const emojiSize = calculateEmojiSize(metrics.winAscent, ctx.height, ctx.height);
663
+ : {
664
+ ppem: style.fontSize * 0.64,
665
+ capHeight: style.fontSize * 0.45,
666
+ winAscent: style.fontSize * 0.7,
667
+ winDescent: style.fontSize * 0.3,
668
+ };
669
+ const emojiSize = calculateEmojiSize(
670
+ metrics.winAscent,
671
+ ctx.height,
672
+ ctx.height,
673
+ );
660
674
  const nSpaces = spacesPerEmoji ?? 1;
661
675
  const spaceW = primaryFontPath
662
676
  ? getSpaceWidth(primaryFontPath, style.fontSize)
@@ -713,7 +727,10 @@ export async function renderCaptions(
713
727
  // When emoji are overlaid as color PNGs, exclude Noto Emoji from font files
714
728
  // (emoji chars are spaces in the ASS text, so the monochrome font is unused)
715
729
  const fontFiles = fontResolution.fontFiles
716
- .filter((f) => !(emojiOverlays && emojiOverlays.length > 0 && f.id === "noto-emoji"))
730
+ .filter(
731
+ (f) =>
732
+ !(emojiOverlays && emojiOverlays.length > 0 && f.id === "noto-emoji"),
733
+ )
717
734
  .map((f) => ({ url: f.url, fileName: f.fileName }));
718
735
 
719
736
  return {
@@ -1,4 +1,5 @@
1
1
  import type { SegmentDescriptor, WordTiming } from "./types";
2
+ import { countWords } from "./word-segmenter";
2
3
 
3
4
  /**
4
5
  * Map word-level timings back to the original string array to produce segments.
@@ -67,7 +68,7 @@ export function mapWordsToSegments(
67
68
  let wordIndex = 0;
68
69
 
69
70
  for (const text of children) {
70
- const segmentWordCount = text.trim().split(/\s+/).filter(Boolean).length;
71
+ const segmentWordCount = countWords(text);
71
72
 
72
73
  if (segmentWordCount === 0) {
73
74
  const pos =
@@ -1,21 +1,33 @@
1
1
  import type { ElevenLabsCharacterAlignment, WordTiming } from "./types";
2
+ import { hasSpacelessChars, segmentWords } from "./word-segmenter";
2
3
 
3
4
  /**
4
5
  * Convert ElevenLabs character-level alignment to word-level timing.
5
6
  *
6
7
  * ElevenLabs returns arrays of individual characters with start/end times.
7
- * This function groups consecutive non-whitespace characters into words
8
- * and computes each word's start (first char start) and end (last char end).
8
+ * This function groups characters into words and computes each word's
9
+ * start (first char start) and end (last char end).
10
+ *
11
+ * For languages that use spaces (English, Arabic, Korean, etc.), words are
12
+ * split at whitespace boundaries — same as before.
13
+ *
14
+ * For spaceless-script languages (Japanese, Chinese, Thai, etc.), we use
15
+ * `Intl.Segmenter` (ICU-backed) to find linguistically correct word
16
+ * boundaries, then map each word back to the character-level timing data.
9
17
  *
10
18
  * @example
11
19
  * ```ts
20
+ * // English
12
21
  * const alignment = {
13
22
  * characters: ["H","e","l","l","o"," ","w","o","r","l","d"],
14
23
  * character_start_times_seconds: [0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55],
15
24
  * character_end_times_seconds: [0.05, 0.1, 0.15, 0.2, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.65],
16
25
  * };
17
- * const words = parseElevenLabsAlignment(alignment);
26
+ * parseElevenLabsAlignment(alignment);
18
27
  * // [{word: "Hello", start: 0, end: 0.3}, {word: "world", start: 0.35, end: 0.65}]
28
+ *
29
+ * // Japanese — "これはテストです" → ["これ", "は", "テスト", "です"]
30
+ * // Each word gets precise timing from the character-level data.
19
31
  * ```
20
32
  */
21
33
  export function parseElevenLabsAlignment(
@@ -35,13 +47,40 @@ export function parseElevenLabsAlignment(
35
47
  return [];
36
48
  }
37
49
 
50
+ // Reconstruct the full text and check if it contains spaceless scripts
51
+ const fullText = characters.join("");
52
+
53
+ if (hasSpacelessChars(fullText)) {
54
+ return parseWithSegmenter(
55
+ fullText,
56
+ characters,
57
+ character_start_times_seconds,
58
+ character_end_times_seconds,
59
+ );
60
+ }
61
+
62
+ return parseByWhitespace(
63
+ characters,
64
+ character_start_times_seconds,
65
+ character_end_times_seconds,
66
+ );
67
+ }
68
+
69
+ /**
70
+ * Original whitespace-based parsing for space-delimited languages.
71
+ */
72
+ function parseByWhitespace(
73
+ characters: string[],
74
+ startTimes: number[],
75
+ endTimes: number[],
76
+ ): WordTiming[] {
38
77
  const words: WordTiming[] = [];
39
78
  let wordChars = "";
40
79
  let wordStart = 0;
41
80
 
42
81
  for (let i = 0; i < characters.length; i++) {
43
82
  const char = characters[i]!;
44
- const startTime = character_start_times_seconds[i]!;
83
+ const startTime = startTimes[i]!;
45
84
  const isWhitespace =
46
85
  char === " " || char === "\n" || char === "\t" || char === "\r";
47
86
 
@@ -51,7 +90,7 @@ export function parseElevenLabsAlignment(
51
90
  words.push({
52
91
  word: wordChars,
53
92
  start: wordStart,
54
- end: character_end_times_seconds[i - 1] ?? wordStart,
93
+ end: endTimes[i - 1] ?? wordStart,
55
94
  });
56
95
  wordChars = "";
57
96
  }
@@ -66,7 +105,7 @@ export function parseElevenLabsAlignment(
66
105
 
67
106
  // Flush final word
68
107
  if (wordChars) {
69
- const lastEnd = character_end_times_seconds[characters.length - 1];
108
+ const lastEnd = endTimes[characters.length - 1];
70
109
  words.push({
71
110
  word: wordChars,
72
111
  start: wordStart,
@@ -76,3 +115,69 @@ export function parseElevenLabsAlignment(
76
115
 
77
116
  return words;
78
117
  }
118
+
119
+ /**
120
+ * Intl.Segmenter-based parsing for text containing spaceless scripts.
121
+ *
122
+ * Steps:
123
+ * 1. Build a mapping from the starting code-unit offset of each ElevenLabs
124
+ * character entry to its index in the character arrays (an entry may span several code units).
125
+ * 2. Use `Intl.Segmenter` to find word boundaries in the full text.
126
+ * 3. For each word-like segment, look up the start time of its first character
127
+ * and the end time of its last character from the alignment data.
128
+ */
129
+ function parseWithSegmenter(
130
+ fullText: string,
131
+ characters: string[],
132
+ startTimes: number[],
133
+ endTimes: number[],
134
+ ): WordTiming[] {
135
+ // Build a mapping: code-unit offset in fullText → character array index.
136
+ // ElevenLabs characters may be single code points or multi-code-unit chars
137
+ // (e.g., emoji), so we track offsets carefully.
138
+ const offsetToCharIndex = new Map<number, number>();
139
+ let offset = 0;
140
+ for (let ci = 0; ci < characters.length; ci++) {
141
+ offsetToCharIndex.set(offset, ci);
142
+ offset += characters[ci]!.length;
143
+ }
144
+
145
+ // Segment the full text into words
146
+ const segments = segmentWords(fullText);
147
+
148
+ const words: WordTiming[] = [];
149
+ for (const seg of segments) {
150
+ // Find the character indices for this segment's boundaries
151
+ const segStart = seg.index;
152
+ const segEnd = seg.index + seg.length;
153
+
154
+ // Find the first character index in this segment
155
+ let firstCharIdx: number | undefined;
156
+ let lastCharIdx: number | undefined;
157
+
158
+ // Walk through offsets to find all char indices within this segment
159
+ for (const [off, ci] of offsetToCharIndex) {
160
+ if (off >= segStart && off < segEnd) {
161
+ if (firstCharIdx === undefined || ci < firstCharIdx) {
162
+ firstCharIdx = ci;
163
+ }
164
+ if (lastCharIdx === undefined || ci > lastCharIdx) {
165
+ lastCharIdx = ci;
166
+ }
167
+ }
168
+ }
169
+
170
+ if (firstCharIdx === undefined || lastCharIdx === undefined) continue;
171
+
172
+ const wordStart = startTimes[firstCharIdx] ?? 0;
173
+ const wordEnd = endTimes[lastCharIdx] ?? wordStart;
174
+
175
+ words.push({
176
+ word: seg.word,
177
+ start: wordStart,
178
+ end: wordEnd,
179
+ });
180
+ }
181
+
182
+ return words;
183
+ }
@@ -0,0 +1,172 @@
1
+ /**
2
+ * Language-aware word segmentation utilities.
3
+ *
4
+ * Uses `Intl.Segmenter` (ICU-backed, zero dependencies) to handle languages
5
+ * that don't use spaces between words: Japanese, Chinese, Thai, Khmer, etc.
6
+ *
7
+ * For Latin/Cyrillic/Arabic/Korean and other space-delimited scripts, callers
8
+ * (`countWords`, `parseElevenLabsAlignment`) keep plain whitespace splitting as a
9
+ * fast path; `Intl.Segmenter` is only used when the text contains spaceless-script characters.
10
+ */
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // Script detection helpers
14
+ // ---------------------------------------------------------------------------
15
+
16
+ /**
17
+ * Check if a code point belongs to a script that doesn't use spaces between words.
18
+ *
19
+ * Covers: CJK ideographs, Hiragana, Katakana, Thai, Lao, Myanmar, Khmer, Tibetan.
20
+ * Does NOT include Korean (Hangul) — Korean uses spaces between words.
21
+ */
22
+ export function isSpacelessScript(cp: number): boolean {
23
+ // Hiragana
24
+ if (cp >= 0x3040 && cp <= 0x309f) return true;
25
+ // Katakana
26
+ if (cp >= 0x30a0 && cp <= 0x30ff) return true;
27
+ if (cp >= 0x31f0 && cp <= 0x31ff) return true; // Katakana Phonetic Extensions
28
+ if (cp >= 0xff65 && cp <= 0xff9f) return true; // Halfwidth Katakana
29
+ // CJK Unified Ideographs
30
+ if (cp >= 0x4e00 && cp <= 0x9fff) return true;
31
+ if (cp >= 0x3400 && cp <= 0x4dbf) return true; // Extension A
32
+ if (cp >= 0xf900 && cp <= 0xfaff) return true; // Compatibility
33
+ if (cp >= 0x20000 && cp <= 0x2a6df) return true; // Extension B
34
+ if (cp >= 0x2a700 && cp <= 0x2b73f) return true; // Extension C
35
+ if (cp >= 0x2b740 && cp <= 0x2b81f) return true; // Extension D
36
+ // CJK Radicals
37
+ if (cp >= 0x2e80 && cp <= 0x2eff) return true;
38
+ if (cp >= 0x2f00 && cp <= 0x2fdf) return true;
39
+ // Thai
40
+ if (cp >= 0x0e00 && cp <= 0x0e7f) return true;
41
+ // Lao
42
+ if (cp >= 0x0e80 && cp <= 0x0eff) return true;
43
+ // Myanmar
44
+ if (cp >= 0x1000 && cp <= 0x109f) return true;
45
+ // Khmer
46
+ if (cp >= 0x1780 && cp <= 0x17ff) return true;
47
+ // Tibetan
48
+ if (cp >= 0x0f00 && cp <= 0x0fff) return true;
49
+
50
+ return false;
51
+ }
52
+
53
+ /**
54
+ * Check if a string contains any characters from spaceless scripts.
55
+ * Used as a fast gate to decide whether we need `Intl.Segmenter`.
56
+ */
57
+ export function hasSpacelessChars(text: string): boolean {
58
+ for (const char of text) {
59
+ const cp = char.codePointAt(0);
60
+ if (cp !== undefined && isSpacelessScript(cp)) return true;
61
+ }
62
+ return false;
63
+ }
64
+
65
+ // ---------------------------------------------------------------------------
66
+ // Intl.Segmenter-based word segmentation
67
+ // ---------------------------------------------------------------------------
68
+
69
+ interface WordSegment {
70
+ /** The word text. */
71
+ word: string;
72
+ /** Character offset in the original string (code-unit index). */
73
+ index: number;
74
+ /** Length in code units. */
75
+ length: number;
76
+ }
77
+
78
+ /** Cached segmenter instance (default locale, word granularity). */
79
+ let _segmenter: Intl.Segmenter | undefined;
80
+
81
+ function getSegmenter(): Intl.Segmenter {
82
+ if (!_segmenter) {
83
+ _segmenter = new Intl.Segmenter(undefined, { granularity: "word" });
84
+ }
85
+ return _segmenter;
86
+ }
87
+
88
+ /**
89
+ * Segment text into word-like tokens using `Intl.Segmenter`.
90
+ *
91
+ * Returns only word-like segments (no punctuation-only or whitespace segments).
92
+ * Works correctly for all languages including Japanese, Chinese, Thai, Korean,
93
+ * Arabic, Hindi, etc.
94
+ */
95
+ export function segmentWords(text: string): WordSegment[] {
96
+ const segmenter = getSegmenter();
97
+ const result: WordSegment[] = [];
98
+ for (const seg of segmenter.segment(text)) {
99
+ if (seg.isWordLike) {
100
+ result.push({
101
+ word: seg.segment,
102
+ index: seg.index,
103
+ length: seg.segment.length,
104
+ });
105
+ }
106
+ }
107
+ return result;
108
+ }
109
+
110
+ /**
111
+ * Count the number of word-like tokens in a string.
112
+ *
113
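+ * Uses `Intl.Segmenter` when the text contains spaceless-script characters, whitespace splitting otherwise.
114
+ * The count matches `segmentWords()` only on the spaceless-script branch; the whitespace branch matches the whitespace-based path of `parseElevenLabsAlignment()`.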
115
+ */
116
+ export function countWords(text: string): number {
117
+ if (hasSpacelessChars(text)) {
118
+ return segmentWords(text).length;
119
+ }
120
+ // Fast path for space-delimited scripts
121
+ return text.trim().split(/\s+/).filter(Boolean).length;
122
+ }
123
+
124
+ // ---------------------------------------------------------------------------
125
+ // Smart join — language-aware word concatenation
126
+ // ---------------------------------------------------------------------------
127
+
128
+ /**
129
+ * Join words without inserting incorrect spaces between CJK/Thai tokens.
130
+ *
131
+ * Rules:
132
+ * - Between two spaceless-script tokens → no space (日本語 + テスト → 日本語テスト)
133
+ * - Between a spaceless-script token and a Latin token → no space (Varg + は → Vargは)
134
+ * - Between two Latin/Cyrillic/etc. tokens → space (Hello + world → Hello world)
135
+ *
136
+ * This matches how the original text would look — CJK/Thai don't use spaces.
137
+ */
138
+ export function smartJoin(words: string[]): string {
139
+ if (words.length === 0) return "";
140
+ if (words.length === 1) return words[0]!;
141
+
142
+ let result = words[0]!;
143
+ for (let i = 1; i < words.length; i++) {
144
+ const prev = words[i - 1]!;
145
+ const curr = words[i]!;
146
+
147
+ // Check the last char of prev and first char of curr
148
+ const prevLastCp = lastCodePoint(prev);
149
+ const currFirstCp = curr.codePointAt(0) ?? 0;
150
+
151
+ // No space if either side is a spaceless script character
152
+ const needsSpace =
153
+ !isSpacelessScript(prevLastCp) && !isSpacelessScript(currFirstCp);
154
+
155
+ result += needsSpace ? ` ${curr}` : curr;
156
+ }
157
+ return result;
158
+ }
159
+
160
+ /**
161
+ * Get the last code point of a string.
162
+ */
163
+ function lastCodePoint(s: string): number {
164
+ if (s.length === 0) return 0;
165
+ // Handle surrogate pairs
166
+ const last = s.codePointAt(s.length - 1);
167
+ if (last !== undefined && last >= 0xdc00 && last <= 0xdfff && s.length >= 2) {
168
+ // Low surrogate — the actual code point starts one position earlier
169
+ return s.codePointAt(s.length - 2) ?? 0;
170
+ }
171
+ return last ?? 0;
172
+ }