vargai 0.4.0-alpha106 → 0.4.0-alpha107
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -107,7 +107,7 @@
|
|
|
107
107
|
"license": "Apache-2.0",
|
|
108
108
|
"author": "varg.ai <hello@varg.ai> (https://varg.ai)",
|
|
109
109
|
"sideEffects": false,
|
|
110
|
-
"version": "0.4.0-
|
|
110
|
+
"version": "0.4.0-alpha107",
|
|
111
111
|
"exports": {
|
|
112
112
|
".": "./src/index.ts",
|
|
113
113
|
"./ai": "./src/ai-sdk/index.ts",
|
|
@@ -2,20 +2,21 @@ import { writeFileSync } from "node:fs";
|
|
|
2
2
|
import { groq } from "@ai-sdk/groq";
|
|
3
3
|
import { experimental_transcribe as transcribe } from "ai";
|
|
4
4
|
import { z } from "zod";
|
|
5
|
+
import { smartJoin } from "../../speech/word-segmenter";
|
|
5
6
|
import { ResolvedElement } from "../resolved-element";
|
|
6
7
|
import type { CaptionsProps, VargElement } from "../types";
|
|
8
|
+
import { ensureLocalFonts } from "./burn-captions";
|
|
7
9
|
import type { RenderContext } from "./context";
|
|
8
10
|
import {
|
|
9
|
-
type EmojiInstance,
|
|
10
|
-
type EmojiOverlay,
|
|
11
11
|
calculateEmojiSize,
|
|
12
12
|
calculateEmojiY,
|
|
13
|
+
type EmojiInstance,
|
|
14
|
+
type EmojiOverlay,
|
|
13
15
|
extractEmoji,
|
|
14
16
|
hasEmoji,
|
|
15
17
|
stripEmoji,
|
|
16
18
|
} from "./emoji";
|
|
17
19
|
import { type FontResolution, getDefaultFontId, resolveFonts } from "./fonts";
|
|
18
|
-
import { ensureLocalFonts } from "./burn-captions";
|
|
19
20
|
import { addTask, completeTask, startTask } from "./progress";
|
|
20
21
|
import { renderSpeech } from "./speech";
|
|
21
22
|
import {
|
|
@@ -319,7 +320,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
|
|
319
320
|
|
|
320
321
|
if (!activeColor) {
|
|
321
322
|
// No highlight — show entire group as one event
|
|
322
|
-
let rawText = group.map((e) => e.text.replace(/\n/g, " ")).join(" ");
|
|
323
|
+
let rawText = smartJoin(group.map((e) => e.text.replace(/\n/g, " ")));
|
|
323
324
|
|
|
324
325
|
// Strip emoji from the grouped text line
|
|
325
326
|
let groupEmojiInstances: EmojiInstance[] | undefined;
|
|
@@ -350,7 +351,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
|
|
350
351
|
for (const entry of group) {
|
|
351
352
|
allGroupWords.push(entry.text.replace(/\n/g, " ").trim());
|
|
352
353
|
}
|
|
353
|
-
|
|
354
|
+
const fullLineRaw = smartJoin(allGroupWords);
|
|
354
355
|
|
|
355
356
|
let lineEmojiInstances: EmojiInstance[] | undefined;
|
|
356
357
|
let strippedFullLine: string | undefined;
|
|
@@ -361,16 +362,13 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
|
|
361
362
|
|
|
362
363
|
// Build per-word stripped words for highlight assembly
|
|
363
364
|
const strippedWords = lineEmojiInstances
|
|
364
|
-
? allGroupWords.map((w) =>
|
|
365
|
-
hasEmoji(w) ? stripEmoji(w, nSpaces) : w,
|
|
366
|
-
)
|
|
365
|
+
? allGroupWords.map((w) => (hasEmoji(w) ? stripEmoji(w, nSpaces) : w))
|
|
367
366
|
: allGroupWords;
|
|
368
367
|
|
|
369
368
|
for (let wi = 0; wi < group.length; wi++) {
|
|
370
369
|
const wordEntry = group[wi]!;
|
|
371
370
|
const wordStart = wordEntry.start;
|
|
372
|
-
const wordEnd =
|
|
373
|
-
wi < group.length - 1 ? group[wi + 1]!.start : groupEnd;
|
|
371
|
+
const wordEnd = wi < group.length - 1 ? group[wi + 1]!.start : groupEnd;
|
|
374
372
|
|
|
375
373
|
const parts: string[] = [];
|
|
376
374
|
for (let idx = 0; idx < group.length; idx++) {
|
|
@@ -383,7 +381,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
|
|
383
381
|
}
|
|
384
382
|
}
|
|
385
383
|
|
|
386
|
-
const lineText = parts.join(" ");
|
|
384
|
+
const lineText = smartJoin(parts);
|
|
387
385
|
|
|
388
386
|
// Collect emoji data once for the first word's dialogue
|
|
389
387
|
// (all words in the group show the same line text, just with different highlight)
|
|
@@ -604,7 +602,10 @@ export async function renderCaptions(
|
|
|
604
602
|
if (srtHasEmoji) {
|
|
605
603
|
// Download fonts locally for measurement
|
|
606
604
|
const localFontsDir = await ensureLocalFonts(
|
|
607
|
-
fontResolution.fontFiles.map((f) => ({
|
|
605
|
+
fontResolution.fontFiles.map((f) => ({
|
|
606
|
+
url: f.url,
|
|
607
|
+
fileName: f.fileName,
|
|
608
|
+
})),
|
|
608
609
|
);
|
|
609
610
|
|
|
610
611
|
// Build font name → local path mapping
|
|
@@ -617,7 +618,11 @@ export async function renderCaptions(
|
|
|
617
618
|
const primaryFontPath = fontPathMap.get(fontResolution.primary.fontName);
|
|
618
619
|
if (primaryFontPath) {
|
|
619
620
|
const metrics = getFontMetrics(primaryFontPath, style.fontSize);
|
|
620
|
-
const emojiSize = calculateEmojiSize(
|
|
621
|
+
const emojiSize = calculateEmojiSize(
|
|
622
|
+
metrics.winAscent,
|
|
623
|
+
ctx.height,
|
|
624
|
+
ctx.height,
|
|
625
|
+
);
|
|
621
626
|
const spaceWidth = getSpaceWidth(primaryFontPath, style.fontSize);
|
|
622
627
|
// +1 buffer space for visual breathing room between emoji and adjacent text
|
|
623
628
|
spacesPerEmoji = Math.max(1, Math.ceil(emojiSize / spaceWidth) + 1);
|
|
@@ -655,8 +660,17 @@ export async function renderCaptions(
|
|
|
655
660
|
const primaryFontPath = fontPathMap.get(style.fontName);
|
|
656
661
|
const metrics = primaryFontPath
|
|
657
662
|
? getFontMetrics(primaryFontPath, style.fontSize)
|
|
658
|
-
: {
|
|
659
|
-
|
|
663
|
+
: {
|
|
664
|
+
ppem: style.fontSize * 0.64,
|
|
665
|
+
capHeight: style.fontSize * 0.45,
|
|
666
|
+
winAscent: style.fontSize * 0.7,
|
|
667
|
+
winDescent: style.fontSize * 0.3,
|
|
668
|
+
};
|
|
669
|
+
const emojiSize = calculateEmojiSize(
|
|
670
|
+
metrics.winAscent,
|
|
671
|
+
ctx.height,
|
|
672
|
+
ctx.height,
|
|
673
|
+
);
|
|
660
674
|
const nSpaces = spacesPerEmoji ?? 1;
|
|
661
675
|
const spaceW = primaryFontPath
|
|
662
676
|
? getSpaceWidth(primaryFontPath, style.fontSize)
|
|
@@ -713,7 +727,10 @@ export async function renderCaptions(
|
|
|
713
727
|
// When emoji are overlaid as color PNGs, exclude Noto Emoji from font files
|
|
714
728
|
// (emoji chars are spaces in the ASS text, so the monochrome font is unused)
|
|
715
729
|
const fontFiles = fontResolution.fontFiles
|
|
716
|
-
.filter(
|
|
730
|
+
.filter(
|
|
731
|
+
(f) =>
|
|
732
|
+
!(emojiOverlays && emojiOverlays.length > 0 && f.id === "noto-emoji"),
|
|
733
|
+
)
|
|
717
734
|
.map((f) => ({ url: f.url, fileName: f.fileName }));
|
|
718
735
|
|
|
719
736
|
return {
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import type { SegmentDescriptor, WordTiming } from "./types";
|
|
2
|
+
import { countWords } from "./word-segmenter";
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* Map word-level timings back to the original string array to produce segments.
|
|
@@ -67,7 +68,7 @@ export function mapWordsToSegments(
|
|
|
67
68
|
let wordIndex = 0;
|
|
68
69
|
|
|
69
70
|
for (const text of children) {
|
|
70
|
-
const segmentWordCount = text
|
|
71
|
+
const segmentWordCount = countWords(text);
|
|
71
72
|
|
|
72
73
|
if (segmentWordCount === 0) {
|
|
73
74
|
const pos =
|
|
@@ -1,21 +1,33 @@
|
|
|
1
1
|
import type { ElevenLabsCharacterAlignment, WordTiming } from "./types";
|
|
2
|
+
import { hasSpacelessChars, segmentWords } from "./word-segmenter";
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* Convert ElevenLabs character-level alignment to word-level timing.
|
|
5
6
|
*
|
|
6
7
|
* ElevenLabs returns arrays of individual characters with start/end times.
|
|
7
|
-
* This function groups
|
|
8
|
-
*
|
|
8
|
+
* This function groups characters into words and computes each word's
|
|
9
|
+
* start (first char start) and end (last char end).
|
|
10
|
+
*
|
|
11
|
+
* For languages that use spaces (English, Arabic, Korean, etc.), words are
|
|
12
|
+
* split at whitespace boundaries — same as before.
|
|
13
|
+
*
|
|
14
|
+
* For spaceless-script languages (Japanese, Chinese, Thai, etc.), we use
|
|
15
|
+
* `Intl.Segmenter` (ICU-backed) to find linguistically correct word
|
|
16
|
+
* boundaries, then map each word back to the character-level timing data.
|
|
9
17
|
*
|
|
10
18
|
* @example
|
|
11
19
|
* ```ts
|
|
20
|
+
* // English
|
|
12
21
|
* const alignment = {
|
|
13
22
|
* characters: ["H","e","l","l","o"," ","w","o","r","l","d"],
|
|
14
23
|
* character_start_times_seconds: [0, 0.05, 0.1, 0.15, 0.2, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55],
|
|
15
24
|
* character_end_times_seconds: [0.05, 0.1, 0.15, 0.2, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.65],
|
|
16
25
|
* };
|
|
17
|
-
*
|
|
26
|
+
* parseElevenLabsAlignment(alignment);
|
|
18
27
|
* // [{word: "Hello", start: 0, end: 0.3}, {word: "world", start: 0.35, end: 0.65}]
|
|
28
|
+
*
|
|
29
|
+
* // Japanese — "これはテストです" → ["これ", "は", "テスト", "です"]
|
|
30
|
+
* // Each word gets precise timing from the character-level data.
|
|
19
31
|
* ```
|
|
20
32
|
*/
|
|
21
33
|
export function parseElevenLabsAlignment(
|
|
@@ -35,13 +47,40 @@ export function parseElevenLabsAlignment(
|
|
|
35
47
|
return [];
|
|
36
48
|
}
|
|
37
49
|
|
|
50
|
+
// Reconstruct the full text and check if it contains spaceless scripts
|
|
51
|
+
const fullText = characters.join("");
|
|
52
|
+
|
|
53
|
+
if (hasSpacelessChars(fullText)) {
|
|
54
|
+
return parseWithSegmenter(
|
|
55
|
+
fullText,
|
|
56
|
+
characters,
|
|
57
|
+
character_start_times_seconds,
|
|
58
|
+
character_end_times_seconds,
|
|
59
|
+
);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
return parseByWhitespace(
|
|
63
|
+
characters,
|
|
64
|
+
character_start_times_seconds,
|
|
65
|
+
character_end_times_seconds,
|
|
66
|
+
);
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
/**
|
|
70
|
+
* Original whitespace-based parsing for space-delimited languages.
|
|
71
|
+
*/
|
|
72
|
+
function parseByWhitespace(
|
|
73
|
+
characters: string[],
|
|
74
|
+
startTimes: number[],
|
|
75
|
+
endTimes: number[],
|
|
76
|
+
): WordTiming[] {
|
|
38
77
|
const words: WordTiming[] = [];
|
|
39
78
|
let wordChars = "";
|
|
40
79
|
let wordStart = 0;
|
|
41
80
|
|
|
42
81
|
for (let i = 0; i < characters.length; i++) {
|
|
43
82
|
const char = characters[i]!;
|
|
44
|
-
const startTime = character_start_times_seconds[i]!;
|
|
83
|
+
const startTime = startTimes[i]!;
|
|
45
84
|
const isWhitespace =
|
|
46
85
|
char === " " || char === "\n" || char === "\t" || char === "\r";
|
|
47
86
|
|
|
@@ -51,7 +90,7 @@ export function parseElevenLabsAlignment(
|
|
|
51
90
|
words.push({
|
|
52
91
|
word: wordChars,
|
|
53
92
|
start: wordStart,
|
|
54
|
-
end: character_end_times_seconds[i - 1] ?? wordStart,
|
|
93
|
+
end: endTimes[i - 1] ?? wordStart,
|
|
55
94
|
});
|
|
56
95
|
wordChars = "";
|
|
57
96
|
}
|
|
@@ -66,7 +105,7 @@ export function parseElevenLabsAlignment(
|
|
|
66
105
|
|
|
67
106
|
// Flush final word
|
|
68
107
|
if (wordChars) {
|
|
69
|
-
const lastEnd = character_end_times_seconds[characters.length - 1];
|
|
108
|
+
const lastEnd = endTimes[characters.length - 1];
|
|
70
109
|
words.push({
|
|
71
110
|
word: wordChars,
|
|
72
111
|
start: wordStart,
|
|
@@ -76,3 +115,69 @@ export function parseElevenLabsAlignment(
|
|
|
76
115
|
|
|
77
116
|
return words;
|
|
78
117
|
}
|
|
118
|
+
|
|
119
|
+
/**
|
|
120
|
+
* Intl.Segmenter-based parsing for text containing spaceless scripts.
|
|
121
|
+
*
|
|
122
|
+
* Steps:
|
|
123
|
+
* 1. Build a mapping from each code-unit offset in the full text to its
|
|
124
|
+
* index in the ElevenLabs character arrays (handling multi-char graphemes).
|
|
125
|
+
* 2. Use `Intl.Segmenter` to find word boundaries in the full text.
|
|
126
|
+
* 3. For each word-like segment, look up the start time of its first character
|
|
127
|
+
* and the end time of its last character from the alignment data.
|
|
128
|
+
*/
|
|
129
|
+
function parseWithSegmenter(
|
|
130
|
+
fullText: string,
|
|
131
|
+
characters: string[],
|
|
132
|
+
startTimes: number[],
|
|
133
|
+
endTimes: number[],
|
|
134
|
+
): WordTiming[] {
|
|
135
|
+
// Build a mapping: code-unit offset in fullText → character array index.
|
|
136
|
+
// ElevenLabs characters may be single code points or multi-code-unit chars
|
|
137
|
+
// (e.g., emoji), so we track offsets carefully.
|
|
138
|
+
const offsetToCharIndex = new Map<number, number>();
|
|
139
|
+
let offset = 0;
|
|
140
|
+
for (let ci = 0; ci < characters.length; ci++) {
|
|
141
|
+
offsetToCharIndex.set(offset, ci);
|
|
142
|
+
offset += characters[ci]!.length;
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
// Segment the full text into words
|
|
146
|
+
const segments = segmentWords(fullText);
|
|
147
|
+
|
|
148
|
+
const words: WordTiming[] = [];
|
|
149
|
+
for (const seg of segments) {
|
|
150
|
+
// Find the character indices for this segment's boundaries
|
|
151
|
+
const segStart = seg.index;
|
|
152
|
+
const segEnd = seg.index + seg.length;
|
|
153
|
+
|
|
154
|
+
// Find the first character index in this segment
|
|
155
|
+
let firstCharIdx: number | undefined;
|
|
156
|
+
let lastCharIdx: number | undefined;
|
|
157
|
+
|
|
158
|
+
// Walk through offsets to find all char indices within this segment
|
|
159
|
+
for (const [off, ci] of offsetToCharIndex) {
|
|
160
|
+
if (off >= segStart && off < segEnd) {
|
|
161
|
+
if (firstCharIdx === undefined || ci < firstCharIdx) {
|
|
162
|
+
firstCharIdx = ci;
|
|
163
|
+
}
|
|
164
|
+
if (lastCharIdx === undefined || ci > lastCharIdx) {
|
|
165
|
+
lastCharIdx = ci;
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
if (firstCharIdx === undefined || lastCharIdx === undefined) continue;
|
|
171
|
+
|
|
172
|
+
const wordStart = startTimes[firstCharIdx] ?? 0;
|
|
173
|
+
const wordEnd = endTimes[lastCharIdx] ?? wordStart;
|
|
174
|
+
|
|
175
|
+
words.push({
|
|
176
|
+
word: seg.word,
|
|
177
|
+
start: wordStart,
|
|
178
|
+
end: wordEnd,
|
|
179
|
+
});
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
return words;
|
|
183
|
+
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Language-aware word segmentation utilities.
|
|
3
|
+
*
|
|
4
|
+
* Uses `Intl.Segmenter` (ICU-backed, zero dependencies) to handle languages
|
|
5
|
+
* that don't use spaces between words: Japanese, Chinese, Thai, Khmer, etc.
|
|
6
|
+
*
|
|
7
|
+
* For Latin/Cyrillic/Arabic/Korean and other space-delimited scripts, simple
|
|
8
|
+
* whitespace splitting is equivalent — but `Intl.Segmenter` handles them too,
|
|
9
|
+
* so we use a single code path for all languages.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Script detection helpers
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Check if a code point belongs to a script that doesn't use spaces between words.
|
|
18
|
+
*
|
|
19
|
+
* Covers: CJK ideographs, Hiragana, Katakana, Thai, Lao, Myanmar, Khmer, Tibetan.
|
|
20
|
+
* Does NOT include Korean (Hangul) — Korean uses spaces between words.
|
|
21
|
+
*/
|
|
22
|
+
export function isSpacelessScript(cp: number): boolean {
|
|
23
|
+
// Hiragana
|
|
24
|
+
if (cp >= 0x3040 && cp <= 0x309f) return true;
|
|
25
|
+
// Katakana
|
|
26
|
+
if (cp >= 0x30a0 && cp <= 0x30ff) return true;
|
|
27
|
+
if (cp >= 0x31f0 && cp <= 0x31ff) return true; // Katakana Phonetic Extensions
|
|
28
|
+
if (cp >= 0xff65 && cp <= 0xff9f) return true; // Halfwidth Katakana
|
|
29
|
+
// CJK Unified Ideographs
|
|
30
|
+
if (cp >= 0x4e00 && cp <= 0x9fff) return true;
|
|
31
|
+
if (cp >= 0x3400 && cp <= 0x4dbf) return true; // Extension A
|
|
32
|
+
if (cp >= 0xf900 && cp <= 0xfaff) return true; // Compatibility
|
|
33
|
+
if (cp >= 0x20000 && cp <= 0x2a6df) return true; // Extension B
|
|
34
|
+
if (cp >= 0x2a700 && cp <= 0x2b73f) return true; // Extension C
|
|
35
|
+
if (cp >= 0x2b740 && cp <= 0x2b81f) return true; // Extension D
|
|
36
|
+
// CJK Radicals
|
|
37
|
+
if (cp >= 0x2e80 && cp <= 0x2eff) return true;
|
|
38
|
+
if (cp >= 0x2f00 && cp <= 0x2fdf) return true;
|
|
39
|
+
// Thai
|
|
40
|
+
if (cp >= 0x0e00 && cp <= 0x0e7f) return true;
|
|
41
|
+
// Lao
|
|
42
|
+
if (cp >= 0x0e80 && cp <= 0x0eff) return true;
|
|
43
|
+
// Myanmar
|
|
44
|
+
if (cp >= 0x1000 && cp <= 0x109f) return true;
|
|
45
|
+
// Khmer
|
|
46
|
+
if (cp >= 0x1780 && cp <= 0x17ff) return true;
|
|
47
|
+
// Tibetan
|
|
48
|
+
if (cp >= 0x0f00 && cp <= 0x0fff) return true;
|
|
49
|
+
|
|
50
|
+
return false;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/**
|
|
54
|
+
* Check if a string contains any characters from spaceless scripts.
|
|
55
|
+
* Used as a fast gate to decide whether we need `Intl.Segmenter`.
|
|
56
|
+
*/
|
|
57
|
+
export function hasSpacelessChars(text: string): boolean {
|
|
58
|
+
for (const char of text) {
|
|
59
|
+
const cp = char.codePointAt(0);
|
|
60
|
+
if (cp !== undefined && isSpacelessScript(cp)) return true;
|
|
61
|
+
}
|
|
62
|
+
return false;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
// Intl.Segmenter-based word segmentation
|
|
67
|
+
// ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
interface WordSegment {
|
|
70
|
+
/** The word text. */
|
|
71
|
+
word: string;
|
|
72
|
+
/** Character offset in the original string (code-unit index). */
|
|
73
|
+
index: number;
|
|
74
|
+
/** Length in code units. */
|
|
75
|
+
length: number;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/** Cached segmenter instance (default locale, word granularity). */
|
|
79
|
+
let _segmenter: Intl.Segmenter | undefined;
|
|
80
|
+
|
|
81
|
+
function getSegmenter(): Intl.Segmenter {
|
|
82
|
+
if (!_segmenter) {
|
|
83
|
+
_segmenter = new Intl.Segmenter(undefined, { granularity: "word" });
|
|
84
|
+
}
|
|
85
|
+
return _segmenter;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* Segment text into word-like tokens using `Intl.Segmenter`.
|
|
90
|
+
*
|
|
91
|
+
* Returns only word-like segments (no punctuation-only or whitespace segments).
|
|
92
|
+
* Works correctly for all languages including Japanese, Chinese, Thai, Korean,
|
|
93
|
+
* Arabic, Hindi, etc.
|
|
94
|
+
*/
|
|
95
|
+
export function segmentWords(text: string): WordSegment[] {
|
|
96
|
+
const segmenter = getSegmenter();
|
|
97
|
+
const result: WordSegment[] = [];
|
|
98
|
+
for (const seg of segmenter.segment(text)) {
|
|
99
|
+
if (seg.isWordLike) {
|
|
100
|
+
result.push({
|
|
101
|
+
word: seg.segment,
|
|
102
|
+
index: seg.index,
|
|
103
|
+
length: seg.segment.length,
|
|
104
|
+
});
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
return result;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/**
|
|
111
|
+
* Count the number of word-like tokens in a string.
|
|
112
|
+
*
|
|
113
|
+
* Uses `Intl.Segmenter` for spaceless scripts, whitespace splitting for others.
|
|
114
|
+
* This matches the word count from `segmentWords()` / `parseElevenLabsAlignment()`.
|
|
115
|
+
*/
|
|
116
|
+
export function countWords(text: string): number {
|
|
117
|
+
if (hasSpacelessChars(text)) {
|
|
118
|
+
return segmentWords(text).length;
|
|
119
|
+
}
|
|
120
|
+
// Fast path for space-delimited scripts
|
|
121
|
+
return text.trim().split(/\s+/).filter(Boolean).length;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
// Smart join — language-aware word concatenation
|
|
126
|
+
// ---------------------------------------------------------------------------
|
|
127
|
+
|
|
128
|
+
/**
|
|
129
|
+
* Join words without inserting incorrect spaces between CJK/Thai tokens.
|
|
130
|
+
*
|
|
131
|
+
* Rules:
|
|
132
|
+
* - Between two spaceless-script tokens → no space (日本語 + テスト → 日本語テスト)
|
|
133
|
+
* - Between a spaceless-script token and a Latin token → no space (Varg + は → Vargは)
|
|
134
|
+
* - Between two Latin/Cyrillic/etc. tokens → space (Hello + world → Hello world)
|
|
135
|
+
*
|
|
136
|
+
* This matches how the original text would look — CJK/Thai don't use spaces.
|
|
137
|
+
*/
|
|
138
|
+
export function smartJoin(words: string[]): string {
|
|
139
|
+
if (words.length === 0) return "";
|
|
140
|
+
if (words.length === 1) return words[0]!;
|
|
141
|
+
|
|
142
|
+
let result = words[0]!;
|
|
143
|
+
for (let i = 1; i < words.length; i++) {
|
|
144
|
+
const prev = words[i - 1]!;
|
|
145
|
+
const curr = words[i]!;
|
|
146
|
+
|
|
147
|
+
// Check the last char of prev and first char of curr
|
|
148
|
+
const prevLastCp = lastCodePoint(prev);
|
|
149
|
+
const currFirstCp = curr.codePointAt(0) ?? 0;
|
|
150
|
+
|
|
151
|
+
// No space if either side is a spaceless script character
|
|
152
|
+
const needsSpace =
|
|
153
|
+
!isSpacelessScript(prevLastCp) && !isSpacelessScript(currFirstCp);
|
|
154
|
+
|
|
155
|
+
result += needsSpace ? ` ${curr}` : curr;
|
|
156
|
+
}
|
|
157
|
+
return result;
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
/**
|
|
161
|
+
* Get the last code point of a string.
|
|
162
|
+
*/
|
|
163
|
+
function lastCodePoint(s: string): number {
|
|
164
|
+
if (s.length === 0) return 0;
|
|
165
|
+
// Handle surrogate pairs
|
|
166
|
+
const last = s.codePointAt(s.length - 1);
|
|
167
|
+
if (last !== undefined && last >= 0xdc00 && last <= 0xdfff && s.length >= 2) {
|
|
168
|
+
// Low surrogate — the actual code point starts one position earlier
|
|
169
|
+
return s.codePointAt(s.length - 2) ?? 0;
|
|
170
|
+
}
|
|
171
|
+
return last ?? 0;
|
|
172
|
+
}
|