react-native-sherpa-onnx 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/android/src/main/assets/model_licenses/alignment-models-license-status.csv +5 -0
- package/android/src/main/cpp/CMakeLists.txt +3 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.cpp +66 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.h +17 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-alignment.cpp +108 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +30 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.cpp +66 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.h +30 -0
- package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +21 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxAlignmentHelper.kt +555 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +76 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTextSegmenter.kt +330 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +180 -23
- package/ios/Resources/model_licenses/alignment-models-license-status.csv +5 -0
- package/ios/SherpaOnnx+Alignment.mm +704 -0
- package/ios/SherpaOnnx+STT.mm +6 -0
- package/ios/SherpaOnnx+TTS.mm +624 -50
- package/ios/model_detect/sherpa-onnx-model-detect-alignment.mm +108 -0
- package/ios/model_detect/sherpa-onnx-model-detect.h +31 -0
- package/ios/model_detect/sherpa-onnx-validate-alignment.h +30 -0
- package/ios/model_detect/sherpa-onnx-validate-alignment.mm +66 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.h +3 -1
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +6 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/alignment/index.js +27 -0
- package/lib/module/alignment/index.js.map +1 -0
- package/lib/module/alignment/types.js +2 -0
- package/lib/module/alignment/types.js.map +1 -0
- package/lib/module/alignment/vocab.js +40 -0
- package/lib/module/alignment/vocab.js.map +1 -0
- package/lib/module/download/paths.js +9 -1
- package/lib/module/download/paths.js.map +1 -1
- package/lib/module/download/registry.js +17 -1
- package/lib/module/download/registry.js.map +1 -1
- package/lib/module/download/types.js +1 -0
- package/lib/module/download/types.js.map +1 -1
- package/lib/module/index.js +6 -4
- package/lib/module/index.js.map +1 -1
- package/lib/module/licenses.js +8 -2
- package/lib/module/licenses.js.map +1 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +68 -2
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/subtitles.js +400 -0
- package/lib/module/tts/subtitles.js.map +1 -0
- package/lib/module/tts/tempAudio.js +17 -0
- package/lib/module/tts/tempAudio.js.map +1 -0
- package/lib/module/tts/types.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -3
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/alignment/index.d.ts +8 -0
- package/lib/typescript/src/alignment/index.d.ts.map +1 -0
- package/lib/typescript/src/alignment/types.d.ts +23 -0
- package/lib/typescript/src/alignment/types.d.ts.map +1 -0
- package/lib/typescript/src/alignment/vocab.d.ts +5 -0
- package/lib/typescript/src/alignment/vocab.d.ts.map +1 -0
- package/lib/typescript/src/download/paths.d.ts +5 -2
- package/lib/typescript/src/download/paths.d.ts.map +1 -1
- package/lib/typescript/src/download/registry.d.ts.map +1 -1
- package/lib/typescript/src/download/types.d.ts +2 -1
- package/lib/typescript/src/download/types.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +1 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/licenses.d.ts.map +1 -1
- package/lib/typescript/src/stt/types.d.ts +5 -2
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +2 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/subtitles.d.ts +24 -0
- package/lib/typescript/src/tts/subtitles.d.ts.map +1 -0
- package/lib/typescript/src/tts/tempAudio.d.ts +3 -0
- package/lib/typescript/src/tts/tempAudio.d.ts.map +1 -0
- package/lib/typescript/src/tts/types.d.ts +68 -2
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/alignment-models/README.md +90 -0
- package/scripts/alignment-models/build_and_upload.js +724 -0
- package/scripts/alignment-models/sources.csv +5 -0
- package/scripts/alignment-models/sync_alignment_license_status.js +123 -0
- package/src/NativeSherpaOnnx.ts +35 -3
- package/src/alignment/index.ts +41 -0
- package/src/alignment/types.ts +22 -0
- package/src/alignment/vocab.ts +38 -0
- package/src/download/paths.ts +18 -5
- package/src/download/registry.ts +23 -3
- package/src/download/types.ts +1 -0
- package/src/index.tsx +6 -4
- package/src/licenses.ts +12 -1
- package/src/stt/types.ts +5 -2
- package/src/tts/index.ts +110 -3
- package/src/tts/subtitles.ts +611 -0
- package/src/tts/tempAudio.ts +31 -0
- package/src/tts/types.ts +79 -2
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
|
@@ -0,0 +1,611 @@
|
|
|
1
|
+
import { unlink } from '@dr.pogodin/react-native-fs';
|
|
2
|
+
import SherpaOnnx from '../NativeSherpaOnnx';
|
|
3
|
+
import { WAV2VEC2_VOCAB } from '../alignment/vocab';
|
|
4
|
+
import { decodeAudioFileToFloatSamples } from '../audio';
|
|
5
|
+
import { saveAlignmentAudioToTempWav } from './tempAudio';
|
|
6
|
+
import type {
|
|
7
|
+
SubtitleGranularity,
|
|
8
|
+
SubtitleMode,
|
|
9
|
+
SubtitleFromAudioOptions,
|
|
10
|
+
SubtitleResult,
|
|
11
|
+
TtsSubtitleItem,
|
|
12
|
+
} from './types';
|
|
13
|
+
|
|
14
|
+
/**
 * Validates that the requested subtitle granularity is allowed for a mode.
 *
 * Character-level subtitles are only accepted together with the `'accurate'`
 * mode; every other combination passes without any effect.
 *
 * @param mode - Subtitle mode requested by the caller.
 * @param granularity - Subtitle granularity requested by the caller.
 * @throws Error when `granularity` is `'character'` and `mode` is not `'accurate'`.
 */
export function assertSubtitleGranularityForMode(
  mode: SubtitleMode,
  granularity: SubtitleGranularity
): void {
  const characterLevelAllowed = mode === 'accurate';
  if (granularity !== 'character' || characterLevelAllowed) {
    return;
  }

  throw new Error(
    "Character granularity is only supported when subtitles.mode is 'accurate'."
  );
}
|
|
24
|
+
|
|
25
|
+
/**
 * Characters that can end a sentence: the ASCII marks plus their CJK /
 * full-width counterparts.
 *
 * The full-width forms are spelled as `\u` escapes on purpose: they look
 * almost identical to the ASCII marks, and if they get folded to ASCII they
 * merely duplicate the entries above and full-width punctuation stops being
 * recognised.
 */
const SENTENCE_TERMINATORS = new Set<string>([
  '.',
  '!',
  '?',
  ';',
  '。',
  '\uFF01', // full-width exclamation mark
  '\uFF1F', // full-width question mark
  '\uFF1B', // full-width semicolon
]);

/**
 * Closing quotes / brackets that may directly follow a sentence terminator and
 * still belong to the sentence being closed.
 */
const TRAILING_CLOSERS = new Set<string>([
  '"',
  "'",
  ')',
  ']',
  '}',
  '>',
  '”',
  '’',
  '」',
  '』',
  '】',
  '\uFF09', // full-width right parenthesis
]);

/**
 * Lower-cased tokens (without the final period) after which a period is not
 * treated as a sentence end.
 */
const COMMON_ABBREVIATIONS = new Set<string>([
  'mr',
  'mrs',
  'ms',
  'dr',
  'prof',
  'sr',
  'jr',
  'st',
  'vs',
  'etc',
  'e.g',
  'i.e',
]);
|
|
64
|
+
|
|
65
|
+
// JSON form of the imported WAV2VEC2_VOCAB, serialized once at module load and
// passed as the last argument to SherpaOnnx.runCTCForcedAlignment below.
const WAV2VEC2_VOCAB_JSON = JSON.stringify(WAV2VEC2_VOCAB);
|
|
66
|
+
|
|
67
|
+
/**
 * Reports whether `char` contains a whitespace character (`\s` in Unicode
 * mode). An empty string yields `false`.
 *
 * @param char - Text to inspect; callers pass a single character.
 * @returns `true` when a whitespace character is present.
 */
function isWhitespaceChar(char: string): boolean {
  return char.search(/\s/u) !== -1;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Tells whether `char` is one of the sentence-ending marks listed in
 * `SENTENCE_TERMINATORS`.
 *
 * @param char - Single character to look up.
 * @returns `true` when the character is a known sentence terminator.
 */
function isSentenceTerminator(char: string): boolean {
  const isKnownTerminator = SENTENCE_TERMINATORS.has(char);
  return isKnownTerminator;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Tells whether `char` is one of the closing quotes / brackets listed in
 * `TRAILING_CLOSERS`.
 *
 * @param char - Single character to look up.
 * @returns `true` when the character is a known trailing closer.
 */
function isTrailingCloser(char: string): boolean {
  const isKnownCloser = TRAILING_CLOSERS.has(char);
  return isKnownCloser;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Returns the run of letters and dots that sits immediately before the period
 * at `periodIndex`, ignoring whitespace between that run and the period.
 * Trailing dots of the run are removed, so `"e.g."` examined at its last
 * period yields `"e.g"`.
 *
 * @param text - Full text being scanned.
 * @param periodIndex - Index of the period under consideration.
 * @returns The preceding token, or `''` when no letter/dot run precedes it.
 */
function extractTokenBeforePeriod(text: string, periodIndex: number): string {
  // Step back over any whitespace between the token and the period.
  let tokenEnd = periodIndex - 1;
  for (; tokenEnd >= 0; tokenEnd -= 1) {
    if (!isWhitespaceChar(text[tokenEnd] ?? '')) {
      break;
    }
  }

  // Grow the token leftwards while we keep seeing letters or dots.
  let tokenStart = tokenEnd + 1;
  while (tokenStart > 0 && /\p{L}|\./u.test(text[tokenStart - 1] ?? '')) {
    tokenStart -= 1;
  }

  if (tokenStart > tokenEnd) {
    return '';
  }

  const rawToken = text.slice(tokenStart, tokenEnd + 1);
  return rawToken.replace(/\.+$/u, '');
}
|
|
101
|
+
|
|
102
|
+
/**
 * Decides whether the period at `periodIndex` may end a sentence.
 *
 * A period is rejected as a boundary when it is:
 * - a decimal point (digit on both sides, e.g. `3.14`),
 * - preceded by a token from `COMMON_ABBREVIATIONS` (compared lower-cased),
 * - preceded by a single uppercase letter, i.e. a likely initial (`A. Smith`).
 *
 * @param text - Full text being scanned.
 * @param periodIndex - Index of the period under consideration.
 * @returns `true` when the period can be treated as a sentence end.
 */
function shouldSplitOnPeriod(text: string, periodIndex: number): boolean {
  const before = text[periodIndex - 1] ?? '';
  const after = text[periodIndex + 1] ?? '';

  const isDecimalPoint = /\d/u.test(before) && /\d/u.test(after);
  if (isDecimalPoint) {
    return false;
  }

  const precedingToken = extractTokenBeforePeriod(text, periodIndex);

  // Abbreviations are matched case-insensitively; the initial check needs the
  // original casing.
  const isKnownAbbreviation = COMMON_ABBREVIATIONS.has(
    precedingToken.toLowerCase()
  );
  const isSingleUppercaseInitial =
    precedingToken.length === 1 && /\p{Lu}/u.test(precedingToken);

  return !(isKnownAbbreviation || isSingleUppercaseInitial);
}
|
|
124
|
+
|
|
125
|
+
/**
 * Given the index of a sentence terminator, returns the exclusive end index of
 * the sentence: first any further terminators are consumed (`?!`, `...`), then
 * any trailing closers (quotes, brackets).
 *
 * @param text - Full text being scanned.
 * @param index - Index of the first terminator.
 * @returns Index of the first character that is no longer part of the sentence end.
 */
function sentenceBoundaryEnd(text: string, index: number): number {
  let cursor = index + 1;

  // Two consecutive passes: terminators first, closers second.
  for (const accepts of [isSentenceTerminator, isTrailingCloser]) {
    while (cursor < text.length && accepts(text[cursor] ?? '')) {
      cursor += 1;
    }
  }

  return cursor;
}
|
|
138
|
+
|
|
139
|
+
/**
 * Trims every segment and drops the ones that end up empty.
 *
 * @param segments - Raw text segments.
 * @returns A new array with the trimmed, non-empty segments in original order.
 */
function sanitizeSegments(segments: string[]): string[] {
  const cleaned: string[] = [];
  for (const raw of segments) {
    const trimmed = raw.trim();
    if (trimmed.length > 0) {
      cleaned.push(trimmed);
    }
  }
  return cleaned;
}
|
|
144
|
+
|
|
145
|
+
/**
 * Splits `totalSamples` across `segments` proportionally to each segment's
 * length in Unicode code points (minimum weight 1).
 *
 * Every segment first receives the floor of its exact share; the samples lost
 * to flooring are then handed out one by one, largest fractional part first,
 * so the returned counts always add up to the (floored, non-negative) total.
 *
 * @param totalSamples - Number of samples to distribute. Negative and
 *   non-finite values (NaN, ±Infinity) are treated as 0.
 * @param segments - Text segments that act as weights.
 * @returns One integer sample count per segment, in segment order.
 */
function distributeSamplesByTextWeight(
  totalSamples: number,
  segments: string[]
): number[] {
  if (segments.length === 0) {
    return [];
  }

  // Math.floor/Math.max let NaN through, which would turn every slot into NaN.
  const safeTotal = Number.isFinite(totalSamples)
    ? Math.max(0, Math.floor(totalSamples))
    : 0;
  if (safeTotal === 0) {
    return new Array<number>(segments.length).fill(0);
  }

  // Array.from iterates code points, so astral characters count once.
  // Each weight is >= 1, hence weightSum >= 1 for a non-empty segment list.
  const weights = segments.map((segment) =>
    Math.max(1, Array.from(segment).length)
  );
  const weightSum = weights.reduce((sum, value) => sum + value, 0);

  const exactShares = weights.map(
    (weight) => (safeTotal * weight) / weightSum
  );
  const counts = exactShares.map((share) => Math.floor(share));

  let remaining = safeTotal - counts.reduce((sum, value) => sum + value, 0);
  if (remaining > 0) {
    // Stable sort: equal fractions keep segment order.
    const byFraction = exactShares
      .map((share, index) => ({ index, fraction: share - Math.floor(share) }))
      .sort((a, b) => b.fraction - a.fraction);

    for (let ptr = 0; remaining > 0; ptr += 1) {
      const target = byFraction[ptr % byFraction.length];
      if (target === undefined) {
        break;
      }
      counts[target.index] = (counts[target.index] ?? 0) + 1;
      remaining -= 1;
    }
  }

  return counts;
}
|
|
199
|
+
|
|
200
|
+
/**
 * Produces exactly one sample count per segment from a list of per-chunk
 * sample counts.
 *
 * Counts are first sanitised (non-finite → 0, otherwise floored and clamped to
 * >= 0). Then:
 * - same length as `segments`: the sanitised counts are returned as is;
 * - more counts than segments: the surplus counts are summed into the last
 *   segment;
 * - fewer counts than segments: the total is redistributed over all segments
 *   by text weight.
 *
 * @param segments - Text segments to align to.
 * @param chunkSampleCounts - Sample counts reported per chunk.
 * @returns Sample counts, one per segment (empty when there are no segments).
 */
function alignChunkCountsToSegments(
  segments: string[],
  chunkSampleCounts: number[]
): number[] {
  const segmentCount = segments.length;
  if (segmentCount === 0) {
    return [];
  }

  const sumOf = (values: number[]): number => {
    let running = 0;
    for (const value of values) {
      running += value;
    }
    return running;
  };

  const sanitized = chunkSampleCounts.map((raw) => {
    if (!Number.isFinite(raw)) {
      return 0;
    }
    return Math.max(0, Math.floor(raw));
  });

  if (sanitized.length === segmentCount) {
    return sanitized;
  }

  if (sanitized.length < segmentCount) {
    return distributeSamplesByTextWeight(sumOf(sanitized), segments);
  }

  // More chunks than segments: fold the overflow into the final segment.
  const head = sanitized.slice(0, segmentCount);
  const overflow = sumOf(sanitized.slice(segmentCount));
  const tailIndex = segmentCount - 1;
  head[tailIndex] = (head[tailIndex] ?? 0) + overflow;
  return head;
}
|
|
231
|
+
|
|
232
|
+
/**
 * Shape of one entry accepted by `normalizeAlignmentItems`, which is applied to
 * the `words` and `chars` lists returned by the native forced-alignment call.
 */
type AlignmentNativeItem = {
  /** Text covered by this entry. */
  text: string;
  /** Start of the entry. NOTE(review): unit is not visible here — presumably seconds; confirm. */
  start: number;
  /** End of the entry, same unit as `start`. */
  end: number;
};
|
|
237
|
+
|
|
238
|
+
/**
 * Splits `total` items across slots proportionally to `weights`.
 *
 * Each slot first receives the floor of its proportional share; whatever is
 * left over is handed out one item at a time starting from the first slot, so
 * the returned counts always add up to the (floored, non-negative) total.
 *
 * @param total - Number of items to distribute. Negative and non-finite
 *   values (NaN, ±Infinity) are treated as 0.
 * @param weights - Relative weight per slot. Each weight is floored and raised
 *   to at least 1; a non-finite weight counts as 1.
 * @returns One integer count per weight, in the same order.
 */
function distributeItemCounts(total: number, weights: number[]): number[] {
  if (weights.length === 0) {
    return [];
  }

  // Math.floor/Math.max let NaN through, which would turn every slot into NaN.
  const safeTotal = Number.isFinite(total)
    ? Math.max(0, Math.floor(total))
    : 0;
  if (safeTotal === 0) {
    return new Array<number>(weights.length).fill(0);
  }

  // Every weight ends up >= 1, so weightSum >= 1 here.
  const safeWeights = weights.map((weight) =>
    Number.isFinite(weight) ? Math.max(1, Math.floor(weight)) : 1
  );
  const weightSum = safeWeights.reduce((sum, value) => sum + value, 0);

  const counts = safeWeights.map((weight) =>
    Math.floor((safeTotal * weight) / weightSum)
  );

  // Hand the flooring remainder out round-robin, first slot first.
  let remaining = safeTotal - counts.reduce((sum, value) => sum + value, 0);
  for (let step = 0; remaining > 0; step += 1) {
    const slot = step % counts.length;
    counts[slot] = (counts[slot] ?? 0) + 1;
    remaining -= 1;
  }

  return counts;
}
|
|
271
|
+
|
|
272
|
+
/**
 * Cleans up alignment entries:
 * - entries whose text is empty or whitespace-only are dropped;
 * - `start` / `end` become 0 when non-finite and are clamped to >= 0;
 * - `end` is raised to `start` when it would otherwise precede it.
 *
 * @param items - Entries to normalise.
 * @returns New subtitle items, in input order.
 */
function normalizeAlignmentItems(
  items: AlignmentNativeItem[]
): TtsSubtitleItem[] {
  const clampTime = (value: number): number =>
    Number.isFinite(value) ? Math.max(0, value) : 0;

  const normalized: TtsSubtitleItem[] = [];
  for (const { text, start: rawStart, end: rawEnd } of items) {
    if (text.trim().length === 0) {
      continue;
    }

    const start = clampTime(rawStart);
    const clampedEnd = clampTime(rawEnd);
    normalized.push({
      text,
      start,
      end: clampedEnd < start ? start : clampedEnd,
    });
  }

  return normalized;
}
|
|
287
|
+
|
|
288
|
+
/**
 * Derives sentence-level subtitles from word-level alignment.
 *
 * The text is split into sentences; the aligned words are then dealt out to the
 * sentences in order, proportionally to each sentence's own word count. A
 * sentence spans from the start of its first assigned word to the end of its
 * last one. A sentence that receives no words gets a zero-length span at the
 * end time of the previous sentence (or the first aligned word's start).
 * Finally the last subtitle's end is extended to the end of the last aligned
 * word if that is later.
 *
 * @param text - Full transcript.
 * @param alignedWords - Word-level items in playback order.
 * @returns One subtitle per sentence, or `[]` when either input is empty.
 */
function buildSentenceSubtitlesFromAlignedWords(
  text: string,
  alignedWords: TtsSubtitleItem[]
): TtsSubtitleItem[] {
  const sentences = splitTextIntoSentences(text);
  if (sentences.length === 0 || alignedWords.length === 0) {
    return [];
  }

  const wordsPerSentence = distributeItemCounts(
    alignedWords.length,
    sentences.map((sentence) =>
      Math.max(1, splitTextIntoWords(sentence).length)
    )
  );

  let cursor = 0;
  let lastKnownTime = alignedWords[0]?.start ?? 0;

  const subtitles = sentences.map((sentence, position): TtsSubtitleItem => {
    const take = Math.max(0, wordsPerSentence[position] ?? 0);
    const assigned = alignedWords.slice(cursor, cursor + take);
    cursor += take;

    if (assigned.length === 0) {
      return { text: sentence, start: lastKnownTime, end: lastKnownTime };
    }

    const start = assigned[0]?.start ?? lastKnownTime;
    const end = assigned[assigned.length - 1]?.end ?? start;
    lastKnownTime = end;
    return { text: sentence, start, end };
  });

  // Make sure the final subtitle covers the audio up to the last aligned word.
  const finalAlignedEnd = alignedWords[alignedWords.length - 1]?.end;
  const tailIndex = subtitles.length - 1;
  const tail = subtitles[tailIndex];
  if (finalAlignedEnd !== undefined && tail != null) {
    subtitles[tailIndex] = {
      ...tail,
      end: Math.max(tail.end, finalAlignedEnd),
    };
  }

  return subtitles;
}
|
|
349
|
+
|
|
350
|
+
/**
 * Reports whether `char` contains a character from the Han, Hiragana,
 * Katakana or Hangul scripts.
 *
 * @param char - Text to inspect; callers pass a single character.
 * @returns `true` when such a character is present.
 */
function isCjkChar(char: string): boolean {
  const cjkScripts =
    /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;
  return cjkScripts.test(char);
}
|
|
355
|
+
|
|
356
|
+
/**
 * Reports whether `char` separates words: whitespace, ASCII punctuation and
 * symbols, the horizontal ellipsis, and CJK / full-width punctuation.
 *
 * The non-ASCII members are written as `\u` escapes so that the full-width
 * marks cannot be folded into their ASCII look-alikes (which would only repeat
 * characters already in the class and leave full-width punctuation unmatched):
 * U+2026 ellipsis, U+FF0C comma, U+3002 ideographic full stop, U+FF01
 * exclamation mark, U+FF1F question mark, U+FF1B semicolon, U+FF1A colon,
 * U+3001 ideographic comma.
 *
 * @param char - Text to inspect; callers pass a single character.
 * @returns `true` when a delimiter character is present.
 */
function isWordDelimiter(char: string): boolean {
  return /[\s.,!?;:()[\]{}"'`~<>/\\|@#$%^&*+=\u2026\uFF0C\u3002\uFF01\uFF1F\uFF1B\uFF1A\u3001]/u.test(
    char
  );
}
|
|
359
|
+
|
|
360
|
+
/**
 * Splits text into sentences.
 *
 * A sentence ends at a sentence terminator (plus any directly following
 * terminators and closing quotes/brackets) provided that what follows is
 * whitespace or the end of the text. Periods that look like decimal points,
 * known abbreviations or single-letter initials do not end a sentence.
 *
 * @param text - Text to split; surrounding whitespace is ignored.
 * @returns Trimmed, non-empty sentences in order; `[]` for blank input.
 */
export function splitTextIntoSentences(text: string): string[] {
  const normalized = text.trim();
  if (normalized.length === 0) {
    return [];
  }

  const sentences: string[] = [];
  const pushIfNonEmpty = (candidate: string): void => {
    const trimmed = candidate.trim();
    if (trimmed.length > 0) {
      sentences.push(trimmed);
    }
  };

  // Returns the exclusive end of the sentence that closes at `index`,
  // or -1 when `index` is not a sentence boundary.
  const boundaryEndAt = (index: number): number => {
    const char = normalized[index] ?? '';
    if (!isSentenceTerminator(char)) {
      return -1;
    }
    if (char === '.' && !shouldSplitOnPeriod(normalized, index)) {
      return -1;
    }

    const end = sentenceBoundaryEnd(normalized, index);
    const following = normalized[end];
    const atBoundary = following === undefined || isWhitespaceChar(following);
    return atBoundary ? end : -1;
  };

  let sentenceStart = 0;
  let cursor = 0;
  while (cursor < normalized.length) {
    const boundary = boundaryEndAt(cursor);
    if (boundary < 0) {
      cursor += 1;
      continue;
    }

    pushIfNonEmpty(normalized.slice(sentenceStart, boundary));

    // The next sentence starts after the whitespace that follows the boundary.
    sentenceStart = boundary;
    while (
      sentenceStart < normalized.length &&
      isWhitespaceChar(normalized[sentenceStart] ?? '')
    ) {
      sentenceStart += 1;
    }
    cursor = sentenceStart;
  }

  pushIfNonEmpty(normalized.slice(sentenceStart));

  return sentences.length > 0 ? sentences : [normalized];
}
|
|
411
|
+
|
|
412
|
+
/**
 * Splits text into words.
 *
 * Whitespace and delimiter characters separate words and are discarded; every
 * CJK character (Han, Hiragana, Katakana, Hangul) becomes a word of its own.
 * When nothing qualifies as a word (e.g. punctuation only) the trimmed input is
 * returned as the single element.
 *
 * @param text - Text to split; surrounding whitespace is ignored.
 * @returns Words in order; `[]` for blank input.
 */
export function splitTextIntoWords(text: string): string[] {
  const normalized = text.trim();
  if (normalized.length === 0) {
    return [];
  }

  // Whitespace is checked first, then CJK, then the remaining delimiters.
  const classify = (char: string): 'separator' | 'cjk' | 'letter' => {
    if (isWhitespaceChar(char)) {
      return 'separator';
    }
    if (isCjkChar(char)) {
      return 'cjk';
    }
    return isWordDelimiter(char) ? 'separator' : 'letter';
  };

  const words: string[] = [];
  let pending = '';
  const flushPending = (): void => {
    const token = pending.trim();
    pending = '';
    if (token.length > 0) {
      words.push(token);
    }
  };

  // for...of walks code points, so astral characters stay intact.
  for (const char of normalized) {
    switch (classify(char)) {
      case 'letter':
        pending += char;
        break;
      case 'cjk':
        flushPending();
        words.push(char);
        break;
      default:
        flushPending();
    }
  }
  flushPending();

  return words.length > 0 ? words : [normalized];
}
|
|
453
|
+
|
|
454
|
+
/**
 * Builds a back-to-back subtitle timeline from text segments and the number of
 * audio samples produced for each chunk.
 *
 * Segments are trimmed and empty ones dropped; the chunk counts are then
 * aligned to the remaining segments. Each subtitle starts where the previous
 * one ended, with times computed as samples divided by `sampleRate`. Segments
 * with zero samples that come before any audio has been consumed are skipped.
 *
 * @param segments - Text segments in playback order.
 * @param chunkSampleCounts - Sample counts per generated chunk.
 * @param sampleRate - Samples per second of the audio.
 * @returns Subtitle items; `[]` when the sample rate is not a positive finite
 *   number or no non-empty segment remains.
 */
export function buildSubtitlesFromChunks(
  segments: string[],
  chunkSampleCounts: number[],
  sampleRate: number
): TtsSubtitleItem[] {
  const rateIsUsable = Number.isFinite(sampleRate) && sampleRate > 0;
  if (!rateIsUsable) {
    return [];
  }

  const texts = sanitizeSegments(segments);
  if (texts.length === 0) {
    return [];
  }

  const samplesPerSegment = alignChunkCountsToSegments(
    texts,
    chunkSampleCounts
  );

  const timeline: TtsSubtitleItem[] = [];
  let consumedSamples = 0;

  texts.forEach((segmentText, position) => {
    const length = Math.max(0, samplesPerSegment[position] ?? 0);
    if (length === 0 && consumedSamples === 0) {
      return;
    }

    const startSample = consumedSamples;
    consumedSamples += length;
    timeline.push({
      text: segmentText,
      start: startSample / sampleRate,
      end: consumedSamples / sampleRate,
    });
  });

  return timeline;
}
|
|
495
|
+
|
|
496
|
+
/**
 * Generate subtitle timelines from an existing transcript plus audio.
 *
 * Two modes are supported:
 * - `mode: 'accurate'`: runs wav2vec2 CTC forced alignment on the audio and
 *   the given transcript. Requires `options.alignmentModelPath` (an alignment
 *   model from `ModelCategory.Alignment`); a model path can be pre-validated
 *   with `detectAlignmentModel` from `react-native-sherpa-onnx/alignment`.
 *   When samples are passed instead of a path they are written to a temporary
 *   WAV file, which is removed afterwards on a best-effort basis.
 * - any other mode (`'fast'`): estimates timings by splitting the audio
 *   duration across the transcript segments proportionally to their length.
 *
 * @param text - Transcript of the audio.
 * @param audioPathOrSamples - Path to an audio file, or in-memory samples with
 *   their sample rate.
 * @param options - Mode, granularity (defaults to `'sentence'`) and, for
 *   accurate mode, the alignment model path.
 * @returns Subtitles plus `timingMode` (`'aligned'` or `'estimated'`).
 * @throws Error when character granularity is requested outside accurate mode,
 *   or when accurate mode is requested without an alignment model path.
 */
export async function generateSubtitlesFromAudio(
  text: string,
  audioPathOrSamples: string | { samples: number[]; sampleRate: number },
  options: SubtitleFromAudioOptions
): Promise<SubtitleResult> {
  const { mode } = options;
  const granularity = options.granularity ?? 'sentence';

  assertSubtitleGranularityForMode(mode, granularity);

  if (mode === 'accurate') {
    const modelPath = options.alignmentModelPath?.trim();
    if (!modelPath) {
      throw new Error(
        'ALIGNMENT_MODEL_MISSING: Provide options.alignmentModelPath for accurate subtitles.'
      );
    }

    // In-memory samples are materialised as a temp WAV that we own and remove.
    const ownsWavFile = typeof audioPathOrSamples !== 'string';
    const wavPath =
      typeof audioPathOrSamples === 'string'
        ? audioPathOrSamples
        : await saveAlignmentAudioToTempWav(audioPathOrSamples);

    try {
      const aligned = await SherpaOnnx.runCTCForcedAlignment(
        modelPath,
        wavPath,
        text,
        WAV2VEC2_VOCAB_JSON
      );

      const wordItems = normalizeAlignmentItems(aligned.words ?? []);
      const charItems = normalizeAlignmentItems(aligned.chars ?? []);

      let subtitles: TtsSubtitleItem[];
      switch (granularity) {
        case 'character':
          subtitles = charItems;
          break;
        case 'word':
          subtitles = wordItems;
          break;
        default:
          subtitles = buildSentenceSubtitlesFromAlignedWords(text, wordItems);
      }

      return { subtitles, timingMode: 'aligned' };
    } finally {
      if (ownsWavFile) {
        unlink(wavPath).catch(() => {
          // ignore cleanup errors
        });
      }
    }
  }

  // Estimated timings: split the audio length across the text segments.
  const segments =
    granularity === 'word'
      ? splitTextIntoWords(text)
      : splitTextIntoSentences(text);

  if (segments.length === 0) {
    return { subtitles: [], timingMode: 'estimated' };
  }

  const audio =
    typeof audioPathOrSamples === 'string'
      ? await decodeAudioFileToFloatSamples(audioPathOrSamples)
      : audioPathOrSamples;
  const totalSamples = audio.samples.length;
  const sampleRate = audio.sampleRate;

  const audioIsUsable =
    Number.isFinite(sampleRate) && sampleRate > 0 && totalSamples > 0;
  if (!audioIsUsable) {
    return { subtitles: [], timingMode: 'estimated' };
  }

  const samplesPerSegment = distributeSamplesByTextWeight(
    totalSamples,
    segments
  );
  return {
    subtitles: buildSubtitlesFromChunks(
      segments,
      samplesPerSegment,
      sampleRate
    ),
    timingMode: 'estimated',
  };
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import { DocumentDirectoryPath, mkdir } from '@dr.pogodin/react-native-fs';
|
|
2
|
+
import SherpaOnnx from '../NativeSherpaOnnx';
|
|
3
|
+
import type { GeneratedAudio } from './types';
|
|
4
|
+
|
|
5
|
+
/**
 * Builds a unique path for a temporary alignment WAV file inside
 * `<DocumentDirectoryPath>/sherpa-onnx/cache`.
 *
 * The file name is `[<instanceId>-]alignment-<timestamp>-<random>.wav`.
 *
 * @param instanceId - Optional id used as a file-name prefix. Surrounding
 *   whitespace is removed and path separators are replaced with `_`, so the id
 *   can never point the file into a different (possibly missing) directory.
 *   A blank id adds no prefix.
 * @returns The path, with runs of `/` collapsed to a single `/`.
 */
function createTempAlignmentWavPath(instanceId?: string): string {
  const nonce = `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;

  // Use the trimmed value for the prefix too (not only for the emptiness
  // check) and keep the id inside a single path component.
  const safeId = (instanceId ?? '').trim().replace(/[\\/]+/g, '_');
  const prefix = safeId.length > 0 ? `${safeId}-` : '';

  return `${DocumentDirectoryPath}/sherpa-onnx/cache/${prefix}alignment-${nonce}.wav`.replace(
    /\/+/g,
    '/'
  );
}
|
|
13
|
+
|
|
14
|
+
/**
 * Writes generated audio to a temporary WAV file for forced alignment.
 *
 * Ensures the `<DocumentDirectoryPath>/sherpa-onnx/cache` directory exists via
 * `mkdir`, then saves the samples through `SherpaOnnx.saveTtsAudioToFile`.
 * The file is not removed here; the caller owns the cleanup.
 *
 * @param audio - Samples and sample rate to save.
 * @param instanceId - Optional id forwarded to the temp-path builder as a
 *   file-name prefix.
 * @returns Path of the written WAV file.
 */
export async function saveAlignmentAudioToTempWav(
  audio: GeneratedAudio,
  instanceId?: string
): Promise<string> {
  const cacheDir = (DocumentDirectoryPath + '/sherpa-onnx/cache').replace(
    /\/+/g,
    '/'
  );
  await mkdir(cacheDir);

  const wavPath = createTempAlignmentWavPath(instanceId);
  const { samples, sampleRate } = audio;
  await SherpaOnnx.saveTtsAudioToFile(samples, sampleRate, wavPath);

  return wavPath;
}
|