react-native-sherpa-onnx 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. package/README.md +3 -0
  2. package/android/src/main/assets/model_licenses/alignment-models-license-status.csv +5 -0
  3. package/android/src/main/cpp/CMakeLists.txt +3 -0
  4. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.cpp +66 -0
  5. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.h +17 -0
  6. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-alignment.cpp +108 -0
  7. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +30 -0
  8. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.cpp +66 -0
  9. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.h +30 -0
  10. package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +21 -0
  11. package/android/src/main/java/com/sherpaonnx/SherpaOnnxAlignmentHelper.kt +555 -0
  12. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +76 -0
  13. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTextSegmenter.kt +330 -0
  14. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +180 -23
  15. package/ios/Resources/model_licenses/alignment-models-license-status.csv +5 -0
  16. package/ios/SherpaOnnx+Alignment.mm +704 -0
  17. package/ios/SherpaOnnx+STT.mm +6 -0
  18. package/ios/SherpaOnnx+TTS.mm +624 -50
  19. package/ios/model_detect/sherpa-onnx-model-detect-alignment.mm +108 -0
  20. package/ios/model_detect/sherpa-onnx-model-detect.h +31 -0
  21. package/ios/model_detect/sherpa-onnx-validate-alignment.h +30 -0
  22. package/ios/model_detect/sherpa-onnx-validate-alignment.mm +66 -0
  23. package/ios/stt/sherpa-onnx-stt-wrapper.h +3 -1
  24. package/ios/stt/sherpa-onnx-stt-wrapper.mm +6 -0
  25. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  26. package/lib/module/alignment/index.js +27 -0
  27. package/lib/module/alignment/index.js.map +1 -0
  28. package/lib/module/alignment/types.js +2 -0
  29. package/lib/module/alignment/types.js.map +1 -0
  30. package/lib/module/alignment/vocab.js +40 -0
  31. package/lib/module/alignment/vocab.js.map +1 -0
  32. package/lib/module/download/paths.js +9 -1
  33. package/lib/module/download/paths.js.map +1 -1
  34. package/lib/module/download/registry.js +17 -1
  35. package/lib/module/download/registry.js.map +1 -1
  36. package/lib/module/download/types.js +1 -0
  37. package/lib/module/download/types.js.map +1 -1
  38. package/lib/module/index.js +6 -4
  39. package/lib/module/index.js.map +1 -1
  40. package/lib/module/licenses.js +8 -2
  41. package/lib/module/licenses.js.map +1 -1
  42. package/lib/module/stt/types.js.map +1 -1
  43. package/lib/module/tts/index.js +68 -2
  44. package/lib/module/tts/index.js.map +1 -1
  45. package/lib/module/tts/subtitles.js +400 -0
  46. package/lib/module/tts/subtitles.js.map +1 -0
  47. package/lib/module/tts/tempAudio.js +17 -0
  48. package/lib/module/tts/tempAudio.js.map +1 -0
  49. package/lib/module/tts/types.js.map +1 -1
  50. package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -3
  51. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  52. package/lib/typescript/src/alignment/index.d.ts +8 -0
  53. package/lib/typescript/src/alignment/index.d.ts.map +1 -0
  54. package/lib/typescript/src/alignment/types.d.ts +23 -0
  55. package/lib/typescript/src/alignment/types.d.ts.map +1 -0
  56. package/lib/typescript/src/alignment/vocab.d.ts +5 -0
  57. package/lib/typescript/src/alignment/vocab.d.ts.map +1 -0
  58. package/lib/typescript/src/download/paths.d.ts +5 -2
  59. package/lib/typescript/src/download/paths.d.ts.map +1 -1
  60. package/lib/typescript/src/download/registry.d.ts.map +1 -1
  61. package/lib/typescript/src/download/types.d.ts +2 -1
  62. package/lib/typescript/src/download/types.d.ts.map +1 -1
  63. package/lib/typescript/src/index.d.ts +1 -0
  64. package/lib/typescript/src/index.d.ts.map +1 -1
  65. package/lib/typescript/src/licenses.d.ts.map +1 -1
  66. package/lib/typescript/src/stt/types.d.ts +5 -2
  67. package/lib/typescript/src/stt/types.d.ts.map +1 -1
  68. package/lib/typescript/src/tts/index.d.ts +2 -1
  69. package/lib/typescript/src/tts/index.d.ts.map +1 -1
  70. package/lib/typescript/src/tts/subtitles.d.ts +24 -0
  71. package/lib/typescript/src/tts/subtitles.d.ts.map +1 -0
  72. package/lib/typescript/src/tts/tempAudio.d.ts +3 -0
  73. package/lib/typescript/src/tts/tempAudio.d.ts.map +1 -0
  74. package/lib/typescript/src/tts/types.d.ts +68 -2
  75. package/lib/typescript/src/tts/types.d.ts.map +1 -1
  76. package/package.json +6 -1
  77. package/scripts/alignment-models/README.md +90 -0
  78. package/scripts/alignment-models/build_and_upload.js +724 -0
  79. package/scripts/alignment-models/sources.csv +5 -0
  80. package/scripts/alignment-models/sync_alignment_license_status.js +123 -0
  81. package/src/NativeSherpaOnnx.ts +35 -3
  82. package/src/alignment/index.ts +41 -0
  83. package/src/alignment/types.ts +22 -0
  84. package/src/alignment/vocab.ts +38 -0
  85. package/src/download/paths.ts +18 -5
  86. package/src/download/registry.ts +23 -3
  87. package/src/download/types.ts +1 -0
  88. package/src/index.tsx +6 -4
  89. package/src/licenses.ts +12 -1
  90. package/src/stt/types.ts +5 -2
  91. package/src/tts/index.ts +110 -3
  92. package/src/tts/subtitles.ts +611 -0
  93. package/src/tts/tempAudio.ts +31 -0
  94. package/src/tts/types.ts +79 -2
  95. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
@@ -0,0 +1,611 @@
1
+ import { unlink } from '@dr.pogodin/react-native-fs';
2
+ import SherpaOnnx from '../NativeSherpaOnnx';
3
+ import { WAV2VEC2_VOCAB } from '../alignment/vocab';
4
+ import { decodeAudioFileToFloatSamples } from '../audio';
5
+ import { saveAlignmentAudioToTempWav } from './tempAudio';
6
+ import type {
7
+ SubtitleGranularity,
8
+ SubtitleMode,
9
+ SubtitleFromAudioOptions,
10
+ SubtitleResult,
11
+ TtsSubtitleItem,
12
+ } from './types';
13
+
14
/**
 * Validates that the requested subtitle granularity is allowed for the mode.
 *
 * Only the combination of 'character' granularity with a mode other than
 * 'accurate' is rejected; every other combination passes silently.
 *
 * @param mode Subtitle generation mode.
 * @param granularity Requested subtitle unit.
 * @throws Error when `granularity` is 'character' and `mode` is not 'accurate'.
 */
export function assertSubtitleGranularityForMode(
  mode: SubtitleMode,
  granularity: SubtitleGranularity
): void {
  const requiresAccurateMode = granularity === 'character';
  if (!requiresAccurateMode || mode === 'accurate') {
    return;
  }

  throw new Error(
    "Character granularity is only supported when subtitles.mode is 'accurate'."
  );
}
24
+
25
/**
 * Characters that can end a sentence: ASCII terminators, the ideographic full
 * stop, and the fullwidth forms used in CJK text.
 *
 * The fullwidth characters are written as `\u` escapes so that text
 * normalization cannot silently fold them into their ASCII look-alikes
 * (which would leave duplicate ASCII entries and no fullwidth coverage).
 */
const SENTENCE_TERMINATORS = new Set([
  '.',
  '!',
  '?',
  ';',
  '。',
  '\uFF01', // fullwidth exclamation mark
  '\uFF1F', // fullwidth question mark
  '\uFF1B', // fullwidth semicolon
]);

/**
 * Closing quotes and brackets that may directly follow a sentence terminator
 * and still belong to the sentence being closed.
 */
const TRAILING_CLOSERS = new Set([
  '"',
  "'",
  ')',
  ']',
  '}',
  '>',
  '”',
  '’',
  '」',
  '』',
  '】',
  '\uFF09', // fullwidth right parenthesis
]);

/**
 * Lower-cased abbreviations (without their final period) after which a period
 * is not treated as a sentence end.
 */
const COMMON_ABBREVIATIONS = new Set([
  'mr',
  'mrs',
  'ms',
  'dr',
  'prof',
  'sr',
  'jr',
  'st',
  'vs',
  'etc',
  'e.g',
  'i.e',
]);
64
+
65
// JSON-serialized WAV2VEC2_VOCAB, computed once at module load and passed as
// the last argument to SherpaOnnx.runCTCForcedAlignment in accurate mode.
const WAV2VEC2_VOCAB_JSON = JSON.stringify(WAV2VEC2_VOCAB);
66
+
67
/** Reports whether `char` contains a character matched by the regex `\s` class. */
function isWhitespaceChar(char: string): boolean {
  return char.search(/\s/u) !== -1;
}
70
+
71
/** Reports whether `char` is a member of SENTENCE_TERMINATORS. */
function isSentenceTerminator(char: string): boolean {
  const isKnownTerminator = SENTENCE_TERMINATORS.has(char);
  return isKnownTerminator;
}
74
+
75
/** Reports whether `char` is a member of TRAILING_CLOSERS. */
function isTrailingCloser(char: string): boolean {
  const isKnownCloser = TRAILING_CLOSERS.has(char);
  return isKnownCloser;
}
78
+
79
/**
 * Returns the run of letters and dots that ends just before `periodIndex`.
 *
 * Whitespace immediately before the period is skipped first, then the scan
 * walks left while characters are Unicode letters or '.'. Trailing dots are
 * stripped from the result, so "e.g." seen from its last period yields "e.g".
 *
 * @param text Text being scanned.
 * @param periodIndex Index of the period under inspection.
 * @returns The token, or '' when no letter/dot run precedes the period.
 */
function extractTokenBeforePeriod(text: string, periodIndex: number): string {
  let cursor = periodIndex - 1;

  // Step over whitespace sitting between the token and the period.
  for (; cursor >= 0; cursor -= 1) {
    if (!isWhitespaceChar(text[cursor] ?? '')) {
      break;
    }
  }
  const tokenEnd = cursor;

  // Extend left across letters and embedded dots.
  while (cursor >= 0 && /\p{L}|\./u.test(text[cursor] ?? '')) {
    cursor -= 1;
  }
  const tokenStart = cursor + 1;

  if (tokenEnd < tokenStart) {
    return '';
  }

  const withDots = text.slice(tokenStart, tokenEnd + 1);
  return withDots.replace(/\.+$/u, '');
}
101
+
102
/**
 * Decides whether the period at `periodIndex` may end a sentence.
 *
 * A period is rejected when it sits between two digits (e.g. 3.14), when the
 * token before it is listed in COMMON_ABBREVIATIONS (compared lower-cased),
 * or when that token is a single uppercase letter (a likely initial such as
 * "A. Smith").
 *
 * @param text Text being scanned.
 * @param periodIndex Index of the '.' character.
 * @returns `true` when the period is a plausible sentence boundary.
 */
function shouldSplitOnPeriod(text: string, periodIndex: number): boolean {
  const before = text[periodIndex - 1] ?? '';
  const after = text[periodIndex + 1] ?? '';

  const insideDecimal = /\d/u.test(before) && /\d/u.test(after);
  if (insideDecimal) {
    return false;
  }

  const rawToken = extractTokenBeforePeriod(text, periodIndex);
  const isAbbreviation = COMMON_ABBREVIATIONS.has(rawToken.toLowerCase());
  // The initial check uses the original casing; only the abbreviation lookup
  // is case-insensitive.
  const isInitial = rawToken.length === 1 && /\p{Lu}/u.test(rawToken);

  return !(isAbbreviation || isInitial);
}
124
+
125
/**
 * Computes the exclusive end index of a sentence whose terminator is at `index`.
 *
 * Starting right after `index`, the end is extended first over any further
 * sentence terminators (e.g. "?!") and then over any trailing closers
 * (quotes, brackets).
 *
 * @param text Text being scanned.
 * @param index Index of the first terminator.
 * @returns Index one past the last consumed character.
 */
function sentenceBoundaryEnd(text: string, index: number): number {
  let cursor = index + 1;

  // Terminators are consumed before closers; the order matters.
  for (const accepts of [isSentenceTerminator, isTrailingCloser]) {
    while (cursor < text.length && accepts(text[cursor] ?? '')) {
      cursor += 1;
    }
  }

  return cursor;
}
138
+
139
/**
 * Trims every segment and drops those that are empty after trimming.
 *
 * @param segments Raw text segments.
 * @returns A new array with the trimmed, non-empty segments in original order.
 */
function sanitizeSegments(segments: string[]): string[] {
  return segments.reduce<string[]>((kept, raw) => {
    const trimmed = raw.trim();
    if (trimmed.length > 0) {
      kept.push(trimmed);
    }
    return kept;
  }, []);
}
144
+
145
/**
 * Splits a sample budget across text segments in proportion to their length.
 *
 * Each segment's weight is its number of Unicode code points (at least 1).
 * Every segment first receives the floor of its proportional share; samples
 * left over from rounding are then handed out one at a time, starting with
 * the segments that had the largest fractional part.
 *
 * @param totalSamples Total number of samples; floored and clamped to >= 0.
 * @param segments Text segments to weight.
 * @returns One sample count per segment, or [] when there are no segments.
 */
function distributeSamplesByTextWeight(
  totalSamples: number,
  segments: string[]
): number[] {
  const segmentCount = segments.length;
  if (segmentCount === 0) {
    return [];
  }

  const budget = Math.max(0, Math.floor(totalSamples));
  if (budget === 0) {
    return new Array(segmentCount).fill(0);
  }

  const weights = segments.map((segment) =>
    Math.max(1, Array.from(segment).length)
  );
  let weightSum = 0;
  for (const weight of weights) {
    weightSum += weight;
  }
  if (weightSum <= 0) {
    return new Array(segmentCount).fill(0);
  }

  const exactShares = weights.map((weight) => (budget * weight) / weightSum);
  const allocation = exactShares.map((share) => Math.floor(share));

  let allocated = 0;
  for (const count of allocation) {
    allocated += count;
  }
  let leftover = budget - allocated;

  if (leftover > 0) {
    // Largest fractional remainder first.
    const byFraction = exactShares
      .map((share, index) => ({ index, fraction: share - Math.floor(share) }))
      .sort((a, b) => b.fraction - a.fraction);

    for (let turn = 0; leftover > 0 && byFraction.length > 0; turn += 1) {
      const pick = byFraction[turn % byFraction.length];
      if (pick == null) {
        break;
      }
      allocation[pick.index] = (allocation[pick.index] ?? 0) + 1;
      leftover -= 1;
    }
  }

  return allocation;
}
199
+
200
/**
 * Produces exactly one sample count per segment from a list of chunk counts.
 *
 * Counts are first sanitized (non-finite values become 0, others are floored
 * and clamped to >= 0). Then:
 * - equal lengths: the sanitized counts are returned as-is;
 * - more counts than segments: the surplus counts are summed into the last
 *   segment's count;
 * - fewer counts than segments: the total is redistributed across the
 *   segments by text weight.
 *
 * @param segments Text segments.
 * @param chunkSampleCounts Sample counts reported per chunk.
 * @returns Sample counts aligned with `segments`, or [] for no segments.
 */
function alignChunkCountsToSegments(
  segments: string[],
  chunkSampleCounts: number[]
): number[] {
  const wanted = segments.length;
  if (wanted === 0) {
    return [];
  }

  const sumOf = (values: number[]): number =>
    values.reduce((acc, value) => acc + value, 0);

  const sanitized = chunkSampleCounts.map((raw) => {
    if (!Number.isFinite(raw)) {
      return 0;
    }
    return Math.max(0, Math.floor(raw));
  });

  if (sanitized.length === wanted) {
    return sanitized;
  }

  if (sanitized.length < wanted) {
    return distributeSamplesByTextWeight(sumOf(sanitized), segments);
  }

  // More chunks than segments: fold the surplus into the final segment.
  const head = sanitized.slice(0, wanted);
  const overflow = sumOf(sanitized.slice(wanted));
  const tailIndex = head.length - 1;
  if (tailIndex >= 0) {
    head[tailIndex] = (head[tailIndex] ?? 0) + overflow;
  }
  return head;
}
231
+
232
/**
 * Raw alignment entry as accepted by `normalizeAlignmentItems`, which receives
 * the `words` / `chars` arrays returned by `SherpaOnnx.runCTCForcedAlignment`.
 */
type AlignmentNativeItem = {
  // Aligned text unit.
  text: string;
  // Start / end time of the unit.
  // NOTE(review): unit is presumably seconds — confirm against the native module.
  start: number;
  end: number;
};
237
+
238
/**
 * Splits an integer item budget across slots in proportion to their weights.
 *
 * Weights are floored and clamped to at least 1. Each slot gets the floor of
 * its proportional share; items left over from rounding are handed out one at
 * a time in slot order, starting from the first slot.
 *
 * @param total Number of items to distribute; floored and clamped to >= 0.
 * @param weights Relative weight of each slot.
 * @returns One count per weight, or [] when `weights` is empty.
 */
function distributeItemCounts(total: number, weights: number[]): number[] {
  const slotCount = weights.length;
  if (slotCount === 0) {
    return [];
  }

  const budget = Math.max(0, Math.floor(total));
  if (budget === 0) {
    return new Array(slotCount).fill(0);
  }

  const normalizedWeights = weights.map((weight) =>
    Math.max(1, Math.floor(weight))
  );
  const weightSum = normalizedWeights.reduce((acc, weight) => acc + weight, 0);
  if (weightSum <= 0) {
    return new Array(slotCount).fill(0);
  }

  const counts = normalizedWeights.map((weight) =>
    Math.floor((budget * weight) / weightSum)
  );
  let leftover = budget - counts.reduce((acc, count) => acc + count, 0);

  // Round-robin the rounding remainder from the first slot onward.
  for (let slot = 0; leftover > 0 && counts.length > 0; slot += 1) {
    const target = slot % counts.length;
    counts[target] = (counts[target] ?? 0) + 1;
    leftover -= 1;
  }

  return counts;
}
271
+
272
/**
 * Cleans raw alignment entries into subtitle items.
 *
 * For every entry: non-finite `start` / `end` become 0, negative values are
 * clamped to 0, and `end` is raised to `start` when it would precede it.
 * Entries whose text is empty after trimming are dropped; the text of kept
 * entries is passed through untrimmed.
 *
 * @param items Raw alignment entries.
 * @returns Normalized subtitle items in the original order.
 */
function normalizeAlignmentItems(
  items: AlignmentNativeItem[]
): TtsSubtitleItem[] {
  const clampTime = (value: number): number =>
    Number.isFinite(value) ? Math.max(0, value) : 0;

  return items.reduce<TtsSubtitleItem[]>((kept, item) => {
    const start = clampTime(item.start);
    const rawEnd = clampTime(item.end);
    const end = rawEnd >= start ? rawEnd : start;

    if (item.text.trim().length > 0) {
      kept.push({ text: item.text, start, end });
    }
    return kept;
  }, []);
}
287
+
288
/**
 * Groups aligned word timings into sentence-level subtitles.
 *
 * The text is split into sentences and the aligned words are apportioned to
 * them in proportion to each sentence's word count. A sentence spans from the
 * start of its first word to the end of its last word. A sentence that
 * receives no words becomes a zero-length item at the end time of the
 * previous sentence (or at the first word's start if none came before).
 * Finally, the last subtitle's end is extended to the last aligned word's end.
 *
 * @param text Full transcript.
 * @param alignedWords Word-level timings in spoken order.
 * @returns One subtitle per sentence, or [] when either input is empty.
 */
function buildSentenceSubtitlesFromAlignedWords(
  text: string,
  alignedWords: TtsSubtitleItem[]
): TtsSubtitleItem[] {
  const sentences = splitTextIntoSentences(text);
  if (sentences.length === 0 || alignedWords.length === 0) {
    return [];
  }

  const wordsPerSentence = distributeItemCounts(
    alignedWords.length,
    sentences.map((sentence) =>
      Math.max(1, splitTextIntoWords(sentence).length)
    )
  );

  let consumed = 0;
  let carryTime = alignedWords[0]?.start ?? 0;

  const subtitles: TtsSubtitleItem[] = sentences.map((sentence, position) => {
    const take = Math.max(0, wordsPerSentence[position] ?? 0);
    const words = alignedWords.slice(consumed, consumed + take);
    consumed += take;

    if (words.length === 0) {
      // No words assigned: pin the sentence to the running timestamp.
      return { text: sentence, start: carryTime, end: carryTime };
    }

    const start = words[0]?.start ?? carryTime;
    const end = words[words.length - 1]?.end ?? start;
    carryTime = end;
    return { text: sentence, start, end };
  });

  const finalWordEnd = alignedWords[alignedWords.length - 1]?.end;
  const tailIndex = subtitles.length - 1;
  const tail = subtitles[tailIndex];
  if (finalWordEnd !== undefined && subtitles.length > 0 && tail != null) {
    subtitles[tailIndex] = {
      ...tail,
      end: Math.max(tail.end, finalWordEnd),
    };
  }

  return subtitles;
}
349
+
350
/**
 * Reports whether `char` contains a character whose Unicode script is Han,
 * Hiragana, Katakana or Hangul.
 */
function isCjkChar(char: string): boolean {
  const cjkScripts =
    /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;
  return cjkScripts.test(char);
}
355
+
356
/**
 * Reports whether `char` separates words: whitespace, ASCII punctuation and
 * symbols, the horizontal ellipsis, and CJK punctuation (ideographic comma and
 * full stop plus the fullwidth comma, exclamation mark, question mark,
 * semicolon and colon).
 *
 * The fullwidth characters are written as `\u` escapes so that text
 * normalization cannot fold them into their ASCII equivalents, which would
 * leave fullwidth punctuation glued to neighbouring words.
 */
function isWordDelimiter(char: string): boolean {
  return /[\s.,!?;:()[\]{}"'`~<>/\\|@#$%^&*+=…\uFF0C。\uFF01\uFF1F\uFF1B\uFF1A、]/u.test(
    char
  );
}
359
+
360
/**
 * Splits text into trimmed sentences.
 *
 * A boundary is accepted at a sentence terminator when (a) a '.' passes
 * `shouldSplitOnPeriod`, and (b) the run of terminators and trailing closers
 * that follows is itself followed by whitespace or the end of the text.
 * Whitespace between sentences is discarded. Text after the last boundary is
 * returned as a final sentence.
 *
 * @param text Input text.
 * @returns The sentences in order; [] for blank input; the whole trimmed text
 *   as a single entry when no sentence was collected.
 */
export function splitTextIntoSentences(text: string): string[] {
  const source = text.trim();
  if (source.length === 0) {
    return [];
  }

  const collected: string[] = [];
  let sentenceStart = 0;
  let cursor = 0;

  while (cursor < source.length) {
    const char = source[cursor] ?? '';
    const isCandidate =
      isSentenceTerminator(char) &&
      (char !== '.' || shouldSplitOnPeriod(source, cursor));
    if (!isCandidate) {
      cursor += 1;
      continue;
    }

    const boundaryEnd = sentenceBoundaryEnd(source, cursor);
    const following = source[boundaryEnd];
    // A terminator glued to more text (e.g. "v1.x") is not a boundary.
    if (following !== undefined && !isWhitespaceChar(following)) {
      cursor += 1;
      continue;
    }

    const piece = source.slice(sentenceStart, boundaryEnd).trim();
    if (piece.length > 0) {
      collected.push(piece);
    }

    // Resume after the whitespace that separates the sentences.
    sentenceStart = boundaryEnd;
    while (
      sentenceStart < source.length &&
      isWhitespaceChar(source[sentenceStart] ?? '')
    ) {
      sentenceStart += 1;
    }
    cursor = sentenceStart;
  }

  const remainder = source.slice(sentenceStart).trim();
  if (remainder.length > 0) {
    collected.push(remainder);
  }

  return collected.length > 0 ? collected : [source];
}
411
+
412
/**
 * Splits text into word tokens.
 *
 * The text is walked one code point at a time: whitespace and word delimiters
 * end the current token and are dropped; each CJK character ends the current
 * token and is emitted as its own token; everything else is appended to the
 * current token.
 *
 * @param text Input text.
 * @returns The tokens in order; [] for blank input; the whole trimmed text as
 *   a single entry when no token was produced (e.g. punctuation only).
 */
export function splitTextIntoWords(text: string): string[] {
  const source = text.trim();
  if (source.length === 0) {
    return [];
  }

  const tokens: string[] = [];
  let pending = '';

  const commitPending = (): void => {
    const token = pending.trim();
    pending = '';
    if (token.length > 0) {
      tokens.push(token);
    }
  };

  for (const char of source) {
    if (isWhitespaceChar(char)) {
      commitPending();
    } else if (isCjkChar(char)) {
      commitPending();
      tokens.push(char);
    } else if (isWordDelimiter(char)) {
      commitPending();
    } else {
      pending += char;
    }
  }
  commitPending();

  return tokens.length > 0 ? tokens : [source];
}
453
+
454
/**
 * Builds a back-to-back subtitle timeline from text segments and the number
 * of audio samples attributed to each.
 *
 * Segments are trimmed and empty ones removed, the sample counts are aligned
 * to the remaining segments, and each segment then occupies the next
 * `count / sampleRate` seconds. Zero-sample segments at the very start of the
 * timeline (while the running offset is still 0) are omitted.
 *
 * @param segments Text segments in playback order.
 * @param chunkSampleCounts Sample counts per chunk.
 * @param sampleRate Samples per second; must be finite and > 0.
 * @returns Subtitle items with times in seconds, or [] when the sample rate is
 *   unusable or no non-empty segment remains.
 */
export function buildSubtitlesFromChunks(
  segments: string[],
  chunkSampleCounts: number[],
  sampleRate: number
): TtsSubtitleItem[] {
  const rateIsUsable = Number.isFinite(sampleRate) && sampleRate > 0;
  if (!rateIsUsable) {
    return [];
  }

  const texts = sanitizeSegments(segments);
  if (texts.length === 0) {
    return [];
  }

  const counts = alignChunkCountsToSegments(texts, chunkSampleCounts);
  const timeline: TtsSubtitleItem[] = [];
  let cursorSamples = 0;

  texts.forEach((segmentText, position) => {
    const span = Math.max(0, counts[position] ?? 0);
    if (span === 0 && cursorSamples === 0) {
      return;
    }

    const startSamples = cursorSamples;
    cursorSamples += span;

    timeline.push({
      text: segmentText ?? '',
      start: startSamples / sampleRate,
      end: cursorSamples / sampleRate,
    });
  });

  return timeline;
}
495
+
496
/**
 * Builds a subtitle timeline for a known transcript and its audio.
 *
 * Two modes are available via `options.mode`:
 * - `'accurate'`: runs CTC forced alignment (`SherpaOnnx.runCTCForcedAlignment`)
 *   with the model at `options.alignmentModelPath` and returns character,
 *   word or sentence items with `timingMode: 'aligned'`. When raw samples are
 *   passed, they are first written to a temporary WAV file, which is removed
 *   afterwards on a best-effort basis.
 * - any other mode (`'fast'`): splits the transcript into words or sentences
 *   and spreads the audio duration over them by text length, returning
 *   `timingMode: 'estimated'`.
 *
 * For accurate mode, a model path can be pre-validated with
 * `detectAlignmentModel` from `react-native-sherpa-onnx/alignment`.
 *
 * @param text Transcript to time.
 * @param audioPathOrSamples Audio file path, or in-memory samples with their
 *   sample rate.
 * @param options Mode, optional granularity (defaults to 'sentence') and, for
 *   accurate mode, the alignment model path.
 * @returns The subtitle items and how their timing was obtained.
 * @throws Error when character granularity is requested outside accurate mode,
 *   or (message prefixed `ALIGNMENT_MODEL_MISSING`) when accurate mode is
 *   requested without a non-blank `options.alignmentModelPath`.
 */
export async function generateSubtitlesFromAudio(
  text: string,
  audioPathOrSamples: string | { samples: number[]; sampleRate: number },
  options: SubtitleFromAudioOptions
): Promise<SubtitleResult> {
  const { mode } = options;
  const granularity = options.granularity ?? 'sentence';

  assertSubtitleGranularityForMode(mode, granularity);

  if (mode === 'accurate') {
    const modelPath = options.alignmentModelPath?.trim();
    if (!modelPath) {
      throw new Error(
        'ALIGNMENT_MODEL_MISSING: Provide options.alignmentModelPath for accurate subtitles.'
      );
    }

    // In-memory samples are spilled to a temp WAV that this call owns.
    const ownsTempFile = typeof audioPathOrSamples !== 'string';
    const audioPath =
      typeof audioPathOrSamples === 'string'
        ? audioPathOrSamples
        : await saveAlignmentAudioToTempWav(audioPathOrSamples);

    try {
      const aligned = await SherpaOnnx.runCTCForcedAlignment(
        modelPath,
        audioPath,
        text,
        WAV2VEC2_VOCAB_JSON
      );

      const wordItems = normalizeAlignmentItems(aligned.words ?? []);
      const charItems = normalizeAlignmentItems(aligned.chars ?? []);

      let subtitles: TtsSubtitleItem[];
      if (granularity === 'character') {
        subtitles = charItems;
      } else if (granularity === 'word') {
        subtitles = wordItems;
      } else {
        subtitles = buildSentenceSubtitlesFromAlignedWords(text, wordItems);
      }

      return { subtitles, timingMode: 'aligned' };
    } finally {
      if (ownsTempFile) {
        unlink(audioPath).catch(() => {
          // ignore cleanup errors
        });
      }
    }
  }

  // Estimated timing: weight the audio duration by segment text length.
  const segments =
    granularity === 'word'
      ? splitTextIntoWords(text)
      : splitTextIntoSentences(text);

  if (segments.length === 0) {
    return { subtitles: [], timingMode: 'estimated' };
  }

  const audio =
    typeof audioPathOrSamples === 'string'
      ? await decodeAudioFileToFloatSamples(audioPathOrSamples)
      : audioPathOrSamples;
  const totalSamples = audio.samples.length;
  const sampleRate = audio.sampleRate;

  const audioIsUsable =
    Number.isFinite(sampleRate) && sampleRate > 0 && totalSamples > 0;
  if (!audioIsUsable) {
    return { subtitles: [], timingMode: 'estimated' };
  }

  const chunkSampleCounts = distributeSamplesByTextWeight(
    totalSamples,
    segments
  );
  const subtitles = buildSubtitlesFromChunks(
    segments,
    chunkSampleCounts,
    sampleRate
  );

  return { subtitles, timingMode: 'estimated' };
}
@@ -0,0 +1,31 @@
1
+ import { DocumentDirectoryPath, mkdir } from '@dr.pogodin/react-native-fs';
2
+ import SherpaOnnx from '../NativeSherpaOnnx';
3
+ import type { GeneratedAudio } from './types';
4
+
5
/**
 * Builds a unique path for a temporary alignment WAV file under
 * `<DocumentDirectoryPath>/sherpa-onnx/cache`.
 *
 * The file name is `[<instanceId>-]alignment-<timestamp>-<random>.wav`.
 * `instanceId` is trimmed, and any path separators in it are replaced with
 * '_' so the file always lands directly in the cache directory instead of a
 * subdirectory that was never created.
 *
 * @param instanceId Optional identifier used as a file-name prefix; ignored
 *   when missing or blank.
 * @returns The path with repeated '/' collapsed to a single '/'.
 */
function createTempAlignmentWavPath(instanceId?: string): string {
  const nonce = `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
  const safeId = (instanceId ?? '').trim().replace(/[\\/]+/g, '_');
  const prefix = safeId.length > 0 ? `${safeId}-` : '';
  return `${DocumentDirectoryPath}/sherpa-onnx/cache/${prefix}alignment-${nonce}.wav`.replace(
    /\/+/g,
    '/'
  );
}
13
+
14
/**
 * Writes generated audio to a fresh temporary WAV file for alignment.
 *
 * Calls `mkdir` on `<DocumentDirectoryPath>/sherpa-onnx/cache`, then saves the
 * samples through `SherpaOnnx.saveTtsAudioToFile`. The caller is responsible
 * for deleting the returned file.
 *
 * @param audio Samples and sample rate to persist.
 * @param instanceId Optional identifier used as a file-name prefix.
 * @returns The path of the written WAV file.
 */
export async function saveAlignmentAudioToTempWav(
  audio: GeneratedAudio,
  instanceId?: string
): Promise<string> {
  const cacheDir = [DocumentDirectoryPath, 'sherpa-onnx', 'cache']
    .join('/')
    .replace(/\/+/g, '/');
  await mkdir(cacheDir);

  const wavPath = createTempAlignmentWavPath(instanceId);
  const { samples, sampleRate } = audio;
  await SherpaOnnx.saveTtsAudioToFile(samples, sampleRate, wavPath);

  return wavPath;
}