react-native-sherpa-onnx 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/README.md +3 -0
  2. package/android/src/main/assets/model_licenses/alignment-models-license-status.csv +5 -0
  3. package/android/src/main/cpp/CMakeLists.txt +3 -0
  4. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.cpp +66 -0
  5. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.h +17 -0
  6. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-alignment.cpp +108 -0
  7. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +30 -0
  8. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.cpp +66 -0
  9. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.h +30 -0
  10. package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +21 -0
  11. package/android/src/main/java/com/sherpaonnx/SherpaOnnxAlignmentHelper.kt +555 -0
  12. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +76 -0
  13. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTextSegmenter.kt +330 -0
  14. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +180 -23
  15. package/ios/Resources/model_licenses/alignment-models-license-status.csv +5 -0
  16. package/ios/SherpaOnnx+Alignment.mm +704 -0
  17. package/ios/SherpaOnnx+STT.mm +6 -0
  18. package/ios/SherpaOnnx+TTS.mm +624 -50
  19. package/ios/model_detect/sherpa-onnx-model-detect-alignment.mm +108 -0
  20. package/ios/model_detect/sherpa-onnx-model-detect.h +31 -0
  21. package/ios/model_detect/sherpa-onnx-validate-alignment.h +30 -0
  22. package/ios/model_detect/sherpa-onnx-validate-alignment.mm +66 -0
  23. package/ios/stt/sherpa-onnx-stt-wrapper.h +3 -1
  24. package/ios/stt/sherpa-onnx-stt-wrapper.mm +6 -0
  25. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  26. package/lib/module/alignment/index.js +27 -0
  27. package/lib/module/alignment/index.js.map +1 -0
  28. package/lib/module/alignment/types.js +2 -0
  29. package/lib/module/alignment/types.js.map +1 -0
  30. package/lib/module/alignment/vocab.js +40 -0
  31. package/lib/module/alignment/vocab.js.map +1 -0
  32. package/lib/module/download/paths.js +9 -1
  33. package/lib/module/download/paths.js.map +1 -1
  34. package/lib/module/download/registry.js +17 -1
  35. package/lib/module/download/registry.js.map +1 -1
  36. package/lib/module/download/types.js +1 -0
  37. package/lib/module/download/types.js.map +1 -1
  38. package/lib/module/index.js +6 -4
  39. package/lib/module/index.js.map +1 -1
  40. package/lib/module/licenses.js +8 -2
  41. package/lib/module/licenses.js.map +1 -1
  42. package/lib/module/stt/types.js.map +1 -1
  43. package/lib/module/tts/index.js +68 -2
  44. package/lib/module/tts/index.js.map +1 -1
  45. package/lib/module/tts/subtitles.js +400 -0
  46. package/lib/module/tts/subtitles.js.map +1 -0
  47. package/lib/module/tts/tempAudio.js +17 -0
  48. package/lib/module/tts/tempAudio.js.map +1 -0
  49. package/lib/module/tts/types.js.map +1 -1
  50. package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -3
  51. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  52. package/lib/typescript/src/alignment/index.d.ts +8 -0
  53. package/lib/typescript/src/alignment/index.d.ts.map +1 -0
  54. package/lib/typescript/src/alignment/types.d.ts +23 -0
  55. package/lib/typescript/src/alignment/types.d.ts.map +1 -0
  56. package/lib/typescript/src/alignment/vocab.d.ts +5 -0
  57. package/lib/typescript/src/alignment/vocab.d.ts.map +1 -0
  58. package/lib/typescript/src/download/paths.d.ts +5 -2
  59. package/lib/typescript/src/download/paths.d.ts.map +1 -1
  60. package/lib/typescript/src/download/registry.d.ts.map +1 -1
  61. package/lib/typescript/src/download/types.d.ts +2 -1
  62. package/lib/typescript/src/download/types.d.ts.map +1 -1
  63. package/lib/typescript/src/index.d.ts +1 -0
  64. package/lib/typescript/src/index.d.ts.map +1 -1
  65. package/lib/typescript/src/licenses.d.ts.map +1 -1
  66. package/lib/typescript/src/stt/types.d.ts +5 -2
  67. package/lib/typescript/src/stt/types.d.ts.map +1 -1
  68. package/lib/typescript/src/tts/index.d.ts +2 -1
  69. package/lib/typescript/src/tts/index.d.ts.map +1 -1
  70. package/lib/typescript/src/tts/subtitles.d.ts +24 -0
  71. package/lib/typescript/src/tts/subtitles.d.ts.map +1 -0
  72. package/lib/typescript/src/tts/tempAudio.d.ts +3 -0
  73. package/lib/typescript/src/tts/tempAudio.d.ts.map +1 -0
  74. package/lib/typescript/src/tts/types.d.ts +68 -2
  75. package/lib/typescript/src/tts/types.d.ts.map +1 -1
  76. package/package.json +6 -1
  77. package/scripts/alignment-models/README.md +90 -0
  78. package/scripts/alignment-models/build_and_upload.js +724 -0
  79. package/scripts/alignment-models/sources.csv +5 -0
  80. package/scripts/alignment-models/sync_alignment_license_status.js +123 -0
  81. package/src/NativeSherpaOnnx.ts +35 -3
  82. package/src/alignment/index.ts +41 -0
  83. package/src/alignment/types.ts +22 -0
  84. package/src/alignment/vocab.ts +38 -0
  85. package/src/download/paths.ts +18 -5
  86. package/src/download/registry.ts +23 -3
  87. package/src/download/types.ts +1 -0
  88. package/src/index.tsx +6 -4
  89. package/src/licenses.ts +12 -1
  90. package/src/stt/types.ts +5 -2
  91. package/src/tts/index.ts +110 -3
  92. package/src/tts/subtitles.ts +611 -0
  93. package/src/tts/tempAudio.ts +31 -0
  94. package/src/tts/types.ts +79 -2
  95. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
@@ -20,6 +20,8 @@
20
20
  #include <optional>
21
21
  #include <sstream>
22
22
  #include <string>
23
+ #include <cmath>
24
+ #include <set>
23
25
  #include <unordered_map>
24
26
  #include <vector>
25
27
  #include <chrono>
@@ -64,17 +66,553 @@ static NSString *ttsModelKindToNSString(sherpaonnx::TtsModelKind kind) {
64
66
  }
65
67
 
66
68
  namespace {
67
- std::vector<std::string> SplitTtsTokens(const std::string &text) {
68
- std::vector<std::string> tokens;
69
- std::istringstream iss(text);
70
- std::string token;
71
- while (iss >> token) {
72
- tokens.push_back(token);
69
+ struct SubtitleTimingItem {
70
+ std::string text;
71
+ double start = 0.0;
72
+ double end = 0.0;
73
+ };
74
+
75
+ static bool IsSentenceTerminator(unichar c) {
76
+ switch (c) {
77
+ case '.':
78
+ case '!':
79
+ case '?':
80
+ case ';':
81
+ case 0x3002: // 。
82
+ case 0xFF01: // !
83
+ case 0xFF1F: // ?
84
+ case 0xFF1B: // ;
85
+ return true;
86
+ default:
87
+ return false;
88
+ }
89
+ }
90
+
91
+ static bool IsTrailingCloser(unichar c) {
92
+ switch (c) {
93
+ case '"':
94
+ case '\'':
95
+ case ')':
96
+ case ']':
97
+ case '}':
98
+ case '>':
99
+ case 0x201D: // ”
100
+ case 0x2019: // ’
101
+ case 0x300D: // 」
102
+ case 0x300F: // 』
103
+ case 0x3011: // 】
104
+ case 0xFF09: // )
105
+ return true;
106
+ default:
107
+ return false;
108
+ }
109
+ }
110
+
111
+ static bool IsWordDelimiter(unichar c) {
112
+ switch (c) {
113
+ case '.':
114
+ case ',':
115
+ case '!':
116
+ case '?':
117
+ case ';':
118
+ case ':':
119
+ case '(':
120
+ case ')':
121
+ case '[':
122
+ case ']':
123
+ case '{':
124
+ case '}':
125
+ case '"':
126
+ case '\'':
127
+ case '`':
128
+ case '~':
129
+ case '<':
130
+ case '>':
131
+ case '/':
132
+ case '\\':
133
+ case '|':
134
+ case '@':
135
+ case '#':
136
+ case '$':
137
+ case '%':
138
+ case '^':
139
+ case '&':
140
+ case '*':
141
+ case '+':
142
+ case '=':
143
+ case 0x2026: // …
144
+ case 0xFF0C: // ,
145
+ case 0x3002: // 。
146
+ case 0xFF01: // !
147
+ case 0xFF1F: // ?
148
+ case 0xFF1B: // ;
149
+ case 0xFF1A: // :
150
+ case 0x3001: // 、
151
+ return true;
152
+ default:
153
+ return false;
154
+ }
155
+ }
156
+
157
+ static bool IsCjkCodepoint(unichar c) {
158
+ return (c >= 0x4E00 && c <= 0x9FFF) ||
159
+ (c >= 0x3400 && c <= 0x4DBF) ||
160
+ (c >= 0x3040 && c <= 0x309F) ||
161
+ (c >= 0x30A0 && c <= 0x30FF) ||
162
+ (c >= 0xAC00 && c <= 0xD7AF);
163
+ }
164
+
165
+ static NSString *ExtractTokenBeforePeriod(NSString *text, NSInteger periodIndex) {
166
+ if (text == nil || text.length == 0 || periodIndex <= 0) {
167
+ return @"";
168
+ }
169
+
170
+ NSCharacterSet *ws = [NSCharacterSet whitespaceAndNewlineCharacterSet];
171
+ NSCharacterSet *letters = [NSCharacterSet letterCharacterSet];
172
+
173
+ NSInteger i = periodIndex - 1;
174
+ while (i >= 0 && [ws characterIsMember:[text characterAtIndex:i]]) {
175
+ i -= 1;
176
+ }
177
+
178
+ NSInteger end = i;
179
+ while (i >= 0) {
180
+ unichar c = [text characterAtIndex:i];
181
+ if ([letters characterIsMember:c] || c == '.') {
182
+ i -= 1;
183
+ continue;
184
+ }
185
+ break;
186
+ }
187
+
188
+ if (end < i + 1) {
189
+ return @"";
190
+ }
191
+
192
+ NSString *token = [text substringWithRange:NSMakeRange(i + 1, end - i)];
193
+ while (token.length > 0 && [token characterAtIndex:token.length - 1] == '.') {
194
+ token = [token substringToIndex:token.length - 1];
195
+ }
196
+ return token;
197
+ }
198
+
199
+ static bool ShouldSplitOnPeriod(NSString *text, NSInteger periodIndex) {
200
+ if (text == nil || periodIndex < 0 || periodIndex >= text.length) {
201
+ return true;
73
202
  }
74
- if (tokens.empty() && !text.empty()) {
75
- tokens.push_back(text);
203
+
204
+ NSCharacterSet *digits = [NSCharacterSet decimalDigitCharacterSet];
205
+ if (periodIndex > 0 && periodIndex + 1 < text.length) {
206
+ unichar prev = [text characterAtIndex:periodIndex - 1];
207
+ unichar next = [text characterAtIndex:periodIndex + 1];
208
+ if ([digits characterIsMember:prev] && [digits characterIsMember:next]) {
209
+ return false;
210
+ }
76
211
  }
77
- return tokens;
212
+
213
+ static const std::set<std::string> kAbbreviations = {
214
+ "mr", "mrs", "ms", "dr", "prof", "sr", "jr", "st", "vs", "etc", "e.g", "i.e"
215
+ };
216
+
217
+ NSString *tokenRaw = [ExtractTokenBeforePeriod(text, periodIndex)
218
+ stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
219
+ NSString *tokenLower = [tokenRaw lowercaseString];
220
+ std::string tokenUtf8 = tokenLower != nil ? std::string([tokenLower UTF8String]) : std::string();
221
+ if (!tokenUtf8.empty() && kAbbreviations.find(tokenUtf8) != kAbbreviations.end()) {
222
+ return false;
223
+ }
224
+
225
+ // Likely initial, e.g. "A. Smith" — use original case; tokenLower cannot match uppercaseLetter.
226
+ if (tokenRaw.length == 1) {
227
+ NSCharacterSet *upper = [NSCharacterSet uppercaseLetterCharacterSet];
228
+ if ([upper characterIsMember:[tokenRaw characterAtIndex:0]]) {
229
+ return false;
230
+ }
231
+ }
232
+
233
+ return true;
234
+ }
235
+
236
+ static NSInteger SentenceBoundaryEnd(NSString *text, NSInteger index) {
237
+ NSInteger end = index + 1;
238
+ while (end < text.length && IsSentenceTerminator([text characterAtIndex:end])) {
239
+ end += 1;
240
+ }
241
+ while (end < text.length && IsTrailingCloser([text characterAtIndex:end])) {
242
+ end += 1;
243
+ }
244
+ return end;
245
+ }
246
+
247
+ static std::vector<std::string> SanitizeSegments(const std::vector<std::string> &segments) {
248
+ std::vector<std::string> cleaned;
249
+ cleaned.reserve(segments.size());
250
+ for (const auto &segment : segments) {
251
+ NSString *s = [NSString stringWithUTF8String:segment.c_str()];
252
+ if (s == nil) {
253
+ continue;
254
+ }
255
+ NSString *trimmed = [s stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
256
+ if (trimmed.length == 0) {
257
+ continue;
258
+ }
259
+ cleaned.emplace_back([trimmed UTF8String]);
260
+ }
261
+ return cleaned;
262
+ }
263
+
264
+ static int32_t TextWeight(const std::string &text) {
265
+ NSString *s = [NSString stringWithUTF8String:text.c_str()];
266
+ if (s == nil || s.length == 0) {
267
+ return 1;
268
+ }
269
+ return static_cast<int32_t>(MAX(1, static_cast<int32_t>(s.length)));
270
+ }
271
+
272
+ static std::vector<int32_t> DistributeSamplesByTextWeight(
273
+ int32_t totalSamples,
274
+ const std::vector<std::string> &segments
275
+ ) {
276
+ if (segments.empty()) {
277
+ return {};
278
+ }
279
+
280
+ int32_t safeTotal = std::max<int32_t>(0, totalSamples);
281
+ if (safeTotal == 0) {
282
+ return std::vector<int32_t>(segments.size(), 0);
283
+ }
284
+
285
+ std::vector<int32_t> weights;
286
+ weights.reserve(segments.size());
287
+ int32_t weightSum = 0;
288
+ for (const auto &segment : segments) {
289
+ int32_t w = std::max<int32_t>(1, TextWeight(segment));
290
+ weights.push_back(w);
291
+ weightSum += w;
292
+ }
293
+ if (weightSum <= 0) {
294
+ return std::vector<int32_t>(segments.size(), 0);
295
+ }
296
+
297
+ std::vector<int32_t> base(segments.size(), 0);
298
+ std::vector<std::pair<size_t, double>> fractions;
299
+ fractions.reserve(segments.size());
300
+
301
+ for (size_t i = 0; i < segments.size(); ++i) {
302
+ double exact = (static_cast<double>(safeTotal) * static_cast<double>(weights[i])) / static_cast<double>(weightSum);
303
+ int32_t floorValue = static_cast<int32_t>(std::floor(exact));
304
+ base[i] = floorValue;
305
+ fractions.emplace_back(i, exact - static_cast<double>(floorValue));
306
+ }
307
+
308
+ int32_t assigned = 0;
309
+ for (auto v : base) {
310
+ assigned += v;
311
+ }
312
+
313
+ int32_t remaining = safeTotal - assigned;
314
+ if (remaining > 0) {
315
+ std::sort(
316
+ fractions.begin(),
317
+ fractions.end(),
318
+ [](const auto &a, const auto &b) { return a.second > b.second; }
319
+ );
320
+
321
+ size_t ptr = 0;
322
+ while (remaining > 0 && !fractions.empty()) {
323
+ size_t idx = fractions[ptr % fractions.size()].first;
324
+ base[idx] += 1;
325
+ remaining -= 1;
326
+ ptr += 1;
327
+ }
328
+ }
329
+
330
+ return base;
331
+ }
332
+
333
+ static std::vector<int32_t> AlignChunkCountsToSegments(
334
+ const std::vector<std::string> &segments,
335
+ const std::vector<int32_t> &chunkSampleCounts
336
+ ) {
337
+ if (segments.empty()) {
338
+ return {};
339
+ }
340
+
341
+ std::vector<int32_t> counts;
342
+ counts.reserve(chunkSampleCounts.size());
343
+ for (auto value : chunkSampleCounts) {
344
+ counts.push_back(std::max<int32_t>(0, value));
345
+ }
346
+
347
+ if (counts.size() == segments.size()) {
348
+ return counts;
349
+ }
350
+
351
+ if (counts.size() > segments.size()) {
352
+ std::vector<int32_t> merged(counts.begin(), counts.begin() + static_cast<long>(segments.size()));
353
+ int32_t extra = 0;
354
+ for (size_t i = segments.size(); i < counts.size(); ++i) {
355
+ extra += counts[i];
356
+ }
357
+ if (!merged.empty()) {
358
+ merged.back() += extra;
359
+ }
360
+ return merged;
361
+ }
362
+
363
+ int32_t total = 0;
364
+ for (auto value : counts) {
365
+ total += value;
366
+ }
367
+ return DistributeSamplesByTextWeight(total, segments);
368
+ }
369
+
370
+ static std::vector<SubtitleTimingItem> BuildSubtitlesFromChunks(
371
+ const std::vector<std::string> &segments,
372
+ const std::vector<int32_t> &chunkSampleCounts,
373
+ int32_t sampleRate
374
+ ) {
375
+ if (sampleRate <= 0) {
376
+ return {};
377
+ }
378
+
379
+ std::vector<std::string> cleaned = SanitizeSegments(segments);
380
+ if (cleaned.empty()) {
381
+ return {};
382
+ }
383
+
384
+ std::vector<int32_t> aligned = AlignChunkCountsToSegments(cleaned, chunkSampleCounts);
385
+
386
+ std::vector<SubtitleTimingItem> out;
387
+ out.reserve(cleaned.size());
388
+ int64_t offsetSamples = 0;
389
+
390
+ for (size_t i = 0; i < cleaned.size(); ++i) {
391
+ int32_t count = i < aligned.size() ? std::max<int32_t>(0, aligned[i]) : 0;
392
+ if (count == 0 && offsetSamples == 0) {
393
+ continue;
394
+ }
395
+
396
+ double start = static_cast<double>(offsetSamples) / static_cast<double>(sampleRate);
397
+ offsetSamples += count;
398
+ double end = static_cast<double>(offsetSamples) / static_cast<double>(sampleRate);
399
+
400
+ out.push_back(SubtitleTimingItem{cleaned[i], start, end});
401
+ }
402
+
403
+ return out;
404
+ }
405
+
406
+ static std::vector<SubtitleTimingItem> BuildWordSubtitlesFromSentenceChunks(
407
+ const std::vector<std::string> &sentences,
408
+ const std::vector<int32_t> &sentenceChunkSampleCounts,
409
+ int32_t sampleRate
410
+ );
411
+
412
+ static std::vector<std::string> SplitTextIntoSentences(const std::string &text) {
413
+ NSString *source = [NSString stringWithUTF8String:text.c_str()];
414
+ if (source == nil) {
415
+ return {};
416
+ }
417
+ NSString *normalized = [source stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
418
+ if (normalized.length == 0) {
419
+ return {};
420
+ }
421
+
422
+ NSCharacterSet *ws = [NSCharacterSet whitespaceAndNewlineCharacterSet];
423
+ NSMutableArray<NSString *> *items = [NSMutableArray array];
424
+
425
+ NSInteger start = 0;
426
+ NSInteger i = 0;
427
+
428
+ while (i < normalized.length) {
429
+ unichar current = [normalized characterAtIndex:i];
430
+ if (!IsSentenceTerminator(current)) {
431
+ i += 1;
432
+ continue;
433
+ }
434
+
435
+ if (current == '.' && !ShouldSplitOnPeriod(normalized, i)) {
436
+ i += 1;
437
+ continue;
438
+ }
439
+
440
+ NSInteger end = SentenceBoundaryEnd(normalized, i);
441
+ if (end < normalized.length && ![ws characterIsMember:[normalized characterAtIndex:end]]) {
442
+ i += 1;
443
+ continue;
444
+ }
445
+
446
+ NSString *segment = [[normalized substringWithRange:NSMakeRange(start, end - start)]
447
+ stringByTrimmingCharactersInSet:ws];
448
+ if (segment.length > 0) {
449
+ [items addObject:segment];
450
+ }
451
+
452
+ start = end;
453
+ while (start < normalized.length && [ws characterIsMember:[normalized characterAtIndex:start]]) {
454
+ start += 1;
455
+ }
456
+ i = start;
457
+ }
458
+
459
+ if (start < normalized.length) {
460
+ NSString *tail = [[normalized substringFromIndex:start] stringByTrimmingCharactersInSet:ws];
461
+ if (tail.length > 0) {
462
+ [items addObject:tail];
463
+ }
464
+ }
465
+
466
+ std::vector<std::string> out;
467
+ out.reserve(items.count);
468
+ for (NSString *segment in items) {
469
+ out.emplace_back([segment UTF8String]);
470
+ }
471
+
472
+ if (out.empty()) {
473
+ out.emplace_back([normalized UTF8String]);
474
+ }
475
+ return out;
476
+ }
477
+
478
+ static std::vector<std::string> SplitTextIntoWords(const std::string &text) {
479
+ NSString *source = [NSString stringWithUTF8String:text.c_str()];
480
+ if (source == nil) {
481
+ return {};
482
+ }
483
+ NSString *normalized = [source stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
484
+ if (normalized.length == 0) {
485
+ return {};
486
+ }
487
+
488
+ NSCharacterSet *ws = [NSCharacterSet whitespaceAndNewlineCharacterSet];
489
+ NSMutableArray<NSString *> *items = [NSMutableArray array];
490
+ NSMutableString *current = [NSMutableString string];
491
+
492
+ void (^flushCurrent)(void) = ^{
493
+ NSString *token = [current stringByTrimmingCharactersInSet:ws];
494
+ if (token.length > 0) {
495
+ [items addObject:token];
496
+ }
497
+ [current setString:@""];
498
+ };
499
+
500
+ for (NSInteger i = 0; i < normalized.length; ++i) {
501
+ unichar c = [normalized characterAtIndex:i];
502
+ if ([ws characterIsMember:c]) {
503
+ flushCurrent();
504
+ continue;
505
+ }
506
+ if (IsCjkCodepoint(c)) {
507
+ flushCurrent();
508
+ [items addObject:[NSString stringWithCharacters:&c length:1]];
509
+ continue;
510
+ }
511
+ if (IsWordDelimiter(c)) {
512
+ flushCurrent();
513
+ continue;
514
+ }
515
+ [current appendFormat:@"%C", c];
516
+ }
517
+
518
+ flushCurrent();
519
+
520
+ std::vector<std::string> out;
521
+ out.reserve(items.count);
522
+ for (NSString *segment in items) {
523
+ out.emplace_back([segment UTF8String]);
524
+ }
525
+ if (out.empty()) {
526
+ out.emplace_back([normalized UTF8String]);
527
+ }
528
+ return out;
529
+ }
530
+
531
+ static std::vector<SubtitleTimingItem> BuildWordSubtitlesFromSentenceChunks(
532
+ const std::vector<std::string> &sentences,
533
+ const std::vector<int32_t> &sentenceChunkSampleCounts,
534
+ int32_t sampleRate
535
+ ) {
536
+ std::vector<std::string> cleanedSentences = SanitizeSegments(sentences);
537
+ if (cleanedSentences.empty()) {
538
+ return {};
539
+ }
540
+
541
+ std::vector<int32_t> alignedSentenceCounts = AlignChunkCountsToSegments(
542
+ cleanedSentences,
543
+ sentenceChunkSampleCounts
544
+ );
545
+
546
+ std::vector<std::string> wordSegments;
547
+ std::vector<int32_t> wordChunkCounts;
548
+
549
+ for (size_t i = 0; i < cleanedSentences.size(); ++i) {
550
+ int32_t sentenceSamples = i < alignedSentenceCounts.size()
551
+ ? std::max<int32_t>(0, alignedSentenceCounts[i])
552
+ : 0;
553
+ std::vector<std::string> words = SplitTextIntoWords(cleanedSentences[i]);
554
+ if (words.empty()) {
555
+ continue;
556
+ }
557
+
558
+ std::vector<int32_t> distributed = DistributeSamplesByTextWeight(sentenceSamples, words);
559
+ for (size_t j = 0; j < words.size(); ++j) {
560
+ wordSegments.push_back(words[j]);
561
+ wordChunkCounts.push_back(j < distributed.size() ? distributed[j] : 0);
562
+ }
563
+ }
564
+
565
+ return BuildSubtitlesFromChunks(wordSegments, wordChunkCounts, sampleRate);
566
+ }
567
+
568
+ static NSMutableArray *SubtitleTimingsToNSArray(const std::vector<SubtitleTimingItem> &items) {
569
+ NSMutableArray *array = [NSMutableArray arrayWithCapacity:items.size()];
570
+ for (const auto &item : items) {
571
+ NSString *text = [NSString stringWithUTF8String:item.text.c_str()] ?: @"";
572
+ NSDictionary *entry = @{
573
+ @"text": text,
574
+ @"start": @(item.start),
575
+ @"end": @(item.end)
576
+ };
577
+ [array addObject:entry];
578
+ }
579
+ return array;
580
+ }
581
+
582
+ static NSString *SubtitleModeFromOptions(NSDictionary *options) {
583
+ NSString *raw = [options[@"subtitleMode"] isKindOfClass:[NSString class]] ? options[@"subtitleMode"] : nil;
584
+ NSString *normalized = raw != nil
585
+ ? [[raw lowercaseString] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]]
586
+ : @"fast";
587
+
588
+ if ([normalized isEqualToString:@"off"] ||
589
+ [normalized isEqualToString:@"fast"] ||
590
+ [normalized isEqualToString:@"accurate"]) {
591
+ return normalized;
592
+ }
593
+
594
+ return @"fast";
595
+ }
596
+
597
+ static NSString *SubtitleGranularityFromOptions(NSDictionary *options) {
598
+ NSString *raw = [options[@"subtitleGranularity"] isKindOfClass:[NSString class]] ? options[@"subtitleGranularity"] : nil;
599
+ NSString *normalized = raw != nil
600
+ ? [[raw lowercaseString] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]]
601
+ : @"sentence";
602
+
603
+ if ([normalized isEqualToString:@"word"] || [normalized isEqualToString:@"sentence"]) {
604
+ return normalized;
605
+ }
606
+
607
+ return @"sentence";
608
+ }
609
+
610
+ static bool IsCharacterGranularityRequested(NSDictionary *options) {
611
+ NSString *raw = [options[@"subtitleGranularity"] isKindOfClass:[NSString class]] ? options[@"subtitleGranularity"] : nil;
612
+ NSString *normalized = raw != nil
613
+ ? [[raw lowercaseString] stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]]
614
+ : @"";
615
+ return [normalized isEqualToString:@"character"];
78
616
  }
79
617
 
80
618
  /** When options omit numSteps, matches Android SherpaOnnxTtsHelper / upstream GenerationConfig default. */
@@ -536,6 +1074,14 @@ static bool NSDictionaryHasValidReferenceAudio(NSDictionary *options) {
536
1074
  reject(@"TTS_GENERATE_ERROR", @"instanceId is required", nil);
537
1075
  return;
538
1076
  }
1077
+
1078
+ NSString *subtitleMode = SubtitleModeFromOptions(options);
1079
+ NSString *subtitleGranularity = SubtitleGranularityFromOptions(options);
1080
+ if (IsCharacterGranularityRequested(options) && ![subtitleMode isEqualToString:@"accurate"]) {
1081
+ reject(@"TTS_SUBTITLE_ERROR", @"Character granularity is only supported when subtitleMode is 'accurate'.", nil);
1082
+ return;
1083
+ }
1084
+
539
1085
  double sid = 0;
540
1086
  double speed = 1.0;
541
1087
  if (options != nil) {
@@ -579,59 +1125,87 @@ static bool NSDictionaryHasValidReferenceAudio(NSDictionary *options) {
579
1125
  cloneOpt = VoiceCloneOptionsFromNSDictionary(options, kDefaultVoiceCloneNumSteps);
580
1126
  }
581
1127
 
582
- auto result = wrapper->generate(
583
- textStr,
584
- static_cast<int32_t>(sid),
585
- static_cast<float>(speed),
586
- cloneOpt
587
- );
1128
+ std::vector<float> generatedSamples;
1129
+ int32_t sampleRate = 0;
1130
+ std::vector<int32_t> sentenceChunkSizes;
588
1131
 
589
- if (result.samples.empty() || result.sampleRate == 0) {
590
- NSString *errorMsg = @"Failed to generate speech or result is empty";
591
- RCTLogError(@"%@", errorMsg);
592
- reject(@"TTS_GENERATE_ERROR", errorMsg, nil);
593
- return;
1132
+ if ([subtitleMode isEqualToString:@"off"]) {
1133
+ auto result = wrapper->generate(
1134
+ textStr,
1135
+ static_cast<int32_t>(sid),
1136
+ static_cast<float>(speed),
1137
+ cloneOpt
1138
+ );
1139
+ if (result.samples.empty() || result.sampleRate == 0) {
1140
+ NSString *errorMsg = @"Failed to generate speech or result is empty";
1141
+ RCTLogError(@"%@", errorMsg);
1142
+ reject(@"TTS_GENERATE_ERROR", errorMsg, nil);
1143
+ return;
1144
+ }
1145
+ generatedSamples = std::move(result.samples);
1146
+ sampleRate = result.sampleRate;
1147
+ } else {
1148
+ auto callback = [&generatedSamples, &sentenceChunkSizes](const float *samples, int32_t numSamples, float progress) -> int32_t {
1149
+ (void)progress;
1150
+ if (samples == nullptr || numSamples <= 0) {
1151
+ return 1;
1152
+ }
1153
+ generatedSamples.insert(generatedSamples.end(), samples, samples + numSamples);
1154
+ sentenceChunkSizes.push_back(numSamples);
1155
+ return numSamples;
1156
+ };
1157
+
1158
+ bool streamOk = cloneOpt.has_value()
1159
+ ? wrapper->generateStream(
1160
+ textStr,
1161
+ static_cast<int32_t>(sid),
1162
+ static_cast<float>(speed),
1163
+ callback,
1164
+ cloneOpt
1165
+ )
1166
+ : wrapper->generateStream(
1167
+ textStr,
1168
+ static_cast<int32_t>(sid),
1169
+ static_cast<float>(speed),
1170
+ callback
1171
+ );
1172
+
1173
+ sampleRate = wrapper->getSampleRate();
1174
+ if (!streamOk || generatedSamples.empty() || sampleRate == 0) {
1175
+ NSString *errorMsg = @"Failed to generate speech or result is empty";
1176
+ RCTLogError(@"%@", errorMsg);
1177
+ reject(@"TTS_GENERATE_ERROR", errorMsg, nil);
1178
+ return;
1179
+ }
1180
+
1181
+ if (sentenceChunkSizes.empty()) {
1182
+ sentenceChunkSizes.push_back(static_cast<int32_t>(generatedSamples.size()));
1183
+ }
594
1184
  }
595
1185
 
596
- NSMutableArray *samplesArray = [NSMutableArray arrayWithCapacity:result.samples.size()];
597
- for (float sample : result.samples) {
1186
+ NSMutableArray *samplesArray = [NSMutableArray arrayWithCapacity:generatedSamples.size()];
1187
+ for (float sample : generatedSamples) {
598
1188
  [samplesArray addObject:@(sample)];
599
1189
  }
600
1190
 
601
1191
  NSMutableArray *subtitlesArray = [NSMutableArray array];
602
- if (hasRef && !result.samples.empty() && result.sampleRate > 0) {
603
- double durationSec = static_cast<double>(result.samples.size()) / static_cast<double>(result.sampleRate);
604
- NSDictionary *subtitleMap = @{
605
- @"text": text,
606
- @"start": @0.0,
607
- @"end": @(durationSec)
608
- };
609
- [subtitlesArray addObject:subtitleMap];
610
- } else {
611
- std::vector<std::string> tokens = SplitTtsTokens(textStr);
612
- if (!tokens.empty()) {
613
- double totalSeconds = static_cast<double>(result.samples.size()) /
614
- static_cast<double>(result.sampleRate);
615
- double perToken = totalSeconds / static_cast<double>(tokens.size());
616
-
617
- for (size_t i = 0; i < tokens.size(); ++i) {
618
- double start = perToken * static_cast<double>(i);
619
- double end = perToken * static_cast<double>(i + 1);
620
- NSDictionary *item = @{
621
- @"text": [NSString stringWithUTF8String:tokens[i].c_str()],
622
- @"start": @(start),
623
- @"end": @(end)
624
- };
625
- [subtitlesArray addObject:item];
626
- }
627
- }
1192
+ NSString *timingMode = @"off";
1193
+
1194
+ if (![subtitleMode isEqualToString:@"off"]) {
1195
+ std::vector<std::string> sentences = SplitTextIntoSentences(textStr);
1196
+ std::vector<SubtitleTimingItem> subtitleItems = [subtitleGranularity isEqualToString:@"word"]
1197
+ ? BuildWordSubtitlesFromSentenceChunks(sentences, sentenceChunkSizes, sampleRate)
1198
+ : BuildSubtitlesFromChunks(sentences, sentenceChunkSizes, sampleRate);
1199
+
1200
+ subtitlesArray = SubtitleTimingsToNSArray(subtitleItems);
1201
+ timingMode = @"estimated";
628
1202
  }
629
1203
 
630
1204
  NSDictionary *resultDict = @{
631
1205
  @"samples": samplesArray,
632
- @"sampleRate": @(result.sampleRate),
1206
+ @"sampleRate": @(sampleRate),
633
1207
  @"subtitles": subtitlesArray,
634
- @"estimated": @YES
1208
+ @"timingMode": timingMode
635
1209
  };
636
1210
 
637
1211
  resolve(resultDict);