react-native-sherpa-onnx 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/android/src/main/assets/model_licenses/alignment-models-license-status.csv +5 -0
- package/android/src/main/cpp/CMakeLists.txt +3 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.cpp +66 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.h +17 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-alignment.cpp +108 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +30 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.cpp +66 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.h +30 -0
- package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +21 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxAlignmentHelper.kt +555 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +76 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTextSegmenter.kt +330 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +114 -10
- package/ios/Resources/model_licenses/alignment-models-license-status.csv +5 -0
- package/ios/SherpaOnnx+Alignment.mm +704 -0
- package/ios/SherpaOnnx+STT.mm +6 -0
- package/ios/SherpaOnnx+TTS.mm +624 -50
- package/ios/model_detect/sherpa-onnx-model-detect-alignment.mm +108 -0
- package/ios/model_detect/sherpa-onnx-model-detect.h +31 -0
- package/ios/model_detect/sherpa-onnx-validate-alignment.h +30 -0
- package/ios/model_detect/sherpa-onnx-validate-alignment.mm +66 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.h +3 -1
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +6 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/alignment/index.js +27 -0
- package/lib/module/alignment/index.js.map +1 -0
- package/lib/module/alignment/types.js +2 -0
- package/lib/module/alignment/types.js.map +1 -0
- package/lib/module/alignment/vocab.js +40 -0
- package/lib/module/alignment/vocab.js.map +1 -0
- package/lib/module/download/paths.js +9 -1
- package/lib/module/download/paths.js.map +1 -1
- package/lib/module/download/registry.js +17 -1
- package/lib/module/download/registry.js.map +1 -1
- package/lib/module/download/types.js +1 -0
- package/lib/module/download/types.js.map +1 -1
- package/lib/module/index.js +6 -4
- package/lib/module/index.js.map +1 -1
- package/lib/module/licenses.js +8 -2
- package/lib/module/licenses.js.map +1 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +68 -2
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/subtitles.js +400 -0
- package/lib/module/tts/subtitles.js.map +1 -0
- package/lib/module/tts/tempAudio.js +17 -0
- package/lib/module/tts/tempAudio.js.map +1 -0
- package/lib/module/tts/types.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -3
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/alignment/index.d.ts +8 -0
- package/lib/typescript/src/alignment/index.d.ts.map +1 -0
- package/lib/typescript/src/alignment/types.d.ts +23 -0
- package/lib/typescript/src/alignment/types.d.ts.map +1 -0
- package/lib/typescript/src/alignment/vocab.d.ts +5 -0
- package/lib/typescript/src/alignment/vocab.d.ts.map +1 -0
- package/lib/typescript/src/download/paths.d.ts +5 -2
- package/lib/typescript/src/download/paths.d.ts.map +1 -1
- package/lib/typescript/src/download/registry.d.ts.map +1 -1
- package/lib/typescript/src/download/types.d.ts +2 -1
- package/lib/typescript/src/download/types.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +1 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/licenses.d.ts.map +1 -1
- package/lib/typescript/src/stt/types.d.ts +5 -2
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +2 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/subtitles.d.ts +24 -0
- package/lib/typescript/src/tts/subtitles.d.ts.map +1 -0
- package/lib/typescript/src/tts/tempAudio.d.ts +3 -0
- package/lib/typescript/src/tts/tempAudio.d.ts.map +1 -0
- package/lib/typescript/src/tts/types.d.ts +68 -2
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/alignment-models/README.md +90 -0
- package/scripts/alignment-models/build_and_upload.js +724 -0
- package/scripts/alignment-models/sources.csv +5 -0
- package/scripts/alignment-models/sync_alignment_license_status.js +123 -0
- package/src/NativeSherpaOnnx.ts +35 -3
- package/src/alignment/index.ts +41 -0
- package/src/alignment/types.ts +22 -0
- package/src/alignment/vocab.ts +38 -0
- package/src/download/paths.ts +18 -5
- package/src/download/registry.ts +23 -3
- package/src/download/types.ts +1 -0
- package/src/index.tsx +6 -4
- package/src/licenses.ts +12 -1
- package/src/stt/types.ts +5 -2
- package/src/tts/index.ts +110 -3
- package/src/tts/subtitles.ts +611 -0
- package/src/tts/tempAudio.ts +31 -0
- package/src/tts/types.ts +79 -2
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
package/ios/SherpaOnnx+TTS.mm
CHANGED
|
@@ -20,6 +20,8 @@
|
|
|
20
20
|
#include <optional>
|
|
21
21
|
#include <sstream>
|
|
22
22
|
#include <string>
|
|
23
|
+
#include <cmath>
|
|
24
|
+
#include <set>
|
|
23
25
|
#include <unordered_map>
|
|
24
26
|
#include <vector>
|
|
25
27
|
#include <chrono>
|
|
@@ -64,17 +66,553 @@ static NSString *ttsModelKindToNSString(sherpaonnx::TtsModelKind kind) {
|
|
|
64
66
|
}
|
|
65
67
|
|
|
66
68
|
namespace {
|
|
67
|
-
|
|
68
|
-
std::
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
69
|
+
/// One subtitle cue: the text to show and the interval during which it applies.
/// The builders in this file fill `start`/`end` with a sample offset divided by
/// the sample rate.
struct SubtitleTimingItem {
  std::string text;   // UTF-8 cue text
  double start{0.0};  // cue start offset
  double end{0.0};    // cue end offset
};
|
|
74
|
+
|
|
75
|
+
/// Returns true when `c` is one of the punctuation marks this file treats as
/// ending a sentence: ASCII `.`, `!`, `?`, `;` or their ideographic/full-width
/// counterparts.
static bool IsSentenceTerminator(unichar c) {
  // ASCII terminators.
  if (c == '.' || c == '!' || c == '?' || c == ';') {
    return true;
  }
  // U+3002 ideographic full stop, U+FF01 full-width exclamation mark,
  // U+FF1F full-width question mark, U+FF1B full-width semicolon.
  return c == 0x3002 || c == 0xFF01 || c == 0xFF1F || c == 0xFF1B;
}
|
|
90
|
+
|
|
91
|
+
/// Returns true when `c` is a closing quote or bracket that may directly follow
/// sentence-ending punctuation (SentenceBoundaryEnd skips over these).
static bool IsTrailingCloser(unichar c) {
  static const unichar kClosers[] = {
    // ASCII closers.
    '"', '\'', ')', ']', '}', '>',
    0x201D,  // right double quotation mark
    0x2019,  // right single quotation mark
    0x300D,  // right corner bracket
    0x300F,  // right white corner bracket
    0x3011,  // right black lenticular bracket
    0xFF09,  // full-width right parenthesis
  };
  for (unichar closer : kClosers) {
    if (c == closer) {
      return true;
    }
  }
  return false;
}
|
|
110
|
+
|
|
111
|
+
/// Returns true when `c` is punctuation that separates words for word-level
/// subtitles. Whitespace is not covered here; SplitTextIntoWords checks it
/// separately. '-' and '_' are deliberately absent from the table, so they stay
/// inside a word.
static bool IsWordDelimiter(unichar c) {
  static const unichar kDelimiters[] = {
    // ASCII punctuation.
    '.', ',', '!', '?', ';', ':',
    '(', ')', '[', ']', '{', '}',
    '"', '\'', '`', '~', '<', '>',
    '/', '\\', '|', '@', '#', '$',
    '%', '^', '&', '*', '+', '=',
    0x2026,  // horizontal ellipsis
    0xFF0C,  // full-width comma
    0x3002,  // ideographic full stop
    0xFF01,  // full-width exclamation mark
    0xFF1F,  // full-width question mark
    0xFF1B,  // full-width semicolon
    0xFF1A,  // full-width colon
    0x3001,  // ideographic comma
  };
  for (unichar delimiter : kDelimiters) {
    if (c == delimiter) {
      return true;
    }
  }
  return false;
}
|
|
156
|
+
|
|
157
|
+
/// Returns true when the UTF-16 code unit `c` lies in one of the script ranges
/// that SplitTextIntoWords emits as single-character tokens.
static bool IsCjkCodepoint(unichar c) {
  struct CodeUnitRange {
    unichar first;
    unichar last;
  };
  static const CodeUnitRange kRanges[] = {
    {0x4E00, 0x9FFF},  // CJK Unified Ideographs
    {0x3400, 0x4DBF},  // CJK Unified Ideographs Extension A
    {0x3040, 0x309F},  // Hiragana
    {0x30A0, 0x30FF},  // Katakana
    {0xAC00, 0xD7AF},  // Hangul Syllables
  };
  for (const CodeUnitRange &range : kRanges) {
    if (c >= range.first && c <= range.last) {
      return true;
    }
  }
  return false;
}
|
|
164
|
+
|
|
165
|
+
/// Returns the run of letters and periods that ends just before `periodIndex`,
/// with whitespace between the token and the period skipped and trailing
/// periods removed (so "e.g." yields "e.g").
///
/// @param text         String being scanned; nil is treated as empty.
/// @param periodIndex  UTF-16 index of the period under consideration.
/// @return The token, or @"" when there is none or `periodIndex` is not a valid
///         position for this string.
static NSString *ExtractTokenBeforePeriod(NSString *text, NSInteger periodIndex) {
  // Messaging nil yields 0, so a nil string falls through the empty check.
  const NSInteger length = static_cast<NSInteger>(text.length);
  // periodIndex > length would make characterAtIndex: below raise
  // NSRangeException; treat it like any other "no token" case.
  if (length == 0 || periodIndex <= 0 || periodIndex > length) {
    return @"";
  }

  NSCharacterSet *ws = [NSCharacterSet whitespaceAndNewlineCharacterSet];
  NSCharacterSet *letters = [NSCharacterSet letterCharacterSet];

  // Skip whitespace sitting between the token and the period.
  NSInteger i = periodIndex - 1;
  while (i >= 0 && [ws characterIsMember:[text characterAtIndex:i]]) {
    i -= 1;
  }

  // Walk left over letters and embedded periods ("e.g", "U.S").
  const NSInteger end = i;
  while (i >= 0) {
    const unichar c = [text characterAtIndex:i];
    if (!([letters characterIsMember:c] || c == '.')) {
      break;
    }
    i -= 1;
  }

  // Nothing consumed: the character before the period is not part of a token.
  if (end <= i) {
    return @"";
  }

  NSString *token = [text substringWithRange:NSMakeRange(i + 1, end - i)];

  // Strip trailing periods so lookups see "etc" rather than "etc.".
  NSUInteger keep = token.length;
  while (keep > 0 && [token characterAtIndex:keep - 1] == '.') {
    keep -= 1;
  }
  return keep == token.length ? token : [token substringToIndex:keep];
}
|
|
198
|
+
|
|
199
|
+
/// Decides whether the '.' at `periodIndex` may end a sentence.
///
/// Returns false for a decimal point (digit on both sides), for a period that
/// follows one of the listed abbreviations, and for a period after a single
/// uppercase letter (a likely initial). Returns true otherwise, including when
/// `text` is nil or `periodIndex` is out of range.
static bool ShouldSplitOnPeriod(NSString *text, NSInteger periodIndex) {
  if (text == nil || periodIndex < 0) {
    return true;
  }
  const NSUInteger position = static_cast<NSUInteger>(periodIndex);
  const NSUInteger length = text.length;
  if (position >= length) {
    return true;
  }

  // Decimal point such as "3.14": digits immediately on both sides.
  if (position > 0 && position + 1 < length) {
    NSCharacterSet *digits = [NSCharacterSet decimalDigitCharacterSet];
    const bool digitBefore = [digits characterIsMember:[text characterAtIndex:position - 1]];
    const bool digitAfter = [digits characterIsMember:[text characterAtIndex:position + 1]];
    if (digitBefore && digitAfter) {
      return false;
    }
  }

  static const std::set<std::string> kAbbreviations = {
    "mr", "mrs", "ms", "dr", "prof", "sr", "jr", "st", "vs", "etc", "e.g", "i.e"
  };

  NSCharacterSet *ws = [NSCharacterSet whitespaceAndNewlineCharacterSet];
  NSString *tokenRaw = [ExtractTokenBeforePeriod(text, periodIndex) stringByTrimmingCharactersInSet:ws];

  // Abbreviations are matched case-insensitively.
  NSString *tokenLower = [tokenRaw lowercaseString];
  std::string tokenUtf8;
  if (tokenLower != nil) {
    tokenUtf8 = std::string([tokenLower UTF8String]);
  }
  if (!tokenUtf8.empty() && kAbbreviations.count(tokenUtf8) > 0) {
    return false;
  }

  // Likely initial, e.g. "A. Smith". The original-case token is checked because
  // the lowercased form can never be an uppercase letter.
  if (tokenRaw.length == 1 &&
      [[NSCharacterSet uppercaseLetterCharacterSet] characterIsMember:[tokenRaw characterAtIndex:0]]) {
    return false;
  }

  return true;
}
|
|
235
|
+
|
|
236
|
+
/// Given the index of a sentence terminator, returns the index just past the
/// whole boundary: any further terminators ("?!", "...") followed by any
/// closing quotes or brackets.
static NSInteger SentenceBoundaryEnd(NSString *text, NSInteger index) {
  NSInteger cursor = index + 1;
  // Consume a run of additional terminators.
  for (; cursor < text.length && IsSentenceTerminator([text characterAtIndex:cursor]); ++cursor) {
  }
  // Then a run of closers, so `He said "stop."` keeps its closing quote.
  for (; cursor < text.length && IsTrailingCloser([text characterAtIndex:cursor]); ++cursor) {
  }
  return cursor;
}
|
|
246
|
+
|
|
247
|
+
/// Returns a copy of `segments` with surrounding whitespace/newlines trimmed
/// from each entry. Entries that are empty after trimming, or that cannot be
/// decoded as UTF-8, are dropped.
static std::vector<std::string> SanitizeSegments(const std::vector<std::string> &segments) {
  NSCharacterSet *whitespace = [NSCharacterSet whitespaceAndNewlineCharacterSet];

  std::vector<std::string> result;
  result.reserve(segments.size());
  for (const std::string &raw : segments) {
    // stringWithUTF8String: returns nil for invalid UTF-8; trimming nil yields
    // nil, whose length is 0, so such entries are dropped by the check below.
    NSString *decoded = [NSString stringWithUTF8String:raw.c_str()];
    NSString *stripped = [decoded stringByTrimmingCharactersInSet:whitespace];
    if (stripped.length > 0) {
      result.emplace_back([stripped UTF8String]);
    }
  }
  return result;
}
|
|
263
|
+
|
|
264
|
+
/// Relative weight of `text` when dividing audio duration between segments:
/// its length in UTF-16 code units, never less than 1. Empty or undecodable
/// text weighs 1.
static int32_t TextWeight(const std::string &text) {
  NSString *decoded = [NSString stringWithUTF8String:text.c_str()];
  // A nil string reports length 0, so this covers both the nil and empty cases.
  if (decoded.length == 0) {
    return 1;
  }
  const int32_t units = static_cast<int32_t>(decoded.length);
  return units > 1 ? units : 1;
}
|
|
271
|
+
|
|
272
|
+
/// Splits `totalSamples` across `segments` in proportion to TextWeight() of each
/// segment, using the largest-remainder method so the parts sum exactly to the
/// (non-negative) total.
///
/// @param totalSamples  Samples to distribute; negative values are treated as 0.
/// @param segments      Text segments; one output entry is produced per segment.
/// @return One count per segment. Empty when `segments` is empty; all zeros when
///         there is nothing to distribute.
static std::vector<int32_t> DistributeSamplesByTextWeight(
    int32_t totalSamples,
    const std::vector<std::string> &segments
) {
  if (segments.empty()) {
    return {};
  }

  const int32_t safeTotal = std::max<int32_t>(0, totalSamples);
  if (safeTotal == 0) {
    return std::vector<int32_t>(segments.size(), 0);
  }

  std::vector<int32_t> weights;
  weights.reserve(segments.size());
  int32_t weightSum = 0;
  for (const auto &segment : segments) {
    const int32_t w = std::max<int32_t>(1, TextWeight(segment));
    weights.push_back(w);
    weightSum += w;
  }
  if (weightSum <= 0) {
    return std::vector<int32_t>(segments.size(), 0);
  }

  // First pass: give every segment the floor of its exact share and remember
  // the fractional part that was dropped.
  std::vector<int32_t> base(segments.size(), 0);
  std::vector<std::pair<size_t, double>> fractions;
  fractions.reserve(segments.size());
  int32_t assigned = 0;
  for (size_t i = 0; i < segments.size(); ++i) {
    const double exact =
        (static_cast<double>(safeTotal) * static_cast<double>(weights[i])) / static_cast<double>(weightSum);
    const int32_t floorValue = static_cast<int32_t>(std::floor(exact));
    base[i] = floorValue;
    assigned += floorValue;
    fractions.emplace_back(i, exact - static_cast<double>(floorValue));
  }

  // Second pass: hand the leftover samples to the largest fractional parts.
  int32_t remaining = safeTotal - assigned;
  if (remaining > 0) {
    // stable_sort keeps equal fractions in segment order, so ties always favour
    // the earlier segment. std::sort leaves the relative order of equal
    // elements unspecified, which made the result implementation-dependent.
    std::stable_sort(
        fractions.begin(),
        fractions.end(),
        [](const auto &a, const auto &b) { return a.second > b.second; }
    );

    size_t ptr = 0;
    while (remaining > 0 && !fractions.empty()) {
      const size_t idx = fractions[ptr % fractions.size()].first;
      base[idx] += 1;
      remaining -= 1;
      ptr += 1;
    }
  }

  return base;
}
|
|
332
|
+
|
|
333
|
+
/// Produces exactly one sample count per segment from the per-chunk counts
/// collected during generation.
///
/// Negative chunk counts are clamped to 0 first. Then:
///  - same number of chunks and segments: the clamped counts are returned;
///  - more chunks than segments: the surplus chunks are added to the last segment;
///  - fewer chunks than segments: the total is redistributed by text weight.
static std::vector<int32_t> AlignChunkCountsToSegments(
    const std::vector<std::string> &segments,
    const std::vector<int32_t> &chunkSampleCounts
) {
  const size_t segmentCount = segments.size();
  if (segmentCount == 0) {
    return {};
  }

  std::vector<int32_t> clamped;
  clamped.reserve(chunkSampleCounts.size());
  for (int32_t raw : chunkSampleCounts) {
    clamped.push_back(std::max<int32_t>(0, raw));
  }

  if (clamped.size() < segmentCount) {
    // Too few chunks to map one-to-one: pool everything and split by weight.
    int32_t pooled = 0;
    for (int32_t samples : clamped) {
      pooled += samples;
    }
    return DistributeSamplesByTextWeight(pooled, segments);
  }

  if (clamped.size() == segmentCount) {
    return clamped;
  }

  // More chunks than segments: keep the first `segmentCount` and fold the rest
  // into the final segment. `segmentCount` is at least 1 here, so back() is safe.
  std::vector<int32_t> folded(clamped.begin(), clamped.begin() + static_cast<long>(segmentCount));
  int32_t surplus = 0;
  for (size_t i = segmentCount; i < clamped.size(); ++i) {
    surplus += clamped[i];
  }
  folded.back() += surplus;
  return folded;
}
|
|
369
|
+
|
|
370
|
+
/// Builds consecutive subtitle cues, one per sanitized segment, whose durations
/// come from the sample counts aligned to those segments.
///
/// @param segments           Cue texts; blank entries are removed first.
/// @param chunkSampleCounts  Sample count of each generated audio chunk.
/// @param sampleRate         Divisor turning sample offsets into cue times.
/// @return Cues in order, or an empty vector when `sampleRate` is not positive
///         or no usable segment remains.
static std::vector<SubtitleTimingItem> BuildSubtitlesFromChunks(
    const std::vector<std::string> &segments,
    const std::vector<int32_t> &chunkSampleCounts,
    int32_t sampleRate
) {
  if (sampleRate <= 0) {
    return {};
  }

  const std::vector<std::string> texts = SanitizeSegments(segments);
  if (texts.empty()) {
    return {};
  }

  const std::vector<int32_t> perSegment = AlignChunkCountsToSegments(texts, chunkSampleCounts);
  const double rate = static_cast<double>(sampleRate);

  std::vector<SubtitleTimingItem> cues;
  cues.reserve(texts.size());

  int64_t cursor = 0;  // running offset in samples
  for (size_t idx = 0; idx < texts.size(); ++idx) {
    int32_t samples = 0;
    if (idx < perSegment.size()) {
      samples = std::max<int32_t>(0, perSegment[idx]);
    }

    // Leading segments with no audio are skipped; once any audio has been
    // placed, zero-length segments are still emitted as zero-duration cues.
    if (samples == 0 && cursor == 0) {
      continue;
    }

    const int64_t next = cursor + samples;
    cues.push_back(SubtitleTimingItem{
        texts[idx],
        static_cast<double>(cursor) / rate,
        static_cast<double>(next) / rate});
    cursor = next;
  }

  return cues;
}
|
|
405
|
+
|
|
406
|
+
static std::vector<SubtitleTimingItem> BuildWordSubtitlesFromSentenceChunks(
|
|
407
|
+
const std::vector<std::string> &sentences,
|
|
408
|
+
const std::vector<int32_t> &sentenceChunkSampleCounts,
|
|
409
|
+
int32_t sampleRate
|
|
410
|
+
);
|
|
411
|
+
|
|
412
|
+
/// Splits `text` into trimmed sentences.
///
/// A split happens at a sentence terminator only when (a) a '.' passes
/// ShouldSplitOnPeriod and (b) the boundary, after any extra terminators and
/// closing quotes/brackets, is followed by whitespace or the end of the text.
/// Returns an empty vector for undecodable or blank input; otherwise at least
/// one entry.
static std::vector<std::string> SplitTextIntoSentences(const std::string &text) {
  NSString *source = [NSString stringWithUTF8String:text.c_str()];
  if (source == nil) {
    return {};
  }

  NSCharacterSet *ws = [NSCharacterSet whitespaceAndNewlineCharacterSet];
  NSString *normalized = [source stringByTrimmingCharactersInSet:ws];
  const NSInteger length = static_cast<NSInteger>(normalized.length);
  if (length == 0) {
    return {};
  }

  std::vector<std::string> sentences;
  NSInteger sentenceStart = 0;
  NSInteger cursor = 0;

  while (cursor < length) {
    const unichar ch = [normalized characterAtIndex:cursor];

    // A candidate is any terminator, except a '.' that belongs to a decimal
    // number, an abbreviation or an initial.
    const bool candidate =
        IsSentenceTerminator(ch) && (ch != '.' || ShouldSplitOnPeriod(normalized, cursor));
    if (!candidate) {
      cursor += 1;
      continue;
    }

    const NSInteger boundary = SentenceBoundaryEnd(normalized, cursor);
    // Punctuation glued to following text ("example.com") is not a boundary.
    if (boundary < length && ![ws characterIsMember:[normalized characterAtIndex:boundary]]) {
      cursor += 1;
      continue;
    }

    NSString *sentence =
        [[normalized substringWithRange:NSMakeRange(sentenceStart, boundary - sentenceStart)]
            stringByTrimmingCharactersInSet:ws];
    if (sentence.length > 0) {
      sentences.emplace_back([sentence UTF8String]);
    }

    // Resume at the first non-whitespace character after the boundary.
    sentenceStart = boundary;
    while (sentenceStart < length && [ws characterIsMember:[normalized characterAtIndex:sentenceStart]]) {
      sentenceStart += 1;
    }
    cursor = sentenceStart;
  }

  // Whatever follows the last boundary forms the final sentence.
  if (sentenceStart < length) {
    NSString *tail = [[normalized substringFromIndex:sentenceStart] stringByTrimmingCharactersInSet:ws];
    if (tail.length > 0) {
      sentences.emplace_back([tail UTF8String]);
    }
  }

  if (sentences.empty()) {
    sentences.emplace_back([normalized UTF8String]);
  }
  return sentences;
}
|
|
477
|
+
|
|
478
|
+
/// Splits `text` into word tokens for word-level subtitles.
///
/// Whitespace and IsWordDelimiter() punctuation end the current word and are
/// discarded. Each IsCjkCodepoint() code unit ends the current word and becomes
/// a token of its own. Returns an empty vector for undecodable or blank input;
/// if no token is produced, the trimmed input is returned as a single token.
static std::vector<std::string> SplitTextIntoWords(const std::string &text) {
  NSString *source = [NSString stringWithUTF8String:text.c_str()];
  if (source == nil) {
    return {};
  }

  NSCharacterSet *ws = [NSCharacterSet whitespaceAndNewlineCharacterSet];
  NSString *normalized = [source stringByTrimmingCharactersInSet:ws];
  if (normalized.length == 0) {
    return {};
  }

  std::vector<std::string> tokens;
  NSMutableString *pending = [NSMutableString string];

  // Emits the word accumulated so far (if any) and starts a new one.
  auto flushPending = [&]() {
    NSString *word = [pending stringByTrimmingCharactersInSet:ws];
    if (word.length > 0) {
      tokens.emplace_back([word UTF8String]);
    }
    [pending setString:@""];
  };

  for (NSInteger i = 0; i < normalized.length; ++i) {
    unichar c = [normalized characterAtIndex:i];

    const bool isSpace = [ws characterIsMember:c];
    const bool isCjk = !isSpace && IsCjkCodepoint(c);
    const bool endsWord = isSpace || isCjk || IsWordDelimiter(c);

    if (!endsWord) {
      [pending appendFormat:@"%C", c];
      continue;
    }

    flushPending();
    if (isCjk) {
      // The character is a token by itself.
      tokens.emplace_back([[NSString stringWithCharacters:&c length:1] UTF8String]);
    }
  }

  flushPending();

  if (tokens.empty()) {
    tokens.emplace_back([normalized UTF8String]);
  }
  return tokens;
}
|
|
530
|
+
|
|
531
|
+
/// Builds word-level subtitle cues from sentence-level chunk sizes.
///
/// Sample counts are first aligned to the sanitized sentences; each sentence's
/// share is then split across its words by text weight, and the flattened
/// word/count lists are passed to BuildSubtitlesFromChunks.
static std::vector<SubtitleTimingItem> BuildWordSubtitlesFromSentenceChunks(
    const std::vector<std::string> &sentences,
    const std::vector<int32_t> &sentenceChunkSampleCounts,
    int32_t sampleRate
) {
  const std::vector<std::string> usable = SanitizeSegments(sentences);
  if (usable.empty()) {
    return {};
  }

  const std::vector<int32_t> perSentence =
      AlignChunkCountsToSegments(usable, sentenceChunkSampleCounts);

  std::vector<std::string> allWords;
  std::vector<int32_t> allCounts;

  for (size_t s = 0; s < usable.size(); ++s) {
    int32_t budget = 0;
    if (s < perSentence.size()) {
      budget = std::max<int32_t>(0, perSentence[s]);
    }

    const std::vector<std::string> words = SplitTextIntoWords(usable[s]);
    if (words.empty()) {
      continue;
    }

    const std::vector<int32_t> shares = DistributeSamplesByTextWeight(budget, words);
    for (size_t w = 0; w < words.size(); ++w) {
      allWords.push_back(words[w]);
      allCounts.push_back(w < shares.size() ? shares[w] : 0);
    }
  }

  return BuildSubtitlesFromChunks(allWords, allCounts, sampleRate);
}
|
|
567
|
+
|
|
568
|
+
/// Converts subtitle cues into an array of dictionaries with the keys
/// "text" (NSString), "start" and "end" (NSNumber doubles). Text that is not
/// valid UTF-8 is replaced with an empty string.
static NSMutableArray *SubtitleTimingsToNSArray(const std::vector<SubtitleTimingItem> &items) {
  NSMutableArray *entries = [NSMutableArray arrayWithCapacity:items.size()];
  for (const SubtitleTimingItem &cue : items) {
    NSString *text = [NSString stringWithUTF8String:cue.text.c_str()];
    if (text == nil) {
      // Dictionary literals throw on nil values, so substitute an empty string.
      text = @"";
    }
    [entries addObject:@{
      @"text": text,
      @"start": @(cue.start),
      @"end": @(cue.end)
    }];
  }
  return entries;
}
|
|
581
|
+
|
|
582
|
+
/// Reads `options[@"subtitleMode"]` and returns it lowercased and trimmed when
/// it is one of "off", "fast" or "accurate". A missing, non-string or
/// unrecognised value yields "fast".
static NSString *SubtitleModeFromOptions(NSDictionary *options) {
  id value = options[@"subtitleMode"];
  if (![value isKindOfClass:[NSString class]]) {
    return @"fast";
  }

  NSString *mode = [[(NSString *)value lowercaseString]
      stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];

  for (NSString *allowed in @[@"off", @"fast", @"accurate"]) {
    if ([mode isEqualToString:allowed]) {
      return mode;
    }
  }
  return @"fast";
}
|
|
596
|
+
|
|
597
|
+
/// Reads `options[@"subtitleGranularity"]` and returns it lowercased and trimmed
/// when it is "word" or "sentence". A missing, non-string or unrecognised value
/// (including "character") yields "sentence".
static NSString *SubtitleGranularityFromOptions(NSDictionary *options) {
  id value = options[@"subtitleGranularity"];
  if (![value isKindOfClass:[NSString class]]) {
    return @"sentence";
  }

  NSString *granularity = [[(NSString *)value lowercaseString]
      stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];

  const bool recognised =
      [granularity isEqualToString:@"word"] || [granularity isEqualToString:@"sentence"];
  return recognised ? granularity : @"sentence";
}
|
|
609
|
+
|
|
610
|
+
/// Returns true when `options[@"subtitleGranularity"]` is a string equal to
/// "character" after lowercasing and trimming whitespace. Needed as a separate
/// check because SubtitleGranularityFromOptions maps "character" to "sentence".
static bool IsCharacterGranularityRequested(NSDictionary *options) {
  id value = options[@"subtitleGranularity"];
  if (![value isKindOfClass:[NSString class]]) {
    return false;
  }

  NSString *granularity = [[(NSString *)value lowercaseString]
      stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
  return [granularity isEqualToString:@"character"];
}
|
|
79
617
|
|
|
80
618
|
/** When options omit numSteps, matches Android SherpaOnnxTtsHelper / upstream GenerationConfig default. */
|
|
@@ -536,6 +1074,14 @@ static bool NSDictionaryHasValidReferenceAudio(NSDictionary *options) {
|
|
|
536
1074
|
reject(@"TTS_GENERATE_ERROR", @"instanceId is required", nil);
|
|
537
1075
|
return;
|
|
538
1076
|
}
|
|
1077
|
+
|
|
1078
|
+
NSString *subtitleMode = SubtitleModeFromOptions(options);
|
|
1079
|
+
NSString *subtitleGranularity = SubtitleGranularityFromOptions(options);
|
|
1080
|
+
if (IsCharacterGranularityRequested(options) && ![subtitleMode isEqualToString:@"accurate"]) {
|
|
1081
|
+
reject(@"TTS_SUBTITLE_ERROR", @"Character granularity is only supported when subtitleMode is 'accurate'.", nil);
|
|
1082
|
+
return;
|
|
1083
|
+
}
|
|
1084
|
+
|
|
539
1085
|
double sid = 0;
|
|
540
1086
|
double speed = 1.0;
|
|
541
1087
|
if (options != nil) {
|
|
@@ -579,59 +1125,87 @@ static bool NSDictionaryHasValidReferenceAudio(NSDictionary *options) {
|
|
|
579
1125
|
cloneOpt = VoiceCloneOptionsFromNSDictionary(options, kDefaultVoiceCloneNumSteps);
|
|
580
1126
|
}
|
|
581
1127
|
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
static_cast<float>(speed),
|
|
586
|
-
cloneOpt
|
|
587
|
-
);
|
|
1128
|
+
std::vector<float> generatedSamples;
|
|
1129
|
+
int32_t sampleRate = 0;
|
|
1130
|
+
std::vector<int32_t> sentenceChunkSizes;
|
|
588
1131
|
|
|
589
|
-
if (
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
1132
|
+
if ([subtitleMode isEqualToString:@"off"]) {
|
|
1133
|
+
auto result = wrapper->generate(
|
|
1134
|
+
textStr,
|
|
1135
|
+
static_cast<int32_t>(sid),
|
|
1136
|
+
static_cast<float>(speed),
|
|
1137
|
+
cloneOpt
|
|
1138
|
+
);
|
|
1139
|
+
if (result.samples.empty() || result.sampleRate == 0) {
|
|
1140
|
+
NSString *errorMsg = @"Failed to generate speech or result is empty";
|
|
1141
|
+
RCTLogError(@"%@", errorMsg);
|
|
1142
|
+
reject(@"TTS_GENERATE_ERROR", errorMsg, nil);
|
|
1143
|
+
return;
|
|
1144
|
+
}
|
|
1145
|
+
generatedSamples = std::move(result.samples);
|
|
1146
|
+
sampleRate = result.sampleRate;
|
|
1147
|
+
} else {
|
|
1148
|
+
auto callback = [&generatedSamples, &sentenceChunkSizes](const float *samples, int32_t numSamples, float progress) -> int32_t {
|
|
1149
|
+
(void)progress;
|
|
1150
|
+
if (samples == nullptr || numSamples <= 0) {
|
|
1151
|
+
return 1;
|
|
1152
|
+
}
|
|
1153
|
+
generatedSamples.insert(generatedSamples.end(), samples, samples + numSamples);
|
|
1154
|
+
sentenceChunkSizes.push_back(numSamples);
|
|
1155
|
+
return numSamples;
|
|
1156
|
+
};
|
|
1157
|
+
|
|
1158
|
+
bool streamOk = cloneOpt.has_value()
|
|
1159
|
+
? wrapper->generateStream(
|
|
1160
|
+
textStr,
|
|
1161
|
+
static_cast<int32_t>(sid),
|
|
1162
|
+
static_cast<float>(speed),
|
|
1163
|
+
callback,
|
|
1164
|
+
cloneOpt
|
|
1165
|
+
)
|
|
1166
|
+
: wrapper->generateStream(
|
|
1167
|
+
textStr,
|
|
1168
|
+
static_cast<int32_t>(sid),
|
|
1169
|
+
static_cast<float>(speed),
|
|
1170
|
+
callback
|
|
1171
|
+
);
|
|
1172
|
+
|
|
1173
|
+
sampleRate = wrapper->getSampleRate();
|
|
1174
|
+
if (!streamOk || generatedSamples.empty() || sampleRate == 0) {
|
|
1175
|
+
NSString *errorMsg = @"Failed to generate speech or result is empty";
|
|
1176
|
+
RCTLogError(@"%@", errorMsg);
|
|
1177
|
+
reject(@"TTS_GENERATE_ERROR", errorMsg, nil);
|
|
1178
|
+
return;
|
|
1179
|
+
}
|
|
1180
|
+
|
|
1181
|
+
if (sentenceChunkSizes.empty()) {
|
|
1182
|
+
sentenceChunkSizes.push_back(static_cast<int32_t>(generatedSamples.size()));
|
|
1183
|
+
}
|
|
594
1184
|
}
|
|
595
1185
|
|
|
596
|
-
NSMutableArray *samplesArray = [NSMutableArray arrayWithCapacity:
|
|
597
|
-
for (float sample :
|
|
1186
|
+
NSMutableArray *samplesArray = [NSMutableArray arrayWithCapacity:generatedSamples.size()];
|
|
1187
|
+
for (float sample : generatedSamples) {
|
|
598
1188
|
[samplesArray addObject:@(sample)];
|
|
599
1189
|
}
|
|
600
1190
|
|
|
601
1191
|
NSMutableArray *subtitlesArray = [NSMutableArray array];
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
if (!tokens.empty()) {
|
|
613
|
-
double totalSeconds = static_cast<double>(result.samples.size()) /
|
|
614
|
-
static_cast<double>(result.sampleRate);
|
|
615
|
-
double perToken = totalSeconds / static_cast<double>(tokens.size());
|
|
616
|
-
|
|
617
|
-
for (size_t i = 0; i < tokens.size(); ++i) {
|
|
618
|
-
double start = perToken * static_cast<double>(i);
|
|
619
|
-
double end = perToken * static_cast<double>(i + 1);
|
|
620
|
-
NSDictionary *item = @{
|
|
621
|
-
@"text": [NSString stringWithUTF8String:tokens[i].c_str()],
|
|
622
|
-
@"start": @(start),
|
|
623
|
-
@"end": @(end)
|
|
624
|
-
};
|
|
625
|
-
[subtitlesArray addObject:item];
|
|
626
|
-
}
|
|
627
|
-
}
|
|
1192
|
+
NSString *timingMode = @"off";
|
|
1193
|
+
|
|
1194
|
+
if (![subtitleMode isEqualToString:@"off"]) {
|
|
1195
|
+
std::vector<std::string> sentences = SplitTextIntoSentences(textStr);
|
|
1196
|
+
std::vector<SubtitleTimingItem> subtitleItems = [subtitleGranularity isEqualToString:@"word"]
|
|
1197
|
+
? BuildWordSubtitlesFromSentenceChunks(sentences, sentenceChunkSizes, sampleRate)
|
|
1198
|
+
: BuildSubtitlesFromChunks(sentences, sentenceChunkSizes, sampleRate);
|
|
1199
|
+
|
|
1200
|
+
subtitlesArray = SubtitleTimingsToNSArray(subtitleItems);
|
|
1201
|
+
timingMode = @"estimated";
|
|
628
1202
|
}
|
|
629
1203
|
|
|
630
1204
|
NSDictionary *resultDict = @{
|
|
631
1205
|
@"samples": samplesArray,
|
|
632
|
-
@"sampleRate": @(
|
|
1206
|
+
@"sampleRate": @(sampleRate),
|
|
633
1207
|
@"subtitles": subtitlesArray,
|
|
634
|
-
@"
|
|
1208
|
+
@"timingMode": timingMode
|
|
635
1209
|
};
|
|
636
1210
|
|
|
637
1211
|
resolve(resultDict);
|