react-native-sherpa-onnx 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/android/src/main/assets/model_licenses/alignment-models-license-status.csv +5 -0
- package/android/src/main/cpp/CMakeLists.txt +3 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.cpp +66 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.h +17 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-alignment.cpp +108 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +30 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.cpp +66 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.h +30 -0
- package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +21 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxAlignmentHelper.kt +555 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +76 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTextSegmenter.kt +330 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +180 -23
- package/ios/Resources/model_licenses/alignment-models-license-status.csv +5 -0
- package/ios/SherpaOnnx+Alignment.mm +704 -0
- package/ios/SherpaOnnx+STT.mm +6 -0
- package/ios/SherpaOnnx+TTS.mm +624 -50
- package/ios/model_detect/sherpa-onnx-model-detect-alignment.mm +108 -0
- package/ios/model_detect/sherpa-onnx-model-detect.h +31 -0
- package/ios/model_detect/sherpa-onnx-validate-alignment.h +30 -0
- package/ios/model_detect/sherpa-onnx-validate-alignment.mm +66 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.h +3 -1
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +6 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/alignment/index.js +27 -0
- package/lib/module/alignment/index.js.map +1 -0
- package/lib/module/alignment/types.js +2 -0
- package/lib/module/alignment/types.js.map +1 -0
- package/lib/module/alignment/vocab.js +40 -0
- package/lib/module/alignment/vocab.js.map +1 -0
- package/lib/module/download/paths.js +9 -1
- package/lib/module/download/paths.js.map +1 -1
- package/lib/module/download/registry.js +17 -1
- package/lib/module/download/registry.js.map +1 -1
- package/lib/module/download/types.js +1 -0
- package/lib/module/download/types.js.map +1 -1
- package/lib/module/index.js +6 -4
- package/lib/module/index.js.map +1 -1
- package/lib/module/licenses.js +8 -2
- package/lib/module/licenses.js.map +1 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +68 -2
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/subtitles.js +400 -0
- package/lib/module/tts/subtitles.js.map +1 -0
- package/lib/module/tts/tempAudio.js +17 -0
- package/lib/module/tts/tempAudio.js.map +1 -0
- package/lib/module/tts/types.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -3
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/alignment/index.d.ts +8 -0
- package/lib/typescript/src/alignment/index.d.ts.map +1 -0
- package/lib/typescript/src/alignment/types.d.ts +23 -0
- package/lib/typescript/src/alignment/types.d.ts.map +1 -0
- package/lib/typescript/src/alignment/vocab.d.ts +5 -0
- package/lib/typescript/src/alignment/vocab.d.ts.map +1 -0
- package/lib/typescript/src/download/paths.d.ts +5 -2
- package/lib/typescript/src/download/paths.d.ts.map +1 -1
- package/lib/typescript/src/download/registry.d.ts.map +1 -1
- package/lib/typescript/src/download/types.d.ts +2 -1
- package/lib/typescript/src/download/types.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +1 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/licenses.d.ts.map +1 -1
- package/lib/typescript/src/stt/types.d.ts +5 -2
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +2 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/subtitles.d.ts +24 -0
- package/lib/typescript/src/tts/subtitles.d.ts.map +1 -0
- package/lib/typescript/src/tts/tempAudio.d.ts +3 -0
- package/lib/typescript/src/tts/tempAudio.d.ts.map +1 -0
- package/lib/typescript/src/tts/types.d.ts +68 -2
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/alignment-models/README.md +90 -0
- package/scripts/alignment-models/build_and_upload.js +724 -0
- package/scripts/alignment-models/sources.csv +5 -0
- package/scripts/alignment-models/sync_alignment_license_status.js +123 -0
- package/src/NativeSherpaOnnx.ts +35 -3
- package/src/alignment/index.ts +41 -0
- package/src/alignment/types.ts +22 -0
- package/src/alignment/vocab.ts +38 -0
- package/src/download/paths.ts +18 -5
- package/src/download/registry.ts +23 -3
- package/src/download/types.ts +1 -0
- package/src/index.tsx +6 -4
- package/src/licenses.ts +12 -1
- package/src/stt/types.ts +5 -2
- package/src/tts/index.ts +110 -3
- package/src/tts/subtitles.ts +611 -0
- package/src/tts/tempAudio.ts +31 -0
- package/src/tts/types.ts +79 -2
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
package com.sherpaonnx
|
|
2
|
+
|
|
3
|
+
import kotlin.math.floor
|
|
4
|
+
import kotlin.math.max
|
|
5
|
+
|
|
6
|
+
/**
 * One subtitle cue: the displayed [text] and the interval it covers.
 *
 * [start] and [end] are computed by [SherpaOnnxTextSegmenter] as a cumulative sample offset divided by the
 * sample rate passed to it.
 */
internal data class SubtitleTimingItem(
    val text: String,
    val start: Double,
    val end: Double
)

/**
 * Splits text into sentences / words and derives estimated subtitle timings from per-chunk sample counts.
 *
 * All timing here is an estimate: when the number of chunks does not match the number of text segments,
 * samples are redistributed proportionally to segment length in characters.
 */
internal object SherpaOnnxTextSegmenter {
    // Non-ASCII punctuation is spelled with Unicode escapes so the intended code points stay unambiguous
    // even if the file passes through a tool that folds fullwidth forms into their ASCII look-alikes.
    private val sentenceTerminators = setOf(
        '.', '!', '?', ';',
        '\u3002', // ideographic full stop
        '\uFF01', // fullwidth exclamation mark
        '\uFF1F', // fullwidth question mark
        '\uFF1B'  // fullwidth semicolon
    )

    // Closing quotes/brackets that still belong to the sentence when they directly follow its terminator.
    private val trailingClosers = setOf(
        '"', '\'', ')', ']', '}', '>',
        '\u201D', // right double quotation mark
        '\u2019', // right single quotation mark
        '\u300D', // right corner bracket
        '\u300F', // right white corner bracket
        '\u3011', // right black lenticular bracket
        '\uFF09'  // fullwidth right parenthesis
    )

    // Lower-cased tokens (without the final period) after which a period does not end the sentence.
    private val commonAbbreviations = setOf(
        "mr",
        "mrs",
        "ms",
        "dr",
        "prof",
        "sr",
        "jr",
        "st",
        "vs",
        "etc",
        "e.g",
        "i.e"
    )

    // Characters that separate words and are themselves dropped from the word list.
    private val wordDelimiters = setOf(
        '.', ',', '!', '?', ';', ':', '(', ')', '[', ']', '{', '}',
        '"', '\'', '`', '~', '<', '>', '/', '\\', '|', '@', '#', '$',
        '%', '^', '&', '*', '+', '=',
        '\u2026', // horizontal ellipsis
        '\uFF0C', // fullwidth comma
        '\u3002', // ideographic full stop
        '\uFF01', // fullwidth exclamation mark
        '\uFF1F', // fullwidth question mark
        '\uFF1B', // fullwidth semicolon
        '\uFF1A', // fullwidth colon
        '\u3001'  // ideographic comma
    )

    /**
     * Splits [text] into trimmed, non-empty sentences.
     *
     * A terminator only ends a sentence when the run of terminators and trailing closers is followed by
     * whitespace or the end of the text. Periods between digits, after a known abbreviation, or after a
     * single upper-case letter are not treated as boundaries.
     *
     * NOTE(review): because of the whitespace requirement, text written without spaces between sentences
     * (typical for CJK) is only split at the very end — confirm this matches the TTS engine's chunking.
     *
     * @return the sentences in order; an empty list for blank input.
     */
    fun splitIntoSentences(text: String): List<String> {
        val normalized = text.trim()
        if (normalized.isEmpty()) return emptyList()

        val out = mutableListOf<String>()
        var start = 0
        var i = 0

        while (i < normalized.length) {
            val current = normalized[i]
            if (!isSentenceTerminator(current)) {
                i += 1
                continue
            }

            if (current == '.' && !shouldSplitOnPeriod(normalized, i)) {
                i += 1
                continue
            }

            val end = sentenceBoundaryEnd(normalized, i)
            val next = normalized.getOrNull(end)
            if (next != null && !next.isWhitespace()) {
                // Terminator glued to following text (e.g. "example.com"): not a boundary.
                i += 1
                continue
            }

            val sentence = normalized.substring(start, end).trim()
            if (sentence.isNotEmpty()) {
                out += sentence
            }

            // Resume scanning at the first non-whitespace character after the boundary.
            start = end
            while (start < normalized.length && normalized[start].isWhitespace()) {
                start += 1
            }
            i = start
        }

        val tail = if (start < normalized.length) normalized.substring(start).trim() else ""
        if (tail.isNotEmpty()) {
            out += tail
        }

        return if (out.isNotEmpty()) out else listOf(normalized)
    }

    /**
     * Splits [text] into word tokens.
     *
     * Whitespace and delimiter punctuation separate tokens and are discarded; every CJK character
     * (see [isCjkChar]) becomes its own token. If nothing but delimiters remains, the trimmed input is
     * returned as a single token.
     *
     * @return the tokens in order; an empty list for blank input.
     */
    fun splitIntoWords(text: String): List<String> {
        val normalized = text.trim()
        if (normalized.isEmpty()) return emptyList()

        val out = mutableListOf<String>()
        val current = StringBuilder()

        fun flushCurrent() {
            val token = current.toString().trim()
            if (token.isNotEmpty()) {
                out += token
            }
            current.clear()
        }

        for (char in normalized) {
            when {
                char.isWhitespace() -> flushCurrent()
                isCjkChar(char) -> {
                    flushCurrent()
                    out += char.toString()
                }
                isWordDelimiter(char) -> flushCurrent()
                else -> current.append(char)
            }
        }

        flushCurrent()
        return if (out.isNotEmpty()) out else listOf(normalized)
    }

    /**
     * Builds one [SubtitleTimingItem] per non-blank segment, laying the segments out back to back.
     *
     * [chunkSampleCounts] is first aligned to the cleaned segment list (see [alignChunkCountsToSegments]).
     * Leading segments that receive zero samples are skipped; later zero-sample segments are emitted with
     * `start == end`.
     *
     * @return the cues in order, or an empty list when [sampleRate] is not positive or no segment remains.
     */
    fun buildSubtitlesFromChunks(
        segments: List<String>,
        chunkSampleCounts: List<Int>,
        sampleRate: Int
    ): List<SubtitleTimingItem> {
        if (sampleRate <= 0) return emptyList()

        val cleanedSegments = sanitizeSegments(segments)
        if (cleanedSegments.isEmpty()) return emptyList()

        val alignedCounts = alignChunkCountsToSegments(cleanedSegments, chunkSampleCounts)

        val subtitles = mutableListOf<SubtitleTimingItem>()
        var offsetSamples = 0

        for (index in cleanedSegments.indices) {
            val samples = alignedCounts.getOrElse(index) { 0 }.coerceAtLeast(0)
            if (samples == 0 && offsetSamples == 0) {
                continue
            }

            val startSec = offsetSamples.toDouble() / sampleRate.toDouble()
            offsetSamples += samples
            val endSec = offsetSamples.toDouble() / sampleRate.toDouble()

            subtitles += SubtitleTimingItem(
                text = cleanedSegments[index],
                start = startSec,
                end = endSec
            )
        }

        return subtitles
    }

    /**
     * Builds word-level cues by spreading each sentence's sample count over its words.
     *
     * Sentence counts are aligned to the cleaned sentences, each sentence is tokenised with
     * [splitIntoWords], and the sentence's samples are divided between its words by character length.
     * The flattened word list is then timed by [buildSubtitlesFromChunks].
     */
    fun buildWordSubtitlesFromSentenceChunks(
        sentences: List<String>,
        sentenceChunkSampleCounts: List<Int>,
        sampleRate: Int
    ): List<SubtitleTimingItem> {
        val cleanedSentences = sanitizeSegments(sentences)
        if (cleanedSentences.isEmpty()) return emptyList()

        val alignedSentenceCounts = alignChunkCountsToSegments(
            cleanedSentences,
            sentenceChunkSampleCounts
        )

        val wordSegments = mutableListOf<String>()
        val wordChunkCounts = mutableListOf<Int>()

        for (index in cleanedSentences.indices) {
            val sentence = cleanedSentences[index]
            val sentenceSamples = alignedSentenceCounts.getOrElse(index) { 0 }.coerceAtLeast(0)
            val words = splitIntoWords(sentence)
            if (words.isEmpty()) continue

            val distributed = distributeSamplesByTextWeight(sentenceSamples, words)
            for (wordIndex in words.indices) {
                wordSegments += words[wordIndex]
                wordChunkCounts += distributed.getOrElse(wordIndex) { 0 }
            }
        }

        return buildSubtitlesFromChunks(wordSegments, wordChunkCounts, sampleRate)
    }

    /** Trims every segment and drops the ones that end up empty. */
    private fun sanitizeSegments(segments: List<String>): List<String> {
        return segments
            .map { it.trim() }
            .filter { it.isNotEmpty() }
    }

    /**
     * Returns exactly one non-negative sample count per segment.
     *
     * - Same size: the (clamped) counts are used as-is.
     * - More counts than segments: the surplus is added to the last segment.
     * - Fewer counts than segments: the total is redistributed by segment length.
     */
    private fun alignChunkCountsToSegments(
        segments: List<String>,
        chunkSampleCounts: List<Int>
    ): List<Int> {
        if (segments.isEmpty()) return emptyList()

        val counts = chunkSampleCounts.map { max(0, it) }
        if (counts.size == segments.size) {
            return counts
        }

        if (counts.size > segments.size) {
            val merged = counts.take(segments.size).toMutableList()
            val extra = counts.drop(segments.size).sum()
            if (merged.isNotEmpty()) {
                val lastIndex = merged.lastIndex
                merged[lastIndex] = merged[lastIndex] + extra
            }
            return merged
        }

        return distributeSamplesByTextWeight(counts.sum(), segments)
    }

    /**
     * Splits [totalSamples] across [segments] proportionally to their character length (minimum weight 1).
     *
     * Each segment first gets the floor of its exact share; the samples left over are handed out one at a
     * time, largest fractional part first, so the returned counts always sum to the clamped total.
     */
    private fun distributeSamplesByTextWeight(totalSamples: Int, segments: List<String>): List<Int> {
        if (segments.isEmpty()) return emptyList()

        val safeTotal = totalSamples.coerceAtLeast(0)
        if (safeTotal == 0) {
            return List(segments.size) { 0 }
        }

        val weights = segments.map { max(1, it.length) }
        val weightSum = weights.sum().coerceAtLeast(1)

        val base = MutableList(segments.size) { 0 }
        val fractions = mutableListOf<Pair<Int, Double>>()

        for (index in segments.indices) {
            val exact = (safeTotal.toDouble() * weights[index].toDouble()) / weightSum.toDouble()
            val floorValue = floor(exact).toInt()
            base[index] = floorValue
            fractions += index to (exact - floorValue.toDouble())
        }

        var remaining = safeTotal - base.sum()
        if (remaining > 0) {
            // Stable sort: ties keep their original (text) order.
            val order = fractions.sortedByDescending { it.second }
            var ptr = 0
            while (remaining > 0) {
                val target = order[ptr % order.size].first
                base[target] = base[target] + 1
                remaining -= 1
                ptr += 1
            }
        }

        return base
    }

    private fun isSentenceTerminator(char: Char): Boolean {
        return sentenceTerminators.contains(char)
    }

    /**
     * Decides whether the period at [periodIndex] may end a sentence.
     *
     * Returns false for decimal points ("3.50"), known abbreviations ("Dr.") and single upper-case
     * initials ("A. Smith"); true otherwise.
     */
    private fun shouldSplitOnPeriod(text: String, periodIndex: Int): Boolean {
        val prev = text.getOrNull(periodIndex - 1)
        val next = text.getOrNull(periodIndex + 1)

        if (prev != null && next != null && prev.isDigit() && next.isDigit()) {
            return false
        }

        val tokenRaw = extractTokenBeforePeriod(text, periodIndex)
        if (commonAbbreviations.contains(tokenRaw.lowercase())) {
            return false
        }

        // Likely an initial; the original casing must be inspected, not the lower-cased token.
        if (tokenRaw.length == 1 && tokenRaw[0].isUpperCase()) {
            return false
        }

        return true
    }

    /**
     * Returns the run of letters and dots that ends just before [periodIndex] (skipping whitespace in
     * between), with any trailing dots removed. Returns "" when there is no such run.
     */
    private fun extractTokenBeforePeriod(text: String, periodIndex: Int): String {
        var i = periodIndex - 1
        while (i >= 0 && text[i].isWhitespace()) {
            i -= 1
        }

        val end = i
        while (i >= 0 && (text[i].isLetter() || text[i] == '.')) {
            i -= 1
        }

        if (end < i + 1) return ""

        return text.substring(i + 1, end + 1).trimEnd('.')
    }

    /**
     * Returns the index just past the terminator at [startIndex], any further terminators that follow it
     * ("?!", "..."), and then any trailing closers (quotes, brackets).
     */
    private fun sentenceBoundaryEnd(text: String, startIndex: Int): Int {
        var end = startIndex + 1
        while (end < text.length && isSentenceTerminator(text[end])) {
            end += 1
        }
        while (end < text.length && trailingClosers.contains(text[end])) {
            end += 1
        }
        return end
    }

    /**
     * True when [char] is a CJK unified ideograph (base block or Extension A), Hiragana, Katakana or a
     * Hangul syllable.
     *
     * Only blocks inside the Basic Multilingual Plane are checked: a [Char] is a single UTF-16 code unit,
     * so the supplementary-plane ideograph extensions can never be the block of a lone [Char], and
     * comparing against them would additionally tie this code to runtimes that define those constants.
     */
    private fun isCjkChar(char: Char): Boolean {
        val block = Character.UnicodeBlock.of(char)
        return block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS ||
            block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A ||
            block == Character.UnicodeBlock.HIRAGANA ||
            block == Character.UnicodeBlock.KATAKANA ||
            block == Character.UnicodeBlock.HANGUL_SYLLABLES
    }

    private fun isWordDelimiter(char: Char): Boolean {
        return wordDelimiters.contains(char)
    }
}
|
|
@@ -111,6 +111,46 @@ internal class SherpaOnnxTtsHelper(
|
|
|
111
111
|
}
|
|
112
112
|
}
|
|
113
113
|
|
|
114
|
+
/**
 * Boxes [n] as a real [java.lang.Integer] for the sherpa-onnx TTS JNI callbacks.
 *
 * libsherpa-onnx-jni looks up `invoke([F)Ljava/lang/Integer` (see sherpa-onnx `offline-tts.cc` CallCallback).
 * Kotlin `Function1<*, Int>` compiles to `invoke([F)I`, so GetMethodID fails and JNI aborts.
 * Using [java.lang.Integer] as the type parameter yields the boxed JVM signature the JNI expects, which is
 * why this helper returns [java.lang.Integer] rather than Kotlin [Int].
 * The cast at the call sites is only for the Kotlin API (`generateWithCallback` still declares
 * `Function1<FloatArray, Int>`).
 */
@Suppress("DEPRECATION")
private fun boxForTtsJni(n: Int): java.lang.Integer {
    // The deprecated constructor is used deliberately: it is typed as java.lang.Integer in Kotlin.
    return java.lang.Integer(n)
}
|
|
123
|
+
|
|
124
|
+
/**
 * Creates the per-chunk callback passed to the non-streaming generate calls.
 *
 * Every invocation appends the chunk's sample count to [sentenceChunkSizes] and returns that count boxed
 * via `boxForTtsJni`. The callback is an anonymous object typed against [java.lang.Integer] so its
 * `invoke` has a boxed return type; it is then cast to the Kotlin function type the TTS API declares.
 */
@Suppress("UNCHECKED_CAST")
private fun ttsChunkCallbackForJni(
    sentenceChunkSizes: MutableList<Int>
): kotlin.Function1<FloatArray, Int> {
    val jniCallback = object : kotlin.jvm.functions.Function1<FloatArray, java.lang.Integer> {
        override fun invoke(chunk: FloatArray): java.lang.Integer {
            val sampleCount = chunk.size
            sentenceChunkSizes.add(sampleCount)
            return boxForTtsJni(sampleCount)
        }
    }
    // Unchecked by design: the erased runtime type is the same Function1 interface.
    return jniCallback as kotlin.Function1<FloatArray, Int>
}
|
|
137
|
+
|
|
138
|
+
/**
 * Creates the per-chunk callback passed to the streaming generate calls.
 *
 * While [cancelled] is false, each chunk is forwarded to [onChunk] and the chunk's sample count is
 * returned; once [cancelled] is true the chunk is not forwarded and 0 is returned instead.
 * NOTE(review): a 0 return presumably tells the native side to stop generating — confirm against sherpa-onnx.
 * The anonymous object is typed against [java.lang.Integer] so its `invoke` has a boxed return type, then
 * cast to the Kotlin function type the TTS API declares.
 */
@Suppress("UNCHECKED_CAST")
private fun ttsStreamChunkCallbackForJni(
    cancelled: AtomicBoolean,
    onChunk: (FloatArray) -> Unit
): kotlin.Function1<FloatArray, Int> {
    val jniCallback = object : kotlin.jvm.functions.Function1<FloatArray, java.lang.Integer> {
        override fun invoke(chunk: FloatArray): java.lang.Integer {
            return if (cancelled.get()) {
                boxForTtsJni(0)
            } else {
                onChunk(chunk)
                boxForTtsJni(chunk.size)
            }
        }
    }
    // Unchecked by design: the erased runtime type is the same Function1 interface.
    return jniCallback as kotlin.Function1<FloatArray, Int>
}
|
|
153
|
+
|
|
114
154
|
/** Single-thread executor for TTS init so the RN bridge thread is not blocked (avoids Inspector/dev WebSocket races in debug builds). */
|
|
115
155
|
private val ttsInitExecutor = Executors.newSingleThreadExecutor()
|
|
116
156
|
|
|
@@ -451,9 +491,66 @@ internal class SherpaOnnxTtsHelper(
|
|
|
451
491
|
promise.reject("TTS_GENERATE_ERROR", "TTS not initialized")
|
|
452
492
|
return
|
|
453
493
|
}
|
|
494
|
+
|
|
495
|
+
val subtitleMode = getSubtitleMode(options)
|
|
496
|
+
val subtitleGranularity = getSubtitleGranularity(options)
|
|
497
|
+
if (isCharacterGranularityRequested(options) && subtitleMode != "accurate") {
|
|
498
|
+
Log.e(
|
|
499
|
+
"SherpaOnnxTts",
|
|
500
|
+
"TTS_SUBTITLE_ERROR: Character granularity is only supported when subtitleMode is 'accurate'"
|
|
501
|
+
)
|
|
502
|
+
promise.reject(
|
|
503
|
+
"TTS_SUBTITLE_ERROR",
|
|
504
|
+
"Character granularity is only supported when subtitleMode is 'accurate'."
|
|
505
|
+
)
|
|
506
|
+
return
|
|
507
|
+
}
|
|
508
|
+
|
|
454
509
|
val sid = getSid(options)
|
|
455
510
|
val speed = getSpeed(options)
|
|
511
|
+
val sentenceChunkSizes = mutableListOf<Int>()
|
|
456
512
|
val audio = when {
|
|
513
|
+
subtitleMode == "off" -> {
|
|
514
|
+
when {
|
|
515
|
+
hasReferenceAudio(options) && (inst.isZipvoice || inst.isPocket) -> {
|
|
516
|
+
if (inst.isZipvoice) {
|
|
517
|
+
val promptText = options!!.getString("referenceText")?.trim().orEmpty()
|
|
518
|
+
if (promptText.isEmpty()) {
|
|
519
|
+
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: Zipvoice voice cloning requires non-empty referenceText")
|
|
520
|
+
promise.reject(
|
|
521
|
+
"TTS_GENERATE_ERROR",
|
|
522
|
+
"Zipvoice voice cloning requires non-empty referenceText (transcript of reference audio)."
|
|
523
|
+
)
|
|
524
|
+
return
|
|
525
|
+
}
|
|
526
|
+
}
|
|
527
|
+
val config = parseGenerationConfig(options) ?: GenerationConfig(speed = speed, sid = sid)
|
|
528
|
+
inst.tts!!.generateWithConfig(text, config)
|
|
529
|
+
}
|
|
530
|
+
hasReferenceAudio(options) -> {
|
|
531
|
+
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: Reference audio is not supported for this TTS model type")
|
|
532
|
+
promise.reject(
|
|
533
|
+
"TTS_GENERATE_ERROR",
|
|
534
|
+
"Reference audio is only supported for Zipvoice and Pocket TTS."
|
|
535
|
+
)
|
|
536
|
+
return
|
|
537
|
+
}
|
|
538
|
+
inst.isPocket -> {
|
|
539
|
+
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: Pocket TTS requires reference audio for voice cloning")
|
|
540
|
+
promise.reject(
|
|
541
|
+
"TTS_GENERATE_ERROR",
|
|
542
|
+
"Pocket TTS requires reference audio for voice cloning. Pass referenceAudio and referenceSampleRate (> 0) in options."
|
|
543
|
+
)
|
|
544
|
+
return
|
|
545
|
+
}
|
|
546
|
+
else -> dispatchGenerate(inst, text, sid, speed)
|
|
547
|
+
?: run {
|
|
548
|
+
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: TTS not initialized")
|
|
549
|
+
promise.reject("TTS_GENERATE_ERROR", "TTS not initialized")
|
|
550
|
+
return
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
}
|
|
457
554
|
hasReferenceAudio(options) && (inst.isZipvoice || inst.isPocket) -> {
|
|
458
555
|
if (inst.isZipvoice) {
|
|
459
556
|
val promptText = options!!.getString("referenceText")?.trim().orEmpty()
|
|
@@ -467,7 +564,11 @@ internal class SherpaOnnxTtsHelper(
|
|
|
467
564
|
}
|
|
468
565
|
}
|
|
469
566
|
val config = parseGenerationConfig(options) ?: GenerationConfig(speed = speed, sid = sid)
|
|
470
|
-
inst.tts!!.
|
|
567
|
+
inst.tts!!.generateWithConfigAndCallback(
|
|
568
|
+
text,
|
|
569
|
+
config,
|
|
570
|
+
ttsChunkCallbackForJni(sentenceChunkSizes)
|
|
571
|
+
)
|
|
471
572
|
}
|
|
472
573
|
hasReferenceAudio(options) -> {
|
|
473
574
|
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: Reference audio is not supported for this TTS model type")
|
|
@@ -485,13 +586,21 @@ internal class SherpaOnnxTtsHelper(
|
|
|
485
586
|
)
|
|
486
587
|
return
|
|
487
588
|
}
|
|
488
|
-
else ->
|
|
489
|
-
|
|
589
|
+
else -> {
|
|
590
|
+
val tts = inst.tts
|
|
591
|
+
if (tts == null) {
|
|
490
592
|
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: TTS not initialized")
|
|
491
593
|
promise.reject("TTS_GENERATE_ERROR", "TTS not initialized")
|
|
492
594
|
return
|
|
493
595
|
}
|
|
596
|
+
tts.generateWithCallback(text, sid, speed, ttsChunkCallbackForJni(sentenceChunkSizes))
|
|
597
|
+
}
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
if (subtitleMode != "off" && sentenceChunkSizes.isEmpty() && audio.samples.isNotEmpty()) {
|
|
601
|
+
sentenceChunkSizes.add(audio.samples.size)
|
|
494
602
|
}
|
|
603
|
+
|
|
495
604
|
val map = Arguments.createMap()
|
|
496
605
|
val samplesArray = Arguments.createArray()
|
|
497
606
|
for (sample in audio.samples) {
|
|
@@ -499,17 +608,29 @@ internal class SherpaOnnxTtsHelper(
|
|
|
499
608
|
}
|
|
500
609
|
map.putArray("samples", samplesArray)
|
|
501
610
|
map.putInt("sampleRate", audio.sampleRate)
|
|
502
|
-
|
|
503
|
-
if (
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
611
|
+
|
|
612
|
+
val subtitleItems = if (subtitleMode == "off") {
|
|
613
|
+
emptyList()
|
|
614
|
+
} else {
|
|
615
|
+
val sentenceSegments = SherpaOnnxTextSegmenter.splitIntoSentences(text)
|
|
616
|
+
if (subtitleGranularity == "word") {
|
|
617
|
+
SherpaOnnxTextSegmenter.buildWordSubtitlesFromSentenceChunks(
|
|
618
|
+
sentenceSegments,
|
|
619
|
+
sentenceChunkSizes,
|
|
620
|
+
audio.sampleRate
|
|
621
|
+
)
|
|
622
|
+
} else {
|
|
623
|
+
SherpaOnnxTextSegmenter.buildSubtitlesFromChunks(
|
|
624
|
+
sentenceSegments,
|
|
625
|
+
sentenceChunkSizes,
|
|
626
|
+
audio.sampleRate
|
|
627
|
+
)
|
|
628
|
+
}
|
|
510
629
|
}
|
|
511
|
-
|
|
512
|
-
map.
|
|
630
|
+
|
|
631
|
+
map.putArray("subtitles", toSubtitleWritableArray(subtitleItems))
|
|
632
|
+
val timingMode = if (subtitleMode == "off") "off" else "estimated"
|
|
633
|
+
map.putString("timingMode", timingMode)
|
|
513
634
|
promise.resolve(map)
|
|
514
635
|
} catch (e: Exception) {
|
|
515
636
|
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: ${e.message ?: "Failed to generate speech"}", e)
|
|
@@ -564,18 +685,23 @@ internal class SherpaOnnxTtsHelper(
|
|
|
564
685
|
when {
|
|
565
686
|
hasReferenceAudio(options) && inst.isPocket -> {
|
|
566
687
|
val config = parseGenerationConfig(options) ?: GenerationConfig(speed = speed, sid = sid)
|
|
567
|
-
inst.tts!!.generateWithConfigAndCallback(
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
chunk
|
|
571
|
-
|
|
688
|
+
inst.tts!!.generateWithConfigAndCallback(
|
|
689
|
+
text,
|
|
690
|
+
config,
|
|
691
|
+
ttsStreamChunkCallbackForJni(inst.ttsStreamCancelled) { chunk ->
|
|
692
|
+
emitChunk(instanceId, requestId, chunk, sampleRate, 0f, false)
|
|
693
|
+
}
|
|
694
|
+
)
|
|
572
695
|
}
|
|
573
696
|
else -> {
|
|
574
|
-
inst.tts!!.generateWithCallback(
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
697
|
+
inst.tts!!.generateWithCallback(
|
|
698
|
+
text,
|
|
699
|
+
sid,
|
|
700
|
+
speed,
|
|
701
|
+
ttsStreamChunkCallbackForJni(inst.ttsStreamCancelled) { chunk ->
|
|
702
|
+
emitChunk(instanceId, requestId, chunk, sampleRate, 0f, false)
|
|
703
|
+
}
|
|
704
|
+
)
|
|
579
705
|
}
|
|
580
706
|
}
|
|
581
707
|
if (!inst.ttsStreamCancelled.get()) {
|
|
@@ -915,6 +1041,37 @@ internal class SherpaOnnxTtsHelper(
|
|
|
915
1041
|
/** Reads the "speed" option as a Float; falls back to 1.0 when [options] is null or has no such key. */
private fun getSpeed(options: ReadableMap?): Float {
    if (options == null || !options.hasKey("speed")) return 1.0f
    return options.getDouble("speed").toFloat()
}
|
|
917
1043
|
|
|
1044
|
+
/**
 * Reads the "subtitleMode" option, trimmed and lower-cased.
 *
 * @return "off", "fast" or "accurate"; any other value (or a missing option) yields "fast".
 */
private fun getSubtitleMode(options: ReadableMap?): String {
    val requested = options?.getString("subtitleMode")?.trim()?.lowercase()
    return requested?.takeIf { it == "off" || it == "fast" || it == "accurate" } ?: "fast"
}
|
|
1051
|
+
|
|
1052
|
+
/**
 * Reads the "subtitleGranularity" option, trimmed and lower-cased.
 *
 * @return "word" or "sentence"; any other value (or a missing option) yields "sentence".
 */
private fun getSubtitleGranularity(options: ReadableMap?): String {
    val requested = options?.getString("subtitleGranularity")?.trim()?.lowercase()
    return requested?.takeIf { it == "word" || it == "sentence" } ?: "sentence"
}
|
|
1059
|
+
|
|
1060
|
+
/** True when the "subtitleGranularity" option, trimmed and lower-cased, equals "character". */
private fun isCharacterGranularityRequested(options: ReadableMap?): Boolean =
    options?.getString("subtitleGranularity")?.trim()?.lowercase() == "character"
|
|
1064
|
+
|
|
1065
|
+
/**
 * Converts [items] into a bridge array of maps, one per cue, each holding the keys
 * "text", "start" and "end".
 */
private fun toSubtitleWritableArray(items: List<SubtitleTimingItem>) = Arguments.createArray().also { array ->
    items.forEach { item ->
        val entry = Arguments.createMap()
        entry.putString("text", item.text)
        entry.putDouble("start", item.start)
        entry.putDouble("end", item.end)
        array.pushMap(entry)
    }
}
|
|
1074
|
+
|
|
918
1075
|
/** Build Kotlin GenerationConfig from ReadableMap. Returns null only when options is null; otherwise returns a config with sid, speed, silenceScale, numSteps, and any reference/extra fields from options. */
|
|
919
1076
|
private fun parseGenerationConfig(options: ReadableMap?): GenerationConfig? {
|
|
920
1077
|
if (options == null) return null
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
asset_name,license_type,commercial_use,confidence,detection_source,license_file
|
|
2
|
+
wav2vec2-base-960h-fp16.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md
|
|
3
|
+
wav2vec2-base-960h-int8.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md
|
|
4
|
+
wav2vec2-base-960h-q4f16.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md
|
|
5
|
+
wav2vec2-base-960h.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md
|