react-native-sherpa-onnx 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/android/src/main/assets/model_licenses/alignment-models-license-status.csv +5 -0
- package/android/src/main/cpp/CMakeLists.txt +3 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.cpp +66 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.h +17 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-alignment.cpp +108 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +30 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.cpp +66 -0
- package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.h +30 -0
- package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +21 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxAlignmentHelper.kt +555 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +76 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTextSegmenter.kt +330 -0
- package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +114 -10
- package/ios/Resources/model_licenses/alignment-models-license-status.csv +5 -0
- package/ios/SherpaOnnx+Alignment.mm +704 -0
- package/ios/SherpaOnnx+STT.mm +6 -0
- package/ios/SherpaOnnx+TTS.mm +624 -50
- package/ios/model_detect/sherpa-onnx-model-detect-alignment.mm +108 -0
- package/ios/model_detect/sherpa-onnx-model-detect.h +31 -0
- package/ios/model_detect/sherpa-onnx-validate-alignment.h +30 -0
- package/ios/model_detect/sherpa-onnx-validate-alignment.mm +66 -0
- package/ios/stt/sherpa-onnx-stt-wrapper.h +3 -1
- package/ios/stt/sherpa-onnx-stt-wrapper.mm +6 -0
- package/lib/module/NativeSherpaOnnx.js.map +1 -1
- package/lib/module/alignment/index.js +27 -0
- package/lib/module/alignment/index.js.map +1 -0
- package/lib/module/alignment/types.js +2 -0
- package/lib/module/alignment/types.js.map +1 -0
- package/lib/module/alignment/vocab.js +40 -0
- package/lib/module/alignment/vocab.js.map +1 -0
- package/lib/module/download/paths.js +9 -1
- package/lib/module/download/paths.js.map +1 -1
- package/lib/module/download/registry.js +17 -1
- package/lib/module/download/registry.js.map +1 -1
- package/lib/module/download/types.js +1 -0
- package/lib/module/download/types.js.map +1 -1
- package/lib/module/index.js +6 -4
- package/lib/module/index.js.map +1 -1
- package/lib/module/licenses.js +8 -2
- package/lib/module/licenses.js.map +1 -1
- package/lib/module/stt/types.js.map +1 -1
- package/lib/module/tts/index.js +68 -2
- package/lib/module/tts/index.js.map +1 -1
- package/lib/module/tts/subtitles.js +400 -0
- package/lib/module/tts/subtitles.js.map +1 -0
- package/lib/module/tts/tempAudio.js +17 -0
- package/lib/module/tts/tempAudio.js.map +1 -0
- package/lib/module/tts/types.js.map +1 -1
- package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -3
- package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
- package/lib/typescript/src/alignment/index.d.ts +8 -0
- package/lib/typescript/src/alignment/index.d.ts.map +1 -0
- package/lib/typescript/src/alignment/types.d.ts +23 -0
- package/lib/typescript/src/alignment/types.d.ts.map +1 -0
- package/lib/typescript/src/alignment/vocab.d.ts +5 -0
- package/lib/typescript/src/alignment/vocab.d.ts.map +1 -0
- package/lib/typescript/src/download/paths.d.ts +5 -2
- package/lib/typescript/src/download/paths.d.ts.map +1 -1
- package/lib/typescript/src/download/registry.d.ts.map +1 -1
- package/lib/typescript/src/download/types.d.ts +2 -1
- package/lib/typescript/src/download/types.d.ts.map +1 -1
- package/lib/typescript/src/index.d.ts +1 -0
- package/lib/typescript/src/index.d.ts.map +1 -1
- package/lib/typescript/src/licenses.d.ts.map +1 -1
- package/lib/typescript/src/stt/types.d.ts +5 -2
- package/lib/typescript/src/stt/types.d.ts.map +1 -1
- package/lib/typescript/src/tts/index.d.ts +2 -1
- package/lib/typescript/src/tts/index.d.ts.map +1 -1
- package/lib/typescript/src/tts/subtitles.d.ts +24 -0
- package/lib/typescript/src/tts/subtitles.d.ts.map +1 -0
- package/lib/typescript/src/tts/tempAudio.d.ts +3 -0
- package/lib/typescript/src/tts/tempAudio.d.ts.map +1 -0
- package/lib/typescript/src/tts/types.d.ts +68 -2
- package/lib/typescript/src/tts/types.d.ts.map +1 -1
- package/package.json +6 -1
- package/scripts/alignment-models/README.md +90 -0
- package/scripts/alignment-models/build_and_upload.js +724 -0
- package/scripts/alignment-models/sources.csv +5 -0
- package/scripts/alignment-models/sync_alignment_license_status.js +123 -0
- package/src/NativeSherpaOnnx.ts +35 -3
- package/src/alignment/index.ts +41 -0
- package/src/alignment/types.ts +22 -0
- package/src/alignment/vocab.ts +38 -0
- package/src/download/paths.ts +18 -5
- package/src/download/registry.ts +23 -3
- package/src/download/types.ts +1 -0
- package/src/index.tsx +6 -4
- package/src/licenses.ts +12 -1
- package/src/stt/types.ts +5 -2
- package/src/tts/index.ts +110 -3
- package/src/tts/subtitles.ts +611 -0
- package/src/tts/tempAudio.ts +31 -0
- package/src/tts/types.ts +79 -2
- package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
package com.sherpaonnx
|
|
2
|
+
|
|
3
|
+
import kotlin.math.floor
|
|
4
|
+
import kotlin.math.max
|
|
5
|
+
|
|
6
|
+
/**
 * One timed subtitle entry.
 *
 * @property text The subtitle text (a sentence, word, or single character token).
 * @property start Start time in seconds, measured from the beginning of the audio.
 * @property end End time in seconds, measured from the beginning of the audio.
 */
internal data class SubtitleTimingItem(
    val text: String,
    val start: Double,
    val end: Double,
)
|
|
11
|
+
|
|
12
|
+
internal object SherpaOnnxTextSegmenter {
|
|
13
|
+
/**
 * Characters that may end a sentence: ASCII `.`, `!`, `?`, `;`, the ideographic full stop,
 * and the full-width forms of `!`, `?`, `;`.
 *
 * The full-width forms are written as escapes so they cannot be silently folded into their
 * ASCII look-alikes (which would only duplicate entries already in the set).
 */
private val sentenceTerminators = setOf(
    '.', '!', '?', ';',
    '。',      // U+3002 ideographic full stop
    '\uFF01', // full-width exclamation mark
    '\uFF1F', // full-width question mark
    '\uFF1B'  // full-width semicolon
)

/**
 * Closing quotes and brackets that are kept attached to the sentence when they directly
 * follow its terminator(s).
 */
private val trailingClosers = setOf(
    '"', '\'', ')', ']', '}', '>',
    '”', '’', '」', '』', '】',
    '\uFF09' // full-width right parenthesis
)

/**
 * Lower-case abbreviations (without their final period) after which a period does not end
 * a sentence.
 */
private val commonAbbreviations = setOf(
    "mr",
    "mrs",
    "ms",
    "dr",
    "prof",
    "sr",
    "jr",
    "st",
    "vs",
    "etc",
    "e.g",
    "i.e"
)
|
|
29
|
+
|
|
30
|
+
/**
 * Splits [text] into trimmed, non-empty sentences.
 *
 * A boundary is accepted at a sentence terminator when all of the following hold:
 * - for `.`, [shouldSplitOnPeriod] agrees (decimals, known abbreviations and single
 *   upper-case initials do not split);
 * - the character after the run of terminators and trailing closers is whitespace or the
 *   end of the text.
 *
 * Any text left after the last boundary becomes a final sentence.
 *
 * @return the sentences in order; an empty list for blank input; a single-element list with
 *   the trimmed input when no sentence could be extracted.
 */
fun splitIntoSentences(text: String): List<String> {
    val normalized = text.trim()
    if (normalized.isEmpty()) return emptyList()

    val sentences = mutableListOf<String>()
    var sentenceStart = 0
    var cursor = 0

    while (cursor < normalized.length) {
        val ch = normalized[cursor]

        // A candidate is a terminator that is not vetoed by the period heuristics.
        val isCandidate = isSentenceTerminator(ch) &&
            (ch != '.' || shouldSplitOnPeriod(normalized, cursor))
        if (!isCandidate) {
            cursor += 1
            continue
        }

        // Only split when the boundary is followed by whitespace or the end of the text.
        val boundary = sentenceBoundaryEnd(normalized, cursor)
        val follower = normalized.getOrNull(boundary)
        if (follower != null && !follower.isWhitespace()) {
            cursor += 1
            continue
        }

        val piece = normalized.substring(sentenceStart, boundary).trim()
        if (piece.isNotEmpty()) sentences += piece

        // Resume scanning at the first non-whitespace character after the boundary.
        sentenceStart = boundary
        while (sentenceStart < normalized.length && normalized[sentenceStart].isWhitespace()) {
            sentenceStart += 1
        }
        cursor = sentenceStart
    }

    val remainder = normalized.substring(sentenceStart).trim()
    if (remainder.isNotEmpty()) sentences += remainder

    return sentences.ifEmpty { listOf(normalized) }
}
|
|
76
|
+
|
|
77
|
+
/**
 * Splits [text] into word tokens.
 *
 * - Whitespace and characters accepted by [isWordDelimiter] end the current token and are
 *   dropped.
 * - Every CJK character (see [isCjkChar]) becomes a token of its own.
 * - CJK ideographs outside the Basic Multilingual Plane (Extensions B–G) are stored as a
 *   surrogate pair; the pair is decoded and also emitted as a single-character token.
 *   Other surrogate pairs (for example emoji) stay part of the surrounding token.
 *
 * @return the tokens in order; an empty list for blank input; a single-element list with the
 *   trimmed input when it contains no token characters at all.
 */
fun splitIntoWords(text: String): List<String> {
    val normalized = text.trim()
    if (normalized.isEmpty()) return emptyList()

    val out = mutableListOf<String>()
    val current = StringBuilder()

    fun flushCurrent() {
        val token = current.toString().trim()
        if (token.isNotEmpty()) {
            out += token
        }
        current.clear()
    }

    var index = 0
    while (index < normalized.length) {
        val char = normalized[index]
        val trailing = normalized.getOrNull(index + 1)

        // Supplementary-plane characters span two UTF-16 units; classify the pair as a whole
        // so that ideographs in CJK Extensions B–G are recognised.
        if (char.isHighSurrogate() && trailing != null && trailing.isLowSurrogate()) {
            if (isSupplementaryCjkCodePoint(Character.toCodePoint(char, trailing))) {
                flushCurrent()
                out += normalized.substring(index, index + 2)
            } else {
                current.append(char).append(trailing)
            }
            index += 2
            continue
        }

        when {
            char.isWhitespace() -> flushCurrent()
            isCjkChar(char) -> {
                flushCurrent()
                out += char.toString()
            }
            isWordDelimiter(char) -> flushCurrent()
            else -> current.append(char)
        }
        index += 1
    }

    flushCurrent()
    return if (out.isNotEmpty()) out else listOf(normalized)
}

/** Returns true when [codePoint] lies in one of the CJK Unified Ideographs Extension B–G blocks. */
private fun isSupplementaryCjkCodePoint(codePoint: Int): Boolean =
    when (Character.UnicodeBlock.of(codePoint)) {
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G -> true
        else -> false
    }
|
|
107
|
+
|
|
108
|
+
/**
 * Builds back-to-back subtitle timings for [segments] from per-chunk sample counts.
 *
 * Segments are trimmed and blank ones dropped; [chunkSampleCounts] is then aligned to the
 * remaining segments via [alignChunkCountsToSegments]. Each segment starts where the previous
 * one ended, and times are the running sample offset divided by [sampleRate] (seconds).
 * Segments with zero samples are skipped for as long as the running offset is still zero.
 *
 * @param segments subtitle texts in playback order.
 * @param chunkSampleCounts number of audio samples per generated chunk.
 * @param sampleRate samples per second; a non-positive value yields an empty list.
 * @return the timed subtitle items, possibly empty.
 */
fun buildSubtitlesFromChunks(
    segments: List<String>,
    chunkSampleCounts: List<Int>,
    sampleRate: Int
): List<SubtitleTimingItem> {
    if (sampleRate <= 0) return emptyList()

    val texts = sanitizeSegments(segments)
    if (texts.isEmpty()) return emptyList()

    val perSegment = alignChunkCountsToSegments(texts, chunkSampleCounts)
    val rate = sampleRate.toDouble()
    val items = mutableListOf<SubtitleTimingItem>()
    var cursor = 0

    texts.forEachIndexed { position, segmentText ->
        val length = max(0, perSegment.getOrElse(position) { 0 })
        val isLeadingSilent = length == 0 && cursor == 0
        if (!isLeadingSilent) {
            val begin = cursor
            cursor += length
            items += SubtitleTimingItem(
                text = segmentText,
                start = begin.toDouble() / rate,
                end = cursor.toDouble() / rate
            )
        }
    }

    return items
}
|
|
142
|
+
|
|
143
|
+
/**
 * Builds word-level subtitle timings from sentence-level sample counts.
 *
 * Each sentence's sample count (after alignment with [alignChunkCountsToSegments]) is divided
 * among its words in proportion to word length using [distributeSamplesByTextWeight]; the
 * flattened words and counts are then timed by [buildSubtitlesFromChunks].
 *
 * @param sentences sentence texts in playback order.
 * @param sentenceChunkSampleCounts number of audio samples per generated sentence chunk.
 * @param sampleRate samples per second, forwarded to [buildSubtitlesFromChunks].
 * @return the timed word items, possibly empty.
 */
fun buildWordSubtitlesFromSentenceChunks(
    sentences: List<String>,
    sentenceChunkSampleCounts: List<Int>,
    sampleRate: Int
): List<SubtitleTimingItem> {
    val sentenceTexts = sanitizeSegments(sentences)
    if (sentenceTexts.isEmpty()) return emptyList()

    val perSentence = alignChunkCountsToSegments(sentenceTexts, sentenceChunkSampleCounts)

    val allWords = mutableListOf<String>()
    val allCounts = mutableListOf<Int>()

    sentenceTexts.forEachIndexed { sentencePos, sentenceText ->
        val budget = max(0, perSentence.getOrElse(sentencePos) { 0 })
        val tokens = splitIntoWords(sentenceText)
        if (tokens.isNotEmpty()) {
            val shares = distributeSamplesByTextWeight(budget, tokens)
            tokens.forEachIndexed { tokenPos, token ->
                allWords += token
                allCounts += shares.getOrElse(tokenPos) { 0 }
            }
        }
    }

    return buildSubtitlesFromChunks(allWords, allCounts, sampleRate)
}
|
|
174
|
+
|
|
175
|
+
/** Trims every segment and drops those that are empty after trimming, keeping the order. */
private fun sanitizeSegments(segments: List<String>): List<String> =
    segments.mapNotNull { raw ->
        raw.trim().takeIf { trimmed -> trimmed.isNotEmpty() }
    }
|
|
180
|
+
|
|
181
|
+
/**
 * Produces one sample count per segment from [chunkSampleCounts].
 *
 * Negative counts are treated as zero. Then:
 * - same number of counts and segments: the counts are returned as they are;
 * - more counts than segments: the surplus counts are added to the last segment;
 * - fewer counts than segments: the total is re-distributed over all segments by text length
 *   via [distributeSamplesByTextWeight].
 *
 * @return a list with exactly `segments.size` entries, or an empty list when there are no
 *   segments.
 */
private fun alignChunkCountsToSegments(
    segments: List<String>,
    chunkSampleCounts: List<Int>
): List<Int> {
    if (segments.isEmpty()) return emptyList()

    val nonNegative = chunkSampleCounts.map { it.coerceAtLeast(0) }
    val wanted = segments.size

    return when {
        nonNegative.size == wanted -> nonNegative
        nonNegative.size > wanted -> {
            val surplus = nonNegative.subList(wanted, nonNegative.size).sum()
            // `wanted` is at least 1 here, so the truncated list always has a last element.
            nonNegative.take(wanted).toMutableList().also { it[it.lastIndex] += surplus }
        }
        else -> distributeSamplesByTextWeight(nonNegative.sum(), segments)
    }
}
|
|
204
|
+
|
|
205
|
+
/**
 * Splits [totalSamples] into integer shares, one per segment, proportional to segment length.
 *
 * Each segment weighs its character count (at least 1). Every share is first rounded down;
 * the samples lost to rounding are then handed out one at a time, starting with the segments
 * whose exact share had the largest fractional part (ties keep the original segment order).
 *
 * @param totalSamples samples to distribute; negative values are treated as zero.
 * @param segments texts whose lengths act as weights.
 * @return one share per segment, summing to the non-negative total; an empty list when there
 *   are no segments.
 */
private fun distributeSamplesByTextWeight(totalSamples: Int, segments: List<String>): List<Int> {
    if (segments.isEmpty()) return emptyList()

    val budget = max(0, totalSamples)
    if (budget == 0) return List(segments.size) { 0 }

    val weights = segments.map { it.length.coerceAtLeast(1) }
    val totalWeight = max(1, weights.sum())

    val exactShares = weights.map { weight ->
        (budget.toDouble() * weight.toDouble()) / totalWeight.toDouble()
    }
    val shares = exactShares.map { floor(it).toInt() }.toMutableList()

    var leftover = budget - shares.sum()
    if (leftover > 0) {
        // Ranking is computed once, before any share is bumped.
        val ranking = exactShares.indices.sortedByDescending { idx ->
            exactShares[idx] - shares[idx].toDouble()
        }
        var turn = 0
        while (leftover > 0) {
            shares[ranking[turn % ranking.size]] += 1
            leftover -= 1
            turn += 1
        }
    }

    return shares
}
|
|
243
|
+
|
|
244
|
+
/** Returns true when [char] is one of the configured sentence-ending characters. */
private fun isSentenceTerminator(char: Char): Boolean = char in sentenceTerminators
|
|
247
|
+
|
|
248
|
+
/**
 * Decides whether the period at [periodIndex] in [text] may end a sentence.
 *
 * Returns false when the period:
 * - sits between two digits (a decimal such as `3.14`);
 * - follows a token listed in [commonAbbreviations] (compared case-insensitively);
 * - follows a single upper-case letter, which is treated as a name initial (`A. Smith`).
 *
 * Otherwise returns true.
 */
private fun shouldSplitOnPeriod(text: String, periodIndex: Int): Boolean {
    val before = text.getOrNull(periodIndex - 1)
    val after = text.getOrNull(periodIndex + 1)
    if (before?.isDigit() == true && after?.isDigit() == true) return false

    val token = extractTokenBeforePeriod(text, periodIndex)
    val isAbbreviation = token.lowercase() in commonAbbreviations
    // The initial check must look at the original casing, not the lower-cased token.
    val isInitial = token.length == 1 && token[0].isUpperCase()

    return !(isAbbreviation || isInitial)
}
|
|
269
|
+
|
|
270
|
+
/**
 * Returns the run of letters and periods that ends just before [periodIndex] in [text].
 *
 * Whitespace directly before the period is skipped first. The scan then walks left while it
 * sees letters or `.`; trailing periods are stripped from the result, so `e.g.` yields `e.g`.
 *
 * @return the token, or an empty string when no letter/period run precedes the period.
 */
private fun extractTokenBeforePeriod(text: String, periodIndex: Int): String {
    var cursor = periodIndex - 1
    while (cursor >= 0 && text[cursor].isWhitespace()) {
        cursor -= 1
    }
    val lastIndex = cursor

    while (cursor >= 0 && (text[cursor].isLetter() || text[cursor] == '.')) {
        cursor -= 1
    }
    val firstIndex = cursor + 1

    if (lastIndex < firstIndex) return ""
    return text.substring(firstIndex, lastIndex + 1).trimEnd('.')
}
|
|
294
|
+
|
|
295
|
+
/**
 * Returns the exclusive end index of the sentence whose terminator is at [startIndex].
 *
 * Starting right after [startIndex], the boundary first extends over any further sentence
 * terminators (so `?!` or `...` stay together) and then over any trailing closers such as
 * quotes or brackets.
 */
private fun sentenceBoundaryEnd(text: String, startIndex: Int): Int {
    val length = text.length
    val first = startIndex + 1

    val afterTerminators = (first until length)
        .firstOrNull { !isSentenceTerminator(text[it]) }
        ?: max(first, length)

    return (afterTerminators until length)
        .firstOrNull { text[it] !in trailingClosers }
        ?: max(afterTerminators, length)
}
|
|
305
|
+
|
|
306
|
+
/**
 * Returns true when [char] belongs to one of the listed CJK Unicode blocks: CJK Unified
 * Ideographs (including Extensions A–G), Hiragana, Katakana or Hangul Syllables.
 *
 * The blocks are compared in order and the first match wins.
 */
private fun isCjkChar(char: Char): Boolean =
    when (Character.UnicodeBlock.of(char)) {
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F,
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G,
        Character.UnicodeBlock.HIRAGANA,
        Character.UnicodeBlock.KATAKANA,
        Character.UnicodeBlock.HANGUL_SYLLABLES -> true
        else -> false
    }
|
|
320
|
+
|
|
321
|
+
/**
 * Returns true when [char] separates words and must not appear inside a word token.
 *
 * Covers ASCII punctuation and symbols, the horizontal ellipsis, the ideographic full stop
 * and comma, and the full-width forms of `,`, `!`, `?`, `;`, `:`. The full-width forms are
 * written as escapes so they cannot be silently folded into their ASCII look-alikes (which
 * would merely repeat conditions already listed). Hyphen and underscore are deliberately not
 * delimiters.
 */
private fun isWordDelimiter(char: Char): Boolean {
    return when (char) {
        // ASCII punctuation and symbols.
        '.', ',', '!', '?', ';', ':', '(', ')', '[', ']', '{', '}',
        '"', '\'', '`', '~', '<', '>', '/', '\\', '|', '@', '#', '$',
        '%', '^', '&', '*', '+', '=',
        // Horizontal ellipsis, ideographic full stop, ideographic comma.
        '…', '。', '、',
        // Full-width comma, exclamation mark, question mark, semicolon, colon.
        '\uFF0C', '\uFF01', '\uFF1F', '\uFF1B', '\uFF1A' -> true
        else -> false
    }
}
|
|
330
|
+
}
|
|
@@ -491,10 +491,66 @@ internal class SherpaOnnxTtsHelper(
|
|
|
491
491
|
promise.reject("TTS_GENERATE_ERROR", "TTS not initialized")
|
|
492
492
|
return
|
|
493
493
|
}
|
|
494
|
+
|
|
495
|
+
val subtitleMode = getSubtitleMode(options)
|
|
496
|
+
val subtitleGranularity = getSubtitleGranularity(options)
|
|
497
|
+
if (isCharacterGranularityRequested(options) && subtitleMode != "accurate") {
|
|
498
|
+
Log.e(
|
|
499
|
+
"SherpaOnnxTts",
|
|
500
|
+
"TTS_SUBTITLE_ERROR: Character granularity is only supported when subtitleMode is 'accurate'"
|
|
501
|
+
)
|
|
502
|
+
promise.reject(
|
|
503
|
+
"TTS_SUBTITLE_ERROR",
|
|
504
|
+
"Character granularity is only supported when subtitleMode is 'accurate'."
|
|
505
|
+
)
|
|
506
|
+
return
|
|
507
|
+
}
|
|
508
|
+
|
|
494
509
|
val sid = getSid(options)
|
|
495
510
|
val speed = getSpeed(options)
|
|
496
511
|
val sentenceChunkSizes = mutableListOf<Int>()
|
|
497
512
|
val audio = when {
|
|
513
|
+
subtitleMode == "off" -> {
|
|
514
|
+
when {
|
|
515
|
+
hasReferenceAudio(options) && (inst.isZipvoice || inst.isPocket) -> {
|
|
516
|
+
if (inst.isZipvoice) {
|
|
517
|
+
val promptText = options!!.getString("referenceText")?.trim().orEmpty()
|
|
518
|
+
if (promptText.isEmpty()) {
|
|
519
|
+
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: Zipvoice voice cloning requires non-empty referenceText")
|
|
520
|
+
promise.reject(
|
|
521
|
+
"TTS_GENERATE_ERROR",
|
|
522
|
+
"Zipvoice voice cloning requires non-empty referenceText (transcript of reference audio)."
|
|
523
|
+
)
|
|
524
|
+
return
|
|
525
|
+
}
|
|
526
|
+
}
|
|
527
|
+
val config = parseGenerationConfig(options) ?: GenerationConfig(speed = speed, sid = sid)
|
|
528
|
+
inst.tts!!.generateWithConfig(text, config)
|
|
529
|
+
}
|
|
530
|
+
hasReferenceAudio(options) -> {
|
|
531
|
+
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: Reference audio is not supported for this TTS model type")
|
|
532
|
+
promise.reject(
|
|
533
|
+
"TTS_GENERATE_ERROR",
|
|
534
|
+
"Reference audio is only supported for Zipvoice and Pocket TTS."
|
|
535
|
+
)
|
|
536
|
+
return
|
|
537
|
+
}
|
|
538
|
+
inst.isPocket -> {
|
|
539
|
+
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: Pocket TTS requires reference audio for voice cloning")
|
|
540
|
+
promise.reject(
|
|
541
|
+
"TTS_GENERATE_ERROR",
|
|
542
|
+
"Pocket TTS requires reference audio for voice cloning. Pass referenceAudio and referenceSampleRate (> 0) in options."
|
|
543
|
+
)
|
|
544
|
+
return
|
|
545
|
+
}
|
|
546
|
+
else -> dispatchGenerate(inst, text, sid, speed)
|
|
547
|
+
?: run {
|
|
548
|
+
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: TTS not initialized")
|
|
549
|
+
promise.reject("TTS_GENERATE_ERROR", "TTS not initialized")
|
|
550
|
+
return
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
}
|
|
498
554
|
hasReferenceAudio(options) && (inst.isZipvoice || inst.isPocket) -> {
|
|
499
555
|
if (inst.isZipvoice) {
|
|
500
556
|
val promptText = options!!.getString("referenceText")?.trim().orEmpty()
|
|
@@ -540,6 +596,11 @@ internal class SherpaOnnxTtsHelper(
|
|
|
540
596
|
tts.generateWithCallback(text, sid, speed, ttsChunkCallbackForJni(sentenceChunkSizes))
|
|
541
597
|
}
|
|
542
598
|
}
|
|
599
|
+
|
|
600
|
+
if (subtitleMode != "off" && sentenceChunkSizes.isEmpty() && audio.samples.isNotEmpty()) {
|
|
601
|
+
sentenceChunkSizes.add(audio.samples.size)
|
|
602
|
+
}
|
|
603
|
+
|
|
543
604
|
val map = Arguments.createMap()
|
|
544
605
|
val samplesArray = Arguments.createArray()
|
|
545
606
|
for (sample in audio.samples) {
|
|
@@ -547,17 +608,29 @@ internal class SherpaOnnxTtsHelper(
|
|
|
547
608
|
}
|
|
548
609
|
map.putArray("samples", samplesArray)
|
|
549
610
|
map.putInt("sampleRate", audio.sampleRate)
|
|
550
|
-
|
|
551
|
-
if (
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
611
|
+
|
|
612
|
+
val subtitleItems = if (subtitleMode == "off") {
|
|
613
|
+
emptyList()
|
|
614
|
+
} else {
|
|
615
|
+
val sentenceSegments = SherpaOnnxTextSegmenter.splitIntoSentences(text)
|
|
616
|
+
if (subtitleGranularity == "word") {
|
|
617
|
+
SherpaOnnxTextSegmenter.buildWordSubtitlesFromSentenceChunks(
|
|
618
|
+
sentenceSegments,
|
|
619
|
+
sentenceChunkSizes,
|
|
620
|
+
audio.sampleRate
|
|
621
|
+
)
|
|
622
|
+
} else {
|
|
623
|
+
SherpaOnnxTextSegmenter.buildSubtitlesFromChunks(
|
|
624
|
+
sentenceSegments,
|
|
625
|
+
sentenceChunkSizes,
|
|
626
|
+
audio.sampleRate
|
|
627
|
+
)
|
|
628
|
+
}
|
|
558
629
|
}
|
|
559
|
-
|
|
560
|
-
map.
|
|
630
|
+
|
|
631
|
+
map.putArray("subtitles", toSubtitleWritableArray(subtitleItems))
|
|
632
|
+
val timingMode = if (subtitleMode == "off") "off" else "estimated"
|
|
633
|
+
map.putString("timingMode", timingMode)
|
|
561
634
|
promise.resolve(map)
|
|
562
635
|
} catch (e: Exception) {
|
|
563
636
|
Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: ${e.message ?: "Failed to generate speech"}", e)
|
|
@@ -968,6 +1041,37 @@ internal class SherpaOnnxTtsHelper(
|
|
|
968
1041
|
/** Reads the `speed` option as a Float; returns 1.0 when [options] is null or has no `speed` key. */
private fun getSpeed(options: ReadableMap?): Float =
    options
        ?.takeIf { it.hasKey("speed") }
        ?.getDouble("speed")
        ?.toFloat()
        ?: 1.0f
|
|
970
1043
|
|
|
1044
|
+
/**
 * Reads the `subtitleMode` option, trimmed and lower-cased.
 *
 * @return `"off"`, `"fast"` or `"accurate"` when the option holds one of these values;
 *   `"fast"` for anything else, including a null [options] or a null option value.
 */
private fun getSubtitleMode(options: ReadableMap?): String =
    options?.getString("subtitleMode")
        ?.trim()
        ?.lowercase()
        ?.takeIf { it == "off" || it == "fast" || it == "accurate" }
        ?: "fast"
|
|
1051
|
+
|
|
1052
|
+
/**
 * Reads the `subtitleGranularity` option, trimmed and lower-cased.
 *
 * @return `"word"` or `"sentence"` when the option holds one of these values; `"sentence"`
 *   for anything else, including a null [options] or a null option value.
 */
private fun getSubtitleGranularity(options: ReadableMap?): String =
    options?.getString("subtitleGranularity")
        ?.trim()
        ?.lowercase()
        ?.takeIf { it == "word" || it == "sentence" }
        ?: "sentence"
|
|
1059
|
+
|
|
1060
|
+
/**
 * Returns true when the `subtitleGranularity` option, trimmed and lower-cased, equals
 * `"character"`; false otherwise, including when [options] or the option value is null.
 */
private fun isCharacterGranularityRequested(options: ReadableMap?): Boolean =
    options?.getString("subtitleGranularity")?.trim()?.lowercase() == "character"
|
|
1064
|
+
|
|
1065
|
+
/**
 * Converts [items] into a writable array of maps, one per subtitle, each carrying the keys
 * `text`, `start` and `end`, in the same order as [items].
 */
private fun toSubtitleWritableArray(items: List<SubtitleTimingItem>) = Arguments.createArray().also { array ->
    items.forEach { subtitle ->
        array.pushMap(
            Arguments.createMap().apply {
                putString("text", subtitle.text)
                putDouble("start", subtitle.start)
                putDouble("end", subtitle.end)
            }
        )
    }
}
|
|
1074
|
+
|
|
971
1075
|
/** Build Kotlin GenerationConfig from ReadableMap. Returns null only when options is null; otherwise returns a config with sid, speed, silenceScale, numSteps, and any reference/extra fields from options. */
|
|
972
1076
|
private fun parseGenerationConfig(options: ReadableMap?): GenerationConfig? {
|
|
973
1077
|
if (options == null) return null
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
asset_name,license_type,commercial_use,confidence,detection_source,license_file
|
|
2
|
+
wav2vec2-base-960h-fp16.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md
|
|
3
|
+
wav2vec2-base-960h-int8.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md
|
|
4
|
+
wav2vec2-base-960h-q4f16.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md
|
|
5
|
+
wav2vec2-base-960h.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md
|