react-native-sherpa-onnx 0.4.0 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. package/README.md +3 -0
  2. package/android/src/main/assets/model_licenses/alignment-models-license-status.csv +5 -0
  3. package/android/src/main/cpp/CMakeLists.txt +3 -0
  4. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.cpp +66 -0
  5. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-alignment-wrapper.h +17 -0
  6. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect-alignment.cpp +108 -0
  7. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-model-detect.h +30 -0
  8. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.cpp +66 -0
  9. package/android/src/main/cpp/jni/model_detect/sherpa-onnx-validate-alignment.h +30 -0
  10. package/android/src/main/cpp/jni/module/sherpa-onnx-module-jni.cpp +21 -0
  11. package/android/src/main/java/com/sherpaonnx/SherpaOnnxAlignmentHelper.kt +555 -0
  12. package/android/src/main/java/com/sherpaonnx/SherpaOnnxModule.kt +76 -0
  13. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTextSegmenter.kt +330 -0
  14. package/android/src/main/java/com/sherpaonnx/SherpaOnnxTtsHelper.kt +180 -23
  15. package/ios/Resources/model_licenses/alignment-models-license-status.csv +5 -0
  16. package/ios/SherpaOnnx+Alignment.mm +704 -0
  17. package/ios/SherpaOnnx+STT.mm +6 -0
  18. package/ios/SherpaOnnx+TTS.mm +624 -50
  19. package/ios/model_detect/sherpa-onnx-model-detect-alignment.mm +108 -0
  20. package/ios/model_detect/sherpa-onnx-model-detect.h +31 -0
  21. package/ios/model_detect/sherpa-onnx-validate-alignment.h +30 -0
  22. package/ios/model_detect/sherpa-onnx-validate-alignment.mm +66 -0
  23. package/ios/stt/sherpa-onnx-stt-wrapper.h +3 -1
  24. package/ios/stt/sherpa-onnx-stt-wrapper.mm +6 -0
  25. package/lib/module/NativeSherpaOnnx.js.map +1 -1
  26. package/lib/module/alignment/index.js +27 -0
  27. package/lib/module/alignment/index.js.map +1 -0
  28. package/lib/module/alignment/types.js +2 -0
  29. package/lib/module/alignment/types.js.map +1 -0
  30. package/lib/module/alignment/vocab.js +40 -0
  31. package/lib/module/alignment/vocab.js.map +1 -0
  32. package/lib/module/download/paths.js +9 -1
  33. package/lib/module/download/paths.js.map +1 -1
  34. package/lib/module/download/registry.js +17 -1
  35. package/lib/module/download/registry.js.map +1 -1
  36. package/lib/module/download/types.js +1 -0
  37. package/lib/module/download/types.js.map +1 -1
  38. package/lib/module/index.js +6 -4
  39. package/lib/module/index.js.map +1 -1
  40. package/lib/module/licenses.js +8 -2
  41. package/lib/module/licenses.js.map +1 -1
  42. package/lib/module/stt/types.js.map +1 -1
  43. package/lib/module/tts/index.js +68 -2
  44. package/lib/module/tts/index.js.map +1 -1
  45. package/lib/module/tts/subtitles.js +400 -0
  46. package/lib/module/tts/subtitles.js.map +1 -0
  47. package/lib/module/tts/tempAudio.js +17 -0
  48. package/lib/module/tts/tempAudio.js.map +1 -0
  49. package/lib/module/tts/types.js.map +1 -1
  50. package/lib/typescript/src/NativeSherpaOnnx.d.ts +34 -3
  51. package/lib/typescript/src/NativeSherpaOnnx.d.ts.map +1 -1
  52. package/lib/typescript/src/alignment/index.d.ts +8 -0
  53. package/lib/typescript/src/alignment/index.d.ts.map +1 -0
  54. package/lib/typescript/src/alignment/types.d.ts +23 -0
  55. package/lib/typescript/src/alignment/types.d.ts.map +1 -0
  56. package/lib/typescript/src/alignment/vocab.d.ts +5 -0
  57. package/lib/typescript/src/alignment/vocab.d.ts.map +1 -0
  58. package/lib/typescript/src/download/paths.d.ts +5 -2
  59. package/lib/typescript/src/download/paths.d.ts.map +1 -1
  60. package/lib/typescript/src/download/registry.d.ts.map +1 -1
  61. package/lib/typescript/src/download/types.d.ts +2 -1
  62. package/lib/typescript/src/download/types.d.ts.map +1 -1
  63. package/lib/typescript/src/index.d.ts +1 -0
  64. package/lib/typescript/src/index.d.ts.map +1 -1
  65. package/lib/typescript/src/licenses.d.ts.map +1 -1
  66. package/lib/typescript/src/stt/types.d.ts +5 -2
  67. package/lib/typescript/src/stt/types.d.ts.map +1 -1
  68. package/lib/typescript/src/tts/index.d.ts +2 -1
  69. package/lib/typescript/src/tts/index.d.ts.map +1 -1
  70. package/lib/typescript/src/tts/subtitles.d.ts +24 -0
  71. package/lib/typescript/src/tts/subtitles.d.ts.map +1 -0
  72. package/lib/typescript/src/tts/tempAudio.d.ts +3 -0
  73. package/lib/typescript/src/tts/tempAudio.d.ts.map +1 -0
  74. package/lib/typescript/src/tts/types.d.ts +68 -2
  75. package/lib/typescript/src/tts/types.d.ts.map +1 -1
  76. package/package.json +6 -1
  77. package/scripts/alignment-models/README.md +90 -0
  78. package/scripts/alignment-models/build_and_upload.js +724 -0
  79. package/scripts/alignment-models/sources.csv +5 -0
  80. package/scripts/alignment-models/sync_alignment_license_status.js +123 -0
  81. package/src/NativeSherpaOnnx.ts +35 -3
  82. package/src/alignment/index.ts +41 -0
  83. package/src/alignment/types.ts +22 -0
  84. package/src/alignment/vocab.ts +38 -0
  85. package/src/download/paths.ts +18 -5
  86. package/src/download/registry.ts +23 -3
  87. package/src/download/types.ts +1 -0
  88. package/src/index.tsx +6 -4
  89. package/src/licenses.ts +12 -1
  90. package/src/stt/types.ts +5 -2
  91. package/src/tts/index.ts +110 -3
  92. package/src/tts/subtitles.ts +611 -0
  93. package/src/tts/tempAudio.ts +31 -0
  94. package/src/tts/types.ts +79 -2
  95. package/third_party/sherpa-onnx-prebuilt/IOS_RELEASE_TAG +1 -1
@@ -0,0 +1,330 @@
1
+ package com.sherpaonnx
2
+
3
+ import kotlin.math.floor
4
+ import kotlin.math.max
5
+
6
/**
 * A single subtitle cue.
 *
 * @property text the text shown for this cue
 * @property start cue start in seconds, measured from the beginning of the audio
 * @property end cue end in seconds, measured from the beginning of the audio
 */
internal data class SubtitleTimingItem(
    val text: String,
    val start: Double,
    val end: Double
)

/**
 * Heuristic text segmentation (sentences / words) plus estimation of subtitle timings
 * from the number of audio samples produced per generated chunk.
 *
 * All timings are estimates: samples are attributed to text purely by chunk order and,
 * where chunk counts and text segments do not line up, by relative text length.
 */
internal object SherpaOnnxTextSegmenter {
    /**
     * Full-width sentence terminators: ideographic full stop, full-width `!`, `?` and `;`.
     * Written as escapes so they cannot be confused with (or normalized into) their ASCII twins.
     */
    private val fullWidthSentenceTerminators = setOf('\u3002', '\uFF01', '\uFF1F', '\uFF1B')

    /** Every character that may end a sentence. */
    private val sentenceTerminators = setOf('.', '!', '?', ';') + fullWidthSentenceTerminators

    /** Closing quotes / brackets that still belong to the sentence they follow (`\uFF09` is the full-width `)`). */
    private val trailingClosers =
        setOf('"', '\'', ')', ']', '}', '>', '”', '’', '」', '』', '】', '\uFF09')

    /** Lower-cased tokens after which a period does not end the sentence. */
    private val commonAbbreviations = setOf(
        "mr",
        "mrs",
        "ms",
        "dr",
        "prof",
        "sr",
        "jr",
        "st",
        "vs",
        "etc",
        "e.g",
        "i.e"
    )

    /**
     * Punctuation that separates words. The escaped entries are the ideographic full stop and
     * the full-width `,` `!` `?` `;` `:`.
     */
    private val wordDelimiters: Set<Char> = setOf(
        '.', ',', '!', '?', ';', ':', '(', ')', '[', ']', '{', '}',
        '"', '\'', '`', '~', '<', '>', '/', '\\', '|', '@', '#', '$',
        '%', '^', '&', '*', '+', '=', '…', '、',
        '\u3002', '\uFF0C', '\uFF01', '\uFF1F', '\uFF1B', '\uFF1A'
    )

    /**
     * Code-point ranges treated as "one character = one word".
     * Expressed as numeric ranges (instead of `Character.UnicodeBlock` constants) so that the
     * supplementary-plane extensions are actually reachable and no runtime-version-specific
     * `UnicodeBlock` field is referenced.
     */
    private val cjkCodePointRanges = listOf(
        0x3040..0x309F, // Hiragana
        0x30A0..0x30FF, // Katakana
        0x3400..0x4DBF, // CJK Unified Ideographs Extension A
        0x4E00..0x9FFF, // CJK Unified Ideographs
        0xAC00..0xD7AF, // Hangul Syllables
        0x20000..0x2A6DF, // Extension B
        0x2A700..0x2B73F, // Extension C
        0x2B740..0x2B81F, // Extension D
        0x2B820..0x2CEAF, // Extension E
        0x2CEB0..0x2EBEF, // Extension F
        0x30000..0x3134F // Extension G
    )

    /**
     * Splits [text] into trimmed, non-empty sentences.
     *
     * A sentence ends at a run of terminators (plus any trailing closers). ASCII terminators
     * only count when followed by whitespace or the end of the text, which keeps things like
     * `3.14` or `example.com` intact; full-width terminators split unconditionally because
     * CJK text does not put whitespace between sentences. Periods after a known abbreviation,
     * between two digits, or after a single upper-case initial never split.
     *
     * @return the sentences in order; empty for blank input; the whole trimmed text as a single
     *   element if no sentence could be extracted.
     */
    fun splitIntoSentences(text: String): List<String> {
        val normalized = text.trim()
        if (normalized.isEmpty()) return emptyList()

        val sentences = mutableListOf<String>()
        var sentenceStart = 0
        var i = 0

        while (i < normalized.length) {
            val current = normalized[i]
            val isCandidate = isSentenceTerminator(current) &&
                (current != '.' || shouldSplitOnPeriod(normalized, i))
            if (!isCandidate) {
                i += 1
                continue
            }

            val end = sentenceBoundaryEnd(normalized, i)
            val next = normalized.getOrNull(end)
            val requiresWhitespaceAfter = current !in fullWidthSentenceTerminators
            if (requiresWhitespaceAfter && next != null && !next.isWhitespace()) {
                i += 1
                continue
            }

            val sentence = normalized.substring(sentenceStart, end).trim()
            if (sentence.isNotEmpty()) {
                sentences += sentence
            }

            // Resume scanning at the first non-whitespace character after the boundary.
            sentenceStart = end
            while (sentenceStart < normalized.length && normalized[sentenceStart].isWhitespace()) {
                sentenceStart += 1
            }
            i = sentenceStart
        }

        if (sentenceStart < normalized.length) {
            val tail = normalized.substring(sentenceStart).trim()
            if (tail.isNotEmpty()) {
                sentences += tail
            }
        }

        return if (sentences.isNotEmpty()) sentences else listOf(normalized)
    }

    /**
     * Splits [text] into words.
     *
     * Whitespace and the characters in [wordDelimiters] separate words and are dropped; every
     * CJK ideograph / kana / Hangul syllable becomes its own token. The text is walked by code
     * point so that supplementary-plane ideographs are handled as single characters.
     *
     * @return the tokens in order; empty for blank input; the whole trimmed text as a single
     *   element if it contains nothing but delimiters.
     */
    fun splitIntoWords(text: String): List<String> {
        val normalized = text.trim()
        if (normalized.isEmpty()) return emptyList()

        val words = mutableListOf<String>()
        val pending = StringBuilder()

        fun flushPending() {
            if (pending.isNotEmpty()) {
                words += pending.toString()
                pending.setLength(0)
            }
        }

        var index = 0
        while (index < normalized.length) {
            val codePoint = normalized.codePointAt(index)
            val width = Character.charCount(codePoint)
            // Whitespace and delimiters all live in the BMP, so they are only tested for width 1.
            val bmpChar = if (width == 1) normalized[index] else null

            when {
                bmpChar != null && bmpChar.isWhitespace() -> flushPending()
                isCjkCodePoint(codePoint) -> {
                    flushPending()
                    words += normalized.substring(index, index + width)
                }
                bmpChar != null && isWordDelimiter(bmpChar) -> flushPending()
                else -> pending.append(normalized, index, index + width)
            }
            index += width
        }

        flushPending()
        return if (words.isNotEmpty()) words else listOf(normalized)
    }

    /**
     * Builds consecutive subtitle cues for [segments] from per-chunk sample counts.
     *
     * Segments are trimmed and blank ones dropped. Chunk counts are then aligned to the
     * remaining segments (see [alignChunkCountsToSegments]) and laid end to end. Segments with
     * zero samples at the very start of the timeline are omitted.
     *
     * @param sampleRate samples per second; a non-positive value yields an empty result.
     * @return cues whose `start`/`end` are in seconds.
     */
    fun buildSubtitlesFromChunks(
        segments: List<String>,
        chunkSampleCounts: List<Int>,
        sampleRate: Int
    ): List<SubtitleTimingItem> {
        if (sampleRate <= 0) return emptyList()

        val cleanedSegments = sanitizeSegments(segments)
        if (cleanedSegments.isEmpty()) return emptyList()

        val alignedCounts = alignChunkCountsToSegments(cleanedSegments, chunkSampleCounts)
        val rate = sampleRate.toDouble()

        val subtitles = mutableListOf<SubtitleTimingItem>()
        // Long so the running offset cannot wrap when many chunks are accumulated.
        var offsetSamples = 0L

        for ((index, segmentText) in cleanedSegments.withIndex()) {
            val samples = alignedCounts.getOrElse(index) { 0 }.coerceAtLeast(0)
            if (samples == 0 && offsetSamples == 0L) {
                continue
            }

            val startSec = offsetSamples.toDouble() / rate
            offsetSamples += samples
            val endSec = offsetSamples.toDouble() / rate

            subtitles += SubtitleTimingItem(text = segmentText, start = startSec, end = endSec)
        }

        return subtitles
    }

    /**
     * Builds word-level cues: each sentence's samples are spread over its words in proportion
     * to word length, then the words are timed with [buildSubtitlesFromChunks].
     *
     * @param sentences sentence texts, in order.
     * @param sentenceChunkSampleCounts sample counts per generated chunk, aligned to the
     *   sentences the same way as in [buildSubtitlesFromChunks].
     * @param sampleRate samples per second; a non-positive value yields an empty result.
     */
    fun buildWordSubtitlesFromSentenceChunks(
        sentences: List<String>,
        sentenceChunkSampleCounts: List<Int>,
        sampleRate: Int
    ): List<SubtitleTimingItem> {
        val cleanedSentences = sanitizeSegments(sentences)
        if (cleanedSentences.isEmpty()) return emptyList()

        val sentenceCounts = alignChunkCountsToSegments(cleanedSentences, sentenceChunkSampleCounts)

        val wordSegments = mutableListOf<String>()
        val wordChunkCounts = mutableListOf<Int>()

        cleanedSentences.forEachIndexed { index, sentence ->
            val sentenceSamples = sentenceCounts.getOrElse(index) { 0 }.coerceAtLeast(0)
            val words = splitIntoWords(sentence)
            val shares = distributeSamplesByTextWeight(sentenceSamples, words)
            words.forEachIndexed { wordIndex, word ->
                wordSegments += word
                wordChunkCounts += shares.getOrElse(wordIndex) { 0 }
            }
        }

        return buildSubtitlesFromChunks(wordSegments, wordChunkCounts, sampleRate)
    }

    /** Trims every segment and drops the ones that end up empty. */
    private fun sanitizeSegments(segments: List<String>): List<String> =
        segments.map { it.trim() }.filter { it.isNotEmpty() }

    /**
     * Returns exactly one non-negative sample count per segment.
     *
     * - same number of counts and segments: counts are used as-is (negatives clamped to 0);
     * - more counts than segments: the surplus is added to the last segment;
     * - fewer counts than segments: the total is redistributed by text length.
     */
    private fun alignChunkCountsToSegments(
        segments: List<String>,
        chunkSampleCounts: List<Int>
    ): List<Int> {
        if (segments.isEmpty()) return emptyList()

        val counts = chunkSampleCounts.map { max(0, it) }
        return when {
            counts.size == segments.size -> counts
            counts.size > segments.size -> {
                val merged = counts.take(segments.size).toMutableList()
                merged[merged.lastIndex] += counts.drop(segments.size).sum()
                merged
            }
            else -> distributeSamplesByTextWeight(counts.sum(), segments)
        }
    }

    /**
     * Splits [totalSamples] across [segments] proportionally to their length (minimum weight 1).
     * Shares are floored and the leftover samples are handed out one at a time, largest
     * fractional part first (ties keep segment order), so the result always sums to the total.
     */
    private fun distributeSamplesByTextWeight(totalSamples: Int, segments: List<String>): List<Int> {
        if (segments.isEmpty()) return emptyList()

        val safeTotal = totalSamples.coerceAtLeast(0)
        if (safeTotal == 0) return List(segments.size) { 0 }

        val weights = segments.map { max(1, it.length) }
        val weightSum = weights.sum().coerceAtLeast(1).toDouble()

        val exactShares = weights.map { safeTotal.toDouble() * it.toDouble() / weightSum }
        val allocation = exactShares.map { floor(it).toInt() }.toMutableList()

        var remaining = safeTotal - allocation.sum()
        if (remaining > 0) {
            // Stable sort: equal fractions are served in segment order.
            val byFractionDesc = exactShares.indices.sortedByDescending { exactShares[it] - allocation[it] }
            var cursor = 0
            while (remaining > 0) {
                val target = byFractionDesc[cursor % byFractionDesc.size]
                allocation[target] += 1
                remaining -= 1
                cursor += 1
            }
        }

        return allocation
    }

    private fun isSentenceTerminator(char: Char): Boolean = char in sentenceTerminators

    /**
     * Decides whether the period at [periodIndex] may end a sentence. It may not when it sits
     * between two digits, follows a known abbreviation, or follows a single upper-case letter
     * (a likely initial such as "A. Smith").
     */
    private fun shouldSplitOnPeriod(text: String, periodIndex: Int): Boolean {
        val prev = text.getOrNull(periodIndex - 1)
        val next = text.getOrNull(periodIndex + 1)
        if (prev != null && next != null && prev.isDigit() && next.isDigit()) return false

        val token = extractTokenBeforePeriod(text, periodIndex)
        if (token.lowercase() in commonAbbreviations) return false

        // The initial check must look at the original case, not the lower-cased token.
        val looksLikeInitial = token.length == 1 && token[0].isUpperCase()
        return !looksLikeInitial
    }

    /**
     * Returns the run of letters and inner periods that directly precedes [periodIndex]
     * (skipping whitespace before the period), without trailing periods; empty if there is none.
     */
    private fun extractTokenBeforePeriod(text: String, periodIndex: Int): String {
        var tokenEnd = periodIndex - 1
        while (tokenEnd >= 0 && text[tokenEnd].isWhitespace()) {
            tokenEnd -= 1
        }

        // Walk left over letters and periods; tokenStart ends one position before the token.
        var tokenStart = tokenEnd
        while (tokenStart >= 0 && (text[tokenStart].isLetter() || text[tokenStart] == '.')) {
            tokenStart -= 1
        }

        if (tokenEnd <= tokenStart) return ""
        return text.substring(tokenStart + 1, tokenEnd + 1).trimEnd('.')
    }

    /**
     * Returns the exclusive end index of the sentence whose first terminator is at [startIndex]:
     * the terminator, any further terminators, then any trailing closers.
     */
    private fun sentenceBoundaryEnd(text: String, startIndex: Int): Int {
        var end = startIndex + 1
        while (end < text.length && isSentenceTerminator(text[end])) {
            end += 1
        }
        while (end < text.length && text[end] in trailingClosers) {
            end += 1
        }
        return end
    }

    /** True when [codePoint] falls in one of [cjkCodePointRanges]. */
    private fun isCjkCodePoint(codePoint: Int): Boolean = cjkCodePointRanges.any { codePoint in it }

    private fun isWordDelimiter(char: Char): Boolean = char in wordDelimiters
}
@@ -111,6 +111,46 @@ internal class SherpaOnnxTtsHelper(
111
111
  }
112
112
  }
113
113
 
114
+ /**
115
+ * libsherpa-onnx-jni looks up `invoke([F)Ljava/lang/Integer;` (see sherpa-onnx `offline-tts.cc` CallCallback).
116
+ * Kotlin `Function1<*, Int>` compiles to `invoke([F)I`, so GetMethodID fails and JNI aborts.
117
+ * Using [java.lang.Integer] as the type parameter yields the boxed JVM signature the JNI expects.
118
+ * The cast is only for the Kotlin API (`generateWithCallback` still declares `Function1<FloatArray, Int>`).
119
+ */
120
/**
 * Boxes [n] as a real [java.lang.Integer] for the sherpa-onnx JNI callback.
 *
 * The return type is deliberately the Java class rather than Kotlin [Int]: a primitive
 * `invoke([F)I` signature breaks the sherpa JNI lookup.
 */
@Suppress("DEPRECATION")
private fun boxForTtsJni(n: Int): java.lang.Integer {
    return java.lang.Integer(n)
}
123
+
124
/**
 * Creates the per-chunk callback passed to the non-streaming TTS generate calls.
 *
 * Each invocation appends the chunk's sample count to [sentenceChunkSizes] and returns that
 * count boxed as [java.lang.Integer]. The object is declared with the boxed return type and
 * only cast to `Function1<FloatArray, Int>` to satisfy the Kotlin-side API.
 */
@Suppress("UNCHECKED_CAST")
private fun ttsChunkCallbackForJni(
    sentenceChunkSizes: MutableList<Int>
): kotlin.Function1<FloatArray, Int> {
    val jniCallback = object : kotlin.jvm.functions.Function1<FloatArray, java.lang.Integer> {
        override fun invoke(chunk: FloatArray): java.lang.Integer {
            val sampleCount = chunk.size
            sentenceChunkSizes.add(sampleCount)
            return boxForTtsJni(sampleCount)
        }
    }
    return jniCallback as kotlin.Function1<FloatArray, Int>
}
137
+
138
/**
 * Creates the per-chunk callback passed to the streaming TTS generate calls.
 *
 * When [cancelled] is set the callback returns a boxed `0` without forwarding the chunk;
 * otherwise it hands the chunk to [onChunk] and returns the boxed chunk size. The object is
 * declared with a [java.lang.Integer] return type and only cast to
 * `Function1<FloatArray, Int>` to satisfy the Kotlin-side API.
 */
@Suppress("UNCHECKED_CAST")
private fun ttsStreamChunkCallbackForJni(
    cancelled: AtomicBoolean,
    onChunk: (FloatArray) -> Unit
): kotlin.Function1<FloatArray, Int> {
    val jniCallback = object : kotlin.jvm.functions.Function1<FloatArray, java.lang.Integer> {
        override fun invoke(chunk: FloatArray): java.lang.Integer =
            if (cancelled.get()) {
                boxForTtsJni(0)
            } else {
                onChunk(chunk)
                boxForTtsJni(chunk.size)
            }
    }
    return jniCallback as kotlin.Function1<FloatArray, Int>
}
153
+
114
154
  /** Single-thread executor for TTS init so the RN bridge thread is not blocked (avoids Inspector/dev WebSocket races in debug builds). */
115
155
  private val ttsInitExecutor = Executors.newSingleThreadExecutor()
116
156
 
@@ -451,9 +491,66 @@ internal class SherpaOnnxTtsHelper(
451
491
  promise.reject("TTS_GENERATE_ERROR", "TTS not initialized")
452
492
  return
453
493
  }
494
+
495
+ val subtitleMode = getSubtitleMode(options)
496
+ val subtitleGranularity = getSubtitleGranularity(options)
497
+ if (isCharacterGranularityRequested(options) && subtitleMode != "accurate") {
498
+ Log.e(
499
+ "SherpaOnnxTts",
500
+ "TTS_SUBTITLE_ERROR: Character granularity is only supported when subtitleMode is 'accurate'"
501
+ )
502
+ promise.reject(
503
+ "TTS_SUBTITLE_ERROR",
504
+ "Character granularity is only supported when subtitleMode is 'accurate'."
505
+ )
506
+ return
507
+ }
508
+
454
509
  val sid = getSid(options)
455
510
  val speed = getSpeed(options)
511
+ val sentenceChunkSizes = mutableListOf<Int>()
456
512
  val audio = when {
513
+ subtitleMode == "off" -> {
514
+ when {
515
+ hasReferenceAudio(options) && (inst.isZipvoice || inst.isPocket) -> {
516
+ if (inst.isZipvoice) {
517
+ val promptText = options!!.getString("referenceText")?.trim().orEmpty()
518
+ if (promptText.isEmpty()) {
519
+ Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: Zipvoice voice cloning requires non-empty referenceText")
520
+ promise.reject(
521
+ "TTS_GENERATE_ERROR",
522
+ "Zipvoice voice cloning requires non-empty referenceText (transcript of reference audio)."
523
+ )
524
+ return
525
+ }
526
+ }
527
+ val config = parseGenerationConfig(options) ?: GenerationConfig(speed = speed, sid = sid)
528
+ inst.tts!!.generateWithConfig(text, config)
529
+ }
530
+ hasReferenceAudio(options) -> {
531
+ Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: Reference audio is not supported for this TTS model type")
532
+ promise.reject(
533
+ "TTS_GENERATE_ERROR",
534
+ "Reference audio is only supported for Zipvoice and Pocket TTS."
535
+ )
536
+ return
537
+ }
538
+ inst.isPocket -> {
539
+ Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: Pocket TTS requires reference audio for voice cloning")
540
+ promise.reject(
541
+ "TTS_GENERATE_ERROR",
542
+ "Pocket TTS requires reference audio for voice cloning. Pass referenceAudio and referenceSampleRate (> 0) in options."
543
+ )
544
+ return
545
+ }
546
+ else -> dispatchGenerate(inst, text, sid, speed)
547
+ ?: run {
548
+ Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: TTS not initialized")
549
+ promise.reject("TTS_GENERATE_ERROR", "TTS not initialized")
550
+ return
551
+ }
552
+ }
553
+ }
457
554
  hasReferenceAudio(options) && (inst.isZipvoice || inst.isPocket) -> {
458
555
  if (inst.isZipvoice) {
459
556
  val promptText = options!!.getString("referenceText")?.trim().orEmpty()
@@ -467,7 +564,11 @@ internal class SherpaOnnxTtsHelper(
467
564
  }
468
565
  }
469
566
  val config = parseGenerationConfig(options) ?: GenerationConfig(speed = speed, sid = sid)
470
- inst.tts!!.generateWithConfig(text, config)
567
+ inst.tts!!.generateWithConfigAndCallback(
568
+ text,
569
+ config,
570
+ ttsChunkCallbackForJni(sentenceChunkSizes)
571
+ )
471
572
  }
472
573
  hasReferenceAudio(options) -> {
473
574
  Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: Reference audio is not supported for this TTS model type")
@@ -485,13 +586,21 @@ internal class SherpaOnnxTtsHelper(
485
586
  )
486
587
  return
487
588
  }
488
- else -> dispatchGenerate(inst, text, sid, speed)
489
- ?: run {
589
+ else -> {
590
+ val tts = inst.tts
591
+ if (tts == null) {
490
592
  Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: TTS not initialized")
491
593
  promise.reject("TTS_GENERATE_ERROR", "TTS not initialized")
492
594
  return
493
595
  }
596
+ tts.generateWithCallback(text, sid, speed, ttsChunkCallbackForJni(sentenceChunkSizes))
597
+ }
598
+ }
599
+
600
+ if (subtitleMode != "off" && sentenceChunkSizes.isEmpty() && audio.samples.isNotEmpty()) {
601
+ sentenceChunkSizes.add(audio.samples.size)
494
602
  }
603
+
495
604
  val map = Arguments.createMap()
496
605
  val samplesArray = Arguments.createArray()
497
606
  for (sample in audio.samples) {
@@ -499,17 +608,29 @@ internal class SherpaOnnxTtsHelper(
499
608
  }
500
609
  map.putArray("samples", samplesArray)
501
610
  map.putInt("sampleRate", audio.sampleRate)
502
- val subtitlesArray = Arguments.createArray()
503
- if (audio.samples.isNotEmpty() && audio.sampleRate > 0) {
504
- val durationSec = audio.samples.size.toDouble() / audio.sampleRate
505
- val subtitleMap = Arguments.createMap()
506
- subtitleMap.putString("text", text)
507
- subtitleMap.putDouble("start", 0.0)
508
- subtitleMap.putDouble("end", durationSec)
509
- subtitlesArray.pushMap(subtitleMap)
611
+
612
+ val subtitleItems = if (subtitleMode == "off") {
613
+ emptyList()
614
+ } else {
615
+ val sentenceSegments = SherpaOnnxTextSegmenter.splitIntoSentences(text)
616
+ if (subtitleGranularity == "word") {
617
+ SherpaOnnxTextSegmenter.buildWordSubtitlesFromSentenceChunks(
618
+ sentenceSegments,
619
+ sentenceChunkSizes,
620
+ audio.sampleRate
621
+ )
622
+ } else {
623
+ SherpaOnnxTextSegmenter.buildSubtitlesFromChunks(
624
+ sentenceSegments,
625
+ sentenceChunkSizes,
626
+ audio.sampleRate
627
+ )
628
+ }
510
629
  }
511
- map.putArray("subtitles", subtitlesArray)
512
- map.putBoolean("estimated", true)
630
+
631
+ map.putArray("subtitles", toSubtitleWritableArray(subtitleItems))
632
+ val timingMode = if (subtitleMode == "off") "off" else "estimated"
633
+ map.putString("timingMode", timingMode)
513
634
  promise.resolve(map)
514
635
  } catch (e: Exception) {
515
636
  Log.e("SherpaOnnxTts", "TTS_GENERATE_ERROR: ${e.message ?: "Failed to generate speech"}", e)
@@ -564,18 +685,23 @@ internal class SherpaOnnxTtsHelper(
564
685
  when {
565
686
  hasReferenceAudio(options) && inst.isPocket -> {
566
687
  val config = parseGenerationConfig(options) ?: GenerationConfig(speed = speed, sid = sid)
567
- inst.tts!!.generateWithConfigAndCallback(text, config) { chunk ->
568
- if (inst.ttsStreamCancelled.get()) return@generateWithConfigAndCallback 0
569
- emitChunk(instanceId, requestId, chunk, sampleRate, 0f, false)
570
- chunk.size
571
- }
688
+ inst.tts!!.generateWithConfigAndCallback(
689
+ text,
690
+ config,
691
+ ttsStreamChunkCallbackForJni(inst.ttsStreamCancelled) { chunk ->
692
+ emitChunk(instanceId, requestId, chunk, sampleRate, 0f, false)
693
+ }
694
+ )
572
695
  }
573
696
  else -> {
574
- inst.tts!!.generateWithCallback(text, sid, speed) { chunk ->
575
- if (inst.ttsStreamCancelled.get()) return@generateWithCallback 0
576
- emitChunk(instanceId, requestId, chunk, sampleRate, 0f, false)
577
- chunk.size
578
- }
697
+ inst.tts!!.generateWithCallback(
698
+ text,
699
+ sid,
700
+ speed,
701
+ ttsStreamChunkCallbackForJni(inst.ttsStreamCancelled) { chunk ->
702
+ emitChunk(instanceId, requestId, chunk, sampleRate, 0f, false)
703
+ }
704
+ )
579
705
  }
580
706
  }
581
707
  if (!inst.ttsStreamCancelled.get()) {
@@ -915,6 +1041,37 @@ internal class SherpaOnnxTtsHelper(
915
1041
  private fun getSpeed(options: ReadableMap?): Float =
916
1042
  if (options != null && options.hasKey("speed")) options.getDouble("speed").toFloat() else 1.0f
917
1043
 
1044
/**
 * Reads the `subtitleMode` option, trimmed and lower-cased.
 * Returns it when it is one of `off`, `fast` or `accurate`; anything else
 * (including a missing option or null [options]) falls back to `fast`.
 */
private fun getSubtitleMode(options: ReadableMap?): String {
    val requested = options?.getString("subtitleMode")?.trim()?.lowercase()
    return requested?.takeIf { it == "off" || it == "fast" || it == "accurate" } ?: "fast"
}
1051
+
1052
/**
 * Reads the `subtitleGranularity` option, trimmed and lower-cased.
 * Returns it when it is `word` or `sentence`; anything else
 * (including a missing option or null [options]) falls back to `sentence`.
 */
private fun getSubtitleGranularity(options: ReadableMap?): String {
    val requested = options?.getString("subtitleGranularity")?.trim()?.lowercase()
    return requested?.takeIf { it == "word" || it == "sentence" } ?: "sentence"
}
1059
+
1060
/** True when the `subtitleGranularity` option, trimmed and lower-cased, equals `character`. */
private fun isCharacterGranularityRequested(options: ReadableMap?): Boolean =
    options?.getString("subtitleGranularity")?.trim()?.lowercase() == "character"
1064
+
1065
/**
 * Converts [items] into a bridge array of maps, one per item, each holding the keys
 * `text`, `start` and `end` in the original item order.
 */
private fun toSubtitleWritableArray(items: List<SubtitleTimingItem>) =
    Arguments.createArray().also { array ->
        items.forEach { item ->
            val entry = Arguments.createMap().apply {
                putString("text", item.text)
                putDouble("start", item.start)
                putDouble("end", item.end)
            }
            array.pushMap(entry)
        }
    }
1074
+
918
1075
  /** Build Kotlin GenerationConfig from ReadableMap. Returns null only when options is null; otherwise returns a config with sid, speed, silenceScale, numSteps, and any reference/extra fields from options. */
919
1076
  private fun parseGenerationConfig(options: ReadableMap?): GenerationConfig? {
920
1077
  if (options == null) return null
@@ -0,0 +1,5 @@
1
+ asset_name,license_type,commercial_use,confidence,detection_source,license_file
2
+ wav2vec2-base-960h-fp16.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md
3
+ wav2vec2-base-960h-int8.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md
4
+ wav2vec2-base-960h-q4f16.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md
5
+ wav2vec2-base-960h.tar.bz2,apache-2.0,yes,high,manual,https://huggingface.co/datasets/choosealicense/licenses/resolve/main/markdown/apache-2.0.md