react-native-tts-kit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/ATTRIBUTIONS.md +87 -0
  2. package/LICENSE +21 -0
  3. package/README.md +231 -0
  4. package/android/build.gradle +50 -0
  5. package/android/src/main/AndroidManifest.xml +3 -0
  6. package/android/src/main/java/expo/modules/ttskit/RNTTSKitModule.kt +158 -0
  7. package/android/src/main/java/expo/modules/ttskit/supertonic/AudioEngine.kt +158 -0
  8. package/android/src/main/java/expo/modules/ttskit/supertonic/ModelLocator.kt +372 -0
  9. package/android/src/main/java/expo/modules/ttskit/supertonic/SupertonicSession.kt +373 -0
  10. package/android/src/main/java/expo/modules/ttskit/supertonic/TextFrontend.kt +154 -0
  11. package/android/src/main/java/expo/modules/ttskit/supertonic/VoicePack.kt +47 -0
  12. package/build/engines/BufferedStreamEmitter.d.ts +26 -0
  13. package/build/engines/BufferedStreamEmitter.d.ts.map +1 -0
  14. package/build/engines/BufferedStreamEmitter.js +68 -0
  15. package/build/engines/BufferedStreamEmitter.js.map +1 -0
  16. package/build/engines/Engine.d.ts +15 -0
  17. package/build/engines/Engine.d.ts.map +1 -0
  18. package/build/engines/Engine.js +2 -0
  19. package/build/engines/Engine.js.map +1 -0
  20. package/build/engines/SupertonicEngine.d.ts +14 -0
  21. package/build/engines/SupertonicEngine.d.ts.map +1 -0
  22. package/build/engines/SupertonicEngine.js +183 -0
  23. package/build/engines/SupertonicEngine.js.map +1 -0
  24. package/build/engines/SystemEngine.d.ts +13 -0
  25. package/build/engines/SystemEngine.d.ts.map +1 -0
  26. package/build/engines/SystemEngine.js +78 -0
  27. package/build/engines/SystemEngine.js.map +1 -0
  28. package/build/index.d.ts +46 -0
  29. package/build/index.d.ts.map +1 -0
  30. package/build/index.js +118 -0
  31. package/build/index.js.map +1 -0
  32. package/build/types.d.ts +77 -0
  33. package/build/types.d.ts.map +1 -0
  34. package/build/types.js +2 -0
  35. package/build/types.js.map +1 -0
  36. package/build/voices/catalog.d.ts +12 -0
  37. package/build/voices/catalog.d.ts.map +1 -0
  38. package/build/voices/catalog.js +28 -0
  39. package/build/voices/catalog.js.map +1 -0
  40. package/build/voices/prosody.d.ts +8 -0
  41. package/build/voices/prosody.d.ts.map +1 -0
  42. package/build/voices/prosody.js +28 -0
  43. package/build/voices/prosody.js.map +1 -0
  44. package/expo-module.config.json +9 -0
  45. package/ios/RNTTSKit.podspec +28 -0
  46. package/ios/RNTTSKitModule.swift +133 -0
  47. package/ios/Supertonic/AudioEngine.swift +110 -0
  48. package/ios/Supertonic/ModelLocator.swift +416 -0
  49. package/ios/Supertonic/SupertonicSession.swift +405 -0
  50. package/ios/Supertonic/TextFrontend.swift +216 -0
  51. package/ios/Supertonic/VoicePack.swift +51 -0
  52. package/licenses/OpenRAIL-M.txt +209 -0
  53. package/package.json +77 -0
  54. package/src/engines/BufferedStreamEmitter.ts +50 -0
  55. package/src/engines/Engine.ts +28 -0
  56. package/src/engines/SupertonicEngine.ts +250 -0
  57. package/src/engines/SystemEngine.ts +96 -0
  58. package/src/engines/__tests__/BufferedStreamEmitter.test.ts +65 -0
  59. package/src/index.ts +156 -0
  60. package/src/types.ts +95 -0
  61. package/src/voices/__tests__/catalog.test.ts +46 -0
  62. package/src/voices/__tests__/prosody.test.ts +63 -0
  63. package/src/voices/catalog.ts +32 -0
  64. package/src/voices/prosody.ts +39 -0
@@ -0,0 +1,373 @@
1
+ package expo.modules.ttskit.supertonic
2
+
3
+ import ai.onnxruntime.OnnxTensor
4
+ import ai.onnxruntime.OrtEnvironment
5
+ import ai.onnxruntime.OrtSession
6
+ import android.content.Context
7
+ import org.json.JSONObject
8
+ import java.io.File
9
+ import java.nio.ByteBuffer
10
+ import java.nio.ByteOrder
11
+ import java.nio.FloatBuffer
12
+ import java.nio.LongBuffer
13
+ import kotlin.math.cos
14
+ import kotlin.math.ln
15
+ import kotlin.math.max
16
+ import kotlin.math.min
17
+ import kotlin.math.sqrt
18
+ import kotlin.random.Random
19
+
20
+ class SupertonicSession(private val context: Context) {
21
+ private var env: OrtEnvironment? = null
22
+ private var dp: OrtSession? = null
23
+ private var enc: OrtSession? = null
24
+ private var vec: OrtSession? = null
25
+ private var voc: OrtSession? = null
26
+ private var indexer: UnicodeIndexer? = null
27
+ private val voiceCache = HashMap<String, VoicePack>()
28
+ private var sampleRateValue: Int = 24_000
29
+ private var baseChunkSize: Int = 0
30
+ private var chunkCompressFactor: Int = 0
31
+ private var latentDimBase: Int = 0
32
+ @Volatile private var cancelled = false
33
+
34
+ val isReady: Boolean
35
+ get() = dp != null && enc != null && vec != null && voc != null && indexer != null && baseChunkSize > 0
36
+ val sampleRate: Int get() = sampleRateValue
37
+
38
/**
 * Creates the four ONNX Runtime sessions (duration predictor, text encoder,
 * vector estimator, vocoder), reads `tts.json` and builds the [UnicodeIndexer].
 * Returns immediately when [isReady] is already true.
 *
 * Everything is built into locals and published to the fields only after the
 * last step succeeded. If any step throws (missing model file, malformed
 * JSON, ...), the sessions created so far are closed before the exception is
 * rethrown, so a later retry does not overwrite — and leak — live sessions.
 *
 * @throws IllegalArgumentException if `tts.json` declares a non-positive
 *   `base_chunk_size` or `chunk_compress_factor`.
 */
fun loadIfNeeded() {
    if (isReady) return
    val ortEnv = OrtEnvironment.getEnvironment()

    // EP strategy on Android: NNAPI with USE_FP16, then XNNPACK as fallback.
    //
    // NNAPI + USE_FP16: takes the fp32 graph and relaxes it to fp16 inside
    // the device's neural accelerator (Hexagon / Mali / etc.). This is the
    // documented path to fp16 speed on Android; XNNPACK EP and the default
    // CPU EP have no native fp16 kernels and produce a Cast-storm on fp16
    // models (ORT issue #25824 — ~50% of time in casts, garbled outputs in
    // diffusion models). That's why ModelLocator ships fp32 to Android.
    //
    // If NNAPI rejects ops it can't handle, ORT auto-partitions them to the
    // CPU EP — fine for stragglers. If addNnapi() itself throws (older
    // Android with no NNAPI 1.2+, emulator), we fall through to XNNPACK
    // which handles fp32 Conv/MatMul/Gemm quickly.
    val cpuCount = Runtime.getRuntime().availableProcessors()
    val xnnpackThreads = minOf(4, maxOf(2, cpuCount))

    fun OrtSession.SessionOptions.applyEps() {
        val nnapiOk = runCatching {
            // USE_FP16 = relax float32 → float16 at runtime where supported.
            // CPU_DISABLED stays unset so unsupported ops auto-fall-back to CPU EP.
            addNnapi(java.util.EnumSet.of(ai.onnxruntime.providers.NNAPIFlags.USE_FP16))
        }.isSuccess
        if (!nnapiOk) {
            android.util.Log.w("ST", "NNAPI EP unavailable, falling back to XNNPACK")
            runCatching { addXnnpack(mapOf("intra_op_num_threads" to xnnpackThreads.toString())) }
                .onFailure { android.util.Log.w("ST", "XNNPACK also unavailable, using CPU EP: ${it.message}") }
        } else {
            android.util.Log.i("ST", "NNAPI EP loaded with USE_FP16")
        }
    }

    // Toggle this to VERBOSE briefly when investigating NNAPI partitioning.
    // VERBOSE makes ORT log every op it placed on each EP and every "this op
    // is unsupported by NNAPI, falling back to CPU" decision. Helpful when
    // synthesis is unexpectedly slow on Android — we want to see what NNAPI
    // rejected. Leave at WARNING for release.
    val sessLogLevel = ai.onnxruntime.OrtLoggingLevel.ORT_LOGGING_LEVEL_WARNING

    // Native objects created during this attempt. They are only released on
    // the failure path; on success the sessions are handed over to the fields.
    // NOTE(review): the SessionOptions are deliberately left open on success,
    // as in the previous implementation — ORT's Javadoc appears to require
    // options to outlive the sessions built from them. If so, closing them
    // belongs in tearDown(); confirm.
    val pendingOptions = ArrayList<OrtSession.SessionOptions>(2)
    val pendingSessions = ArrayList<OrtSession>(4)

    fun open(fileName: String, options: OrtSession.SessionOptions): OrtSession =
        ortEnv.createSession(ModelLocator.resolvedOnnxPath(context, fileName), options)
            .also { pendingSessions.add(it) }

    try {
        val opts = OrtSession.SessionOptions().also { pendingOptions.add(it) }.apply {
            setIntraOpNumThreads(1)
            setOptimizationLevel(OrtSession.SessionOptions.OptLevel.ALL_OPT)
            setSessionLogLevel(sessLogLevel)
            applyEps()
        }
        // Diffusion graph (vec) sees varying input shapes per call. ORT's memory-
        // pattern optimizer pre-allocates buffers from a profiled shape and then
        // emits "Shape mismatch attempting to re-use buffer" warnings + reallocs
        // every step at runtime. Disabling the optimizer for this one session
        // skips the wasted alloc/free on the hot path. The text encoder, duration
        // predictor and vocoder have stable enough shapes that we leave it on.
        val vecOpts = OrtSession.SessionOptions().also { pendingOptions.add(it) }.apply {
            setIntraOpNumThreads(1)
            setOptimizationLevel(OrtSession.SessionOptions.OptLevel.ALL_OPT)
            setMemoryPatternOptimization(false)
            setSessionLogLevel(sessLogLevel)
            applyEps()
        }
        android.util.Log.i("ST", "Loading sessions with ortIntraOp=1 cpuCount=$cpuCount")

        val newDp = open("duration_predictor.onnx", opts)
        val newEnc = open("text_encoder.onnx", opts)
        val newVec = open("vector_estimator.onnx", vecOpts)
        val newVoc = open("vocoder.onnx", opts)

        val cfgPath = ModelLocator.resolvedOnnxPath(context, "tts.json")
        val cfg = JSONObject(File(cfgPath).readText())
        val ae = cfg.getJSONObject("ae")
        val ttl = cfg.getJSONObject("ttl")
        val newSampleRate = ae.getInt("sample_rate")
        val newBaseChunkSize = ae.getInt("base_chunk_size")
        val newChunkCompressFactor = ttl.getInt("chunk_compress_factor")
        val newLatentDim = ttl.getInt("latent_dim")
        // isReady needs baseChunkSize > 0, and synthesizeOne divides by
        // baseChunkSize * chunkCompressFactor; reject a config that would
        // make every later call reload or divide by zero.
        require(newBaseChunkSize > 0) { "tts.json: ae.base_chunk_size must be > 0, was $newBaseChunkSize" }
        require(newChunkCompressFactor > 0) {
            "tts.json: ttl.chunk_compress_factor must be > 0, was $newChunkCompressFactor"
        }

        val idxPath = ModelLocator.resolvedOnnxPath(context, "unicode_indexer.json")
        val newIndexer = UnicodeIndexer(idxPath)

        // Nothing below can throw: publish the fully built state in one go.
        env = ortEnv
        dp = newDp
        enc = newEnc
        vec = newVec
        voc = newVoc
        sampleRateValue = newSampleRate
        baseChunkSize = newBaseChunkSize
        chunkCompressFactor = newChunkCompressFactor
        latentDimBase = newLatentDim
        indexer = newIndexer
    } catch (t: Throwable) {
        pendingSessions.forEach { runCatching { it.close() } }
        pendingOptions.forEach { runCatching { it.close() } }
        throw t
    }
}
119
+
120
/** Clears the cancellation flag so a new synthesis run can proceed. */
fun beginRun() {
    cancelled = false
}

/**
 * Raises the cancellation flag. The synthesis loops in this class poll it
 * between inference steps and abort once they see it set.
 */
fun cancel() {
    cancelled = true
}
122
+
123
/**
 * Returns the [VoicePack] for [voiceId], loading it from disk on a cache miss.
 *
 * @throws IllegalArgumentException if the resolved voice file does not exist.
 */
private fun voicePack(voiceId: String): VoicePack {
    val cached = voiceCache[voiceId]
    if (cached != null) return cached

    val voiceFile = ModelLocator.resolvedVoicePath(context, voiceId)
    require(File(voiceFile).exists()) { "Voice $voiceId not available" }
    val loaded = VoicePack(voiceId, env!!, voiceFile)

    // Keep the cache bounded: once it holds 8 packs, release every cached
    // pack and start over. With 10 total voices this rarely triggers, but
    // the cap means a larger future voice set cannot grow without limit.
    if (voiceCache.size >= 8) {
        for (stale in voiceCache.values) {
            runCatching { stale.close() }
        }
        voiceCache.clear()
    }
    voiceCache[voiceId] = loaded
    return loaded
}
137
+
138
/**
 * Loads voice "F1" into the voice cache ahead of time so the first speak()
 * does not pay the voice-file parsing and tensor allocation cost (roughly
 * 50–150 ms). Intended to be called from `prefetch()`. Best-effort: any
 * failure is swallowed.
 */
fun prewarmDefaultVoice() {
    try {
        voicePack("F1")
    } catch (ignored: Throwable) {
        // Pre-warming is optional; the real load will surface the error later.
    }
}
144
+
145
/**
 * Releases every cached voice pack and all four sessions, then resets the
 * indexer, environment reference and chunk geometry so that [isReady] is
 * false again. Called from OnDestroy so native resources are freed
 * deterministically instead of waiting for GC. Close failures are ignored.
 */
fun tearDown() {
    for (pack in voiceCache.values) {
        runCatching { pack.close() }
    }
    voiceCache.clear()
    indexer = null

    for (session in arrayOf(dp, enc, vec, voc)) {
        runCatching { session?.close() }
    }
    dp = null
    enc = null
    vec = null
    voc = null

    env = null
    baseChunkSize = 0
    chunkCompressFactor = 0
    latentDimBase = 0
}
160
+
161
/**
 * Synthesizes a single chunk of text and returns its mono float PCM at
 * [sampleRate].
 *
 * Pipeline: text preprocessing and id encoding, duration predictor, text
 * encoder, [totalStep] iterations of the vector estimator over a
 * Gaussian-noise latent, then the vocoder. The result is trimmed to the
 * predicted duration.
 *
 * Every native tensor created here is released on all exit paths, including
 * cancellation and inference errors.
 *
 * @param text raw text of one chunk; it is passed through [TextFrontend.preprocess].
 * @param lang language code accepted by [TextFrontend.preprocess].
 * @param voiceId id of the voice style to load.
 * @param totalStep number of denoising iterations.
 * @param speed divisor applied to the predicted duration; must be positive and finite.
 * @return the synthesized samples, or an empty array if the text encodes to no ids.
 * @throws IllegalArgumentException if [speed] is not a positive finite number.
 * @throws RuntimeException with message "Synthesis cancelled" if [cancel] was
 *   called while the chunk was being produced.
 */
fun synthesizeOne(text: String, lang: String, voiceId: String, totalStep: Int, speed: Double): FloatArray {
    // A zero, negative or NaN speed would turn the predicted durations into
    // infinite/negative/NaN values and from there into bogus array sizes.
    require(speed.isFinite() && speed > 0.0) { "speed must be a positive, finite number (was $speed)" }

    val t0 = System.nanoTime()
    fun dMs(from: Long, to: Long) = ((to - from) / 1_000_000.0).toInt()

    loadIfNeeded()
    val tLoad = System.nanoTime()
    val ortEnv = env ?: error("env not initialized")
    val dpSession = checkNotNull(dp) { "duration predictor not initialized" }
    val encSession = checkNotNull(enc) { "text encoder not initialized" }
    val vecSession = checkNotNull(vec) { "vector estimator not initialized" }
    val vocSession = checkNotNull(voc) { "vocoder not initialized" }
    val voice = voicePack(voiceId)
    val tVoice = System.nanoTime()

    val processed = TextFrontend.preprocess(text, lang)
    val ids = checkNotNull(indexer) { "indexer not initialized" }.encode(processed)
    if (ids.isEmpty()) return FloatArray(0)
    val bsz = 1
    val textLen = ids.size
    val mask = FloatArray(textLen) { 1f }
    val tText = System.nanoTime()

    // Tensors that must live for the whole call are registered here and closed
    // in the finally block; per-step tensors are scoped with use {} instead.
    val owned = ArrayList<AutoCloseable>()
    fun <T : AutoCloseable> owning(resource: T): T {
        owned.add(resource)
        return resource
    }

    try {
        val textIdsT = owning(
            OnnxTensor.createTensor(ortEnv, LongBuffer.wrap(ids), longArrayOf(bsz.toLong(), textLen.toLong()))
        )
        val textMaskT = owning(
            OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(mask), longArrayOf(bsz.toLong(), 1, textLen.toLong()))
        )
        val tTensors = System.nanoTime()

        val dpInputs = mapOf("text_ids" to textIdsT, "style_dp" to voice.dp, "text_mask" to textMaskT)
        val durArr = dpSession.run(dpInputs).use { out -> (out.get(0).value as FloatArray).copyOf() }
        for (i in durArr.indices) durArr[i] = (durArr[i] / speed.toFloat())
        val tDP = System.nanoTime()

        if (cancelled) throw RuntimeException("Synthesis cancelled")

        val encInputs = mapOf("text_ids" to textIdsT, "style_ttl" to voice.ttl, "text_mask" to textMaskT)
        // CRITICAL: Java ONNX Runtime ties child tensor lifetimes to the parent
        // OrtSession.Result. If we hold the raw text embedding across the
        // denoising loop and then close the encoder result, every iteration
        // after the first sees an invalidated tensor and produces garbage
        // audio. Clone into a fresh owned tensor immediately and close the
        // parent right away.
        val textEmb: OnnxTensor = owning(
            encSession.run(encInputs).use { out ->
                val src = out.get(0) as OnnxTensor
                val shape = src.info.shape.copyOf()
                val total = shape.fold(1L) { acc, d -> acc * d }.toInt()
                val flat = FloatArray(total)
                val buf = src.floatBuffer
                buf.rewind()
                buf.get(flat)
                OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(flat), shape)
            }
        )
        val tEnc = System.nanoTime()

        val latentDim = latentDimBase * chunkCompressFactor
        val chunkSize = baseChunkSize * chunkCompressFactor
        val maxDur = durArr.max()
        val wavLenMax = (maxDur * sampleRateValue).toInt()
        val latentLen = (wavLenMax + chunkSize - 1) / chunkSize
        val wavLengths = durArr.map { (it * sampleRateValue).toInt() }
        val latentLengths = wavLengths.map { (it + chunkSize - 1) / chunkSize }

        // Box–Muller Gaussian noise for the valid frames of each batch item;
        // frames past the item's latent length stay zero.
        val noisy = FloatArray(bsz * latentDim * latentLen)
        var idx = 0
        for (b in 0 until bsz) {
            val lLen = latentLengths[b]
            for (d in 0 until latentDim) {
                for (t in 0 until latentLen) {
                    if (t < lLen) {
                        val u1 = max(1e-7f, Random.nextFloat())
                        val u2 = Random.nextFloat()
                        noisy[idx] = sqrt(-2f * ln(u1)) * cos(2f * Math.PI.toFloat() * u2)
                    }
                    idx++
                }
            }
        }
        val latentMask = FloatArray(bsz * latentLen)
        for (b in 0 until bsz) {
            for (t in 0 until latentLengths[b]) latentMask[b * latentLen + t] = 1f
        }

        val latentShape = longArrayOf(bsz.toLong(), latentDim.toLong(), latentLen.toLong())
        val latentMaskT = owning(
            OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(latentMask), longArrayOf(bsz.toLong(), 1, latentLen.toLong()))
        )
        val totalStepArr = FloatArray(bsz) { totalStep.toFloat() }
        val totalStepT = owning(
            OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(totalStepArr), longArrayOf(bsz.toLong()))
        )
        val tNoise = System.nanoTime()

        var current = noisy
        val stepTimes = IntArray(totalStep)
        for (step in 0 until totalStep) {
            val tStepStart = System.nanoTime()
            if (cancelled) throw RuntimeException("Synthesis cancelled")
            OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(current), latentShape).use { xt ->
                OnnxTensor.createTensor(
                    ortEnv,
                    FloatBuffer.wrap(FloatArray(bsz) { step.toFloat() }),
                    longArrayOf(bsz.toLong())
                ).use { curStepT ->
                    vecSession.run(
                        mapOf(
                            "noisy_latent" to xt,
                            "text_emb" to textEmb,
                            "style_ttl" to voice.ttl,
                            "latent_mask" to latentMaskT,
                            "text_mask" to textMaskT,
                            "current_step" to curStepT,
                            "total_step" to totalStepT
                        )
                    ).use { vecOut ->
                        current = flatten3D(vecOut.get(0).value)
                    }
                }
            }
            stepTimes[step] = ((System.nanoTime() - tStepStart) / 1_000_000).toInt()
        }
        val tDiffusion = System.nanoTime()

        if (cancelled) throw RuntimeException("Synthesis cancelled")

        val wav = OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(current), latentShape).use { finalLatent ->
            vocSession.run(mapOf("latent" to finalLatent)).use { vocOut ->
                @Suppress("UNCHECKED_CAST")
                (vocOut.get(0).value as Array<FloatArray>)[0]
            }
        }

        val tVoc = System.nanoTime()
        val trimLen = min(wav.size, (durArr[0] * sampleRateValue).toInt())
        val outArr = if (trimLen > 0 && trimLen < wav.size) wav.copyOfRange(0, trimLen) else wav

        val totalMs = dMs(t0, tVoc)
        val stepSummary = stepTimes.withIndex().joinToString(" ") { "${it.index}:${it.value}" }
        android.util.Log.i("ST.timing",
            "total=${totalMs}ms " +
            "load=${dMs(t0, tLoad)} voice=${dMs(tLoad, tVoice)} " +
            "text=${dMs(tVoice, tText)} tensors=${dMs(tText, tTensors)} " +
            "dp=${dMs(tTensors, tDP)} enc=${dMs(tDP, tEnc)} " +
            "noise=${dMs(tEnc, tNoise)} diffusion=${dMs(tNoise, tDiffusion)} " +
            "voc=${dMs(tDiffusion, tVoc)} " +
            "chars=${ids.size} latentLen=$latentLen steps=[$stepSummary]")
        return outArr
    } finally {
        // Release in reverse creation order; a failing close must not mask
        // the original exception or stop the remaining closes.
        owned.asReversed().forEach { runCatching { it.close() } }
    }
}
312
+
313
/**
 * Synthesizes [text] in one go: splits it with [TextFrontend.chunk],
 * synthesizes each chunk and concatenates the results with 0.3 s of silence
 * before every chunk after the first.
 *
 * @return the concatenated float PCM, or an empty array when the text
 *   yields no chunks.
 * @throws RuntimeException with message "Synthesis cancelled" if [cancel] is
 *   called while the run is in progress.
 */
fun synthesize(text: String, lang: String, voiceId: String, totalStep: Int, speed: Double): FloatArray {
    beginRun()
    val chunks = TextFrontend.chunk(text, lang)
    if (chunks.isEmpty()) return FloatArray(0)

    // sampleRateValue only reflects tts.json once the model is loaded; load
    // first so the silence gap has the right length on the very first call.
    loadIfNeeded()
    val silence = FloatArray((0.3 * sampleRateValue).toInt())

    // Collect whole buffers rather than boxing every sample into a List<Float>.
    val parts = ArrayList<FloatArray>(chunks.size * 2)
    for ((i, c) in chunks.withIndex()) {
        if (cancelled) throw RuntimeException("Synthesis cancelled")
        val pcm = synthesizeOne(c, lang, voiceId, totalStep, speed)
        if (i > 0) parts.add(silence)
        parts.add(pcm)
    }

    val out = FloatArray(parts.sumOf { it.size })
    var offset = 0
    for (part in parts) {
        part.copyInto(out, offset)
        offset += part.size
    }
    return out
}
326
+
327
/**
 * Streaming variant: synthesizes [text] chunk by chunk and hands every
 * non-empty PCM buffer to [onChunk] as soon as it is ready. Logs the time to
 * first audio once. Returns silently if [cancel] is observed between chunks.
 */
fun synthesizeStreaming(
    text: String, lang: String, voiceId: String, totalStep: Int, speed: Double,
    onChunk: (FloatArray) -> Unit
) {
    val startedAt = System.nanoTime()
    loadIfNeeded()
    beginRun()
    val pieces = TextFrontend.chunk(text, lang)
    var awaitingFirstAudio = true
    val cursor = pieces.iterator()
    while (cursor.hasNext()) {
        val piece = cursor.next()
        if (cancelled) return
        val pcm = synthesizeOne(piece, lang, voiceId, totalStep, speed)
        if (pcm.isEmpty()) continue
        if (awaitingFirstAudio) {
            val ttfa = ((System.nanoTime() - startedAt) / 1_000_000).toInt()
            android.util.Log.i("ST.timing", "TTFA=${ttfa}ms (first chunk emitted, chunks=${pieces.size})")
            awaitingFirstAudio = false
        }
        onChunk(pcm)
    }
}
349
+
350
/**
 * Flattens the ONNX `float[B][D][T]` result into a single row-major
 * [FloatArray] of size `B * D * T`.
 *
 * `D` and `T` are taken from the first element, i.e. the input is assumed to
 * be rectangular. An empty batch, or an empty `D`/`T` dimension, yields an
 * empty array instead of failing on `outer[0]`.
 *
 * @param raw value of an ONNX output; must be an `Array<Array<FloatArray>>`.
 */
@Suppress("UNCHECKED_CAST")
private fun flatten3D(raw: Any): FloatArray {
    val outer = raw as Array<Array<FloatArray>>
    val b = outer.size
    val d = if (b > 0) outer[0].size else 0
    val t = if (d > 0) outer[0][0].size else 0
    val out = FloatArray(b * d * t)
    var offset = 0
    for (i in 0 until b) {
        for (j in 0 until d) {
            // Bulk-copy one innermost row at a time.
            outer[i][j].copyInto(out, offset, 0, t)
            offset += t
        }
    }
    return out
}
362
+
363
+ companion object {
364
/**
 * Converts float samples to little-endian signed 16-bit PCM.
 * Each sample is clamped to [-1, 1], scaled by 32767 and truncated toward zero.
 *
 * @return a byte array of length `samples.size * 2`.
 */
fun toPcm16(samples: FloatArray): ByteArray {
    val bytes = ByteArray(samples.size * 2)
    val shorts = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN).asShortBuffer()
    samples.forEach { sample ->
        val bounded = sample.coerceIn(-1f, 1f)
        shorts.put((bounded * 32767f).toInt().toShort())
    }
    return bytes
}
372
+ }
373
+ }
@@ -0,0 +1,154 @@
1
+ package expo.modules.ttskit.supertonic
2
+
3
+ import org.json.JSONArray
4
+ import java.io.File
5
+ import java.text.Normalizer
6
+
7
/**
 * Text normalisation and chunking applied before text is fed to the model.
 *
 * [chunk] splits raw input into pieces of at most [maxChunkLength] UTF-16
 * units; [preprocess] normalises one piece and wraps it in language tags.
 */
object TextFrontend {
    val AVAILABLE_LANGS = setOf(
        "en", "ko", "ja", "ar", "bg", "cs", "da", "de", "el", "es", "et", "fi",
        "fr", "hi", "hr", "hu", "id", "it", "lt", "lv", "nl", "pl", "pt", "ro",
        "ru", "sk", "sl", "sv", "tr", "uk", "vi"
    )

    private val ABBREVIATIONS = setOf(
        "Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
        "St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
        "Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D."
    )

    // Compiled once; these used to be rebuilt on every call.
    private val WHITESPACE_RUN = Regex("\\s+")
    private val ENDS_WITH_TERMINATOR =
        Regex(".*[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]\$")
    private val PARAGRAPH_BREAK = Regex("\\n\\s*\\n")

    // Latin punctuation requires trailing whitespace; the second alternative
    // splits without it so that long ja/ko strings do not collapse into one
    // oversized chunk.
    // NOTE(review): the second alternative also lists ASCII '!' and '?', so
    // those split even without trailing whitespace ("Wait?!" becomes "Wait?"
    // + "!"). If only full-width forms were intended, this needs changing —
    // confirm against the upstream/iOS implementation.
    private val SENTENCE_END = Regex("([.!?])\\s+|([。!?])")

    private val CHAR_REPLACEMENTS = mapOf(
        "–" to "-", "‑" to "-", "—" to "-",
        "_" to " ",
        "“" to "\"", "”" to "\"",
        "‘" to "'", "’" to "'",
        "´" to "'", "`" to "'",
        "[" to " ", "]" to " ", "|" to " ", "/" to " ", "#" to " ",
        "→" to " ", "←" to " "
    )
    private val DROPPED_SYMBOLS = listOf("♥", "☆", "♡", "©", "\\")
    private val PUNCTUATION_SPACING = listOf(
        " ," to ",", " ." to ".", " !" to "!", " ?" to "?",
        " ;" to ";", " :" to ":", " '" to "'"
    )

    /** Maximum chunk length in UTF-16 units: 120 for Korean/Japanese, 300 otherwise. */
    fun maxChunkLength(lang: String): Int = if (lang == "ko" || lang == "ja") 120 else 300

    /**
     * Normalises [text] and wraps it as `<lang>…</lang>`.
     *
     * Steps: NFKD normalisation, removal of emoji/symbol code points,
     * punctuation and symbol substitutions, whitespace collapsing, and
     * appending a "." when the text does not already end in terminal
     * punctuation or a closing quote/bracket.
     *
     * @throws IllegalArgumentException if [lang] is not in [AVAILABLE_LANGS].
     */
    fun preprocess(text: String, lang: String): String {
        require(AVAILABLE_LANGS.contains(lang)) { "Unsupported language: $lang" }

        val normalized = Normalizer.normalize(text, Normalizer.Form.NFKD)

        // Strip emoji blocks, walking by code point so surrogate pairs stay intact.
        val sb = StringBuilder(normalized.length)
        var i = 0
        while (i < normalized.length) {
            val cp = normalized.codePointAt(i)
            if (!isStrippedSymbol(cp)) sb.appendCodePoint(cp)
            i += Character.charCount(cp)
        }
        var s = sb.toString()

        for ((k, v) in CHAR_REPLACEMENTS) s = s.replace(k, v)
        for (sym in DROPPED_SYMBOLS) s = s.replace(sym, "")

        s = s.replace("@", " at ").replace("e.g.,", "for example, ").replace("i.e.,", "that is, ")

        for ((k, v) in PUNCTUATION_SPACING) s = s.replace(k, v)

        // Collapse repeated quotes. Backticks need no such pass: they were
        // all mapped to apostrophes by CHAR_REPLACEMENTS above.
        while (s.contains("\"\"")) s = s.replace("\"\"", "\"")
        while (s.contains("''")) s = s.replace("''", "'")

        s = s.replace(WHITESPACE_RUN, " ").trim()

        if (s.isNotEmpty() && !s.matches(ENDS_WITH_TERMINATOR)) {
            s += "."
        }
        return "<$lang>$s</$lang>"
    }

    /**
     * Splits [text] into chunks of at most [maxChunkLength] UTF-16 units.
     *
     * Paragraphs (separated by blank lines) that already fit are kept whole.
     * Longer paragraphs are split into sentences that are greedily re-joined
     * up to the limit. A single sentence that still exceeds the limit is
     * broken at whitespace, or hard-cut when it has none, so that no chunk
     * exceeds the limit.
     *
     * @return the chunks in reading order; empty if [text] is blank.
     */
    fun chunk(text: String, lang: String): List<String> {
        val maxLen = maxChunkLength(lang)
        val trimmed = text.trim()
        if (trimmed.isEmpty()) return emptyList()

        val paragraphs = trimmed.split(PARAGRAPH_BREAK).map { it.trim() }.filter { it.isNotEmpty() }
        val chunks = mutableListOf<String>()
        val source = if (paragraphs.isEmpty()) listOf(trimmed) else paragraphs

        for (p in source) {
            if (p.length <= maxLen) {
                chunks.add(p)
                continue
            }
            chunks.addAll(greedyJoin(splitSentences(p), maxLen))
        }
        return if (chunks.isEmpty()) listOf(trimmed) else chunks
    }

    /** True for code points in the emoji / pictograph / dingbat ranges removed by [preprocess]. */
    private fun isStrippedSymbol(cp: Int): Boolean =
        cp in 0x1F600..0x1F64F || cp in 0x1F300..0x1F5FF ||
            cp in 0x1F680..0x1F6FF || cp in 0x1F700..0x1F77F ||
            cp in 0x1F780..0x1F7FF || cp in 0x1F800..0x1F8FF ||
            cp in 0x1F900..0x1F9FF || cp in 0x1FA00..0x1FA6F ||
            cp in 0x1FA70..0x1FAFF || cp in 0x2600..0x26FF ||
            cp in 0x2700..0x27BF || cp in 0x1F1E6..0x1F1FF

    /**
     * Splits [text] after each sentence terminator matched by [SENTENCE_END],
     * except when the text up to and including the terminator ends with one
     * of [ABBREVIATIONS]; such a match is skipped and stays inside the
     * following sentence.
     */
    private fun splitSentences(text: String): List<String> {
        val matches = SENTENCE_END.findAll(text).toList()
        if (matches.isEmpty()) return listOf(text)

        val out = mutableListOf<String>()
        var lastEnd = 0
        for (m in matches) {
            val before = text.substring(lastEnd, m.range.first)
            val punc = text[m.range.first].toString()
            val combined = before.trim() + punc
            val isAbbrev = ABBREVIATIONS.any { combined.endsWith(it) }
            if (!isAbbrev) {
                out.add(text.substring(lastEnd, m.range.last + 1))
                lastEnd = m.range.last + 1
            }
        }
        if (lastEnd < text.length) out.add(text.substring(lastEnd))
        return if (out.isEmpty()) listOf(text) else out
    }

    /**
     * Joins consecutive pieces with a single space for as long as the result
     * stays within [maxLen]. Pieces that are themselves longer than [maxLen]
     * are first broken up by [splitOversized].
     */
    private fun greedyJoin(pieces: List<String>, maxLen: Int): List<String> {
        val out = mutableListOf<String>()
        var current = ""
        for (raw in pieces) {
            val trimmedPiece = raw.trim()
            if (trimmedPiece.isEmpty()) continue
            for (p in splitOversized(trimmedPiece, maxLen)) {
                current = when {
                    current.isEmpty() -> p
                    current.length + 1 + p.length <= maxLen -> "$current $p"
                    else -> {
                        out.add(current)
                        p
                    }
                }
            }
        }
        if (current.isNotEmpty()) out.add(current)
        return out
    }

    /**
     * Breaks a piece longer than [maxLen] into parts of at most [maxLen]
     * UTF-16 units. Cuts at the last whitespace within the limit when there
     * is one; otherwise hard-cuts at the limit, moving back one unit rather
     * than splitting a surrogate pair.
     */
    private fun splitOversized(piece: String, maxLen: Int): List<String> {
        if (maxLen < 1 || piece.length <= maxLen) return listOf(piece)
        val parts = mutableListOf<String>()
        var rest = piece
        while (rest.length > maxLen) {
            var cut = rest.substring(0, maxLen + 1).indexOfLast { it.isWhitespace() }
            if (cut <= 0) {
                cut = maxLen
                if (cut > 1 && rest[cut - 1].isHighSurrogate()) cut--
            }
            val head = rest.substring(0, cut).trim()
            if (head.isNotEmpty()) parts.add(head)
            rest = rest.substring(cut).trim()
        }
        if (rest.isNotEmpty()) parts.add(rest)
        return parts
    }
}
128
+
129
/**
 * Maps Unicode code points to model token ids using a lookup table read from
 * a JSON array file, where the array index is the code point.
 *
 * @param path location of the JSON array file.
 */
class UnicodeIndexer(path: String) {
    private val table: LongArray =
        JSONArray(File(path).readText(Charsets.UTF_8)).let { entries ->
            LongArray(entries.length()) { slot -> entries.getLong(slot) }
        }

    /**
     * Encodes [text] one id per code point (surrogate pairs count once).
     * Code points beyond the end of the table are encoded as `-1`.
     */
    fun encode(text: String): LongArray {
        val ids = LongArray(text.codePointCount(0, text.length))
        var charPos = 0
        for (slot in ids.indices) {
            val cp = text.codePointAt(charPos)
            ids[slot] = table.getOrElse(cp) { -1L }
            charPos += Character.charCount(cp)
        }
        return ids
    }
}
@@ -0,0 +1,47 @@
1
+ package expo.modules.ttskit.supertonic
2
+
3
+ import ai.onnxruntime.OnnxTensor
4
+ import ai.onnxruntime.OrtEnvironment
5
+ import org.json.JSONObject
6
+ import java.io.File
7
+ import java.nio.FloatBuffer
8
+
9
/**
 * Loads `voice_styles/<id>.json` from upstream Supertonic.
 * Each file contains two 3D float tensors: style_ttl and style_dp.
 *
 * The instance owns both tensors; call [close] to release them.
 *
 * @param voiceId identifier of the voice this pack was loaded for.
 * @param env ONNX Runtime environment used to allocate the tensors.
 * @param path location of the voice style JSON file.
 * @throws IllegalArgumentException if a component's `data` does not contain
 *   exactly the number of values implied by its `dims`.
 */
class VoicePack(val voiceId: String, env: OrtEnvironment, path: String) {
    val ttl: OnnxTensor
    val dp: OnnxTensor

    init {
        val root = JSONObject(File(path).readText(Charsets.UTF_8))
        val ttlTensor = parseComponent(env, root.getJSONObject("style_ttl"))
        dp = try {
            parseComponent(env, root.getJSONObject("style_dp"))
        } catch (t: Throwable) {
            // The constructor is about to fail, so nobody will ever be able
            // to call close(): release the tensor that was already created.
            runCatching { ttlTensor.close() }
            throw t
        }
        ttl = ttlTensor
    }

    /** Releases both tensors; failures while closing are ignored. */
    fun close() {
        runCatching { ttl.close() }
        runCatching { dp.close() }
    }

    /**
     * Builds a float tensor from one component object: `dims` gives the
     * shape and `data` holds the values as a three-level nested array, read
     * in row-major order.
     */
    private fun parseComponent(env: OrtEnvironment, obj: JSONObject): OnnxTensor {
        val dimsArr = obj.getJSONArray("dims")
        val dims = LongArray(dimsArr.length()) { i -> dimsArr.getLong(i) }
        val total = dims.fold(1L) { acc, d -> acc * d }
        require(total >= 0L && total <= Int.MAX_VALUE.toLong()) {
            "Voice $voiceId: invalid dims ${dims.contentToString()}"
        }
        val flat = FloatArray(total.toInt())
        val data = obj.getJSONArray("data")
        var idx = 0
        for (a in 0 until data.length()) {
            val l1 = data.getJSONArray(a)
            for (b in 0 until l1.length()) {
                val l2 = l1.getJSONArray(b)
                for (c in 0 until l2.length()) {
                    // Report a clear error instead of overrunning the array
                    // when the file holds more values than dims announces.
                    require(idx < flat.size) {
                        "Voice $voiceId: data holds more than the ${flat.size} values implied by dims"
                    }
                    flat[idx++] = l2.getDouble(c).toFloat()
                }
            }
        }
        // Too few values would otherwise leave the tail silently zero-filled.
        require(idx == flat.size) {
            "Voice $voiceId: data holds $idx values but dims implies ${flat.size}"
        }
        return OnnxTensor.createTensor(env, FloatBuffer.wrap(flat), dims)
    }
}
@@ -0,0 +1,26 @@
1
/**
 * Per-stream event emitter that holds on to events emitted before a
 * listener for them has been attached.
 *
 * The native side of `stream()` starts producing chunks as soon as it is
 * kicked off, while the JS caller usually attaches `.on('chunk')` right
 * afterwards, leaving a small async gap. Without buffering, chunks emitted in
 * that gap would be silently dropped. When a listener attaches, whatever was
 * queued for its event is delivered to it.
 *
 * Exported separately from SupertonicEngine so it can be unit tested without
 * loading the native module.
 */
export declare class BufferedStreamEmitter {
    /** Listeners registered for 'chunk'. */
    private chunkListeners;
    /** Listeners registered for 'end'. */
    private endListeners;
    /** Listeners registered for 'error'. */
    private errorListeners;
    /** Chunks emitted while no 'chunk' listener was attached. */
    private pendingChunks;
    /** Whether an 'end' was emitted while no 'end' listener was attached. */
    private pendingEnd;
    /** The error emitted while no 'error' listener was attached, if any. */
    private pendingError;
    /** Registers a chunk listener and delivers any queued chunks to it. */
    on(event: 'chunk', listener: (pcm: Uint8Array) => void): void;
    /** Registers an end listener; it is called at once if an end is pending. */
    on(event: 'end', listener: () => void): void;
    /** Registers an error listener; it is called at once if an error is pending. */
    on(event: 'error', listener: (err: Error) => void): void;
    /** Delivers a chunk to all chunk listeners, or queues it if there are none. */
    emitChunk(pcm: Uint8Array): void;
    /** Notifies all end listeners, or records a pending end if there are none. */
    emitEnd(): void;
    /** Notifies all error listeners, or stores the error if there are none. */
    emitError(err: Error): void;
}
26
+ //# sourceMappingURL=BufferedStreamEmitter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"BufferedStreamEmitter.d.ts","sourceRoot":"","sources":["../../src/engines/BufferedStreamEmitter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AACH,qBAAa,qBAAqB;IAChC,OAAO,CAAC,cAAc,CAAwC;IAC9D,OAAO,CAAC,YAAY,CAAyB;IAC7C,OAAO,CAAC,cAAc,CAAmC;IACzD,OAAO,CAAC,aAAa,CAAoB;IACzC,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,YAAY,CAAsB;IAE1C,EAAE,CAAC,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,GAAG,EAAE,UAAU,KAAK,IAAI,GAAG,IAAI;IAC7D,EAAE,CAAC,KAAK,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,IAAI,GAAG,IAAI;IAC5C,EAAE,CAAC,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,GAAG,EAAE,KAAK,KAAK,IAAI,GAAG,IAAI;IAgBxD,SAAS,CAAC,GAAG,EAAE,UAAU,GAAG,IAAI;IAIhC,OAAO,IAAI,IAAI;IAIf,SAAS,CAAC,GAAG,EAAE,KAAK,GAAG,IAAI;CAI5B"}
@@ -0,0 +1,68 @@
1
/**
 * Per-stream event emitter that holds on to events emitted before a
 * listener for them has been attached.
 *
 * The native side of `stream()` starts producing chunks as soon as it is
 * kicked off, while the JS caller usually attaches `.on('chunk')` right
 * afterwards, leaving a small async gap. Without buffering, chunks emitted in
 * that gap would be silently dropped. When a listener attaches, whatever was
 * queued for its event is delivered to it.
 *
 * Exported separately from SupertonicEngine so it can be unit tested without
 * loading the native module.
 */
export class BufferedStreamEmitter {
    chunkListeners = [];
    endListeners = [];
    errorListeners = [];
    pendingChunks = [];
    pendingEnd = false;
    pendingError = null;
    /**
     * Registers `listener` for 'chunk', 'end' or 'error' and immediately
     * hands it whatever was queued for that event. Other event names are ignored.
     */
    on(event, listener) {
        switch (event) {
            case 'chunk': {
                this.chunkListeners.push(listener);
                // Detach the backlog first so the queue is empty while draining.
                const backlog = this.pendingChunks;
                this.pendingChunks = [];
                backlog.forEach((pcm) => listener(pcm));
                break;
            }
            case 'end': {
                this.endListeners.push(listener);
                if (!this.pendingEnd)
                    break;
                this.pendingEnd = false;
                listener();
                break;
            }
            case 'error': {
                this.errorListeners.push(listener);
                const held = this.pendingError;
                if (!held)
                    break;
                this.pendingError = null;
                listener(held);
                break;
            }
            default:
                break;
        }
    }
    /** Delivers `pcm` to every chunk listener, or queues it when there is none. */
    emitChunk(pcm) {
        const listeners = this.chunkListeners;
        if (listeners.length === 0) {
            this.pendingChunks.push(pcm);
            return;
        }
        for (let i = 0; i < listeners.length; i++) {
            const notify = listeners[i];
            notify(pcm);
        }
    }
    /** Notifies every end listener, or remembers the end when there is none. */
    emitEnd() {
        const listeners = this.endListeners;
        if (listeners.length === 0) {
            this.pendingEnd = true;
            return;
        }
        for (let i = 0; i < listeners.length; i++) {
            const notify = listeners[i];
            notify();
        }
    }
    /** Notifies every error listener, or stores the error when there is none. */
    emitError(err) {
        const listeners = this.errorListeners;
        if (listeners.length === 0) {
            this.pendingError = err;
            return;
        }
        for (let i = 0; i < listeners.length; i++) {
            const notify = listeners[i];
            notify(err);
        }
    }
}
68
+ //# sourceMappingURL=BufferedStreamEmitter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"BufferedStreamEmitter.js","sourceRoot":"","sources":["../../src/engines/BufferedStreamEmitter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AACH,MAAM,OAAO,qBAAqB;IACxB,cAAc,GAAqC,EAAE,CAAC;IACtD,YAAY,GAAsB,EAAE,CAAC;IACrC,cAAc,GAAgC,EAAE,CAAC;IACjD,aAAa,GAAiB,EAAE,CAAC;IACjC,UAAU,GAAG,KAAK,CAAC;IACnB,YAAY,GAAiB,IAAI,CAAC;IAK1C,EAAE,CAAC,KAAgC,EAAE,QAAkC;QACrE,IAAI,KAAK,KAAK,OAAO,EAAE,CAAC;YACtB,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACnC,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC;YACnC,IAAI,CAAC,aAAa,GAAG,EAAE,CAAC;YACxB,KAAK,MAAM,GAAG,IAAI,OAAO;gBAAE,QAAQ,CAAC,GAAG,CAAC,CAAC;QAC3C,CAAC;aAAM,IAAI,KAAK,KAAK,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACjC,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;gBAAC,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;gBAAC,QAAQ,EAAE,CAAC;YAAC,CAAC;QAC/D,CAAC;aAAM,IAAI,KAAK,KAAK,OAAO,EAAE,CAAC;YAC7B,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACnC,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;gBAAC,MAAM,CAAC,GAAG,IAAI,CAAC,YAAY,CAAC;gBAAC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;gBAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;YAAC,CAAC;QAChG,CAAC;IACH,CAAC;IAED,SAAS,CAAC,GAAe;QACvB,IAAI,IAAI,CAAC,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAAC,OAAO;QAAC,CAAC;QAC/E,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,cAAc;YAAE,CAAC,CAAC,GAAG,CAAC,CAAC;IAC9C,CAAC;IACD,OAAO;QACL,IAAI,IAAI,CAAC,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAAC,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC;YAAC,OAAO;QAAC,CAAC;QACvE,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,YAAY;YAAE,CAAC,EAAE,CAAC;IACzC,CAAC;IACD,SAAS,CAAC,GAAU;QAClB,IAAI,IAAI,CAAC,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAAC,IAAI,CAAC,YAAY,GAAG,GAAG,CAAC;YAAC,OAAO;QAAC,CAAC;QAC1E,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,cAAc;YAAE,CAAC,CAAC,GAAG,CAAC,CAAC;IAC9C,CAAC;CACF","sourcesContent":["/**\n * Per-stream emitter that buffers events emitted before any listener attaches.\n *\n * The native side of `stream()` starts producing chunks the moment we kick it\n * off, but the JS caller usually attaches `.on('chunk')` immediately after —\n * there's a 
small async gap. Without buffering, early chunks would be silently\n * dropped. Once a listener is attached, queued events drain to it.\n *\n * This class is exported separately from SupertonicEngine so it can be unit\n * tested without needing the native module to load.\n */\nexport class BufferedStreamEmitter {\n private chunkListeners: Array<(pcm: Uint8Array) => void> = [];\n private endListeners: Array<() => void> = [];\n private errorListeners: Array<(err: Error) => void> = [];\n private pendingChunks: Uint8Array[] = [];\n private pendingEnd = false;\n private pendingError: Error | null = null;\n\n on(event: 'chunk', listener: (pcm: Uint8Array) => void): void;\n on(event: 'end', listener: () => void): void;\n on(event: 'error', listener: (err: Error) => void): void;\n on(event: 'chunk' | 'end' | 'error', listener: (...args: any[]) => void): void {\n if (event === 'chunk') {\n this.chunkListeners.push(listener);\n const drained = this.pendingChunks;\n this.pendingChunks = [];\n for (const pcm of drained) listener(pcm);\n } else if (event === 'end') {\n this.endListeners.push(listener);\n if (this.pendingEnd) { this.pendingEnd = false; listener(); }\n } else if (event === 'error') {\n this.errorListeners.push(listener);\n if (this.pendingError) { const e = this.pendingError; this.pendingError = null; listener(e); }\n }\n }\n\n emitChunk(pcm: Uint8Array): void {\n if (this.chunkListeners.length === 0) { this.pendingChunks.push(pcm); return; }\n for (const l of this.chunkListeners) l(pcm);\n }\n emitEnd(): void {\n if (this.endListeners.length === 0) { this.pendingEnd = true; return; }\n for (const l of this.endListeners) l();\n }\n emitError(err: Error): void {\n if (this.errorListeners.length === 0) { this.pendingError = err; return; }\n for (const l of this.errorListeners) l(err);\n }\n}\n"]}