npm - react-native-tts-kit - Versions diffs - 0.1.0 - Mend

react-native-tts-kit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

package/ATTRIBUTIONS.md +87 -0
package/LICENSE +21 -0
package/README.md +231 -0
package/android/build.gradle +50 -0
package/android/src/main/AndroidManifest.xml +3 -0
package/android/src/main/java/expo/modules/ttskit/RNTTSKitModule.kt +158 -0
package/android/src/main/java/expo/modules/ttskit/supertonic/AudioEngine.kt +158 -0
package/android/src/main/java/expo/modules/ttskit/supertonic/ModelLocator.kt +372 -0
package/android/src/main/java/expo/modules/ttskit/supertonic/SupertonicSession.kt +373 -0
package/android/src/main/java/expo/modules/ttskit/supertonic/TextFrontend.kt +154 -0
package/android/src/main/java/expo/modules/ttskit/supertonic/VoicePack.kt +47 -0
package/build/engines/BufferedStreamEmitter.d.ts +26 -0
package/build/engines/BufferedStreamEmitter.d.ts.map +1 -0
package/build/engines/BufferedStreamEmitter.js +68 -0
package/build/engines/BufferedStreamEmitter.js.map +1 -0
package/build/engines/Engine.d.ts +15 -0
package/build/engines/Engine.d.ts.map +1 -0
package/build/engines/Engine.js +2 -0
package/build/engines/Engine.js.map +1 -0
package/build/engines/SupertonicEngine.d.ts +14 -0
package/build/engines/SupertonicEngine.d.ts.map +1 -0
package/build/engines/SupertonicEngine.js +183 -0
package/build/engines/SupertonicEngine.js.map +1 -0
package/build/engines/SystemEngine.d.ts +13 -0
package/build/engines/SystemEngine.d.ts.map +1 -0
package/build/engines/SystemEngine.js +78 -0
package/build/engines/SystemEngine.js.map +1 -0
package/build/index.d.ts +46 -0
package/build/index.d.ts.map +1 -0
package/build/index.js +118 -0
package/build/index.js.map +1 -0
package/build/types.d.ts +77 -0
package/build/types.d.ts.map +1 -0
package/build/types.js +2 -0
package/build/types.js.map +1 -0
package/build/voices/catalog.d.ts +12 -0
package/build/voices/catalog.d.ts.map +1 -0
package/build/voices/catalog.js +28 -0
package/build/voices/catalog.js.map +1 -0
package/build/voices/prosody.d.ts +8 -0
package/build/voices/prosody.d.ts.map +1 -0
package/build/voices/prosody.js +28 -0
package/build/voices/prosody.js.map +1 -0
package/expo-module.config.json +9 -0
package/ios/RNTTSKit.podspec +28 -0
package/ios/RNTTSKitModule.swift +133 -0
package/ios/Supertonic/AudioEngine.swift +110 -0
package/ios/Supertonic/ModelLocator.swift +416 -0
package/ios/Supertonic/SupertonicSession.swift +405 -0
package/ios/Supertonic/TextFrontend.swift +216 -0
package/ios/Supertonic/VoicePack.swift +51 -0
package/licenses/OpenRAIL-M.txt +209 -0
package/package.json +77 -0
package/src/engines/BufferedStreamEmitter.ts +50 -0
package/src/engines/Engine.ts +28 -0
package/src/engines/SupertonicEngine.ts +250 -0
package/src/engines/SystemEngine.ts +96 -0
package/src/engines/__tests__/BufferedStreamEmitter.test.ts +65 -0
package/src/index.ts +156 -0
package/src/types.ts +95 -0
package/src/voices/__tests__/catalog.test.ts +46 -0
package/src/voices/__tests__/prosody.test.ts +63 -0
package/src/voices/catalog.ts +32 -0
package/src/voices/prosody.ts +39 -0

package/android/src/main/java/expo/modules/ttskit/supertonic/SupertonicSession.kt ADDED Viewed

@@ -0,0 +1,373 @@
+package expo.modules.ttskit.supertonic
+import ai.onnxruntime.OnnxTensor
+import ai.onnxruntime.OrtEnvironment
+import ai.onnxruntime.OrtSession
+import android.content.Context
+import org.json.JSONObject
+import java.io.File
+import java.nio.ByteBuffer
+import java.nio.ByteOrder
+import java.nio.FloatBuffer
+import java.nio.LongBuffer
+import kotlin.math.cos
+import kotlin.math.ln
+import kotlin.math.max
+import kotlin.math.min
+import kotlin.math.sqrt
+import kotlin.random.Random
+class SupertonicSession(private val context: Context) {
+  private var env: OrtEnvironment? = null
+  private var dp: OrtSession? = null
+  private var enc: OrtSession? = null
+  private var vec: OrtSession? = null
+  private var voc: OrtSession? = null
+  private var indexer: UnicodeIndexer? = null
+  private val voiceCache = HashMap<String, VoicePack>()
+  private var sampleRateValue: Int = 24_000
+  private var baseChunkSize: Int = 0
+  private var chunkCompressFactor: Int = 0
+  private var latentDimBase: Int = 0
+  @Volatile private var cancelled = false
+  val isReady: Boolean
+    get() = dp != null && enc != null && vec != null && voc != null && indexer != null && baseChunkSize > 0
+  val sampleRate: Int get() = sampleRateValue
+  /**
+   * Lazily creates the four ONNX Runtime sessions, reads `tts.json` and builds the
+   * [UnicodeIndexer]. Returns immediately when [isReady] is already true.
+   *
+   * The two [OrtSession.SessionOptions] objects are native resources and are closed as
+   * soon as the sessions have been created. If any step fails, everything created so
+   * far is released through [tearDown] before the error is rethrown, so a later retry
+   * cannot overwrite (and leak) half-initialised sessions.
+   *
+   * @throws IllegalStateException if `tts.json` contains a non-positive chunk/latent size.
+   */
+  fun loadIfNeeded() {
+    if (isReady) return
+    val ortEnv = OrtEnvironment.getEnvironment()
+    // EP strategy on Android: NNAPI with USE_FP16, then XNNPACK as fallback.
+    //
+    //   NNAPI + USE_FP16: takes the fp32 graph and relaxes it to fp16 inside
+    //   the device's neural accelerator (Hexagon / Mali / etc.). This is the
+    //   documented path to fp16 speed on Android; XNNPACK EP and the default
+    //   CPU EP have no native fp16 kernels and produce a Cast-storm on fp16
+    //   models (ORT issue #25824 — ~50% of time in casts, garbled outputs in
+    //   diffusion models). That's why ModelLocator ships fp32 to Android.
+    //
+    //   If NNAPI rejects ops it can't handle, ORT auto-partitions them to the
+    //   CPU EP — fine for stragglers. If addNnapi() itself throws (older
+    //   Android with no NNAPI 1.2+, emulator), we fall through to XNNPACK
+    //   which handles fp32 Conv/MatMul/Gemm quickly.
+    val cpuCount = Runtime.getRuntime().availableProcessors()
+    val xnnpackThreads = minOf(4, maxOf(2, cpuCount))
+    fun OrtSession.SessionOptions.applyEps() {
+      val nnapiOk = runCatching {
+        // USE_FP16 = relax float32 → float16 at runtime where supported.
+        // CPU_DISABLED stays unset so unsupported ops auto-fall-back to CPU EP.
+        addNnapi(java.util.EnumSet.of(ai.onnxruntime.providers.NNAPIFlags.USE_FP16))
+      }.isSuccess
+      if (!nnapiOk) {
+        android.util.Log.w("ST", "NNAPI EP unavailable, falling back to XNNPACK")
+        runCatching { addXnnpack(mapOf("intra_op_num_threads" to xnnpackThreads.toString())) }
+          .onFailure { android.util.Log.w("ST", "XNNPACK also unavailable, using CPU EP: ${it.message}") }
+      } else {
+        android.util.Log.i("ST", "NNAPI EP loaded with USE_FP16")
+      }
+    }
+    // Toggle this to VERBOSE briefly when investigating NNAPI partitioning.
+    // VERBOSE makes ORT log every op it placed on each EP and every "this op
+    // is unsupported by NNAPI, falling back to CPU" decision. Helpful when
+    // synthesis is unexpectedly slow on Android — we want to see what NNAPI
+    // rejected. Leave at WARNING for release.
+    val sessLogLevel = ai.onnxruntime.OrtLoggingLevel.ORT_LOGGING_LEVEL_WARNING
+    val opts = OrtSession.SessionOptions()
+    val vecOpts = OrtSession.SessionOptions()
+    try {
+      opts.apply {
+        setIntraOpNumThreads(1)
+        setOptimizationLevel(OrtSession.SessionOptions.OptLevel.ALL_OPT)
+        setSessionLogLevel(sessLogLevel)
+        applyEps()
+      }
+      // Diffusion graph (vec) sees varying input shapes per call. ORT's memory-
+      // pattern optimizer pre-allocates buffers from a profiled shape and then
+      // emits "Shape mismatch attempting to re-use buffer" warnings + reallocs
+      // every step at runtime. Disabling the optimizer for this one session
+      // skips the wasted alloc/free on the hot path. The text encoder, duration
+      // predictor and vocoder have stable enough shapes that we leave it on.
+      vecOpts.apply {
+        setIntraOpNumThreads(1)
+        setOptimizationLevel(OrtSession.SessionOptions.OptLevel.ALL_OPT)
+        setMemoryPatternOptimization(false)
+        setSessionLogLevel(sessLogLevel)
+        applyEps()
+      }
+      android.util.Log.i("ST", "Loading sessions with ortIntraOp=1 cpuCount=$cpuCount")
+      env = ortEnv
+      dp  = ortEnv.createSession(ModelLocator.resolvedOnnxPath(context, "duration_predictor.onnx"), opts)
+      enc = ortEnv.createSession(ModelLocator.resolvedOnnxPath(context, "text_encoder.onnx"), opts)
+      vec = ortEnv.createSession(ModelLocator.resolvedOnnxPath(context, "vector_estimator.onnx"), vecOpts)
+      voc = ortEnv.createSession(ModelLocator.resolvedOnnxPath(context, "vocoder.onnx"), opts)
+      val cfgPath = ModelLocator.resolvedOnnxPath(context, "tts.json")
+      val cfg = JSONObject(File(cfgPath).readText())
+      val ae = cfg.getJSONObject("ae")
+      val ttl = cfg.getJSONObject("ttl")
+      sampleRateValue = ae.getInt("sample_rate")
+      baseChunkSize = ae.getInt("base_chunk_size")
+      chunkCompressFactor = ttl.getInt("chunk_compress_factor")
+      latentDimBase = ttl.getInt("latent_dim")
+      // isReady requires baseChunkSize > 0; without this check a bad config would make
+      // every call re-create (and leak) all four sessions.
+      check(baseChunkSize > 0 && chunkCompressFactor > 0 && latentDimBase > 0) {
+        "Invalid tts.json: base_chunk_size=$baseChunkSize " +
+          "chunk_compress_factor=$chunkCompressFactor latent_dim=$latentDimBase"
+      }
+      val idxPath = ModelLocator.resolvedOnnxPath(context, "unicode_indexer.json")
+      indexer = UnicodeIndexer(idxPath)
+    } catch (t: Throwable) {
+      // Roll back partially created sessions so the next attempt starts clean.
+      tearDown()
+      throw t
+    } finally {
+      // Session options are only needed while the sessions are being constructed.
+      runCatching { vecOpts.close() }
+      runCatching { opts.close() }
+    }
+  }
+  /** Clears the cancellation flag at the start of a synthesis run. */
+  fun beginRun() {
+    cancelled = false
+  }
+  /** Raises the cancellation flag; the synthesis loops poll it between stages and steps. */
+  fun cancel() {
+    cancelled = true
+  }
+  /**
+   * Returns the [VoicePack] for [voiceId], loading it from disk and caching it on a miss.
+   *
+   * @throws IllegalArgumentException if the resolved voice file does not exist.
+   * @throws IllegalStateException if the ONNX environment has not been created yet
+   *   (i.e. [loadIfNeeded] has not completed).
+   */
+  private fun voicePack(voiceId: String): VoicePack {
+    voiceCache[voiceId]?.let { return it }
+    val path = ModelLocator.resolvedVoicePath(context, voiceId)
+    require(File(path).exists()) { "Voice $voiceId not available" }
+    val ortEnv = checkNotNull(env) { "ONNX environment not initialized; call loadIfNeeded() first" }
+    val pack = VoicePack(voiceId, ortEnv, path)
+    // Bound the cache so a growing voice catalogue can't leak native tensors.
+    // This is a full flush, not LRU: once 8 packs are cached, the next miss
+    // closes and drops all of them before inserting the new one.
+    if (voiceCache.size >= 8) {
+      voiceCache.values.forEach { runCatching { it.close() } }
+      voiceCache.clear()
+    }
+    voiceCache[voiceId] = pack
+    return pack
+  }
+  /**
+   * Best-effort warm-up of the "F1" voice so its JSON decode and tensor allocation
+   * happen before the first `speak()`. Called from `prefetch()`; any failure is
+   * deliberately ignored here.
+   */
+  fun prewarmDefaultVoice() {
+    try {
+      voicePack("F1")
+    } catch (ignored: Throwable) {
+      // Intentionally swallowed: warm-up must never fail the caller.
+    }
+  }
+  /**
+   * Releases every cached voice tensor and ONNX session and resets the config values,
+   * so [isReady] becomes false. Called from OnDestroy so native resources are freed
+   * deterministically instead of waiting for GC. Close failures are ignored.
+   */
+  fun tearDown() {
+    for (pack in voiceCache.values) {
+      runCatching { pack.close() }
+    }
+    voiceCache.clear()
+    indexer = null
+    for (session in listOf(dp, enc, vec, voc)) {
+      runCatching { session?.close() }
+    }
+    dp = null
+    enc = null
+    vec = null
+    voc = null
+    env = null
+    baseChunkSize = 0
+    chunkCompressFactor = 0
+    latentDimBase = 0
+  }
+  /**
+   * Synthesizes a single (already chunked) piece of text and returns float PCM samples
+   * at [sampleRate].
+   *
+   * Stages: text frontend → duration predictor → text encoder → [totalStep] denoising
+   * steps → vocoder. The cancellation flag is polled between stages and before every
+   * denoising step. All native tensors and session results created here are released
+   * on every exit path, including cancellation and model errors.
+   *
+   * @param speed divisor applied to the predicted duration; must be greater than zero.
+   * @return the synthesized samples, or an empty array when the text encodes to no ids.
+   * @throws IllegalArgumentException if [speed] is not positive, the language is
+   *   unsupported, or the voice is unavailable.
+   * @throws RuntimeException with message "Synthesis cancelled" when [cancel] was called.
+   */
+  fun synthesizeOne(text: String, lang: String, voiceId: String, totalStep: Int, speed: Double): FloatArray {
+    // A zero/negative/NaN speed would turn the predicted duration into Infinity/negative
+    // values and fail later with an opaque array-size error; reject it up front.
+    require(speed > 0.0) { "speed must be > 0, was $speed" }
+    val t0 = System.nanoTime()
+    fun dMs(from: Long, to: Long) = ((to - from) / 1_000_000.0).toInt()
+    loadIfNeeded()
+    val tLoad = System.nanoTime()
+    val ortEnv = env ?: error("env not initialized")
+    val dpSession = checkNotNull(dp) { "duration predictor session not loaded" }
+    val encSession = checkNotNull(enc) { "text encoder session not loaded" }
+    val vecSession = checkNotNull(vec) { "vector estimator session not loaded" }
+    val vocSession = checkNotNull(voc) { "vocoder session not loaded" }
+    val unicodeIndexer = checkNotNull(indexer) { "unicode indexer not loaded" }
+    val voice = voicePack(voiceId)
+    val tVoice = System.nanoTime()
+    val processed = TextFrontend.preprocess(text, lang)
+    val ids = unicodeIndexer.encode(processed)
+    if (ids.isEmpty()) return FloatArray(0)
+    val bsz = 1
+    val textLen = ids.size
+    val mask = FloatArray(textLen) { 1f }
+    val tText = System.nanoTime()
+    // Tensors that must outlive a single model call are registered here and closed,
+    // newest first, in the `finally` below. Per-call tensors use `use { }` instead.
+    val owned = ArrayList<AutoCloseable>()
+    fun <T : AutoCloseable> own(resource: T): T {
+      owned.add(resource)
+      return resource
+    }
+    try {
+      val textIdsT = own(
+        OnnxTensor.createTensor(ortEnv, LongBuffer.wrap(ids), longArrayOf(bsz.toLong(), textLen.toLong()))
+      )
+      val textMaskT = own(
+        OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(mask), longArrayOf(bsz.toLong(), 1, textLen.toLong()))
+      )
+      val tTensors = System.nanoTime()
+      val dpInputs = mapOf("text_ids" to textIdsT, "style_dp" to voice.dp, "text_mask" to textMaskT)
+      val durArr = dpSession.run(dpInputs).use { out -> (out.get(0).value as FloatArray).copyOf() }
+      for (i in durArr.indices) durArr[i] = (durArr[i] / speed.toFloat())
+      val tDP = System.nanoTime()
+      if (cancelled) throw RuntimeException("Synthesis cancelled")
+      val encInputs = mapOf("text_ids" to textIdsT, "style_ttl" to voice.ttl, "text_mask" to textMaskT)
+      // CRITICAL: Java ONNX Runtime ties child tensor lifetimes to the parent
+      // OrtSession.Result. If we hold the raw `textEmb` across the denoising
+      // loop and then close the encoder result, every iteration after the first
+      // sees an invalidated tensor and produces garbage audio. Clone into a fresh
+      // owned tensor immediately and close the parent right away.
+      val textEmb: OnnxTensor = own(
+        encSession.run(encInputs).use { out ->
+          val src = out.get(0) as OnnxTensor
+          val shape = src.info.shape.copyOf()
+          val total = shape.fold(1L) { acc, d -> acc * d }.toInt()
+          val flat = FloatArray(total)
+          val buf = src.floatBuffer
+          buf.rewind()
+          buf.get(flat)
+          OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(flat), shape)
+        }
+      )
+      val tEnc = System.nanoTime()
+      val latentDim = latentDimBase * chunkCompressFactor
+      val chunkSize = baseChunkSize * chunkCompressFactor
+      val maxDur = durArr.max()
+      val wavLenMax = (maxDur * sampleRateValue).toInt()
+      val latentLen = (wavLenMax + chunkSize - 1) / chunkSize
+      val wavLengths = durArr.map { (it * sampleRateValue).toInt() }
+      val latentLengths = wavLengths.map { (it + chunkSize - 1) / chunkSize }
+      // Gaussian noise (Box–Muller) for the valid latent frames; padding frames stay 0.
+      val noisy = FloatArray(bsz * latentDim * latentLen)
+      var idx = 0
+      for (b in 0 until bsz) {
+        val lLen = latentLengths[b]
+        for (d in 0 until latentDim) {
+          for (t in 0 until latentLen) {
+            if (t < lLen) {
+              // Clamp u1 away from 0 so ln(u1) stays finite.
+              val u1 = max(1e-7f, Random.nextFloat())
+              val u2 = Random.nextFloat()
+              noisy[idx] = sqrt(-2f * ln(u1)) * cos(2f * Math.PI.toFloat() * u2)
+            }
+            idx++
+          }
+        }
+      }
+      val latentMask = FloatArray(bsz * latentLen)
+      for (b in 0 until bsz) {
+        for (t in 0 until latentLengths[b]) latentMask[b * latentLen + t] = 1f
+      }
+      val latentMaskT = own(
+        OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(latentMask), longArrayOf(bsz.toLong(), 1, latentLen.toLong()))
+      )
+      val totalStepArr = FloatArray(bsz) { totalStep.toFloat() }
+      val totalStepT = own(
+        OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(totalStepArr), longArrayOf(bsz.toLong()))
+      )
+      val tNoise = System.nanoTime()
+      val latentShape = longArrayOf(bsz.toLong(), latentDim.toLong(), latentLen.toLong())
+      var current = noisy
+      val stepTimes = IntArray(totalStep)
+      for (step in 0 until totalStep) {
+        val tStepStart = System.nanoTime()
+        if (cancelled) throw RuntimeException("Synthesis cancelled")
+        // Per-step tensors and the step result are closed even if the model call throws.
+        current = OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(current), latentShape).use { xt ->
+          OnnxTensor.createTensor(
+            ortEnv,
+            FloatBuffer.wrap(FloatArray(bsz) { step.toFloat() }),
+            longArrayOf(bsz.toLong())
+          ).use { curStepT ->
+            vecSession.run(mapOf(
+              "noisy_latent" to xt,
+              "text_emb" to textEmb,
+              "style_ttl" to voice.ttl,
+              "latent_mask" to latentMaskT,
+              "text_mask" to textMaskT,
+              "current_step" to curStepT,
+              "total_step" to totalStepT
+            )).use { vecOut -> flatten3D(vecOut.get(0).value) }
+          }
+        }
+        stepTimes[step] = ((System.nanoTime() - tStepStart) / 1_000_000).toInt()
+      }
+      val tDiffusion = System.nanoTime()
+      if (cancelled) throw RuntimeException("Synthesis cancelled")
+      val wav = OnnxTensor.createTensor(ortEnv, FloatBuffer.wrap(current), latentShape).use { finalLatent ->
+        vocSession.run(mapOf("latent" to finalLatent)).use { vocOut ->
+          @Suppress("UNCHECKED_CAST")
+          val rows = vocOut.get(0).value as Array<FloatArray>
+          rows[0]
+        }
+      }
+      val tVoc = System.nanoTime()
+      // The vocoder output covers whole latent chunks; trim to the predicted duration.
+      val trimLen = min(wav.size, (durArr[0] * sampleRateValue).toInt())
+      val outArr = if (trimLen > 0 && trimLen < wav.size) wav.copyOfRange(0, trimLen) else wav
+      val totalMs = dMs(t0, tVoc)
+      val stepSummary = stepTimes.withIndex().joinToString(" ") { "${it.index}:${it.value}" }
+      android.util.Log.i("ST.timing",
+        "total=${totalMs}ms " +
+        "load=${dMs(t0, tLoad)} voice=${dMs(tLoad, tVoice)} " +
+        "text=${dMs(tVoice, tText)} tensors=${dMs(tText, tTensors)} " +
+        "dp=${dMs(tTensors, tDP)} enc=${dMs(tDP, tEnc)} " +
+        "noise=${dMs(tEnc, tNoise)} diffusion=${dMs(tNoise, tDiffusion)} " +
+        "voc=${dMs(tDiffusion, tVoc)} " +
+        "chars=${ids.size} latentLen=$latentLen steps=[$stepSummary]")
+      return outArr
+    } finally {
+      for (resource in owned.asReversed()) runCatching { resource.close() }
+    }
+  }
+  /**
+   * Synthesizes [text] chunk by chunk and returns one contiguous PCM buffer, with
+   * 0.3 s of silence inserted between consecutive chunks.
+   *
+   * @return the concatenated samples, or an empty array when the text yields no chunks.
+   * @throws RuntimeException with message "Synthesis cancelled" when [cancel] was called.
+   */
+  fun synthesize(text: String, lang: String, voiceId: String, totalStep: Int, speed: Double): FloatArray {
+    beginRun()
+    val chunks = TextFrontend.chunk(text, lang)
+    if (chunks.isEmpty()) return FloatArray(0)
+    // Load the model first: the real sample rate comes from tts.json, and reading
+    // sampleRateValue before that would size the gap with the built-in default.
+    loadIfNeeded()
+    val gapSamples = (0.3 * sampleRateValue).toInt()
+    // Collect primitive arrays instead of boxing every sample into an ArrayList<Float>.
+    val parts = ArrayList<FloatArray>(chunks.size)
+    var totalSamples = gapSamples * (chunks.size - 1)
+    for (c in chunks) {
+      if (cancelled) throw RuntimeException("Synthesis cancelled")
+      val pcm = synthesizeOne(c, lang, voiceId, totalStep, speed)
+      parts.add(pcm)
+      totalSamples += pcm.size
+    }
+    // FloatArray is zero-initialised, so skipping `gapSamples` leaves silence in place.
+    val out = FloatArray(totalSamples)
+    var offset = 0
+    for ((i, pcm) in parts.withIndex()) {
+      if (i > 0) offset += gapSamples
+      pcm.copyInto(out, offset)
+      offset += pcm.size
+    }
+    return out
+  }
+  /**
+   * Synthesizes [text] chunk by chunk, handing each non-empty PCM buffer to [onChunk]
+   * as soon as it is ready. Stops silently between chunks once [cancel] has been
+   * called. Logs the time to the first emitted chunk under the "ST.timing" tag.
+   */
+  fun synthesizeStreaming(
+    text: String, lang: String, voiceId: String, totalStep: Int, speed: Double,
+    onChunk: (FloatArray) -> Unit
+  ) {
+    val startedAt = System.nanoTime()
+    loadIfNeeded()
+    beginRun()
+    val chunks = TextFrontend.chunk(text, lang)
+    var awaitingFirstAudio = true
+    for (piece in chunks) {
+      if (cancelled) return
+      val audio = synthesizeOne(piece, lang, voiceId, totalStep, speed)
+      if (audio.isEmpty()) continue
+      if (awaitingFirstAudio) {
+        val ttfa = ((System.nanoTime() - startedAt) / 1_000_000).toInt()
+        android.util.Log.i("ST.timing", "TTFA=${ttfa}ms (first chunk emitted, chunks=${chunks.size})")
+        awaitingFirstAudio = false
+      }
+      onChunk(audio)
+    }
+  }
+  /**
+   * Copies a nested `float[B][D][T]` ONNX result into one flat array, laid out
+   * batch-major, then dimension, then time. The extents are taken from the first
+   * batch entry and its first row.
+   */
+  @Suppress("UNCHECKED_CAST")
+  private fun flatten3D(raw: Any): FloatArray {
+    val batches = raw as Array<Array<FloatArray>>
+    val batchCount = batches.size
+    val dimCount = batches[0].size
+    val frameCount = batches[0][0].size
+    val flat = FloatArray(batchCount * dimCount * frameCount)
+    var offset = 0
+    for (i in 0 until batchCount) {
+      for (j in 0 until dimCount) {
+        System.arraycopy(batches[i][j], 0, flat, offset, frameCount)
+        offset += frameCount
+      }
+    }
+    return flat
+  }
+  companion object {
+    /**
+     * Converts float samples to little-endian signed 16-bit PCM. Each sample is
+     * clamped to [-1, 1] and scaled by 32767 before truncation.
+     */
+    fun toPcm16(samples: FloatArray): ByteArray {
+      val pcm = ByteArray(samples.size * 2)
+      val writer = ByteBuffer.wrap(pcm).order(ByteOrder.LITTLE_ENDIAN)
+      samples.forEach { sample ->
+        val bounded = when {
+          sample > 1f -> 1f
+          sample < -1f -> -1f
+          else -> sample
+        }
+        writer.putShort((bounded * 32767f).toInt().toShort())
+      }
+      return pcm
+    }
+  }
+}

package/android/src/main/java/expo/modules/ttskit/supertonic/TextFrontend.kt ADDED Viewed

@@ -0,0 +1,154 @@
+package expo.modules.ttskit.supertonic
+import org.json.JSONArray
+import java.io.File
+import java.text.Normalizer
+object TextFrontend {
+  val AVAILABLE_LANGS = setOf(
+    "en", "ko", "ja", "ar", "bg", "cs", "da", "de", "el", "es", "et", "fi",
+    "fr", "hi", "hr", "hu", "id", "it", "lt", "lv", "nl", "pl", "pt", "ro",
+    "ru", "sk", "sl", "sv", "tr", "uk", "vi"
+  )
+  private val ABBREVIATIONS = setOf(
+    "Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
+    "St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
+    "Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D."
+  )
+  /** Maximum characters per synthesis chunk: 120 for "ko" and "ja", 300 for everything else. */
+  fun maxChunkLength(lang: String): Int = when (lang) {
+    "ko", "ja" -> 120
+    else -> 300
+  }
+  // Code-point ranges removed from the input (emoji, pictographs, misc symbols, flags).
+  private val EMOJI_RANGES = listOf(
+    0x1F600..0x1F64F, 0x1F300..0x1F5FF, 0x1F680..0x1F6FF, 0x1F700..0x1F77F,
+    0x1F780..0x1F7FF, 0x1F800..0x1F8FF, 0x1F900..0x1F9FF, 0x1FA00..0x1FA6F,
+    0x1FA70..0x1FAFF, 0x2600..0x26FF, 0x2700..0x27BF, 0x1F1E6..0x1F1FF
+  )
+  // Applied in order: typographic characters mapped to ASCII or to a space.
+  private val SYMBOL_REPLACEMENTS = listOf(
+    "–" to "-", "‑" to "-", "—" to "-",
+    "_" to " ",
+    "“" to "\"", "”" to "\"",
+    "‘" to "'", "’" to "'",
+    "´" to "'", "`" to "'",
+    "[" to " ", "]" to " ", "|" to " ", "/" to " ", "#" to " ",
+    "→" to " ", "←" to " "
+  )
+  // Symbols deleted outright.
+  private val DROPPED_SYMBOLS = listOf("♥", "☆", "♡", "©", "\\")
+  // Removes the space that precedes a punctuation mark.
+  private val PUNCTUATION_SPACING = listOf(
+    " ," to ",", " ." to ".", " !" to "!", " ?" to "?",
+    " ;" to ";", " :" to ":", " '" to "'"
+  )
+  private val WHITESPACE_RUN = Regex("\\s+")
+  // Characters that count as sentence-final; text ending in anything else gets a ".".
+  private const val TERMINAL_PUNCTUATION = ".!?;:,'\"\u201C\u201D\u2018\u2019)]}…。」』】〉》›»"
+  /**
+   * Normalizes [text] for the model: NFKD normalization, emoji removal, symbol and
+   * quote replacement, punctuation spacing fixes, whitespace collapsing, and a trailing
+   * "." when the text does not already end in terminal punctuation. The result is
+   * wrapped as `<lang>…</lang>`.
+   *
+   * @throws IllegalArgumentException if [lang] is not in [AVAILABLE_LANGS].
+   */
+  fun preprocess(text: String, lang: String): String {
+    require(AVAILABLE_LANGS.contains(lang)) { "Unsupported language: $lang" }
+    val normalized = Normalizer.normalize(text, Normalizer.Form.NFKD)
+    // Strip emoji blocks, iterating by code point so surrogate pairs stay intact.
+    val sb = StringBuilder(normalized.length)
+    var i = 0
+    while (i < normalized.length) {
+      val cp = normalized.codePointAt(i)
+      if (EMOJI_RANGES.none { cp in it }) sb.appendCodePoint(cp)
+      i += Character.charCount(cp)
+    }
+    var s = sb.toString()
+    for ((from, to) in SYMBOL_REPLACEMENTS) s = s.replace(from, to)
+    for (sym in DROPPED_SYMBOLS) s = s.replace(sym, "")
+    s = s.replace("@", " at ").replace("e.g.,", "for example, ").replace("i.e.,", "that is, ")
+    for ((from, to) in PUNCTUATION_SPACING) s = s.replace(from, to)
+    // Collapse repeated quotes. Backticks need no such pass: every backtick was
+    // already rewritten to an apostrophe by SYMBOL_REPLACEMENTS.
+    while (s.contains("\"\"")) s = s.replace("\"\"", "\"")
+    while (s.contains("''")) s = s.replace("''", "'")
+    s = s.replace(WHITESPACE_RUN, " ").trim()
+    // Check the last character directly. A `.*[…]$` regex match fails whenever the text
+    // still contains a line terminator that `\s` does not cover (U+0085, U+2028,
+    // U+2029), which appended a spurious "." after existing punctuation.
+    if (s.isNotEmpty() && s.last() !in TERMINAL_PUNCTUATION) {
+      s += "."
+    }
+    return "<$lang>$s</$lang>"
+  }
+  /**
+   * Splits [text] into synthesis chunks. Paragraphs (separated by a blank line) that
+   * fit within [maxChunkLength] are kept whole; longer ones are split into sentences
+   * and greedily re-packed. Returns an empty list for blank input.
+   */
+  fun chunk(text: String, lang: String): List<String> {
+    val limit = maxChunkLength(lang)
+    val body = text.trim()
+    if (body.isEmpty()) return emptyList()
+    val paragraphs = body.split(Regex("\\n\\s*\\n"))
+      .map { it.trim() }
+      .filter { it.isNotEmpty() }
+      .ifEmpty { listOf(body) }
+    val pieces = paragraphs.flatMap { para ->
+      if (para.length <= limit) listOf(para) else greedyJoin(splitSentences(para), limit)
+    }
+    return pieces.ifEmpty { listOf(body) }
+  }
+  /**
+   * Splits [text] at sentence boundaries, keeping the terminator (and any whitespace
+   * the boundary consumed) at the end of each piece. A boundary whose preceding text
+   * ends in one of [ABBREVIATIONS] is skipped. Returns the whole text as a single
+   * element when no boundary applies.
+   */
+  private fun splitSentences(text: String): List<String> {
+    // Latin punctuation requires trailing whitespace; Asian terminal
+    // punctuation (。！？) does not. Without the Asian branch, long ja/ko/zh
+    // strings collapse into one oversized chunk which the model truncates.
+    val boundary = Regex("([.!?])\\s+|([。！？])")
+    val sentences = mutableListOf<String>()
+    var cursor = 0
+    for (hit in boundary.findAll(text)) {
+      val puncIndex = hit.range.first
+      val nextStart = hit.range.last + 1
+      val candidate = text.substring(cursor, puncIndex).trim() + text[puncIndex]
+      if (ABBREVIATIONS.any { candidate.endsWith(it) }) continue
+      sentences.add(text.substring(cursor, nextStart))
+      cursor = nextStart
+    }
+    if (cursor < text.length) sentences.add(text.substring(cursor))
+    return if (sentences.isEmpty()) listOf(text) else sentences
+  }
+  /**
+   * Packs trimmed, non-empty [pieces] into chunks, joining neighbours with a single
+   * space for as long as the joined length stays within [maxLen]. A piece that is
+   * longer than [maxLen] on its own becomes its own chunk unchanged.
+   */
+  private fun greedyJoin(pieces: List<String>, maxLen: Int): List<String> {
+    val packed = mutableListOf<String>()
+    val buffer = StringBuilder()
+    for (piece in pieces.map { it.trim() }.filter { it.isNotEmpty() }) {
+      if (buffer.isNotEmpty() && buffer.length + 1 + piece.length > maxLen) {
+        packed.add(buffer.toString())
+        buffer.setLength(0)
+      }
+      if (buffer.isNotEmpty()) buffer.append(' ')
+      buffer.append(piece)
+    }
+    if (buffer.isNotEmpty()) packed.add(buffer.toString())
+    return packed
+  }
+}
+/**
+ * Maps Unicode code points to model token ids using a lookup table read from the JSON
+ * array stored at `path`; the table is indexed by code point.
+ */
+class UnicodeIndexer(path: String) {
+  private val table: LongArray = JSONArray(File(path).readText(Charsets.UTF_8)).let { json ->
+    LongArray(json.length()) { json.getLong(it) }
+  }
+  /**
+   * Returns one id per code point of [text]. Code points at or beyond the end of the
+   * table map to -1.
+   */
+  fun encode(text: String): LongArray {
+    val ids = LongArray(text.codePointCount(0, text.length))
+    var charPos = 0
+    for (slot in ids.indices) {
+      val cp = text.codePointAt(charPos)
+      ids[slot] = if (cp < table.size) table[cp] else -1L
+      charPos += Character.charCount(cp)
+    }
+    return ids
+  }
+}

package/android/src/main/java/expo/modules/ttskit/supertonic/VoicePack.kt ADDED Viewed

@@ -0,0 +1,47 @@
+package expo.modules.ttskit.supertonic
+import ai.onnxruntime.OnnxTensor
+import ai.onnxruntime.OrtEnvironment
+import org.json.JSONObject
+import java.io.File
+import java.nio.FloatBuffer
+/**
+ * Loads `voice_styles/<id>.json` from upstream Supertonic.
+ * Each file contains two 3D float tensors: style_ttl and style_dp.
+ *
+ * Both tensors are native resources owned by this object; call [close] to release
+ * them. If construction fails part-way, any tensor already created is closed.
+ *
+ * @throws IllegalArgumentException if a component's `data` does not hold exactly the
+ *   number of values declared by its `dims`.
+ */
+class VoicePack(val voiceId: String, env: OrtEnvironment, path: String) {
+  val ttl: OnnxTensor
+  val dp: OnnxTensor
+  init {
+    val root = JSONObject(File(path).readText(Charsets.UTF_8))
+    val ttlTensor = parseComponent(env, root.getJSONObject("style_ttl"))
+    dp = try {
+      parseComponent(env, root.getJSONObject("style_dp"))
+    } catch (t: Throwable) {
+      // Don't leak the first tensor when the second component is missing or malformed.
+      runCatching { ttlTensor.close() }
+      throw t
+    }
+    ttl = ttlTensor
+  }
+  /** Releases both native tensors; close failures are ignored. */
+  fun close() {
+    runCatching { ttl.close() }
+    runCatching { dp.close() }
+  }
+  /**
+   * Builds a float tensor of shape `dims` from the three-level nested `data` array of
+   * [obj], flattening it in nesting order.
+   */
+  private fun parseComponent(env: OrtEnvironment, obj: JSONObject): OnnxTensor {
+    val dimsArr = obj.getJSONArray("dims")
+    val dims = LongArray(dimsArr.length()) { i -> dimsArr.getLong(i) }
+    val total = dims.fold(1L) { acc, d -> acc * d }
+    require(total in 0..Int.MAX_VALUE.toLong()) { "Voice $voiceId: invalid dims ${dims.joinToString()}" }
+    val flat = FloatArray(total.toInt())
+    val data = obj.getJSONArray("data")
+    var idx = 0
+    for (a in 0 until data.length()) {
+      val l1 = data.getJSONArray(a)
+      for (b in 0 until l1.length()) {
+        val l2 = l1.getJSONArray(b)
+        for (c in 0 until l2.length()) {
+          require(idx < flat.size) { "Voice $voiceId: data holds more values than dims declare (${flat.size})" }
+          flat[idx++] = l2.getDouble(c).toFloat()
+        }
+      }
+    }
+    // A short `data` array would otherwise leave the tail silently zero-filled.
+    require(idx == flat.size) { "Voice $voiceId: data holds $idx values but dims declare ${flat.size}" }
+    return OnnxTensor.createTensor(env, FloatBuffer.wrap(flat), dims)
+  }
+}

package/build/engines/BufferedStreamEmitter.d.ts ADDED Viewed

@@ -0,0 +1,26 @@
+/**
+ * Per-stream emitter that buffers events emitted before any listener attaches.
+ *
+ * The native side of `stream()` starts producing chunks the moment we kick it
+ * off, but the JS caller usually attaches `.on('chunk')` immediately after —
+ * there's a small async gap. Without buffering, early chunks would be silently
+ * dropped. Once a listener is attached, queued events drain to it.
+ *
+ * This class is exported separately from SupertonicEngine so it can be unit
+ * tested without needing the native module to load.
+ */
+export declare class BufferedStreamEmitter {
+    /** Listeners registered for 'chunk' events. */
+    private chunkListeners;
+    /** Listeners registered for 'end' events. */
+    private endListeners;
+    /** Listeners registered for 'error' events. */
+    private errorListeners;
+    /** Chunks emitted while no 'chunk' listener was attached. */
+    private pendingChunks;
+    /** Set when 'end' was emitted while no 'end' listener was attached. */
+    private pendingEnd;
+    /** Error emitted while no 'error' listener was attached, if any. */
+    private pendingError;
+    /** Registers a 'chunk' listener and immediately replays any buffered chunks to it. */
+    on(event: 'chunk', listener: (pcm: Uint8Array) => void): void;
+    /** Registers an 'end' listener; fires it immediately if an 'end' is pending. */
+    on(event: 'end', listener: () => void): void;
+    /** Registers an 'error' listener; fires it immediately if an error is pending. */
+    on(event: 'error', listener: (err: Error) => void): void;
+    /** Delivers a chunk to all 'chunk' listeners, or buffers it when there are none. */
+    emitChunk(pcm: Uint8Array): void;
+    /** Notifies all 'end' listeners, or records a pending end when there are none. */
+    emitEnd(): void;
+    /** Delivers an error to all 'error' listeners, or stores it when there are none. */
+    emitError(err: Error): void;
+}
+//# sourceMappingURL=BufferedStreamEmitter.d.ts.map

package/build/engines/BufferedStreamEmitter.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"BufferedStreamEmitter.d.ts","sourceRoot":"","sources":["../../src/engines/BufferedStreamEmitter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AACH,qBAAa,qBAAqB;IAChC,OAAO,CAAC,cAAc,CAAwC;IAC9D,OAAO,CAAC,YAAY,CAAyB;IAC7C,OAAO,CAAC,cAAc,CAAmC;IACzD,OAAO,CAAC,aAAa,CAAoB;IACzC,OAAO,CAAC,UAAU,CAAS;IAC3B,OAAO,CAAC,YAAY,CAAsB;IAE1C,EAAE,CAAC,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,GAAG,EAAE,UAAU,KAAK,IAAI,GAAG,IAAI;IAC7D,EAAE,CAAC,KAAK,EAAE,KAAK,EAAE,QAAQ,EAAE,MAAM,IAAI,GAAG,IAAI;IAC5C,EAAE,CAAC,KAAK,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,GAAG,EAAE,KAAK,KAAK,IAAI,GAAG,IAAI;IAgBxD,SAAS,CAAC,GAAG,EAAE,UAAU,GAAG,IAAI;IAIhC,OAAO,IAAI,IAAI;IAIf,SAAS,CAAC,GAAG,EAAE,KAAK,GAAG,IAAI;CAI5B"}

package/build/engines/BufferedStreamEmitter.js ADDED Viewed

@@ -0,0 +1,68 @@
+/**
+ * Per-stream emitter that buffers events emitted before any listener attaches.
+ *
+ * The native side of `stream()` starts producing chunks the moment we kick it
+ * off, but the JS caller usually attaches `.on('chunk')` immediately after —
+ * there's a small async gap. Without buffering, early chunks would be silently
+ * dropped. Once a listener is attached, queued events drain to it.
+ *
+ * This class is exported separately from SupertonicEngine so it can be unit
+ * tested without needing the native module to load.
+ */
+export class BufferedStreamEmitter {
+    chunkListeners = [];
+    endListeners = [];
+    errorListeners = [];
+    pendingChunks = [];
+    pendingEnd = false;
+    pendingError = null;
+    on(event, listener) {
+        if (event === 'chunk') {
+            this.chunkListeners.push(listener);
+            const drained = this.pendingChunks;
+            this.pendingChunks = [];
+            for (const pcm of drained)
+                listener(pcm);
+        }
+        else if (event === 'end') {
+            this.endListeners.push(listener);
+            if (this.pendingEnd) {
+                this.pendingEnd = false;
+                listener();
+            }
+        }
+        else if (event === 'error') {
+            this.errorListeners.push(listener);
+            if (this.pendingError) {
+                const e = this.pendingError;
+                this.pendingError = null;
+                listener(e);
+            }
+        }
+    }
+    emitChunk(pcm) {
+        if (this.chunkListeners.length === 0) {
+            this.pendingChunks.push(pcm);
+            return;
+        }
+        for (const l of this.chunkListeners)
+            l(pcm);
+    }
+    emitEnd() {
+        if (this.endListeners.length === 0) {
+            this.pendingEnd = true;
+            return;
+        }
+        for (const l of this.endListeners)
+            l();
+    }
+    emitError(err) {
+        if (this.errorListeners.length === 0) {
+            this.pendingError = err;
+            return;
+        }
+        for (const l of this.errorListeners)
+            l(err);
+    }
+}
+//# sourceMappingURL=BufferedStreamEmitter.js.map

package/build/engines/BufferedStreamEmitter.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"BufferedStreamEmitter.js","sourceRoot":"","sources":["../../src/engines/BufferedStreamEmitter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AACH,MAAM,OAAO,qBAAqB;IACxB,cAAc,GAAqC,EAAE,CAAC;IACtD,YAAY,GAAsB,EAAE,CAAC;IACrC,cAAc,GAAgC,EAAE,CAAC;IACjD,aAAa,GAAiB,EAAE,CAAC;IACjC,UAAU,GAAG,KAAK,CAAC;IACnB,YAAY,GAAiB,IAAI,CAAC;IAK1C,EAAE,CAAC,KAAgC,EAAE,QAAkC;QACrE,IAAI,KAAK,KAAK,OAAO,EAAE,CAAC;YACtB,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACnC,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC;YACnC,IAAI,CAAC,aAAa,GAAG,EAAE,CAAC;YACxB,KAAK,MAAM,GAAG,IAAI,OAAO;gBAAE,QAAQ,CAAC,GAAG,CAAC,CAAC;QAC3C,CAAC;aAAM,IAAI,KAAK,KAAK,KAAK,EAAE,CAAC;YAC3B,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACjC,IAAI,IAAI,CAAC,UAAU,EAAE,CAAC;gBAAC,IAAI,CAAC,UAAU,GAAG,KAAK,CAAC;gBAAC,QAAQ,EAAE,CAAC;YAAC,CAAC;QAC/D,CAAC;aAAM,IAAI,KAAK,KAAK,OAAO,EAAE,CAAC;YAC7B,IAAI,CAAC,cAAc,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACnC,IAAI,IAAI,CAAC,YAAY,EAAE,CAAC;gBAAC,MAAM,CAAC,GAAG,IAAI,CAAC,YAAY,CAAC;gBAAC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC;gBAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;YAAC,CAAC;QAChG,CAAC;IACH,CAAC;IAED,SAAS,CAAC,GAAe;QACvB,IAAI,IAAI,CAAC,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAAC,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAAC,OAAO;QAAC,CAAC;QAC/E,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,cAAc;YAAE,CAAC,CAAC,GAAG,CAAC,CAAC;IAC9C,CAAC;IACD,OAAO;QACL,IAAI,IAAI,CAAC,YAAY,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAAC,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC;YAAC,OAAO;QAAC,CAAC;QACvE,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,YAAY;YAAE,CAAC,EAAE,CAAC;IACzC,CAAC;IACD,SAAS,CAAC,GAAU;QAClB,IAAI,IAAI,CAAC,cAAc,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAAC,IAAI,CAAC,YAAY,GAAG,GAAG,CAAC;YAAC,OAAO;QAAC,CAAC;QAC1E,KAAK,MAAM,CAAC,IAAI,IAAI,CAAC,cAAc;YAAE,CAAC,CAAC,GAAG,CAAC,CAAC;IAC9C,CAAC;CACF","sourcesContent":["/**\n * Per-stream emitter that buffers events emitted before any listener attaches.\n *\n * The native side of `stream()` starts producing chunks the moment we kick it\n * off, but the JS caller usually attaches `.on('chunk')` immediately after —\n * there's a 
small async gap. Without buffering, early chunks would be silently\n * dropped. Once a listener is attached, queued events drain to it.\n *\n * This class is exported separately from SupertonicEngine so it can be unit\n * tested without needing the native module to load.\n */\nexport class BufferedStreamEmitter {\n private chunkListeners: Array<(pcm: Uint8Array) => void> = [];\n private endListeners: Array<() => void> = [];\n private errorListeners: Array<(err: Error) => void> = [];\n private pendingChunks: Uint8Array[] = [];\n private pendingEnd = false;\n private pendingError: Error | null = null;\n\n on(event: 'chunk', listener: (pcm: Uint8Array) => void): void;\n on(event: 'end', listener: () => void): void;\n on(event: 'error', listener: (err: Error) => void): void;\n on(event: 'chunk' | 'end' | 'error', listener: (...args: any[]) => void): void {\n if (event === 'chunk') {\n this.chunkListeners.push(listener);\n const drained = this.pendingChunks;\n this.pendingChunks = [];\n for (const pcm of drained) listener(pcm);\n } else if (event === 'end') {\n this.endListeners.push(listener);\n if (this.pendingEnd) { this.pendingEnd = false; listener(); }\n } else if (event === 'error') {\n this.errorListeners.push(listener);\n if (this.pendingError) { const e = this.pendingError; this.pendingError = null; listener(e); }\n }\n }\n\n emitChunk(pcm: Uint8Array): void {\n if (this.chunkListeners.length === 0) { this.pendingChunks.push(pcm); return; }\n for (const l of this.chunkListeners) l(pcm);\n }\n emitEnd(): void {\n if (this.endListeners.length === 0) { this.pendingEnd = true; return; }\n for (const l of this.endListeners) l();\n }\n emitError(err: Error): void {\n if (this.errorListeners.length === 0) { this.pendingError = err; return; }\n for (const l of this.errorListeners) l(err);\n }\n}\n"]}