react-native-tts-kit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/ATTRIBUTIONS.md +87 -0
  2. package/LICENSE +21 -0
  3. package/README.md +231 -0
  4. package/android/build.gradle +50 -0
  5. package/android/src/main/AndroidManifest.xml +3 -0
  6. package/android/src/main/java/expo/modules/ttskit/RNTTSKitModule.kt +158 -0
  7. package/android/src/main/java/expo/modules/ttskit/supertonic/AudioEngine.kt +158 -0
  8. package/android/src/main/java/expo/modules/ttskit/supertonic/ModelLocator.kt +372 -0
  9. package/android/src/main/java/expo/modules/ttskit/supertonic/SupertonicSession.kt +373 -0
  10. package/android/src/main/java/expo/modules/ttskit/supertonic/TextFrontend.kt +154 -0
  11. package/android/src/main/java/expo/modules/ttskit/supertonic/VoicePack.kt +47 -0
  12. package/build/engines/BufferedStreamEmitter.d.ts +26 -0
  13. package/build/engines/BufferedStreamEmitter.d.ts.map +1 -0
  14. package/build/engines/BufferedStreamEmitter.js +68 -0
  15. package/build/engines/BufferedStreamEmitter.js.map +1 -0
  16. package/build/engines/Engine.d.ts +15 -0
  17. package/build/engines/Engine.d.ts.map +1 -0
  18. package/build/engines/Engine.js +2 -0
  19. package/build/engines/Engine.js.map +1 -0
  20. package/build/engines/SupertonicEngine.d.ts +14 -0
  21. package/build/engines/SupertonicEngine.d.ts.map +1 -0
  22. package/build/engines/SupertonicEngine.js +183 -0
  23. package/build/engines/SupertonicEngine.js.map +1 -0
  24. package/build/engines/SystemEngine.d.ts +13 -0
  25. package/build/engines/SystemEngine.d.ts.map +1 -0
  26. package/build/engines/SystemEngine.js +78 -0
  27. package/build/engines/SystemEngine.js.map +1 -0
  28. package/build/index.d.ts +46 -0
  29. package/build/index.d.ts.map +1 -0
  30. package/build/index.js +118 -0
  31. package/build/index.js.map +1 -0
  32. package/build/types.d.ts +77 -0
  33. package/build/types.d.ts.map +1 -0
  34. package/build/types.js +2 -0
  35. package/build/types.js.map +1 -0
  36. package/build/voices/catalog.d.ts +12 -0
  37. package/build/voices/catalog.d.ts.map +1 -0
  38. package/build/voices/catalog.js +28 -0
  39. package/build/voices/catalog.js.map +1 -0
  40. package/build/voices/prosody.d.ts +8 -0
  41. package/build/voices/prosody.d.ts.map +1 -0
  42. package/build/voices/prosody.js +28 -0
  43. package/build/voices/prosody.js.map +1 -0
  44. package/expo-module.config.json +9 -0
  45. package/ios/RNTTSKit.podspec +28 -0
  46. package/ios/RNTTSKitModule.swift +133 -0
  47. package/ios/Supertonic/AudioEngine.swift +110 -0
  48. package/ios/Supertonic/ModelLocator.swift +416 -0
  49. package/ios/Supertonic/SupertonicSession.swift +405 -0
  50. package/ios/Supertonic/TextFrontend.swift +216 -0
  51. package/ios/Supertonic/VoicePack.swift +51 -0
  52. package/licenses/OpenRAIL-M.txt +209 -0
  53. package/package.json +77 -0
  54. package/src/engines/BufferedStreamEmitter.ts +50 -0
  55. package/src/engines/Engine.ts +28 -0
  56. package/src/engines/SupertonicEngine.ts +250 -0
  57. package/src/engines/SystemEngine.ts +96 -0
  58. package/src/engines/__tests__/BufferedStreamEmitter.test.ts +65 -0
  59. package/src/index.ts +156 -0
  60. package/src/types.ts +95 -0
  61. package/src/voices/__tests__/catalog.test.ts +46 -0
  62. package/src/voices/__tests__/prosody.test.ts +63 -0
  63. package/src/voices/catalog.ts +32 -0
  64. package/src/voices/prosody.ts +39 -0
package/ATTRIBUTIONS.md ADDED
@@ -0,0 +1,87 @@
+ # Attributions
+
+ `react-native-tts-kit` ships and depends on the following components.
+
+ ---
+
+ ## Model weights — Supertonic-3 (BigScience OpenRAIL-M)
+
+ - **Original source:** https://huggingface.co/Supertone/supertonic-3
+   - Pinned upstream commit: `724fb5abbf5502583fb520898d45929e62f02c0b` (2026-04-29 release)
+ - **Mirror used by this package:** https://huggingface.co/ahk-d/supertonic-3
+   - Pinned mirror commit: `5024985bc861c2ae97ef9038dc2fc56f410e95be`
+   - Byte-identical copy of the upstream weights, hosted as a redistribution
+     safeguard (OpenRAIL-M Section III §4 explicitly permits redistribution).
+   - The package downloads from the mirror first; if it's unreachable, it falls
+     back to the upstream Supertone repo at the pinned upstream commit.
+ - **Copyright:** © 2025 Supertone Inc.
+ - **License:** [BigScience OpenRAIL-M](./licenses/OpenRAIL-M.txt) (full text shipped under `licenses/`)
+
+ ### What you can do
+ - Use the weights in commercial apps, royalty-free.
+ - Redistribute, fine-tune, and derive new models.
+ - Modify the `.onnx` files (marking them as modified per Section III §4(c)).
+
+ ### What you cannot do (Attachment A — Use Restrictions)
+ You may not use the model, or any model derived from it, to:
+ 1. Violate any law.
+ 2. Exploit or harm minors.
+ 3. Generate or spread verifiably false information intended to harm.
+ 4. Generate or spread personally identifiable information to harm someone.
+ 5. Generate AI content without **clearly disclosing that it is AI-generated**.
+ 6. Defame, disparage, or harass.
+ 7. **Impersonate someone (e.g. deepfakes) without their consent.**
+ 8. Make fully automated decisions affecting a person's legal rights.
+ 9. Discriminate against or harm groups based on protected characteristics.
+ 10. Exploit vulnerable populations.
+ 11. Provide medical advice or interpret medical results.
+ 12. Use it for law-enforcement, immigration, or asylum prediction.
+
+ ### What you must do when distributing this package or apps built with it
+ Per OpenRAIL-M Section III §4:
+ - (a) Bind your end users to the same use restrictions in your ToS or license.
+ - (b) Ship a copy of the OpenRAIL-M license with the model (we do — see `licenses/`).
+ - (c) Mark any modified model files. (We don't modify the `.onnx` files.)
+ - (d) Preserve Supertone's copyright and attribution notices. (We do.)
+
+ > **Practical guidance for apps shipping this package:** add a line to your ToS / "About" screen along the lines of:
+ > *"This app uses Supertone's Supertonic-3 model under the BigScience OpenRAIL-M License. Your use of this app's voice features is subject to the OpenRAIL-M Use Restrictions, which prohibit impersonation without consent, generation of misleading content, and other harmful uses."*
+
+ ---
+
+ ## Source-code reference — Supertone/supertonic GitHub
+
+ We **do not vendor** Supertone's code. Our iOS Swift inference and Android
+ Kotlin port ([`ios/Supertonic/`](./ios/Supertonic/), [`android/.../supertonic/`](./android/src/main/java/expo/modules/ttskit/supertonic/))
+ were written from scratch using the upstream Python and Swift references as a
+ specification:
+
+ - **Source:** https://github.com/supertone-inc/supertonic
+ - **Copyright:** © 2025 Supertone Inc.
+ - **License:** [MIT](https://github.com/supertone-inc/supertonic/blob/main/LICENSE)
+
+ The MIT license on the upstream code does not impose redistribution
+ obligations on our independent port; we credit Supertone here for
+ transparency and because the code is closely modeled on theirs.
+
+ ---
+
+ ## Runtime dependencies
+
+ | Package | Source | License |
+ |---|---|---|
+ | ONNX Runtime (iOS / Android) | https://github.com/microsoft/onnxruntime | MIT |
+ | Expo Modules Core | https://github.com/expo/expo | MIT |
+ | `expo-speech` (optional system engine) | https://github.com/expo/expo/tree/main/packages/expo-speech | MIT |
+ | `expo-asset`, `expo-constants`, `expo-dev-client` | https://github.com/expo/expo | MIT |
+
+ ---
+
+ ## This package
+
+ - **Code license:** MIT — see [`LICENSE`](./LICENSE)
+ - **Copyright:** © 2026 ahk-d
+
+ The MIT license on this repository covers only the code in `src/`, `ios/`,
+ `android/`, `example/`, and `benchmarks/`. The Supertonic-3 model weights
+ downloaded at runtime remain under the OpenRAIL-M license described above.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 ahk-d
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,231 @@
+ # react-native-tts-kit
+
+ <p align="center">
+   <img src="docs/thumbnail.png" alt="react-native-tts-kit — Neural TTS for React Native · On-device · 31 languages" width="1024">
+ </p>
+
+ **Neural text-to-speech for React Native and Expo. On-device. Sub-100ms. 31 languages.**
+
+ ```ts
+ import TTSKit from 'react-native-tts-kit';
+
+ await TTSKit.speak('Hello, world.');
+ ```
+
+ No API keys. No network. No robotic system voice.
+
+ > **Status:** v0.1 alpha. iOS verified on iPhone (iOS 26+). Android is scaffolded but not yet validated on a real device. Feedback via [GitHub issues](https://github.com/ahk-d/react-native-tts-kit/issues).
+
+ ---
+
+ ## Why
+
+ | | Quality | Offline | Cost | Languages |
+ |---|---|---|---|---|
+ | `expo-speech` (system) | robotic | ✅ | free | OS-bound |
+ | ElevenLabs / OpenAI TTS | excellent | ❌ | per-request | 30+ |
+ | **`react-native-tts-kit`** | **neural** | **✅** | **free** | **31** |
+
+ There was no good answer for on-device neural TTS in React Native. We needed it for [Flowent](https://getflowent.com), so we built it and open-sourced it.
+
+ ---
+
+ ## Install
+
+ ```bash
+ npx expo install react-native-tts-kit
+ npx expo prebuild --platform ios
+ cd ios && pod install && cd ..
+ npx expo run:ios --device
+ ```
+
+ Bare React Native: same flow; just install `expo-modules-core` as a peer dependency.
+
+ > Custom native code can't run inside Expo Go — use a dev build. This is the standard Expo workflow in 2026.
+
+ ---
+
+ ## Use
+
+ ```ts
+ import TTSKit from 'react-native-tts-kit';
+
+ // Default: F1 voice, English
+ await TTSKit.speak('Hello, world.');
+
+ // Pair any voice with any of 31 languages
+ await TTSKit.speak('Bonjour le monde', { voice: 'F1', language: 'fr' });
+ await TTSKit.speak('こんにちは', { voice: 'M2', language: 'ja' });
+ await TTSKit.speak('안녕하세요', { voice: 'F3', language: 'ko' });
+
+ // Stream long text — first audio arrives before synthesis finishes
+ const stream = TTSKit.stream(longArticle, { language: 'en' });
+ stream.on('chunk', (pcm) => { /* PCM16LE @ 44.1 kHz */ });
+ stream.on('end', () => {});
+
+ // Stop in-flight synthesis
+ await TTSKit.stop();
+ ```
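+
+ The `chunk` handler above is where you would buffer or forward the audio. A minimal sketch of doing that, assuming each chunk arrives as a `Uint8Array` of PCM16LE mono bytes at 44.1 kHz (the exact chunk type isn't pinned down in this README, so treat the handler body as illustrative):
+
+ ```ts
+ import TTSKit from 'react-native-tts-kit';
+
+ const longArticle = 'First sentence. Second sentence. Third sentence.';
+ const chunks: Uint8Array[] = [];
+
+ const stream = TTSKit.stream(longArticle, { voice: 'F1', language: 'en' });
+
+ stream.on('chunk', (pcm: Uint8Array) => {
+   chunks.push(pcm);
+   // PCM16LE mono: 2 bytes per sample, 44,100 samples per second.
+   const bytes = chunks.reduce((sum, c) => sum + c.byteLength, 0);
+   console.log(`buffered ${(bytes / 2 / 44100).toFixed(1)} s of audio`);
+ });
+
+ stream.on('end', () => {
+   console.log(`synthesis finished — ${chunks.length} chunks`);
+ });
+ ```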
+
+ ### First-launch UX
+
+ The model is ~210 MB (the multilingual split — 31 languages, fp16 weights) and downloads on first use. Call `prefetchModel()` from a settings screen so users aren't surprised mid-conversation:
+
+ ```ts
+ await TTSKit.prefetchModel((p) => {
+   setProgress(p.percent); // 0–100
+ });
+ ```
+
+ Once downloaded, all calls run fully offline — no further network access is ever needed.
+
+ ### Voices
+
+ 10 voices, all language-agnostic. Pair any voice with any language.
+
+ | Gender | Voices |
+ |---|---|
+ | Male | M1, M2, M3, M4, M5 |
+ | Female | F1, F2, F3, F4, F5 |
+
+ ```ts
+ const voices = await TTSKit.getVoices();
+ // [{ id: 'F1', name: 'F1', gender: 'female', engine: 'supertonic' }, ...]
+ ```
+
+ ### Languages (31, verified on-device)
+
+ All 31 languages produce intelligible, neural-quality audio on iPhone.
+ The example app ([`example/App.tsx`](example/App.tsx)) ships a tappable
+ sample sentence for every one of them.
+
+ | | | | | |
+ |---|---|---|---|---|
+ | Arabic (`ar`) | Bulgarian (`bg`) | Czech (`cs`) | Danish (`da`) | German (`de`) |
+ | Greek (`el`) | English (`en`) | Spanish (`es`) | Estonian (`et`) | Finnish (`fi`) |
+ | French (`fr`) | Hindi (`hi`) | Croatian (`hr`) | Hungarian (`hu`) | Indonesian (`id`) |
+ | Italian (`it`) | Japanese (`ja`) | Korean (`ko`) | Lithuanian (`lt`) | Latvian (`lv`) |
+ | Dutch (`nl`) | Polish (`pl`) | Portuguese (`pt`) | Romanian (`ro`) | Russian (`ru`) |
+ | Slovak (`sk`) | Slovenian (`sl`) | Swedish (`sv`) | Turkish (`tr`) | Ukrainian (`uk`) |
+ | Vietnamese (`vi`) | | | | |
+
+ ```ts
+ import { SUPERTONIC_LANGUAGES } from 'react-native-tts-kit';
+
+ await TTSKit.speak('こんにちは', { voice: 'F1', language: 'ja' });
+ await TTSKit.speak('Привет', { voice: 'M2', language: 'ru' });
+ await TTSKit.speak('नमस्ते', { voice: 'F3', language: 'hi' });
+ ```
+
+ Voices are language-agnostic — any voice can speak any language.
+
+ ### Engines
+
+ ```ts
+ TTSKit.setEngine('supertonic'); // default — neural, on-device
+ TTSKit.setEngine('system');     // expo-speech fallback (robotic but free)
+ ```
+
+ | Engine | Status | Use when |
+ |---|---|---|
+ | `supertonic` | ✅ shipping | Production neural TTS |
+ | `system` | ✅ shipping | Fallback, no model download |
+ | `neutts` (voice cloning) | ⏳ v1.1 | 3-second voice clone |
+ | `cloud:eleven` / `cloud:openai` | ⏳ v1.2 | When you want premium quality |
+
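+ One way to combine the two shipping engines: use the system voice until the neural model has been prefetched, then switch. A sketch of that policy using only the API shown above (the `neuralReady` flag is app state, not part of the package):
+
+ ```ts
+ import TTSKit from 'react-native-tts-kit';
+
+ let neuralReady = false;
+
+ // Call this from a settings screen or on app start.
+ export async function warmUpNeuralVoice() {
+   await TTSKit.prefetchModel((p) => console.log(`model ${p.percent.toFixed(0)}%`));
+   neuralReady = true;
+ }
+
+ // Speak with the best engine currently available.
+ export async function speakSmart(text: string, language = 'en') {
+   TTSKit.setEngine(neuralReady ? 'supertonic' : 'system');
+   await TTSKit.speak(text, { language });
+ }
+ ```
+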
+ ---
+
+ ## Performance
+
+ Measured on iPhone (iOS 26.4, Debug build):
+
+ | Metric | Value |
+ |---|---|
+ | Time to first audio (1 sentence) | ~70 ms |
+ | Real-time factor | < 0.5× |
+ | Cold start (first speak after launch) | ~1–2 s |
+ | Memory (peak during synthesis) | ~250 MB |
+
+ A real-time factor below 0.5× means 10 seconds of audio synthesizes in under 5 seconds. A reproducible benchmark harness lives in [`benchmarks/`](benchmarks/); numbers across more devices (iPhone 14, Pixel 8, mid-tier Android) are coming with v1.0.
+
+ ---
+
+ ## Privacy
+
+ Text passed to `speak()` and `stream()` is processed entirely on-device. Once the model is downloaded (one time, on the first `prefetchModel()` call), **no text or audio crosses the network at runtime** — synthesis runs in the local ONNX session and audio plays through the platform audio engine.
+
+ The only network activity this package performs is the initial model download from Hugging Face (see [`ATTRIBUTIONS.md`](ATTRIBUTIONS.md) for the endpoints). Downloads are verified against SHA-256 fingerprints baked into the package when those fingerprints are present. If you ship your app in a privacy-sensitive context, an offline mode (e.g. bundling the model file with your app's assets) is also supported — see [CONTRIBUTING.md](CONTRIBUTING.md).
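+
+ If you want that one-time download to happen only on Wi-Fi, gate the prefetch on the current connection. A sketch assuming your app already uses `@react-native-community/netinfo` (it is not a dependency of this package):
+
+ ```ts
+ import NetInfo from '@react-native-community/netinfo';
+ import TTSKit from 'react-native-tts-kit';
+
+ // Pull the ~210 MB model only over Wi-Fi; otherwise let the caller decide
+ // whether to prompt the user or stay on the system engine for now.
+ export async function prefetchModelOnWifi(): Promise<boolean> {
+   const state = await NetInfo.fetch();
+   if (state.type !== 'wifi' || !state.isConnected) {
+     return false;
+   }
+   await TTSKit.prefetchModel((p) => {
+     console.log(`model download: ${p.percent.toFixed(0)}%`);
+   });
+   return true;
+ }
+ ```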
+
+ ---
+
+ ## Architecture
+
+ ```
+ your app
+    ↓
+ TTSKit — public API ([src/index.ts](src/index.ts))
+    ↓
+ SupertonicEngine — JS wrapper, listens for native events
+    ↓
+ RNTTSKitModule — Expo Module bridge ([ios/](ios/), [android/](android/))
+    ↓
+ 4 ONNX sessions — duration_predictor → text_encoder
+                   → vector_estimator (×8 denoising)
+                   → vocoder
+    ↓
+ Float32 PCM @ 44.1 kHz — AVAudioEngine / AudioTrack playback
+ ```
+
+ Model weights: [Supertonic-3](https://huggingface.co/Supertone/supertonic-3) (99M params, OpenRAIL-M). We host a [pinned mirror](https://huggingface.co/ahk-d/supertonic-3) for resilience.
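+
+ The four stages run natively (Swift / Kotlin) against ONNX Runtime, but the dataflow is easy to express in TypeScript. The sketch below is structural only — the stage interfaces, tensor types, and the zero-filled initial latent are placeholders; the real session code lives in `ios/Supertonic/SupertonicSession.swift` and its Kotlin counterpart:
+
+ ```ts
+ // Illustrative dataflow for one synthesis pass, mirroring the diagram above.
+ // Each method stands in for an ONNX session call.
+ interface SupertonicStages {
+   durationPredictor(tokens: Int32Array, style: Float32Array): Promise<Float32Array>;
+   textEncoder(tokens: Int32Array, durations: Float32Array): Promise<Float32Array>;
+   vectorEstimator(latent: Float32Array, cond: Float32Array, step: number): Promise<Float32Array>;
+   vocoder(latent: Float32Array): Promise<Float32Array>; // Float32 PCM @ 44.1 kHz
+ }
+
+ async function synthesize(
+   stages: SupertonicStages,
+   tokens: Int32Array,
+   voiceStyle: Float32Array,
+   denoisingSteps = 8,
+ ): Promise<Float32Array> {
+   const durations = await stages.durationPredictor(tokens, voiceStyle);
+   const encoded = await stages.textEncoder(tokens, durations);
+
+   // Iterative refinement: the vector estimator runs 8 times over the latent.
+   // A zero-filled seed is a placeholder here; the real pipeline starts from noise.
+   let latent = new Float32Array(encoded.length);
+   for (let step = 0; step < denoisingSteps; step++) {
+     latent = await stages.vectorEstimator(latent, encoded, step);
+   }
+
+   // The vocoder's PCM is handed to AVAudioEngine / AudioTrack for playback.
+   return stages.vocoder(latent);
+ }
+ ```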
+
+ ---
+
+ ## License
+
+ | Component | License |
+ |---|---|
+ | This package | MIT |
+ | Supertonic-3 model weights | [BigScience OpenRAIL-M](licenses/OpenRAIL-M.txt) |
+ | ONNX Runtime, Expo Modules | MIT |
+
+ **If you ship this in your app**, OpenRAIL-M requires you to:
+
+ 1. Bind your end users to the [Use Restrictions](ATTRIBUTIONS.md#what-you-cannot-do-attachment-a--use-restrictions) — most importantly, **no impersonation/deepfakes without consent** and **AI-generated audio must be disclosed**.
+ 2. Ship a copy of [`licenses/OpenRAIL-M.txt`](licenses/OpenRAIL-M.txt) with your app.
+ 3. Preserve the Supertone copyright notice.
+
+ A two-line ToS clause covers it. See [ATTRIBUTIONS.md](ATTRIBUTIONS.md) for boilerplate.
+
+ ---
+
+ ## Roadmap
+
+ - **v1.0** — Android validation, benchmarks across 4+ devices, cleaner first-launch UX
+ - **v1.1** — Voice cloning via [NeuTTS Air](https://huggingface.co/neuphonic/neutts-air) (3-second sample → cloned voice)
+ - **v1.2** — Cloud engine adapters (ElevenLabs, OpenAI, Cartesia) behind the same API
+ - **v2.0** — `@ttskit/web` — same API in the browser, WASM/WebGPU
+
+ ---
+
+ ## FAQ
+
+ **How big is the model?** ~210 MB at fp16 (the multilingual split that supports all 31 languages — `vector_estimator.onnx` alone is 138 MB because it carries the cross-lingual weights). It is downloaded once on first launch and stored in Application Support (iOS) / app-private files (Android). An fp32 fallback is available — see [tools/quantize.md](tools/quantize.md).
+
+ **Does it work in Expo Go?** No — custom native code requires a dev build. Run `npx expo prebuild` and use `npx expo run:ios --device`.
+
+ **Why isn't Chinese in the language list?** Supertonic-3's open-source weights cover 31 languages, and Mandarin isn't one of them. Use a cloud engine (v1.2) if you need it.
+
+ **Can I use a custom voice?** Voice cloning lands in v1.1. For now, you have 10 preset voices.
+
+ **Does this work in production?** It's a v0.1 alpha. The pipeline runs, but we've only validated it on one iPhone. Expect rough edges for ~2 weeks post-launch.
+
+ **How does this compare to `expo-kokoro-onnx`?** Different model (Supertonic-3 vs Kokoro-82M), more languages (31 vs 8), faster time to first audio, and no espeak-ng dependency. Both are valid choices; we chose Supertonic for Flowent's needs.
+
+ ---
+
+ ## Credits
+
+ - [Supertone Inc.](https://www.supertone.ai/) — for [Supertonic-3](https://huggingface.co/Supertone/supertonic-3), the model that does the heavy lifting
+ - [Microsoft ONNX Runtime](https://onnxruntime.ai/) — inference engine
+ - [Expo Modules](https://docs.expo.dev/modules/overview/) — native bridge
+
+ Built by [@ahk-d](https://github.com/ahk-d) for [Flowent](https://getflowent.com).
package/android/build.gradle ADDED
@@ -0,0 +1,50 @@
+ apply plugin: 'com.android.library'
+ apply plugin: 'kotlin-android'
+
+ group = 'expo.modules.ttskit'
+ version = '0.1.0'
+
+ def expoModulesCorePlugin = new File(project(":expo-modules-core").projectDir.absolutePath, "ExpoModulesCorePlugin.gradle")
+ apply from: expoModulesCorePlugin
+ applyKotlinExpoModulesCorePlugin()
+ useCoreDependencies()
+ useExpoPublishing()
+
+ android {
+   namespace "expo.modules.ttskit"
+   compileSdkVersion safeExtGet('compileSdkVersion', 34)
+
+   defaultConfig {
+     minSdkVersion safeExtGet('minSdkVersion', 24)
+     targetSdkVersion safeExtGet('targetSdkVersion', 34)
+     versionCode 1
+     versionName "0.1.0"
+   }
+
+   buildFeatures {
+     buildConfig true
+   }
+
+   compileOptions {
+     sourceCompatibility JavaVersion.VERSION_17
+     targetCompatibility JavaVersion.VERSION_17
+   }
+
+   kotlinOptions {
+     jvmTarget = '17'
+   }
+
+   packagingOptions {
+     pickFirst '**/libonnxruntime.so'
+   }
+ }
+
+ dependencies {
+   implementation "com.microsoft.onnxruntime:onnxruntime-android:1.18.0"
+   implementation "org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3"
+   implementation "org.jetbrains.kotlinx:kotlinx-coroutines-android:1.7.3"
+ }
+
+ def safeExtGet(prop, fallback) {
+   rootProject.ext.has(prop) ? rootProject.ext.get(prop) : fallback
+ }
package/android/src/main/AndroidManifest.xml ADDED
@@ -0,0 +1,3 @@
+ <manifest xmlns:android="http://schemas.android.com/apk/res/android">
+   <uses-permission android:name="android.permission.INTERNET" />
+ </manifest>
package/android/src/main/java/expo/modules/ttskit/RNTTSKitModule.kt ADDED
@@ -0,0 +1,158 @@
+ package expo.modules.ttskit
+
+ import android.util.Base64
+ import expo.modules.kotlin.Promise
+ import expo.modules.kotlin.modules.Module
+ import expo.modules.kotlin.modules.ModuleDefinition
+ import expo.modules.ttskit.supertonic.AudioEngine
+ import expo.modules.ttskit.supertonic.ModelLocator
+ import expo.modules.ttskit.supertonic.SupertonicSession
+ import kotlinx.coroutines.CoroutineScope
+ import kotlinx.coroutines.Dispatchers
+ import kotlinx.coroutines.Job
+ import kotlinx.coroutines.SupervisorJob
+ import kotlinx.coroutines.cancel
+ import kotlinx.coroutines.launch
+
+ class RNTTSKitModule : Module() {
+   private var session: SupertonicSession? = null
+   private val audio = AudioEngine()
+   private val scope = CoroutineScope(Dispatchers.IO + SupervisorJob())
+   private var prefetchJob: Job? = null
+
+   override fun definition() = ModuleDefinition {
+     Name("RNTTSKit")
+
+     Events(
+       "onPrefetchProgress",
+       "onStreamChunk",
+       "onStreamEnd",
+       "onStreamError",
+       "onSpeakStart",
+       "onSpeakDone"
+     )
+
+     OnCreate {
+       session = SupertonicSession(appContext.reactContext!!)
+     }
+
+     OnDestroy {
+       runCatching { session?.tearDown() }
+       session = null
+       audio.tearDown()
+       scope.cancel()
+     }
+
+     AsyncFunction("isAvailable") {
+       val ctx = appContext.reactContext ?: return@AsyncFunction false
+       ModelLocator.modelExists(ctx)
+     }
+
+     AsyncFunction("prefetch") { promise: Promise ->
+       val ctx = appContext.reactContext
+       if (ctx == null) {
+         promise.reject("CONTEXT_UNAVAILABLE", "React context unavailable", null)
+         return@AsyncFunction
+       }
+       prefetchJob?.cancel()
+       prefetchJob = scope.launch {
+         try {
+           ModelLocator.ensureModel(ctx) { downloaded, total ->
+             sendEvent("onPrefetchProgress", mapOf(
+               "bytesDownloaded" to downloaded,
+               "totalBytes" to total,
+               "percent" to if (total > 0) (downloaded.toDouble() / total * 100.0) else 0.0
+             ))
+           }
+           session?.loadIfNeeded()
+           // Pre-warm default voice so first speak() after prefetch doesn't
+           // pay JSON-decode + tensor-alloc cost.
+           session?.prewarmDefaultVoice()
+           promise.resolve(null)
+         } catch (e: Exception) {
+           promise.reject("PREFETCH_FAILED", e.message ?: "prefetch failed", e)
+         }
+       }
+     }
+
+     AsyncFunction("speak") { id: String, text: String, voiceId: String, lang: String, totalStep: Int, speed: Double, volume: Double, promise: Promise ->
+       scope.launch {
+         try {
+           val s = session ?: throw IllegalStateException("session not initialized")
+           s.loadIfNeeded()
+           // Pipe chunks through AudioTrack as they finish synthesizing rather
+           // than synthesizing the whole utterance first. With multi-sentence
+           // input this drops perceived TTFA from O(total-synthesis) to
+           // O(first-sentence). For a single short input it's the same as
+           // before. onSpeakStart fires when the first chunk hits the speaker.
+           audio.beginStream(s.sampleRate, volume.toFloat())
+           var startEmitted = false
+           s.synthesizeStreaming(text, lang, voiceId, totalStep, speed) { samples ->
+             if (!startEmitted) {
+               sendEvent("onSpeakStart", mapOf("id" to id))
+               startEmitted = true
+             }
+             audio.feedStream(samples)
+           }
+           audio.endStream()
+           sendEvent("onSpeakDone", mapOf("id" to id))
+           promise.resolve(null)
+         } catch (e: Exception) {
+           audio.endStream()
+           promise.reject("SYNTHESIS_FAILED", e.message ?: "synthesis failed", e)
+         }
+       }
+     }
+
+     AsyncFunction("stream") { id: String, text: String, voiceId: String, lang: String, totalStep: Int, speed: Double, volume: Double, promise: Promise ->
+       scope.launch {
+         val s = session ?: run {
+           promise.reject("MODEL_NOT_LOADED", "session not initialized", null)
+           return@launch
+         }
+         try {
+           s.loadIfNeeded()
+           audio.beginStream(s.sampleRate, volume.toFloat())
+           s.synthesizeStreaming(text, lang, voiceId, totalStep, speed) { samples ->
+             val pcm16 = SupertonicSession.toPcm16(samples)
+             val b64 = Base64.encodeToString(pcm16, Base64.NO_WRAP)
+             sendEvent("onStreamChunk", mapOf("id" to id, "pcm" to b64))
+             audio.feedStream(samples)
+           }
+           audio.endStream()
+           sendEvent("onStreamEnd", mapOf("id" to id))
+           promise.resolve(null)
+         } catch (e: Exception) {
+           audio.endStream()
+           sendEvent("onStreamError", mapOf("id" to id, "message" to (e.message ?: "stream failed")))
+           promise.reject("SYNTHESIS_FAILED", e.message ?: "stream failed", e)
+         }
+       }
+     }
+
+     AsyncFunction("stop") { promise: Promise ->
+       audio.stop()
+       session?.cancel()
+       promise.resolve(null)
+     }
+
+     AsyncFunction("clearCache") { promise: Promise ->
+       // Tear down the loaded ONNX sessions before deleting the files they
+       // reference. Otherwise the next loadIfNeeded() would short-circuit
+       // (isReady == true) and skip re-loading from disk. Then build a fresh
+       // session so subsequent prefetch/speak calls have a non-null target.
+       val ctx = appContext.reactContext
+       prefetchJob?.cancel()
+       audio.stop()
+       runCatching { session?.cancel() }
+       runCatching { session?.tearDown() }
+       if (ctx != null) {
+         ModelLocator.clearCache(ctx)
+         session = SupertonicSession(ctx)
+       } else {
+         session = null
+       }
+       promise.resolve(null)
+     }
+   }
+ }