react-native-tts-kit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ATTRIBUTIONS.md +87 -0
- package/LICENSE +21 -0
- package/README.md +231 -0
- package/android/build.gradle +50 -0
- package/android/src/main/AndroidManifest.xml +3 -0
- package/android/src/main/java/expo/modules/ttskit/RNTTSKitModule.kt +158 -0
- package/android/src/main/java/expo/modules/ttskit/supertonic/AudioEngine.kt +158 -0
- package/android/src/main/java/expo/modules/ttskit/supertonic/ModelLocator.kt +372 -0
- package/android/src/main/java/expo/modules/ttskit/supertonic/SupertonicSession.kt +373 -0
- package/android/src/main/java/expo/modules/ttskit/supertonic/TextFrontend.kt +154 -0
- package/android/src/main/java/expo/modules/ttskit/supertonic/VoicePack.kt +47 -0
- package/build/engines/BufferedStreamEmitter.d.ts +26 -0
- package/build/engines/BufferedStreamEmitter.d.ts.map +1 -0
- package/build/engines/BufferedStreamEmitter.js +68 -0
- package/build/engines/BufferedStreamEmitter.js.map +1 -0
- package/build/engines/Engine.d.ts +15 -0
- package/build/engines/Engine.d.ts.map +1 -0
- package/build/engines/Engine.js +2 -0
- package/build/engines/Engine.js.map +1 -0
- package/build/engines/SupertonicEngine.d.ts +14 -0
- package/build/engines/SupertonicEngine.d.ts.map +1 -0
- package/build/engines/SupertonicEngine.js +183 -0
- package/build/engines/SupertonicEngine.js.map +1 -0
- package/build/engines/SystemEngine.d.ts +13 -0
- package/build/engines/SystemEngine.d.ts.map +1 -0
- package/build/engines/SystemEngine.js +78 -0
- package/build/engines/SystemEngine.js.map +1 -0
- package/build/index.d.ts +46 -0
- package/build/index.d.ts.map +1 -0
- package/build/index.js +118 -0
- package/build/index.js.map +1 -0
- package/build/types.d.ts +77 -0
- package/build/types.d.ts.map +1 -0
- package/build/types.js +2 -0
- package/build/types.js.map +1 -0
- package/build/voices/catalog.d.ts +12 -0
- package/build/voices/catalog.d.ts.map +1 -0
- package/build/voices/catalog.js +28 -0
- package/build/voices/catalog.js.map +1 -0
- package/build/voices/prosody.d.ts +8 -0
- package/build/voices/prosody.d.ts.map +1 -0
- package/build/voices/prosody.js +28 -0
- package/build/voices/prosody.js.map +1 -0
- package/expo-module.config.json +9 -0
- package/ios/RNTTSKit.podspec +28 -0
- package/ios/RNTTSKitModule.swift +133 -0
- package/ios/Supertonic/AudioEngine.swift +110 -0
- package/ios/Supertonic/ModelLocator.swift +416 -0
- package/ios/Supertonic/SupertonicSession.swift +405 -0
- package/ios/Supertonic/TextFrontend.swift +216 -0
- package/ios/Supertonic/VoicePack.swift +51 -0
- package/licenses/OpenRAIL-M.txt +209 -0
- package/package.json +77 -0
- package/src/engines/BufferedStreamEmitter.ts +50 -0
- package/src/engines/Engine.ts +28 -0
- package/src/engines/SupertonicEngine.ts +250 -0
- package/src/engines/SystemEngine.ts +96 -0
- package/src/engines/__tests__/BufferedStreamEmitter.test.ts +65 -0
- package/src/index.ts +156 -0
- package/src/types.ts +95 -0
- package/src/voices/__tests__/catalog.test.ts +46 -0
- package/src/voices/__tests__/prosody.test.ts +63 -0
- package/src/voices/catalog.ts +32 -0
- package/src/voices/prosody.ts +39 -0
package/ATTRIBUTIONS.md
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Attributions
|
|
2
|
+
|
|
3
|
+
`react-native-tts-kit` ships with and depends on the following components.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Model weights — Supertonic-3 (BigScience OpenRAIL-M)
|
|
8
|
+
|
|
9
|
+
- **Original source:** https://huggingface.co/Supertone/supertonic-3
|
|
10
|
+
- Pinned upstream commit: `724fb5abbf5502583fb520898d45929e62f02c0b` (2026-04-29 release)
|
|
11
|
+
- **Mirror used by this package:** https://huggingface.co/ahk-d/supertonic-3
|
|
12
|
+
- Pinned mirror commit: `5024985bc861c2ae97ef9038dc2fc56f410e95be`
|
|
13
|
+
- Byte-identical copy of the upstream weights, hosted as a redistribution
|
|
14
|
+
safeguard (OpenRAIL-M Section III §4 explicitly permits redistribution).
|
|
15
|
+
- The package downloads from the mirror first; if it's unreachable, falls
|
|
16
|
+
back to the upstream Supertone repo at the pinned upstream commit.
|
|
17
|
+
- **Copyright:** © 2025 Supertone Inc.
|
|
18
|
+
- **License:** [BigScience OpenRAIL-M](./licenses/OpenRAIL-M.txt) (full text shipped under `licenses/`)
|
|
19
|
+
|
|
20
|
+
### What you can do
|
|
21
|
+
- Use the weights in commercial apps, no royalty.
|
|
22
|
+
- Redistribute, fine-tune, derive new models.
|
|
23
|
+
- Modify `.onnx` files (mark them as modified per Section III §4(c)).
|
|
24
|
+
|
|
25
|
+
### What you cannot do (Attachment A — Use Restrictions)
|
|
26
|
+
You may not use the model, or any model derived from it, to:
|
|
27
|
+
1. Violate any law.
|
|
28
|
+
2. Exploit or harm minors.
|
|
29
|
+
3. Generate or spread verifiably false information intended to harm.
|
|
30
|
+
4. Generate or spread personal identifiable information to harm someone.
|
|
31
|
+
5. Generate AI content without **clearly disclosing that it is AI-generated**.
|
|
32
|
+
6. Defame, disparage, or harass.
|
|
33
|
+
7. **Impersonate someone (e.g. deepfakes) without their consent.**
|
|
34
|
+
8. Make fully-automated decisions affecting a person's legal rights.
|
|
35
|
+
9. Discriminate or harm groups based on protected characteristics.
|
|
36
|
+
10. Exploit vulnerable populations.
|
|
37
|
+
11. Provide medical advice / interpret medical results.
|
|
38
|
+
12. Use for law-enforcement, immigration, or asylum prediction.
|
|
39
|
+
|
|
40
|
+
### What you must do when distributing this package or apps built with it
|
|
41
|
+
Per OpenRAIL-M Section III §4:
|
|
42
|
+
- (a) Bind your end users to the same use restrictions in your ToS or license.
|
|
43
|
+
- (b) Ship a copy of the OpenRAIL-M license with the model (we do — see `licenses/`).
|
|
44
|
+
- (c) Mark any modified model files. (We don't modify the `.onnx` files.)
|
|
45
|
+
- (d) Preserve Supertone's copyright and attribution notices. (We do.)
|
|
46
|
+
|
|
47
|
+
> **Practical guidance for apps shipping this package:** add a line to your ToS / "About" screen along the lines of:
|
|
48
|
+
> *"This app uses Supertone's Supertonic-3 model under the BigScience OpenRAIL-M License. Your use of this app's voice features is subject to the OpenRAIL-M Use Restrictions, which prohibit impersonation without consent, generation of misleading content, and other harmful uses."*
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Source-code reference — Supertone/supertonic GitHub
|
|
53
|
+
|
|
54
|
+
We **do not vendor** Supertone's code. Our iOS Swift inference and Android
|
|
55
|
+
Kotlin port ([`ios/Supertonic/`](./ios/Supertonic/), [`android/.../supertonic/`](./android/src/main/java/expo/modules/ttskit/supertonic/))
|
|
56
|
+
were written from scratch using the upstream Python and Swift references as a
|
|
57
|
+
specification:
|
|
58
|
+
|
|
59
|
+
- **Source:** https://github.com/supertone-inc/supertonic
|
|
60
|
+
- **Copyright:** © 2025 Supertone Inc.
|
|
61
|
+
- **License:** [MIT](https://github.com/supertone-inc/supertonic/blob/main/LICENSE)
|
|
62
|
+
|
|
63
|
+
The MIT license on the upstream code does not impose redistribution
|
|
64
|
+
obligations on our independent port; we credit Supertone here for
|
|
65
|
+
transparency and because the code is closely modeled on theirs.
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Runtime dependencies
|
|
70
|
+
|
|
71
|
+
| Package | Source | License |
|
|
72
|
+
|---|---|---|
|
|
73
|
+
| ONNX Runtime (iOS / Android) | https://github.com/microsoft/onnxruntime | MIT |
|
|
74
|
+
| Expo Modules Core | https://github.com/expo/expo | MIT |
|
|
75
|
+
| `expo-speech` (optional system engine) | https://github.com/expo/expo/tree/main/packages/expo-speech | MIT |
|
|
76
|
+
| `expo-asset`, `expo-constants`, `expo-dev-client` | https://github.com/expo/expo | MIT |
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## This package
|
|
81
|
+
|
|
82
|
+
- **Code license:** MIT — see [`LICENSE`](./LICENSE)
|
|
83
|
+
- **Copyright:** © 2026 ahk-d
|
|
84
|
+
|
|
85
|
+
The MIT license on this repository covers only the code in `src/`, `ios/`,
|
|
86
|
+
`android/`, `example/`, and `benchmarks/`. The Supertonic-3 model weights
|
|
87
|
+
downloaded at runtime remain under the OpenRAIL-M license described above.
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 ahk-d
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
# react-native-tts-kit
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="docs/thumbnail.png" alt="react-native-tts-kit — Neural TTS for React Native · On-device · 31 languages" width="1024">
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
**Neural text-to-speech for React Native and Expo. On-device. Sub-100ms. 31 languages.**
|
|
8
|
+
|
|
9
|
+
```ts
|
|
10
|
+
import TTSKit from 'react-native-tts-kit';
|
|
11
|
+
|
|
12
|
+
await TTSKit.speak('Hello, world.');
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
No API keys. No network. No robotic system voice.
|
|
16
|
+
|
|
17
|
+
> **Status:** v0.1 alpha. iOS verified on iPhone (iOS 26+). Android scaffolded but not yet validated on a real device. Feedback via [GitHub issues](https://github.com/ahk-d/react-native-tts-kit/issues).
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Why
|
|
22
|
+
|
|
23
|
+
| | Quality | Offline | Cost | Languages |
|
|
24
|
+
|---|---|---|---|---|
|
|
25
|
+
| `expo-speech` (system) | robotic | ✅ | free | OS-bound |
|
|
26
|
+
| ElevenLabs / OpenAI TTS | excellent | ❌ | per-request | 30+ |
|
|
27
|
+
| **`react-native-tts-kit`** | **neural** | **✅** | **free** | **31** |
|
|
28
|
+
|
|
29
|
+
There was no good answer for on-device neural TTS in React Native. We needed it for [Flowent](https://getflowent.com), so we built it and open-sourced it.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Install
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
npx expo install react-native-tts-kit
|
|
37
|
+
npx expo prebuild --platform ios
|
|
38
|
+
cd ios && pod install && cd ..
|
|
39
|
+
npx expo run:ios --device
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Bare RN: same flow, just install `expo-modules-core` as a peer dep.
|
|
43
|
+
|
|
44
|
+
> Custom native code can't run inside Expo Go — use a dev build. Standard Expo workflow in 2026.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Use
|
|
49
|
+
|
|
50
|
+
```ts
|
|
51
|
+
import TTSKit from 'react-native-tts-kit';
|
|
52
|
+
|
|
53
|
+
// Default: F1 voice, English
|
|
54
|
+
await TTSKit.speak('Hello, world.');
|
|
55
|
+
|
|
56
|
+
// Pair any voice with any of 31 languages
|
|
57
|
+
await TTSKit.speak('Bonjour le monde', { voice: 'F1', language: 'fr' });
|
|
58
|
+
await TTSKit.speak('こんにちは', { voice: 'M2', language: 'ja' });
|
|
59
|
+
await TTSKit.speak('안녕하세요', { voice: 'F3', language: 'ko' });
|
|
60
|
+
|
|
61
|
+
// Stream long text — first audio arrives before synthesis finishes
|
|
62
|
+
const stream = TTSKit.stream(longArticle, { language: 'en' });
|
|
63
|
+
stream.on('chunk', (pcm) => { /* PCM16LE @ 44.1 kHz */ });
|
|
64
|
+
stream.on('end', () => {});
|
|
65
|
+
|
|
66
|
+
// Stop in-flight synthesis
|
|
67
|
+
await TTSKit.stop();
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### First-launch UX
|
|
71
|
+
|
|
72
|
+
The model is ~210 MB (multilingual split — 31 languages, fp16 weights) and downloads on first use. Call `prefetchModel()` from a settings screen so users aren't surprised mid-conversation:
|
|
73
|
+
|
|
74
|
+
```ts
|
|
75
|
+
await TTSKit.prefetchModel((p) => {
|
|
76
|
+
setProgress(p.percent); // 0–100
|
|
77
|
+
});
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Once downloaded, all calls are instant and offline forever.
|
|
81
|
+
|
|
82
|
+
### Voices
|
|
83
|
+
|
|
84
|
+
10 voices, all language-agnostic. Pair any voice with any language.
|
|
85
|
+
|
|
86
|
+
| Male | M1, M2, M3, M4, M5 |
|
|
87
|
+
|-------|---------------------|
|
|
88
|
+
| Female | F1, F2, F3, F4, F5 |
|
|
89
|
+
|
|
90
|
+
```ts
|
|
91
|
+
const voices = await TTSKit.getVoices();
|
|
92
|
+
// [{ id: 'F1', name: 'F1', gender: 'female', engine: 'supertonic' }, ...]
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Languages (31, verified on-device)
|
|
96
|
+
|
|
97
|
+
All 31 languages produce intelligible neural-quality audio on iPhone.
|
|
98
|
+
The example app ([`example/App.tsx`](example/App.tsx)) ships a tappable
|
|
99
|
+
sample sentence for every one.
|
|
100
|
+
|
|
101
|
+
| | | | | |
|
|
102
|
+
|---|---|---|---|---|
|
|
103
|
+
| Arabic (`ar`) | Bulgarian (`bg`) | Czech (`cs`) | Danish (`da`) | German (`de`) |
|
|
104
|
+
| Greek (`el`) | English (`en`) | Spanish (`es`) | Estonian (`et`) | Finnish (`fi`) |
|
|
105
|
+
| French (`fr`) | Hindi (`hi`) | Croatian (`hr`) | Hungarian (`hu`) | Indonesian (`id`) |
|
|
106
|
+
| Italian (`it`) | Japanese (`ja`) | Korean (`ko`) | Lithuanian (`lt`) | Latvian (`lv`) |
|
|
107
|
+
| Dutch (`nl`) | Polish (`pl`) | Portuguese (`pt`) | Romanian (`ro`) | Russian (`ru`) |
|
|
108
|
+
| Slovak (`sk`) | Slovenian (`sl`) | Swedish (`sv`) | Turkish (`tr`) | Ukrainian (`uk`) |
|
|
109
|
+
| Vietnamese (`vi`) | | | | |
|
|
110
|
+
|
|
111
|
+
```ts
|
|
112
|
+
import { SUPERTONIC_LANGUAGES } from 'react-native-tts-kit';
|
|
113
|
+
|
|
114
|
+
await TTSKit.speak('こんにちは', { voice: 'F1', language: 'ja' });
|
|
115
|
+
await TTSKit.speak('Привет', { voice: 'M2', language: 'ru' });
|
|
116
|
+
await TTSKit.speak('नमस्ते', { voice: 'F3', language: 'hi' });
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Voices are language-agnostic — any voice can speak any language.
|
|
120
|
+
|
|
121
|
+
### Engines
|
|
122
|
+
|
|
123
|
+
```ts
|
|
124
|
+
TTSKit.setEngine('supertonic'); // default — neural, on-device
|
|
125
|
+
TTSKit.setEngine('system'); // expo-speech fallback (robotic but free)
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
| Engine | Status | Use when |
|
|
129
|
+
|---|---|---|
|
|
130
|
+
| `supertonic` | ✅ shipping | Production neural TTS |
|
|
131
|
+
| `system` | ✅ shipping | Fallback, no model download |
|
|
132
|
+
| `neutts` (voice cloning) | ⏳ v1.1 | 3-second voice clone |
|
|
133
|
+
| `cloud:eleven` / `cloud:openai` | ⏳ v1.2 | When you want premium quality |
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Performance
|
|
138
|
+
|
|
139
|
+
Measured on iPhone (iOS 26.4, Debug build):
|
|
140
|
+
|
|
141
|
+
| Metric | Value |
|
|
142
|
+
|---|---|
|
|
143
|
+
| Time to first audio (1 sentence) | ~70 ms |
|
|
144
|
+
| Real-time factor | < 0.5× |
|
|
145
|
+
| Cold start (first speak after launch) | ~1–2 s |
|
|
146
|
+
| Memory (peak during synthesis) | ~250 MB |
|
|
147
|
+
|
|
148
|
+
Reproducible benchmark harness in [`benchmarks/`](benchmarks/). Numbers across more devices (iPhone 14, Pixel 8, mid-tier Android) coming with v1.0.
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Privacy
|
|
153
|
+
|
|
154
|
+
Text passed to `speak()` and `stream()` is processed entirely on-device. Once the model is downloaded (one-time, on first `prefetchModel()` call), **no text or audio crosses the network at runtime** — synthesis runs in the local ONNX session and audio plays through the platform audio engine.
|
|
155
|
+
|
|
156
|
+
The only network activity this package performs is the initial model download from HuggingFace (see [`ATTRIBUTIONS.md`](ATTRIBUTIONS.md) for endpoints). When checksum fingerprints are baked into the package, downloads are verified against them with SHA-256. If you ship your app in a privacy-sensitive context, an offline mode (e.g. bundling the model file with your app's assets) is also supported — see [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Architecture
|
|
161
|
+
|
|
162
|
+
```
|
|
163
|
+
your app
|
|
164
|
+
↓
|
|
165
|
+
TTSKit — public API ([src/index.ts](src/index.ts))
|
|
166
|
+
↓
|
|
167
|
+
SupertonicEngine — JS wrapper, listens for native events
|
|
168
|
+
↓
|
|
169
|
+
RNTTSKitModule — Expo Module bridge ([ios/](ios/), [android/](android/))
|
|
170
|
+
↓
|
|
171
|
+
4 ONNX sessions — duration_predictor → text_encoder
|
|
172
|
+
→ vector_estimator (×8 denoising)
|
|
173
|
+
→ vocoder
|
|
174
|
+
↓
|
|
175
|
+
Float32 PCM @ 44.1 kHz — AVAudioEngine / AudioTrack playback
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Model weights: [Supertonic-3](https://huggingface.co/Supertone/supertonic-3) (99M params, OpenRAIL-M). We host a [pinned mirror](https://huggingface.co/ahk-d/supertonic-3) for resilience.
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## License
|
|
183
|
+
|
|
184
|
+
| Component | License |
|
|
185
|
+
|---|---|
|
|
186
|
+
| This package | MIT |
|
|
187
|
+
| Supertonic-3 model weights | [BigScience OpenRAIL-M](licenses/OpenRAIL-M.txt) |
|
|
188
|
+
| ONNX Runtime, Expo Modules | MIT |
|
|
189
|
+
|
|
190
|
+
**If you ship this in your app**, OpenRAIL-M requires you to:
|
|
191
|
+
|
|
192
|
+
1. Bind your end users to the [Use Restrictions](ATTRIBUTIONS.md#what-you-cannot-do-attachment-a--use-restrictions) — most importantly **no impersonation/deepfakes without consent** and **AI-generated audio must be disclosed**.
|
|
193
|
+
2. Ship a copy of [`licenses/OpenRAIL-M.txt`](licenses/OpenRAIL-M.txt) with your app.
|
|
194
|
+
3. Preserve the Supertone copyright notice.
|
|
195
|
+
|
|
196
|
+
A 2-line ToS clause covers it. See [ATTRIBUTIONS.md](ATTRIBUTIONS.md) for boilerplate.
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## Roadmap
|
|
201
|
+
|
|
202
|
+
- **v1.0** — Android validation, benchmarks across 4+ devices, cleaner first-launch UX
|
|
203
|
+
- **v1.1** — Voice cloning via [NeuTTS Air](https://huggingface.co/neuphonic/neutts-air) (3-sec sample → cloned voice)
|
|
204
|
+
- **v1.2** — Cloud engine adapters (ElevenLabs, OpenAI, Cartesia) behind the same API
|
|
205
|
+
- **v2.0** — `@ttskit/web` — same API in the browser, WASM/WebGPU
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## FAQ
|
|
210
|
+
|
|
211
|
+
**How big is the model?** ~210 MB at fp16 (the multilingual split that supports all 31 languages — `vector_estimator.onnx` alone is 138 MB because it carries cross-lingual weights). Downloaded once on first launch and stored in Application Support / app-private files. fp32 fallback is available — see [tools/quantize.md](tools/quantize.md).
|
|
212
|
+
|
|
213
|
+
**Does it work in Expo Go?** No — custom native code requires a dev build. Run `npx expo prebuild` and use `npx expo run:ios --device`.
|
|
214
|
+
|
|
215
|
+
**Why isn't Chinese in the language list?** Supertonic-3's open-source weights cover 31 languages; Mandarin isn't one of them. Use a cloud engine (v1.2) if you need it.
|
|
216
|
+
|
|
217
|
+
**Can I use a custom voice?** Voice cloning lands in v1.1. For now, you have 10 preset voices.
|
|
218
|
+
|
|
219
|
+
**Does this work in production?** It's v0.1 alpha. The pipeline runs but we've validated on one iPhone. Expect rough edges for ~2 weeks post-launch.
|
|
220
|
+
|
|
221
|
+
**How does this compare to `expo-kokoro-onnx`?** Different model (Supertonic-3 vs Kokoro-82M), more languages (31 vs 8), faster TTFA, no espeak-ng dependency. Both are valid choices; we chose Supertonic for Flowent's needs.
|
|
222
|
+
|
|
223
|
+
---
|
|
224
|
+
|
|
225
|
+
## Credits
|
|
226
|
+
|
|
227
|
+
- [Supertone Inc.](https://www.supertone.ai/) — for [Supertonic-3](https://huggingface.co/Supertone/supertonic-3), the model that does the heavy lifting
|
|
228
|
+
- [Microsoft ONNX Runtime](https://onnxruntime.ai/) — inference engine
|
|
229
|
+
- [Expo Modules](https://docs.expo.dev/modules/overview/) — native bridge
|
|
230
|
+
|
|
231
|
+
Built by [@ahk-d](https://github.com/ahk-d) for [Flowent](https://getflowent.com).
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
// Android library build script for react-native-tts-kit's native module.
apply plugin: 'com.android.library'
apply plugin: 'kotlin-android'

group = 'expo.modules.ttskit'
version = '0.1.0'

// Load expo-modules-core's shared Gradle helpers from the sibling project and
// apply the standard Expo module setup (Kotlin config, core deps, publishing).
def expoModulesCorePlugin = new File(project(":expo-modules-core").projectDir.absolutePath, "ExpoModulesCorePlugin.gradle")
apply from: expoModulesCorePlugin
applyKotlinExpoModulesCorePlugin()
useCoreDependencies()
useExpoPublishing()

android {
    namespace "expo.modules.ttskit"
    // Prefer the host app's SDK versions via rootProject.ext; fall back to
    // pinned defaults when building standalone.
    compileSdkVersion safeExtGet('compileSdkVersion', 34)

    defaultConfig {
        minSdkVersion safeExtGet('minSdkVersion', 24)
        targetSdkVersion safeExtGet('targetSdkVersion', 34)
        versionCode 1
        versionName "0.1.0"
    }

    buildFeatures {
        buildConfig true
    }

    compileOptions {
        sourceCompatibility JavaVersion.VERSION_17
        targetCompatibility JavaVersion.VERSION_17
    }

    kotlinOptions {
        jvmTarget = '17'
    }

    packagingOptions {
        // Another dependency in the host app may also bundle the ONNX Runtime
        // native library; keep the first copy instead of failing the merge.
        pickFirst '**/libonnxruntime.so'
    }
}

dependencies {
    implementation "com.microsoft.onnxruntime:onnxruntime-android:1.18.0"
    implementation "org.jetbrains.kotlinx:kotlinx-coroutines-core:1.7.3"
    implementation "org.jetbrains.kotlinx:kotlinx-coroutines-android:1.7.3"
}

// Read a property from rootProject.ext if the host app defines it, otherwise
// use the given fallback. Lets the module track the app's SDK configuration.
def safeExtGet(prop, fallback) {
    rootProject.ext.has(prop) ? rootProject.ext.get(prop) : fallback
}
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
package expo.modules.ttskit

import android.util.Base64
import expo.modules.kotlin.Promise
import expo.modules.kotlin.modules.Module
import expo.modules.kotlin.modules.ModuleDefinition
import expo.modules.ttskit.supertonic.AudioEngine
import expo.modules.ttskit.supertonic.ModelLocator
import expo.modules.ttskit.supertonic.SupertonicSession
import kotlinx.coroutines.CancellationException
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.launch
|
|
16
|
+
|
|
17
|
+
/**
 * Expo native module bridging the Supertonic TTS pipeline to JS.
 *
 * Exposes model prefetch, blocking `speak`, chunked `stream`, `stop`, and
 * cache management. Synthesis runs on an IO-dispatcher coroutine scope; audio
 * is piped through [AudioEngine] as chunks are produced, and PCM/progress
 * updates are delivered to JS via events.
 *
 * Fix over the previous revision: the generic `catch (e: Exception)` blocks in
 * `prefetch`, `speak`, and `stream` also caught [CancellationException],
 * which silently swallowed coroutine cancellation. Each handler now settles
 * its promise (and cleans up audio state) on cancellation and then rethrows,
 * keeping structured concurrency intact. JS-visible error codes are unchanged.
 */
class RNTTSKitModule : Module() {
  // ONNX-backed synthesis session; rebuilt after clearCache() (see below).
  private var session: SupertonicSession? = null
  private val audio = AudioEngine()
  // SupervisorJob: a failure in one launched job (e.g. a failed speak) must
  // not cancel unrelated jobs sharing this scope.
  private val scope = CoroutineScope(Dispatchers.IO + SupervisorJob())
  private var prefetchJob: Job? = null

  override fun definition() = ModuleDefinition {
    Name("RNTTSKit")

    Events(
      "onPrefetchProgress",
      "onStreamChunk",
      "onStreamEnd",
      "onStreamError",
      "onSpeakStart",
      "onSpeakDone"
    )

    OnCreate {
      session = SupertonicSession(appContext.reactContext!!)
    }

    OnDestroy {
      runCatching { session?.tearDown() }
      session = null
      audio.tearDown()
      scope.cancel()
    }

    // True when the model files are already on disk (no network needed).
    AsyncFunction("isAvailable") {
      val ctx = appContext.reactContext ?: return@AsyncFunction false
      ModelLocator.modelExists(ctx)
    }

    // Download (if needed) and warm the model. Emits onPrefetchProgress with
    // byte counts and a 0-100 percent. A new call supersedes any in-flight one.
    AsyncFunction("prefetch") { promise: Promise ->
      val ctx = appContext.reactContext
      if (ctx == null) {
        promise.reject("CONTEXT_UNAVAILABLE", "React context unavailable", null)
        return@AsyncFunction
      }
      prefetchJob?.cancel()
      prefetchJob = scope.launch {
        try {
          ModelLocator.ensureModel(ctx) { downloaded, total ->
            sendEvent("onPrefetchProgress", mapOf(
              "bytesDownloaded" to downloaded,
              "totalBytes" to total,
              // Guard against a zero/unknown total from the server.
              "percent" to if (total > 0) (downloaded.toDouble() / total * 100.0) else 0.0
            ))
          }
          session?.loadIfNeeded()
          // Pre-warm default voice so first speak() after prefetch doesn't
          // pay JSON-decode + tensor-alloc cost.
          session?.prewarmDefaultVoice()
          promise.resolve(null)
        } catch (e: CancellationException) {
          // Settle the promise so the JS caller doesn't hang, then rethrow —
          // swallowing cancellation would break structured concurrency.
          promise.reject("PREFETCH_FAILED", "prefetch cancelled", e)
          throw e
        } catch (e: Exception) {
          promise.reject("PREFETCH_FAILED", e.message ?: "prefetch failed", e)
        }
      }
    }

    // Synthesize and play `text`. Resolves after playback data is fully fed.
    AsyncFunction("speak") { id: String, text: String, voiceId: String, lang: String, totalStep: Int, speed: Double, volume: Double, promise: Promise ->
      scope.launch {
        try {
          val s = session ?: throw IllegalStateException("session not initialized")
          s.loadIfNeeded()
          // Pipe chunks through AudioTrack as they finish synthesizing rather
          // than synthesizing the whole utterance first. With multi-sentence
          // input this drops perceived TTFA from O(total-synthesis) to
          // O(first-sentence). For a single short input it's the same as
          // before. onSpeakStart fires when the first chunk hits the speaker.
          audio.beginStream(s.sampleRate, volume.toFloat())
          var startEmitted = false
          s.synthesizeStreaming(text, lang, voiceId, totalStep, speed) { samples ->
            if (!startEmitted) {
              sendEvent("onSpeakStart", mapOf("id" to id))
              startEmitted = true
            }
            audio.feedStream(samples)
          }
          audio.endStream()
          sendEvent("onSpeakDone", mapOf("id" to id))
          promise.resolve(null)
        } catch (e: CancellationException) {
          // Release audio resources and settle the promise, then propagate.
          audio.endStream()
          promise.reject("SYNTHESIS_FAILED", "speak cancelled", e)
          throw e
        } catch (e: Exception) {
          audio.endStream()
          promise.reject("SYNTHESIS_FAILED", e.message ?: "synthesis failed", e)
        }
      }
    }

    // Like speak(), but also emits each chunk to JS as base64 PCM16LE via
    // onStreamChunk, followed by onStreamEnd (or onStreamError on failure).
    AsyncFunction("stream") { id: String, text: String, voiceId: String, lang: String, totalStep: Int, speed: Double, volume: Double, promise: Promise ->
      scope.launch {
        val s = session ?: run {
          promise.reject("MODEL_NOT_LOADED", "session not initialized", null)
          return@launch
        }
        try {
          s.loadIfNeeded()
          audio.beginStream(s.sampleRate, volume.toFloat())
          s.synthesizeStreaming(text, lang, voiceId, totalStep, speed) { samples ->
            val pcm16 = SupertonicSession.toPcm16(samples)
            val b64 = Base64.encodeToString(pcm16, Base64.NO_WRAP)
            sendEvent("onStreamChunk", mapOf("id" to id, "pcm" to b64))
            audio.feedStream(samples)
          }
          audio.endStream()
          sendEvent("onStreamEnd", mapOf("id" to id))
          promise.resolve(null)
        } catch (e: CancellationException) {
          // Clean up and settle, then rethrow; skip sendEvent here since the
          // scope may already be tearing down.
          audio.endStream()
          promise.reject("SYNTHESIS_FAILED", "stream cancelled", e)
          throw e
        } catch (e: Exception) {
          audio.endStream()
          sendEvent("onStreamError", mapOf("id" to id, "message" to (e.message ?: "stream failed")))
          promise.reject("SYNTHESIS_FAILED", e.message ?: "stream failed", e)
        }
      }
    }

    // Halt playback and cancel in-flight synthesis.
    AsyncFunction("stop") { promise: Promise ->
      audio.stop()
      session?.cancel()
      promise.resolve(null)
    }

    AsyncFunction("clearCache") { promise: Promise ->
      // Tear down the loaded ONNX sessions before deleting the files they
      // reference. Otherwise the next loadIfNeeded() would short-circuit
      // (isReady == true) and skip re-loading from disk. Then build a fresh
      // session so subsequent prefetch/speak calls have a non-null target.
      val ctx = appContext.reactContext
      prefetchJob?.cancel()
      audio.stop()
      runCatching { session?.cancel() }
      runCatching { session?.tearDown() }
      if (ctx != null) {
        ModelLocator.clearCache(ctx)
        session = SupertonicSession(ctx)
      } else {
        session = null
      }
      promise.resolve(null)
    }
  }
}
|