react-native-tts-kit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ATTRIBUTIONS.md +87 -0
- package/LICENSE +21 -0
- package/README.md +231 -0
- package/android/build.gradle +50 -0
- package/android/src/main/AndroidManifest.xml +3 -0
- package/android/src/main/java/expo/modules/ttskit/RNTTSKitModule.kt +158 -0
- package/android/src/main/java/expo/modules/ttskit/supertonic/AudioEngine.kt +158 -0
- package/android/src/main/java/expo/modules/ttskit/supertonic/ModelLocator.kt +372 -0
- package/android/src/main/java/expo/modules/ttskit/supertonic/SupertonicSession.kt +373 -0
- package/android/src/main/java/expo/modules/ttskit/supertonic/TextFrontend.kt +154 -0
- package/android/src/main/java/expo/modules/ttskit/supertonic/VoicePack.kt +47 -0
- package/build/engines/BufferedStreamEmitter.d.ts +26 -0
- package/build/engines/BufferedStreamEmitter.d.ts.map +1 -0
- package/build/engines/BufferedStreamEmitter.js +68 -0
- package/build/engines/BufferedStreamEmitter.js.map +1 -0
- package/build/engines/Engine.d.ts +15 -0
- package/build/engines/Engine.d.ts.map +1 -0
- package/build/engines/Engine.js +2 -0
- package/build/engines/Engine.js.map +1 -0
- package/build/engines/SupertonicEngine.d.ts +14 -0
- package/build/engines/SupertonicEngine.d.ts.map +1 -0
- package/build/engines/SupertonicEngine.js +183 -0
- package/build/engines/SupertonicEngine.js.map +1 -0
- package/build/engines/SystemEngine.d.ts +13 -0
- package/build/engines/SystemEngine.d.ts.map +1 -0
- package/build/engines/SystemEngine.js +78 -0
- package/build/engines/SystemEngine.js.map +1 -0
- package/build/index.d.ts +46 -0
- package/build/index.d.ts.map +1 -0
- package/build/index.js +118 -0
- package/build/index.js.map +1 -0
- package/build/types.d.ts +77 -0
- package/build/types.d.ts.map +1 -0
- package/build/types.js +2 -0
- package/build/types.js.map +1 -0
- package/build/voices/catalog.d.ts +12 -0
- package/build/voices/catalog.d.ts.map +1 -0
- package/build/voices/catalog.js +28 -0
- package/build/voices/catalog.js.map +1 -0
- package/build/voices/prosody.d.ts +8 -0
- package/build/voices/prosody.d.ts.map +1 -0
- package/build/voices/prosody.js +28 -0
- package/build/voices/prosody.js.map +1 -0
- package/expo-module.config.json +9 -0
- package/ios/RNTTSKit.podspec +28 -0
- package/ios/RNTTSKitModule.swift +133 -0
- package/ios/Supertonic/AudioEngine.swift +110 -0
- package/ios/Supertonic/ModelLocator.swift +416 -0
- package/ios/Supertonic/SupertonicSession.swift +405 -0
- package/ios/Supertonic/TextFrontend.swift +216 -0
- package/ios/Supertonic/VoicePack.swift +51 -0
- package/licenses/OpenRAIL-M.txt +209 -0
- package/package.json +77 -0
- package/src/engines/BufferedStreamEmitter.ts +50 -0
- package/src/engines/Engine.ts +28 -0
- package/src/engines/SupertonicEngine.ts +250 -0
- package/src/engines/SystemEngine.ts +96 -0
- package/src/engines/__tests__/BufferedStreamEmitter.test.ts +65 -0
- package/src/index.ts +156 -0
- package/src/types.ts +95 -0
- package/src/voices/__tests__/catalog.test.ts +46 -0
- package/src/voices/__tests__/prosody.test.ts +63 -0
- package/src/voices/catalog.ts +32 -0
- package/src/voices/prosody.ts +39 -0
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import onnxruntime_objc
|
|
3
|
+
|
|
4
|
+
/// Errors surfaced by the Supertonic pipeline to the module layer.
enum SupertonicError: LocalizedError {
    /// The ONNX asset bundle has not been downloaded yet.
    case modelMissing
    /// The requested voice style file is absent from the bundle.
    case voiceMissing(String)
    /// Input exceeded the supported text length.
    case textTooLong
    /// The current run was cancelled cooperatively.
    case cancelled
    /// A required key was missing from the model configuration.
    case configMissing(String)

    var errorDescription: String? {
        switch self {
        case .modelMissing:
            return "Supertonic asset bundle is missing. Call TTSKit.prefetchModel() first."
        case let .voiceMissing(id):
            return "Voice \(id) is not available."
        case .textTooLong:
            return "Input text exceeds the maximum length."
        case .cancelled:
            return "Synthesis cancelled."
        case let .configMissing(key):
            return "Required config key missing: \(key)."
        }
    }
}
|
|
21
|
+
|
|
22
|
+
/// Subset of `tts.json` the pipeline reads. Property names are snake_case
/// on purpose: they must match the JSON keys for the synthesized
/// `Decodable` conformance (no custom CodingKeys).
private struct TTSConfig: Decodable {
    /// Audio-engine section: output sample rate (Hz) and base chunk size in samples.
    struct AE: Decodable { let sample_rate: Int; let base_chunk_size: Int }
    /// Text-to-latent section: latent compression factor and per-frame latent dimension.
    struct TTL: Decodable { let chunk_compress_factor: Int; let latent_dim: Int }
    let ae: AE
    let ttl: TTL
}
|
|
28
|
+
|
|
29
|
+
/// Real Supertonic inference pipeline (ported from `supertone-inc/supertonic/swift`).
|
|
30
|
+
///
|
|
31
|
+
/// Four ONNX sessions:
|
|
32
|
+
/// 1. duration_predictor → per-chunk duration in seconds
|
|
33
|
+
/// 2. text_encoder → text embedding tensor
|
|
34
|
+
/// 3. vector_estimator → iterative diffusion denoising (totalStep iterations)
|
|
35
|
+
/// 4. vocoder → final waveform
|
|
36
|
+
final class SupertonicSession {
    // ONNX Runtime environment shared by the four sessions below.
    private var env: ORTEnv?
    private var dpSession: ORTSession?   // duration_predictor.onnx
    private var encSession: ORTSession?  // text_encoder.onnx
    private var vecSession: ORTSession?  // vector_estimator.onnx
    private var vocSession: ORTSession?  // vocoder.onnx
    // Maps unicode codepoints in preprocessed text to model token ids.
    private var indexer: UnicodeIndexer?
    // Loaded voice style tensors, keyed by voice id (e.g. "F1").
    private var voiceCache: [String: VoicePack] = [:]
    // Decoded subset of tts.json; nil until loadIfNeeded() succeeds.
    private var config: TTSConfig?

    // Guards `_cancelled`; cancel() may be called from another thread than
    // the one running synthesis.
    private let cancelLock = NSLock()
    private var _cancelled = false

    /// True once all four sessions, the indexer and the config are loaded.
    var isReady: Bool { dpSession != nil && encSession != nil && vecSession != nil && vocSession != nil && indexer != nil && config != nil }
    /// Output sample rate in Hz; falls back to 24 kHz before the config loads.
    var sampleRate: Int { config?.ae.sample_rate ?? 24_000 }
|
|
51
|
+
|
|
52
|
+
/// Lazily creates the ORT environment, the four ONNX sessions, the model
/// config and the unicode indexer. No-op once everything is loaded.
/// - Throws: `SupertonicError.modelMissing` when the asset bundle is
///   absent, or any ORT / decoding error from an individual file.
func loadIfNeeded() throws {
    guard !isReady else { return }
    guard ModelLocator.modelExists() else { throw SupertonicError.modelMissing }

    let environment = try ORTEnv(loggingLevel: .warning)
    let sessionOptions = try ORTSessionOptions()
    try sessionOptions.setIntraOpNumThreads(2)
    try sessionOptions.setGraphOptimizationLevel(.all)

    self.env = environment

    // Load and log each file individually: if one of the four sessions
    // throws (e.g. fp16 type mismatch on vector_estimator) the failing
    // file must be identifiable from JS.
    func openSession(named file: String) throws -> ORTSession {
        let modelPath = ModelLocator.resolvedOnnxURL(for: file).path
        do {
            let session = try ORTSession(env: environment, modelPath: modelPath, sessionOptions: sessionOptions)
            NSLog("[ST.load] ok %@ (%@)", file, ModelLocator.precision.rawValue)
            return session
        } catch {
            NSLog("[ST.load] FAIL %@ (%@) at %@ — %@",
                  file, ModelLocator.precision.rawValue, modelPath,
                  String(describing: error))
            throw error
        }
    }

    dpSession = try openSession(named: "duration_predictor.onnx")
    encSession = try openSession(named: "text_encoder.onnx")
    vecSession = try openSession(named: "vector_estimator.onnx")
    vocSession = try openSession(named: "vocoder.onnx")

    let configData = try Data(contentsOf: ModelLocator.resolvedOnnxURL(for: "tts.json"))
    config = try JSONDecoder().decode(TTSConfig.self, from: configData)

    indexer = try UnicodeIndexer(url: ModelLocator.resolvedOnnxURL(for: "unicode_indexer.json"))
}
|
|
92
|
+
|
|
93
|
+
/// Returns the style tensors for `voiceId`, loading and caching on first use.
/// - Throws: `SupertonicError.voiceMissing` when the style JSON is absent.
func voicePack(for voiceId: String) throws -> VoicePack {
    if let hit = voiceCache[voiceId] {
        return hit
    }
    let styleURL = ModelLocator.resolvedVoiceURL(for: voiceId)
    guard FileManager.default.fileExists(atPath: styleURL.path) else {
        throw SupertonicError.voiceMissing(voiceId)
    }
    let loaded = try VoicePack(voiceId: voiceId, url: styleURL)
    // Keep the cache bounded. With today's 10 voices eviction never fires
    // in practice, but the cap guarantees a future model expansion can't
    // leak memory.
    if voiceCache.count >= 8 { voiceCache.removeAll() }
    voiceCache[voiceId] = loaded
    return loaded
}
|
|
108
|
+
|
|
109
|
+
/// Warm the JSON-decode + tensor-allocation path for the most likely
/// first-tap voice. Called from `prefetch()` so the user's first `speak()`
/// doesn't pay the 50–150 ms voice-load cost mid-tap.
/// Failures are intentionally ignored — prewarming is best-effort.
func prewarmDefaultVoice() {
    // `_ = try? …` is the idiomatic "best-effort, discard result" form;
    // the original `try? _ = …` parsed but read backwards.
    _ = try? voicePack(for: "F1")
}
|
|
115
|
+
|
|
116
|
+
/// Drop all loaded sessions, indexer, voice tensors. Called from
/// `OnDestroy` so resources release deterministically rather than waiting
/// for ARC.
func tearDown() {
    dpSession = nil
    encSession = nil
    vecSession = nil
    vocSession = nil
    indexer = nil
    config = nil
    voiceCache.removeAll()
    env = nil
}
|
|
129
|
+
|
|
130
|
+
/// Reset the cooperative-cancel flag at the start of a run.
func beginRun() {
    cancelLock.lock()
    defer { cancelLock.unlock() }
    _cancelled = false
}

/// Request cancellation; synthesis checks this flag between stages.
/// Safe to call from any thread.
func cancel() {
    cancelLock.lock()
    defer { cancelLock.unlock() }
    _cancelled = true
}

/// Thread-safe read of the cancel flag.
private var isCancelled: Bool {
    cancelLock.lock()
    defer { cancelLock.unlock() }
    return _cancelled
}
|
|
142
|
+
|
|
143
|
+
// MARK: - Inference
|
|
144
|
+
|
|
145
|
+
/// Synthesize a single (already-chunked) input. Returns float32 PCM in [-1, 1].
/// Use `synthesize` / `synthesizeStreaming` from the module layer; those handle chunking.
/// - Parameters:
///   - text: One pre-chunked text unit (see `TextFrontend.chunk`).
///   - lang: Language code forwarded to the text frontend.
///   - voiceId: Voice style id, e.g. "F1".
///   - totalStep: Number of diffusion denoising iterations.
///   - speed: Playback-speed multiplier; divides the predicted durations.
/// - Throws: `SupertonicError.modelMissing` / `.cancelled`, plus any ORT error.
func synthesizeOne(text: String, lang: String, voiceId: String, totalStep: Int, speed: Double) throws -> [Float] {
    // Per-stage timing. Logs once per call so the line count stays small;
    // remove or guard with a debug flag before tagging a release if it
    // turns out to be noisy.
    let t0 = CFAbsoluteTimeGetCurrent()
    func dMs(_ from: CFAbsoluteTime, _ to: CFAbsoluteTime) -> String {
        String(format: "%.0f", (to - from) * 1000)
    }

    try loadIfNeeded()
    let tLoad = CFAbsoluteTimeGetCurrent()
    guard let cfg = config, env != nil, let indexer = indexer,
          let dp = dpSession, let enc = encSession, let vec = vecSession, let voc = vocSession else {
        throw SupertonicError.modelMissing
    }
    let voice = try voicePack(for: voiceId)
    let tVoice = CFAbsoluteTimeGetCurrent()

    let processed = TextFrontend.preprocess(text, lang: lang)
    let textIds: [Int64] = indexer.encode(processed)
    if textIds.isEmpty { return [] }
    let bsz = 1
    let textLen = textIds.count
    let textMask: [Float] = Array(repeating: 1.0, count: textLen)
    let tText = CFAbsoluteTimeGetCurrent()

    if isCancelled { throw SupertonicError.cancelled }

    let textIdsValue = try ORTValue(
        tensorData: NSMutableData(bytes: textIds, length: textIds.count * MemoryLayout<Int64>.size),
        elementType: .int64,
        shape: [NSNumber(value: bsz), NSNumber(value: textLen)]
    )
    let textMaskValue = try ORTValue(
        tensorData: NSMutableData(bytes: textMask, length: textMask.count * MemoryLayout<Float>.size),
        elementType: .float,
        shape: [NSNumber(value: bsz), NSNumber(value: 1), NSNumber(value: textLen)]
    )

    let tTensors = CFAbsoluteTimeGetCurrent()

    // 1. Duration prediction.
    let dpOut = try dp.run(
        withInputs: ["text_ids": textIdsValue, "style_dp": voice.dpValue, "text_mask": textMaskValue],
        outputNames: ["duration"],
        runOptions: nil
    )
    guard let durValue = dpOut["duration"] else { throw SupertonicError.modelMissing }
    let durData = try durValue.tensorData() as Data
    var duration: [Float] = durData.withUnsafeBytes {
        Array(UnsafeBufferPointer(start: $0.bindMemory(to: Float.self).baseAddress, count: durData.count / 4))
    }
    // Speed is applied by scaling the predicted durations down.
    for i in 0..<duration.count { duration[i] /= Float(speed) }
    let tDP = CFAbsoluteTimeGetCurrent()

    if isCancelled { throw SupertonicError.cancelled }

    // 2. Text encoder. Hold the ORTValue across the denoising loop —
    // Swift ARC keeps it alive; the upstream Helper.swift reference uses
    // the same pattern and works.
    let encOut = try enc.run(
        withInputs: ["text_ids": textIdsValue, "style_ttl": voice.ttlValue, "text_mask": textMaskValue],
        outputNames: ["text_emb"],
        runOptions: nil
    )
    guard let textEmb = encOut["text_emb"] else { throw SupertonicError.modelMissing }
    let tEnc = CFAbsoluteTimeGetCurrent()

    // 3. Sample initial noisy latent + mask.
    let baseChunk = cfg.ae.base_chunk_size
    let chunkCompress = cfg.ttl.chunk_compress_factor
    let latentDimBase = cfg.ttl.latent_dim
    let latentDim = latentDimBase * chunkCompress
    let chunkSize = baseChunk * chunkCompress
    let maxDur = duration.max() ?? 0
    let wavLenMax = Int(maxDur * Float(cfg.ae.sample_rate))
    let latentLen = (wavLenMax + chunkSize - 1) / chunkSize
    let wavLengths = duration.map { Int($0 * Float(cfg.ae.sample_rate)) }
    let latentLengths = wavLengths.map { ($0 + chunkSize - 1) / chunkSize }

    var noisy = [Float](repeating: 0, count: bsz * latentDim * latentLen)
    // Box-Muller -> gaussian noise, then masked: positions past each batch
    // item's latent length stay zero. (The previous version carried dead
    // `_ = d` / `_ = b` statements here; removed.)
    var idx = 0
    for b in 0..<bsz {
        let lLen = latentLengths[b]
        for _ in 0..<latentDim {
            for t in 0..<latentLen {
                if t < lLen {
                    let u1 = Float.random(in: 1e-7...1.0)
                    let u2 = Float.random(in: 0.0...1.0)
                    noisy[idx] = sqrt(-2.0 * log(u1)) * cos(2.0 * .pi * u2)
                }
                idx += 1
            }
        }
    }
    var latentMask = [Float](repeating: 0, count: bsz * 1 * latentLen)
    for b in 0..<bsz {
        for t in 0..<latentLengths[b] {
            latentMask[b * latentLen + t] = 1.0
        }
    }

    let latentMaskValue = try ORTValue(
        tensorData: NSMutableData(bytes: latentMask, length: latentMask.count * MemoryLayout<Float>.size),
        elementType: .float,
        shape: [NSNumber(value: bsz), NSNumber(value: 1), NSNumber(value: latentLen)]
    )
    let totalStepArr = [Float](repeating: Float(totalStep), count: bsz)
    let totalStepValue = try ORTValue(
        tensorData: NSMutableData(bytes: totalStepArr, length: totalStepArr.count * MemoryLayout<Float>.size),
        elementType: .float,
        shape: [NSNumber(value: bsz)]
    )
    let tNoise = CFAbsoluteTimeGetCurrent()

    // 4. Denoising loop. Per-step time logged so we can see if the bottleneck
    // is ramp-up (first step paying compile cost) vs. steady-state.
    var stepTimes: [Double] = []
    for step in 0..<totalStep {
        let tStepStart = CFAbsoluteTimeGetCurrent()
        if isCancelled { throw SupertonicError.cancelled }
        let xtValue = try ORTValue(
            tensorData: NSMutableData(bytes: noisy, length: noisy.count * MemoryLayout<Float>.size),
            elementType: .float,
            shape: [NSNumber(value: bsz), NSNumber(value: latentDim), NSNumber(value: latentLen)]
        )
        let curStepArr = [Float](repeating: Float(step), count: bsz)
        let curStepValue = try ORTValue(
            tensorData: NSMutableData(bytes: curStepArr, length: curStepArr.count * MemoryLayout<Float>.size),
            elementType: .float,
            shape: [NSNumber(value: bsz)]
        )
        let vecOut = try vec.run(
            withInputs: [
                "noisy_latent": xtValue,
                "text_emb": textEmb,
                "style_ttl": voice.ttlValue,
                "latent_mask": latentMaskValue,
                "text_mask": textMaskValue,
                "current_step": curStepValue,
                "total_step": totalStepValue
            ],
            outputNames: ["denoised_latent"],
            runOptions: nil
        )
        guard let denoised = vecOut["denoised_latent"] else { throw SupertonicError.modelMissing }
        let dData = try denoised.tensorData() as Data
        noisy = dData.withUnsafeBytes {
            Array(UnsafeBufferPointer(start: $0.bindMemory(to: Float.self).baseAddress, count: dData.count / 4))
        }
        stepTimes.append((CFAbsoluteTimeGetCurrent() - tStepStart) * 1000)
    }
    let tDiffusion = CFAbsoluteTimeGetCurrent()

    if isCancelled { throw SupertonicError.cancelled }

    // 5. Vocoder.
    let finalLatent = try ORTValue(
        tensorData: NSMutableData(bytes: noisy, length: noisy.count * MemoryLayout<Float>.size),
        elementType: .float,
        shape: [NSNumber(value: bsz), NSNumber(value: latentDim), NSNumber(value: latentLen)]
    )
    let vocOut = try voc.run(
        withInputs: ["latent": finalLatent],
        outputNames: ["wav_tts"],
        runOptions: nil
    )
    guard let wav = vocOut["wav_tts"] else { throw SupertonicError.modelMissing }
    let wavData = try wav.tensorData() as Data
    var wavSamples: [Float] = wavData.withUnsafeBytes {
        Array(UnsafeBufferPointer(start: $0.bindMemory(to: Float.self).baseAddress, count: wavData.count / 4))
    }

    // Trim to actual duration to drop silence padding.
    let trimLen = min(wavSamples.count, Int(duration[0] * Float(cfg.ae.sample_rate)))
    if trimLen > 0 && trimLen < wavSamples.count {
        wavSamples = Array(wavSamples.prefix(trimLen))
    }
    let tVoc = CFAbsoluteTimeGetCurrent()

    // One-line summary so this is easy to grep in Xcode console: "[ST.timing]".
    // Built as a plain string and logged via "%@" so a stray '%' in the
    // payload can never be parsed as an NSLog format specifier (the
    // previous version passed the interpolated string as the format itself).
    let totalMs = (tVoc - t0) * 1000
    let stepSummary = stepTimes.enumerated()
        .map { String(format: "%d:%.0f", $0.offset, $0.element) }
        .joined(separator: " ")
    let timingLine = "[ST.timing] total=\(String(format: "%.0f", totalMs))ms "
        + "load=\(dMs(t0, tLoad)) voice=\(dMs(tLoad, tVoice)) "
        + "text=\(dMs(tVoice, tText)) tensors=\(dMs(tText, tTensors)) "
        + "dp=\(dMs(tTensors, tDP)) enc=\(dMs(tDP, tEnc)) "
        + "noise=\(dMs(tEnc, tNoise)) diffusion=\(dMs(tNoise, tDiffusion)) "
        + "voc=\(dMs(tDiffusion, tVoc)) "
        + "chars=\(textIds.count) latentLen=\(latentLen) steps=[\(stepSummary)]"
    NSLog("%@", timingLine)
    return wavSamples
}
|
|
344
|
+
|
|
345
|
+
/// Single-shot synthesis with chunking + 0.3s silence between chunks.
/// Checks the cancel flag before each chunk and throws `.cancelled`.
func synthesize(text: String, lang: String, voiceId: String, totalStep: Int, speed: Double) throws -> [Float] {
    beginRun()
    let pieces = TextFrontend.chunk(text, lang: lang)
    guard !pieces.isEmpty else { return [] }

    // 0.3 s of silence stitched between consecutive chunks.
    let gap = [Float](repeating: 0, count: Int(0.3 * Double(sampleRate)))

    var stitched: [Float] = []
    var isFirst = true
    for piece in pieces {
        if isCancelled { throw SupertonicError.cancelled }
        let pcm = try synthesizeOne(text: piece, lang: lang, voiceId: voiceId, totalStep: totalStep, speed: speed)
        if !isFirst { stitched.append(contentsOf: gap) }
        stitched.append(contentsOf: pcm)
        isFirst = false
    }
    return stitched
}
|
|
362
|
+
|
|
363
|
+
/// Streaming: emit one chunk per sentence-ish unit to keep TTFA low.
/// `onChunk` is invoked once per non-empty PCM chunk, in input order.
func synthesizeStreaming(
    text: String,
    lang: String,
    voiceId: String,
    totalStep: Int,
    speed: Double,
    onChunk: ([Float]) -> Void
) throws {
    let startedAt = CFAbsoluteTimeGetCurrent()
    try loadIfNeeded()
    beginRun()
    let pieces = TextFrontend.chunk(text, lang: lang)
    var emittedFirst = false
    for piece in pieces {
        if isCancelled { throw SupertonicError.cancelled }
        let pcm = try synthesizeOne(text: piece, lang: lang, voiceId: voiceId, totalStep: totalStep, speed: speed)
        guard !pcm.isEmpty else { continue }
        if !emittedFirst {
            // Time-to-first-audio: the latency number users actually feel.
            let ttfa = (CFAbsoluteTimeGetCurrent() - startedAt) * 1000
            NSLog(String(format: "[ST.timing] TTFA=%.0fms (first chunk emitted, chunks=%d)", ttfa, pieces.count))
            emittedFirst = true
        }
        onChunk(pcm)
    }
}
|
|
390
|
+
|
|
391
|
+
// MARK: - Bridge helpers
|
|
392
|
+
|
|
393
|
+
/// Converts float32 samples to little-endian PCM16 for transport across the JS bridge.
static func toPCM16(samples: [Float]) -> Data {
    var pcm = Data(count: samples.count * 2)
    pcm.withUnsafeMutableBytes { (buffer: UnsafeMutableRawBufferPointer) in
        let words = buffer.bindMemory(to: Int16.self)
        for (i, sample) in samples.enumerated() {
            // Clamp to [-1, 1] before scaling so an out-of-range float
            // can't overflow the Int16 initializer.
            let bounded = max(-1.0, min(1.0, sample))
            words[i] = Int16(bounded * 32767.0).littleEndian
        }
    }
    return pcm
}
|
|
405
|
+
}
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
|
|
3
|
+
/// Self-contained text frontend for Supertonic.
|
|
4
|
+
///
|
|
5
|
+
/// Ported from `supertone-inc/supertonic/swift/Sources/Helper.swift` —
|
|
6
|
+
/// keep this file in sync if upstream's preprocessing changes.
|
|
7
|
+
///
|
|
8
|
+
/// Key behaviors:
|
|
9
|
+
/// 1. NFKD-decompose text (`decomposedStringWithCompatibilityMapping`).
|
|
10
|
+
/// 2. Strip emoji + symbols, collapse whitespace, normalize punctuation spacing.
|
|
11
|
+
/// 3. Wrap with `<lang>...</lang>` markers.
|
|
12
|
+
/// 4. Tokenize: codepoint -> indexer[codepoint] -> Int64 token id.
|
|
13
|
+
/// The `unicode_indexer.json` shipped alongside the model is a flat
|
|
14
|
+
/// `[Int64]` array of length 2^16 (BMP); index by codepoint, get token id.
|
|
15
|
+
/// Codepoints outside the array become -1 (the model's PAD/UNK).
|
|
16
|
+
enum TextFrontend {
    /// Language codes the model accepts; `preprocess` traps (precondition)
    /// on anything outside this set.
    static let availableLangs: Set<String> = [
        "en", "ko", "ja", "ar", "bg", "cs", "da", "de", "el", "es", "et", "fi",
        "fr", "hi", "hr", "hu", "id", "it", "lt", "lv", "nl", "pl", "pt", "ro",
        "ru", "sk", "sl", "sv", "tr", "uk", "vi"
    ]

    /// Per-language max chunk length in characters (matches upstream).
    /// Korean/Japanese use a smaller budget — presumably because each
    /// character expands to more audio; confirm against upstream if tuned.
    static func maxChunkLength(for lang: String) -> Int {
        return (lang == "ko" || lang == "ja") ? 120 : 300
    }
|
|
27
|
+
|
|
28
|
+
/// Apply upstream's text normalization rules and wrap with `<lang>...</lang>`.
/// - Parameters:
///   - text: Raw input; emoji and unpronounceable symbols are stripped.
///   - lang: Must be a member of `availableLangs` (precondition-checked).
/// - Returns: Normalized text wrapped in language markers, e.g. `<en>Hi.</en>`.
static func preprocess(_ text: String, lang: String) -> String {
    precondition(availableLangs.contains(lang), "Unsupported language: \(lang)")

    // NFKD decomposition first so codepoint-indexed tokenization sees
    // decomposed forms.
    var s = text.decomposedStringWithCompatibilityMapping

    // Strip wide-Unicode emoji blocks.
    s = String(String.UnicodeScalarView(s.unicodeScalars.filter { scalar in
        let v = scalar.value
        return !((0x1F600...0x1F64F).contains(v) ||
                 (0x1F300...0x1F5FF).contains(v) ||
                 (0x1F680...0x1F6FF).contains(v) ||
                 (0x1F700...0x1F77F).contains(v) ||
                 (0x1F780...0x1F7FF).contains(v) ||
                 (0x1F800...0x1F8FF).contains(v) ||
                 (0x1F900...0x1F9FF).contains(v) ||
                 (0x1FA00...0x1FA6F).contains(v) ||
                 (0x1FA70...0x1FAFF).contains(v) ||
                 (0x2600...0x26FF).contains(v) ||
                 (0x2700...0x27BF).contains(v) ||
                 (0x1F1E6...0x1F1FF).contains(v))
    }))

    // Normalize punctuation variants to plain ASCII equivalents.
    let replacements: [(String, String)] = [
        ("\u{2013}", "-"), ("\u{2011}", "-"), ("\u{2014}", "-"),
        ("_", " "),
        ("\u{201C}", "\""), ("\u{201D}", "\""),
        ("\u{2018}", "'"), ("\u{2019}", "'"),
        ("´", "'"), ("`", "'"),
        ("[", " "), ("]", " "),
        ("|", " "), ("/", " "), ("#", " "),
        ("→", " "), ("←", " ")
    ]
    for (k, v) in replacements { s = s.replacingOccurrences(of: k, with: v) }

    // Symbols with no spoken form are dropped entirely.
    for sym in ["♥", "☆", "♡", "©", "\\"] {
        s = s.replacingOccurrences(of: sym, with: "")
    }

    // Expand written shorthands into spoken forms.
    for (k, v) in [("@", " at "), ("e.g.,", "for example, "), ("i.e.,", "that is, ")] {
        s = s.replacingOccurrences(of: k, with: v)
    }

    // Remove space before punctuation.
    let punctSpacing = [(" ,", ","), (" .", "."), (" !", "!"), (" ?", "?"),
                        (" ;", ";"), (" :", ":"), (" '", "'")]
    for (k, v) in punctSpacing { s = s.replacingOccurrences(of: k, with: v) }

    // Collapse doubled quote characters. (A backtick-collapse loop used to
    // live here as well, but it was unreachable: every backtick is already
    // rewritten to an apostrophe by `replacements` above, so the doubled
    // apostrophes it would have produced are handled by the `''` loop.)
    while s.contains("\"\"") { s = s.replacingOccurrences(of: "\"\"", with: "\"") }
    while s.contains("''") { s = s.replacingOccurrences(of: "''", with: "'") }

    // Collapse all whitespace runs to single spaces and trim the ends.
    let ws = try! NSRegularExpression(pattern: "\\s+")
    s = ws.stringByReplacingMatches(
        in: s,
        range: NSRange(s.startIndex..., in: s),
        withTemplate: " "
    ).trimmingCharacters(in: .whitespacesAndNewlines)

    // Ensure terminal punctuation so the model produces sentence-final prosody.
    if !s.isEmpty {
        let endsWithPunct = try! NSRegularExpression(
            pattern: "[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$"
        )
        if endsWithPunct.firstMatch(in: s, range: NSRange(s.startIndex..., in: s)) == nil {
            s += "."
        }
    }
    return "<\(lang)>\(s)</\(lang)>"
}
|
|
96
|
+
|
|
97
|
+
/// Abbreviations whose trailing '.' must NOT be treated as a sentence end
/// by `splitSentences` (checked via `hasSuffix` on the candidate sentence).
/// Keep in sync with upstream Helper.swift if its list changes.
private static let abbreviations: Set<String> = [
    "Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
    "St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
    "Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D."
]
|
|
105
|
+
|
|
106
|
+
/// Sentence-aware chunking, mirrors `chunkText` in upstream Helper.swift.
/// Splits at blank-line paragraph boundaries, then sentence boundaries,
/// greedily packing sentences up to the per-language budget. A single
/// sentence that still exceeds the budget is re-packed word-by-word so no
/// chunk is oversized (previously such sentences passed through untouched
/// and were truncated by the model).
static func chunk(_ text: String, lang: String) -> [String] {
    let maxLen = maxChunkLength(for: lang)
    let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
    if trimmed.isEmpty { return [] }

    let paragraphs = splitParagraphs(trimmed)

    var chunks: [String] = []
    for p in paragraphs.isEmpty ? [trimmed] : paragraphs {
        if p.count <= maxLen { chunks.append(p); continue }
        for piece in greedyJoin(splitSentences(p), maxLen: maxLen) {
            if piece.count <= maxLen {
                chunks.append(piece)
            } else {
                // Oversized single sentence: fall back to space-delimited
                // word packing. (A single word longer than maxLen still
                // passes through as-is.)
                chunks.append(contentsOf: greedyJoin(piece.split(separator: " ").map(String.init), maxLen: maxLen))
            }
        }
    }
    return chunks.isEmpty ? [trimmed] : chunks
}
|
|
120
|
+
|
|
121
|
+
/// Split on `\n\s*\n+` — blank-line paragraph boundaries, mirroring
/// upstream Helper.swift / helper.py. Pieces between boundaries are
/// trimmed and empty pieces dropped; with no boundary the whole text is
/// returned untrimmed (or nothing, if it trims to empty).
private static func splitParagraphs(_ text: String) -> [String] {
    let separator = try! NSRegularExpression(pattern: "\\n\\s*\\n+")
    let nsText = text as NSString
    let boundaries = separator.matches(in: text, range: NSRange(location: 0, length: nsText.length))

    guard !boundaries.isEmpty else {
        let whole = text.trimmingCharacters(in: .whitespacesAndNewlines)
        return whole.isEmpty ? [] : [text]
    }

    var paragraphs: [String] = []
    func appendPiece(_ raw: String) {
        let cleaned = raw.trimmingCharacters(in: .whitespacesAndNewlines)
        if !cleaned.isEmpty { paragraphs.append(cleaned) }
    }

    var start = 0
    for boundary in boundaries {
        appendPiece(nsText.substring(with: NSRange(location: start, length: boundary.range.location - start)))
        start = boundary.range.location + boundary.range.length
    }
    if start < nsText.length {
        appendPiece(nsText.substring(from: start))
    }
    return paragraphs
}
|
|
146
|
+
|
|
147
|
+
/// Split into sentences. Recognises both Latin terminators (`.!?` followed
/// by whitespace) and Asian terminal punctuation (`。!?`, whitespace
/// optional), so long Japanese / Chinese strings actually chunk instead of
/// collapsing into a single oversized chunk that the model truncates.
/// A terminator directly after a known abbreviation ("Dr.", "e.g.", …) is
/// not treated as a boundary.
private static func splitSentences(_ text: String) -> [String] {
    // Latin-style: sentence-ender + whitespace (group 1).
    // Asian-style: 。!? — whitespace optional (group 2).
    let regex = try! NSRegularExpression(pattern: "([.!?])\\s+|([。!?])")
    let range = NSRange(text.startIndex..., in: text)
    let matches = regex.matches(in: text, range: range)
    if matches.isEmpty { return [text] }

    var sentences: [String] = []
    var lastEnd = text.startIndex

    for m in matches {
        guard let r = Range(m.range, in: text) else { continue }
        // Text accumulated since the previous accepted boundary.
        let before = String(text[lastEnd..<r.lowerBound])
        // The first UTF-16 unit of the match is always the punctuation
        // character itself (both regex alternatives start with it, and all
        // of .!?。!? are single-unit BMP characters).
        let punc = String(text[Range(NSRange(location: m.range.location, length: 1), in: text)!])
        let combined = before.trimmingCharacters(in: .whitespaces) + punc
        // Abbreviation check: skip this boundary and keep accumulating the
        // same sentence (lastEnd deliberately not advanced).
        let isAbbrev = abbreviations.contains { combined.hasSuffix($0) }
        if !isAbbrev {
            sentences.append(String(text[lastEnd..<r.upperBound]))
            lastEnd = r.upperBound
        }
    }
    // Trailing text with no terminator becomes the final sentence.
    if lastEnd < text.endIndex { sentences.append(String(text[lastEnd...])) }
    return sentences.isEmpty ? [text] : sentences
}
|
|
176
|
+
|
|
177
|
+
/// Greedily pack `pieces` into chunks of at most `maxLen` characters,
/// joining consecutive pieces with a single space. Blank pieces are
/// skipped; a piece already longer than `maxLen` becomes its own chunk.
private static func greedyJoin(_ pieces: [String], maxLen: Int) -> [String] {
    var packed: [String] = []
    var buffer = ""
    for raw in pieces {
        let piece = raw.trimmingCharacters(in: .whitespacesAndNewlines)
        guard !piece.isEmpty else { continue }
        guard !buffer.isEmpty else {
            buffer = piece
            continue
        }
        // +1 accounts for the joining space.
        if buffer.count + 1 + piece.count <= maxLen {
            buffer += " " + piece
        } else {
            packed.append(buffer)
            buffer = piece
        }
    }
    if !buffer.isEmpty { packed.append(buffer) }
    return packed
}
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
/// Loads `unicode_indexer.json` and turns text into Int64 token IDs.
final class UnicodeIndexer {
    /// Flat codepoint → token-id lookup table decoded from the JSON file.
    private let table: [Int64]

    /// Memory-maps (when safe) and decodes the indexer JSON, which is a
    /// flat `[Int64]` array indexed by codepoint.
    init(url: URL) throws {
        let raw = try Data(contentsOf: url, options: .mappedIfSafe)
        table = try JSONDecoder().decode([Int64].self, from: raw)
    }

    /// Encode a string into Int64 token ids by indexing each unicode scalar
    /// into the table; scalars beyond the table's end map to -1 (PAD/UNK).
    func encode(_ text: String) -> [Int64] {
        return text.unicodeScalars.map { scalar in
            let codepoint = Int(scalar.value)
            return codepoint < table.count ? table[codepoint] : -1
        }
    }
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import onnxruntime_objc
|
|
3
|
+
|
|
4
|
+
/// Loads a single voice style file from upstream's `voice_styles/<id>.json`.
|
|
5
|
+
///
|
|
6
|
+
/// Each file contains two 3D float tensors used by the duration predictor and
|
|
7
|
+
/// the text encoder respectively:
|
|
8
|
+
///
|
|
9
|
+
/// ```json
|
|
10
|
+
/// {
|
|
11
|
+
/// "style_ttl": { "data": [[[…]]], "dims": [1, D1, D2], "type": "f32" },
|
|
12
|
+
/// "style_dp": { "data": [[[…]]], "dims": [1, D1, D2], "type": "f32" }
|
|
13
|
+
/// }
|
|
14
|
+
/// ```
|
|
15
|
+
final class VoicePack {
    let voiceId: String
    let ttlValue: ORTValue   // style tensor fed to text encoder / vector estimator
    let dpValue: ORTValue    // style tensor fed to the duration predictor

    /// Decodes the style JSON at `url` and materializes both tensors.
    init(voiceId: String, url: URL) throws {
        self.voiceId = voiceId
        let raw = try Data(contentsOf: url)
        let style = try JSONDecoder().decode(VoiceStyleJSON.self, from: raw)
        self.ttlValue = try VoicePack.makeTensor(from: style.style_ttl)
        self.dpValue = try VoicePack.makeTensor(from: style.style_dp)
    }

    /// Flattens a decoded 3-D float tensor and wraps it in an ORTValue
    /// shaped by the dims recorded in the JSON.
    private static func makeTensor(from component: VoiceStyleJSON.Component) throws -> ORTValue {
        let flat = component.data.flatMap { $0.flatMap { $0 } }
        return try ORTValue(
            tensorData: NSMutableData(bytes: flat, length: flat.count * MemoryLayout<Float>.size),
            elementType: .float,
            shape: component.dims.map { NSNumber(value: $0) }
        )
    }
}
|
|
42
|
+
|
|
43
|
+
/// On-disk schema of a `voice_styles/<id>.json` file. Property names are
/// snake_case on purpose: they must match the JSON keys for the
/// synthesized `Decodable` conformance.
private struct VoiceStyleJSON: Decodable {
    struct Component: Decodable {
        // Nested float tensor payload, e.g. [1][D1][D2].
        let data: [[[Float]]]
        // Tensor dims as recorded by the exporter; assumed to describe
        // `data`'s shape — not validated here.
        let dims: [Int]
        // Element type tag from the exporter (e.g. "f32"); currently unused.
        let type: String
    }
    let style_ttl: Component
    let style_dp: Component
}
|