react-native-tts-kit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/ATTRIBUTIONS.md +87 -0
  2. package/LICENSE +21 -0
  3. package/README.md +231 -0
  4. package/android/build.gradle +50 -0
  5. package/android/src/main/AndroidManifest.xml +3 -0
  6. package/android/src/main/java/expo/modules/ttskit/RNTTSKitModule.kt +158 -0
  7. package/android/src/main/java/expo/modules/ttskit/supertonic/AudioEngine.kt +158 -0
  8. package/android/src/main/java/expo/modules/ttskit/supertonic/ModelLocator.kt +372 -0
  9. package/android/src/main/java/expo/modules/ttskit/supertonic/SupertonicSession.kt +373 -0
  10. package/android/src/main/java/expo/modules/ttskit/supertonic/TextFrontend.kt +154 -0
  11. package/android/src/main/java/expo/modules/ttskit/supertonic/VoicePack.kt +47 -0
  12. package/build/engines/BufferedStreamEmitter.d.ts +26 -0
  13. package/build/engines/BufferedStreamEmitter.d.ts.map +1 -0
  14. package/build/engines/BufferedStreamEmitter.js +68 -0
  15. package/build/engines/BufferedStreamEmitter.js.map +1 -0
  16. package/build/engines/Engine.d.ts +15 -0
  17. package/build/engines/Engine.d.ts.map +1 -0
  18. package/build/engines/Engine.js +2 -0
  19. package/build/engines/Engine.js.map +1 -0
  20. package/build/engines/SupertonicEngine.d.ts +14 -0
  21. package/build/engines/SupertonicEngine.d.ts.map +1 -0
  22. package/build/engines/SupertonicEngine.js +183 -0
  23. package/build/engines/SupertonicEngine.js.map +1 -0
  24. package/build/engines/SystemEngine.d.ts +13 -0
  25. package/build/engines/SystemEngine.d.ts.map +1 -0
  26. package/build/engines/SystemEngine.js +78 -0
  27. package/build/engines/SystemEngine.js.map +1 -0
  28. package/build/index.d.ts +46 -0
  29. package/build/index.d.ts.map +1 -0
  30. package/build/index.js +118 -0
  31. package/build/index.js.map +1 -0
  32. package/build/types.d.ts +77 -0
  33. package/build/types.d.ts.map +1 -0
  34. package/build/types.js +2 -0
  35. package/build/types.js.map +1 -0
  36. package/build/voices/catalog.d.ts +12 -0
  37. package/build/voices/catalog.d.ts.map +1 -0
  38. package/build/voices/catalog.js +28 -0
  39. package/build/voices/catalog.js.map +1 -0
  40. package/build/voices/prosody.d.ts +8 -0
  41. package/build/voices/prosody.d.ts.map +1 -0
  42. package/build/voices/prosody.js +28 -0
  43. package/build/voices/prosody.js.map +1 -0
  44. package/expo-module.config.json +9 -0
  45. package/ios/RNTTSKit.podspec +28 -0
  46. package/ios/RNTTSKitModule.swift +133 -0
  47. package/ios/Supertonic/AudioEngine.swift +110 -0
  48. package/ios/Supertonic/ModelLocator.swift +416 -0
  49. package/ios/Supertonic/SupertonicSession.swift +405 -0
  50. package/ios/Supertonic/TextFrontend.swift +216 -0
  51. package/ios/Supertonic/VoicePack.swift +51 -0
  52. package/licenses/OpenRAIL-M.txt +209 -0
  53. package/package.json +77 -0
  54. package/src/engines/BufferedStreamEmitter.ts +50 -0
  55. package/src/engines/Engine.ts +28 -0
  56. package/src/engines/SupertonicEngine.ts +250 -0
  57. package/src/engines/SystemEngine.ts +96 -0
  58. package/src/engines/__tests__/BufferedStreamEmitter.test.ts +65 -0
  59. package/src/index.ts +156 -0
  60. package/src/types.ts +95 -0
  61. package/src/voices/__tests__/catalog.test.ts +46 -0
  62. package/src/voices/__tests__/prosody.test.ts +63 -0
  63. package/src/voices/catalog.ts +32 -0
  64. package/src/voices/prosody.ts +39 -0
package/ios/Supertonic/SupertonicSession.swift
@@ -0,0 +1,405 @@
+ import Foundation
+ import onnxruntime_objc
+
+ enum SupertonicError: LocalizedError {
+     case modelMissing
+     case voiceMissing(String)
+     case textTooLong
+     case cancelled
+     case configMissing(String)
+
+     var errorDescription: String? {
+         switch self {
+         case .modelMissing: return "Supertonic asset bundle is missing. Call TTSKit.prefetchModel() first."
+         case .voiceMissing(let id): return "Voice \(id) is not available."
+         case .textTooLong: return "Input text exceeds the maximum length."
+         case .cancelled: return "Synthesis cancelled."
+         case .configMissing(let key): return "Required config key missing: \(key)."
+         }
+     }
+ }
+
+ private struct TTSConfig: Decodable {
+     struct AE: Decodable { let sample_rate: Int; let base_chunk_size: Int }
+     struct TTL: Decodable { let chunk_compress_factor: Int; let latent_dim: Int }
+     let ae: AE
+     let ttl: TTL
+ }
+
+ /// Real Supertonic inference pipeline (ported from `supertone-inc/supertonic/swift`).
+ ///
+ /// Four ONNX sessions:
+ /// 1. duration_predictor → per-chunk duration in seconds
+ /// 2. text_encoder → text embedding tensor
+ /// 3. vector_estimator → iterative diffusion denoising (totalStep iterations)
+ /// 4. vocoder → final waveform
+ final class SupertonicSession {
+     private var env: ORTEnv?
+     private var dpSession: ORTSession?
+     private var encSession: ORTSession?
+     private var vecSession: ORTSession?
+     private var vocSession: ORTSession?
+     private var indexer: UnicodeIndexer?
+     private var voiceCache: [String: VoicePack] = [:]
+     private var config: TTSConfig?
+
+     private let cancelLock = NSLock()
+     private var _cancelled = false
+
+     var isReady: Bool { dpSession != nil && encSession != nil && vecSession != nil && vocSession != nil && indexer != nil && config != nil }
+     var sampleRate: Int { config?.ae.sample_rate ?? 24_000 }
+
+     func loadIfNeeded() throws {
+         guard !isReady else { return }
+         guard ModelLocator.modelExists() else { throw SupertonicError.modelMissing }
+
+         let env = try ORTEnv(loggingLevel: .warning)
+         let opts = try ORTSessionOptions()
+         try opts.setIntraOpNumThreads(2)
+         try opts.setGraphOptimizationLevel(.all)
+
+         self.env = env
+
+         // Per-file load + log. Without this, if one of the four sessions
+         // throws (e.g. fp16 type mismatch on vector_estimator) it's hard to
+         // tell from JS which file failed.
+         func loadSession(_ filename: String) throws -> ORTSession {
+             let path = ModelLocator.resolvedOnnxURL(for: filename).path
+             do {
+                 let s = try ORTSession(env: env, modelPath: path, sessionOptions: opts)
+                 NSLog("[ST.load] ok %@ (%@)", filename, ModelLocator.precision.rawValue)
+                 return s
+             } catch {
+                 NSLog("[ST.load] FAIL %@ (%@) at %@ — %@",
+                       filename, ModelLocator.precision.rawValue, path,
+                       String(describing: error))
+                 throw error
+             }
+         }
+
+         self.dpSession = try loadSession("duration_predictor.onnx")
+         self.encSession = try loadSession("text_encoder.onnx")
+         self.vecSession = try loadSession("vector_estimator.onnx")
+         self.vocSession = try loadSession("vocoder.onnx")
+
+         let cfgURL = ModelLocator.resolvedOnnxURL(for: "tts.json")
+         let cfgData = try Data(contentsOf: cfgURL)
+         self.config = try JSONDecoder().decode(TTSConfig.self, from: cfgData)
+
+         let idxURL = ModelLocator.resolvedOnnxURL(for: "unicode_indexer.json")
+         self.indexer = try UnicodeIndexer(url: idxURL)
+     }
+
+     func voicePack(for voiceId: String) throws -> VoicePack {
+         if let cached = voiceCache[voiceId] { return cached }
+         let url = ModelLocator.resolvedVoiceURL(for: voiceId)
+         guard FileManager.default.fileExists(atPath: url.path) else {
+             throw SupertonicError.voiceMissing(voiceId)
+         }
+         let pack = try VoicePack(voiceId: voiceId, url: url)
+         // Bound the cache. With 10 total voices we never actually evict in
+         // practice, but the cap means a future model expansion can't leak.
+         if voiceCache.count >= 8 {
+             voiceCache.removeAll()
+         }
+         voiceCache[voiceId] = pack
+         return pack
+     }
+
+     /// Warm the JSON-decode + tensor-allocation path for the most likely
+     /// first-tap voice. Called from `prefetch()` so the user's first `speak()`
+     /// doesn't pay the 50–150 ms voice-load cost mid-tap.
+     func prewarmDefaultVoice() {
+         _ = try? voicePack(for: "F1")
+     }
+
+     /// Drop all loaded sessions, indexer, voice tensors. Called from
+     /// `OnDestroy` so resources release deterministically rather than waiting
+     /// for ARC.
+     func tearDown() {
+         voiceCache.removeAll()
+         indexer = nil
+         config = nil
+         dpSession = nil
+         encSession = nil
+         vecSession = nil
+         vocSession = nil
+         env = nil
+     }
+
+     func beginRun() {
+         cancelLock.lock(); _cancelled = false; cancelLock.unlock()
+     }
+
+     func cancel() {
+         cancelLock.lock(); _cancelled = true; cancelLock.unlock()
+     }
+
+     private var isCancelled: Bool {
+         cancelLock.lock(); defer { cancelLock.unlock() }
+         return _cancelled
+     }
+
+     // MARK: - Inference
+
+     /// Synthesize a single (already-chunked) input. Returns float32 PCM in [-1, 1].
+     /// Use `synthesize` / `synthesizeStreaming` from the module layer; those handle chunking.
+     func synthesizeOne(text: String, lang: String, voiceId: String, totalStep: Int, speed: Double) throws -> [Float] {
+         // Per-stage timing. Logs once per call so the line count stays small;
+         // remove or guard with a debug flag before tagging a release if it
+         // turns out to be noisy.
+         let t0 = CFAbsoluteTimeGetCurrent()
+         func dMs(_ from: CFAbsoluteTime, _ to: CFAbsoluteTime) -> String {
+             String(format: "%.0f", (to - from) * 1000)
+         }
+
+         try loadIfNeeded()
+         let tLoad = CFAbsoluteTimeGetCurrent()
+         guard let cfg = config, env != nil, let indexer = indexer,
+               let dp = dpSession, let enc = encSession, let vec = vecSession, let voc = vocSession else {
+             throw SupertonicError.modelMissing
+         }
+         let voice = try voicePack(for: voiceId)
+         let tVoice = CFAbsoluteTimeGetCurrent()
+
+         let processed = TextFrontend.preprocess(text, lang: lang)
+         let textIds: [Int64] = indexer.encode(processed)
+         if textIds.isEmpty { return [] }
+         let bsz = 1
+         let textLen = textIds.count
+         let textMask: [Float] = Array(repeating: 1.0, count: textLen)
+         let tText = CFAbsoluteTimeGetCurrent()
+
+         if isCancelled { throw SupertonicError.cancelled }
+
+         let textIdsValue = try ORTValue(
+             tensorData: NSMutableData(bytes: textIds, length: textIds.count * MemoryLayout<Int64>.size),
+             elementType: .int64,
+             shape: [NSNumber(value: bsz), NSNumber(value: textLen)]
+         )
+         let textMaskValue = try ORTValue(
+             tensorData: NSMutableData(bytes: textMask, length: textMask.count * MemoryLayout<Float>.size),
+             elementType: .float,
+             shape: [NSNumber(value: bsz), NSNumber(value: 1), NSNumber(value: textLen)]
+         )
+
+         let tTensors = CFAbsoluteTimeGetCurrent()
+
+         // 1. Duration prediction.
+         let dpOut = try dp.run(
+             withInputs: ["text_ids": textIdsValue, "style_dp": voice.dpValue, "text_mask": textMaskValue],
+             outputNames: ["duration"],
+             runOptions: nil
+         )
+         guard let durValue = dpOut["duration"] else { throw SupertonicError.modelMissing }
+         let durData = try durValue.tensorData() as Data
+         var duration: [Float] = durData.withUnsafeBytes {
+             Array(UnsafeBufferPointer(start: $0.bindMemory(to: Float.self).baseAddress, count: durData.count / 4))
+         }
+         for i in 0..<duration.count { duration[i] /= Float(speed) }
+         let tDP = CFAbsoluteTimeGetCurrent()
+
+         if isCancelled { throw SupertonicError.cancelled }
+
+         // 2. Text encoder. Hold the ORTValue across the denoising loop —
+         // Swift ARC keeps it alive; the upstream Helper.swift reference uses
+         // the same pattern and works.
+         let encOut = try enc.run(
+             withInputs: ["text_ids": textIdsValue, "style_ttl": voice.ttlValue, "text_mask": textMaskValue],
+             outputNames: ["text_emb"],
+             runOptions: nil
+         )
+         guard let textEmb = encOut["text_emb"] else { throw SupertonicError.modelMissing }
+         let tEnc = CFAbsoluteTimeGetCurrent()
+
+         // 3. Sample initial noisy latent + mask.
+         let baseChunk = cfg.ae.base_chunk_size
+         let chunkCompress = cfg.ttl.chunk_compress_factor
+         let latentDimBase = cfg.ttl.latent_dim
+         let latentDim = latentDimBase * chunkCompress
+         let chunkSize = baseChunk * chunkCompress
+         let maxDur = duration.max() ?? 0
+         let wavLenMax = Int(maxDur * Float(cfg.ae.sample_rate))
+         let latentLen = (wavLenMax + chunkSize - 1) / chunkSize
+         let wavLengths = duration.map { Int($0 * Float(cfg.ae.sample_rate)) }
+         let latentLengths = wavLengths.map { ($0 + chunkSize - 1) / chunkSize }
+
+         var noisy = [Float](repeating: 0, count: bsz * latentDim * latentLen)
+         // Box-Muller -> gaussian noise, then masked.
+         var idx = 0
+         for b in 0..<bsz {
+             let lLen = latentLengths[b]
+             for _ in 0..<latentDim {
+                 for t in 0..<latentLen {
+                     if t < lLen {
+                         let u1 = Float.random(in: 1e-7...1.0)
+                         let u2 = Float.random(in: 0.0...1.0)
+                         noisy[idx] = sqrt(-2.0 * log(u1)) * cos(2.0 * .pi * u2)
+                     }
+                     idx += 1
+                 }
+             }
+         }
+         var latentMask = [Float](repeating: 0, count: bsz * 1 * latentLen)
+         for b in 0..<bsz {
+             for t in 0..<latentLengths[b] {
+                 latentMask[b * latentLen + t] = 1.0
+             }
+         }
+
+         let latentMaskValue = try ORTValue(
+             tensorData: NSMutableData(bytes: latentMask, length: latentMask.count * MemoryLayout<Float>.size),
+             elementType: .float,
+             shape: [NSNumber(value: bsz), NSNumber(value: 1), NSNumber(value: latentLen)]
+         )
+         let totalStepArr = [Float](repeating: Float(totalStep), count: bsz)
+         let totalStepValue = try ORTValue(
+             tensorData: NSMutableData(bytes: totalStepArr, length: totalStepArr.count * MemoryLayout<Float>.size),
+             elementType: .float,
+             shape: [NSNumber(value: bsz)]
+         )
+         let tNoise = CFAbsoluteTimeGetCurrent()
+
+         // 4. Denoising loop. Per-step time logged so we can see if the bottleneck
+         // is ramp-up (first step paying compile cost) vs. steady-state.
+         var stepTimes: [Double] = []
+         for step in 0..<totalStep {
+             let tStepStart = CFAbsoluteTimeGetCurrent()
+             if isCancelled { throw SupertonicError.cancelled }
+             let xtValue = try ORTValue(
+                 tensorData: NSMutableData(bytes: noisy, length: noisy.count * MemoryLayout<Float>.size),
+                 elementType: .float,
+                 shape: [NSNumber(value: bsz), NSNumber(value: latentDim), NSNumber(value: latentLen)]
+             )
+             let curStepArr = [Float](repeating: Float(step), count: bsz)
+             let curStepValue = try ORTValue(
+                 tensorData: NSMutableData(bytes: curStepArr, length: curStepArr.count * MemoryLayout<Float>.size),
+                 elementType: .float,
+                 shape: [NSNumber(value: bsz)]
+             )
+             let vecOut = try vec.run(
+                 withInputs: [
+                     "noisy_latent": xtValue,
+                     "text_emb": textEmb,
+                     "style_ttl": voice.ttlValue,
+                     "latent_mask": latentMaskValue,
+                     "text_mask": textMaskValue,
+                     "current_step": curStepValue,
+                     "total_step": totalStepValue
+                 ],
+                 outputNames: ["denoised_latent"],
+                 runOptions: nil
+             )
+             guard let denoised = vecOut["denoised_latent"] else { throw SupertonicError.modelMissing }
+             let dData = try denoised.tensorData() as Data
+             noisy = dData.withUnsafeBytes {
+                 Array(UnsafeBufferPointer(start: $0.bindMemory(to: Float.self).baseAddress, count: dData.count / 4))
+             }
+             stepTimes.append((CFAbsoluteTimeGetCurrent() - tStepStart) * 1000)
+         }
+         let tDiffusion = CFAbsoluteTimeGetCurrent()
+
+         if isCancelled { throw SupertonicError.cancelled }
+
+         // 5. Vocoder.
+         let finalLatent = try ORTValue(
+             tensorData: NSMutableData(bytes: noisy, length: noisy.count * MemoryLayout<Float>.size),
+             elementType: .float,
+             shape: [NSNumber(value: bsz), NSNumber(value: latentDim), NSNumber(value: latentLen)]
+         )
+         let vocOut = try voc.run(
+             withInputs: ["latent": finalLatent],
+             outputNames: ["wav_tts"],
+             runOptions: nil
+         )
+         guard let wav = vocOut["wav_tts"] else { throw SupertonicError.modelMissing }
+         let wavData = try wav.tensorData() as Data
+         var wavSamples: [Float] = wavData.withUnsafeBytes {
+             Array(UnsafeBufferPointer(start: $0.bindMemory(to: Float.self).baseAddress, count: wavData.count / 4))
+         }
+
+         // Trim to actual duration to drop silence padding.
+         let trimLen = min(wavSamples.count, Int(duration[0] * Float(cfg.ae.sample_rate)))
+         if trimLen > 0 && trimLen < wavSamples.count {
+             wavSamples = Array(wavSamples.prefix(trimLen))
+         }
+         let tVoc = CFAbsoluteTimeGetCurrent()
+
+         // One-line summary so this is easy to grep in Xcode console: "[ST.timing]"
+         let totalMs = (tVoc - t0) * 1000
+         let stepSummary = stepTimes.enumerated()
+             .map { String(format: "%d:%.0f", $0.offset, $0.element) }
+             .joined(separator: " ")
+         NSLog("[ST.timing] total=\(String(format: "%.0f", totalMs))ms "
+               + "load=\(dMs(t0, tLoad)) voice=\(dMs(tLoad, tVoice)) "
+               + "text=\(dMs(tVoice, tText)) tensors=\(dMs(tText, tTensors)) "
+               + "dp=\(dMs(tTensors, tDP)) enc=\(dMs(tDP, tEnc)) "
+               + "noise=\(dMs(tEnc, tNoise)) diffusion=\(dMs(tNoise, tDiffusion)) "
+               + "voc=\(dMs(tDiffusion, tVoc)) "
+               + "chars=\(textIds.count) latentLen=\(latentLen) steps=[\(stepSummary)]")
+         return wavSamples
+     }
+
+     /// Single-shot synthesis with chunking + 0.3s silence between chunks.
+     func synthesize(text: String, lang: String, voiceId: String, totalStep: Int, speed: Double) throws -> [Float] {
+         beginRun()
+         let chunks = TextFrontend.chunk(text, lang: lang)
+         if chunks.isEmpty { return [] }
+         let silenceSamples = Int(0.3 * Double(sampleRate))
+         let silence = [Float](repeating: 0, count: silenceSamples)
+
+         var out: [Float] = []
+         for (i, c) in chunks.enumerated() {
+             if isCancelled { throw SupertonicError.cancelled }
+             let pcm = try synthesizeOne(text: c, lang: lang, voiceId: voiceId, totalStep: totalStep, speed: speed)
+             if i > 0 { out.append(contentsOf: silence) }
+             out.append(contentsOf: pcm)
+         }
+         return out
+     }
+
+     /// Streaming: emit one chunk per sentence-ish unit to keep TTFA low.
+     func synthesizeStreaming(
+         text: String,
+         lang: String,
+         voiceId: String,
+         totalStep: Int,
+         speed: Double,
+         onChunk: ([Float]) -> Void
+     ) throws {
+         let tStart = CFAbsoluteTimeGetCurrent()
+         try loadIfNeeded()
+         beginRun()
+         let chunks = TextFrontend.chunk(text, lang: lang)
+         var firstChunkLogged = false
+         for c in chunks {
+             if isCancelled { throw SupertonicError.cancelled }
+             let pcm = try synthesizeOne(text: c, lang: lang, voiceId: voiceId, totalStep: totalStep, speed: speed)
+             if !pcm.isEmpty {
+                 if !firstChunkLogged {
+                     let ttfa = (CFAbsoluteTimeGetCurrent() - tStart) * 1000
+                     NSLog(String(format: "[ST.timing] TTFA=%.0fms (first chunk emitted, chunks=%d)", ttfa, chunks.count))
+                     firstChunkLogged = true
+                 }
+                 onChunk(pcm)
+             }
+         }
+     }
+
+     // MARK: - Bridge helpers
+
+     /// Converts float32 samples to little-endian PCM16 for transport across the JS bridge.
+     static func toPCM16(samples: [Float]) -> Data {
+         var out = Data(count: samples.count * 2)
+         out.withUnsafeMutableBytes { (raw: UnsafeMutableRawBufferPointer) in
+             let int16s = raw.bindMemory(to: Int16.self)
+             for i in 0..<samples.count {
+                 let clamped = max(-1.0, min(1.0, samples[i]))
+                 int16s[i] = Int16(clamped * 32767.0).littleEndian
+             }
+         }
+         return out
+     }
+ }
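
To make the session's contract concrete, here is a minimal sketch of a call site for the streaming path. It is not part of the package diff: the function name, queue choice, language, and `totalStep: 4` are illustrative assumptions; only `SupertonicSession`, `synthesizeStreaming`, `SupertonicError`, and `toPCM16` come from the file above.

```swift
import Foundation

// Hypothetical call site (not in the package): run streaming synthesis off
// the main thread and forward each sentence chunk's PCM16 bytes onward.
func speakStreaming(session: SupertonicSession,
                    text: String,
                    emit: @escaping (Data) -> Void,
                    fail: @escaping (Error) -> Void) {
    DispatchQueue.global(qos: .userInitiated).async {
        do {
            try session.synthesizeStreaming(
                text: text,
                lang: "en",
                voiceId: "F1",
                totalStep: 4,   // illustrative; the real value would come from JS options
                speed: 1.0
            ) { pcm in
                // Each chunk is float32 in [-1, 1]; convert once, then emit.
                emit(SupertonicSession.toPCM16(samples: pcm))
            }
        } catch SupertonicError.cancelled {
            // A cancel() racing the run is expected; not worth surfacing.
        } catch {
            fail(error)
        }
    }
}
```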
package/ios/Supertonic/TextFrontend.swift
@@ -0,0 +1,216 @@
+ import Foundation
+
+ /// Self-contained text frontend for Supertonic.
+ ///
+ /// Ported from `supertone-inc/supertonic/swift/Sources/Helper.swift` —
+ /// keep this file in sync if upstream's preprocessing changes.
+ ///
+ /// Key behaviors:
+ /// 1. NFKD-decompose text (`decomposedStringWithCompatibilityMapping`).
+ /// 2. Strip emoji + symbols, collapse whitespace, normalize punctuation spacing.
+ /// 3. Wrap with `<lang>...</lang>` markers.
+ /// 4. Tokenize: codepoint -> indexer[codepoint] -> Int64 token id.
+ ///    The `unicode_indexer.json` shipped alongside the model is a flat
+ ///    `[Int64]` array of length 2^16 (BMP); index by codepoint, get token id.
+ ///    Codepoints outside the array become -1 (the model's PAD/UNK).
+ enum TextFrontend {
+     static let availableLangs: Set<String> = [
+         "en", "ko", "ja", "ar", "bg", "cs", "da", "de", "el", "es", "et", "fi",
+         "fr", "hi", "hr", "hu", "id", "it", "lt", "lv", "nl", "pl", "pt", "ro",
+         "ru", "sk", "sl", "sv", "tr", "uk", "vi"
+     ]
+
+     /// Per-language max chunk length (matches upstream).
+     static func maxChunkLength(for lang: String) -> Int {
+         return (lang == "ko" || lang == "ja") ? 120 : 300
+     }
+
+     /// Apply upstream's text normalization rules and wrap with `<lang>...</lang>`.
+     static func preprocess(_ text: String, lang: String) -> String {
+         precondition(availableLangs.contains(lang), "Unsupported language: \(lang)")
+
+         var s = text.decomposedStringWithCompatibilityMapping
+
+         // Strip wide-Unicode emoji blocks.
+         s = String(String.UnicodeScalarView(s.unicodeScalars.filter { scalar in
+             let v = scalar.value
+             return !((0x1F600...0x1F64F).contains(v) ||
+                      (0x1F300...0x1F5FF).contains(v) ||
+                      (0x1F680...0x1F6FF).contains(v) ||
+                      (0x1F700...0x1F77F).contains(v) ||
+                      (0x1F780...0x1F7FF).contains(v) ||
+                      (0x1F800...0x1F8FF).contains(v) ||
+                      (0x1F900...0x1F9FF).contains(v) ||
+                      (0x1FA00...0x1FA6F).contains(v) ||
+                      (0x1FA70...0x1FAFF).contains(v) ||
+                      (0x2600...0x26FF).contains(v) ||
+                      (0x2700...0x27BF).contains(v) ||
+                      (0x1F1E6...0x1F1FF).contains(v))
+         }))
+
+         let replacements: [(String, String)] = [
+             ("\u{2013}", "-"), ("\u{2011}", "-"), ("\u{2014}", "-"),
+             ("_", " "),
+             ("\u{201C}", "\""), ("\u{201D}", "\""),
+             ("\u{2018}", "'"), ("\u{2019}", "'"),
+             ("´", "'"), ("`", "'"),
+             ("[", " "), ("]", " "),
+             ("|", " "), ("/", " "), ("#", " "),
+             ("→", " "), ("←", " ")
+         ]
+         for (k, v) in replacements { s = s.replacingOccurrences(of: k, with: v) }
+
+         for sym in ["♥", "☆", "♡", "©", "\\"] {
+             s = s.replacingOccurrences(of: sym, with: "")
+         }
+
+         for (k, v) in [("@", " at "), ("e.g.,", "for example, "), ("i.e.,", "that is, ")] {
+             s = s.replacingOccurrences(of: k, with: v)
+         }
+
+         let punctSpacing = [(" ,", ","), (" .", "."), (" !", "!"), (" ?", "?"),
+                             (" ;", ";"), (" :", ":"), (" '", "'")]
+         for (k, v) in punctSpacing { s = s.replacingOccurrences(of: k, with: v) }
+
+         while s.contains("\"\"") { s = s.replacingOccurrences(of: "\"\"", with: "\"") }
+         while s.contains("''") { s = s.replacingOccurrences(of: "''", with: "'") }
+         while s.contains("``") { s = s.replacingOccurrences(of: "``", with: "`") }
+
+         let ws = try! NSRegularExpression(pattern: "\\s+")
+         s = ws.stringByReplacingMatches(
+             in: s,
+             range: NSRange(s.startIndex..., in: s),
+             withTemplate: " "
+         ).trimmingCharacters(in: .whitespacesAndNewlines)
+
+         if !s.isEmpty {
+             let endsWithPunct = try! NSRegularExpression(
+                 pattern: "[.!?;:,'\"\\u201C\\u201D\\u2018\\u2019)\\]}…。」』】〉》›»]$"
+             )
+             if endsWithPunct.firstMatch(in: s, range: NSRange(s.startIndex..., in: s)) == nil {
+                 s += "."
+             }
+         }
+         return "<\(lang)>\(s)</\(lang)>"
+     }
+
+     /// Sentence-aware chunking, mirrors `chunkText` in upstream Helper.swift.
+     /// Splits at hard sentence boundaries, then greedily re-joins sentences
+     /// up to the per-language length limit (see `greedyJoin`).
+     private static let abbreviations: Set<String> = [
+         "Dr.", "Mr.", "Mrs.", "Ms.", "Prof.", "Sr.", "Jr.",
+         "St.", "Ave.", "Rd.", "Blvd.", "Dept.", "Inc.", "Ltd.",
+         "Co.", "Corp.", "etc.", "vs.", "i.e.", "e.g.", "Ph.D."
+     ]
+
+     static func chunk(_ text: String, lang: String) -> [String] {
+         let maxLen = maxChunkLength(for: lang)
+         let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines)
+         if trimmed.isEmpty { return [] }
+
+         let paragraphs = splitParagraphs(trimmed)
+
+         var chunks: [String] = []
+         for p in paragraphs.isEmpty ? [trimmed] : paragraphs {
+             if p.count <= maxLen { chunks.append(p); continue }
+             chunks.append(contentsOf: greedyJoin(splitSentences(p), maxLen: maxLen))
+         }
+         return chunks.isEmpty ? [trimmed] : chunks
+     }
+
+     /// Split on `\n\s*\n+` — blank-line paragraph boundaries, mirroring
+     /// upstream Helper.swift / helper.py. The previous implementation used a
+     /// no-op ternary that never invoked the regex.
+     private static func splitParagraphs(_ text: String) -> [String] {
+         let regex = try! NSRegularExpression(pattern: "\\n\\s*\\n+")
+         let nsText = text as NSString
+         let fullRange = NSRange(location: 0, length: nsText.length)
+         let matches = regex.matches(in: text, range: fullRange)
+         if matches.isEmpty {
+             return [text].filter { !$0.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }
+         }
+         var out: [String] = []
+         var cursor = 0
+         for m in matches {
+             let piece = nsText.substring(with: NSRange(location: cursor, length: m.range.location - cursor))
+             let trimmed = piece.trimmingCharacters(in: .whitespacesAndNewlines)
+             if !trimmed.isEmpty { out.append(trimmed) }
+             cursor = m.range.location + m.range.length
+         }
+         if cursor < nsText.length {
+             let tail = nsText.substring(from: cursor).trimmingCharacters(in: .whitespacesAndNewlines)
+             if !tail.isEmpty { out.append(tail) }
+         }
+         return out
+     }
+
+     /// Split into sentences. Now recognises Asian terminal punctuation
+     /// (`。!?`) which doesn't require trailing whitespace, so long
+     /// Japanese / Chinese strings actually chunk instead of collapsing into a
+     /// single oversized chunk that the model truncates.
+     private static func splitSentences(_ text: String) -> [String] {
+         // Latin-style: sentence-ender + whitespace.
+         // Asian-style: 。!? — whitespace optional.
+         let regex = try! NSRegularExpression(pattern: "([.!?])\\s+|([。!?])")
+         let range = NSRange(text.startIndex..., in: text)
+         let matches = regex.matches(in: text, range: range)
+         if matches.isEmpty { return [text] }
+
+         var sentences: [String] = []
+         var lastEnd = text.startIndex
+
+         for m in matches {
+             guard let r = Range(m.range, in: text) else { continue }
+             let before = String(text[lastEnd..<r.lowerBound])
+             let punc = String(text[Range(NSRange(location: m.range.location, length: 1), in: text)!])
+             let combined = before.trimmingCharacters(in: .whitespaces) + punc
+             let isAbbrev = abbreviations.contains { combined.hasSuffix($0) }
+             if !isAbbrev {
+                 sentences.append(String(text[lastEnd..<r.upperBound]))
+                 lastEnd = r.upperBound
+             }
+         }
+         if lastEnd < text.endIndex { sentences.append(String(text[lastEnd...])) }
+         return sentences.isEmpty ? [text] : sentences
+     }
+
+     private static func greedyJoin(_ pieces: [String], maxLen: Int) -> [String] {
+         var out: [String] = []
+         var current = ""
+         for piece in pieces {
+             let p = piece.trimmingCharacters(in: .whitespacesAndNewlines)
+             if p.isEmpty { continue }
+             if current.isEmpty {
+                 current = p
+             } else if current.count + 1 + p.count <= maxLen {
+                 current += " " + p
+             } else {
+                 out.append(current)
+                 current = p
+             }
+         }
+         if !current.isEmpty { out.append(current) }
+         return out
+     }
+ }
+
+ /// Loads `unicode_indexer.json` and turns text into Int64 token IDs.
+ final class UnicodeIndexer {
+     private let table: [Int64]
+
+     init(url: URL) throws {
+         let data = try Data(contentsOf: url, options: .mappedIfSafe)
+         self.table = try JSONDecoder().decode([Int64].self, from: data)
+     }
+
+     /// Encode a single string into Int64 token ids using NFKD codepoint indexing.
+     func encode(_ text: String) -> [Int64] {
+         var out: [Int64] = []
+         out.reserveCapacity(text.unicodeScalars.count)
+         for scalar in text.unicodeScalars {
+             let v = Int(scalar.value)
+             out.append(v < table.count ? table[v] : -1)
+         }
+         return out
+     }
+ }
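
The frontend's behavior is easiest to see on a concrete input, so here is a short illustrative sketch (not part of the diff; the sample string is invented) of what `chunk` and `preprocess` produce before any ONNX session runs.

```swift
let raw = "Dr. Smith arrived.  He said \u{201C}hi\u{201D} 👋"

// Under the 300-char English limit, so chunk() returns the text verbatim as
// a single chunk; splitSentences (with its abbreviation guard for "Dr.")
// only kicks in for over-long paragraphs.
let chunks = TextFrontend.chunk(raw, lang: "en")
// chunks == ["Dr. Smith arrived.  He said “hi” 👋"]

// Normalization: NFKD, emoji stripped, curly quotes straightened, whitespace
// collapsed; the result already ends in a quote, so no "." is appended.
let pre = TextFrontend.preprocess(raw, lang: "en")
// pre == "<en>Dr. Smith arrived. He said \"hi\"</en>"

// pre then goes through UnicodeIndexer.encode: one Int64 id per scalar,
// with out-of-table codepoints mapped to -1.
```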
package/ios/Supertonic/VoicePack.swift
@@ -0,0 +1,51 @@
+ import Foundation
+ import onnxruntime_objc
+
+ /// Loads a single voice style file from upstream's `voice_styles/<id>.json`.
+ ///
+ /// Each file contains two 3D float tensors: `style_ttl` feeds the text
+ /// encoder and the vector estimator, `style_dp` feeds the duration predictor:
+ ///
+ /// ```json
+ /// {
+ ///   "style_ttl": { "data": [[[…]]], "dims": [1, D1, D2], "type": "f32" },
+ ///   "style_dp":  { "data": [[[…]]], "dims": [1, D1, D2], "type": "f32" }
+ /// }
+ /// ```
+ final class VoicePack {
+     let voiceId: String
+     let ttlValue: ORTValue
+     let dpValue: ORTValue
+
+     init(voiceId: String, url: URL) throws {
+         self.voiceId = voiceId
+         let data = try Data(contentsOf: url)
+         let decoded = try JSONDecoder().decode(VoiceStyleJSON.self, from: data)
+
+         let ttlFlat = decoded.style_ttl.data.flatMap { $0.flatMap { $0 } }
+         let ttlShape = decoded.style_ttl.dims.map { NSNumber(value: $0) }
+         self.ttlValue = try ORTValue(
+             tensorData: NSMutableData(bytes: ttlFlat, length: ttlFlat.count * MemoryLayout<Float>.size),
+             elementType: .float,
+             shape: ttlShape
+         )
+
+         let dpFlat = decoded.style_dp.data.flatMap { $0.flatMap { $0 } }
+         let dpShape = decoded.style_dp.dims.map { NSNumber(value: $0) }
+         self.dpValue = try ORTValue(
+             tensorData: NSMutableData(bytes: dpFlat, length: dpFlat.count * MemoryLayout<Float>.size),
+             elementType: .float,
+             shape: dpShape
+         )
+     }
+ }
+
+ private struct VoiceStyleJSON: Decodable {
+     struct Component: Decodable {
+         let data: [[[Float]]]
+         let dims: [Int]
+         let type: String
+     }
+     let style_ttl: Component
+     let style_dp: Component
+ }
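
As a shape sanity check, here is a sketch of decoding a minimal style file through `VoicePack`. The file contents and the temp-directory path are invented for illustration; the one real constraint, visible in the initializer above, is that `dims` must agree with the nesting of `data`, since `data` is flattened and `dims` alone determines the ORT tensor shape.

```swift
import Foundation

// Illustrative 1×2×3 style file (not a real voice from the package).
let json = """
{
  "style_ttl": { "data": [[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]], "dims": [1, 2, 3], "type": "f32" },
  "style_dp":  { "data": [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]], "dims": [1, 2, 3], "type": "f32" }
}
"""
let url = FileManager.default.temporaryDirectory.appendingPathComponent("F1.json")
try json.data(using: .utf8)!.write(to: url)

let pack = try VoicePack(voiceId: "F1", url: url)
// pack.ttlValue → float tensor of shape [1, 2, 3] for text_encoder / vector_estimator
// pack.dpValue  → float tensor of shape [1, 2, 3] for duration_predictor
```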