@elizaos/capacitor-bun-runtime 2.0.3-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/ElizaosCapacitorBunRuntime.podspec +54 -0
  2. package/LICENSE +21 -0
  3. package/README.md +127 -0
  4. package/dist/esm/definitions.d.ts +136 -0
  5. package/dist/esm/definitions.d.ts.map +1 -0
  6. package/dist/esm/definitions.js +14 -0
  7. package/dist/esm/definitions.js.map +1 -0
  8. package/dist/esm/index.d.ts +9 -0
  9. package/dist/esm/index.d.ts.map +1 -0
  10. package/dist/esm/index.js +11 -0
  11. package/dist/esm/index.js.map +1 -0
  12. package/dist/esm/web.d.ts +19 -0
  13. package/dist/esm/web.d.ts.map +1 -0
  14. package/dist/esm/web.js +44 -0
  15. package/dist/esm/web.js.map +1 -0
  16. package/dist/plugin.cjs.js +63 -0
  17. package/dist/plugin.cjs.js.map +1 -0
  18. package/dist/plugin.js +66 -0
  19. package/dist/plugin.js.map +1 -0
  20. package/ios/Sources/ElizaBunRuntimePlugin/BridgeInstaller.swift +94 -0
  21. package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntime.swift +705 -0
  22. package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntimePlugin.swift +1109 -0
  23. package/ios/Sources/ElizaBunRuntimePlugin/FullBunEngineHost.swift +677 -0
  24. package/ios/Sources/ElizaBunRuntimePlugin/JSContextHelpers.swift +226 -0
  25. package/ios/Sources/ElizaBunRuntimePlugin/SandboxPaths.swift +46 -0
  26. package/ios/Sources/ElizaBunRuntimePlugin/bridge/CryptoBridge.swift +238 -0
  27. package/ios/Sources/ElizaBunRuntimePlugin/bridge/ElizaSqliteVecBridge.m +28 -0
  28. package/ios/Sources/ElizaBunRuntimePlugin/bridge/FSBridge.swift +270 -0
  29. package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPBridge.swift +153 -0
  30. package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPServerBridge.swift +32 -0
  31. package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridge.swift +233 -0
  32. package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridgeImpl.swift +1863 -0
  33. package/ios/Sources/ElizaBunRuntimePlugin/bridge/LogBridge.swift +36 -0
  34. package/ios/Sources/ElizaBunRuntimePlugin/bridge/PathsBridge.swift +41 -0
  35. package/ios/Sources/ElizaBunRuntimePlugin/bridge/ProcessBridge.swift +80 -0
  36. package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridge.swift +406 -0
  37. package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridgeInstaller.swift +17 -0
  38. package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteVecLoader.swift +66 -0
  39. package/ios/Sources/ElizaBunRuntimePlugin/bridge/UIBridge.swift +72 -0
  40. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlChinesePhonemizer.swift +313 -0
  41. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlConfiguration.swift +28 -0
  42. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlEngine.swift +325 -0
  43. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlHindiPhonemizer.swift +150 -0
  44. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlJapanesePhonemizer.swift +209 -0
  45. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlLatinPhonemizer.swift +374 -0
  46. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlModel.swift +87 -0
  47. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPhonemizer.swift +679 -0
  48. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPronunciationDicts.swift +131 -0
  49. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlSupport.swift +24 -0
  50. package/ios/Tests/llama-bridge-smoke-main.swift +92 -0
  51. package/package.json +68 -0
  52. package/src/bridge-contract.test.ts +127 -0
  53. package/src/definitions.d.ts +136 -0
  54. package/src/definitions.d.ts.map +1 -0
  55. package/src/definitions.ts +152 -0
  56. package/src/index.d.ts +9 -0
  57. package/src/index.d.ts.map +1 -0
  58. package/src/index.ts +16 -0
  59. package/src/web.d.ts +19 -0
  60. package/src/web.d.ts.map +1 -0
  61. package/src/web.ts +80 -0
@@ -0,0 +1,679 @@
1
+ import CoreML
2
+ import Foundation
3
+ import NaturalLanguage
4
+
5
+ /// Multilingual phonemizer for Kokoro TTS.
6
+ ///
7
+ /// English: dictionary lookup → suffix stemming → CoreML BART G2P fallback.
8
+ /// Chinese/Japanese/Italian: dedicated language-specific phonemizers.
9
+ /// French/Spanish/Portuguese/Hindi: pronunciation dictionary + rule-based G2P.
10
+ public final class KokoroPhonemizer {
11
+
12
+ /// IPA symbol → token ID mapping (from vocab_index.json).
13
+ private let vocab: [String: Int]
14
+
15
+ /// Reverse mapping for debugging.
16
+ private let idToToken: [Int: String]
17
+
18
+ /// Gold dictionary (high-confidence entries).
19
+ private var goldDict: [String: DictEntry] = [:]
20
+
21
+ /// Silver dictionary (lower-confidence entries).
22
+ private var silverDict: [String: DictEntry] = [:]
23
+
24
+ /// CoreML G2P encoder model.
25
+ private var g2pEncoder: MLModel?
26
+
27
+ /// CoreML G2P decoder model.
28
+ private var g2pDecoder: MLModel?
29
+
30
+ /// G2P vocabulary mappings.
31
+ private var graphemeToId: [String: Int] = [:]
32
+ private var idToPhoneme: [Int: String] = [:]
33
+ private var g2pBosId: Int = 1
34
+ private var g2pEosId: Int = 2
35
+ private var g2pPadId: Int = 0
36
+
37
+ /// NL tagger for POS tagging (heteronym resolution).
38
+ private let tagger = NLTagger(tagSchemes: [.lexicalClass])
39
+
40
+ /// Pad token ID (0).
41
+ public let padId: Int = 0
42
+
43
+ /// Start-of-sequence token ID.
44
+ public let bosId: Int = 1
45
+
46
+ /// End-of-sequence token ID.
47
+ public let eosId: Int = 2
48
+
49
/// A single value of a pronunciation dictionary.
enum DictEntry {
    /// One phoneme string, used regardless of the part-of-speech tag.
    case simple(String)

    /// Several phoneme strings keyed by tag (a heteronym). The tag vocabulary
    /// is whatever the dictionary file uses; `"DEFAULT"` is the fallback key
    /// consulted when no tag matches.
    case heteronym([String: String])
}
54
+
55
/// Creates a phonemizer over a symbol → token-ID vocabulary.
///
/// - Parameter vocab: Maps each phoneme symbol to its token ID. Several
///   symbols may share one ID; the reverse (debug) table then keeps the
///   lexicographically smallest symbol for that ID.
public init(vocab: [String: Int]) {
    self.vocab = vocab
    // `Dictionary(uniqueKeysWithValues:)` traps at runtime when two symbols map
    // to the same ID. Merge duplicates instead, choosing deterministically so
    // the result does not depend on dictionary iteration order.
    self.idToToken = Dictionary(
        vocab.map { ($0.value, $0.key) },
        uniquingKeysWith: { min($0, $1) }
    )
}
60
+
61
/// Builds a phonemizer from a vocabulary JSON file.
///
/// Two layouts are accepted: nested `{"vocab": {"symbol": id, ...}, ...}`
/// (checked first) and flat `{"symbol": id, ...}`.
///
/// - Parameter url: Location of the JSON file.
/// - Returns: A phonemizer initialised with the decoded vocabulary.
/// - Throws: File-read or JSON-parse errors, or an `NSError` in domain
///   `"KokoroTTS"` (code -1) when neither layout matches.
public static func loadVocab(from url: URL) throws -> KokoroPhonemizer {
    let parsed = try JSONSerialization.jsonObject(with: Data(contentsOf: url))

    // Nested layout wins when both casts would succeed.
    if let wrapper = parsed as? [String: Any],
       let table = wrapper["vocab"] as? [String: Int] {
        return KokoroPhonemizer(vocab: table)
    }

    guard let table = parsed as? [String: Int] else {
        throw NSError(domain: "KokoroTTS", code: -1,
                      userInfo: [NSLocalizedDescriptionKey: "Invalid vocab_index.json format"])
    }
    return KokoroPhonemizer(vocab: table)
}
80
+
81
/// Loads the gold and silver pronunciation dictionaries found in `directory`.
///
/// File names are `<prefix>_gold.json` and `<prefix>_silver.json`, where the
/// prefix is `gb` when `british` is true and `us` otherwise. A file that does
/// not exist is skipped and the corresponding dictionary keeps its current
/// contents.
///
/// - Parameters:
///   - directory: Folder containing the dictionary files.
///   - british: Selects the `gb_` files instead of the `us_` files.
/// - Throws: Any error raised while reading or parsing an existing file.
public func loadDictionaries(from directory: URL, british: Bool = false) throws {
    let variant = british ? "gb" : "us"

    // Parses and case-expands one file, or returns nil when it is absent.
    func loadIfPresent(_ fileName: String) throws -> [String: DictEntry]? {
        let fileURL = directory.appendingPathComponent(fileName)
        guard FileManager.default.fileExists(atPath: fileURL.path) else { return nil }
        var entries = try parseDictionary(from: fileURL)
        growDictionary(&entries)
        return entries
    }

    if let gold = try loadIfPresent("\(variant)_gold.json") {
        goldDict = gold
    }
    if let silver = try loadIfPresent("\(variant)_silver.json") {
        silverDict = silver
    }
}
96
+
97
/// Loads the separate G2P encoder and decoder CoreML models plus their vocabulary.
///
/// Both models are configured for CPU-only execution. The vocabulary file is
/// optional: when it is missing, or its top level is not a JSON object, the
/// current grapheme/phoneme tables and special-token IDs are left as they are.
///
/// Nothing is stored on the instance until every throwing step has succeeded,
/// so a failure cannot leave new models paired with a stale vocabulary.
///
/// - Parameters:
///   - encoderURL: Compiled encoder model location.
///   - decoderURL: Compiled decoder model location.
///   - vocabURL: JSON file with `grapheme_to_id`, `id_to_phoneme` and the
///     optional `bos_token_id` / `eos_token_id` / `pad_token_id` keys.
/// - Throws: Model-loading, file-read or JSON-parse errors.
public func loadG2PModels(encoderURL: URL, decoderURL: URL, vocabURL: URL) throws {
    let config = MLModelConfiguration()
    config.computeUnits = .cpuOnly
    let encoder = try MLModel(contentsOf: encoderURL, configuration: config)
    let decoder = try MLModel(contentsOf: decoderURL, configuration: config)

    // Parse the vocabulary before touching any stored state.
    var vocabJSON: [String: Any]?
    if FileManager.default.fileExists(atPath: vocabURL.path) {
        let data = try Data(contentsOf: vocabURL)
        vocabJSON = try JSONSerialization.jsonObject(with: data) as? [String: Any]
    }

    g2pEncoder = encoder
    g2pDecoder = decoder

    guard let json = vocabJSON else { return }

    if let g2id = json["grapheme_to_id"] as? [String: Int] {
        graphemeToId = g2id
    }
    if let id2p = json["id_to_phoneme"] as? [String: String] {
        // Keys that are not integers are skipped. Distinct strings can parse to
        // the same integer ("1" and "01"); `uniqueKeysWithValues` would trap on
        // that, so merge deterministically instead.
        idToPhoneme = Dictionary(
            id2p.compactMap { pair in Int(pair.key).map { ($0, pair.value) } },
            uniquingKeysWith: { min($0, $1) }
        )
    }
    g2pBosId = (json["bos_token_id"] as? Int) ?? 1
    g2pEosId = (json["eos_token_id"] as? Int) ?? 2
    g2pPadId = (json["pad_token_id"] as? Int) ?? 0
}
122
+
123
/// Legacy single-model G2P loading, kept for backward compatibility.
///
/// Only the encoder slot is filled; the decoder is left untouched.
///
/// - Parameter url: Compiled CoreML model location.
/// - Throws: Any error raised while loading the model.
public func loadG2PModel(from url: URL) throws {
    let cpuOnly = MLModelConfiguration()
    cpuOnly.computeUnits = .cpuOnly
    let model = try MLModel(contentsOf: url, configuration: cpuOnly)
    g2pEncoder = model
}
129
+
130
+ // MARK: - Multilingual Phonemizers
131
+
132
+ private lazy var chinesePhonemizer = ChinesePhonemizer()
133
+ private lazy var japanesePhonemizer = JapanesePhonemizer()
134
+ private lazy var hindiPhonemizer = HindiPhonemizer()
135
+ private lazy var frenchPhonemizer = LatinPhonemizer(language: .french)
136
+ private lazy var spanishPhonemizer = LatinPhonemizer(language: .spanish)
137
+ private lazy var portuguesePhonemizer = LatinPhonemizer(language: .portuguese)
138
+ private lazy var italianPhonemizer = LatinPhonemizer(language: .italian)
139
+
140
+ // MARK: - Tokenization
141
+
142
/// Converts text to phoneme token IDs using the phonemizer for `language`.
///
/// The result always starts with `bosId` and ends with `eosId`. Phoneme
/// symbols that are not in the vocabulary are dropped.
///
/// - Parameters:
///   - text: Input text.
///   - maxLength: Maximum number of IDs returned, including BOS and EOS.
///     Longer sequences are cut and re-terminated with `eosId`. A value of
///     zero or less yields an empty array.
///   - language: Language code or name (for example `"fr"` or `"french"`).
///     Matching is case-sensitive; unrecognised values use the English pipeline.
/// - Returns: Token IDs for the phonemized text.
public func tokenize(_ text: String, maxLength: Int = 510, language: String = "en") -> [Int] {
    let phonemes: String
    switch language {
    case "zh", "cmn", "chinese", "mandarin":
        phonemes = chinesePhonemizer.phonemize(text)
    case "ja", "japanese":
        phonemes = japanesePhonemizer.phonemize(text)
    case "it", "italian":
        phonemes = phonemizeWithDict(text, dict: PronunciationDicts.it, fallback: italianPhonemizer)
    case "fr", "french":
        phonemes = phonemizeWithDict(text, dict: PronunciationDicts.fr, fallback: frenchPhonemizer)
    case "es", "spanish":
        phonemes = phonemizeWithDict(text, dict: PronunciationDicts.es, fallback: spanishPhonemizer)
    case "pt", "portuguese":
        phonemes = phonemizeWithDict(text, dict: PronunciationDicts.pt, fallback: portuguesePhonemizer)
    case "hi", "hindi":
        phonemes = phonemizeWithDict(text, dict: PronunciationDicts.hi, fallback: hindiPhonemizer)
    default:
        phonemes = textToPhonemes(text)
    }

    var ids = [bosId]

    for char in phonemes {
        if let id = vocab[String(char)] {
            ids.append(id)
            continue
        }
        // A Swift `Character` is a whole grapheme cluster, so a base letter plus
        // a combining mark (nasal tilde, tie bar, ...) is one element. If the
        // vocabulary lists those scalars separately, look them up one by one
        // instead of discarding the entire cluster.
        guard char.unicodeScalars.count > 1 else { continue }
        for scalar in char.unicodeScalars {
            if let id = vocab[String(scalar)] {
                ids.append(id)
            }
        }
    }

    ids.append(eosId)

    if ids.count > maxLength {
        // `prefix(_:)` traps on a negative length, so handle non-positive limits here.
        guard maxLength > 0 else { return [] }
        ids = Array(ids.prefix(maxLength - 1)) + [eosId]
    }

    return ids
}
183
+
184
/// Returns `ids` resized to exactly `length` elements.
///
/// Longer input is truncated from the end; shorter input is extended with `padId`.
///
/// - Parameters:
///   - ids: Token IDs to resize.
///   - length: Required element count.
/// - Returns: An array of `length` token IDs.
public func pad(_ ids: [Int], to length: Int) -> [Int] {
    guard ids.count < length else {
        return Array(ids.prefix(length))
    }
    let filler = Array(repeating: padId, count: length - ids.count)
    return ids + filler
}
189
+
190
+ // MARK: - Dictionary-Based Phonemization
191
+
192
/// Phonemizes text by dictionary lookup with a rule-based fallback.
///
/// The text is split on whitespace (including newlines). Each token is
/// lower-cased and stripped of leading and trailing punctuation or symbols;
/// the remaining word is looked up in `dict`, and words not found there are
/// passed to `fallback`. Words are joined with a single space unless
/// punctuation followed the previous word.
///
/// - Parameters:
///   - text: Input text.
///   - dict: Lower-cased word → IPA table.
///   - fallback: Rule-based phonemizer for words missing from `dict`.
/// - Returns: The IPA string for the whole text.
private func phonemizeWithDict(_ text: String, dict: [String: String], fallback: LatinPhonemizer) -> String {
    var result = ""
    var lastWasWord = false

    // `split(whereSeparator:)` drops empty pieces and, unlike
    // `components(separatedBy: .whitespaces)`, also breaks on newlines.
    for token in text.split(whereSeparator: \.isWhitespace) {
        var clean = token.lowercased()

        // Peel punctuation/symbols off the end, preserving their order.
        var trailing = ""
        while let last = clean.last, last.isPunctuation || last.isSymbol {
            trailing = String(last) + trailing
            clean.removeLast()
        }
        // ...and off the front.
        var leading = ""
        while let first = clean.first, first.isPunctuation || first.isSymbol {
            leading.append(first)
            clean.removeFirst()
        }

        for ch in leading {
            if let mapped = mapPunctuation(ch) { result += mapped }
        }

        if !clean.isEmpty {
            if lastWasWord { result += " " }
            // Dictionary lookup first, rule-based G2P otherwise.
            result += dict[clean] ?? fallback.phonemizeWord(clean)
            lastWasWord = true
        }

        for ch in trailing {
            if let mapped = mapPunctuation(ch) { result += mapped }
        }
        // Any trailing punctuation, mapped or not, suppresses the next joining space.
        if !trailing.isEmpty { lastWasWord = false }
    }

    return result
}
258
+
259
/// Dictionary-based phonemization for Hindi, using `HindiPhonemizer` as fallback.
///
/// The text is split on whitespace (including newlines). Leading and trailing
/// punctuation or symbols are stripped from each token before the lookup, so a
/// quoted or bracketed word still matches its dictionary entry. Tokens are not
/// case-folded.
///
/// - Parameters:
///   - text: Input text.
///   - dict: Word → IPA table.
///   - fallback: Rule-based phonemizer for words missing from `dict`.
/// - Returns: The IPA string for the whole text.
private func phonemizeWithDict(_ text: String, dict: [String: String], fallback: HindiPhonemizer) -> String {
    var result = ""
    var lastWasWord = false

    for token in text.split(whereSeparator: \.isWhitespace) {
        var clean = String(token)

        var trailing = ""
        while let last = clean.last, last.isPunctuation || last.isSymbol {
            trailing = String(last) + trailing
            clean.removeLast()
        }
        var leading = ""
        while let first = clean.first, first.isPunctuation || first.isSymbol {
            leading.append(first)
            clean.removeFirst()
        }

        for ch in leading {
            if let mapped = mapPunctuation(ch) { result += mapped }
        }

        if !clean.isEmpty {
            if lastWasWord { result += " " }
            result += dict[clean] ?? fallback.phonemizeWord(clean)
            lastWasWord = true
        }

        for ch in trailing {
            if let mapped = mapPunctuation(ch) { result += mapped }
        }
        // Any trailing punctuation, mapped or not, suppresses the next joining space.
        if !trailing.isEmpty { lastWasWord = false }
    }

    return result
}
291
+
292
/// Maps a punctuation character to the punctuation symbol emitted in the
/// phoneme string, or `nil` when the character should be dropped.
///
/// ASCII marks map to themselves. Their fullwidth (CJK) counterparts, the
/// ideographic full stop and the Devanagari danda map to the ASCII equivalent.
/// Non-ASCII characters are written as Unicode escapes so that they survive
/// tooling that folds compatibility characters to ASCII.
///
/// - Parameter ch: A punctuation or symbol character.
/// - Returns: `","`, `"."`, `"!"`, `"?"`, `";"` or `":"`, or `nil`.
private func mapPunctuation(_ ch: Character) -> String? {
    switch ch {
    case ",", "\u{FF0C}": return ","   // fullwidth comma
    case ".", "\u{3002}": return "."   // ideographic full stop
    case "!", "\u{FF01}": return "!"   // fullwidth exclamation mark
    case "?", "\u{FF1F}": return "?"   // fullwidth question mark
    case ";", "\u{FF1B}": return ";"   // fullwidth semicolon
    case ":", "\u{FF1A}": return ":"   // fullwidth colon
    case "\u{0964}": return "."        // Devanagari danda
    default: return nil
    }
}
304
+
305
+ // MARK: - Text-to-Phoneme Pipeline
306
+
307
/// Runs the English text → phoneme pipeline.
///
/// The text is normalized, split into word / whitespace / punctuation tokens
/// and tagged for part of speech. Whitespace tokens become a single space,
/// punctuation tokens are mapped (or dropped when unmapped), and every other
/// token is resolved to phonemes using the tag recorded for its lower-cased form.
///
/// - Parameter text: Input text.
/// - Returns: The concatenated phoneme string.
func textToPhonemes(_ text: String) -> String {
    let normalized = normalizeText(text)
    let tokens = splitWords(normalized)
    let tagsByWord = tagPOS(normalized)

    return tokens.reduce(into: "") { output, token in
        if token.allSatisfy(\.isWhitespace) {
            output += " "
        } else if token.allSatisfy({ $0.isPunctuation || $0.isSymbol }) {
            output += punctuationToPhoneme(token) ?? ""
        } else if let phonemes = resolveWord(token, pos: tagsByWord[token.lowercased()]) {
            output += phonemes
        }
    }
}
331
+
332
+ // MARK: - Word Resolution
333
+
334
/// Resolves one word to phonemes.
///
/// Sources are tried in order on the lower-cased word: hard-coded special
/// cases, the dictionaries, suffix stemming, the neural G2P model. When all
/// of them fail the lower-cased word itself is returned, so the result is
/// never `nil` in practice.
///
/// - Parameters:
///   - word: The word as it appears in the text.
///   - pos: Part-of-speech tag for the word, if known.
/// - Returns: The phoneme string, or the lower-cased word as a last resort.
private func resolveWord(_ word: String, pos: String?) -> String? {
    let lower = word.lowercased()
    return specialCase(lower, pos: pos)
        ?? lookupDict(lower, pos: pos)
        ?? stemAndLookup(lower)
        ?? bartG2P(lower)
        ?? lower
}
342
+
343
/// Looks a word up in the gold dictionary, then in the silver dictionary.
///
/// - Parameters:
///   - word: Dictionary key to look up (used verbatim).
///   - pos: Part-of-speech tag used to disambiguate heteronym entries.
/// - Returns: The phonemes for the first matching entry, or `nil` if neither
///   dictionary contains `word`.
private func lookupDict(_ word: String, pos: String?) -> String? {
    guard let entry = goldDict[word] ?? silverDict[word] else { return nil }
    return resolveEntry(entry, pos: pos)
}
348
+
349
/// Picks the phoneme string of a dictionary entry for a part-of-speech tag.
///
/// For a heteronym the lookup order is: the tag as given, the tag upper-cased,
/// the `"DEFAULT"` key, and finally the value of the lexicographically smallest
/// key. An empty map yields an empty string.
///
/// - Parameters:
///   - entry: The dictionary entry.
///   - pos: Part-of-speech tag, if known.
/// - Returns: The selected phoneme string.
private func resolveEntry(_ entry: DictEntry, pos: String?) -> String {
    switch entry {
    case .simple(let phonemes):
        return phonemes
    case .heteronym(let posMap):
        if let pos {
            if let exact = posMap[pos] { return exact }
            // NOTE(review): the fallback key below is upper-case while tagger
            // tags are capitalized ("Determiner"); presumably dictionary files
            // use upper-case tags. Confirm against the data files.
            if let upper = posMap[pos.uppercased()] { return upper }
        }
        if let fallback = posMap["DEFAULT"] { return fallback }
        // `posMap.values.first` depends on the per-process hash seed, so the
        // same word could be pronounced differently between runs. Choose by
        // key order to keep the result stable.
        return posMap.min { $0.key < $1.key }?.value ?? ""
    }
}
358
+
359
+ // MARK: - Special Cases
360
+
361
/// Returns a hard-coded pronunciation for a few words, or `nil` when the word
/// has no special case.
///
/// - Parameters:
///   - word: Lower-cased word.
///   - pos: Part-of-speech tag; only consulted for `"a"`.
/// - Returns: The fixed phoneme string, or `nil`.
private func specialCase(_ word: String, pos: String?) -> String? {
    // "a" is the only entry whose pronunciation depends on the tag.
    if word == "a" {
        return pos == "Determiner" ? "ɐ" : "eɪ"
    }

    let fixed: [String: String] = [
        "eliza": "ɪlˈaɪzə",
        "elizaos": "ɪlˈaɪzə oʊ ɛs",
        "the": "ðə",
        "an": "ən",
        "to": "tʊ",
        "of": "ʌv",
        "i": "aɪ",
    ]
    return fixed[word]
}
376
+
377
+ // MARK: - Suffix Stemming
378
+
379
/// Tries the suffix stemmers in order (`-s`, `-ed`, `-ing`) and returns the
/// first result.
///
/// - Parameter word: Lower-cased word.
/// - Returns: Phonemes built from a known stem plus suffix, or `nil`.
private func stemAndLookup(_ word: String) -> String? {
    // `??` evaluates its right-hand side lazily, so later stemmers only run
    // when the earlier ones return nil.
    return stemS(word) ?? stemEd(word) ?? stemIng(word)
}
385
+
386
/// Resolves a word ending in `-s` by looking up its stem and appending the
/// matching suffix sound.
///
/// Stems are tried in this order: `-ies` → `y`, then the word without `-es`,
/// then the word without `-s`. After a sibilant the suffix is `ɪz`; after a
/// voiceless consonant (plain `-s` branch only) it is `s`; otherwise `z`.
///
/// - Parameter word: Lower-cased word.
/// - Returns: Stem phonemes plus suffix, or `nil` when no stem is known.
private func stemS(_ word: String) -> String? {
    guard word.hasSuffix("s") && word.count > 2 else { return nil }

    // ʧ / ʤ are the single-character affricates; a two-character "tʃ" / "dʒ"
    // already ends in ʃ / ʒ.
    let sibilants: Set<Character> = ["s", "z", "ʃ", "ʒ", "ʧ", "ʤ"]
    let voiceless: Set<Character> = ["p", "t", "k", "f", "θ"]

    if word.hasSuffix("ies") {
        let stem = String(word.dropLast(3)) + "y"
        if let phonemes = lookupDict(stem, pos: nil) { return phonemes + "z" }
    }

    if word.hasSuffix("es") && word.count > 3 {
        let stem = String(word.dropLast(2))
        if let phonemes = lookupDict(stem, pos: nil) {
            if let last = phonemes.last, sibilants.contains(last) { return phonemes + "ɪz" }
            return phonemes + "z"
        }
    }

    let stem = String(word.dropLast(1))
    if let phonemes = lookupDict(stem, pos: nil) {
        if let last = phonemes.last {
            // Stems spelled with a silent "e" ("horse", "page") land here and
            // still end in a sibilant sound, which needs the extra vowel.
            if sibilants.contains(last) { return phonemes + "ɪz" }
            if voiceless.contains(last) { return phonemes + "s" }
        }
        return phonemes + "z"
    }
    return nil
}
408
+
409
/// Resolves a word ending in `-ed` by looking up its stem and appending the
/// matching past-tense suffix.
///
/// Stems are tried in this order:
/// 1. `-ied` → `y` ("tried" → "try"), suffix `d`;
/// 2. the word without the final `d` ("liked" → "like", "used" → "use");
/// 3. the word without `-ed` and with a doubled final consonant undone
///    ("stopped" → "stop");
/// 4. the word without `-ed` ("walked" → "walk").
///
/// - Parameter word: Lower-cased word.
/// - Returns: Stem phonemes plus suffix, or `nil` when no stem is known.
private func stemEd(_ word: String) -> String? {
    guard word.hasSuffix("ed") && word.count > 3 else { return nil }

    if word.hasSuffix("ied") {
        let stem = String(word.dropLast(3)) + "y"
        if let phonemes = lookupDict(stem, pos: nil) { return phonemes + "d" }
    }

    // Silent-e stems. Tried before the bare stem so that "used" resolves to
    // "use" rather than "us", and "hoped" to "hope" rather than "hop".
    let stemD = String(word.dropLast(1))
    if let phonemes = lookupDict(stemD, pos: nil) {
        return phonemes + edSuffix(phonemes)
    }

    let stemEd = String(word.dropLast(2))
    if stemEd.count >= 2, stemEd.last == stemEd.dropLast().last {
        let dedoubled = String(stemEd.dropLast(1))
        if let phonemes = lookupDict(dedoubled, pos: nil) {
            return phonemes + edSuffix(phonemes)
        }
    }
    if let phonemes = lookupDict(stemEd, pos: nil) {
        return phonemes + edSuffix(phonemes)
    }
    return nil
}
430
+
431
/// Chooses the sound of the past-tense suffix from the last phoneme of the stem.
///
/// - Parameter phonemes: Phoneme string of the stem.
/// - Returns: `"ɪd"` after `t`/`d`, `"t"` after another voiceless consonant,
///   and `"d"` otherwise (including for an empty string).
private func edSuffix(_ phonemes: String) -> String {
    guard let last = phonemes.last else { return "d" }
    if last == "t" || last == "d" { return "ɪd" }
    // ʧ is the single-character voiceless affricate ("watched" ends in /t/);
    // a two-character "tʃ" is already covered by ʃ.
    let voiceless: Set<Character> = ["p", "k", "f", "θ", "s", "ʃ", "ʧ"]
    return voiceless.contains(last) ? "t" : "d"
}
438
+
439
/// Resolves a word ending in `-ing` by looking up its stem and appending `ɪŋ`.
///
/// Candidate stems, in order: the stem with a doubled final consonant undone
/// ("running" → "run"), the bare stem ("walking" → "walk"), and the stem plus
/// a silent `e` ("making" → "make").
///
/// - Parameter word: Lower-cased word.
/// - Returns: Stem phonemes plus `ɪŋ`, or `nil` when no stem is known.
private func stemIng(_ word: String) -> String? {
    guard word.hasSuffix("ing") && word.count > 4 else { return nil }

    let base = String(word.dropLast(3))
    var candidates: [String] = []
    if base.count >= 2, base.last == base.dropLast().last {
        candidates.append(String(base.dropLast()))
    }
    candidates.append(base)
    candidates.append(base + "e")

    for candidate in candidates {
        if let phonemes = lookupDict(candidate, pos: nil) {
            return phonemes + "ɪŋ"
        }
    }
    return nil
}
454
+
455
+ // MARK: - BART G2P Neural Fallback
456
+
457
/// Phonemizes an out-of-vocabulary word with the CoreML BART encoder/decoder.
///
/// The word is encoded as grapheme IDs framed by the G2P BOS/EOS tokens and
/// passed through the encoder once. The decoder is then run greedily, feeding
/// it the full prefix on every step, for at most 64 steps or until it emits EOS.
///
/// This is a best-effort fallback: any CoreML error, a missing model or
/// vocabulary, an input longer than 64 IDs, or an empty decode all yield `nil`
/// so the caller can fall through to its next strategy.
///
/// - Parameter word: The word to phonemize.
/// - Returns: The decoded phoneme string, or `nil`.
private func bartG2P(_ word: String) -> String? {
    guard let encoder = g2pEncoder, let decoder = g2pDecoder else { return nil }
    guard !graphemeToId.isEmpty else { return nil }

    // Encode graphemes: exact symbol, then its lower-cased form, then <unk>
    // (ID 3 when the vocabulary does not define one).
    let unknownId = Int32(graphemeToId["<unk>"] ?? 3)
    var inputIds: [Int32] = [Int32(g2pBosId)]
    for char in word {
        let symbol = String(char)
        if let id = graphemeToId[symbol] ?? graphemeToId[symbol.lowercased()] {
            inputIds.append(Int32(id))
        } else {
            inputIds.append(unknownId)
        }
    }
    inputIds.append(Int32(g2pEosId))

    let seqLen = inputIds.count
    guard seqLen <= 64 else { return nil }

    do {
        // Run encoder
        let encInput = try MLMultiArray(shape: [1, seqLen as NSNumber], dataType: .int32)
        let encPtr = encInput.dataPointer.assumingMemoryBound(to: Int32.self)
        for i in 0..<seqLen { encPtr[i] = inputIds[i] }

        let encFeatures = try MLDictionaryFeatureProvider(dictionary: [
            "input_ids": MLFeatureValue(multiArray: encInput),
        ])
        let encOutput = try encoder.prediction(from: encFeatures)
        guard let hiddenStates = encOutput.featureValue(for: "encoder_hidden_states")?.multiArrayValue else {
            return nil
        }

        // Autoregressive decoding
        var decoderIds: [Int32] = [Int32(g2pBosId)]
        let maxDecLen = 64

        for _ in 0..<maxDecLen {
            let decLen = decoderIds.count

            let decInput = try MLMultiArray(shape: [1, decLen as NSNumber], dataType: .int32)
            let decPtr = decInput.dataPointer.assumingMemoryBound(to: Int32.self)
            for i in 0..<decLen { decPtr[i] = decoderIds[i] }

            let posIds = try MLMultiArray(shape: [1, decLen as NSNumber], dataType: .int32)
            let posPtr = posIds.dataPointer.assumingMemoryBound(to: Int32.self)
            for i in 0..<decLen { posPtr[i] = Int32(i) }

            // Additive causal mask: 0 where position j is visible from i, a
            // very large negative value where it is not.
            let mask = try MLMultiArray(shape: [1, decLen as NSNumber, decLen as NSNumber], dataType: .float32)
            let maskPtr = mask.dataPointer.assumingMemoryBound(to: Float.self)
            for i in 0..<decLen {
                for j in 0..<decLen {
                    maskPtr[i * decLen + j] = (j <= i) ? 0.0 : -Float.greatestFiniteMagnitude
                }
            }

            let decFeatures = try MLDictionaryFeatureProvider(dictionary: [
                "decoder_input_ids": MLFeatureValue(multiArray: decInput),
                "encoder_hidden_states": MLFeatureValue(multiArray: hiddenStates),
                "position_ids": MLFeatureValue(multiArray: posIds),
                "causal_mask": MLFeatureValue(multiArray: mask),
            ])

            let decOutput = try decoder.prediction(from: decFeatures)
            guard let logits = decOutput.featureValue(for: "logits")?.multiArrayValue else {
                break
            }

            // Greedy: take argmax over the vocabulary axis at the last position.
            // The row is located from the array's own shape and strides rather
            // than from the step counter, so a padded layout or a decoder that
            // returns only the final position is read in bounds.
            let shape = logits.shape.map { $0.intValue }
            let strides = logits.strides.map { $0.intValue }
            guard let vocabSize = shape.last, vocabSize > 0,
                  let vocabStride = strides.last else {
                break
            }
            var rowOffset = 0
            if shape.count >= 2 {
                // Leading (batch) axes are read at index 0 and add nothing.
                rowOffset = (shape[shape.count - 2] - 1) * strides[strides.count - 2]
            }
            guard rowOffset >= 0 else { break }

            var maxId = 0
            var maxVal: Float = -.infinity
            if #available(iOS 16.0, *), logits.dataType == .float16 {
                let lPtr = logits.dataPointer.assumingMemoryBound(to: Float16.self)
                for v in 0..<vocabSize {
                    let val = Float(lPtr[rowOffset + v * vocabStride])
                    if val > maxVal { maxVal = val; maxId = v }
                }
            } else {
                // NOTE(review): every other data type is read as Float32, as before.
                let lPtr = logits.dataPointer.assumingMemoryBound(to: Float.self)
                for v in 0..<vocabSize {
                    let val = lPtr[rowOffset + v * vocabStride]
                    if val > maxVal { maxVal = val; maxId = v }
                }
            }

            if maxId == g2pEosId { break }
            decoderIds.append(Int32(maxId))
        }

        // Convert IDs to phonemes, skipping the leading BOS and any special tokens.
        var result = ""
        for id in decoderIds.dropFirst() {
            let intId = Int(id)
            if intId != g2pPadId && intId != g2pBosId && intId != g2pEosId,
               let phoneme = idToPhoneme[intId] {
                result += phoneme
            }
        }
        return result.isEmpty ? nil : result
    } catch {
        // Deliberately swallowed: G2P is only a fallback.
        return nil
    }
}
565
+
566
+ // MARK: - Text Normalization
567
+
568
/// Normalizes raw text before tokenization.
///
/// Steps, in order:
/// 1. typographic apostrophes become `'`, and em/en dashes become `,`;
/// 2. common English contractions are expanded ("can't" → "can not"),
///    case-insensitively and only at word boundaries, so possessives such as
///    "spirit's" are not rewritten by the "it's" rule;
/// 3. runs of spaces collapse to one space;
/// 4. leading and trailing whitespace is trimmed.
///
/// Expansions are inserted in lower case regardless of the input's case.
///
/// - Parameter text: Raw input text.
/// - Returns: The normalized text.
private func normalizeText(_ text: String) -> String {
    var result = text
        .replacingOccurrences(of: "\u{2019}", with: "'")   // right single quotation mark
        .replacingOccurrences(of: "\u{2018}", with: "'")   // left single quotation mark
        .replacingOccurrences(of: "\u{2014}", with: ",")   // em dash
        .replacingOccurrences(of: "\u{2013}", with: ",")   // en dash

    let contractions: [(String, String)] = [
        ("can't", "can not"), ("won't", "will not"), ("don't", "do not"),
        ("doesn't", "does not"), ("didn't", "did not"), ("isn't", "is not"),
        ("aren't", "are not"), ("wasn't", "was not"), ("weren't", "were not"),
        ("couldn't", "could not"), ("wouldn't", "would not"), ("shouldn't", "should not"),
        ("haven't", "have not"), ("hasn't", "has not"), ("hadn't", "had not"),
        ("i'm", "i am"), ("i've", "i have"), ("i'll", "i will"), ("i'd", "i would"),
        ("you're", "you are"), ("you've", "you have"), ("you'll", "you will"),
        ("he's", "he is"), ("she's", "she is"), ("it's", "it is"),
        ("we're", "we are"), ("we've", "we have"), ("we'll", "we will"),
        ("they're", "they are"), ("they've", "they have"), ("they'll", "they will"),
        ("that's", "that is"), ("there's", "there is"), ("let's", "let us"),
    ]

    // Cheap substring pre-check so the regex only runs for contractions that
    // can possibly match.
    let lower = result.lowercased()
    for (contraction, expansion) in contractions where lower.contains(contraction) {
        let pattern = "\\b" + NSRegularExpression.escapedPattern(for: contraction) + "\\b"
        result = result.replacingOccurrences(
            of: pattern,
            with: expansion,
            options: [.regularExpression, .caseInsensitive]
        )
    }

    // Collapse runs of spaces. The search string must be TWO spaces: with a
    // single space this loop would never terminate.
    while result.contains("  ") {
        result = result.replacingOccurrences(of: "  ", with: " ")
    }
    return result.trimmingCharacters(in: .whitespaces)
}
598
+
599
/// Splits text into word, whitespace and punctuation tokens.
///
/// Every whitespace character becomes a `" "` token and every punctuation or
/// symbol character becomes its own one-character token. Consecutive other
/// characters are grouped into a word token.
///
/// - Parameter text: Text to split.
/// - Returns: The tokens in their original order.
private func splitWords(_ text: String) -> [String] {
    var tokens: [String] = []
    var pending = ""

    // Emits the word collected so far, if there is one.
    func flush() {
        guard !pending.isEmpty else { return }
        tokens.append(pending)
        pending.removeAll()
    }

    for char in text {
        let isSeparator = char.isWhitespace || char.isPunctuation || char.isSymbol
        guard isSeparator else {
            pending.append(char)
            continue
        }
        flush()
        tokens.append(char.isWhitespace ? " " : String(char))
    }
    flush()

    return tokens
}
616
+
617
/// Returns the punctuation symbol to emit for a punctuation token, or `nil`
/// when the token should be dropped.
///
/// Only the exact one-character strings `, . ! ? ; : - '` are kept; each maps
/// to itself.
///
/// - Parameter text: A punctuation token.
/// - Returns: `text` when it is one of the kept marks, otherwise `nil`.
private func punctuationToPhoneme(_ text: String) -> String? {
    let kept: Set<String> = [",", ".", "!", "?", ";", ":", "-", "'"]
    return kept.contains(text) ? text : nil
}
630
+
631
+ // MARK: - POS Tagging
632
+
633
/// Tags every word-unit token of `text` with its lexical class.
///
/// The result is keyed by the lower-cased token text, so when a word occurs
/// more than once the tag of its last occurrence is the one kept.
///
/// - Parameter text: Text to tag.
/// - Returns: Lower-cased token → raw value of its lexical-class tag.
private func tagPOS(_ text: String) -> [String: String] {
    tagger.string = text

    var tagsByWord: [String: String] = [:]
    let wholeText = text.startIndex..<text.endIndex
    tagger.enumerateTags(in: wholeText, unit: .word, scheme: .lexicalClass) { tag, tokenRange in
        guard let tag else { return true }
        tagsByWord[text[tokenRange].lowercased()] = tag.rawValue
        return true
    }
    return tagsByWord
}
644
+
645
+ // MARK: - Dictionary Parsing
646
+
647
/// Parses a pronunciation dictionary JSON file.
///
/// The top level must be an object. A string value becomes a `.simple` entry.
/// An object value becomes a `.heteronym` entry holding its string-valued
/// members; members that are `null` or otherwise not strings are skipped, and
/// an object with no string members produces no entry. Values of any other
/// type are ignored.
///
/// - Parameter url: Location of the JSON file.
/// - Returns: The parsed entries, or an empty dictionary when the top level is
///   not a JSON object.
/// - Throws: File-read or JSON-parse errors.
private func parseDictionary(from url: URL) throws -> [String: DictEntry] {
    let data = try Data(contentsOf: url)
    guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any] else { return [:] }

    var dict = [String: DictEntry]()
    dict.reserveCapacity(json.count)
    for (key, value) in json {
        if let phonemes = value as? String {
            dict[key] = .simple(phonemes)
        } else if let posMap = value as? [String: Any] {
            // JSONSerialization represents JSON null as NSNull. Filtering per
            // value avoids relying on how NSNull bridges to `String?`, and keeps
            // the usable pronunciations when one member has an unexpected type.
            let resolved = posMap.compactMapValues { $0 as? String }
            if !resolved.isEmpty { dict[key] = .heteronym(resolved) }
        }
    }
    return dict
}
664
+
665
/// Adds case variants of existing keys to a dictionary, in place.
///
/// - An all-lower-case key gains a first-letter-capitalized twin.
/// - A key starting with an upper-case letter gains a fully lower-cased twin.
///
/// Existing keys are never overwritten. When several keys lower-case to the
/// same string ("US" and "Us"), the one that sorts first supplies the entry,
/// so the outcome does not depend on dictionary iteration order.
///
/// - Parameter dict: The dictionary to extend.
private func growDictionary(_ dict: inout [String: DictEntry]) {
    var additions = [String: DictEntry]()

    // Sorted iteration makes collisions between case variants deterministic.
    for key in dict.keys.sorted() {
        guard !key.isEmpty, let entry = dict[key] else { continue }
        let lower = key.lowercased()

        if key == lower {
            let capitalized = key.prefix(1).uppercased() + key.dropFirst()
            // For keys without a cased first letter `capitalized == key`, which
            // is already present, so nothing is added.
            if dict[capitalized] == nil { additions[capitalized] = entry }
        }
        if key.first?.isUppercase == true, dict[lower] == nil, additions[lower] == nil {
            additions[lower] = entry
        }
    }

    dict.merge(additions) { existing, _ in existing }
}
679
+ }