@elizaos/capacitor-bun-runtime 2.0.11-beta.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/ElizaosCapacitorBunRuntime.podspec +54 -0
  2. package/LICENSE +21 -0
  3. package/README.md +127 -0
  4. package/dist/esm/definitions.d.ts +136 -0
  5. package/dist/esm/definitions.d.ts.map +1 -0
  6. package/dist/esm/definitions.js +14 -0
  7. package/dist/esm/definitions.js.map +1 -0
  8. package/dist/esm/index.d.ts +9 -0
  9. package/dist/esm/index.d.ts.map +1 -0
  10. package/dist/esm/index.js +11 -0
  11. package/dist/esm/index.js.map +1 -0
  12. package/dist/esm/web.d.ts +19 -0
  13. package/dist/esm/web.d.ts.map +1 -0
  14. package/dist/esm/web.js +44 -0
  15. package/dist/esm/web.js.map +1 -0
  16. package/dist/plugin.cjs.js +63 -0
  17. package/dist/plugin.cjs.js.map +1 -0
  18. package/dist/plugin.js +66 -0
  19. package/dist/plugin.js.map +1 -0
  20. package/ios/Sources/ElizaBunRuntimePlugin/BridgeInstaller.swift +94 -0
  21. package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntime.swift +705 -0
  22. package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntimePlugin.swift +1109 -0
  23. package/ios/Sources/ElizaBunRuntimePlugin/FullBunEngineHost.swift +677 -0
  24. package/ios/Sources/ElizaBunRuntimePlugin/JSContextHelpers.swift +226 -0
  25. package/ios/Sources/ElizaBunRuntimePlugin/SandboxPaths.swift +46 -0
  26. package/ios/Sources/ElizaBunRuntimePlugin/bridge/CryptoBridge.swift +238 -0
  27. package/ios/Sources/ElizaBunRuntimePlugin/bridge/ElizaSqliteVecBridge.m +28 -0
  28. package/ios/Sources/ElizaBunRuntimePlugin/bridge/FSBridge.swift +270 -0
  29. package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPBridge.swift +153 -0
  30. package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPServerBridge.swift +32 -0
  31. package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridge.swift +233 -0
  32. package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridgeImpl.swift +1863 -0
  33. package/ios/Sources/ElizaBunRuntimePlugin/bridge/LogBridge.swift +36 -0
  34. package/ios/Sources/ElizaBunRuntimePlugin/bridge/PathsBridge.swift +41 -0
  35. package/ios/Sources/ElizaBunRuntimePlugin/bridge/ProcessBridge.swift +80 -0
  36. package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridge.swift +406 -0
  37. package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridgeInstaller.swift +17 -0
  38. package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteVecLoader.swift +66 -0
  39. package/ios/Sources/ElizaBunRuntimePlugin/bridge/UIBridge.swift +72 -0
  40. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlChinesePhonemizer.swift +313 -0
  41. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlConfiguration.swift +28 -0
  42. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlEngine.swift +325 -0
  43. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlHindiPhonemizer.swift +150 -0
  44. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlJapanesePhonemizer.swift +209 -0
  45. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlLatinPhonemizer.swift +374 -0
  46. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlModel.swift +87 -0
  47. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPhonemizer.swift +679 -0
  48. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPronunciationDicts.swift +131 -0
  49. package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlSupport.swift +24 -0
  50. package/ios/Tests/llama-bridge-smoke-main.swift +92 -0
  51. package/package.json +68 -0
  52. package/src/bridge-contract.test.ts +127 -0
  53. package/src/definitions.d.ts +136 -0
  54. package/src/definitions.d.ts.map +1 -0
  55. package/src/definitions.ts +152 -0
  56. package/src/index.d.ts +9 -0
  57. package/src/index.d.ts.map +1 -0
  58. package/src/index.ts +16 -0
  59. package/src/web.d.ts +19 -0
  60. package/src/web.d.ts.map +1 -0
  61. package/src/web.ts +80 -0
@@ -0,0 +1,209 @@
1
+ import Foundation
2
+
3
/// Japanese text-to-phoneme conversion for Kokoro TTS.
///
/// Pipeline: Japanese text → `CFStringTokenizer` (word segmentation + Latin
/// transcription of every token) → katakana (`CFStringTransform`) → phoneme
/// string via the katakana lookup tables below.
///
/// Uses Apple's built-in Japanese morphological analysis — no MeCab dependency.
final class JapanesePhonemizer {

    // MARK: - Katakana → Phonemes (M2P)

    /// Digraph mappings (two-character katakana → phoneme). Checked before single chars.
    private static let digraphs: [String: String] = [
        "イェ": "je",
        "ウィ": "wi", "ウゥ": "wu", "ウェ": "we", "ウォ": "wo",
        "キィ": "kyi", "キェ": "kye", "キャ": "kya", "キュ": "kyu", "キョ": "kyo",
        "ギィ": "gyi", "ギェ": "gye", "ギャ": "gya", "ギュ": "gyu", "ギョ": "gyo",
        "クァ": "kwa", "クィ": "kwi", "クゥ": "kwu", "クェ": "kwe", "クォ": "kwo", "クヮ": "kwa",
        "グァ": "gwa", "グィ": "gwi", "グゥ": "gwu", "グェ": "gwe", "グォ": "gwo", "グヮ": "gwa",
        "シェ": "she", "シャ": "sha", "シュ": "shu", "ショ": "sho",
        "ジェ": "je", "ジャ": "ja", "ジュ": "ju", "ジョ": "jo",
        "スィ": "si", "ズィ": "zi",
        "チェ": "che", "チャ": "cha", "チュ": "chu", "チョ": "cho",
        "ヂェ": "je", "ヂャ": "ja", "ヂュ": "ju", "ヂョ": "jo",
        "ツァ": "tsa", "ツィ": "tsi", "ツェ": "tse", "ツォ": "tso",
        "ティ": "ti", "テェ": "tye", "テャ": "tya", "テュ": "tyu", "テョ": "tyo",
        "ディ": "di", "デェ": "dye", "デャ": "dya", "デュ": "dyu", "デョ": "dyo",
        "トゥ": "tu", "ドゥ": "du",
        "ニィ": "nyi", "ニェ": "nye", "ニャ": "nya", "ニュ": "nyu", "ニョ": "nyo",
        "ヒィ": "hyi", "ヒェ": "hye", "ヒャ": "hya", "ヒュ": "hyu", "ヒョ": "hyo",
        "ビィ": "byi", "ビェ": "bye", "ビャ": "bya", "ビュ": "byu", "ビョ": "byo",
        "ピィ": "pyi", "ピェ": "pye", "ピャ": "pya", "ピュ": "pyu", "ピョ": "pyo",
        "ファ": "fa", "フィ": "fi", "フェ": "fe", "フォ": "fo",
        "ミィ": "myi", "ミェ": "mye", "ミャ": "mya", "ミュ": "myu", "ミョ": "myo",
        "リィ": "ryi", "リェ": "rye", "リャ": "rya", "リュ": "ryu", "リョ": "ryo",
        "ヴァ": "va", "ヴィ": "vi", "ヴェ": "ve", "ヴォ": "vo",
        "ヴャ": "bya", "ヴュ": "byu", "ヴョ": "byo",
    ]

    /// Single katakana → phoneme mappings.
    private static let singles: [Character: String] = [
        "ァ": "a", "ア": "a", "ィ": "i", "イ": "i",
        "ゥ": "u", "ウ": "u", "ェ": "e", "エ": "e",
        "ォ": "o", "オ": "o",
        "カ": "ka", "ガ": "ga", "キ": "ki", "ギ": "gi",
        "ク": "ku", "グ": "gu", "ケ": "ke", "ゲ": "ge",
        "コ": "ko", "ゴ": "go",
        "サ": "sa", "ザ": "za", "シ": "shi", "ジ": "ji",
        "ス": "su", "ズ": "zu", "セ": "se", "ゼ": "ze",
        "ソ": "so", "ゾ": "zo",
        "タ": "ta", "ダ": "da", "チ": "chi", "ヂ": "ji",
        "ツ": "tsu", "ヅ": "zu", "テ": "te", "デ": "de",
        "ト": "to", "ド": "do",
        "ナ": "na", "ニ": "ni", "ヌ": "nu", "ネ": "ne", "ノ": "no",
        "ハ": "ha", "バ": "ba", "パ": "pa",
        "ヒ": "hi", "ビ": "bi", "ピ": "pi",
        "フ": "fu", "ブ": "bu", "プ": "pu",
        "ヘ": "he", "ベ": "be", "ペ": "pe",
        "ホ": "ho", "ボ": "bo", "ポ": "po",
        "マ": "ma", "ミ": "mi", "ム": "mu", "メ": "me", "モ": "mo",
        "ャ": "ya", "ヤ": "ya", "ュ": "yu", "ユ": "yu",
        "ョ": "yo", "ヨ": "yo",
        "ラ": "ra", "リ": "ri", "ル": "ru", "レ": "re", "ロ": "ro",
        "ヮ": "wa", "ワ": "wa", "ヰ": "i", "ヱ": "e", "ヲ": "o",
        "ヴ": "vu", "ヵ": "ka", "ヶ": "ke",
        "ヷ": "va", "ヸ": "vi", "ヹ": "ve", "ヺ": "vo",
        "ッ": "ʔ", "ン": "ɴ", "ー": "ː",
    ]

    // MARK: - Japanese Punctuation

    /// Punctuation that is passed through to the phoneme string, keyed by the
    /// character as it appears in the input.
    private static let punctuationMap: [Character: String] = [
        "「": "\"", "」": "\"", "『": "\"", "』": "\"",
        "【": "\"", "】": "\"", "〈": "\"", "〉": "\"",
        "《": "\"", "》": "\"", "«": "\"", "»": "\"",
        "、": ",", "。": ".", "!": "!", "?": "?",
        "(": "(", ")": ")", ":": ":", ";": ";",
        // Full-width forms used in Japanese typesetting (written as escapes so
        // they cannot be confused with, or normalised into, their ASCII twins).
        "\u{FF01}": "!", "\u{FF1F}": "?",
        "\u{FF08}": "(", "\u{FF09}": ")",
        "\u{FF1A}": ":", "\u{FF1B}": ";",
    ]

    // MARK: - Public API

    /// Convert Japanese text to a phoneme string for Kokoro TTS.
    ///
    /// Tokens that have a Latin transcription are converted to katakana and then
    /// to phonemes; adjacent words are separated by a single space. Text that
    /// falls between tokens (punctuation, whitespace, symbols) is reduced to the
    /// punctuation listed in `punctuationMap` / `asciiPunct`. Tokens without a
    /// transcription contribute nothing.
    ///
    /// - Parameter text: Input text. An empty string yields an empty result.
    /// - Returns: The phoneme string.
    func phonemize(_ text: String) -> String {
        // All offsets below are UTF-16 code-unit offsets because that is what
        // CFStringTokenizer reports. Slicing therefore goes through NSString;
        // using them as Character offsets into `text` is wrong (and can trap)
        // whenever the input contains surrogate pairs or combining sequences.
        let source = text as NSString
        let length = source.length
        guard length > 0 else { return "" }

        let locale = Locale(identifier: "ja_JP") as CFLocale
        let tokenizer = CFStringTokenizerCreate(
            nil, source as CFString, CFRangeMake(0, length),
            kCFStringTokenizerUnitWord, locale)

        var result = ""
        var lastWasWord = false
        var cursor = 0

        while CFStringTokenizerAdvanceToNextToken(tokenizer) != [] {
            let range = CFStringTokenizerGetCurrentTokenRange(tokenizer)

            // Handle the gap (punctuation, whitespace) before this token.
            if range.location > cursor {
                let gapRange = NSRange(location: cursor, length: range.location - cursor)
                let gap = source.substring(with: gapRange)
                if Self.appendPunctuation(from: gap, to: &result) {
                    lastWasWord = false
                }
            }

            let reading = CFStringTokenizerCopyCurrentTokenAttribute(
                tokenizer, kCFStringTokenizerAttributeLatinTranscription) as? String
            if let reading = reading {
                if lastWasWord { result += " " }
                // Romaji reading → katakana → phonemes.
                result += Self.katakanaToPhonemes(Self.romajiToKatakana(reading))
                lastWasWord = true
            }

            cursor = range.location + range.length
        }

        // Trailing punctuation after the last token.
        if cursor < length {
            Self.appendPunctuation(from: source.substring(from: cursor), to: &result)
        }

        return result
    }

    // MARK: - Katakana → Phonemes

    /// Convert a katakana string to a phoneme string using the lookup tables.
    ///
    /// Two-character digraphs take precedence over single characters.
    /// Characters found in neither table are skipped silently.
    static func katakanaToPhonemes(_ katakana: String) -> String {
        let chars = Array(katakana)
        var output = ""
        var index = 0

        while index < chars.count {
            if index + 1 < chars.count,
               let phoneme = digraphs[String(chars[index...index + 1])] {
                output += phoneme
                index += 2
                continue
            }
            if let phoneme = singles[chars[index]] {
                output += phoneme
            }
            index += 1
        }

        return output
    }

    // MARK: - Romaji → Katakana

    /// Convert romaji to katakana for the table lookup.
    ///
    /// Uses `CFStringTransform` with `kCFStringTransformLatinKatakana`. The
    /// transform's success flag is not inspected: if nothing is transformed the
    /// input comes back unchanged.
    static func romajiToKatakana(_ romaji: String) -> String {
        let mutable = NSMutableString(string: romaji)
        CFStringTransform(mutable, nil, kCFStringTransformLatinKatakana, false)
        return mutable as String
    }

    // MARK: - Helpers

    /// Appends the punctuation found in `gap` (text lying between two tokens)
    /// to `output`.
    ///
    /// - Returns: `true` when the gap contained punctuation, a symbol or
    ///   whitespace, i.e. anything that ends the current run of words.
    @discardableResult
    private static func appendPunctuation(from gap: String, to output: inout String) -> Bool {
        var sawSeparator = false
        for ch in gap {
            if let mapped = punctuationMap[ch] {
                output += mapped
                sawSeparator = true
            } else if ch.isPunctuation || ch.isSymbol {
                // Unlisted punctuation is dropped but still separates words.
                if let ascii = asciiPunct(ch) { output += ascii }
                sawSeparator = true
            } else if ch.isWhitespace {
                sawSeparator = true
            }
        }
        return sawSeparator
    }

    /// ASCII punctuation that is passed through unchanged; `nil` for anything else.
    private static func asciiPunct(_ ch: Character) -> String? {
        switch ch {
        case ",": return ","
        case ".": return "."
        case "!": return "!"
        case "?": return "?"
        case ";": return ";"
        case ":": return ":"
        case "-": return "-"
        default: return nil
        }
    }
}
@@ -0,0 +1,374 @@
1
+ import Foundation
2
+
3
/// Grapheme-to-phoneme conversion for Latin-script languages (French, Spanish, Portuguese, Italian).
///
/// Rule-based orthography→IPA conversion. Each language has its own rule table
/// (longest pattern wins) plus a few context-dependent rules that are checked
/// before the table.
final class LatinPhonemizer {

    enum Language {
        case french, spanish, portuguese, italian
    }

    private let language: Language

    init(language: Language) {
        self.language = language
    }

    // MARK: - Public API

    /// Convert running text to a phoneme string.
    ///
    /// Words are converted individually; punctuation is copied through
    /// unchanged. Any run of whitespace between two tokens becomes a single
    /// space; leading and trailing whitespace is dropped.
    func phonemize(_ text: String) -> String {
        var result = ""
        var lastWasWord = false
        // Set when whitespace was seen after some output; flushed as one space
        // in front of the next word or punctuation mark.
        var pendingSpace = false

        for token in tokenize(text) {
            switch token {
            case .word(let w):
                if pendingSpace || lastWasWord { result += " " }
                result += convertWord(w.lowercased())
                lastWasWord = true
                pendingSpace = false
            case .punctuation(let p):
                if pendingSpace { result += " " }
                result += p
                lastWasWord = false
                pendingSpace = false
            case .space:
                pendingSpace = !result.isEmpty
                lastWasWord = false
            }
        }

        return result
    }

    /// Public word-level phonemization for dictionary fallback.
    func phonemizeWord(_ word: String) -> String {
        return convertWord(word.lowercased())
    }

    // MARK: - Tokenization

    private enum Token {
        case word(String)
        case punctuation(String)
        case space
    }

    /// Split text into words, punctuation marks and whitespace markers.
    ///
    /// Letters, apostrophes and hyphens build up a word. The typographic
    /// apostrophe (U+2019) is normalised to `'` so that "l’eau" and "l'eau"
    /// tokenize identically. Characters that are neither whitespace, letters,
    /// punctuation nor symbols (e.g. digits) are kept inside the current word.
    private func tokenize(_ text: String) -> [Token] {
        var tokens: [Token] = []
        var word = ""

        func flushWord() {
            if !word.isEmpty {
                tokens.append(.word(word))
                word = ""
            }
        }

        for ch in text {
            if ch.isWhitespace {
                flushWord()
                tokens.append(.space)
            } else if ch == "\u{2019}" {
                word.append("'")
            } else if ch.isLetter || ch == "'" || ch == "-" {
                word.append(ch)
            } else if ch.isPunctuation || ch.isSymbol {
                flushWord()
                tokens.append(.punctuation(String(ch)))
            } else {
                word.append(ch)
            }
        }
        flushWord()

        return tokens
    }

    // MARK: - Word Conversion

    /// Convert one lower-cased word to the phoneme format expected by Kokoro.
    private func convertWord(_ word: String) -> String {
        let raw: String
        switch language {
        case .french: raw = frenchToIPA(word)
        case .spanish: raw = spanishToIPA(word)
        case .portuguese: raw = portugueseToIPA(word)
        case .italian: raw = italianToIPA(word)
        }

        // Apply E2M mappings to match Kokoro's training format.
        // Kokoro was trained with espeak-ng output post-processed by misaki.
        // These replace multi-char IPA sequences with single-char equivalents.
        let ipa = Self.e2mMappings.reduce(raw) { partial, mapping in
            partial.replacingOccurrences(of: mapping.from, with: mapping.to)
        }

        // Heuristic: results of four or more characters are treated as
        // multi-syllable and get a leading primary stress mark.
        return ipa.count >= 4 ? "ˈ" + ipa : ipa
    }

    // MARK: - E2M Post-Processing (Kokoro Training Format)

    /// Mappings from standard IPA to Kokoro's internal format.
    /// Applied in order; sorted longest-first for correct greedy replacement.
    private static let e2mMappings: [(from: String, to: String)] = [
        // Affricates → ligatures (multi-char to single)
        ("dʒ", "ʤ"), ("tʃ", "ʧ"), ("dz", "ʣ"),
        // Consonant normalizations
        ("ʁ", "ɹ"),   // French/German uvular → alveolar approximant
        ("ɐ", "ə"),   // Near-open central → schwa
    ]

    // MARK: - Shared Rule Engine

    private typealias Rule = (pattern: String, ipa: String)
    private typealias Match = (ipa: String, length: Int)

    /// Returns the rule with the longest pattern (at most `maxLength`
    /// characters) that matches `chars` starting at `index`, or `nil`.
    private static func longestRule(
        in rules: [Rule], chars: [Character], at index: Int, maxLength: Int
    ) -> Match? {
        let longest = min(maxLength, chars.count - index)
        guard longest >= 1 else { return nil }
        for length in stride(from: longest, through: 1, by: -1) {
            let candidate = String(chars[index..<index + length])
            if let rule = rules.first(where: { $0.pattern == candidate }) {
                return (ipa: rule.ipa, length: length)
            }
        }
        return nil
    }

    /// Walks `word` left to right. At every position the `contextual` rule is
    /// tried first, then the longest matching table rule; characters matched
    /// by neither are copied through unchanged.
    private static func transliterate(
        _ word: String,
        rules: [Rule],
        maxPatternLength: Int,
        contextual: ([Character], Int) -> Match?
    ) -> String {
        let chars = Array(word)
        var output = ""
        var index = 0

        while index < chars.count {
            let hit = contextual(chars, index)
                ?? longestRule(in: rules, chars: chars, at: index, maxLength: maxPatternLength)
            if let hit = hit {
                output += hit.ipa
                index += hit.length
            } else {
                output.append(chars[index])
                index += 1
            }
        }

        return output
    }

    // MARK: - French G2P

    /// French grapheme-to-phoneme rules.
    /// Nasals only before consonants (not before vowels or n/m).
    private static let frenchRules: [Rule] = [
        // Trigraphs / special combos
        ("eau", "oː"), ("aux", "oː"), ("eux", "øː"), ("oeu", "œː"),
        ("ain", "ɛ̃"), ("ein", "ɛ̃"), ("oin", "wɛ̃"),
        ("ien", "jɛ̃"), ("ion", "jɔ̃"),
        // Digraphs
        ("ou", "uː"), ("oi", "waː"), ("ai", "ɛː"), ("ei", "ɛː"),
        ("au", "oː"), ("eu", "øː"), ("ch", "ʃ"), ("ph", "f"),
        ("th", "t"), ("gn", "ɲ"), ("qu", "k"), ("gu", "ɡ"),
        ("ll", "l"), ("ss", "s"), ("tt", "t"), ("nn", "n"),
        ("mm", "m"), ("pp", "p"), ("rr", "ʁ"), ("ff", "f"),
        // Accented vowels
        ("é", "eː"), ("è", "ɛː"), ("ê", "ɛː"), ("ë", "ɛ"),
        ("à", "aː"), ("â", "ɑː"), ("ù", "yː"), ("û", "yː"),
        ("î", "iː"), ("ï", "i"), ("ô", "oː"), ("ü", "yː"),
        ("ç", "s"), ("œ", "œ"),
        // Basic
        ("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "ə"),
        ("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "ʒ"),
        ("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
        ("p", "p"), ("r", "ʁ"), ("s", "s"), ("t", "t"), ("u", "y"),
        ("v", "v"), ("w", "w"), ("x", "ks"), ("y", "i"), ("z", "z"),
    ]

    /// Letters in front of which French `c` / `g` are softened.
    private static let frenchSofteners = "eiéèêëîïy"
    /// Letters that, directly after a vowel + n/m pair, prevent nasalisation
    /// (together with `n` and `m` themselves).
    private static let frenchVowels = "aeiouyéèêëàâùûîïôüœ"

    /// French context rules: soft c/g, then nasal vowels.
    private static func frenchContext(_ chars: [Character], _ index: Int) -> Match? {
        guard index + 1 < chars.count else { return nil }
        let current = chars[index]
        let next = chars[index + 1]

        // c before e/i/y = s, g before e/i/y = ʒ
        if current == "c" && frenchSofteners.contains(next) { return (ipa: "s", length: 1) }
        if current == "g" && frenchSofteners.contains(next) { return (ipa: "ʒ", length: 1) }

        // Nasal vowels: on/an/en/in/un unless followed by a vowel or n/m.
        if index + 2 < chars.count {
            let following = chars[index + 2]
            if frenchVowels.contains(following) || following == "n" || following == "m" {
                return nil
            }
        }
        switch String([current, next]) {
        case "on", "om": return (ipa: "ɔ̃", length: 2)
        case "an", "am": return (ipa: "ɑ̃", length: 2)
        case "en", "em": return (ipa: "ɑ̃", length: 2)
        case "in", "im": return (ipa: "ɛ̃", length: 2)
        case "un", "um": return (ipa: "œ̃", length: 2)
        default: return nil
        }
    }

    private func frenchToIPA(_ word: String) -> String {
        var ipa = Self.transliterate(
            word, rules: Self.frenchRules, maxPatternLength: 3,
            contextual: Self.frenchContext)

        // Drop silent final consonants (French rule: d, t, s, x, z, p are silent at end).
        // The check runs on the converted string, never reducing it to empty.
        if ipa.count > 1, let last = ipa.last, "dtsxzp".contains(last) {
            ipa.removeLast()
        }

        return ipa
    }

    // MARK: - Spanish G2P

    /// Spanish is very regular — nearly 1:1 grapheme-to-phoneme.
    private static let spanishRules: [Rule] = [
        // Digraphs
        ("ch", "tʃ"), ("ll", "ʝ"), ("rr", "rː"), ("qu", "k"),
        ("gu", "ɡ"), ("gü", "ɡw"),
        ("ñ", "ɲ"),
        // Accented vowels (stressed — add length)
        ("á", "aː"), ("é", "eː"), ("í", "iː"), ("ó", "oː"), ("ú", "uː"), ("ü", "w"),
        // Basic
        ("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "e"),
        ("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "x"),
        ("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
        ("p", "p"), ("r", "ɾ"), ("s", "s"), ("t", "t"), ("u", "u"),
        ("v", "b"), ("w", "w"), ("x", "ks"), ("y", "ʝ"), ("z", "θ"),
    ]

    private func spanishToIPA(_ word: String) -> String {
        return Self.transliterate(word, rules: Self.spanishRules, maxPatternLength: 2) { chars, index in
            // Context: c before e/i = θ, g before e/i = x
            guard index + 1 < chars.count, "eiéí".contains(chars[index + 1]) else { return nil }
            switch chars[index] {
            case "c": return (ipa: "θ", length: 1)
            case "g": return (ipa: "x", length: 1)
            default: return nil
            }
        }
    }

    // MARK: - Portuguese G2P

    private static let portugueseRules: [Rule] = [
        // Digraphs / trigraphs
        ("ção", "saːw̃"), ("ções", "sõːjs"), ("nh", "ɲ"), ("lh", "ʎ"),
        ("ch", "ʃ"), ("qu", "k"), ("gu", "ɡ"), ("rr", "ʁː"),
        ("ss", "s"), ("sc", "s"),
        // Explicit nasal diphthongs
        ("ão", "aːw̃"), ("ãe", "aːj̃"), ("õe", "oːj̃"),
        // Accented
        ("á", "aː"), ("â", "ɐː"), ("ã", "ɐ̃ː"), ("é", "ɛː"), ("ê", "eː"),
        ("í", "iː"), ("ó", "ɔː"), ("ô", "oː"), ("õ", "õː"), ("ú", "uː"),
        ("ç", "s"),
        // Diphthongs
        ("ou", "oː"), ("ei", "eːj"), ("ai", "aːj"), ("oi", "oːj"),
        // Basic
        ("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "e"),
        ("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "ʒ"),
        ("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
        ("p", "p"), ("r", "ɾ"), ("s", "s"), ("t", "t"), ("u", "u"),
        ("v", "v"), ("w", "w"), ("x", "ʃ"), ("y", "i"), ("z", "z"),
    ]

    private func portugueseToIPA(_ word: String) -> String {
        // Patterns are up to four characters long ("ções").
        return Self.transliterate(word, rules: Self.portugueseRules, maxPatternLength: 4) { chars, index in
            // Context: c before e/i = s
            guard index + 1 < chars.count,
                  chars[index] == "c",
                  "eiéí".contains(chars[index + 1]) else { return nil }
            return (ipa: "s", length: 1)
        }
    }

    // MARK: - Italian G2P

    /// Italian is highly regular — nearly 1:1 grapheme-to-phoneme.
    /// Main exceptions: c/g before e/i, gl, gn, sc digraphs.
    private static let italianRules: [Rule] = [
        // Trigraphs
        ("gli", "ʎi"), ("sce", "ʃe"), ("sci", "ʃi"),
        ("ghi", "ɡi"), ("ghe", "ɡe"), ("chi", "ki"), ("che", "ke"),
        // Digraphs
        ("gn", "ɲ"), ("gl", "ʎ"), ("sc", "sk"),
        ("gh", "ɡ"), ("ch", "k"), ("qu", "kw"),
        ("ci", "tʃi"), ("ce", "tʃe"),
        ("gi", "dʒi"), ("ge", "dʒe"),
        ("zz", "tːs"), ("ss", "sː"), ("rr", "rː"), ("ll", "lː"),
        ("nn", "nː"), ("mm", "mː"), ("pp", "pː"), ("tt", "tː"),
        ("cc", "kː"), ("ff", "fː"), ("bb", "bː"), ("dd", "dː"),
        ("gg", "ɡː"),
        // Accented vowels
        ("à", "a"), ("è", "ɛ"), ("é", "e"), ("ì", "i"), ("ò", "ɔ"), ("ó", "o"), ("ù", "u"),
        // Basic — Italian vowels are pure, consonants are straightforward
        ("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "e"),
        ("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "j"),
        ("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
        ("p", "p"), ("r", "r"), ("s", "s"), ("t", "t"), ("u", "u"),
        ("v", "v"), ("w", "w"), ("x", "ks"), ("y", "i"), ("z", "ts"),
    ]

    private func italianToIPA(_ word: String) -> String {
        // All Italian context handling is encoded in the table itself.
        return Self.transliterate(word, rules: Self.italianRules, maxPatternLength: 3) { _, _ in nil }
    }

}
@@ -0,0 +1,87 @@
1
+ import CoreML
2
+ import Foundation
3
+
4
/// CoreML wrapper for Kokoro-82M end-to-end TTS inference.
///
/// Loads one pre-compiled end-to-end `.mlmodelc` bundle (the first of
/// `kokoro_5s`, `kokoro_10s`, `kokoro_15s`, `kokoro` found in the given
/// directory) and runs it in a single CoreML prediction call.
class KokoroNetwork {

    private let e2eModel: MLModel

    /// Load the E2E CoreML model from `directory`.
    ///
    /// - Parameters:
    ///   - directory: Directory that contains the compiled model bundle.
    ///   - computeUnits: Compute units handed to `MLModelConfiguration`.
    /// - Throws: `AudioModelError.modelLoadFailed` when none of the candidate
    ///   bundles exists; any error thrown by `MLModel(contentsOf:configuration:)`.
    init(directory: URL, computeUnits: MLComputeUnits = .all) throws {
        let config = MLModelConfiguration()
        config.computeUnits = computeUnits

        // Candidates are probed in order; the first one present on disk wins.
        let candidateNames = ["kokoro_5s", "kokoro_10s", "kokoro_15s", "kokoro"]
        let existing = candidateNames
            .map { directory.appendingPathComponent("\($0).mlmodelc", isDirectory: true) }
            .first { FileManager.default.fileExists(atPath: $0.path) }

        guard let modelURL = existing else {
            throw AudioModelError.modelLoadFailed(
                modelId: "kokoro",
                reason: "No Kokoro E2E model found in \(directory.path)")
        }
        e2eModel = try MLModel(contentsOf: modelURL, configuration: config)
    }

    // MARK: - E2E Inference

    struct E2EOutput {
        let audio: MLMultiArray
        let audioLengthSamples: Int
        let predDur: MLMultiArray
    }

    /// Run one end-to-end prediction.
    ///
    /// The model inputs `input_ids`, `attention_mask` and `ref_s` are passed
    /// through as given. `random_phases` is filled with nine uniform random
    /// values in `0..<1` on every call, so results are not reproducible
    /// bit-for-bit across calls.
    ///
    /// - Parameter speed: Optional `speed` input; a one-element array holding
    ///   `1.0` is used when `nil`.
    /// - Throws: `AudioModelError.inferenceFailed` when an expected output is
    ///   missing or the reported sample count is unusable; any error thrown by
    ///   CoreML while allocating arrays or predicting.
    func predictE2E(
        inputIds: MLMultiArray,
        attentionMask: MLMultiArray,
        refS: MLMultiArray,
        speed: MLMultiArray? = nil
    ) throws -> E2EOutput {
        let randomPhases = try MLMultiArray(shape: [1, 9], dataType: .float32)
        for i in 0..<9 { randomPhases[i] = NSNumber(value: Float.random(in: 0..<1)) }

        // Allocation failures are propagated instead of crashing via `try!`.
        let speedInput: MLMultiArray
        if let speed = speed {
            speedInput = speed
        } else {
            let unitSpeed = try MLMultiArray(shape: [1], dataType: .float32)
            unitSpeed[0] = NSNumber(value: Float(1.0))
            speedInput = unitSpeed
        }

        let dict: [String: MLFeatureValue] = [
            "input_ids": MLFeatureValue(multiArray: inputIds),
            "attention_mask": MLFeatureValue(multiArray: attentionMask),
            "ref_s": MLFeatureValue(multiArray: refS),
            "random_phases": MLFeatureValue(multiArray: randomPhases),
            "speed": MLFeatureValue(multiArray: speedInput),
        ]

        let input = try MLDictionaryFeatureProvider(dictionary: dict)
        let output = try e2eModel.prediction(from: input)

        guard let audio = output.featureValue(for: "audio")?.multiArrayValue,
              let audioLen = output.featureValue(for: "audio_length_samples")?.multiArrayValue,
              let predDur = output.featureValue(for: "pred_dur")?.multiArrayValue else {
            throw AudioModelError.inferenceFailed(
                operation: "kokoro-e2e", reason: "Missing output tensors")
        }

        let lengthSamples = try Self.firstElementAsInt(of: audioLen)
        return E2EOutput(audio: audio, audioLengthSamples: lengthSamples, predDur: predDur)
    }

    /// Reads the first element of `array` as an integer, whatever its data type.
    ///
    /// Floating-point values are truncated toward zero.
    ///
    /// - Throws: `AudioModelError.inferenceFailed` when the array is empty or
    ///   the value is NaN, infinite or outside the range of `Int`.
    private static func firstElementAsInt(of array: MLMultiArray) throws -> Int {
        guard array.count > 0 else {
            throw AudioModelError.inferenceFailed(
                operation: "kokoro-e2e", reason: "Empty audio_length_samples tensor")
        }

        let raw: Double
        if #available(iOS 16.0, *), array.dataType == .float16 {
            raw = Double(array.dataPointer.assumingMemoryBound(to: Float16.self).pointee)
        } else {
            switch array.dataType {
            case .int32:
                return Int(array.dataPointer.assumingMemoryBound(to: Int32.self).pointee)
            case .double:
                raw = array.dataPointer.assumingMemoryBound(to: Double.self).pointee
            default:
                raw = Double(array.dataPointer.assumingMemoryBound(to: Float.self).pointee)
            }
        }

        // `Int(_:)` traps on NaN/infinity/out-of-range input; fail gracefully instead.
        guard raw.isFinite, let samples = Int(exactly: raw.rounded(.towardZero)) else {
            throw AudioModelError.inferenceFailed(
                operation: "kokoro-e2e", reason: "Invalid audio_length_samples value")
        }
        return samples
    }
}