@elizaos/capacitor-bun-runtime 2.0.11-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ElizaosCapacitorBunRuntime.podspec +54 -0
- package/LICENSE +21 -0
- package/README.md +127 -0
- package/dist/esm/definitions.d.ts +136 -0
- package/dist/esm/definitions.d.ts.map +1 -0
- package/dist/esm/definitions.js +14 -0
- package/dist/esm/definitions.js.map +1 -0
- package/dist/esm/index.d.ts +9 -0
- package/dist/esm/index.d.ts.map +1 -0
- package/dist/esm/index.js +11 -0
- package/dist/esm/index.js.map +1 -0
- package/dist/esm/web.d.ts +19 -0
- package/dist/esm/web.d.ts.map +1 -0
- package/dist/esm/web.js +44 -0
- package/dist/esm/web.js.map +1 -0
- package/dist/plugin.cjs.js +63 -0
- package/dist/plugin.cjs.js.map +1 -0
- package/dist/plugin.js +66 -0
- package/dist/plugin.js.map +1 -0
- package/ios/Sources/ElizaBunRuntimePlugin/BridgeInstaller.swift +94 -0
- package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntime.swift +705 -0
- package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntimePlugin.swift +1109 -0
- package/ios/Sources/ElizaBunRuntimePlugin/FullBunEngineHost.swift +677 -0
- package/ios/Sources/ElizaBunRuntimePlugin/JSContextHelpers.swift +226 -0
- package/ios/Sources/ElizaBunRuntimePlugin/SandboxPaths.swift +46 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/CryptoBridge.swift +238 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/ElizaSqliteVecBridge.m +28 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/FSBridge.swift +270 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPBridge.swift +153 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPServerBridge.swift +32 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridge.swift +233 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridgeImpl.swift +1863 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/LogBridge.swift +36 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/PathsBridge.swift +41 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/ProcessBridge.swift +80 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridge.swift +406 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridgeInstaller.swift +17 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteVecLoader.swift +66 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/UIBridge.swift +72 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlChinesePhonemizer.swift +313 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlConfiguration.swift +28 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlEngine.swift +325 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlHindiPhonemizer.swift +150 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlJapanesePhonemizer.swift +209 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlLatinPhonemizer.swift +374 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlModel.swift +87 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPhonemizer.swift +679 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPronunciationDicts.swift +131 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlSupport.swift +24 -0
- package/ios/Tests/llama-bridge-smoke-main.swift +92 -0
- package/package.json +68 -0
- package/src/bridge-contract.test.ts +127 -0
- package/src/definitions.d.ts +136 -0
- package/src/definitions.d.ts.map +1 -0
- package/src/definitions.ts +152 -0
- package/src/index.d.ts +9 -0
- package/src/index.d.ts.map +1 -0
- package/src/index.ts +16 -0
- package/src/web.d.ts +19 -0
- package/src/web.d.ts.map +1 -0
- package/src/web.ts +80 -0
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
|
|
3
|
+
/// Japanese text-to-phoneme conversion for Kokoro TTS.
///
/// Pipeline: Japanese text → `CFStringTokenizer` (word segmentation + Latin
/// transcription of each word) → katakana (via `CFStringTransform`) → phoneme
/// string via the katakana lookup tables below.
///
/// Uses Apple's built-in Japanese morphological analysis — no MeCab dependency.
/// The lookup tables cover standard and extended katakana.
final class JapanesePhonemizer {

    // MARK: - Katakana → Phonemes (M2P)

    /// Digraph mappings (two-character katakana → phonemes). Checked before single chars.
    private static let digraphs: [String: String] = [
        "イェ": "je",
        "ウィ": "wi", "ウゥ": "wu", "ウェ": "we", "ウォ": "wo",
        "キィ": "kyi", "キェ": "kye", "キャ": "kya", "キュ": "kyu", "キョ": "kyo",
        "ギィ": "gyi", "ギェ": "gye", "ギャ": "gya", "ギュ": "gyu", "ギョ": "gyo",
        "クァ": "kwa", "クィ": "kwi", "クゥ": "kwu", "クェ": "kwe", "クォ": "kwo", "クヮ": "kwa",
        "グァ": "gwa", "グィ": "gwi", "グゥ": "gwu", "グェ": "gwe", "グォ": "gwo", "グヮ": "gwa",
        "シェ": "she", "シャ": "sha", "シュ": "shu", "ショ": "sho",
        "ジェ": "je", "ジャ": "ja", "ジュ": "ju", "ジョ": "jo",
        "スィ": "si", "ズィ": "zi",
        "チェ": "che", "チャ": "cha", "チュ": "chu", "チョ": "cho",
        "ヂェ": "je", "ヂャ": "ja", "ヂュ": "ju", "ヂョ": "jo",
        "ツァ": "tsa", "ツィ": "tsi", "ツェ": "tse", "ツォ": "tso",
        "ティ": "ti", "テェ": "tye", "テャ": "tya", "テュ": "tyu", "テョ": "tyo",
        "ディ": "di", "デェ": "dye", "デャ": "dya", "デュ": "dyu", "デョ": "dyo",
        "トゥ": "tu", "ドゥ": "du",
        "ニィ": "nyi", "ニェ": "nye", "ニャ": "nya", "ニュ": "nyu", "ニョ": "nyo",
        "ヒィ": "hyi", "ヒェ": "hye", "ヒャ": "hya", "ヒュ": "hyu", "ヒョ": "hyo",
        "ビィ": "byi", "ビェ": "bye", "ビャ": "bya", "ビュ": "byu", "ビョ": "byo",
        "ピィ": "pyi", "ピェ": "pye", "ピャ": "pya", "ピュ": "pyu", "ピョ": "pyo",
        "ファ": "fa", "フィ": "fi", "フェ": "fe", "フォ": "fo",
        "ミィ": "myi", "ミェ": "mye", "ミャ": "mya", "ミュ": "myu", "ミョ": "myo",
        "リィ": "ryi", "リェ": "rye", "リャ": "rya", "リュ": "ryu", "リョ": "ryo",
        "ヴァ": "va", "ヴィ": "vi", "ヴェ": "ve", "ヴォ": "vo",
        "ヴャ": "bya", "ヴュ": "byu", "ヴョ": "byo",
    ]

    /// Single katakana → phoneme mappings.
    private static let singles: [Character: String] = [
        "ァ": "a", "ア": "a", "ィ": "i", "イ": "i",
        "ゥ": "u", "ウ": "u", "ェ": "e", "エ": "e",
        "ォ": "o", "オ": "o",
        "カ": "ka", "ガ": "ga", "キ": "ki", "ギ": "gi",
        "ク": "ku", "グ": "gu", "ケ": "ke", "ゲ": "ge",
        "コ": "ko", "ゴ": "go",
        "サ": "sa", "ザ": "za", "シ": "shi", "ジ": "ji",
        "ス": "su", "ズ": "zu", "セ": "se", "ゼ": "ze",
        "ソ": "so", "ゾ": "zo",
        "タ": "ta", "ダ": "da", "チ": "chi", "ヂ": "ji",
        "ツ": "tsu", "ヅ": "zu", "テ": "te", "デ": "de",
        "ト": "to", "ド": "do",
        "ナ": "na", "ニ": "ni", "ヌ": "nu", "ネ": "ne", "ノ": "no",
        "ハ": "ha", "バ": "ba", "パ": "pa",
        "ヒ": "hi", "ビ": "bi", "ピ": "pi",
        "フ": "fu", "ブ": "bu", "プ": "pu",
        "ヘ": "he", "ベ": "be", "ペ": "pe",
        "ホ": "ho", "ボ": "bo", "ポ": "po",
        "マ": "ma", "ミ": "mi", "ム": "mu", "メ": "me", "モ": "mo",
        "ャ": "ya", "ヤ": "ya", "ュ": "yu", "ユ": "yu",
        "ョ": "yo", "ヨ": "yo",
        "ラ": "ra", "リ": "ri", "ル": "ru", "レ": "re", "ロ": "ro",
        "ヮ": "wa", "ワ": "wa", "ヰ": "i", "ヱ": "e", "ヲ": "o",
        "ヴ": "vu", "ヵ": "ka", "ヶ": "ke",
        "ヷ": "va", "ヸ": "vi", "ヹ": "ve", "ヺ": "vo",
        "ッ": "ʔ", "ン": "ɴ", "ー": "ː",
    ]

    // MARK: - Japanese Punctuation

    /// Maps Japanese (and a few ASCII) punctuation marks to the ASCII
    /// punctuation emitted in the phoneme string.
    private static let punctuationMap: [Character: String] = [
        "「": "\"", "」": "\"", "『": "\"", "』": "\"",
        "【": "\"", "】": "\"", "〈": "\"", "〉": "\"",
        "《": "\"", "》": "\"", "«": "\"", "»": "\"",
        "、": ",", "。": ".", "!": "!", "?": "?",
        "(": "(", ")": ")", ":": ":", ";": ";",
        // Fullwidth forms, written as escapes so they cannot be silently
        // folded to their ASCII look-alikes by tooling. Without these entries
        // fullwidth punctuation is dropped from the output.
        "\u{FF01}": "!", "\u{FF1F}": "?",
        "\u{FF08}": "(", "\u{FF09}": ")",
        "\u{FF1A}": ":", "\u{FF1B}": ";",
        "\u{FF0C}": ",", "\u{FF0E}": ".",
    ]

    // MARK: - Public API

    /// Convert Japanese text to a phoneme string for Kokoro TTS.
    ///
    /// Words reported by the tokenizer are converted through their Latin
    /// transcription; tokens without a transcription are skipped. Text between
    /// tokens is scanned only for punctuation. A single space separates two
    /// consecutive words unless punctuation or whitespace occurred between them.
    ///
    /// - Parameter text: Input text.
    /// - Returns: The phoneme string, or `""` for empty input.
    func phonemize(_ text: String) -> String {
        // The tokenizer reports ranges in UTF-16 code units, so all slicing is
        // done on the NSString view. Mixing those offsets with Character
        // offsets breaks (or traps) on emoji and combining sequences.
        let utf16Text = text as NSString
        let length = utf16Text.length
        guard length > 0 else { return "" }

        let tokenizer = CFStringTokenizerCreate(
            nil, text as CFString, CFRangeMake(0, length),
            kCFStringTokenizerUnitWord, Locale(identifier: "ja_JP") as CFLocale)

        var result = ""
        var lastWasWord = false
        var cursor = 0  // UTF-16 offset just past the previously handled token

        while !CFStringTokenizerAdvanceToNextToken(tokenizer).isEmpty {
            let tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer)
            let reading = CFStringTokenizerCopyCurrentTokenAttribute(
                tokenizer, kCFStringTokenizerAttributeLatinTranscription) as? String

            // Punctuation / whitespace sitting between the previous token and this one.
            if tokenRange.location > cursor {
                let gap = utf16Text.substring(
                    with: NSRange(location: cursor, length: tokenRange.location - cursor))
                if Self.appendPunctuation(in: gap, to: &result) {
                    lastWasWord = false
                }
            }

            if let reading = reading {
                if lastWasWord { result += " " }
                // Romaji reading → katakana → phonemes.
                result += Self.katakanaToPhonemes(Self.romajiToKatakana(reading))
                lastWasWord = true
            }

            cursor = tokenRange.location + tokenRange.length
        }

        // Trailing punctuation after the last token.
        if cursor < length {
            Self.appendPunctuation(in: utf16Text.substring(from: cursor), to: &result)
        }

        return result
    }

    // MARK: - Katakana → Phonemes

    /// Convert a katakana string to a phoneme string using the lookup tables.
    ///
    /// Two-character digraphs take priority over single characters. Characters
    /// found in neither table are skipped silently.
    ///
    /// - Parameter katakana: Katakana text.
    /// - Returns: Concatenated phonemes for every recognised character.
    static func katakanaToPhonemes(_ katakana: String) -> String {
        let chars = Array(katakana)
        var phonemes = ""
        var index = 0

        while index < chars.count {
            // Try a digraph first (two characters).
            if index + 1 < chars.count,
               let phoneme = digraphs[String(chars[index]) + String(chars[index + 1])] {
                phonemes += phoneme
                index += 2
                continue
            }

            // Single character; unknown characters contribute nothing.
            if let phoneme = singles[chars[index]] {
                phonemes += phoneme
            }
            index += 1
        }

        return phonemes
    }

    // MARK: - Romaji → Katakana

    /// Convert romaji to katakana for table lookup.
    /// Uses `CFStringTransform` (Apple's built-in Latin→Katakana transform).
    static func romajiToKatakana(_ romaji: String) -> String {
        let mutable = NSMutableString(string: romaji)
        CFStringTransform(mutable, nil, kCFStringTransformLatinKatakana, false)
        return mutable as String
    }

    // MARK: - Helpers

    /// Appends the ASCII form of every punctuation mark found in `segment`.
    ///
    /// - Returns: `true` when `segment` contained punctuation, a symbol, or
    ///   whitespace — i.e. anything that breaks word adjacency.
    @discardableResult
    private static func appendPunctuation(in segment: String, to output: inout String) -> Bool {
        var sawSeparator = false
        for ch in segment {
            if let mapped = punctuationMap[ch] {
                output += mapped
                sawSeparator = true
            } else if ch.isPunctuation || ch.isSymbol {
                // Unmapped marks are dropped unless they are plain ASCII punctuation.
                if let ascii = asciiPunct(ch) { output += ascii }
                sawSeparator = true
            } else if ch.isWhitespace {
                sawSeparator = true
            }
        }
        return sawSeparator
    }

    /// Returns the ASCII punctuation string for `ch`, or `nil` when `ch` is not
    /// one of the punctuation marks passed through to the phoneme string.
    private static func asciiPunct(_ ch: Character) -> String? {
        switch ch {
        case ",": return ","
        case ".": return "."
        case "!": return "!"
        case "?": return "?"
        case ";": return ";"
        case ":": return ":"
        case "-": return "-"
        default: return nil
        }
    }
}
|
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
|
|
3
|
+
/// Grapheme-to-phoneme conversion for Latin-script languages
/// (French, Spanish, Portuguese, Italian).
///
/// Rule-based orthography→IPA conversion. Each language has its own rule table
/// for digraphs and accented letters, plus a few context-dependent rules.
final class LatinPhonemizer {

    /// Languages with a rule table in this class.
    enum Language {
        case french, spanish, portuguese, italian
    }

    private typealias Rule = (pattern: String, ipa: String)
    private typealias Match = (ipa: String, length: Int)

    private let language: Language

    init(language: Language) {
        self.language = language
    }

    // MARK: - Public API

    /// Convert a sentence to a phoneme string.
    ///
    /// Words are lowercased and converted with the language rules; punctuation
    /// is passed through unchanged. Each run of whitespace between two emitted
    /// items becomes a single space; leading and trailing whitespace is dropped.
    ///
    /// - Parameter text: Input text.
    /// - Returns: The phoneme string (`""` for empty input).
    func phonemize(_ text: String) -> String {
        var result = ""
        var lastWasWord = false
        var pendingSpace = false  // whitespace seen since the last emitted item

        // Emit at most one separator, never at the start and never doubled.
        func flushSeparator(force: Bool) {
            if (pendingSpace || force), !result.isEmpty, result.last != " " {
                result += " "
            }
            pendingSpace = false
        }

        for token in tokenize(text) {
            switch token {
            case .word(let word):
                flushSeparator(force: lastWasWord)
                result += convertWord(word.lowercased())
                lastWasWord = true
            case .punctuation(let mark):
                flushSeparator(force: false)
                result += mark
                lastWasWord = false
            case .space:
                // Remember the break; it is emitted lazily so trailing
                // whitespace never reaches the output.
                pendingSpace = true
            }
        }

        return result
    }

    // MARK: - Tokenization

    private enum Token {
        case word(String)
        case punctuation(String)
        case space
    }

    /// Splits `text` into words, single punctuation marks, and whitespace.
    ///
    /// Letters, apostrophes (ASCII and typographic) and hyphens build words.
    /// Characters that are neither whitespace, punctuation nor symbols
    /// (e.g. digits) are kept inside the current word.
    private func tokenize(_ text: String) -> [Token] {
        var tokens: [Token] = []
        var current = ""

        func flushWord() {
            guard !current.isEmpty else { return }
            tokens.append(.word(current))
            current = ""
        }

        for ch in text {
            if ch.isWhitespace {
                flushWord()
                tokens.append(.space)
            } else if ch.isLetter || ch == "'" || ch == "\u{2019}" || ch == "-" {
                // U+2019 is the typographic apostrophe (l’eau, dell’arte).
                current.append(ch)
            } else if ch.isPunctuation || ch.isSymbol {
                flushWord()
                tokens.append(.punctuation(String(ch)))
            } else {
                current.append(ch)
            }
        }
        flushWord()

        return tokens
    }

    // MARK: - Word Conversion

    /// Public word-level phonemization for dictionary fallback.
    func phonemizeWord(_ word: String) -> String {
        return convertWord(word.lowercased())
    }

    /// Converts one lowercased word to Kokoro-format phonemes.
    private func convertWord(_ word: String) -> String {
        var ipa: String
        switch language {
        case .french: ipa = frenchToIPA(word)
        case .spanish: ipa = spanishToIPA(word)
        case .portuguese: ipa = portugueseToIPA(word)
        case .italian: ipa = italianToIPA(word)
        }

        // Apply E2M mappings to match Kokoro's training format.
        // Kokoro was trained with espeak-ng output post-processed by misaki.
        for (from, to) in Self.e2mMappings {
            ipa = ipa.replacingOccurrences(of: from, with: to)
        }

        // Heuristic: words whose transcription has at least four characters
        // are treated as multi-syllable and get a leading primary stress mark.
        return ipa.count >= 4 ? "ˈ" + ipa : ipa
    }

    // MARK: - E2M Post-Processing (Kokoro Training Format)

    /// Mappings from standard IPA to Kokoro's internal format.
    /// Sorted longest-first for correct greedy replacement.
    private static let e2mMappings: [(from: String, to: String)] = [
        // Affricates → ligatures (multi-char to single)
        ("dʒ", "ʤ"), ("tʃ", "ʧ"), ("dz", "ʣ"),
        // Consonant normalizations
        ("ʁ", "ɹ"),  // Uvular r → alveolar approximant
        ("ɐ", "ə"),  // Near-open central → schwa
    ]

    // MARK: - Shared Rule Engine

    /// Walks `word` left to right. At each position `contextRule` is tried
    /// first, then the longest pattern in `rules` (up to `maxPatternLength`
    /// characters). Characters matching nothing are copied through unchanged.
    private static func transcribe(
        _ word: String,
        rules: [Rule],
        maxPatternLength: Int,
        contextRule: ([Character], Int) -> Match? = { _, _ in nil }
    ) -> String {
        let chars = Array(word)
        var output = ""
        var index = 0

        while index < chars.count {
            if let match = contextRule(chars, index)
                ?? longestRule(in: chars, at: index, maxLength: maxPatternLength, rules: rules) {
                output += match.ipa
                index += match.length
            } else {
                output += String(chars[index])
                index += 1
            }
        }

        return output
    }

    /// Longest-match lookup of `rules` at `index`, trying lengths from
    /// `maxLength` down to one character.
    private static func longestRule(
        in chars: [Character], at index: Int, maxLength: Int, rules: [Rule]
    ) -> Match? {
        let longest = min(maxLength, chars.count - index)
        guard longest >= 1 else { return nil }
        for length in stride(from: longest, through: 1, by: -1) {
            let candidate = String(chars[index..<index + length])
            if let rule = rules.first(where: { $0.pattern == candidate }) {
                return (rule.ipa, length)
            }
        }
        return nil
    }

    /// "Soft" value of `c` / `g` when the following letter is in `frontVowels`.
    /// Consumes only the consonant; the vowel is handled on the next step.
    private static func softConsonant(
        in chars: [Character], at index: Int,
        frontVowels: String, softC: String, softG: String
    ) -> Match? {
        guard index + 1 < chars.count, frontVowels.contains(chars[index + 1]) else { return nil }
        switch chars[index] {
        case "c": return (softC, 1)
        case "g": return (softG, 1)
        default: return nil
        }
    }

    // MARK: - French G2P

    /// French grapheme-to-phoneme rules.
    private static let frenchRules: [Rule] = [
        // Trigraphs / special combos
        ("eau", "oː"), ("aux", "oː"), ("eux", "øː"), ("oeu", "œː"),
        ("ain", "ɛ̃"), ("ein", "ɛ̃"), ("oin", "wɛ̃"),
        ("ien", "jɛ̃"), ("ion", "jɔ̃"),
        // Digraphs
        ("ou", "uː"), ("oi", "waː"), ("ai", "ɛː"), ("ei", "ɛː"),
        ("au", "oː"), ("eu", "øː"), ("ch", "ʃ"), ("ph", "f"),
        ("th", "t"), ("gn", "ɲ"), ("qu", "k"), ("gu", "ɡ"),
        ("ll", "l"), ("ss", "s"), ("tt", "t"), ("nn", "n"),
        ("mm", "m"), ("pp", "p"), ("rr", "ʁ"), ("ff", "f"),
        // Accented vowels
        ("é", "eː"), ("è", "ɛː"), ("ê", "ɛː"), ("ë", "ɛ"),
        ("à", "aː"), ("â", "ɑː"), ("ù", "yː"), ("û", "yː"),
        ("î", "iː"), ("ï", "i"), ("ô", "oː"), ("ü", "yː"),
        ("ç", "s"), ("œ", "œ"),
        // Basic
        ("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "ə"),
        ("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "ʒ"),
        ("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
        ("p", "p"), ("r", "ʁ"), ("s", "s"), ("t", "t"), ("u", "y"),
        ("v", "v"), ("w", "w"), ("x", "ks"), ("y", "i"), ("z", "z"),
    ]

    /// Vowel + n/m pairs that nasalise.
    private static let frenchNasals: [String: String] = [
        "on": "ɔ̃", "om": "ɔ̃",
        "an": "ɑ̃", "am": "ɑ̃",
        "en": "ɑ̃", "em": "ɑ̃",
        "in": "ɛ̃", "im": "ɛ̃",
        "un": "œ̃", "um": "œ̃",
    ]

    /// Nasal vowel at `index`, unless the pair is followed by a vowel or by n/m.
    private static func frenchNasal(in chars: [Character], at index: Int) -> Match? {
        guard index + 1 < chars.count else { return nil }
        if index + 2 < chars.count {
            let following = chars[index + 2]
            if "aeiouyéèêëàâùûîïôüœ".contains(following) || following == "n" || following == "m" {
                return nil
            }
        }
        guard let nasal = frenchNasals[String(chars[index...index + 1])] else { return nil }
        return (nasal, 2)
    }

    private func frenchToIPA(_ word: String) -> String {
        var ipa = Self.transcribe(word, rules: Self.frenchRules, maxPatternLength: 3) { chars, index in
            // c before e/i/y = s, g before e/i/y = ʒ; otherwise try a nasal vowel.
            if let soft = Self.softConsonant(
                in: chars, at: index, frontVowels: "eiéèêëîïy", softC: "s", softG: "ʒ") {
                return soft
            }
            return Self.frenchNasal(in: chars, at: index)
        }

        // Drop a silent final consonant (d, t, s, x, z, p), but never the only sound.
        if ipa.count > 1, let last = ipa.last, "dtsxzp".contains(last) {
            ipa.removeLast()
        }

        return ipa
    }

    // MARK: - Spanish G2P

    /// Spanish is very regular — nearly 1:1 grapheme-to-phoneme.
    private static let spanishRules: [Rule] = [
        // Digraphs
        ("ch", "tʃ"), ("ll", "ʝ"), ("rr", "rː"), ("qu", "k"),
        ("gu", "ɡ"), ("gü", "ɡw"),
        ("ñ", "ɲ"),
        // Accented vowels (stressed — add length)
        ("á", "aː"), ("é", "eː"), ("í", "iː"), ("ó", "oː"), ("ú", "uː"), ("ü", "w"),
        // Basic
        ("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "e"),
        ("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "x"),
        ("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
        ("p", "p"), ("r", "ɾ"), ("s", "s"), ("t", "t"), ("u", "u"),
        ("v", "b"), ("w", "w"), ("x", "ks"), ("y", "ʝ"), ("z", "θ"),
    ]

    private func spanishToIPA(_ word: String) -> String {
        return Self.transcribe(word, rules: Self.spanishRules, maxPatternLength: 2) { chars, index in
            // c before e/i = θ, g before e/i = x
            Self.softConsonant(in: chars, at: index, frontVowels: "eiéí", softC: "θ", softG: "x")
        }
    }

    // MARK: - Portuguese G2P

    private static let portugueseRules: [Rule] = [
        // Digraphs / trigraphs
        ("ção", "saːw̃"), ("ções", "sõːjs"), ("nh", "ɲ"), ("lh", "ʎ"),
        ("ch", "ʃ"), ("qu", "k"), ("gu", "ɡ"), ("rr", "ʁː"),
        ("ss", "s"), ("sc", "s"),
        // Explicit nasal diphthongs
        ("ão", "aːw̃"), ("ãe", "aːj̃"), ("õe", "oːj̃"),
        // Accented
        ("á", "aː"), ("â", "ɐː"), ("ã", "ɐ̃ː"), ("é", "ɛː"), ("ê", "eː"),
        ("í", "iː"), ("ó", "ɔː"), ("ô", "oː"), ("õ", "õː"), ("ú", "uː"),
        ("ç", "s"),
        // Diphthongs
        ("ou", "oː"), ("ei", "eːj"), ("ai", "aːj"), ("oi", "oːj"),
        // Basic
        ("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "e"),
        ("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "ʒ"),
        ("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
        ("p", "p"), ("r", "ɾ"), ("s", "s"), ("t", "t"), ("u", "u"),
        ("v", "v"), ("w", "w"), ("x", "ʃ"), ("y", "i"), ("z", "z"),
    ]

    private func portugueseToIPA(_ word: String) -> String {
        return Self.transcribe(word, rules: Self.portugueseRules, maxPatternLength: 4) { chars, index in
            // c before e/i = s, g before e/i = ʒ. "ê" is included so that
            // words like "você" get /s/ rather than /k/.
            Self.softConsonant(in: chars, at: index, frontVowels: "eiéêí", softC: "s", softG: "ʒ")
        }
    }

    // MARK: - Italian G2P

    /// Italian is highly regular — nearly 1:1 grapheme-to-phoneme.
    /// Main exceptions: c/g before e/i, gl, gn, sc digraphs.
    private static let italianRules: [Rule] = [
        // Trigraphs
        ("gli", "ʎi"), ("sce", "ʃe"), ("sci", "ʃi"),
        ("ghi", "ɡi"), ("ghe", "ɡe"), ("chi", "ki"), ("che", "ke"),
        // Digraphs
        ("gn", "ɲ"), ("gl", "ʎ"), ("sc", "sk"),
        ("gh", "ɡ"), ("ch", "k"), ("qu", "kw"),
        ("ci", "tʃi"), ("ce", "tʃe"),
        ("gi", "dʒi"), ("ge", "dʒe"),
        ("zz", "tːs"), ("ss", "sː"), ("rr", "rː"), ("ll", "lː"),
        ("nn", "nː"), ("mm", "mː"), ("pp", "pː"), ("tt", "tː"),
        ("cc", "kː"), ("ff", "fː"), ("bb", "bː"), ("dd", "dː"),
        ("gg", "ɡː"),
        // Accented vowels
        ("à", "a"), ("è", "ɛ"), ("é", "e"), ("ì", "i"), ("ò", "ɔ"), ("ó", "o"), ("ù", "u"),
        // Basic — Italian vowels are pure, consonants are straightforward
        ("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "e"),
        ("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "j"),
        ("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
        ("p", "p"), ("r", "r"), ("s", "s"), ("t", "t"), ("u", "u"),
        ("v", "v"), ("w", "w"), ("x", "ks"), ("y", "i"), ("z", "ts"),
    ]

    private func italianToIPA(_ word: String) -> String {
        // All context handling is encoded in the rule table itself.
        return Self.transcribe(word, rules: Self.italianRules, maxPatternLength: 3)
    }

}
|
|
35
|
+
case .space:
|
|
36
|
+
lastWasWord = false
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
return result
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// MARK: - Tokenization
|
|
44
|
+
|
|
45
|
+
private enum Token {
|
|
46
|
+
case word(String)
|
|
47
|
+
case punctuation(String)
|
|
48
|
+
case space
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
private func tokenize(_ text: String) -> [Token] {
|
|
52
|
+
var tokens: [Token] = []
|
|
53
|
+
var current = ""
|
|
54
|
+
|
|
55
|
+
for ch in text {
|
|
56
|
+
if ch.isWhitespace {
|
|
57
|
+
if !current.isEmpty { tokens.append(.word(current)); current = "" }
|
|
58
|
+
tokens.append(.space)
|
|
59
|
+
} else if ch.isLetter || ch == "'" || ch == "'" || ch == "-" {
|
|
60
|
+
current.append(ch)
|
|
61
|
+
} else if ch.isPunctuation || ch.isSymbol {
|
|
62
|
+
if !current.isEmpty { tokens.append(.word(current)); current = "" }
|
|
63
|
+
tokens.append(.punctuation(String(ch)))
|
|
64
|
+
} else {
|
|
65
|
+
current.append(ch)
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
if !current.isEmpty { tokens.append(.word(current)) }
|
|
69
|
+
|
|
70
|
+
return tokens
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// MARK: - Word Conversion
|
|
74
|
+
|
|
75
|
+
/// Public word-level phonemization for dictionary fallback.
|
|
76
|
+
func phonemizeWord(_ word: String) -> String {
|
|
77
|
+
return convertWord(word.lowercased())
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
private func convertWord(_ word: String) -> String {
|
|
81
|
+
var ipa: String
|
|
82
|
+
switch language {
|
|
83
|
+
case .french: ipa = frenchToIPA(word)
|
|
84
|
+
case .spanish: ipa = spanishToIPA(word)
|
|
85
|
+
case .portuguese: ipa = portugueseToIPA(word)
|
|
86
|
+
case .italian: ipa = italianToIPA(word)
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// Apply E2M mappings to match Kokoro's training format.
|
|
90
|
+
// Kokoro was trained with espeak-ng output post-processed by misaki.
|
|
91
|
+
// These replace multi-char IPA sequences with single-char equivalents.
|
|
92
|
+
for (from, to) in Self.e2mMappings {
|
|
93
|
+
ipa = ipa.replacingOccurrences(of: from, with: to)
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
// Add primary stress mark for multi-syllable words.
|
|
97
|
+
if ipa.count >= 4 {
|
|
98
|
+
return "ˈ" + ipa
|
|
99
|
+
}
|
|
100
|
+
return ipa
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
// MARK: - E2M Post-Processing (Kokoro Training Format)
|
|
104
|
+
|
|
105
|
+
/// Mappings from standard IPA to Kokoro's internal format.
|
|
106
|
+
/// Sorted longest-first for correct greedy replacement.
|
|
107
|
+
private static let e2mMappings: [(from: String, to: String)] = [
|
|
108
|
+
// Affricates → ligatures (multi-char to single)
|
|
109
|
+
("dʒ", "ʤ"), ("tʃ", "ʧ"), ("dz", "ʣ"),
|
|
110
|
+
// Consonant normalizations
|
|
111
|
+
("ʁ", "ɹ"), // French/German uvular → alveolar approximant
|
|
112
|
+
("ɐ", "ə"), // Near-open central → schwa
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
// MARK: - French G2P
|
|
116
|
+
|
|
117
|
+
/// French grapheme-to-phoneme rules.
|
|
118
|
+
/// Nasals only before consonants (not before vowels or n/m).
|
|
119
|
+
private static let frenchRules: [(pattern: String, ipa: String)] = [
|
|
120
|
+
// Trigraphs / special combos
|
|
121
|
+
("eau", "oː"), ("aux", "oː"), ("eux", "øː"), ("oeu", "œː"),
|
|
122
|
+
("ain", "ɛ̃"), ("ein", "ɛ̃"), ("oin", "wɛ̃"),
|
|
123
|
+
("ien", "jɛ̃"), ("ion", "jɔ̃"),
|
|
124
|
+
// Digraphs
|
|
125
|
+
("ou", "uː"), ("oi", "waː"), ("ai", "ɛː"), ("ei", "ɛː"),
|
|
126
|
+
("au", "oː"), ("eu", "øː"), ("ch", "ʃ"), ("ph", "f"),
|
|
127
|
+
("th", "t"), ("gn", "ɲ"), ("qu", "k"), ("gu", "ɡ"),
|
|
128
|
+
("ll", "l"), ("ss", "s"), ("tt", "t"), ("nn", "n"),
|
|
129
|
+
("mm", "m"), ("pp", "p"), ("rr", "ʁ"), ("ff", "f"),
|
|
130
|
+
// Accented vowels
|
|
131
|
+
("é", "eː"), ("è", "ɛː"), ("ê", "ɛː"), ("ë", "ɛ"),
|
|
132
|
+
("à", "aː"), ("â", "ɑː"), ("ù", "yː"), ("û", "yː"),
|
|
133
|
+
("î", "iː"), ("ï", "i"), ("ô", "oː"), ("ü", "yː"),
|
|
134
|
+
("ç", "s"), ("œ", "œ"),
|
|
135
|
+
// Basic
|
|
136
|
+
("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "ə"),
|
|
137
|
+
("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "ʒ"),
|
|
138
|
+
("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
|
|
139
|
+
("p", "p"), ("r", "ʁ"), ("s", "s"), ("t", "t"), ("u", "y"),
|
|
140
|
+
("v", "v"), ("w", "w"), ("x", "ks"), ("y", "i"), ("z", "z"),
|
|
141
|
+
]
|
|
142
|
+
|
|
143
|
+
private func frenchToIPA(_ word: String) -> String {
|
|
144
|
+
var result = ""
|
|
145
|
+
let chars = Array(word)
|
|
146
|
+
var i = 0
|
|
147
|
+
|
|
148
|
+
while i < chars.count {
|
|
149
|
+
var matched = false
|
|
150
|
+
|
|
151
|
+
// Context-dependent: c before e/i/y = s, g before e/i = ʒ
|
|
152
|
+
if i + 1 < chars.count {
|
|
153
|
+
let next = chars[i + 1]
|
|
154
|
+
if chars[i] == "c" && "eiéèêëîïy".contains(next) {
|
|
155
|
+
result += "s"
|
|
156
|
+
i += 1
|
|
157
|
+
continue
|
|
158
|
+
}
|
|
159
|
+
if chars[i] == "g" && "eiéèêëîïy".contains(next) {
|
|
160
|
+
result += "ʒ"
|
|
161
|
+
i += 1
|
|
162
|
+
continue
|
|
163
|
+
}
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
// Nasal vowels: on/an/en/in/un before consonant (not before vowel or n/m)
|
|
167
|
+
if i + 1 < chars.count {
|
|
168
|
+
let pair = String(chars[i...i+1])
|
|
169
|
+
let afterNasal: Character? = (i + 2 < chars.count) ? chars[i + 2] : nil
|
|
170
|
+
let nasalFollowedByVowelOrNM = afterNasal != nil && "aeiouyéèêëàâùûîïôüœ".contains(afterNasal!) || afterNasal == "n" || afterNasal == "m"
|
|
171
|
+
if !nasalFollowedByVowelOrNM {
|
|
172
|
+
switch pair {
|
|
173
|
+
case "on", "om": result += "ɔ̃"; i += 2; continue
|
|
174
|
+
case "an", "am": result += "ɑ̃"; i += 2; continue
|
|
175
|
+
case "en", "em": result += "ɑ̃"; i += 2; continue
|
|
176
|
+
case "in", "im": result += "ɛ̃"; i += 2; continue
|
|
177
|
+
case "un", "um": result += "œ̃"; i += 2; continue
|
|
178
|
+
default: break
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
// Try longest match first (3, 2, 1 chars)
|
|
184
|
+
for len in stride(from: min(3, chars.count - i), through: 1, by: -1) {
|
|
185
|
+
let substr = String(chars[i..<i+len])
|
|
186
|
+
if let rule = Self.frenchRules.first(where: { $0.pattern == substr }) {
|
|
187
|
+
result += rule.ipa
|
|
188
|
+
i += len
|
|
189
|
+
matched = true
|
|
190
|
+
break
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
if !matched {
|
|
194
|
+
result += String(chars[i])
|
|
195
|
+
i += 1
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
// Drop silent final consonants (French rule: d, t, s, x, z, p are silent at end)
|
|
200
|
+
if result.count > 1 {
|
|
201
|
+
let last = result.last!
|
|
202
|
+
if "dtsxzp".contains(last) {
|
|
203
|
+
result = String(result.dropLast())
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
return result
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
// MARK: - Spanish G2P
|
|
211
|
+
|
|
212
|
+
/// Spanish is very regular — nearly 1:1 grapheme-to-phoneme.
|
|
213
|
+
private static let spanishRules: [(pattern: String, ipa: String)] = [
|
|
214
|
+
// Digraphs
|
|
215
|
+
("ch", "tʃ"), ("ll", "ʝ"), ("rr", "rː"), ("qu", "k"),
|
|
216
|
+
("gu", "ɡ"), ("gü", "ɡw"),
|
|
217
|
+
("ñ", "ɲ"),
|
|
218
|
+
// Accented vowels (stressed — add length)
|
|
219
|
+
("á", "aː"), ("é", "eː"), ("í", "iː"), ("ó", "oː"), ("ú", "uː"), ("ü", "w"),
|
|
220
|
+
// Basic
|
|
221
|
+
("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "e"),
|
|
222
|
+
("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "x"),
|
|
223
|
+
("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
|
|
224
|
+
("p", "p"), ("r", "ɾ"), ("s", "s"), ("t", "t"), ("u", "u"),
|
|
225
|
+
("v", "b"), ("w", "w"), ("x", "ks"), ("y", "ʝ"), ("z", "θ"),
|
|
226
|
+
]
|
|
227
|
+
|
|
228
|
+
private func spanishToIPA(_ word: String) -> String {
|
|
229
|
+
var result = ""
|
|
230
|
+
let chars = Array(word)
|
|
231
|
+
var i = 0
|
|
232
|
+
|
|
233
|
+
while i < chars.count {
|
|
234
|
+
// Context: c before e/i = θ, g before e/i = x
|
|
235
|
+
if i + 1 < chars.count {
|
|
236
|
+
if chars[i] == "c" && "eiéí".contains(chars[i+1]) {
|
|
237
|
+
result += "θ"
|
|
238
|
+
i += 1
|
|
239
|
+
continue
|
|
240
|
+
}
|
|
241
|
+
if chars[i] == "g" && "eiéí".contains(chars[i+1]) {
|
|
242
|
+
result += "x"
|
|
243
|
+
i += 1
|
|
244
|
+
continue
|
|
245
|
+
}
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
var matched = false
|
|
249
|
+
for len in stride(from: min(2, chars.count - i), through: 1, by: -1) {
|
|
250
|
+
let substr = String(chars[i..<i+len])
|
|
251
|
+
if let rule = Self.spanishRules.first(where: { $0.pattern == substr }) {
|
|
252
|
+
result += rule.ipa
|
|
253
|
+
i += len
|
|
254
|
+
matched = true
|
|
255
|
+
break
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
if !matched {
|
|
259
|
+
result += String(chars[i])
|
|
260
|
+
i += 1
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
return result
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
// MARK: - Portuguese G2P
|
|
268
|
+
|
|
269
|
+
private static let portugueseRules: [(pattern: String, ipa: String)] = [
|
|
270
|
+
// Digraphs / trigraphs
|
|
271
|
+
("ção", "saːw̃"), ("ções", "sõːjs"), ("nh", "ɲ"), ("lh", "ʎ"),
|
|
272
|
+
("ch", "ʃ"), ("qu", "k"), ("gu", "ɡ"), ("rr", "ʁː"),
|
|
273
|
+
("ss", "s"), ("sc", "s"),
|
|
274
|
+
// Explicit nasal diphthongs
|
|
275
|
+
("ão", "aːw̃"), ("ãe", "aːj̃"), ("õe", "oːj̃"),
|
|
276
|
+
// Accented
|
|
277
|
+
("á", "aː"), ("â", "ɐː"), ("ã", "ɐ̃ː"), ("é", "ɛː"), ("ê", "eː"),
|
|
278
|
+
("í", "iː"), ("ó", "ɔː"), ("ô", "oː"), ("õ", "õː"), ("ú", "uː"),
|
|
279
|
+
("ç", "s"),
|
|
280
|
+
// Diphthongs
|
|
281
|
+
("ou", "oː"), ("ei", "eːj"), ("ai", "aːj"), ("oi", "oːj"),
|
|
282
|
+
// Basic
|
|
283
|
+
("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "e"),
|
|
284
|
+
("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "ʒ"),
|
|
285
|
+
("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
|
|
286
|
+
("p", "p"), ("r", "ɾ"), ("s", "s"), ("t", "t"), ("u", "u"),
|
|
287
|
+
("v", "v"), ("w", "w"), ("x", "ʃ"), ("y", "i"), ("z", "z"),
|
|
288
|
+
]
|
|
289
|
+
|
|
290
|
+
private func portugueseToIPA(_ word: String) -> String {
|
|
291
|
+
var result = ""
|
|
292
|
+
let chars = Array(word)
|
|
293
|
+
var i = 0
|
|
294
|
+
|
|
295
|
+
while i < chars.count {
|
|
296
|
+
// Context: c before e/i = s
|
|
297
|
+
if i + 1 < chars.count && chars[i] == "c" && "eiéí".contains(chars[i+1]) {
|
|
298
|
+
result += "s"
|
|
299
|
+
i += 1
|
|
300
|
+
continue
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
var matched = false
|
|
304
|
+
for len in stride(from: min(4, chars.count - i), through: 1, by: -1) {
|
|
305
|
+
let substr = String(chars[i..<i+len])
|
|
306
|
+
if let rule = Self.portugueseRules.first(where: { $0.pattern == substr }) {
|
|
307
|
+
result += rule.ipa
|
|
308
|
+
i += len
|
|
309
|
+
matched = true
|
|
310
|
+
break
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
if !matched {
|
|
314
|
+
result += String(chars[i])
|
|
315
|
+
i += 1
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
return result
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// MARK: - Italian G2P
|
|
323
|
+
|
|
324
|
+
/// Italian is highly regular — nearly 1:1 grapheme-to-phoneme.
|
|
325
|
+
/// Main exceptions: c/g before e/i, gl, gn, sc digraphs.
|
|
326
|
+
private static let italianRules: [(pattern: String, ipa: String)] = [
|
|
327
|
+
// Trigraphs
|
|
328
|
+
("gli", "ʎi"), ("sce", "ʃe"), ("sci", "ʃi"),
|
|
329
|
+
("ghi", "ɡi"), ("ghe", "ɡe"), ("chi", "ki"), ("che", "ke"),
|
|
330
|
+
// Digraphs
|
|
331
|
+
("gn", "ɲ"), ("gl", "ʎ"), ("sc", "sk"),
|
|
332
|
+
("gh", "ɡ"), ("ch", "k"), ("qu", "kw"),
|
|
333
|
+
("ci", "tʃi"), ("ce", "tʃe"),
|
|
334
|
+
("gi", "dʒi"), ("ge", "dʒe"),
|
|
335
|
+
("zz", "tːs"), ("ss", "sː"), ("rr", "rː"), ("ll", "lː"),
|
|
336
|
+
("nn", "nː"), ("mm", "mː"), ("pp", "pː"), ("tt", "tː"),
|
|
337
|
+
("cc", "kː"), ("ff", "fː"), ("bb", "bː"), ("dd", "dː"),
|
|
338
|
+
("gg", "ɡː"),
|
|
339
|
+
// Accented vowels
|
|
340
|
+
("à", "a"), ("è", "ɛ"), ("é", "e"), ("ì", "i"), ("ò", "ɔ"), ("ó", "o"), ("ù", "u"),
|
|
341
|
+
// Basic — Italian vowels are pure, consonants are straightforward
|
|
342
|
+
("a", "a"), ("b", "b"), ("c", "k"), ("d", "d"), ("e", "e"),
|
|
343
|
+
("f", "f"), ("g", "ɡ"), ("h", ""), ("i", "i"), ("j", "j"),
|
|
344
|
+
("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"), ("o", "o"),
|
|
345
|
+
("p", "p"), ("r", "r"), ("s", "s"), ("t", "t"), ("u", "u"),
|
|
346
|
+
("v", "v"), ("w", "w"), ("x", "ks"), ("y", "i"), ("z", "ts"),
|
|
347
|
+
]
|
|
348
|
+
|
|
349
|
+
private func italianToIPA(_ word: String) -> String {
|
|
350
|
+
var result = ""
|
|
351
|
+
let chars = Array(word)
|
|
352
|
+
var i = 0
|
|
353
|
+
|
|
354
|
+
while i < chars.count {
|
|
355
|
+
var matched = false
|
|
356
|
+
for len in stride(from: min(3, chars.count - i), through: 1, by: -1) {
|
|
357
|
+
let substr = String(chars[i..<i+len])
|
|
358
|
+
if let rule = Self.italianRules.first(where: { $0.pattern == substr }) {
|
|
359
|
+
result += rule.ipa
|
|
360
|
+
i += len
|
|
361
|
+
matched = true
|
|
362
|
+
break
|
|
363
|
+
}
|
|
364
|
+
}
|
|
365
|
+
if !matched {
|
|
366
|
+
result += String(chars[i])
|
|
367
|
+
i += 1
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
|
|
371
|
+
return result
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import CoreML
|
|
2
|
+
import Foundation
|
|
3
|
+
|
|
4
|
+
/// CoreML wrapper for Kokoro-82M end-to-end TTS inference.
///
/// Loads one pre-compiled end-to-end `.mlmodelc` bundle (the first of
/// `kokoro_5s`, `kokoro_10s`, `kokoro_15s`, `kokoro` found in the given
/// directory) and runs it with a single CoreML prediction call.
class KokoroNetwork {

    private let e2eModel: MLModel

    /// Model bundle base names, in lookup order.
    private static let candidateModelNames = ["kokoro_5s", "kokoro_10s", "kokoro_15s", "kokoro"]

    /// Load the E2E CoreML model from `directory`.
    ///
    /// - Parameters:
    ///   - directory: Directory containing a compiled Kokoro `.mlmodelc` bundle.
    ///   - computeUnits: Compute units CoreML may use. Defaults to `.all`.
    /// - Throws: `AudioModelError.modelLoadFailed` when none of the known
    ///   bundles exists, or the CoreML error raised while loading the bundle.
    init(directory: URL, computeUnits: MLComputeUnits = .all) throws {
        let config = MLModelConfiguration()
        config.computeUnits = computeUnits

        let candidates = KokoroNetwork.candidateModelNames.map {
            directory.appendingPathComponent("\($0).mlmodelc", isDirectory: true)
        }
        guard let modelURL = candidates.first(where: {
            FileManager.default.fileExists(atPath: $0.path)
        }) else {
            throw AudioModelError.modelLoadFailed(
                modelId: "kokoro",
                reason: "No Kokoro E2E model found in \(directory.path)")
        }

        e2eModel = try MLModel(contentsOf: modelURL, configuration: config)
    }

    // MARK: - E2E Inference

    /// Output tensors of one end-to-end prediction.
    struct E2EOutput {
        /// The model's `audio` output.
        let audio: MLMultiArray
        /// First element of the model's `audio_length_samples` output, as an integer.
        let audioLengthSamples: Int
        /// The model's `pred_dur` output.
        let predDur: MLMultiArray
    }

    /// Run one end-to-end prediction.
    ///
    /// - Parameters:
    ///   - inputIds: Fed to the model as `input_ids`.
    ///   - attentionMask: Fed to the model as `attention_mask`.
    ///   - refS: Fed to the model as `ref_s`.
    ///   - speed: Fed to the model as `speed`; a one-element tensor holding
    ///     `1.0` is used when `nil`.
    /// - Returns: The `audio`, `audio_length_samples` and `pred_dur` outputs.
    /// - Throws: `AudioModelError.inferenceFailed` when an output is missing or
    ///   the reported length is unusable; CoreML errors from tensor allocation
    ///   or prediction are propagated.
    func predictE2E(
        inputIds: MLMultiArray,
        attentionMask: MLMultiArray,
        refS: MLMultiArray,
        speed: MLMultiArray? = nil
    ) throws -> E2EOutput {
        // Fresh uniform values in [0, 1) for the `random_phases` input on every call.
        let randomPhases = try MLMultiArray(shape: [1, 9], dataType: .float32)
        for i in 0..<9 { randomPhases[i] = NSNumber(value: Float.random(in: 0..<1)) }

        let speedInput: MLMultiArray
        if let speed = speed {
            speedInput = speed
        } else {
            // Allocation failures propagate instead of crashing the process.
            let unitSpeed = try MLMultiArray(shape: [1], dataType: .float32)
            unitSpeed[0] = NSNumber(value: Float(1.0))
            speedInput = unitSpeed
        }

        let input = try MLDictionaryFeatureProvider(dictionary: [
            "input_ids": MLFeatureValue(multiArray: inputIds),
            "attention_mask": MLFeatureValue(multiArray: attentionMask),
            "ref_s": MLFeatureValue(multiArray: refS),
            "random_phases": MLFeatureValue(multiArray: randomPhases),
            "speed": MLFeatureValue(multiArray: speedInput),
        ])
        let output = try e2eModel.prediction(from: input)

        guard let audio = output.featureValue(for: "audio")?.multiArrayValue,
              let audioLen = output.featureValue(for: "audio_length_samples")?.multiArrayValue,
              let predDur = output.featureValue(for: "pred_dur")?.multiArrayValue else {
            throw AudioModelError.inferenceFailed(
                operation: "kokoro-e2e", reason: "Missing output tensors")
        }

        // Never dereference the data pointer of an empty tensor.
        guard audioLen.count > 0 else {
            throw AudioModelError.inferenceFailed(
                operation: "kokoro-e2e", reason: "Empty audio_length_samples tensor")
        }

        // Read the first element according to the tensor's element type.
        let rawLength: Double
        if #available(iOS 16.0, *), audioLen.dataType == .float16 {
            rawLength = Double(audioLen.dataPointer.assumingMemoryBound(to: Float16.self).pointee)
        } else if audioLen.dataType == .int32 {
            rawLength = Double(audioLen.dataPointer.assumingMemoryBound(to: Int32.self).pointee)
        } else if audioLen.dataType == .double {
            // Without this branch a double tensor would be reinterpreted as Float.
            rawLength = audioLen.dataPointer.assumingMemoryBound(to: Double.self).pointee
        } else {
            rawLength = Double(audioLen.dataPointer.assumingMemoryBound(to: Float.self).pointee)
        }

        // Truncate toward zero; NaN, infinity and out-of-range values become an
        // error rather than an integer-conversion trap.
        guard rawLength.isFinite,
              let lengthSamples = Int(exactly: rawLength.rounded(.towardZero)) else {
            throw AudioModelError.inferenceFailed(
                operation: "kokoro-e2e", reason: "Invalid audio_length_samples value")
        }

        return E2EOutput(audio: audio, audioLengthSamples: lengthSamples, predDur: predDur)
    }
}
|