@elizaos/capacitor-bun-runtime 2.0.3-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ElizaosCapacitorBunRuntime.podspec +54 -0
- package/LICENSE +21 -0
- package/README.md +127 -0
- package/dist/esm/definitions.d.ts +136 -0
- package/dist/esm/definitions.d.ts.map +1 -0
- package/dist/esm/definitions.js +14 -0
- package/dist/esm/definitions.js.map +1 -0
- package/dist/esm/index.d.ts +9 -0
- package/dist/esm/index.d.ts.map +1 -0
- package/dist/esm/index.js +11 -0
- package/dist/esm/index.js.map +1 -0
- package/dist/esm/web.d.ts +19 -0
- package/dist/esm/web.d.ts.map +1 -0
- package/dist/esm/web.js +44 -0
- package/dist/esm/web.js.map +1 -0
- package/dist/plugin.cjs.js +63 -0
- package/dist/plugin.cjs.js.map +1 -0
- package/dist/plugin.js +66 -0
- package/dist/plugin.js.map +1 -0
- package/ios/Sources/ElizaBunRuntimePlugin/BridgeInstaller.swift +94 -0
- package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntime.swift +705 -0
- package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntimePlugin.swift +1109 -0
- package/ios/Sources/ElizaBunRuntimePlugin/FullBunEngineHost.swift +677 -0
- package/ios/Sources/ElizaBunRuntimePlugin/JSContextHelpers.swift +226 -0
- package/ios/Sources/ElizaBunRuntimePlugin/SandboxPaths.swift +46 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/CryptoBridge.swift +238 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/ElizaSqliteVecBridge.m +28 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/FSBridge.swift +270 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPBridge.swift +153 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPServerBridge.swift +32 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridge.swift +233 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridgeImpl.swift +1863 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/LogBridge.swift +36 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/PathsBridge.swift +41 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/ProcessBridge.swift +80 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridge.swift +406 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridgeInstaller.swift +17 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteVecLoader.swift +66 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/UIBridge.swift +72 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlChinesePhonemizer.swift +313 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlConfiguration.swift +28 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlEngine.swift +325 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlHindiPhonemizer.swift +150 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlJapanesePhonemizer.swift +209 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlLatinPhonemizer.swift +374 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlModel.swift +87 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPhonemizer.swift +679 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPronunciationDicts.swift +131 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlSupport.swift +24 -0
- package/ios/Tests/llama-bridge-smoke-main.swift +92 -0
- package/package.json +68 -0
- package/src/bridge-contract.test.ts +127 -0
- package/src/definitions.d.ts +136 -0
- package/src/definitions.d.ts.map +1 -0
- package/src/definitions.ts +152 -0
- package/src/index.d.ts +9 -0
- package/src/index.d.ts.map +1 -0
- package/src/index.ts +16 -0
- package/src/web.d.ts +19 -0
- package/src/web.d.ts.map +1 -0
- package/src/web.ts +80 -0
|
@@ -0,0 +1,679 @@
|
|
|
1
|
+
import CoreML
|
|
2
|
+
import Foundation
|
|
3
|
+
import NaturalLanguage
|
|
4
|
+
|
|
5
|
+
/// Multilingual phonemizer for Kokoro TTS.
|
|
6
|
+
///
|
|
7
|
+
/// English: dictionary lookup → suffix stemming → CoreML BART G2P fallback.
|
|
8
|
+
/// Chinese/Japanese: dedicated language-specific phonemizers.
/// French/Spanish/Portuguese/Italian/Hindi: pronunciation dictionary + rule-based G2P.
|
|
10
|
+
public final class KokoroPhonemizer {
|
|
11
|
+
|
|
12
|
+
/// Symbol → token ID table used by `tokenize`; supplied through `init(vocab:)`.
private let vocab: [String: Int]

/// Inverse of `vocab` (token ID → symbol), kept for debugging.
private let idToToken: [Int: String]

/// High-confidence pronunciation entries; consulted before `silverDict`.
private var goldDict = [String: DictEntry]()

/// Lower-confidence pronunciation entries; consulted after `goldDict`.
private var silverDict = [String: DictEntry]()

/// CoreML encoder half of the G2P model, once loaded.
private var g2pEncoder: MLModel?

/// CoreML decoder half of the G2P model, once loaded.
private var g2pDecoder: MLModel?

/// Grapheme → ID table for the G2P encoder input.
private var graphemeToId = [String: Int]()

/// ID → phoneme table for the G2P decoder output.
private var idToPhoneme = [Int: String]()

/// G2P special-token IDs; `loadG2PModels` overwrites these from the G2P vocabulary file.
private var g2pBosId = 1
private var g2pEosId = 2
private var g2pPadId = 0

/// Lexical-class tagger used to pick between heteronym pronunciations.
private let tagger: NLTagger = .init(tagSchemes: [.lexicalClass])

/// Padding token ID.
public let padId = 0

/// Start-of-sequence token ID.
public let bosId = 1

/// End-of-sequence token ID.
public let eosId = 2
|
|
48
|
+
|
|
49
|
+
/// One pronunciation-dictionary entry.
enum DictEntry {
    /// A single pronunciation that applies regardless of part of speech.
    case simple(String)

    /// Pronunciations keyed by part-of-speech tag (plus an optional "DEFAULT" key).
    case heteronym([String: String])
}
|
|
54
|
+
|
|
55
|
+
/// Creates a phonemizer from a symbol → token ID vocabulary.
///
/// The reverse (ID → symbol) table is built without trapping when two symbols
/// share an ID: the lexicographically smallest symbol is kept so the result is
/// the same on every run.
///
/// - Parameter vocab: Mapping from symbol string to token ID.
public init(vocab: [String: Int]) {
    self.vocab = vocab
    // `Dictionary(uniqueKeysWithValues:)` is a runtime trap on duplicate keys, and
    // nothing guarantees that IDs in a loaded vocabulary file are unique.
    self.idToToken = Dictionary(
        vocab.map { symbol, id in (id, symbol) },
        uniquingKeysWith: { lhs, rhs in min(lhs, rhs) }
    )
}
|
|
60
|
+
|
|
61
|
+
/// Builds a phonemizer from a vocabulary JSON file.
///
/// Two layouts are accepted: `{"vocab": {"symbol": id, ...}, ...}` and the flat
/// `{"symbol": id, ...}` form. The nested form is tried first.
///
/// - Parameter url: Location of the vocabulary JSON file.
/// - Returns: A phonemizer initialised with the parsed vocabulary.
/// - Throws: File-read or JSON-parse errors, or an `NSError` in the "KokoroTTS"
///   domain when neither layout matches.
public static func loadVocab(from url: URL) throws -> KokoroPhonemizer {
    let parsed = try JSONSerialization.jsonObject(with: Data(contentsOf: url))

    if let wrapper = parsed as? [String: Any], let table = wrapper["vocab"] as? [String: Int] {
        return KokoroPhonemizer(vocab: table)
    }
    guard let table = parsed as? [String: Int] else {
        throw NSError(
            domain: "KokoroTTS",
            code: -1,
            userInfo: [NSLocalizedDescriptionKey: "Invalid vocab_index.json format"]
        )
    }
    return KokoroPhonemizer(vocab: table)
}
|
|
80
|
+
|
|
81
|
+
/// Loads the gold and silver pronunciation dictionaries found in `directory`.
///
/// File names are `<prefix>_gold.json` and `<prefix>_silver.json`, where the
/// prefix is "gb" when `british` is true and "us" otherwise. A file that does
/// not exist is skipped and leaves the corresponding dictionary untouched.
///
/// - Parameters:
///   - directory: Folder containing the dictionary files.
///   - british: Selects the "gb" files instead of the "us" ones.
/// - Throws: Any error raised while reading or parsing a file that exists.
public func loadDictionaries(from directory: URL, british: Bool = false) throws {
    let variant = british ? "gb" : "us"
    let fileManager = FileManager.default

    let goldFile = directory.appendingPathComponent("\(variant)_gold.json")
    if fileManager.fileExists(atPath: goldFile.path) {
        var parsed = try parseDictionary(from: goldFile)
        growDictionary(&parsed)
        goldDict = parsed
    }

    let silverFile = directory.appendingPathComponent("\(variant)_silver.json")
    if fileManager.fileExists(atPath: silverFile.path) {
        var parsed = try parseDictionary(from: silverFile)
        growDictionary(&parsed)
        silverDict = parsed
    }
}
|
|
96
|
+
|
|
97
|
+
/// Loads the G2P encoder and decoder CoreML models plus their vocabulary file.
///
/// Both models are configured for CPU-only execution. Everything is loaded into
/// locals first and committed only once nothing can throw any more, so a failure
/// part-way through leaves the previously loaded state intact.
///
/// The vocabulary file is optional. When present and its root is a JSON object,
/// `grapheme_to_id` and `id_to_phoneme` replace the current tables if they have
/// the expected shape, and `bos_token_id` / `eos_token_id` / `pad_token_id` fall
/// back to 1 / 2 / 0 when absent.
///
/// - Parameters:
///   - encoderURL: Location of the compiled encoder model.
///   - decoderURL: Location of the compiled decoder model.
///   - vocabURL: Location of the G2P vocabulary JSON.
/// - Throws: Model-loading, file-read, or JSON-parse errors.
public func loadG2PModels(encoderURL: URL, decoderURL: URL, vocabURL: URL) throws {
    let config = MLModelConfiguration()
    config.computeUnits = .cpuOnly
    let encoder = try MLModel(contentsOf: encoderURL, configuration: config)
    let decoder = try MLModel(contentsOf: decoderURL, configuration: config)

    var vocabJSON: [String: Any]?
    if FileManager.default.fileExists(atPath: vocabURL.path) {
        let data = try Data(contentsOf: vocabURL)
        vocabJSON = try JSONSerialization.jsonObject(with: data) as? [String: Any]
    }

    // Nothing below can throw: commit the new state.
    g2pEncoder = encoder
    g2pDecoder = decoder

    guard let json = vocabJSON else { return }

    if let g2id = json["grapheme_to_id"] as? [String: Int] {
        graphemeToId = g2id
    }
    if let id2p = json["id_to_phoneme"] as? [String: String] {
        let pairs = id2p.compactMap { key, phoneme in Int(key).map { ($0, phoneme) } }
        // Distinct string keys can parse to the same Int ("1" and "01"), which would
        // trap in `Dictionary(uniqueKeysWithValues:)`. Keep a deterministic winner.
        idToPhoneme = Dictionary(pairs, uniquingKeysWith: { lhs, rhs in min(lhs, rhs) })
    }
    g2pBosId = (json["bos_token_id"] as? Int) ?? 1
    g2pEosId = (json["eos_token_id"] as? Int) ?? 2
    g2pPadId = (json["pad_token_id"] as? Int) ?? 0
}
|
|
122
|
+
|
|
123
|
+
/// Legacy entry point that loads a single G2P model (kept for backward compatibility).
///
/// The model is stored as the encoder only; the decoder and the G2P vocabulary
/// tables are not touched by this method.
///
/// - Parameter url: Location of the compiled model.
/// - Throws: Any error raised while loading the model.
public func loadG2PModel(from url: URL) throws {
    let cpuOnly = MLModelConfiguration()
    cpuOnly.computeUnits = .cpuOnly

    let model = try MLModel(contentsOf: url, configuration: cpuOnly)
    g2pEncoder = model
}
|
|
129
|
+
|
|
130
|
+
// MARK: - Multilingual Phonemizers
|
|
131
|
+
|
|
132
|
+
// Per-language phonemizers, each created on first use.
private lazy var chinesePhonemizer: ChinesePhonemizer = .init()
private lazy var japanesePhonemizer: JapanesePhonemizer = .init()
private lazy var hindiPhonemizer: HindiPhonemizer = .init()

// The four Latin-script languages share one implementation, selected by `language`.
private lazy var frenchPhonemizer: LatinPhonemizer = .init(language: .french)
private lazy var spanishPhonemizer: LatinPhonemizer = .init(language: .spanish)
private lazy var portuguesePhonemizer: LatinPhonemizer = .init(language: .portuguese)
private lazy var italianPhonemizer: LatinPhonemizer = .init(language: .italian)
|
|
139
|
+
|
|
140
|
+
// MARK: - Tokenization
|
|
141
|
+
|
|
142
|
+
/// Converts text into token IDs using the phonemizer for `language`.
///
/// The output is `bosId`, one ID per recognised phoneme symbol, then `eosId`.
/// Symbols missing from the vocabulary are dropped. When the sequence exceeds
/// `maxLength` it is cut to `maxLength - 1` IDs and `eosId` is re-appended.
///
/// - Parameters:
///   - text: Input text.
///   - maxLength: Upper bound on the returned count. Values `<= 0` yield `[]`.
///   - language: Language code or name ("zh", "ja", "it", "fr", "es", "pt", "hi",
///     or a spelled-out name such as "french"). Matching ignores case and any
///     region/script subtag ("pt-BR", "zh_Hans"). Anything else takes the
///     English pipeline.
/// - Returns: Token IDs framed by BOS/EOS.
public func tokenize(_ text: String, maxLength: Int = 510, language: String = "en") -> [Int] {
    // `prefix(maxLength - 1)` below would trap on a negative count.
    guard maxLength > 0 else { return [] }

    // Reduce "pt-BR" / "ZH_hans" to the primary subtag so region variants route correctly.
    let primaryLanguage = language
        .lowercased()
        .split(whereSeparator: { $0 == "-" || $0 == "_" })
        .first
        .map { String($0) } ?? ""

    let phonemes: String
    switch primaryLanguage {
    case "zh", "cmn", "chinese", "mandarin":
        phonemes = chinesePhonemizer.phonemize(text)
    case "ja", "japanese":
        phonemes = japanesePhonemizer.phonemize(text)
    case "it", "italian":
        phonemes = phonemizeWithDict(text, dict: PronunciationDicts.it, fallback: italianPhonemizer)
    case "fr", "french":
        phonemes = phonemizeWithDict(text, dict: PronunciationDicts.fr, fallback: frenchPhonemizer)
    case "es", "spanish":
        phonemes = phonemizeWithDict(text, dict: PronunciationDicts.es, fallback: spanishPhonemizer)
    case "pt", "portuguese":
        phonemes = phonemizeWithDict(text, dict: PronunciationDicts.pt, fallback: portuguesePhonemizer)
    case "hi", "hindi":
        phonemes = phonemizeWithDict(text, dict: PronunciationDicts.hi, fallback: hindiPhonemizer)
    default:
        phonemes = textToPhonemes(text)
    }

    var ids = [bosId]

    for character in phonemes {
        if let id = vocab[String(character)] {
            ids.append(id)
            continue
        }
        // A Swift `Character` is a grapheme cluster, so a base symbol followed by a
        // combining mark (e.g. a nasal vowel written as vowel + U+0303) is one
        // Character and misses a vocabulary keyed by individual code points.
        // Retry scalar by scalar before giving up on the whole cluster.
        // NOTE(review): assumes the vocabulary is keyed per code point — confirm.
        guard character.unicodeScalars.count > 1 else { continue }
        for scalar in character.unicodeScalars {
            if let id = vocab[String(scalar)] {
                ids.append(id)
            }
        }
    }

    ids.append(eosId)

    if ids.count > maxLength {
        ids = Array(ids.prefix(maxLength - 1)) + [eosId]
    }

    return ids
}
|
|
183
|
+
|
|
184
|
+
/// Forces `ids` to exactly `length` elements.
///
/// Longer inputs are truncated from the end; shorter ones are right-filled with `padId`.
///
/// - Parameters:
///   - ids: Token IDs to resize.
///   - length: Desired element count.
/// - Returns: A new array of `length` IDs.
public func pad(_ ids: [Int], to length: Int) -> [Int] {
    guard ids.count < length else {
        return Array(ids.prefix(length))
    }
    let filler = Array(repeating: padId, count: length - ids.count)
    return ids + filler
}
|
|
189
|
+
|
|
190
|
+
// MARK: - Dictionary-Based Phonemization
|
|
191
|
+
|
|
192
|
+
/// Phonemizes Latin-script text word by word: dictionary first, rules second.
///
/// Each whitespace-separated token is lowercased and stripped of leading and
/// trailing punctuation/symbols. The remaining word is looked up in `dict`; on
/// a miss it is handed to `fallback.phonemizeWord`. Stripped punctuation is
/// emitted through `mapPunctuation` (unmapped marks are dropped).
///
/// A single space separates two consecutive words. No space is emitted after
/// trailing punctuation, because any trailing mark clears the "last was a word"
/// flag.
///
/// - Parameters:
///   - text: Input text.
///   - dict: Lowercased word → IPA table.
///   - fallback: Rule-based phonemizer for words missing from `dict`.
/// - Returns: The concatenated phoneme string.
private func phonemizeWithDict(_ text: String, dict: [String: String], fallback: LatinPhonemizer) -> String {
    var result = ""
    var lastWasWord = false

    // Split on every whitespace character. `CharacterSet.whitespaces` excludes
    // newlines, which used to glue "foo\nbar" into a single unknown word.
    for token in text.split(whereSeparator: { $0.isWhitespace }) {
        var clean = token.lowercased()

        var trailing = ""
        while let last = clean.last, last.isPunctuation || last.isSymbol {
            trailing.insert(last, at: trailing.startIndex)
            clean.removeLast()
        }
        var leading = ""
        while let first = clean.first, first.isPunctuation || first.isSymbol {
            leading.append(first)
            clean.removeFirst()
        }

        for mark in leading {
            if let mapped = mapPunctuation(mark) { result += mapped }
        }

        if !clean.isEmpty {
            if lastWasWord { result += " " }
            result += dict[clean] ?? fallback.phonemizeWord(clean)
            lastWasWord = true
        }

        for mark in trailing {
            if let mapped = mapPunctuation(mark) { result += mapped }
            lastWasWord = false
        }
    }

    return result
}
|
|
258
|
+
|
|
259
|
+
/// Phonemizes Hindi text word by word: dictionary first, `HindiPhonemizer` second.
///
/// Mirrors the Latin-script overload except that words are not lowercased. Each
/// whitespace-separated token is stripped of leading and trailing
/// punctuation/symbols; the remaining word is looked up in `dict` and, on a
/// miss, passed to `fallback.phonemizeWord`. Stripped punctuation is emitted
/// through `mapPunctuation` (unmapped marks are dropped).
///
/// - Parameters:
///   - text: Input text.
///   - dict: Word → IPA table.
///   - fallback: Rule-based phonemizer for words missing from `dict`.
/// - Returns: The concatenated phoneme string.
private func phonemizeWithDict(_ text: String, dict: [String: String], fallback: HindiPhonemizer) -> String {
    var result = ""
    var lastWasWord = false

    // Split on every whitespace character; `CharacterSet.whitespaces` excludes
    // newlines and would merge words across line breaks.
    for token in text.split(whereSeparator: { $0.isWhitespace }) {
        var clean = String(token)

        var trailing = ""
        while let last = clean.last, last.isPunctuation || last.isSymbol {
            trailing.insert(last, at: trailing.startIndex)
            clean.removeLast()
        }
        // Strip opening quotes/brackets too, as the Latin overload does, so that
        // a quoted or parenthesised word still hits the dictionary.
        var leading = ""
        while let first = clean.first, first.isPunctuation || first.isSymbol {
            leading.append(first)
            clean.removeFirst()
        }

        for mark in leading {
            if let mapped = mapPunctuation(mark) { result += mapped }
        }

        if !clean.isEmpty {
            if lastWasWord { result += " " }
            result += dict[clean] ?? fallback.phonemizeWord(clean)
            lastWasWord = true
        }

        for mark in trailing {
            if let mapped = mapPunctuation(mark) { result += mapped }
            lastWasWord = false
        }
    }

    return result
}
|
|
291
|
+
|
|
292
|
+
/// Maps a punctuation character to the ASCII punctuation symbol emitted in the phoneme string.
///
/// ASCII marks map to themselves. Their full-width CJK forms, the ideographic
/// full stop and the Devanagari danda map to the matching ASCII mark. Non-ASCII
/// characters are written as `\u{…}` escapes so they cannot be silently folded
/// into their ASCII look-alikes by an editor or normaliser, which would leave
/// duplicate patterns and the full-width forms unmapped.
///
/// - Parameter ch: Character to map.
/// - Returns: The symbol to emit, or `nil` when the character has no mapping.
private func mapPunctuation(_ ch: Character) -> String? {
    switch ch {
    case ",", "\u{FF0C}": return ","          // , and full-width comma
    case ".", "\u{3002}": return "."          // . and ideographic full stop
    case "!", "\u{FF01}": return "!"          // ! and full-width exclamation mark
    case "?", "\u{FF1F}": return "?"          // ? and full-width question mark
    case ";", "\u{FF1B}": return ";"          // ; and full-width semicolon
    case ":": return ":"
    case "\u{0964}": return "."               // Devanagari danda
    default: return nil
    }
}
|
|
304
|
+
|
|
305
|
+
// MARK: - Text-to-Phoneme Pipeline
|
|
306
|
+
|
|
307
|
+
/// English pipeline: normalise, split into tokens, tag parts of speech, resolve each token.
///
/// Whitespace tokens become a single space, punctuation/symbol tokens go through
/// `punctuationToPhoneme` (unmapped ones are dropped), and every other token is
/// resolved with `resolveWord` using the tag recorded for its lowercased form.
///
/// - Parameter text: Raw input text.
/// - Returns: The concatenated phoneme string.
func textToPhonemes(_ text: String) -> String {
    let cleaned = normalizeText(text)
    let tokens = splitWords(cleaned)
    let tagsByWord = tagPOS(cleaned)

    return tokens.reduce(into: "") { output, token in
        if token.allSatisfy({ $0.isWhitespace }) {
            output += " "
        } else if token.allSatisfy({ $0.isPunctuation || $0.isSymbol }) {
            if let symbol = punctuationToPhoneme(token) {
                output += symbol
            }
        } else if let spoken = resolveWord(token, pos: tagsByWord[token.lowercased()]) {
            output += spoken
        }
    }
}
|
|
331
|
+
|
|
332
|
+
// MARK: - Word Resolution
|
|
333
|
+
|
|
334
|
+
/// Resolves one word to phonemes by trying each strategy in order.
///
/// Order: hard-coded special cases, dictionary lookup, suffix stemming, neural
/// G2P. When all of them miss, the lowercased word itself is returned, so the
/// result is never `nil` in practice.
///
/// - Parameters:
///   - word: The word as it appeared in the text.
///   - pos: Part-of-speech tag for the word, if one was found.
/// - Returns: The phoneme string, or the lowercased word as a last resort.
private func resolveWord(_ word: String, pos: String?) -> String? {
    let lowered = word.lowercased()
    let resolved: String = specialCase(lowered, pos: pos)
        ?? lookupDict(lowered, pos: pos)
        ?? stemAndLookup(lowered)
        ?? bartG2P(lowered)
        ?? lowered
    return resolved
}
|
|
342
|
+
|
|
343
|
+
/// Looks a word up in the gold dictionary, then the silver one.
///
/// - Parameters:
///   - word: Dictionary key to look up.
///   - pos: Part-of-speech tag used to disambiguate heteronym entries.
/// - Returns: The resolved pronunciation, or `nil` when neither dictionary has the word.
private func lookupDict(_ word: String, pos: String?) -> String? {
    guard let entry = goldDict[word] ?? silverDict[word] else {
        return nil
    }
    return resolveEntry(entry, pos: pos)
}
|
|
348
|
+
|
|
349
|
+
/// Picks the pronunciation stored in a dictionary entry.
///
/// For a heteronym the candidates are tried in this order:
/// 1. the exact `pos` key;
/// 2. a key equal to `pos` ignoring case (so a tagger value such as "Noun"
///    matches a dictionary key such as "NOUN");
/// 3. the "DEFAULT" key;
/// 4. the value under the lexicographically smallest key.
///
/// Step 4 replaces `values.first`, whose result depends on hash ordering and can
/// change from run to run.
///
/// - Parameters:
///   - entry: Entry to resolve.
///   - pos: Part-of-speech tag, if known.
/// - Returns: The chosen pronunciation, or `""` for an empty heteronym map.
private func resolveEntry(_ entry: DictEntry, pos: String?) -> String {
    switch entry {
    case .simple(let phonemes):
        return phonemes
    case .heteronym(let posMap):
        if let pos {
            if let exact = posMap[pos] { return exact }
            // NOTE(review): the dictionary's tag vocabulary is not visible here; this
            // only bridges a pure case difference and never overrides an exact match.
            let folded = pos.lowercased()
            let caseInsensitiveKey = posMap.keys.sorted().first { $0.lowercased() == folded }
            if let key = caseInsensitiveKey, let match = posMap[key] { return match }
        }
        if let fallback = posMap["DEFAULT"] { return fallback }
        return posMap.keys.min().flatMap { posMap[$0] } ?? ""
    }
}
|
|
358
|
+
|
|
359
|
+
// MARK: - Special Cases
|
|
360
|
+
|
|
361
|
+
/// Returns a hard-coded pronunciation for a handful of words.
///
/// "a" is the only part-of-speech-sensitive entry: it is reduced when tagged
/// "Determiner" and pronounced as the letter name otherwise.
///
/// - Parameters:
///   - word: Lowercased word.
///   - pos: Part-of-speech tag, if known.
/// - Returns: The fixed pronunciation, or `nil` when the word is not special-cased.
private func specialCase(_ word: String, pos: String?) -> String? {
    if word == "a" {
        return pos == "Determiner" ? "ɐ" : "eɪ"
    }

    let fixedPronunciations: [String: String] = [
        "eliza": "ɪlˈaɪzə",
        "elizaos": "ɪlˈaɪzə oʊ ɛs",
        "the": "ðə",
        "an": "ən",
        "to": "tʊ",
        "of": "ʌv",
        "i": "aɪ",
    ]
    return fixedPronunciations[word]
}
|
|
376
|
+
|
|
377
|
+
// MARK: - Suffix Stemming
|
|
378
|
+
|
|
379
|
+
/// Tries the suffix stemmers in order (-s, -ed, -ing) and returns the first hit.
///
/// - Parameter word: Lowercased word that was not found in the dictionaries.
/// - Returns: Pronunciation built from a known stem, or `nil` if no stemmer matched.
private func stemAndLookup(_ word: String) -> String? {
    // `??` evaluates its right-hand side lazily, so later stemmers only run on a miss.
    stemS(word) ?? stemEd(word) ?? stemIng(word)
}
|
|
385
|
+
|
|
386
|
+
/// Derives the pronunciation of a word ending in "-s" from a known stem.
///
/// Stems are tried in this order: "-ies" → "y", "-es" removed, "-s" removed.
/// The ending is then chosen from the stem's final phoneme:
/// sibilant or affricate → "ɪz", voiceless consonant → "s", anything else → "z".
///
/// The same rule is applied to both the "-es" and "-s" stems. Previously the
/// "-s" branch never produced "ɪz" (so "horses" via "horse" came out as "…sz")
/// and the "-es" branch never produced "s".
///
/// - Parameter word: Lowercased word.
/// - Returns: Stem pronunciation plus ending, or `nil` when no stem is known.
private func stemS(_ word: String) -> String? {
    guard word.hasSuffix("s") && word.count > 2 else { return nil }

    // NOTE(review): assumes affricates are stored as the single symbols ʧ / ʤ;
    // listing them is harmless if the dictionaries spell them differently.
    let sibilants: Set<Character> = ["s", "z", "ʃ", "ʒ", "ʧ", "ʤ"]
    let voiceless: Set<Character> = ["p", "t", "k", "f", "θ"]

    func withPluralEnding(_ phonemes: String) -> String {
        guard let last = phonemes.last else { return phonemes + "z" }
        if sibilants.contains(last) { return phonemes + "ɪz" }
        if voiceless.contains(last) { return phonemes + "s" }
        return phonemes + "z"
    }

    if word.hasSuffix("ies"),
       let phonemes = lookupDict(String(word.dropLast(3)) + "y", pos: nil) {
        return phonemes + "z"
    }
    if word.hasSuffix("es") && word.count > 3,
       let phonemes = lookupDict(String(word.dropLast(2)), pos: nil) {
        return withPluralEnding(phonemes)
    }
    if let phonemes = lookupDict(String(word.dropLast(1)), pos: nil) {
        return withPluralEnding(phonemes)
    }
    return nil
}
|
|
408
|
+
|
|
409
|
+
/// Derives the pronunciation of a word ending in "-ed" from a known stem.
///
/// Stems are tried in this order:
/// 1. "-ied" → "y" ("carried" → "carry"), ending "d";
/// 2. only "-d" removed, for silent-e bases ("baked" → "bake");
/// 3. "-ed" removed ("walked" → "walk");
/// 4. "-ed" removed and a doubled final letter collapsed ("stopped" → "stop").
///
/// Step 2 is new, mirroring the "+e" retry in `stemIng`; such words used to fall
/// through to the G2P model. De-doubling now comes last, because trying it first
/// turned a base that really ends in a double letter ("called" → "call") into a
/// different word ("cal") whenever that shorter word was in the dictionary.
///
/// - Parameter word: Lowercased word.
/// - Returns: Stem pronunciation plus the ending from `edSuffix`, or `nil`.
private func stemEd(_ word: String) -> String? {
    guard word.hasSuffix("ed") && word.count > 3 else { return nil }

    if word.hasSuffix("ied"),
       let phonemes = lookupDict(String(word.dropLast(3)) + "y", pos: nil) {
        return phonemes + "d"
    }

    let silentEBase = String(word.dropLast(1))
    if let phonemes = lookupDict(silentEBase, pos: nil) {
        return phonemes + edSuffix(phonemes)
    }

    let base = String(word.dropLast(2))
    if let phonemes = lookupDict(base, pos: nil) {
        return phonemes + edSuffix(phonemes)
    }

    let letters = Array(base)
    if letters.count >= 2, letters[letters.count - 1] == letters[letters.count - 2],
       let phonemes = lookupDict(String(base.dropLast(1)), pos: nil) {
        return phonemes + edSuffix(phonemes)
    }
    return nil
}
|
|
430
|
+
|
|
431
|
+
/// Chooses the spoken form of the "-ed" ending from the stem's final phoneme.
///
/// - "t" or "d" → "ɪd"
/// - voiceless consonant (p, k, f, θ, s, ʃ, ʧ) → "t"
/// - anything else, including an empty stem → "d"
///
/// The affricate ʧ is included so that a stem ending in it takes "t", not "d".
/// NOTE(review): assumes the affricate is stored as the single symbol ʧ; the
/// extra set member is harmless if the dictionaries spell it differently.
///
/// - Parameter phonemes: Pronunciation of the stem.
/// - Returns: The ending to append.
private func edSuffix(_ phonemes: String) -> String {
    guard let last = phonemes.last else { return "d" }
    if last == "t" || last == "d" { return "ɪd" }

    let voiceless: Set<Character> = ["p", "k", "f", "θ", "s", "ʃ", "ʧ"]
    return voiceless.contains(last) ? "t" : "d"
}
|
|
438
|
+
|
|
439
|
+
/// Derives the pronunciation of a word ending in "-ing" from a known stem.
///
/// Stems are tried in this order:
/// 1. "-ing" removed ("walking" → "walk");
/// 2. "-ing" removed plus "e" ("making" → "make");
/// 3. "-ing" removed and a doubled final letter collapsed ("running" → "run").
///
/// De-doubling used to run first, which mis-stemmed bases that really end in a
/// double letter ("calling" → "cal", "seeing" → "se") whenever the shorter word
/// was in the dictionary. It is now the last resort.
///
/// - Parameter word: Lowercased word.
/// - Returns: Stem pronunciation plus "ɪŋ", or `nil` when no stem is known.
private func stemIng(_ word: String) -> String? {
    guard word.hasSuffix("ing") && word.count > 4 else { return nil }

    let base = String(word.dropLast(3))
    if let phonemes = lookupDict(base, pos: nil) { return phonemes + "ɪŋ" }
    if let phonemes = lookupDict(base + "e", pos: nil) { return phonemes + "ɪŋ" }

    let letters = Array(base)
    if letters.count >= 2, letters[letters.count - 1] == letters[letters.count - 2],
       let phonemes = lookupDict(String(base.dropLast(1)), pos: nil) {
        return phonemes + "ɪŋ"
    }
    return nil
}
|
|
454
|
+
|
|
455
|
+
// MARK: - BART G2P Neural Fallback
|
|
456
|
+
|
|
457
|
+
/// Phonemizes an out-of-vocabulary word with the CoreML encoder-decoder G2P model.
///
/// The word is encoded as BOS + one grapheme ID per character + EOS (unknown
/// characters use the "<unk>" ID, or 3 when the vocabulary has none), run
/// through the encoder once, then decoded greedily for at most 64 steps until
/// EOS is produced. Each step feeds the decoder the IDs so far, their positions
/// and a causal mask, and takes the argmax over the last position's logits.
///
/// - Parameter word: Word to phonemize.
/// - Returns: The decoded phoneme string, or `nil` when the models or vocabulary
///   are not loaded, the input exceeds 64 tokens, a prediction throws, or nothing
///   was decoded. If a step yields no usable logits, decoding stops and whatever
///   was decoded so far is returned.
private func bartG2P(_ word: String) -> String? {
    guard let encoder = g2pEncoder, let decoder = g2pDecoder else { return nil }
    guard !graphemeToId.isEmpty else { return nil }

    let maxSequenceLength = 64

    // Encode graphemes.
    var inputIds: [Int32] = [Int32(g2pBosId)]
    for char in word {
        let grapheme = String(char)
        let id = graphemeToId[grapheme]
            ?? graphemeToId[grapheme.lowercased()]
            ?? graphemeToId["<unk>"]
            ?? 3
        inputIds.append(Int32(id))
    }
    inputIds.append(Int32(g2pEosId))
    guard inputIds.count <= maxSequenceLength else { return nil }

    // Builds a [1, n] int32 array holding `values`.
    func makeInt32Row(_ values: [Int32]) throws -> MLMultiArray {
        let array = try MLMultiArray(shape: [1, NSNumber(value: values.count)], dataType: .int32)
        let pointer = array.dataPointer.assumingMemoryBound(to: Int32.self)
        for (index, value) in values.enumerated() { pointer[index] = value }
        return array
    }

    do {
        // Run encoder.
        let encInput = try makeInt32Row(inputIds)
        let encFeatures = try MLDictionaryFeatureProvider(dictionary: [
            "input_ids": MLFeatureValue(multiArray: encInput),
        ])
        let encOutput = try encoder.prediction(from: encFeatures)
        guard let hiddenStates = encOutput.featureValue(for: "encoder_hidden_states")?.multiArrayValue else {
            return nil
        }

        // Autoregressive decoding.
        var decoderIds: [Int32] = [Int32(g2pBosId)]

        for _ in 0..<maxSequenceLength {
            let decLen = decoderIds.count

            let decInput = try makeInt32Row(decoderIds)
            let posIds = try makeInt32Row((0..<decLen).map { Int32($0) })

            // Causal mask: position i may attend to positions j <= i only.
            let mask = try MLMultiArray(
                shape: [1, NSNumber(value: decLen), NSNumber(value: decLen)],
                dataType: .float32
            )
            let maskPtr = mask.dataPointer.assumingMemoryBound(to: Float.self)
            for i in 0..<decLen {
                for j in 0..<decLen {
                    maskPtr[i * decLen + j] = (j <= i) ? 0.0 : -Float.greatestFiniteMagnitude
                }
            }

            let decFeatures = try MLDictionaryFeatureProvider(dictionary: [
                "decoder_input_ids": MLFeatureValue(multiArray: decInput),
                "encoder_hidden_states": MLFeatureValue(multiArray: hiddenStates),
                "position_ids": MLFeatureValue(multiArray: posIds),
                "causal_mask": MLFeatureValue(multiArray: mask),
            ])

            let decOutput = try decoder.prediction(from: decFeatures)
            // No force-unwrap on the shape: a scalar or empty output ends decoding
            // instead of crashing.
            guard let logits = decOutput.featureValue(for: "logits")?.multiArrayValue,
                  let vocabSize = logits.shape.last?.intValue,
                  vocabSize > 0,
                  logits.count >= vocabSize
            else {
                break
            }

            // Greedy: argmax over the last position. The row offset comes from the
            // logits' own element count rather than the loop index, so it stays in
            // bounds whether the model returns one row per position or only the
            // final row.
            // NOTE(review): assumes contiguous row-major storage, as before — confirm.
            let lastOffset = (logits.count / vocabSize - 1) * vocabSize
            var maxId = 0
            var maxVal: Float = -.infinity
            if #available(iOS 16.0, *), logits.dataType == .float16 {
                let lPtr = logits.dataPointer.assumingMemoryBound(to: Float16.self)
                for v in 0..<vocabSize {
                    let val = Float(lPtr[lastOffset + v])
                    if val > maxVal { maxVal = val; maxId = v }
                }
            } else {
                let lPtr = logits.dataPointer.assumingMemoryBound(to: Float.self)
                for v in 0..<vocabSize {
                    let val = lPtr[lastOffset + v]
                    if val > maxVal { maxVal = val; maxId = v }
                }
            }

            if maxId == g2pEosId { break }
            decoderIds.append(Int32(maxId))
        }

        // Convert IDs to phonemes, skipping the leading BOS and any special tokens.
        var result = ""
        for id in decoderIds.dropFirst() {
            let intId = Int(id)
            if intId != g2pPadId && intId != g2pBosId && intId != g2pEosId,
               let phoneme = idToPhoneme[intId] {
                result += phoneme
            }
        }
        return result.isEmpty ? nil : result
    } catch {
        return nil
    }
}
|
|
565
|
+
|
|
566
|
+
// MARK: - Text Normalization
|
|
567
|
+
|
|
568
|
+
/// Normalises English text before tokenisation.
///
/// Steps, in order:
/// 1. curly apostrophes → "'", em/en dashes → ",";
/// 2. common contractions are expanded, matched case-insensitively as substrings
///    (the expansion is always lowercase);
/// 3. runs of spaces are collapsed to one space;
/// 4. leading and trailing whitespace is trimmed.
///
/// Step 3 is done by splitting on spaces and re-joining. The previous loop
/// replaced a string with itself as written, so it could never make progress on
/// text that contained a space.
///
/// - Parameter text: Raw input text.
/// - Returns: The normalised text.
private func normalizeText(_ text: String) -> String {
    // Escapes keep these literals unambiguous regardless of editor/font rendering.
    var result = text
        .replacingOccurrences(of: "\u{2019}", with: "'")   // right single quotation mark
        .replacingOccurrences(of: "\u{2018}", with: "'")   // left single quotation mark
        .replacingOccurrences(of: "\u{2014}", with: ",")   // em dash
        .replacingOccurrences(of: "\u{2013}", with: ",")   // en dash

    let contractions: [(String, String)] = [
        ("can't", "can not"), ("won't", "will not"), ("don't", "do not"),
        ("doesn't", "does not"), ("didn't", "did not"), ("isn't", "is not"),
        ("aren't", "are not"), ("wasn't", "was not"), ("weren't", "were not"),
        ("couldn't", "could not"), ("wouldn't", "would not"), ("shouldn't", "should not"),
        ("haven't", "have not"), ("hasn't", "has not"), ("hadn't", "had not"),
        ("i'm", "i am"), ("i've", "i have"), ("i'll", "i will"), ("i'd", "i would"),
        ("you're", "you are"), ("you've", "you have"), ("you'll", "you will"),
        ("he's", "he is"), ("she's", "she is"), ("it's", "it is"),
        ("we're", "we are"), ("we've", "we have"), ("we'll", "we will"),
        ("they're", "they are"), ("they've", "they have"), ("they'll", "they will"),
        ("that's", "that is"), ("there's", "there is"), ("let's", "let us"),
    ]
    // The lowercase snapshot is only a cheap pre-filter; the replacement itself is
    // case-insensitive.
    let lower = result.lowercased()
    for (contraction, expansion) in contractions where lower.contains(contraction) {
        result = result.replacingOccurrences(of: contraction, with: expansion, options: .caseInsensitive)
    }

    // Collapse runs of U+0020. Empty pieces between consecutive spaces are dropped,
    // so re-joining leaves exactly one space per run.
    result = result
        .split(separator: "\u{20}", omittingEmptySubsequences: true)
        .joined(separator: "\u{20}")

    return result.trimmingCharacters(in: .whitespaces)
}
|
|
598
|
+
|
|
599
|
+
/// Splits text into word, whitespace and punctuation tokens, preserving order.
///
/// Every whitespace character yields one `" "` token, every punctuation or
/// symbol character yields its own single-character token, and maximal runs of
/// all other characters form word tokens.
///
/// - Parameter text: Text to split.
/// - Returns: The tokens in source order.
private func splitWords(_ text: String) -> [String] {
    var tokens = [String]()
    var pending = ""

    // Emits the word collected so far, if any.
    func flushPending() {
        guard !pending.isEmpty else { return }
        tokens.append(pending)
        pending.removeAll()
    }

    for character in text {
        switch character {
        case _ where character.isWhitespace:
            flushPending()
            tokens.append(" ")
        case _ where character.isPunctuation || character.isSymbol:
            flushPending()
            tokens.append(String(character))
        default:
            pending.append(character)
        }
    }
    flushPending()

    return tokens
}
|
|
616
|
+
|
|
617
|
+
/// Maps a punctuation token to the symbol emitted in the phoneme string.
///
/// Only the eight single-character marks listed below are kept, each mapping to
/// itself; every other token yields `nil`.
///
/// - Parameter text: Punctuation token.
/// - Returns: The symbol to emit, or `nil` when the token is not kept.
private func punctuationToPhoneme(_ text: String) -> String? {
    let kept: [String: String] = [
        ",": ",",
        ".": ".",
        "!": "!",
        "?": "?",
        ";": ";",
        ":": ":",
        "-": "-",
        "'": "'",
    ]
    return kept[text]
}
|
|
630
|
+
|
|
631
|
+
// MARK: - POS Tagging
|
|
632
|
+
|
|
633
|
+
/// Tags each word of `text` with its lexical class.
///
/// The result is keyed by the lowercased token, so when a word occurs more than
/// once the tag of its last occurrence wins.
///
/// - Parameter text: Text to tag.
/// - Returns: Lowercased token → tag raw value (e.g. "Determiner").
private func tagPOS(_ text: String) -> [String: String] {
    tagger.string = text

    var tagsByWord: [String: String] = [:]
    let wholeText = text.startIndex..<text.endIndex
    tagger.enumerateTags(in: wholeText, unit: .word, scheme: .lexicalClass) { tag, tokenRange in
        guard let tag else { return true }
        tagsByWord[text[tokenRange].lowercased()] = tag.rawValue
        return true
    }
    return tagsByWord
}
|
|
644
|
+
|
|
645
|
+
// MARK: - Dictionary Parsing
|
|
646
|
+
|
|
647
|
+
/// Parses a pronunciation-dictionary JSON file.
///
/// The root must be a JSON object; anything else yields an empty dictionary.
/// A string value becomes a `.simple` entry. An object whose values are strings
/// or null becomes a `.heteronym` entry with the null values removed, and is
/// skipped entirely if nothing remains. Values of any other shape are ignored.
///
/// - Parameter url: Location of the JSON file.
/// - Returns: The parsed entries keyed by word.
/// - Throws: File-read or JSON-parse errors.
private func parseDictionary(from url: URL) throws -> [String: DictEntry] {
    let parsed = try JSONSerialization.jsonObject(with: Data(contentsOf: url))
    guard let root = parsed as? [String: Any] else { return [:] }

    return root.reduce(into: [String: DictEntry]()) { entries, pair in
        switch pair.value {
        case let phonemes as String:
            entries[pair.key] = .simple(phonemes)
        case let posMap as [String: String?]:
            let pronunciations = posMap.compactMapValues { $0 }
            if !pronunciations.isEmpty {
                entries[pair.key] = .heteronym(pronunciations)
            }
        default:
            break
        }
    }
}
|
|
664
|
+
|
|
665
|
+
/// Adds case variants of existing keys to `dict`, never replacing an existing entry.
///
/// - An all-lowercase key gains a variant with its first character uppercased.
/// - A key whose first character is uppercase gains a fully lowercased variant.
///
/// Variants are collected first and merged afterwards so the dictionary is not
/// mutated while it is being iterated.
///
/// - Parameter dict: Dictionary to extend in place.
private func growDictionary(_ dict: inout [String: DictEntry]) {
    var variants: [String: DictEntry] = [:]

    for (key, entry) in dict {
        let lowered = key.lowercased()

        if !key.isEmpty, key == lowered {
            let capitalized = key.prefix(1).uppercased() + key.dropFirst()
            if dict[capitalized] == nil {
                variants[capitalized] = entry
            }
        }

        if let first = key.first, first.isUppercase, dict[lowered] == nil {
            variants[lowered] = entry
        }
    }

    dict.merge(variants) { _, added in added }
}
|
|
679
|
+
}
|