@elizaos/capacitor-bun-runtime 2.0.3-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ElizaosCapacitorBunRuntime.podspec +54 -0
- package/LICENSE +21 -0
- package/README.md +127 -0
- package/dist/esm/definitions.d.ts +136 -0
- package/dist/esm/definitions.d.ts.map +1 -0
- package/dist/esm/definitions.js +14 -0
- package/dist/esm/definitions.js.map +1 -0
- package/dist/esm/index.d.ts +9 -0
- package/dist/esm/index.d.ts.map +1 -0
- package/dist/esm/index.js +11 -0
- package/dist/esm/index.js.map +1 -0
- package/dist/esm/web.d.ts +19 -0
- package/dist/esm/web.d.ts.map +1 -0
- package/dist/esm/web.js +44 -0
- package/dist/esm/web.js.map +1 -0
- package/dist/plugin.cjs.js +63 -0
- package/dist/plugin.cjs.js.map +1 -0
- package/dist/plugin.js +66 -0
- package/dist/plugin.js.map +1 -0
- package/ios/Sources/ElizaBunRuntimePlugin/BridgeInstaller.swift +94 -0
- package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntime.swift +705 -0
- package/ios/Sources/ElizaBunRuntimePlugin/ElizaBunRuntimePlugin.swift +1109 -0
- package/ios/Sources/ElizaBunRuntimePlugin/FullBunEngineHost.swift +677 -0
- package/ios/Sources/ElizaBunRuntimePlugin/JSContextHelpers.swift +226 -0
- package/ios/Sources/ElizaBunRuntimePlugin/SandboxPaths.swift +46 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/CryptoBridge.swift +238 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/ElizaSqliteVecBridge.m +28 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/FSBridge.swift +270 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPBridge.swift +153 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/HTTPServerBridge.swift +32 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridge.swift +233 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/LlamaBridgeImpl.swift +1863 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/LogBridge.swift +36 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/PathsBridge.swift +41 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/ProcessBridge.swift +80 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridge.swift +406 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteBridgeInstaller.swift +17 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/SqliteVecLoader.swift +66 -0
- package/ios/Sources/ElizaBunRuntimePlugin/bridge/UIBridge.swift +72 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlChinesePhonemizer.swift +313 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlConfiguration.swift +28 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlEngine.swift +325 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlHindiPhonemizer.swift +150 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlJapanesePhonemizer.swift +209 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlLatinPhonemizer.swift +374 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlModel.swift +87 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPhonemizer.swift +679 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlPronunciationDicts.swift +131 -0
- package/ios/Sources/ElizaBunRuntimePlugin/kokoro/KokoroCoreMlSupport.swift +24 -0
- package/ios/Tests/llama-bridge-smoke-main.swift +92 -0
- package/package.json +68 -0
- package/src/bridge-contract.test.ts +127 -0
- package/src/definitions.d.ts +136 -0
- package/src/definitions.d.ts.map +1 -0
- package/src/definitions.ts +152 -0
- package/src/index.d.ts +9 -0
- package/src/index.d.ts.map +1 -0
- package/src/index.ts +16 -0
- package/src/web.d.ts +19 -0
- package/src/web.d.ts.map +1 -0
- package/src/web.ts +80 -0
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
import CoreML
|
|
2
|
+
import Foundation
|
|
3
|
+
|
|
4
|
+
@available(iOS 18.0, *)
|
|
5
|
+
final class KokoroCoreMlEngine {
|
|
6
|
+
static let shared = KokoroCoreMlEngine()
|
|
7
|
+
|
|
8
|
+
/// Everything produced by one successful model load, cached together so a
/// later request for the same directory can reuse it.
private struct LoadedModel {
    /// Directory the assets were read from; `loadModel(at:)` compares its path to decide on reuse.
    let directory: URL
    /// Model configuration (phoneme length limit, style dimension, sample rate).
    let config: KokoroConfig
    /// The CoreML network used for end-to-end prediction.
    let network: KokoroNetwork
    /// Text-to-token converter loaded from the model directory.
    let phonemizer: KokoroPhonemizer
    /// Style vectors keyed by voice name (the voice file's name without extension).
    let voiceEmbeddings: [String: [Float]]
}
|
|
15
|
+
|
|
16
|
+
private let queue = DispatchQueue(label: "ai.eliza.kokoro.coreml")
|
|
17
|
+
private var loaded: LoadedModel?
|
|
18
|
+
|
|
19
|
+
private init() {}
|
|
20
|
+
|
|
21
|
+
/// Synthesizes `text` with the Kokoro CoreML model stored in `modelDirectory`.
///
/// The whole call runs synchronously on the engine's private dispatch queue, so
/// the caller blocks until audio is ready (including any first-time model load).
///
/// - Parameters:
///   - modelDirectory: Directory containing the Kokoro CoreML assets.
///   - text: Text to speak. Must contain something other than whitespace.
///   - voice: Preferred voice name; an unknown or empty name falls back to a default voice.
///   - maxSamples: Hard upper bound on the number of returned samples. Must be positive.
/// - Returns: The audio samples, the model's sample rate, the wall-clock time spent in
///   this call in milliseconds (`durationMs` is elapsed time, not audio length), and the
///   voice that was actually used.
/// - Throws: `AudioModelError.inferenceFailed` for empty input, a non-positive
///   `maxSamples`, or empty model output; `AudioModelError.voiceNotFound` when no voice
///   embedding can be resolved; any error raised while loading the model or predicting.
func synthesize(
    modelDirectory: URL,
    text: String,
    voice: String?,
    maxSamples: Int
) throws -> (samples: [Float], sampleRate: Int, durationMs: Double, voice: String) {
    try queue.sync {
        let start = DispatchTime.now()
        // Reject an unusable budget up front instead of loading the model and running
        // an inference whose output would be clamped to nothing.
        guard maxSamples > 0 else {
            throw AudioModelError.inferenceFailed(operation: "kokoro-coreml", reason: "maxSamples must be positive")
        }
        let model = try loadModel(at: modelDirectory)
        let selectedVoice = resolveVoice(voice, available: model.voiceEmbeddings)
        guard let styleVector = model.voiceEmbeddings[selectedVoice] else {
            throw AudioModelError.voiceNotFound(
                voice: selectedVoice,
                searchPath: "Available: \(Array(model.voiceEmbeddings.keys).sorted().prefix(8).joined(separator: ", "))"
            )
        }

        // The language depends only on the voice, so resolve it once for all chunks.
        let voiceLanguage = language(for: selectedVoice)
        let chunks = chunkText(
            text,
            phonemizer: model.phonemizer,
            maxTokenCount: min(96, model.config.maxPhonemeLength - 4),
            language: voiceLanguage
        )
        // `chunkText` returns no chunks only for empty/whitespace-only input; report that
        // as an input problem rather than as a model failure.
        guard !chunks.isEmpty else {
            throw AudioModelError.inferenceFailed(operation: "kokoro-coreml", reason: "input text is empty")
        }

        var samples: [Float] = []
        samples.reserveCapacity(min(maxSamples, 24_000 * max(1, chunks.count * 2)))
        for (index, chunk) in chunks.enumerated() {
            let chunkSamples = try synthesizeChunk(
                model: model,
                text: chunk,
                language: voiceLanguage,
                styleVector: styleVector,
                maxSamples: maxSamples - samples.count
            )
            samples.append(contentsOf: chunkSamples)
            // Insert a short pause between chunks (2_400 samples, i.e. 100 ms at 24 kHz),
            // never exceeding the remaining sample budget.
            if index < chunks.count - 1, samples.count < maxSamples {
                samples.append(contentsOf: Array(repeating: 0, count: min(2_400, maxSamples - samples.count)))
            }
            if samples.count >= maxSamples { break }
        }
        guard !samples.isEmpty else {
            throw AudioModelError.inferenceFailed(operation: "kokoro-coreml", reason: "model returned empty audio")
        }
        conditionAudio(&samples, sampleRate: model.config.sampleRate, maxSamples: maxSamples)
        let elapsedNs = DispatchTime.now().uptimeNanoseconds - start.uptimeNanoseconds
        return (
            samples,
            model.config.sampleRate,
            Double(elapsedNs) / 1_000_000.0,
            selectedVoice
        )
    }
}
|
|
73
|
+
|
|
74
|
+
/// Runs a single Kokoro inference for one already-chunked piece of text.
///
/// - Parameters:
///   - model: The loaded model bundle (network, phonemizer, configuration).
///   - text: The chunk to synthesize.
///   - language: Language code handed to the phonemizer.
///   - styleVector: Voice style embedding passed to the network as `refS`.
///   - maxSamples: Upper bound on the number of samples to return.
/// - Returns: Between 1 and `maxSamples` audio samples converted to `Float`.
/// - Throws: Errors from building the input arrays or from `predictE2E`, and
///   `AudioModelError.inferenceFailed` when the model yields no usable samples.
private func synthesizeChunk(
    model: LoadedModel,
    text: String,
    language: String,
    styleVector: [Float],
    maxSamples: Int
) throws -> [Float] {
    let maxLength = model.config.maxPhonemeLength
    let tokenIds = model.phonemizer.tokenize(text, maxLength: maxLength, language: language)
    // Positions below `tokenCount` are real tokens; the remainder is padding and is
    // masked out through the attention mask.
    let tokenCount = min(tokenIds.count, maxLength)
    let paddedIds = model.phonemizer.pad(Array(tokenIds.prefix(maxLength)), to: maxLength)
    let inputIds = try createInt32Array(shape: [1, maxLength], values: paddedIds.map { Int32($0) })
    let mask = try createInt32Array(
        shape: [1, maxLength],
        values: (0..<maxLength).map { Int32($0 < tokenCount ? 1 : 0) }
    )
    let refS = try createFloatArray(shape: [1, model.config.styleDim], values: styleVector)
    // Speaking rate is fixed at 1.0.
    let speed = try createFloatArray(shape: [1], values: [1.0])
    let output = try model.network.predictE2E(
        inputIds: inputIds,
        attentionMask: mask,
        refS: refS,
        speed: speed
    )

    let audio = output.audio
    // Never read past the reported length, the buffer itself, or the caller's budget.
    let validSamples = min(output.audioLengthSamples, audio.count, maxSamples)
    guard validSamples > 0 else {
        throw AudioModelError.inferenceFailed(operation: "kokoro-coreml", reason: "model returned empty audio")
    }

    // Read the buffer according to its actual element type. Treating every
    // non-float16 buffer as Float32 would misread e.g. a Double or Int32 output.
    if #available(iOS 16.0, *), audio.dataType == .float16 {
        let ptr = audio.dataPointer.bindMemory(to: Float16.self, capacity: validSamples)
        return (0..<validSamples).map { Float(ptr[$0]) }
    }
    if audio.dataType == .float32 {
        let ptr = audio.dataPointer.bindMemory(to: Float.self, capacity: validSamples)
        return Array(UnsafeBufferPointer(start: ptr, count: validSamples))
    }
    // Any other element type: go through the NSNumber subscript, which converts per element.
    return (0..<validSamples).map { audio[$0].floatValue }
}
|
|
112
|
+
|
|
113
|
+
/// Splits `text` into pieces that each fit within `maxTokenCount` phoneme tokens.
///
/// The text is normalized (newlines become spaces, typographic apostrophes become
/// ASCII ones, surrounding whitespace is trimmed). If the whole text fits it is
/// returned as a single chunk; otherwise it is cut after punctuation marks and each
/// resulting segment is handed to `appendChunk` for further splitting if needed.
///
/// - Parameters:
///   - text: Raw input text.
///   - phonemizer: Phonemizer used to measure token counts.
///   - maxTokenCount: Maximum number of tokens allowed per chunk.
///   - language: Language code forwarded to the phonemizer.
/// - Returns: The chunks in reading order; empty when `text` is empty or whitespace-only.
private func chunkText(
    _ text: String,
    phonemizer: KokoroPhonemizer,
    maxTokenCount: Int,
    language: String
) -> [String] {
    let normalized = text
        .replacingOccurrences(of: "\n", with: " ")
        .replacingOccurrences(of: "’", with: "'")
        .trimmingCharacters(in: .whitespacesAndNewlines)
    guard !normalized.isEmpty else { return [] }
    if tokenCount(normalized, phonemizer: phonemizer, language: language) <= maxTokenCount {
        return [normalized]
    }

    // Besides ASCII punctuation, accept the full-width CJK marks and the Devanagari
    // danda so Japanese, Chinese and Hindi text (which rarely contains ASCII
    // punctuation) can be segmented as well.
    let boundaries: Set<Character> = [
        ".", "!", "?", ",", ";", ":",
        "。", "！", "？", "、", "，", "；", "：",
        "।", "॥",
    ]
    // Marks that also appear inside numbers ("3.14", "1,000", "10:30").
    let numericSeparators: Set<Character> = [".", ",", ":"]
    func isASCIIDigit(_ character: Character) -> Bool {
        character.isASCII && character.isNumber
    }

    let characters = Array(normalized)
    var segments: [String] = []
    var current = ""
    for (offset, char) in characters.enumerated() {
        current.append(char)
        guard boundaries.contains(char) else { continue }
        // A separator flanked by digits is part of a number, not a phrase boundary.
        if numericSeparators.contains(char),
           offset > 0,
           offset + 1 < characters.count,
           isASCIIDigit(characters[offset - 1]),
           isASCIIDigit(characters[offset + 1]) {
            continue
        }
        let part = current.trimmingCharacters(in: .whitespacesAndNewlines)
        if !part.isEmpty { segments.append(part) }
        current = ""
    }
    let tail = current.trimmingCharacters(in: .whitespacesAndNewlines)
    if !tail.isEmpty { segments.append(tail) }

    var chunks: [String] = []
    for segment in segments {
        appendChunk(segment, to: &chunks, phonemizer: phonemizer, maxTokenCount: maxTokenCount, language: language)
    }
    return chunks.isEmpty ? [normalized] : chunks
}
|
|
145
|
+
|
|
146
|
+
/// Appends `segment` to `chunks`, splitting it further when it exceeds `maxTokenCount`.
///
/// A segment that fits is appended unchanged. Otherwise words (separated by spaces)
/// are packed greedily into chunks. A single word that is itself too long, such as a
/// run of CJK text without spaces, is split character by character so that no
/// chunk is silently truncated by the tokenizer later on.
///
/// - Parameters:
///   - segment: Text to add.
///   - chunks: Destination list, extended in place.
///   - phonemizer: Phonemizer used to measure token counts.
///   - maxTokenCount: Maximum number of tokens allowed per chunk.
///   - language: Language code forwarded to the phonemizer.
private func appendChunk(
    _ segment: String,
    to chunks: inout [String],
    phonemizer: KokoroPhonemizer,
    maxTokenCount: Int,
    language: String
) {
    func fits(_ candidate: String) -> Bool {
        tokenCount(candidate, phonemizer: phonemizer, language: language) <= maxTokenCount
    }

    if fits(segment) {
        chunks.append(segment)
        return
    }

    var current = ""
    for word in segment.split(separator: " ") {
        let piece = String(word)
        if !current.isEmpty {
            let candidate = "\(current) \(piece)"
            if fits(candidate) {
                current = candidate
                continue
            }
            // The pending chunk is full; flush it before dealing with `piece`.
            chunks.append(current)
            current = ""
        }
        if fits(piece) {
            current = piece
            continue
        }
        // `piece` alone is over the limit: grow fragments one character at a time.
        // This re-tokenizes per character, but only runs for over-long words.
        var fragment = ""
        for character in piece {
            let grown = fragment + String(character)
            if fragment.isEmpty || fits(grown) {
                fragment = grown
            } else {
                chunks.append(fragment)
                fragment = String(character)
            }
        }
        // The last fragment stays pending so following words can still join it.
        current = fragment
    }
    if !current.isEmpty { chunks.append(current) }
}
|
|
169
|
+
|
|
170
|
+
/// Returns how many tokens `phonemizer` produces for `text` in the given language.
private func tokenCount(_ text: String, phonemizer: KokoroPhonemizer, language: String) -> Int {
    // 4_096 is passed as the tokenizer's `maxLength`; presumably large enough that
    // chunk-sized text is never cut short while being measured — TODO confirm.
    let tokens = phonemizer.tokenize(text, maxLength: 4_096, language: language)
    return tokens.count
}
|
|
173
|
+
|
|
174
|
+
/// Builds a dictionary describing the engine state and, when a directory is given,
/// the presence of the model assets inside it.
///
/// Keys always present: `available`, `loaded`, `requiresIos`. With a directory the
/// report also contains `directory`, `files` (one `describeFile` entry per asset) and,
/// if the cached model came from that same directory, `loadedVoiceCount`.
///
/// NOTE(review): `loaded` is read here directly, while `loadModel(at:)` assigns it from
/// inside `queue.sync`; confirm callers never run this concurrently with `synthesize`.
///
/// - Parameter modelDirectory: Directory to inspect, or `nil` for engine state only.
/// - Returns: The diagnostics report.
func diagnostics(modelDirectory: URL?) -> [String: Any] {
    var report: [String: Any] = [:]
    report["available"] = false
    report["loaded"] = loaded != nil
    report["requiresIos"] = "18.0"

    guard let root = modelDirectory else { return report }
    report["directory"] = root.path

    var files: [String: [String: Any]] = [:]
    files["model"] = describeFile(root.appendingPathComponent("kokoro_5s.mlmodelc", isDirectory: true))
    files["g2pEncoder"] = describeFile(root.appendingPathComponent("G2PEncoder.mlmodelc", isDirectory: true))
    files["g2pDecoder"] = describeFile(root.appendingPathComponent("G2PDecoder.mlmodelc", isDirectory: true))
    files["vocab"] = describeFile(root.appendingPathComponent("vocab_index.json"))
    files["voice"] = describeFile(root.appendingPathComponent("voices/af_heart.json"))
    report["files"] = files

    report["available"] = Self.hasRequiredAssets(in: root)
    if let cached = loaded, cached.directory.path == root.path {
        report["loadedVoiceCount"] = cached.voiceEmbeddings.count
    }
    return report
}
|
|
195
|
+
|
|
196
|
+
/// Returns `<bundleDir>/tts/kokoro-coreml` when that directory holds all required
/// assets, otherwise `nil`.
///
/// - Parameter bundleDir: Path of the bundle directory to look in.
static func modelDirectory(in bundleDir: String) -> URL? {
    var candidate = URL(fileURLWithPath: bundleDir, isDirectory: true)
    for component in ["tts", "kokoro-coreml"] {
        candidate = candidate.appendingPathComponent(component, isDirectory: true)
    }
    guard hasRequiredAssets(in: candidate) else { return nil }
    return candidate
}
|
|
202
|
+
|
|
203
|
+
/// Reports whether `directory` contains every asset the engine cannot work without:
/// the compiled model bundle, the vocabulary index and the `af_heart` voice file.
///
/// - Parameter directory: Candidate model directory.
/// - Returns: `true` only if all three paths exist.
static func hasRequiredAssets(in directory: URL) -> Bool {
    let modelBundle = directory.appendingPathComponent("kokoro_5s.mlmodelc", isDirectory: true)
    let vocabulary = directory.appendingPathComponent("vocab_index.json")
    let defaultVoice = directory.appendingPathComponent("voices/af_heart.json")

    for asset in [modelBundle, vocabulary, defaultVoice]
    where !FileManager.default.fileExists(atPath: asset.path) {
        return false
    }
    return true
}
|
|
212
|
+
|
|
213
|
+
/// Returns the model bundle for `directory`, loading it unless the cached bundle was
/// already loaded from the same path.
///
/// Loading reads the vocabulary and dictionaries, optionally the G2P models (only when
/// encoder, decoder and `g2p_vocab.json` are all present), the voice embeddings and
/// finally the CoreML network. The result replaces the cached bundle.
///
/// - Parameter directory: Directory containing the Kokoro CoreML assets.
/// - Returns: The cached or freshly loaded bundle.
/// - Throws: `AudioModelError.modelLoadFailed` when required assets or voice embeddings
///   are missing; any error raised by the individual loaders.
private func loadModel(at directory: URL) throws -> LoadedModel {
    if let cached = loaded, cached.directory.path == directory.path {
        return cached
    }
    guard Self.hasRequiredAssets(in: directory) else {
        throw AudioModelError.modelLoadFailed(modelId: "kokoro-coreml", reason: "missing required CoreML Kokoro assets under \(directory.path)")
    }

    let config = KokoroConfig.default

    let phonemizer = try KokoroPhonemizer.loadVocab(from: directory.appendingPathComponent("vocab_index.json"))
    try phonemizer.loadDictionaries(from: directory)

    // The G2P models are optional: use them only when the complete set is on disk.
    let encoder = directory.appendingPathComponent("G2PEncoder.mlmodelc", isDirectory: true)
    let decoder = directory.appendingPathComponent("G2PDecoder.mlmodelc", isDirectory: true)
    let g2pVocab = directory.appendingPathComponent("g2p_vocab.json")
    let hasCompleteG2P = [encoder, decoder, g2pVocab].allSatisfy {
        FileManager.default.fileExists(atPath: $0.path)
    }
    if hasCompleteG2P {
        try phonemizer.loadG2PModels(encoderURL: encoder, decoderURL: decoder, vocabURL: g2pVocab)
    }

    let voicesDirectory = directory.appendingPathComponent("voices", isDirectory: true)
    let voiceEmbeddings = try loadVoiceEmbeddings(from: voicesDirectory, styleDim: config.styleDim)
    if voiceEmbeddings.isEmpty {
        throw AudioModelError.modelLoadFailed(modelId: "kokoro-coreml", reason: "no Kokoro voice embeddings found")
    }

    let network = try KokoroNetwork(directory: directory, computeUnits: .all)

    let fresh = LoadedModel(
        directory: directory,
        config: config,
        network: network,
        phonemizer: phonemizer,
        voiceEmbeddings: voiceEmbeddings
    )
    loaded = fresh
    AudioLog.modelLoading.info("Kokoro CoreML loaded voices=\(voiceEmbeddings.count) directory=\(directory.path)")
    return fresh
}
|
|
247
|
+
|
|
248
|
+
/// Reads every `*.json` voice file in `directory` and returns its style vector.
///
/// Each file is expected to be a JSON object with an `embedding` array of numbers.
/// The first `styleDim` values are kept. Files with a different structure, or with
/// fewer than `styleDim` values, are skipped: a short vector could not fill the
/// style tensor the network expects.
///
/// - Parameters:
///   - directory: Directory containing the voice files.
///   - styleDim: Number of values the network expects per voice.
/// - Returns: Style vectors keyed by file name without extension; empty when the
///   directory does not exist.
/// - Throws: Errors from listing the directory, reading a file, or parsing invalid JSON.
private func loadVoiceEmbeddings(from directory: URL, styleDim: Int) throws -> [String: [Float]] {
    let fileManager = FileManager.default
    guard fileManager.fileExists(atPath: directory.path) else { return [:] }

    let files = try fileManager.contentsOfDirectory(at: directory, includingPropertiesForKeys: nil)
    var embeddings: [String: [Float]] = [:]
    for file in files where file.pathExtension.lowercased() == "json" {
        let data = try Data(contentsOf: file)
        guard let json = try JSONSerialization.jsonObject(with: data) as? [String: Any],
              let embedding = json["embedding"] as? [Double],
              embedding.count >= styleDim else {
            continue
        }
        let voiceName = file.deletingPathExtension().lastPathComponent
        embeddings[voiceName] = embedding.prefix(styleDim).map(Float.init)
    }
    return embeddings
}
|
|
262
|
+
|
|
263
|
+
/// Picks the voice to use for a request.
///
/// Order of preference: the requested name (trimmed) if it is available, then
/// `af_heart`, then `af_bella`, then the alphabetically first available voice.
/// With no voices at all the name `af_heart` is returned.
///
/// - Parameters:
///   - requested: Voice name asked for by the caller, if any.
///   - available: Loaded voice embeddings keyed by voice name.
/// - Returns: The chosen voice name.
private func resolveVoice(_ requested: String?, available: [String: [Float]]) -> String {
    if let trimmed = requested?.trimmingCharacters(in: .whitespacesAndNewlines),
       !trimmed.isEmpty,
       available.keys.contains(trimmed) {
        return trimmed
    }
    for fallback in ["af_heart", "af_bella"] where available[fallback] != nil {
        return fallback
    }
    return available.keys.min() ?? "af_heart"
}
|
|
272
|
+
|
|
273
|
+
/// Maps a voice name to a language code based on the name's prefix.
///
/// - Parameter voice: Voice name such as `jf_...` or `af_...`.
/// - Returns: The language code for the first matching prefix, or `"en"` when none matches.
private func language(for voice: String) -> String {
    let prefixTable: [(prefixes: [String], code: String)] = [
        (["jf_", "jm_"], "ja"),
        (["zf_", "zm_"], "zh"),
        (["hf_", "hm_"], "hi"),
        (["ff_"], "fr"),
        (["ef_", "em_"], "es"),
        (["pf_", "pm_"], "pt"),
        (["if_", "im_"], "it"),
    ]
    for entry in prefixTable where entry.prefixes.contains(where: { voice.hasPrefix($0) }) {
        return entry.code
    }
    return "en"
}
|
|
283
|
+
|
|
284
|
+
/// Creates an Int32 `MLMultiArray` of the given shape and fills it with `values`.
///
/// Values are written in order from the start of the array's storage. Elements not
/// covered by `values` are set to zero, so the array never carries unwritten memory.
///
/// - Parameters:
///   - shape: Dimensions of the array.
///   - values: Elements to store; must not outnumber the array's element count.
/// - Returns: The populated array.
/// - Throws: The error from `MLMultiArray(shape:dataType:)`, or `MLModelError(.generic)`
///   when `values` does not fit the shape (writing it would run past the buffer).
private func createInt32Array(shape: [Int], values: [Int32]) throws -> MLMultiArray {
    let arr = try MLMultiArray(shape: shape.map { NSNumber(value: $0) }, dataType: .int32)
    let capacity = arr.count
    guard values.count <= capacity else {
        throw MLModelError(
            .generic,
            userInfo: [NSLocalizedDescriptionKey: "\(values.count) Int32 values do not fit an MLMultiArray of shape \(shape)"]
        )
    }
    let ptr = arr.dataPointer.assumingMemoryBound(to: Int32.self)
    for index in 0..<values.count { ptr[index] = values[index] }
    // Zero the remainder explicitly rather than relying on the allocator.
    for index in values.count..<capacity { ptr[index] = 0 }
    return arr
}
|
|
290
|
+
|
|
291
|
+
/// Creates a Float32 `MLMultiArray` of the given shape and fills it with `values`.
///
/// Values are written in order from the start of the array's storage. Elements not
/// covered by `values` are set to zero, so the array never carries unwritten memory.
///
/// - Parameters:
///   - shape: Dimensions of the array.
///   - values: Elements to store; must not outnumber the array's element count.
/// - Returns: The populated array.
/// - Throws: The error from `MLMultiArray(shape:dataType:)`, or `MLModelError(.generic)`
///   when `values` does not fit the shape (writing it would run past the buffer).
private func createFloatArray(shape: [Int], values: [Float]) throws -> MLMultiArray {
    let arr = try MLMultiArray(shape: shape.map { NSNumber(value: $0) }, dataType: .float32)
    let capacity = arr.count
    guard values.count <= capacity else {
        throw MLModelError(
            .generic,
            userInfo: [NSLocalizedDescriptionKey: "\(values.count) Float values do not fit an MLMultiArray of shape \(shape)"]
        )
    }
    let ptr = arr.dataPointer.assumingMemoryBound(to: Float.self)
    for index in 0..<values.count { ptr[index] = values[index] }
    // Zero the remainder explicitly rather than relying on the allocator.
    for index in values.count..<capacity { ptr[index] = 0 }
    return arr
}
|
|
297
|
+
|
|
298
|
+
/// Cleans up a finished sample buffer in place.
///
/// Non-finite samples (NaN, infinities) become zero, up to a quarter second of
/// trailing silence is appended as far as `maxSamples` allows, and the buffer is cut
/// back to `maxSamples` if it is longer. An empty buffer is left untouched.
///
/// - Parameters:
///   - samples: Buffer to condition.
///   - sampleRate: Samples per second, used to size the trailing silence.
///   - maxSamples: Maximum allowed buffer length.
private func conditionAudio(_ samples: inout [Float], sampleRate: Int, maxSamples: Int) {
    if samples.isEmpty { return }

    samples = samples.map { $0.isFinite ? $0 : 0 }

    let quarterSecond = Int(0.250 * Double(sampleRate))
    let headroom = max(0, maxSamples - samples.count)
    let padding = min(quarterSecond, headroom)
    if padding > 0 {
        samples += [Float](repeating: 0, count: padding)
    }

    let excess = samples.count - maxSamples
    if excess > 0 {
        samples.removeLast(excess)
    }
}
|
|
311
|
+
|
|
312
|
+
/// Describes the file-system entry at `url` for diagnostics.
///
/// - Parameter url: Location to inspect.
/// - Returns: A dictionary with `path`, `exists` and `readable`, plus `bytes` when the
///   item's attributes can be read and include a size.
private func describeFile(_ url: URL) -> [String: Any] {
    let manager = FileManager.default
    let location = url.path

    var info: [String: Any] = [:]
    info["path"] = location
    info["exists"] = manager.fileExists(atPath: location)
    info["readable"] = manager.isReadableFile(atPath: location)

    guard let attributes = try? manager.attributesOfItem(atPath: location),
          let byteCount = attributes[.size] as? NSNumber else {
        return info
    }
    info["bytes"] = byteCount
    return info
}
|
|
325
|
+
}
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
|
|
3
|
+
/// Hindi text-to-phoneme conversion for Kokoro TTS.
///
/// Pipeline: Hindi text → Latin transliteration (CFStringTransform / CFStringTokenizer)
/// → IPA via the lookup tables below.
/// Uses Apple's built-in transliteration — no external dependencies.
final class HindiPhonemizer {

    // MARK: - Romanization → IPA tables

    /// Maps romanized consonants (as produced by the Latin transliteration) to IPA.
    /// Two-letter keys are aspirated consonants and digraphs; they are tried before
    /// single letters by `romanToIPA`.
    private static let consonantMap: [String: String] = [
        "kh": "kʰ", "gh": "ɡʱ", "ch": "tʃ", "jh": "dʒʱ",
        "th": "tʰ", "dh": "dʱ", "ph": "pʰ", "bh": "bʱ",
        // "ś" is what the Latin transliteration emits for श; without it the letter
        // would pass through unchanged as a non-IPA symbol.
        "sh": "ʃ", "ś": "ʃ", "ṣ": "ʂ",
        "k": "k", "g": "ɡ", "ṅ": "ŋ",
        "c": "tʃ", "j": "dʒ", "ñ": "ɲ",
        "ṭ": "ʈ", "ḍ": "ɖ", "ṇ": "ɳ",
        "t": "t", "d": "d", "n": "n",
        "p": "p", "b": "b", "m": "m",
        "y": "j", "r": "ɾ", "l": "l", "v": "ʋ",
        "s": "s", "h": "ɦ",
        // "ṁ" becomes a bare combining tilde (U+0303) that attaches to the preceding output.
        "ṛ": "ɾ", "ṁ": "\u{0303}",
    ]

    /// Maps romanized vowels and the diphthongs "ai"/"au" to IPA.
    private static let vowelMap: [String: String] = [
        "ā": "aː", "ī": "iː", "ū": "uː", "ē": "eː", "ō": "oː",
        "ai": "ɛː", "au": "ɔː",
        "a": "ə", "i": "ɪ", "u": "ʊ", "e": "e", "o": "o",
    ]

    // MARK: - Hindi Punctuation

    /// Punctuation that is rewritten rather than copied (danda and double danda become ".").
    private static let punctuationMap: [Character: String] = [
        "।": ".", "॥": ".", ",": ",",
    ]

    // MARK: - Public API

    /// Phonemize a single word (for dictionary fallback).
    ///
    /// The word is transliterated to Latin, lowercased and converted to IPA. Results of
    /// four or more characters get a leading primary-stress mark.
    ///
    /// - Parameter word: A single word.
    /// - Returns: The IPA string for the word.
    func phonemizeWord(_ word: String) -> String {
        let buffer = NSMutableString(string: word)
        CFStringTransform(buffer, nil, kCFStringTransformToLatin, false)
        let latin = (buffer as String).lowercased()
        let ipa = Self.romanToIPA(latin)
        return ipa.count >= 4 ? "ˈ" + ipa : ipa
    }

    /// Phonemize running text.
    ///
    /// The text is split into word tokens with a Hindi-locale tokenizer. Each token's
    /// Latin transcription is converted to IPA; punctuation found between tokens is
    /// copied (or rewritten via `punctuationMap`). Tokens without a Latin
    /// transcription contribute nothing.
    ///
    /// - Parameter text: Input text.
    /// - Returns: The IPA string; empty for empty input.
    func phonemize(_ text: String) -> String {
        // The tokenizer reports ranges in UTF-16 code units, so all slicing goes
        // through NSString. Feeding those offsets to `String.index(_:offsetBy:)` would
        // count Characters instead and, for Devanagari (where one Character usually
        // spans several code units), read the wrong span or trap out of bounds.
        let nsText = text as NSString
        let length = nsText.length
        guard length > 0 else { return "" }

        let locale = Locale(identifier: "hi") as CFLocale
        let tokenizer = CFStringTokenizerCreate(nil, text as CFString, CFRangeMake(0, length),
                                                kCFStringTokenizerUnitWord, locale)

        var tokens: [(range: NSRange, reading: String?)] = []
        var tokenResult = CFStringTokenizerAdvanceToNextToken(tokenizer)
        while tokenResult != [] {
            let range = CFStringTokenizerGetCurrentTokenRange(tokenizer)
            let latin = CFStringTokenizerCopyCurrentTokenAttribute(
                tokenizer, kCFStringTokenizerAttributeLatinTranscription) as? String
            tokens.append((range: NSRange(location: range.location, length: range.length), reading: latin))
            tokenResult = CFStringTokenizerAdvanceToNextToken(tokenizer)
        }

        var result = ""
        var lastWasWord = false
        var cursor = 0
        for token in tokens {
            if token.range.location > cursor {
                let gap = nsText.substring(
                    with: NSRange(location: cursor, length: token.range.location - cursor))
                for ch in gap {
                    if let punct = Self.punctuationMap[ch] {
                        result += punct
                        lastWasWord = false
                    } else if ch.isPunctuation {
                        result += String(ch)
                        lastWasWord = false
                    } else if ch.isWhitespace {
                        // NOTE(review): whitespace clears `lastWasWord`, so no separator is
                        // emitted between space-separated words — confirm this is intended.
                        lastWasWord = false
                    }
                }
            }

            if let reading = token.reading {
                if lastWasWord { result += " " }
                result += Self.romanToIPA(reading.lowercased())
                lastWasWord = true
            }

            cursor = token.range.location + token.range.length
        }

        // Keep punctuation that follows the last token.
        if cursor < length {
            for ch in nsText.substring(from: cursor) {
                if let punct = Self.punctuationMap[ch] {
                    result += punct
                } else if ch.isPunctuation {
                    result += String(ch)
                }
            }
        }

        return result
    }

    // MARK: - Romanization → IPA

    /// Converts a lowercased romanized string to IPA.
    ///
    /// At each position a two-character sequence is tried first (consonants, then
    /// vowels), then the single character. Characters found in neither table are
    /// copied unchanged.
    ///
    /// - Parameter roman: Romanized text.
    /// - Returns: The IPA string.
    static func romanToIPA(_ roman: String) -> String {
        var result = ""
        let chars = Array(roman)
        var i = 0

        while i < chars.count {
            // Try 2-char sequences (aspirated consonants, diphthongs)
            if i + 1 < chars.count {
                let pair = String(chars[i...i + 1])
                if let ipa = consonantMap[pair] ?? vowelMap[pair] {
                    result += ipa
                    i += 2
                    continue
                }
            }

            // Single character
            let single = String(chars[i])
            result += consonantMap[single] ?? vowelMap[single] ?? single
            i += 1
        }

        return result
    }
}
|